def get_opt_policy(self, emp_f_counts, learning_rate, num_steps):
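        # Apprenticeship learning via feature matching (MaxEnt-style gradient ascent).
        # Each iteration: (1) solve the MDP whose reward is the linear function
        # R(s) = W^T phi(s) for the current weights W, (2) estimate the expected
        # feature counts of the resulting softmax policy by rollouts, and
        # (3) move W toward the empirical feature counts of the demonstrations.
        # Returns the value function fitted on the final iteration.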
        k = len(emp_f_counts)

        #initialize W
        W = np.zeros(k)
        for t in range(num_steps):
            print("---- MaxEnt iteration : ", t)
            #PRINT OUT W's
            print("weights")
            for i in range(k):
                print(W[i], end=" ")
            print()
            #compute an epsilon-optimal policy for the MDP
            #with R(s) = W ^T \phi(s)
            reward_fn = rbf.RbfReward(self.rbf_fn, W, self.env)

            value_fn = ValueFunction(self.alpha, self.num_tilings)
            print("solving mdp with sarsa semigrad")
            self.mdp_solver(self.env, value_fn, reward_fn, max_time=500)

            #debug watch policy
            #evaluate_policy(self.env, 4, value_fn)

            #compute an epsilon-good estimate of expected feature counts
            fcounts_pi = rbf.get_expected_feature_counts_softmax(
                self.num_rollouts, self.rbf_fn, value_fn, self.env,
                self.discount)
            print("expected f counts")
            for f in fcounts_pi:
                print(f, end=" ")
            print()

            #print size of the gradient
            grad_mag = 0.0
            for i in range(k):
                grad_mag += (emp_f_counts[i] - fcounts_pi[i])**2
            print("Gradient = ", math.sqrt(grad_mag))

            #update W
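            # gradient ascent step: the gradient of the feature-matching
            # objective w.r.t. W is (mu_E - mu_pi), i.e. empirical minus
            # expected feature counts, so W grows on features the
            # demonstrations hit more often than the current policy does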
            for i in range(k):
                W[i] += learning_rate * (emp_f_counts[i] - fcounts_pi[i])
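        # return the value function from the final iteration (the last iterate,
        # not necessarily the best one seen during the run)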
        return value_fn
    def get_opt_policy(self, emp_f_counts, T):
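        # MWAL-style apprenticeship learning (multiplicative weights over reward
        # features, in the spirit of Syed & Schapire's game-theoretic algorithm).
        # beta below is the multiplicative-weights rate 1 / (1 + sqrt(2 ln k / T)),
        # where T is the number of iterations and k the number of features. The
        # output is the list of per-iteration value functions, since the
        # algorithm's final policy is a mixture over them.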
        k = len(emp_f_counts)
        beta = 1.0 / (1.0 + np.sqrt(2.0 * np.log(k) / T))

        #initialize W
        W = np.ones(k)
        for t in range(1, T + 1):
            print("---- MWAL iteration : ", t)
            #normalize the W's
            W = W / np.sum(W)
            #PRINT OUT W's
            print("weights")
            for i in range(k):
                print(W[i], end=" ")
            print()
            print(np.sum(np.abs(W)))  # sanity check: should be 1.0 after normalizing
            #compute an epsilon-optimal policy for the MDP
            #with R(s) = W_norm ^T \phi(s)
            reward_fn = rbf.RbfReward(self.rbf_fn, W, self.env)

            value_fn = ValueFunction(self.alpha, self.num_tilings)
            self.value_fns.append(value_fn)
            print("solving mdp with sarsa semigrad")
            self.mdp_solver(self.env, value_fn, reward_fn)

            #debug watch policy
            #evaluate_policy(self.env, 4, value_fn)

            #compute an epsilon-good estimate of expected feature counts
            fcounts_pi = rbf.get_expected_feature_counts(
                self.num_rollouts, self.rbf_fn, value_fn, self.env,
                self.discount)
            print("expected f counts")
            for f in fcounts_pi:
                print(f, end=" ")
            print()

            #update W values
            #calculate tildeG
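            # multiplicative-weights update: each weight is scaled by
            # beta**(mu_pi[i] - mu_E[i]) with beta < 1, so features on which the
            # learned policy already exceeds the demonstrations get down-weighted
            # and reward mass shifts toward the features it under-achieves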
            fcount_diffs = np.asarray(fcounts_pi) - np.asarray(emp_f_counts)
            W = W * (beta**fcount_diffs)
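        # MWAL's guarantee holds for the mixture of these per-iteration policies,
        # not for any single iterate, so the full list is returned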
        return self.value_fns
    rbf_grid_size = 8
    assert skip_time < reps

    env = gym.make('MountainCar-v0')
    env.seed(seed)
    random.seed(seed)

    numOfTilings = 8
    alpha = 0.5
    n = 1

    # use optimistic initial value, so it's ok to set epsilon to 0
    EPSILON = 0
    discount = 0.999  #using high discount factor

    valueFunction = ValueFunction(alpha, numOfTilings)

    ##feature map
    features = []
    #    centers = np.array([[0.0, 0.0], [0.0, 0.2], [0.0, 0.4], [0.0, 0.6], [0.0, 0.8], [0.0, 1.0],
    #                        [0.25, 0.0], [0.25, 0.25], [0.25, 0.5], [0.25, 0.75], [0.25, 1.0],
    #                        [0.5, 0.0], [0.5, 0.25], [0.5, 0.5], [0.5, 0.75], [0.5, 1.0],
    #                        [0.75, 0.0], [0.75, 0.25], [0.75, 0.5], [0.75, 0.75], [0.75, 1.0],
    #                        [1.0, 0.0], [1.0, 0.25], [1.0, 0.5], [1.0, 0.75], [1.0, 1.0]])
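    # build an rbf_grid_size x rbf_grid_size grid of RBF centers over the
    # (presumably normalized) 2-D MountainCar state space (position, velocity);
    # the hand-coded centers above are an earlier layout kept for reference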
    centers = generate_grid_centers(rbf_grid_size)
    print(centers)
    widths = 0.15 * np.ones(len(centers))

    rbfun = RBF(centers, widths, env.action_space.n)
    fMap = Rbf_2D_Feature_Map(rbfun)
    def get_opt_policy(self,
                       demos,
                       num_features,
                       confidence,
                       num_steps,
                       step_size,
                       time_limit=1000):
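        # Bayesian IRL via Metropolis-Hastings: random-walk proposals over
        # L1-normalized reward weights W, an MDP solve per proposal, and an
        # accept/reject test on the demonstration likelihood. Tracks and returns
        # the MAP value function and reward function found during the chain.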

        #initialize W
        W = 2 * np.random.random(num_features) - 1
        #normalize
        W = W / np.sum(np.abs(W))
        print("initial weights")
        #for i in range(num_features):
        #    print(W[i], end = ",")
        #print()
        #solve for optimal policy with R = W^T \phi(S)
        reward_fn = rbf.RbfReward(self.rbf_fn, W, self.env)
        value_fn = ValueFunction(self.alpha, self.num_tilings)
        print("solving mdp with sarsa semigrad")
        self.mdp_solver(self.env, value_fn, reward_fn, max_time=time_limit)
        likelihood_prev = self.compute_likelihood(value_fn, demos, confidence)
        print("initial likelihood", likelihood_prev)
        #initialize variables to keep track of MAP
        map_value_fn = value_fn
        map_reward_fn = reward_fn
        map_likelihood = likelihood_prev

        accept_cnt = 0

        for t in range(num_steps):
            print("---- BIRL iteration : ", t)
            #randomly tweak weights
            W_new = W + step_size * (2 * np.random.random(num_features) - 1)
            #renormalize
            W_new = W_new / np.sum(np.abs(W_new))
            #PRINT OUT W's
            print("new weights")
            #for i in range(num_features):
            #    print(W_new[i], end = ",")
            #print()
            #compute an epsilon-optimal policy for the MDP
            #with R(s) = W ^T \phi(s)
            reward_fn_new = rbf.RbfReward(self.rbf_fn, W_new, self.env)

            value_fn_new = ValueFunction(self.alpha, self.num_tilings)
            print("solving mdp with sarsa semigrad")
            self.mdp_solver(self.env,
                            value_fn_new,
                            reward_fn_new,
                            max_time=time_limit)

            likelihood_new = self.compute_likelihood(value_fn_new, demos,
                                                     confidence)
            print("new likelihood", likelihood_new)

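            # Metropolis acceptance test, assuming compute_likelihood returns a
            # log-likelihood: accept with probability min(1, exp(new - old))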
            if np.random.rand() < min(
                    1, np.exp(likelihood_new - likelihood_prev)):
                print("accept")
                accept_cnt += 1
                likelihood_prev = likelihood_new
                value_fn = value_fn_new
                if likelihood_new > map_likelihood:
                    map_likelihood = likelihood_new
                    map_value_fn = value_fn_new
                    map_reward_fn = reward_fn_new
                    print("updating best")
                    print("best likelihood", map_likelihood)

        print("num accepts = ", accept_cnt)
        return map_value_fn, map_reward_fn
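    # Minimal usage sketch (hypothetical names and values, for illustration only;
    # the owning class and helpers are not shown in this excerpt):
    #
    #   birl = BirlSolver(...)                    # hypothetical constructor
    #   demos = load_demonstrations(env)          # hypothetical helper
    #   value_fn, reward_fn = birl.get_opt_policy(
    #       demos, num_features=64, confidence=10.0,
    #       num_steps=200, step_size=0.05)
    #   # value_fn / reward_fn are the MAP estimates found during the MCMC walk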