def get_opt_policy(self, emp_f_counts, learning_rate, num_steps):
    k = len(emp_f_counts)
    # initialize W
    W = np.zeros(k)
    for t in range(num_steps):
        print("---- MaxEnt iteration : ", t)

        # print out the current weights
        print("weights")
        for i in range(k):
            print(W[i], end=",")
        print()

        # compute an epsilon-optimal policy for the MDP
        # with R(s) = W^T \phi(s)
        reward_fn = rbf.RbfReward(self.rbf_fn, W, self.env)
        value_fn = ValueFunction(self.alpha, self.num_tilings)
        print("solving mdp with sarsa semigrad")
        self.mdp_solver(self.env, value_fn, reward_fn, max_time=500)

        # debug: watch policy
        # evaluate_policy(self.env, 4, value_fn)

        # compute an epsilon-good estimate of expected feature counts
        fcounts_pi = rbf.get_expected_feature_counts_softmax(
            self.num_rollouts, self.rbf_fn, value_fn, self.env, self.discount)
        print("expected f counts")
        for f in fcounts_pi:
            print(f, end=",")
        print()

        # print the magnitude of the gradient
        grad_mag = 0.0
        for i in range(k):
            grad_mag += (emp_f_counts[i] - fcounts_pi[i])**2
        print("Gradient = ", math.sqrt(grad_mag))

        # gradient-ascent update on W: step toward the empirical feature counts
        for i in range(k):
            W[i] += learning_rate * (emp_f_counts[i] - fcounts_pi[i])
    return value_fn
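# Minimal, self-contained sketch (assumes only NumPy; the function name and example
# feature counts below are illustrative, not part of the repo) of the weight update
# the loop above performs: the MaxEnt gradient with respect to W is the difference
# between the empirical feature counts mu_E and the learner's expected feature
# counts mu_pi, so the per-element loop is equivalent to one vectorized step.
import numpy as np

def maxent_weight_step(W, mu_E, mu_pi, learning_rate):
    """One gradient-ascent step on the MaxEnt objective; also returns the
    gradient magnitude that the loop above prints as a convergence check."""
    grad = np.asarray(mu_E) - np.asarray(mu_pi)
    return W + learning_rate * grad, np.linalg.norm(grad)

# example usage with made-up feature counts
W, grad_mag = maxent_weight_step(np.zeros(3), [0.5, 0.2, 0.1], [0.3, 0.4, 0.0], 0.1)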
def get_opt_policy(self, emp_f_counts, T):
    k = len(emp_f_counts)
    beta = 1.0 / (1.0 + np.sqrt(2.0 * np.log(k) / T))
    # initialize W
    W = np.ones(k)
    for t in range(1, T + 1):
        print("---- MWAL iteration : ", t)

        # normalize the W's
        W = W / np.sum(W)

        # print out the current (normalized) weights
        print("weights")
        for i in range(k):
            print(W[i], end=",")
        print()
        print(np.sum(np.abs(W)))

        # compute an epsilon-optimal policy for the MDP
        # with R(s) = W_norm^T \phi(s)
        reward_fn = rbf.RbfReward(self.rbf_fn, W, self.env)
        value_fn = ValueFunction(self.alpha, self.num_tilings)
        self.value_fns.append(value_fn)
        print("solving mdp with sarsa semigrad")
        self.mdp_solver(self.env, value_fn, reward_fn)

        # debug: watch policy
        # evaluate_policy(self.env, 4, value_fn)

        # compute an epsilon-good estimate of expected feature counts
        fcounts_pi = rbf.get_expected_feature_counts(
            self.num_rollouts, self.rbf_fn, value_fn, self.env, self.discount)
        print("expected f counts")
        for f in fcounts_pi:
            print(f, end=",")
        print()

        # multiplicative-weights update: W_i <- W_i * beta^(mu_pi[i] - mu_E[i])
        fcount_diffs = np.array(fcounts_pi) - np.array(emp_f_counts)
        W = W * (beta**fcount_diffs)
    return self.value_fns
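# Self-contained sketch (assumes only NumPy; helper name and sample numbers are
# illustrative) of the MWAL bookkeeping above: beta is the rate from the
# multiplicative-weights analysis, and each feature weight is scaled by beta raised
# to the per-feature difference mu_pi[i] - mu_E[i]. The normalization that the loop
# above performs at the top of the next iteration is folded into the helper here.
import numpy as np

def mwal_weight_update(W, mu_pi, mu_E, beta):
    """One multiplicative-weights update; returns W renormalized to sum to 1."""
    G = np.asarray(mu_pi) - np.asarray(mu_E)
    W = W * np.power(beta, G)
    return W / np.sum(W)

# example usage with made-up feature counts
k, T = 4, 20
beta = 1.0 / (1.0 + np.sqrt(2.0 * np.log(k) / T))
W = mwal_weight_update(np.ones(k), [0.2, 0.1, 0.4, 0.3], [0.3, 0.1, 0.2, 0.4], beta)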
rbf_grid_size = 8
assert (skip_time < reps)

env = gym.make('MountainCar-v0')
env.seed(seed)
random.seed(seed)

numOfTilings = 8
alpha = 0.5
n = 1

# use optimistic initial value, so it's ok to set epsilon to 0
EPSILON = 0
discount = 0.999  # using high discount factor

valueFunction = ValueFunction(alpha, numOfTilings)

## feature map
features = []
# centers = np.array([[0.0, 0.0], [0.0, 0.2], [0.0, 0.4], [0.0, 0.6], [0.0, 0.8], [0.0, 1.0],
#                     [0.25, 0.0], [0.25, 0.25], [0.25, 0.5], [0.25, 0.75], [0.25, 1.0],
#                     [0.5, 0.0], [0.5, 0.25], [0.5, 0.5], [0.5, 0.75], [0.5, 1.0],
#                     [0.75, 0.0], [0.75, 0.25], [0.75, 0.5], [0.75, 0.75], [0.75, 1.0],
#                     [1.0, 0.0], [1.0, 0.25], [1.0, 0.5], [1.0, 0.75], [1.0, 1.0]])
centers = generate_grid_centers(rbf_grid_size)
print(centers)
widths = 0.15 * np.ones(len(centers))
rbfun = RBF(centers, widths, env.action_space.n)
fMap = Rbf_2D_Feature_Map(rbfun)
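# Hedged sketch of the grid-center helper called above; the actual
# generate_grid_centers is assumed to live elsewhere in the repo. It is taken to
# return an n*n grid of 2-D RBF centers evenly spaced over the normalized
# (position, velocity) square [0, 1] x [0, 1], matching the hand-coded centers
# that are commented out above.
import numpy as np

def generate_grid_centers(n):
    """Return an (n*n, 2) array of RBF centers on an evenly spaced grid over [0, 1]^2."""
    xs = np.linspace(0.0, 1.0, n)
    return np.array([[x, y] for x in xs for y in xs])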
def get_opt_policy(self, demos, num_features, confidence, num_steps,
                   step_size, time_limit=1000):
    # initialize W uniformly at random in [-1, 1] and L1-normalize
    W = 2 * np.random.random(num_features) - 1
    W = W / np.sum(np.abs(W))
    print("initial weights")
    # for i in range(num_features):
    #     print(W[i], end=",")
    # print()

    # solve for an optimal policy with R(s) = W^T \phi(s)
    reward_fn = rbf.RbfReward(self.rbf_fn, W, self.env)
    value_fn = ValueFunction(self.alpha, self.num_tilings)
    print("solving mdp with sarsa semigrad")
    self.mdp_solver(self.env, value_fn, reward_fn, max_time=time_limit)

    likelihood_prev = self.compute_likelihood(value_fn, demos, confidence)
    print("initial likelihood", likelihood_prev)

    # initialize variables to keep track of the MAP estimate
    map_value_fn = value_fn
    map_reward_fn = reward_fn
    map_likelihood = likelihood_prev
    accept_cnt = 0

    for t in range(num_steps):
        print("---- BIRL iteration : ", t)

        # propose: randomly tweak the weights, then renormalize
        W_new = W + step_size * (2 * np.random.random(num_features) - 1)
        W_new = W_new / np.sum(np.abs(W_new))
        print("new weights")
        # for i in range(num_features):
        #     print(W_new[i], end=",")
        # print()

        # compute an epsilon-optimal policy for the MDP
        # with R(s) = W_new^T \phi(s)
        reward_fn_new = rbf.RbfReward(self.rbf_fn, W_new, self.env)
        value_fn_new = ValueFunction(self.alpha, self.num_tilings)
        print("solving mdp with sarsa semigrad")
        self.mdp_solver(self.env, value_fn_new, reward_fn_new, max_time=time_limit)

        likelihood_new = self.compute_likelihood(value_fn_new, demos, confidence)
        print("new likelihood", likelihood_new)

        # Metropolis-Hastings accept/reject on the log-likelihoods
        if np.random.rand() < min(
                1, np.exp(likelihood_new - likelihood_prev)):
            print("accept")
            accept_cnt += 1
            likelihood_prev = likelihood_new
            value_fn = value_fn_new
            W = W_new  # move the chain to the accepted proposal
            if likelihood_new > map_likelihood:
                map_likelihood = likelihood_new
                map_value_fn = value_fn_new
                map_reward_fn = reward_fn_new
                print("updating best")

    print("best likelihood", map_likelihood)
    print("num accepts = ", accept_cnt)
    return map_value_fn, map_reward_fn
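# Self-contained sketch (assumed, not necessarily the repo's compute_likelihood) of
# the standard BIRL log-likelihood that the accept/reject test above compares: each
# demonstrated (state, action) pair contributes a softmax term over the learned
# Q-values with inverse temperature `confidence`. The callable q_values and the
# default of 3 actions (MountainCar) are assumptions for illustration.
import numpy as np

def birl_log_likelihood(q_values, demos, confidence, num_actions=3):
    """q_values(s, a) -> float; demos is an iterable of (state, action) pairs."""
    ll = 0.0
    for s, a in demos:
        qs = confidence * np.array([q_values(s, b) for b in range(num_actions)])
        # log-softmax of the demonstrated action, computed stably via log-sum-exp
        ll += qs[a] - (np.max(qs) + np.log(np.sum(np.exp(qs - np.max(qs)))))
    return ll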