def sample_one_reward(theta, env, num_actions):
    this_trajectory_reward = []
    this_trajectory_grads = []
    # first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, num_actions)
    for time_index in range(0, 200):
        # compute the action distribution given the current observation
        action_distribution = utils.compute_action_distribution(theta, current_feature)
        # sample the action stochastically; taking argmax here would not be correct
        action = np.random.choice(num_actions, 1, p=action_distribution[0])[0]
        # apply the action to the environment
        observation, reward, done, info = env.step(action)
        this_trajectory_reward.append(reward)
        log_softmax_grad = utils.compute_log_softmax_grad(theta, current_feature, action)
        this_trajectory_grads.append(log_softmax_grad)
        current_feature = utils.extract_features(observation, num_actions)
        if done:
            break
    return this_trajectory_reward, this_trajectory_grads
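# --- Hedged sketch (not the actual utils implementation) ---------------------
# The sampler above assumes a softmax policy: utils.extract_features returns a
# feature matrix phis whose columns are state-action features, and the two
# utils helpers compute pi(.|s) and grad_theta log pi(a|s). The shapes used
# below (phis: d x num_actions, theta: d x 1) are assumptions for illustration.
import numpy as np

def compute_action_distribution_sketch(theta, phis):
    """Softmax over the scores theta^T phi(s, a); returns shape (1, num_actions)."""
    logits = theta.T @ phis                  # shape (1, num_actions)
    logits = logits - np.max(logits)         # subtract the max for numerical stability
    exp_logits = np.exp(logits)
    return exp_logits / np.sum(exp_logits)

def compute_log_softmax_grad_sketch(theta, phis, action_idx):
    """grad_theta log pi(a|s) = phi(s, a) - E_{a'~pi}[phi(s, a')]; shape (d, 1)."""
    probs = compute_action_distribution_sketch(theta, phis)   # (1, num_actions)
    expected_feature = phis @ probs.T                          # (d, 1)
    return phis[:, [action_idx]] - expected_feature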
def sample_one_trajectory(theta, env):
    this_trajectory_reward = []
    this_trajectory_grads = []
    # first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)
    for time_index in range(0, 200):
        # compute an action given the current observation
        action = utils.compute_action_distribution(theta, current_feature, mode='train')
        # apply the action to the environment
        observation, reward, done, info = env.step(action[0])
        # record the reward and the log-policy gradient
        this_trajectory_reward.append(reward)
        computed_grad_log_state_action = utils.compute_log_grad(theta, current_feature, action)
        this_trajectory_grads.append(computed_grad_log_state_action)
        current_feature = utils.extract_features(observation, C.output_dim)
        if done:
            break
    return this_trajectory_reward, this_trajectory_grads
def sample_one_trajectory(theta, env, cov_matrix):
    this_trajectory_rewards = []
    this_trajectory_states = []
    this_trajectory_actions = []
    this_trajectory_log_prob_action_states = []
    # first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)
    for time_index in range(0, 200):
        # compute an action given the current observation
        action, log_prob = utils.compute_action_distribution(theta, current_feature, cov_matrix, mode='train')
        # apply the action to the environment
        observation, reward, done, info = env.step(action)
        # record the reward, state, action, and log-probability
        this_trajectory_rewards.append(reward)
        this_trajectory_states.append(current_feature)
        this_trajectory_actions.append(action[0])
        this_trajectory_log_prob_action_states.append(log_prob)
        # next time step
        current_feature = utils.extract_features(observation, C.output_dim)
        if done:
            break
    sample_dict = {}
    sample_dict['rewards'] = this_trajectory_rewards
    sample_dict['states'] = this_trajectory_states
    sample_dict['actions'] = this_trajectory_actions
    sample_dict['log_prob'] = this_trajectory_log_prob_action_states
    total_reward = np.sum(this_trajectory_rewards)
    return sample_dict, total_reward
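# --- Hedged sketch (not the actual utils implementation) ---------------------
# The sampler above assumes a Gaussian policy for a continuous action space:
# the mean is linear in the state features and cov_matrix is a fixed
# covariance. In 'train' mode compute_action_distribution is assumed to sample
# an action and also return its log-density; the shapes used here are
# assumptions for illustration only.
import numpy as np

def sample_gaussian_action_sketch(theta, phi, cov_matrix):
    """Sample a ~ N(theta^T phi, cov_matrix) and return (action, log_prob)."""
    mean = (theta.T @ phi).flatten()                         # action-space mean, shape (k,)
    action = np.random.multivariate_normal(mean, cov_matrix)
    # log-density of the multivariate Gaussian evaluated at the sampled action
    diff = action - mean
    k = mean.shape[0]
    cov_inv = np.linalg.inv(cov_matrix)
    log_prob = -0.5 * (diff @ cov_inv @ diff
                       + k * np.log(2.0 * np.pi)
                       + np.log(np.linalg.det(cov_matrix)))
    return action, log_prob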
def sample_one_trajectory(q, theta, env):
    this_trajectory_reward = []
    this_trajectory_grads = []
    execute_sell = False
    null_objective = True
    # first, initialize the observation
    observation = env.reset()
    current_feature = utils.extract_features(observation, C.output_dim)
    while True:
        # compute an action given the current observation
        action = utils.compute_action_distribution(theta, current_feature, mode='train')
        # apply the action to the environment
        observation, reward, done, info = env.step(action[0])
        # record the log-policy gradient; rewards are assigned after the episode ends
        computed_grad_log_state_action = utils.compute_log_grad(theta, current_feature, action)
        this_trajectory_grads.append(computed_grad_log_state_action)
        if done:
            break
        current_feature = utils.extract_features(observation, C.output_dim)
    # the terminal reward is shared across every step of the trajectory
    for _ in range(0, len(this_trajectory_grads)):
        this_trajectory_reward.append(reward)
    # send the result back to the parent process through the queue
    q.put([this_trajectory_reward, this_trajectory_grads])
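# --- Hedged usage sketch ------------------------------------------------------
# This variant of sample_one_trajectory writes its result into a
# multiprocessing queue instead of returning it, so it can run in worker
# processes. A minimal way to collect several trajectories in parallel might
# look like the following; the make_env() helper is an assumption for
# illustration, not part of the original code.
import multiprocessing as mp

def sample_trajectories_parallel(theta, make_env, num_trajectories):
    q = mp.Queue()
    workers = [mp.Process(target=sample_one_trajectory, args=(q, theta, make_env()))
               for _ in range(num_trajectories)]
    for w in workers:
        w.start()
    # drain the queue before joining to avoid blocking on a full pipe
    results = [q.get() for _ in range(num_trajectories)]
    for w in workers:
        w.join()
    return results   # list of [rewards, grads] pairs, one per trajectory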
if __name__ == '__main__':
    np.random.seed(1234)
    theta, episode_rewards = train(N=100, T=20, delta=1e-2)

    # test the trained policy on CartPole
    env = gym.make('CartPole-v0')
    env.seed(12345)
    observation = env.reset()
    num_actions = 2
    current_feature = utils.extract_features(observation, num_actions)
    for t in range(200):
        env.render()
        # compute the action distribution given the current observation
        action_distribution = utils.compute_action_distribution(theta, current_feature)
        # sample the action stochastically; taking argmax would not be correct
        action = np.random.binomial(1, action_distribution[0][1], 1)[0]
        # apply the action to the environment
        observation, reward, done, info = env.step(action)
        # compute the next feature vector
        current_feature = utils.extract_features(observation, num_actions)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break

    # plot the training history
    plt.plot(episode_rewards)
    plt.title("avg rewards per timestep")
    plt.xlabel("timestep")
    plt.ylabel("avg rewards")
    plt.show()
import pickle

import numpy as np

import utils

with open('test_info.pkl', 'rb') as f:
    tests_info = pickle.load(f)

test_cases = sorted(tests_info.keys())

""" ------------- testing action distribution computation ----------------"""
print('-' * 10 + ' testing compute_action_distribution ' + '-' * 10)
for i in test_cases:
    theta = tests_info[i]['theta']
    phis = tests_info[i]['phis']
    soln_action_dist = tests_info[i]['action_dst']
    action_dist = utils.compute_action_distribution(theta, phis)
    err = np.linalg.norm(soln_action_dist - action_dist)
    print('test {} for compute_action_distribution - error = {}'.format(i, err))

""" ------------- testing compute_log_softmax_grad ----------------"""
print('-' * 10 + ' testing compute_log_softmax_grad ' + '-' * 10)
for i in test_cases:
    theta = tests_info[i]['theta']
    phis = tests_info[i]['phis']
    action = tests_info[i]['action']
    soln_grad = tests_info[i]['grad']
    grad = utils.compute_log_softmax_grad(theta, phis, action)
    err = np.linalg.norm(soln_grad - grad)
    print('test {} for compute_log_softmax_grad - error = {}'.format(i, err))
    param N: number of trajectories to sample in each training step
    param T: number of iterations to train the model
    param delta: trust region size (used because we train with TRPO)
    param env: the environment for the policy to learn
    '''

# test the training result
observation = env.reset()
current_feature = utils.extract_features(observation, C.output_dim)
for t in range(200):
    env.render()
    # compute an action given the current observation
    action = utils.compute_action_distribution(theta, current_feature, mode='test')
    # apply the action to the environment
    observation, reward, done, info = env.step(action)
    # compute the next feature vector
    current_feature = utils.extract_features(observation, C.output_dim)
    if done:
        print("Episode finished after {} timesteps".format(t + 1))
        break

# plot the training history
plt.plot(episode_rewards)
plt.title("avg rewards per timestep")
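# --- Hedged sketch of the training loop described by the docstring above ------
# This is not the actual train() implementation; it sketches one plausible
# natural-gradient / trust-region loop under the assumptions that
# sample_one_reward(theta, env, num_actions) (defined earlier) returns the
# per-step rewards and log-policy gradients of one trajectory, and that
# feature_dim and lamb (Fisher regularization) are illustrative values.
import numpy as np

def train_sketch(N, T, delta, env, num_actions=2, feature_dim=200, lamb=1e-3):
    theta = np.random.randn(feature_dim, 1) * 0.01
    episode_rewards = []
    for _ in range(T):
        policy_grads, step_log_grads, total_rewards = [], [], []
        for _ in range(N):
            rewards, traj_grads = sample_one_reward(theta, env, num_actions)
            total_rewards.append(np.sum(rewards))
            # REINFORCE-style policy-gradient estimate for this trajectory
            policy_grads.append(np.sum(rewards) * np.sum(traj_grads, axis=0))
            step_log_grads.extend(traj_grads)
        v_grad = np.mean(policy_grads, axis=0)                       # shape (d, 1)
        # Fisher matrix: average outer product of the per-step log-policy gradients
        fisher = np.mean([g @ g.T for g in step_log_grads], axis=0) + lamb * np.eye(feature_dim)
        nat_grad = np.linalg.solve(fisher, v_grad)
        # trust-region step size: eta = sqrt(delta / (g^T F^{-1} g))
        eta = np.sqrt(delta / (v_grad.T @ nat_grad + 1e-8))
        theta = theta + eta * nat_grad
        episode_rewards.append(np.mean(total_rewards))
    return theta, episode_rewards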