def train(N, T, delta, env):
    """
    :param N: number of trajectories to sample in each time step
    :param T: number of iterations to train the model
    :param delta: trust region size
    :param env: the environment for the policy to learn
    :return: theta: the trained model parameters
             avg_episodes_rewards: list of average rewards for each time step
    """
    theta = np.random.rand(C.extracted_feature_size, 1)
    episode_rewards = []  #record the reward during the training process
    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        trajectories_grads, trajectories_rewards = sample(theta, env, N)
        total_reward = 0
        for trajectory_index in range(0, len(trajectories_rewards)):
            current_reward = trajectories_rewards[trajectory_index]
            total_reward += np.sum(current_reward)
        total_reward = total_reward / len(trajectories_rewards)
        print('total reward is', total_reward, 'and this is training epoch', iteration_index)
        episode_rewards.append(total_reward)

        #gradient of the value function
        value_function_gradient = utils.compute_value_gradient(
            trajectories_grads, trajectories_rewards)
        #fisher matrix
        fisher_matrix = utils.compute_fisher_matrix(trajectories_grads)
        #step size
        step_size = utils.compute_eta(delta, fisher_matrix, value_function_gradient)
        #update theta
        theta += step_size * np.linalg.inv(fisher_matrix) @ value_function_gradient

    #save the learned parameter theta
    learned_parameter_theta = {}
    learned_parameter_theta['learned_parameter_theta'] = theta
    cwd = os.getcwd()
    #cwd = os.path.join(cwd, 'data_folder')
    parameter_file = 'learned_parameter_theta.json'
    cwd = os.path.join(cwd, parameter_file)
    with open(cwd, 'w') as statusFile:
        statusFile.write(jsonpickle.encode(learned_parameter_theta))

    return theta, episode_rewards
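# Reloading the checkpoint written above is not part of the original code; the helper below is a
# minimal sketch of how it could be done, assuming the file was written with jsonpickle exactly as
# in train() and that the same jsonpickle configuration is active when reading it back.
import os
import jsonpickle
import numpy as np


def load_learned_theta(file_name='learned_parameter_theta.json'):
    """Read the theta checkpoint saved by train() and return it as a numpy array."""
    path = os.path.join(os.getcwd(), file_name)
    with open(path, 'r') as status_file:
        checkpoint = jsonpickle.decode(status_file.read())
    return np.asarray(checkpoint['learned_parameter_theta'])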
def train(N, T, delta):
    """
    :param N: number of trajectories to sample in each time step
    :param T: number of iterations to train the model
    :param delta: trust region size
    :return: theta: the trained model parameters
             avg_episodes_rewards: list of average rewards for each time step
    """
    theta = np.random.rand(200, 1)
    env = simple_continuous_buy_sell_spy.simple_continuous_buy_sell_spy()
    episode_rewards = []
    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        trajectories_grads, trajectories_reward = sample(theta, env, N)
        total_reward = 0
        for trajectory_index in range(0, len(trajectories_reward)):
            current_reward = trajectories_reward[trajectory_index]
            total_reward += np.sum(current_reward)
        total_reward = total_reward / len(trajectories_reward)
        print('average reward for iteration', iteration_index, 'is', total_reward)
        episode_rewards.append(total_reward)

        #gradient of the value function
        value_function_gradient = utils.compute_value_gradient(
            trajectories_grads, trajectories_reward)
        #fisher matrix
        fisher_matrix = utils.compute_fisher_matrix(trajectories_grads)
        #step size
        step_size = utils.compute_eta(delta, fisher_matrix, value_function_gradient)
        #update theta, skipping the step if it contains NaN or inf entries
        update_theta = step_size * np.linalg.inv(fisher_matrix) @ value_function_gradient
        if np.isfinite(update_theta).all():
            theta += update_theta

    return theta, episode_rewards
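# The trading environment simple_continuous_buy_sell_spy is not shown in this file. The stub below
# is only an assumption about the interface that sample() presumably relies on (a gym-style
# reset()/step(action) pair); the observation size, reward, and episode length are placeholders,
# not the real SPY buy/sell environment.
import numpy as np


class DummyBuySellEnv:
    """Hypothetical stand-in for simple_continuous_buy_sell_spy, useful for smoke-testing train()."""

    def __init__(self, observation_size=200, episode_length=50):
        self.observation_size = observation_size
        self.episode_length = episode_length
        self.step_count = 0

    def reset(self):
        self.step_count = 0
        return np.random.rand(self.observation_size)

    def step(self, action):
        self.step_count += 1
        observation = np.random.rand(self.observation_size)
        reward = -float(np.sum(np.abs(action)))  #placeholder reward, not a trading P&L
        done = self.step_count >= self.episode_length
        return observation, reward, done, {}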
def train(N, T, delta):
    """
    :param N: number of trajectories to sample in each time step
    :param T: number of iterations to train the model
    :param delta: trust region size
    :return: theta: the trained model parameters
             avg_episodes_rewards: list of average rewards for each time step
    """
    theta = np.random.rand(100, 1)
    env = gym.make('CartPole-v0')
    env.seed(12345)
    episode_rewards = []
    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        trajectories_grads, trajectories_reward = sample(theta, env, N)
        total_reward = 0
        for trajectory_index in range(0, len(trajectories_reward)):
            current_reward = trajectories_reward[trajectory_index]
            total_reward += np.sum(current_reward)
        total_reward = total_reward / len(trajectories_reward)
        print('total reward is', total_reward)
        episode_rewards.append(total_reward)

        #gradient of the value function
        value_function_gradient = utils.compute_value_gradient(
            trajectories_grads, trajectories_reward)
        #fisher matrix
        fisher_matrix = utils.compute_fisher_matrix(trajectories_grads)
        #step size
        step_size = utils.compute_eta(delta, fisher_matrix, value_function_gradient)
        #update theta
        theta += step_size * np.linalg.inv(fisher_matrix) @ value_function_gradient

    return theta, episode_rewards
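# The utils module used by the train() variants above is not shown. The helpers below are a
# minimal sketch of one standard way to implement them, following the usual natural policy
# gradient quantities: the Fisher matrix as an average of outer products of per-step log-policy
# gradients, the value gradient as a reward-to-go weighted average of those gradients, and eta as
# the trust-region step size sqrt(delta / (g^T F^{-1} g + eps)). The exact conventions
# (reward-to-go vs. total return, baselines, the eps and reg terms) are assumptions and may differ
# from the project's real utils module; the replay-buffer variant further below also calls
# compute_value_gradient with a different signature, which this sketch does not cover.
import numpy as np


def compute_fisher_matrix(trajectories_grads, reg=1e-6):
    """Estimate the Fisher matrix from per-trajectory lists of log-policy gradients."""
    dim = np.asarray(trajectories_grads[0][0]).reshape(-1).shape[0]
    fisher = np.zeros((dim, dim))
    for grads in trajectories_grads:            #one list of per-step gradients per trajectory
        per_trajectory = np.zeros((dim, dim))
        for grad in grads:
            grad = np.asarray(grad).reshape(dim, 1)
            per_trajectory += grad @ grad.T
        fisher += per_trajectory / len(grads)
    fisher /= len(trajectories_grads)
    return fisher + reg * np.eye(dim)           #small ridge term keeps the matrix invertible


def compute_value_gradient(trajectories_grads, trajectories_rewards):
    """Policy gradient estimate weighting each step's gradient by its reward-to-go."""
    dim = np.asarray(trajectories_grads[0][0]).reshape(-1).shape[0]
    v_grad = np.zeros((dim, 1))
    for grads, rewards in zip(trajectories_grads, trajectories_rewards):
        rewards = np.asarray(rewards, dtype=float)
        for t, grad in enumerate(grads):
            reward_to_go = np.sum(rewards[t:])
            v_grad += np.asarray(grad).reshape(dim, 1) * reward_to_go
    return v_grad / len(trajectories_grads)


def compute_eta(delta, fisher_matrix, value_function_gradient, eps=1e-6):
    """Step size that keeps the natural gradient update inside a trust region of size delta."""
    quad = value_function_gradient.T @ np.linalg.inv(fisher_matrix) @ value_function_gradient
    return np.sqrt(delta / (quad.item() + eps))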
    soln_fisher = tests_info[i]['fisher']
    fisher = utils.compute_fisher_matrix(total_grads)
    err = np.linalg.norm(soln_fisher - fisher)
    print('test {} for compute_fisher_matrix - error = {}'.format(i, err))

""" ------------- testing compute_value_gradient ----------------"""
print('-' * 10 + ' testing compute_value_gradient ' + '-' * 10)
for i in test_cases:
    total_grads = tests_info[i]['total_grads']
    total_rewards = tests_info[i]['total_rewards']
    soln_v_grad = tests_info[i]['v_grad']
    #print('the solution grad is', soln_v_grad.tolist())
    v_grad = utils.compute_value_gradient(total_grads, total_rewards)
    #print('the computed grad is', v_grad.tolist())
    err = np.linalg.norm(soln_v_grad - v_grad)
    print('test {} for compute_value_gradient - error = {}'.format(i, err))

""" ------------- testing compute_eta ----------------"""
print('-' * 10 + ' testing compute_eta ' + '-' * 10)
for i in test_cases:
    fisher = tests_info[i]['fisher']
    delta = 1e-2
    v_grad = tests_info[i]['v_grad']
    soln_eta = tests_info[i]['eta']
    eta = utils.compute_eta(delta, fisher, v_grad)
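# The checks above compare utils against precomputed solutions stored in tests_info. As an extra,
# self-contained sanity check, compute_eta can also be compared against the closed-form
# trust-region step size sqrt(delta / (g^T F^{-1} g)). The F and g below are synthetic inputs, and
# the real utils implementation may add a small epsilon to the denominator, so the commented
# assertion uses a loose tolerance.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))
F = A @ A.T + 5 * np.eye(5)            #synthetic symmetric positive-definite "Fisher" matrix
g = rng.standard_normal((5, 1))        #synthetic value gradient
delta_check = 1e-2
expected_eta = np.sqrt(delta_check / (g.T @ np.linalg.inv(F) @ g).item())
print('closed-form eta on synthetic inputs:', expected_eta)
#assert np.isclose(utils.compute_eta(delta_check, F, g), expected_eta, rtol=1e-2)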
def train(N, T, delta, env):
    """
    :param N: number of trajectories to sample in each time step
    :param T: number of iterations to train the model
    :param delta: trust region size
    :param env: the environment for the policy to learn
    :return: theta: the trained model parameters
    """
    theta = np.random.rand(C.extracted_feature_size, 1)

    #cov matrix for the exploration part of sampling
    variance = torch.full(size=(C.output_dim, ), fill_value=C.variance_for_exploration)
    cov_matrix = torch.diag(variance)

    #inv_cov_matrix for computing log grad of action distribution
    inv_cov_matrix_diag = np.ones(C.output_dim) * (1.0 / C.variance_for_exploration)
    inv_cov_matrix = np.diag(inv_cov_matrix_diag)

    replay_buffer = []
    replay_buffer_rewards = []
    optimization_history_list = []
    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        replay_buffer, replay_buffer_rewards, current_batch_reward = sample(
            theta, env, N, replay_buffer, replay_buffer_rewards, cov_matrix)

        #record the optimization process
        optimization_history_list.append(current_batch_reward)
        optimization_history = {}
        optimization_history['objective_history'] = optimization_history_list
        cwd = os.getcwd()
        #cwd = os.path.join(cwd, 'data_folder')
        parameter_file = 'optimization_history.json'
        cwd = os.path.join(cwd, parameter_file)
        with open(cwd, 'w') as statusFile:
            statusFile.write(jsonpickle.encode(optimization_history))

        print('this is training epoch', iteration_index)
        print('the current reward is', current_batch_reward)

        for _ in range(0, C.max_offline_training):
            #sample experience from the replay buffer for training
            # new_replay_buffer_rewards = []
            # for entry in replay_buffer_rewards:
            #     new_replay_buffer_rewards.append(np.log(entry*-1)*-1)  #because the reward is negative here
            # sample_probability = (np.exp(new_replay_buffer_rewards))/np.sum(np.exp(new_replay_buffer_rewards))  #apply softmax to the total_reward list
            sampled_off_line_data = []
            for sample_counter in range(0, C.batch_size):
                #sampled_index = np.random.choice(np.arange(0, len(replay_buffer)), p=sample_probability.tolist())
                sampled_index = random.randint(0, len(replay_buffer) - 1)
                sampled_off_line_data.append(replay_buffer[sampled_index])

            #update model
            #gradient of the value function
            value_function_gradient, grads_for_fisher_matrix = utils.compute_value_gradient(
                sampled_off_line_data, theta, cov_matrix, inv_cov_matrix)
            #fisher matrix
            fisher_matrix = utils.compute_fisher_matrix(grads_for_fisher_matrix)
            #step size
            step_size = utils.compute_eta(delta, fisher_matrix, value_function_gradient)
            #update theta
            theta += step_size * np.linalg.inv(fisher_matrix) @ value_function_gradient

        #save the learned parameter theta
        learned_parameter_theta = {}
        learned_parameter_theta['learned_parameter_theta'] = theta
        cwd = os.getcwd()
        #cwd = os.path.join(cwd, 'data_folder')
        parameter_file = 'learned_parameter_theta.json'
        cwd = os.path.join(cwd, parameter_file)
        with open(cwd, 'w') as statusFile:
            statusFile.write(jsonpickle.encode(learned_parameter_theta))

    return theta
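# The replay-buffer variant above reads every hyperparameter from a config module C that is not
# included here. The sketch below lists just the attributes the function actually references;
# every value is a placeholder, not a setting from the original project.
class C:
    extracted_feature_size = 200       #dimension of the extracted state features (rows of theta)
    output_dim = 1                     #dimension of the continuous action
    variance_for_exploration = 0.1     #diagonal variance of the Gaussian exploration noise
    max_offline_training = 10          #off-policy update steps per sampling round
    batch_size = 32                    #trajectories drawn from the replay buffer per update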