def __init__(self, state_size, action_size, batch_size=128, gamma=0.99,
             mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0):
    """Initialize a soft actor-critic style agent.

    Args:
        state_size: dimensionality of the observation space.
        action_size: dimensionality of the action space.
        batch_size: minibatch size sampled from the replay buffer.
        gamma: discount factor for future rewards.
        mean_lambda: regularization weight on the policy mean.
        std_lambda: regularization weight on the policy log-std.
        z_lambda: regularization weight on the pre-squash sample z.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

    # Regularization coefficients for the policy loss terms.
    self.mean_lambda = mean_lambda
    self.std_lambda = std_lambda
    self.z_lambda = z_lambda

    self.current_value = Value(state_size).to(device)
    self.target_value = Value(state_size).to(device)
    # FIX: start the target network as an exact copy of the current value
    # network; otherwise the two start from independent random weights and
    # the first target updates blend toward noise.
    self.target_value.load_state_dict(self.current_value.state_dict())

    # FIX: move the Q network and the policy to the same device as the
    # value networks (the original left them on CPU, which fails whenever
    # `device` is a GPU).
    self.softQ = soft_Q(state_size, action_size).to(device)
    self.policy = Policy(state_size, action_size).to(device)

    # One Adam optimizer per network, all with the SAC-standard 3e-4 rate.
    # Note: optimizers are created after .to(device) so they hold the
    # device-resident parameter tensors.
    self.value_optimizer = optim.Adam(self.current_value.parameters(), lr=3e-4)
    self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
    self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)
def trpo(args):
    """Train a policy on a gym environment with Trust Region Policy Optimization.

    Per episode: collects on-policy trajectories until at least
    ``args.batch_size`` steps are gathered, fits the value network with
    L-BFGS, computes GAE advantages, and takes one natural-gradient policy
    step (conjugate gradient + KL-constrained line search).

    Returns a list of {'steps', 'reward'} dicts recording the reward of each
    sampled path against the global step counter.
    """
    env = gym.make(args.env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    # Online normalization of observations (clipped at 5 std devs).
    running_state = ZFilter((num_inputs,), clip=5)
    # NOTE(review): running_reward is constructed but never applied below.
    running_reward = ZFilter((1,), demean=False, clip=10)
    reward_record = []
    global_steps = 0

    for i_episode in range(args.num_episode):
        memory = Memory()
        # sample data: single path method
        num_steps = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)
            reward_sum = 0
            for t in range(args.max_step_per_episode):
                action = select_single_action(policy_net, state)
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                next_state = running_state(next_state)
                # mask = 0 marks a terminal transition so bootstrapped
                # quantities are cut off at episode boundaries.
                mask = 0 if done else 1
                memory.push(state, action, mask, next_state, reward)
                if done:
                    break
                state = next_state
            # t + 1 = number of steps actually taken in this path.
            num_steps += (t + 1)
            global_steps += (t + 1)
            reward_record.append({'steps': global_steps, 'reward': reward_sum})

        batch = memory.sample()
        batch_size = len(memory)

        # update params
        rewards = Tensor(batch.reward)
        masks = Tensor(batch.mask)
        actions = Tensor(batch.action)
        states = Tensor(batch.state)
        values = value_net(states)

        returns = Tensor(batch_size)
        deltas = Tensor(batch_size)
        advantages = Tensor(batch_size)

        # Backward pass over the batch: discounted returns, TD residuals,
        # and GAE advantages; masks zero the recursion at episode ends.
        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(batch_size)):
            returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values[i]
            # ref: https://arxiv.org/pdf/1506.02438.pdf (generalization advantage estimate)
            # notation following PPO paper
            advantages[i] = deltas[i] + args.gamma * args.lamda * prev_advantage * masks[i]
            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]
        # Normalize advantages; EPS guards against a zero std.
        advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

        # optimize value network (full-batch L-BFGS on the flattened params)
        loss_func_args = (value_net, states, returns)
        old_loss, _ = get_value_loss(value_net.get_flat_params(), *loss_func_args)
        flat_params, opt_loss, opt_info = sciopt.fmin_l_bfgs_b(
            get_value_loss, value_net.get_flat_params(),
            args=loss_func_args, maxiter=args.value_opt_max_iter)
        value_net.set_flat_params(flat_params)
        print('ValueNet optimization: old loss = {}, new loss = {}'.format(old_loss, opt_loss))

        # optimize policy network
        # 1. find search direction for network parameter optimization, use conjugate gradient (CG)
        #    the direction can be found analytically, it s = - A^{-1} g,
        #    where A is the Fisher Information Matrix (FIM) w.r.t. action probability distribution
        #    and g is the gradient w.r.t. loss function \dfrac{\pi_\theta (a|s)}{q(a|s)} Q(s, a)
        policy_net.set_old_loss(states, actions)
        loss = policy_net.get_loss(states, actions, advantages)
        g = torch.autograd.grad(loss, policy_net.parameters())
        flat_g = torch.cat([grad.view(-1) for grad in g]).data
        # Av computes the FIM-vector product without materializing the FIM.
        Av = lambda v: policy_net.kl_hessian_times_vector(states, v)
        step_dir = conjugate_gradient(Av, - flat_g, nsteps=args.cg_nsteps)

        # 2. find maximum stepsize along the search direction
        #    the problem: min g * x  s.t.  1/2 * x^T * A * x <= delta
        #    can be solved analytically with x = beta * s
        #    where beta = sqrt(2 delta / s^T A s)
        sAs = 0.5 * (step_dir * Av(step_dir)).sum(0)
        beta = torch.sqrt(2 * args.max_kl / sAs)
        full_step = (beta * step_dir).data.numpy()

        # 3. do line search along the found direction, with maximum change = full_step
        #    the maximum change is restricted by the KL divergence constraint
        #    line search with backtracking method
        # NOTE(review): the lambda ignores its argument — get_loss is
        # evaluated on the net's current params each call.
        get_policy_loss = lambda x: policy_net.get_loss(states, actions, advantages)
        old_loss = get_policy_loss(None)
        success, new_params = line_search(policy_net, get_policy_loss, full_step, flat_g)
        policy_net.set_flat_params(new_params)
        new_loss = get_policy_loss(None)
        print('PolicyNet optimization: old loss = {}, new loss = {}'.format(old_loss, new_loss))

        if i_episode % args.log_num_episode == 0:
            print('Finished episode: {} Mean Reward: {}'.format(i_episode, reward_record[-1]))
            print('-----------------')

    # Persist both trained networks once training completes.
    policy_net.save_model_policy()
    value_net.save_model_value()
    return reward_record
def main(args):
    """Train a CVAE policy on Acrobot-v1, then evaluate it greedily.

    Training is delegated to ``cvae_policy_train``; evaluation runs
    ``args.test_time`` episodes choosing the argmax action, optionally
    recording video and live-plotting the discounted returns.
    """
    policy = Cvae()
    policy_optimizer = optim.Adam(policy.parameters(), lr=args.lr_cvae)
    value_network = Value()
    value_optimizer = optim.Adam(value_network.parameters(), lr=args.lr_value)
    mse_loss = nn.MSELoss()
    env = gym.make('Acrobot-v1')
    time_str, trained_model = cvae_policy_train(env, policy, value_network,
                                                policy_optimizer, value_optimizer,
                                                mse_loss, args)

    # Test the trained model using argmax.
    env = gym.make('Acrobot-v1')
    if args.record:
        # Store results from different runs of the model separately.
        # NOTE(review): no '/' is inserted between '/tmp' and
        # args.directory_name — presumably directory_name starts with '/';
        # verify against how callers set it.
        results_directory = ''.join(['/tmp', args.directory_name, '/test/', time_str,
                                     '_discounting_', str(args.gamma),
                                     '_update_frequency_', str(args.update_frequency),
                                     '_value_update_times_', str(args.value_update_times)])
        env = gym.wrappers.Monitor(env, results_directory)
    # Interactive plotting / rendering only when running off-GPU (local).
    if not args.cuda:
        plt.ion()

    test_returns = []
    for i in range(args.test_time):
        state_ = env.reset()
        done = False
        cumulative_return = 0
        # Fixed 500-step horizon; once done, remaining iterations are no-ops.
        for timestep in range(0, 500):
            if not done:
                if not args.cuda:
                    env.render()
                state_ = th.from_numpy(state_.reshape(1, -1))
                state = Variable(state_, requires_grad=False).type(Tensor)
                # Pad the observation with 3 zeros — presumably to match the
                # input width the CVAE was trained with; confirm against
                # cvae_policy_train.
                padding = Variable(th.zeros(1, 3), requires_grad=False).type(Tensor)
                state_padded = th.cat([state, padding], 1)
                _, _, p = trained_model.forward(state_padded)
                # Greedy action: index of the max over the action logits.
                action = th.max(p, 1)[1].data[0]
                next_state_, reward_, done, info_ = env.step(action)
                cumulative_return += (args.gamma ** timestep) * reward_
                state_ = next_state_
        test_returns.append(cumulative_return)
        print('====> Cumulative return: {}'.format(cumulative_return))
        # Redraw the running curve of returns after every test episode.
        # NOTE(review): savefig after show() relies on interactive mode
        # keeping the figure alive — confirm the saved image is not blank.
        plt.clf()
        plt.figure(1)
        plt.xlabel('episodes')
        plt.ylabel('cumulative returns')
        plt.plot(test_returns)
        plt.show()
        plt.savefig(''.join(['cvae/test/', time_str,
                             '_discounting_', str(args.gamma),
                             '_update_frequency_', str(args.update_frequency),
                             '_value_update_times_', str(args.value_update_times)]) + '.png')
    if not args.cuda:
        plt.ioff()
        plt.close()
    env.close()