class Agent:
    def __init__(self, state_size, action_size, batch_size=128, gamma=0.99,
                 mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

        # Regularization weights for the policy's mean, log-std, and pre-tanh output.
        self.mean_lambda = mean_lambda
        self.std_lambda = std_lambda
        self.z_lambda = z_lambda

        self.current_value = Value(state_size).to(device)
        self.target_value = Value(state_size).to(device)
        self.softQ = soft_Q(state_size, action_size).to(device)
        self.policy = Policy(state_size, action_size).to(device)

        self.value_optimizer = optim.Adam(self.current_value.parameters(), lr=3e-4)
        self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)

    def act(self, state):
        # state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.policy.act(state)

        # Learn once enough transitions have been collected.
        if len(self.memory) > self.batch_size:
            self.update()

        return action

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

    def update(self):
        state, action, reward, next_state, done = self.memory.sample()

        expected_soft_q_value = self.softQ.forward(state, action)
        expected_value = self.current_value.forward(state)
        new_action, log_prob, z, mean, log_std = self.policy.evaluate(state)

        # Soft Q target: r + gamma * V_target(s') for non-terminal transitions.
        target_value = self.target_value.forward(next_state)
        next_soft_q_value = reward + self.gamma * target_value * (1 - done)
        q_val_mse = F.mse_loss(expected_soft_q_value, next_soft_q_value.detach())

        # Value target: Q(s, a_new) - log pi(a_new | s) for freshly sampled actions.
        expected_new_q_val = self.softQ.forward(state, new_action)
        next_value = expected_new_q_val - log_prob
        val_loss = F.mse_loss(expected_value, next_value.detach())

        # Policy loss, plus regularization of the mean, log-std, and pre-tanh activation.
        log_prob_target = expected_new_q_val - expected_value
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        mean_loss = self.mean_lambda * mean.pow(2).mean()
        std_loss = self.std_lambda * log_std.pow(2).mean()
        z_loss = self.z_lambda * z.pow(2).sum(1).mean()
        policy_loss += mean_loss + std_loss + z_loss

        self.soft_q_optimizer.zero_grad()
        q_val_mse.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        val_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Polyak-average the current value network into the target value network.
        self.soft_update(self.current_value, self.target_value, TAU)

    def soft_update(self, local_model, target_model, transfer_rate):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(transfer_rate * local_param.data +
                                    (1.0 - transfer_rate) * target_param.data)
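# A minimal training-loop sketch for the Agent above. The environment name
# ('Pendulum-v1'), episode count, and step limit are illustrative placeholders,
# and it assumes BUFFERSIZE, TAU, device, and the network classes are defined
# as in the surrounding code.
import gym

env = gym.make('Pendulum-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
agent = Agent(state_size, action_size)

for episode in range(200):
    state = env.reset()
    episode_return = 0.0
    for t in range(500):
        action = agent.act(state)                      # also triggers a learning step
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, done)
        episode_return += reward
        state = next_state
        if done:
            break
    print('Episode {}: return {:.2f}'.format(episode, episode_return))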
def main(args):
    policy = Cvae()
    policy_optimizer = optim.Adam(policy.parameters(), lr=args.lr_cvae)
    value_network = Value()
    value_optimizer = optim.Adam(value_network.parameters(), lr=args.lr_value)
    mse_loss = nn.MSELoss()

    env = gym.make('Acrobot-v1')
    time_str, trained_model = cvae_policy_train(env, policy, value_network,
                                                policy_optimizer, value_optimizer,
                                                mse_loss, args)

    # Test the trained model using argmax.
    env = gym.make('Acrobot-v1')

    if args.record:
        # Store results from different runs of the model separately.
        results_directory = ''.join(['/tmp', args.directory_name, '/test/', time_str,
                                     '_discounting_', str(args.gamma),
                                     '_update_frequency_', str(args.update_frequency),
                                     '_value_update_times_', str(args.value_update_times)])
        env = gym.wrappers.Monitor(env, results_directory)

    if not args.cuda:
        plt.ion()

    test_returns = []
    for i in range(args.test_time):
        state_ = env.reset()
        done = False
        cumulative_return = 0

        for timestep in range(0, 500):
            if not done:
                if not args.cuda:
                    env.render()

                # Pad the observation with zeros to match the network's expected input width.
                state_ = th.from_numpy(state_.reshape(1, -1))
                state = Variable(state_, requires_grad=False).type(Tensor)
                padding = Variable(th.zeros(1, 3), requires_grad=False).type(Tensor)
                state_padded = th.cat([state, padding], 1)

                # Greedy action: argmax over the policy's output distribution.
                _, _, p = trained_model.forward(state_padded)
                action = th.max(p, 1)[1].data[0]

                next_state_, reward_, done, info_ = env.step(action)
                cumulative_return += (args.gamma ** timestep) * reward_
                state_ = next_state_

        test_returns.append(cumulative_return)
        print('====> Cumulative return: {}'.format(cumulative_return))

        # Update the returns plot after every test episode.
        plt.clf()
        plt.figure(1)
        plt.xlabel('episodes')
        plt.ylabel('cumulative returns')
        plt.plot(test_returns)
        plt.show()
        plt.savefig(''.join(['cvae/test/', time_str,
                             '_discounting_', str(args.gamma),
                             '_update_frequency_', str(args.update_frequency),
                             '_value_update_times_', str(args.value_update_times)]) + '.png')

    if not args.cuda:
        plt.ioff()
        plt.close()

    env.close()
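# A hedged sketch of the argument parser main(args) expects. Only the flags that
# main() itself reads are listed; cvae_policy_train() may require additional ones,
# and every default value below is an illustrative assumption, not the original setting.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='CVAE policy on Acrobot-v1')
    parser.add_argument('--lr_cvae', type=float, default=1e-3)
    parser.add_argument('--lr_value', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--update_frequency', type=int, default=1)
    parser.add_argument('--value_update_times', type=int, default=1)
    parser.add_argument('--test_time', type=int, default=10)
    parser.add_argument('--directory_name', type=str, default='/cvae')
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--cuda', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())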