class A2CAgent():
    """
    init function
        input:
            env, which is the CartPole-v0 environment
            gamma, 0.99 in this case
            lr, learning rate, 1e-4
        define:
            env = env, which is the CartPole-v0 environment
            obs_dim: 4 observations
                Observation:
                    Type: Box(4)
                    Num   Observation             Min       Max
                    0     Cart Position           -4.8      4.8
                    1     Cart Velocity           -Inf      Inf
                    2     Pole Angle              -24 deg   24 deg
                    3     Pole Velocity At Tip    -Inf      Inf
            action_dim: 2 actions
                Actions:
                    Type: Discrete(2)
                    Num   Action
                    0     Push cart to the left
                    1     Push cart to the right
            value_network: two-layer network with input 4 (observation dim) and output 1 (the state-value estimate)
            policy_network: two-layer network with input 4 (observation dim) and output 2 (action dim)
            value and policy optimizers use Adam with the given learning rate
    """

    def __init__(self, env, gamma, lr):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    """
    Input a state and get the next action: the policy network outputs logits,
    softmax turns them into action probabilities, and the action is sampled
    from the resulting Categorical distribution.
    """

    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    """
    From a trajectory, extract all of the information and calculate the
    discounted rewards. The value network is trained on the states: the loss
    is the MSE between the predicted values and the target values.
    The same logic applies to the policy network, using the advantage.

    FloatTensor = float-typed tensor
    t
    tensor([[1, 2, 3],
            [4, 5, 6]])
    t.view(-1, 1)
    tensor([[1],
            [2],
            [3],
            [4],
            [5],
            [6]])
    """

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory])
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory])
        next_states = torch.FloatTensor([sars[3] for sars in trajectory])
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1)

        # compute value target
        # for each step j, sum the rewards from j onwards, each discounted by gamma^i
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:])
                              for j in range(rewards.size(0))]  # sorry, not the most readable code.
        # discounted_rewards[j] is the full return G_j, which already includes the
        # immediate reward r_j, so it is used directly as the value target
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus: H(pi) = -sum_a pi(a|s) * log pi(a|s) for each state
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    """
    zero_grad clears old gradients from the last step (otherwise you'd just
    accumulate the gradients from all loss.backward() calls).
    loss.backward() computes the derivative of the loss w.r.t. the parameters
    (or anything requiring gradients) using backpropagation.
    opt.step() causes the optimizer to take a step based on the gradients of
    the parameters.
""" def update(self, trajectory): value_loss, policy_loss = self.compute_loss(trajectory) self.value_optimizer.zero_grad() value_loss.backward() self.value_optimizer.step() self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step()
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I hard-coded the range here; for the real
        # implementation, we should pass it in as a parameter
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target params
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        '''if action < 0.5:
            return 0
        else:
            return 1'''
        # linearly map each tanh-squashed action from [-1, 1] into its action range
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            a = (a * (action_range[1] - action_range[0]) / 2.0
                 + (action_range[1] + action_range[0]) / 2.0)
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())
        # TODO: Question: why use 2 Q-networks?
        # Taking the minimum of two independently trained Q-networks (clipped
        # double-Q) reduces the overestimation bias in the value targets.
        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value net
        # TODO: Question: what does this part do?
        # The original paper mentions two ways of keeping a target value function:
        # 1. an exponential moving average (EMA) of the value network weights, or
        # 2. periodically copying the current value network weights.
        # This code updates the policy only every `delay_step` steps and, at the
        # same time, soft-updates the target value network with coefficient tau.
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks: Polyak averaging toward the current value network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
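# A minimal training-loop sketch for the SACAgent above, again as an illustration
# rather than the original training code: run_sac and its defaults are assumed
# names, the env (LunarLanderContinuous-v2) is chosen only because its 2-D action
# space in [-1, 1] matches the hard-coded action_dim and action_range above, and
# it assumes BasicBuffer provides push(state, action, reward, next_state, done)
# and __len__ alongside the sample(batch_size) used in update().
import gym


def run_sac(max_episodes=200, max_steps=500, batch_size=64):
    env = gym.make("LunarLanderContinuous-v2")
    agent = SACAgent(env, gamma=0.99, tau=0.01,
                     v_lr=3e-4, q_lr=3e-4, policy_lr=3e-4,
                     buffer_maxlen=100000)

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward

            # start off-policy updates once the buffer holds at least one batch
            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)

            state = next_state
            if done:
                break

        print(f"Episode {episode}: reward {episode_reward}")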