class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # Fit the critic for several steps, then use it to estimate advantages
        # and take several actor (policy-gradient) steps on those advantages.
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (i.e. setting it to 0) at terminal
        #    states (terminal_n == 1)
        # 4) compute the advantage as A(s, a) = Q(s, a) - V(s)
        V_s = self.critic.forward_np(ob_no)
        V_s_prime = self.critic.forward_np(next_ob_no)
        Q_s_a = re_n + self.gamma * V_s_prime * (1 - terminal_n)
        adv_n = Q_s_a - V_s

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
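A minimal numpy sketch of the bootstrapped advantage computed in `estimate_advantage` above, using made-up rewards and critic values (none of these numbers come from the agent); it only illustrates how the terminal mask cuts off the V(s') bootstrap.

import numpy as np

gamma = 0.95

# Toy batch of 4 transitions; the third one ends an episode (terminal_n = 1).
re_n       = np.array([1.0, 1.0, 1.0, 1.0])
terminal_n = np.array([0.0, 0.0, 1.0, 0.0])
V_s        = np.array([10.0, 9.0, 8.0, 7.0])   # critic estimate of V(s)
V_s_prime  = np.array([9.0, 8.0, 0.5, 6.0])    # critic estimate of V(s')

# Q(s, a) = r + gamma * V(s'), with the bootstrap cut off at terminal states.
Q_s_a = re_n + gamma * V_s_prime * (1 - terminal_n)
adv_n = Q_s_a - V_s
print(adv_n)  # [-0.45, -0.4, -7.0, -0.3]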
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.gae = self.agent_params['gae']
        self.gae_lambda = self.agent_params['gae_lambda']
        self.ppo = self.agent_params['ppo']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
            self.agent_params['clip_eps'],
        )
        if self.ppo:
            # PPO keeps a frozen copy of the policy to evaluate the old
            # log-probabilities used by the actor update.
            self.old_actor = MLPPolicyAC(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['discrete'],
                self.agent_params['learning_rate'],
                self.agent_params['clip_eps'],
            )
            self.old_actor.load_state_dict(self.actor.state_dict())

        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # When GAE is used, re_n is a list of per-rollout reward arrays;
        # the critic update expects a single flat reward array.
        rewards = np.concatenate([r for r in re_n]) if self.gae else re_n
        assert rewards.shape == terminal_n.shape

        # for agent_params['num_critic_updates_per_agent_update'] steps,
        # update the critic
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic = self.critic.update(ob_no, ac_na, next_ob_no, rewards, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        old_log_prob = self.get_old_prob(self.old_actor, ob_no, ac_na) if self.ppo else None

        # for agent_params['num_actor_updates_per_agent_update'] steps,
        # update the actor
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor = self.actor.update(ob_no, ac_na, advantage, old_log_prob)

        if self.ppo:
            # Sync the old policy with the updated policy for the next iteration.
            self.old_actor.load_state_dict(self.actor.state_dict())

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        loss['Actor_Loss'] = loss_actor
        return loss

    def get_old_prob(self, old_policy, ob_no, ac_na):
        observations = ptu.from_numpy(ob_no)
        actions = ptu.from_numpy(ac_na)
        log_prob = old_policy.forward(observations).log_prob(actions)
        return ptu.to_numpy(log_prob)

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (i.e. setting it to 0) at terminal
        #    states (terminal_n == 1)
        # 4) compute the advantage as A(s, a) = Q(s, a) - V(s)
        v_s = self.critic.forward_np(ob_no)

        if not self.gae:
            v_s_next = self.critic.forward_np(next_ob_no) * (1 - terminal_n)
            adv_n = re_n + self.gamma * v_s_next - v_s
        else:
            # Generalized advantage estimation: re_n is a list of per-rollout
            # reward arrays, and each rollout's observations are stored
            # contiguously in ob_no, so V(s_{t+1}) is v_s[index + i + 1].
            index = 0
            adv_n = np.zeros(len(ob_no))
            for rewards in re_n:
                # One-step TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
                # with the bootstrap dropped at the final step of the rollout.
                gae_deltas = []
                for i in range(len(rewards) - 1):
                    delta = rewards[i] + self.gamma * v_s[index + i + 1] - v_s[index + i]
                    gae_deltas.append(delta)
                i = len(rewards) - 1
                gae_deltas.append(rewards[i] - v_s[index + i])
                assert len(gae_deltas) == len(rewards)

                # Backward recursion: A_t = delta_t + gamma * lambda * A_{t+1}.
                sum_deltas = 0
                for t in range(len(gae_deltas) - 1, -1, -1):
                    sum_deltas = gae_deltas[t] + sum_deltas * self.gamma * self.gae_lambda
                    adv_n[t + index] = sum_deltas
                index += len(rewards)

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        # With GAE, keep rewards grouped per rollout instead of concatenating them.
        concat_rew = not self.gae
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew)
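The PPO path above passes `clip_eps` into `MLPPolicyAC` and hands `old_log_prob` to `actor.update`, but the objective itself lives inside the policy class and is not shown here. Below is a rough standalone PyTorch sketch of the standard clipped surrogate loss such an update would typically minimize; the function name and tensors are hypothetical and do not come from this codebase.

import torch

def ppo_clipped_loss(log_prob, old_log_prob, advantages, clip_eps=0.2):
    """Standard PPO clipped surrogate loss, negated so it can be minimized.

    log_prob / old_log_prob: log pi(a|s) of the sampled actions under the
    current and the frozen old policy; advantages: estimated A(s, a).
    """
    ratio = torch.exp(log_prob - old_log_prob)                       # pi_new / pi_old
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    return -torch.min(unclipped, clipped).mean()

# Hypothetical usage with random tensors, just to show the shapes involved.
log_prob = torch.randn(64)
old_log_prob = log_prob.detach() + 0.05 * torch.randn(64)
advantages = torch.randn(64)
loss = ppo_clipped_loss(log_prob, old_log_prob, advantages)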
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super().__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.n_drivers = self.agent_params['n_drivers']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size_ac'],
            self.agent_params['shared_exp'],
            self.agent_params['shared_exp_lambda'],
            self.agent_params['is_city'],
            self.agent_params['learning_rate'],
            self.agent_params['n_drivers']
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        # for agent_params['num_critic_updates_per_agent_update'] steps,
        # update the critic
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            if not self.agent_params['shared_exp']:
                loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)
            else:
                # With shared experience, the critic update also receives the
                # actors' per-driver action distributions.
                action_distributions = self.actor.shared_forward(ptu.from_numpy(ob_no))
                loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n, action_distributions)

        # estimate the advantage
        if self.agent_params['shared_exp']:
            advantage = self.estimate_shared_advantage(ob_no, next_ob_no, re_n, terminal_n)
        else:
            advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # for agent_params['num_actor_updates_per_agent_update'] steps,
        # update the actor
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage)

        return loss

    def estimate_shared_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # Advantages for every ordered pair of drivers (i, k), combining
        # driver k's rewards with the (i, k) entry of the shared critic.
        value_s = self.critic.shared_forward(ptu.from_numpy(ob_no))
        value_next_s = self.critic.shared_forward(ptu.from_numpy(next_ob_no))
        adv_n = dict()
        for i in range(self.n_drivers):
            for k in range(self.n_drivers):
                adv_n[(i, k)] = re_n[:, k] + self.gamma * ptu.to_numpy(value_next_s[(i, k)]) - ptu.to_numpy(value_s[(i, k)])
                if self.standardize_advantages:
                    adv_n[(i, k)] = (adv_n[(i, k)] - np.mean(adv_n[(i, k)])) / (np.std(adv_n[(i, k)]) + 1e-8)
        return adv_n

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (i.e. setting it to 0) at terminal
        #    states (terminal_n == 1)
        # 4) compute the advantage as A(s, a) = Q(s, a) - V(s)
        value_s = self.critic.forward_np(ob_no)
        value_next_s = self.critic.forward_np(next_ob_no)
        value_next_s[terminal_n == 1] = 0
        adv_n = re_n + self.gamma * value_next_s - value_s

        if self.standardize_advantages:
            # Standardize each driver's advantages separately.
            for i in range(self.n_drivers):
                adv_n[:, i] = (adv_n[:, i] - np.mean(adv_n[:, i])) / (np.std(adv_n[:, i]) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
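In this multi-driver variant, `re_n` and the resulting `adv_n` are indexed as `[:, driver]`, so they are presumably arrays of shape `(batch, n_drivers)`, and `estimate_advantage` standardizes each driver's column on its own, presumably so drivers with very different reward scales contribute comparably to the actor update. A small numpy sketch of that per-column standardization; the shapes and numbers are illustrative only.

import numpy as np

n_drivers = 3
adv_n = np.array([
    [ 2.0,  50.0, -1.0],
    [ 1.0, -30.0,  0.5],
    [-1.0,  10.0,  0.0],
    [ 0.0, -10.0,  0.5],
])  # shape (batch, n_drivers); each column holds one driver's advantages

# Standardize per driver (per column), as in estimate_advantage above.
for i in range(n_drivers):
    col = adv_n[:, i]
    adv_n[:, i] = (col - col.mean()) / (col.std() + 1e-8)

print(adv_n.mean(axis=0))  # ~[0, 0, 0]
print(adv_n.std(axis=0))   # ~[1, 1, 1]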