Example #1
# Imports assume the standard cs285 homework package layout.
from collections import OrderedDict

import numpy as np

from cs285.critics.bootstrapped_continuous_critic import BootstrappedContinuousCritic
from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.policies.MLP_policy import MLPPolicyAC
from .base_agent import BaseAgent


class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super().__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # Convert the sampled numpy batch to torch tensors.
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        re_n = ptu.from_numpy(re_n)
        ac_na = ptu.from_numpy(ac_na)
        terminal_n = ptu.from_numpy(terminal_n)

        # Update the critic for num_critic_updates_per_agent_update steps.
        for _ in range(
                self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                             terminal_n)

        # Estimate advantages with the updated critic.
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n,
                                            terminal_n)

        # Update the actor for num_actor_updates_per_agent_update steps,
        # using the estimated advantages as policy-gradient weights.
        for _ in range(
                self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss

        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # One-step bootstrapped advantage estimate:
        #   Q(s, a) = r(s, a) + gamma * V(s'), with V(s') zeroed at terminal
        #   states (terminal_n = 1), and A(s, a) = Q(s, a) - V(s).
        v_next = self.critic.forward(next_ob_no)
        q = re_n + self.gamma * v_next * (1 - terminal_n)
        adv_n = q - self.critic.forward(ob_no)
        # Detach from the graph and convert to numpy before the actor update.
        adv_n = adv_n.detach().cpu().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
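
The core of estimate_advantage above is the one-step bootstrapped estimate A(s, a) = r(s, a) + gamma * V(s') - V(s), with the V(s') term cut off at terminal states. Below is a minimal, self-contained sketch of that arithmetic on a hand-made batch; the helper function and the numbers are purely illustrative and not part of the assignment code.

import numpy as np

def bootstrapped_advantage(v_s, v_s_next, re_n, terminal_n, gamma=0.9):
    # Q(s, a) ~ r(s, a) + gamma * V(s'), with V(s') masked out where terminal_n = 1
    q = re_n + gamma * v_s_next * (1 - terminal_n)
    # A(s, a) = Q(s, a) - V(s)
    return q - v_s

v_s = np.array([1.0, 2.0])
v_s_next = np.array([1.5, 3.0])
re_n = np.array([1.0, 1.0])
terminal_n = np.array([0.0, 1.0])
print(bootstrapped_advantage(v_s, v_s_next, re_n, terminal_n))
# -> [ 1.35 -1.  ]  (the second entry ignores V(s') because that transition is terminal)
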
Example #2
# Imports assume the TensorFlow-era cs285 homework package layout.
from collections import OrderedDict

import numpy as np

from cs285.critics.bootstrapped_continuous_critic import BootstrappedContinuousCritic
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.policies.MLP_policy import MLPPolicyAC
from .base_agent import BaseAgent


class ACAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env 
        self.sess = sess
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(sess, 
                               self.agent_params['ac_dim'],
                               self.agent_params['ob_dim'],
                               self.agent_params['n_layers'],
                               self.agent_params['size'],
                               discrete=self.agent_params['discrete'],
                               learning_rate=self.agent_params['learning_rate'],
                               )
        self.critic = BootstrappedContinuousCritic(sess, self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        
        # One-step bootstrapped advantage estimate:
        #   Q(s, a) = r(s, a) + gamma * V(s'), with V(s') zeroed at terminal
        #   states (terminal_n = 1), and A(s, a) = Q(s, a) - V(s).
        V_s, V_s1 = self.critic.forward(ob_no), self.critic.forward(next_ob_no)
        Q_value = re_n + self.gamma * V_s1 * (1 - terminal_n)
        adv_n = Q_value - V_s
        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        
        # Update the critic for num_critic_updates_per_agent_update steps.
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)

        # Estimate advantages with the updated critic.
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # Update the actor for num_actor_updates_per_agent_update steps.
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss  # final critic loss from the last update
        loss['Actor_Loss'] = actor_loss  # final actor loss from the last update
        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
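
All of these examples expose the same entry points (add_to_replay_buffer, sample, train), which an outer trainer is expected to call once per iteration. The sketch below shows that assumed calling pattern; the rollout-collection callable and the (obs, acs, rews, next_obs, terminals) ordering returned by sample are assumptions modeled on the usual homework infrastructure, not something these snippets themselves guarantee.

def run_training_loop(agent, collect_fn, n_iter, batch_size, train_batch_size):
    # collect_fn(env, policy, batch_size) -> list of path dicts; a hypothetical
    # helper standing in for whatever rollout sampler the surrounding code provides.
    for itr in range(n_iter):
        paths = collect_fn(agent.env, agent.actor, batch_size)
        agent.add_to_replay_buffer(paths)
        # Assumed ordering of the sampled batch, matching train()'s signature.
        ob, ac, rew, next_ob, terminal = agent.sample(train_batch_size)
        logs = agent.train(ob, ac, rew, next_ob, terminal)
        print(itr, logs['Critic_Loss'], logs['Actor_Loss'])
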
Example #3
# Imports assume the standard cs285 homework package layout.
from collections import OrderedDict

import numpy as np
import torch

from cs285.critics.bootstrapped_continuous_critic import BootstrappedContinuousCritic
from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.policies.MLP_policy import MLPPolicyAC
from .base_agent import BaseAgent


class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # Convert the sampled numpy batch to torch tensors.
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        terminal_n = ptu.from_numpy(terminal_n)
        re_n = ptu.from_numpy(re_n)
        ac_na = ptu.from_numpy(ac_na)

        # Accumulate losses across the update steps for logging.
        loss_critic = 0.
        for i in range(
                self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                              terminal_n)

        # Estimate advantages with the updated critic (a tensor is returned).
        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        loss_actor = 0.
        for i in range(
                self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor += self.actor.update(ob_no, ac_na, adv_n)

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        # Note: in TensorBoard this value tends to increase over training,
        # since the actor update actually minimizes -loss_actor.
        loss['Actor_Loss'] = loss_actor

        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # One-step bootstrapped advantage estimate:
        #   Q(s, a) = r(s, a) + gamma * V(s'), with V(s') zeroed at terminal
        #   states (terminal_n = 1), and A(s, a) = Q(s, a) - V(s).
        v_next = self.critic.forward(next_ob_no)
        q_n = re_n + self.gamma * v_next * (1 - terminal_n)
        adv_n = q_n - self.critic.forward(ob_no)

        if self.standardize_advantages:
            adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
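
On the critic side, each example delegates value-function fitting to critic.update; a common way to implement such a bootstrapped critic is to regress V(s) toward r + gamma * V(s') * (1 - terminal). The function below is a minimal sketch of one such regression step under a plain MSE objective, assuming a torch value network and optimizer are supplied; it is illustrative only and not the homework's BootstrappedContinuousCritic implementation.

import torch
import torch.nn as nn

def critic_update_step(value_net, optimizer, ob_no, next_ob_no, re_n, terminal_n,
                       gamma=0.99):
    # Bootstrapped TD(0) target, held fixed (no gradient) during this step.
    with torch.no_grad():
        target = re_n + gamma * value_net(next_ob_no).squeeze(-1) * (1 - terminal_n)
    v_s = value_net(ob_no).squeeze(-1)
    loss = nn.functional.mse_loss(v_s, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()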