Example #1
    def update(self):
        # Wait until the memory buffer is full, then sample a training batch.
        if not self.memory.full():
            return
        batch = self.memory.sample()

        # Discounted returns computed from the batch rewards.
        Zs = self.discount(batch.r)

        # Policy
        mu_policy, sigma_policy = self.policy(batch.s)
        log_prob_policy = Normal(mu_policy, sigma_policy).log_prob(
            batch.a).mean(dim=1, keepdim=True)

        # Credit
        mu_credit, sigma_credit = self.credit(batch.s, Zs)
        log_prob_credit = Normal(mu_credit, sigma_credit).log_prob(
            batch.a).mean(dim=1, keepdim=True)

        # Density ratio between the policy and the (detached) credit
        # distribution; (1 - ratio) scales the returns into the term A
        # that weights the policy gradient.
        ratio = torch.exp(log_prob_policy - log_prob_credit.detach())
        A = (1 - ratio) * Zs.unsqueeze(1)
        policy_loss = -(A.T @ log_prob_policy) / batch.r.size(0)
        self.policy_optim.zero_grad()
        policy_loss.backward()

        # Credit loss: the detached policy log-probability weighting the
        # credit network's log-probability of the same batch actions.
        credit_loss = -torch.mean(log_prob_policy.detach() * log_prob_credit)

        self.credit_optim.zero_grad()
        credit_loss.backward()

        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.7)
        torch.nn.utils.clip_grad_norm_(self.credit.parameters(), 0.7)

        self.policy_optim.step()
        self.credit_optim.step()
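
For reference, the update() above assumes a Gaussian policy head that returns a per-action mean and standard deviation. The sketch below is a minimal, hypothetical stand-in (the names GaussianPolicy, obs_dim and act_dim are illustrative, not from the original code) showing how such a head would produce the (mu, sigma) pair and the per-sample log-probabilities used in the snippet.

import torch
import torch.nn as nn
from torch.distributions import Normal

class GaussianPolicy(nn.Module):
    # Hypothetical policy head: state -> (mean, std) of a diagonal Gaussian.
    def __init__(self, obs_dim, act_dim, hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh())
        self.mu = nn.Linear(hidden, act_dim)
        self.log_sigma = nn.Parameter(torch.zeros(act_dim))

    def forward(self, s):
        h = self.body(s)
        mu = self.mu(h)
        sigma = self.log_sigma.exp().expand_as(mu)
        return mu, sigma

policy = GaussianPolicy(obs_dim=4, act_dim=2)
s = torch.randn(8, 4)                     # batch of 8 states
mu, sigma = policy(s)
a = Normal(mu, sigma).sample()            # sampled actions, shape (8, 2)
# Per-sample log-probability, shape (8, 1), as in the update() above.
log_prob = Normal(mu, sigma).log_prob(a).mean(dim=1, keepdim=True)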
Example #2
from torch.distributions import Beta, Normal


def select_actions(pi, dist_type):
    if dist_type == 'gauss':
        mean, std = pi
        actions = Normal(mean, std).sample()
    else:
        raise NotImplementedError
    # Return the sampled actions as a flat NumPy array.
    return actions.detach().cpu().numpy().squeeze()
def select_actions(pi, dist_type):
    if dist_type == 'gauss':
        mean, std = pi
        actions = Normal(mean, std).sample()
    elif dist_type == 'beta':
        alpha, beta = pi
        actions = Beta(alpha.detach().cpu(), beta.detach().cpu()).sample()
    else:
        raise NotImplementedError

    return actions.detach().cpu().numpy()[0]
    def _action_selection(self, action_mean, action_std, exploration=True):
        # Sample from the Gaussian when exploring; act greedily otherwise.
        if exploration:
            action = Normal(action_mean, action_std).sample()
        else:
            action = action_mean
        return action.detach().cpu().numpy()[0]
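
A minimal usage sketch for the Gaussian branch of select_actions, assuming the policy forward pass returns a (mean, std) pair of tensors; the shapes and values below are illustrative assumptions, not taken from the original repositories.

import torch

mean = torch.zeros(1, 3)            # e.g. a 3-dimensional action space
std = torch.ones(1, 3) * 0.5
action = select_actions((mean, std), dist_type='gauss')
print(action.shape)                 # (3,) once the batch dimension is dropped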