import torch
from torch.distributions import Beta, Normal


def update(self):
    if not self.memory.full():
        return
    batch = self.memory.sample()
    Zs = self.discount(batch.r)  # discounted returns

    # Policy: log-probability of the taken actions under the current policy.
    mu_policy, sigma_policy = self.policy(batch.s)
    log_prob_policy = Normal(mu_policy, sigma_policy).log_prob(
        batch.a).mean(dim=1, keepdim=True)

    # Credit: log-probability of the same actions under the return-conditioned
    # (hindsight) distribution.
    mu_credit, sigma_credit = self.credit(batch.s, Zs)
    log_prob_credit = Normal(mu_credit, sigma_credit).log_prob(
        batch.a).mean(dim=1, keepdim=True)

    # Hindsight ratio and advantage; the credit network is held fixed here.
    ratio = torch.exp(log_prob_policy - log_prob_credit.detach())
    A = (1 - ratio) * Zs.unsqueeze(1)
    policy_loss = -(A.T @ log_prob_policy) / batch.r.size(0)
    self.policy_optim.zero_grad()
    policy_loss.backward()

    # Credit loss: fit the credit distribution to the (detached) policy log-probs.
    credit_loss = -torch.mean(log_prob_policy.detach() * log_prob_credit)
    self.credit_optim.zero_grad()
    credit_loss.backward()

    torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.7)
    torch.nn.utils.clip_grad_norm_(self.credit.parameters(), 0.7)
    self.policy_optim.step()
    self.credit_optim.step()
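
# For intuition, a small self-contained illustration of the (1 - ratio) * Zs
# weighting used in update() above; the helper name and the toy tensors below
# are made up for demonstration and are not part of the original code.
def _hindsight_weighting_demo():
    Zs = torch.tensor([1.0, -0.5, 2.0])                        # discounted returns
    log_prob_policy = torch.tensor([[-1.2], [-0.8], [-1.0]])   # policy log-probs
    log_prob_credit = torch.tensor([[-0.9], [-0.8], [-1.5]])   # hindsight log-probs
    ratio = torch.exp(log_prob_policy - log_prob_credit)
    # A return is discounted when the hindsight distribution explains the
    # action better than the policy does (ratio > 1 makes the weight negative).
    A = (1 - ratio) * Zs.unsqueeze(1)
    print(A)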
def select_actions(pi, dist_type):
    if dist_type == 'gauss':
        mean, std = pi
        actions = Normal(mean, std).sample()
    else:
        raise NotImplementedError
    return actions.detach().cpu().numpy().squeeze()
def select_actions(pi, dist_type):
    if dist_type == 'gauss':
        mean, std = pi
        actions = Normal(mean, std).sample()
    elif dist_type == 'beta':
        alpha, beta = pi
        actions = Beta(alpha.detach().cpu(), beta.detach().cpu()).sample()
    else:
        raise NotImplementedError
    return actions.detach().cpu().numpy()[0]
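
# A quick usage sketch for select_actions, assuming a policy head that returns
# distribution parameters as tensors; the demo name and toy tensors here stand
# in for a network's output and are not part of the original code.
def _select_actions_demo():
    mean, std = torch.zeros(1, 2), torch.ones(1, 2)
    a_gauss = select_actions((mean, std), dist_type='gauss')
    alpha, beta = torch.full((1, 2), 2.0), torch.full((1, 2), 3.0)
    a_beta = select_actions((alpha, beta), dist_type='beta')  # Beta samples lie in (0, 1)
    print(a_gauss, a_beta)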
def _action_selection(self, action_mean, action_std, exploration=True):
    if exploration:
        # Stochastic action for exploration.
        action = Normal(action_mean, action_std).sample()
    else:
        # Greedy action: use the distribution mean.
        action = action_mean
    return action.detach().cpu().numpy()[0]