def forward(self, distribution_old: Distribution, value_old: Tensor,
            distribution: Distribution, value: Tensor,
            action: Tensor, reward: Tensor, advantage: Tensor):
    # Value loss
    value_old_clipped = value_old + (value - value_old).clamp(
        -self.v_clip_range, self.v_clip_range)
    v_old_loss_clipped = (reward - value_old_clipped).pow(2)
    v_loss = (reward - value).pow(2)
    value_loss = torch.min(v_old_loss_clipped, v_loss).mean()

    # Policy loss
    advantage = (advantage - advantage.mean()) / (advantage.std(unbiased=False) + 1e-8)
    advantage.detach_()
    log_prob = distribution.log_prob(action)
    log_prob_old = distribution_old.log_prob(action)
    ratio = (log_prob - log_prob_old).exp().view(-1)

    surrogate = advantage * ratio
    surrogate_clipped = advantage * ratio.clamp(1 - self.clip_range, 1 + self.clip_range)
    policy_loss = torch.min(surrogate, surrogate_clipped).mean()

    # Entropy
    entropy = distribution.entropy().mean()

    # Total loss
    losses = policy_loss + self.c_entropy * entropy - self.c_value * value_loss
    total_loss = -losses
    self.reporter.scalar('ppo_loss/policy', -policy_loss.item())
    self.reporter.scalar('ppo_loss/entropy', -entropy.item())
    self.reporter.scalar('ppo_loss/value_loss', value_loss.item())
    self.reporter.scalar('ppo_loss/total', total_loss)
    return total_loss
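# A minimal usage sketch (an assumption, not part of the original source): it fakes the
# attributes that forward() reads from self (clip_range, v_clip_range, c_entropy, c_value,
# reporter) so the loss can be exercised on random data with a Categorical policy.
# It assumes the module-level imports of the snippet above (torch, Tensor, Distribution).
from types import SimpleNamespace

import torch
from torch.distributions import Categorical


class _NullReporter:
    def scalar(self, name, value):
        pass  # stand-in for whatever metric logger the enclosing module uses


_cfg = SimpleNamespace(clip_range=0.2, v_clip_range=0.2, c_entropy=0.01,
                       c_value=0.5, reporter=_NullReporter())

_batch, _n_actions = 32, 4
_dist_old = Categorical(logits=torch.randn(_batch, _n_actions))
_dist_new = Categorical(logits=torch.randn(_batch, _n_actions))
_value_old = torch.randn(_batch)
_value_new = torch.randn(_batch, requires_grad=True)
_actions = torch.randint(0, _n_actions, (_batch,))
_returns = torch.randn(_batch)
_advantages = _returns - _value_old

_total = forward(_cfg, _dist_old, _value_old, _dist_new, _value_new,
                 _actions, _returns, _advantages)
_total.backward()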
def _compute_entropy(dist: td.Distribution):
    if isinstance(dist, td.TransformedDistribution):
        # TransformedDistribution is used by NormalProjectionNetwork with
        # scale_distribution=True, in which case we estimate with sampling.
        entropy, entropy_for_gradient = estimated_entropy(dist)
    else:
        entropy = dist.entropy()
        entropy_for_gradient = entropy
    return entropy, entropy_for_gradient
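# Sketch of the sampling-based helper referenced above (an assumption: the actual
# estimated_entropy used with NormalProjectionNetwork may draw more samples or use a
# different gradient estimator). It approximates H(p) = -E[log p(x)] by Monte Carlo,
# drawing reparameterized samples when the distribution supports them so that the
# second return value can be backpropagated through.
import torch
import torch.distributions as td


def estimated_entropy(dist: td.Distribution, num_samples: int = 1):
    if dist.has_rsample:
        samples = dist.rsample((num_samples,))  # pathwise gradients flow through samples
    else:
        samples = dist.sample((num_samples,))
    neg_log_prob = -dist.log_prob(samples)
    entropy_for_gradient = neg_log_prob.mean(dim=0)
    entropy = entropy_for_gradient.detach()
    return entropy, entropy_for_gradient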
from torch import Tensor
from torch.distributions import Distribution


def loss(distr: Distribution, actions: Tensor, critic_value: Tensor,
         c_entropy: float) -> Tensor:
    """Computes A2C actor loss, i.e. -C(s, a) log pi(a | s) - c_entropy H(pi(.|s)).

    :args distr: distribution on actions, accepting actions of the same size as actions
    :args actions: actions performed
    :args critic_value: advantage corresponding to the actions performed
    :args c_entropy: entropy loss weighting

    :return: loss
    """
    logp_action = distr.log_prob(actions)
    entropy = distr.entropy()

    loss_critic = (-logp_action * critic_value.detach()).mean()
    return loss_critic - c_entropy * entropy.mean()
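# Hedged usage example (the tensor names below are illustrative, not from the source):
# the actor emits a Categorical over discrete actions and critic_value carries the
# advantages; gradients flow only into the policy logits because the advantage is detached.
import torch
from torch.distributions import Categorical

_logits = torch.randn(16, 6, requires_grad=True)
_actions = torch.randint(0, 6, (16,))
_advantages = torch.randn(16)

_actor_loss = loss(Categorical(logits=_logits), _actions, _advantages, c_entropy=0.01)
_actor_loss.backward()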
from typing import Optional

import numpy as np
import torch
from torch import Tensor
from torch.distributions import Distribution, kl_divergence


def loss(distr: Distribution, actions: Tensor, critic_value: Tensor,
         c_entropy: float, eps_clamp: float, c_kl: float,
         old_logp: Tensor, old_distr: Optional[Distribution] = None) -> Tensor:
    """Computes PPO actor loss.

    See https://spinningup.openai.com/en/latest/algorithms/ppo.html
    for a detailed explanation.

    :args distr: current distribution of actions, accepting actions of the same size as actions
    :args actions: actions performed
    :args critic_value: advantage corresponding to the actions performed
    :args c_entropy: entropy loss weighting
    :args eps_clamp: clamping parameter
    :args c_kl: kl penalty coefficient
    :args old_logp: log probabilities of the old distribution of actions
    :args old_distr: old distribution of actions

    :return: loss
    """
    logp_action = distr.log_prob(actions)
    logr = logp_action - old_logp

    # One-sided clamp on the log ratio, depending on the sign of the advantage.
    r_clipped = torch.where(
        critic_value.detach() > 0,
        torch.clamp(logr, max=np.log(1 + eps_clamp)),
        torch.clamp(logr, min=np.log(1 - eps_clamp))).exp()

    loss = -r_clipped * critic_value.detach()
    if c_entropy != 0.:
        loss -= c_entropy * distr.entropy()
    if c_kl != 0.:
        if old_distr is None:
            raise ValueError(
                "Optional argument old_distr is required if c_kl > 0")
        loss += c_kl * kl_divergence(old_distr, distr)
    return loss.mean()
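# Hedged usage example (illustrative names, not from the source). Note that the
# one-sided clamp on the log ratio above is equivalent to the usual
# min(r * A, clip(r, 1 - eps, 1 + eps) * A) surrogate: when the advantage is positive
# only the upper bound on the ratio matters, when it is negative only the lower bound does.
import torch
from torch.distributions import Categorical

_new_logits = torch.randn(16, 6, requires_grad=True)
_old_distr = Categorical(logits=torch.randn(16, 6))
_actions = _old_distr.sample()
_advantages = torch.randn(16)

_ppo_actor_loss = loss(Categorical(logits=_new_logits), _actions, _advantages,
                       c_entropy=0.01, eps_clamp=0.2, c_kl=0.5,
                       old_logp=_old_distr.log_prob(_actions), old_distr=_old_distr)
_ppo_actor_loss.backward()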
def _compute_entropy(dist: td.Distribution):
    entropy = dist.entropy()
    return entropy