Example 1
    def value_loss(self, trajectory, act=None):
        """ Compute value loss. """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        values = act["values"]
        value_targets = self.torch_from_numpy(trajectory["value_targets"])

        if values.shape != value_targets.shape:
            raise ValueError("trajectory has mismatched shapes "
                             f"values.shape={values.shape} "
                             f"value_targets.shape={value_targets.shape}")

        value_loss = torch.mean(torch.pow(values - value_targets, 2))

        if summary.should_record():
            summaries = dict(value_targets=torch.mean(value_targets),
                             value_preds=torch.mean(values),
                             value_loss=value_loss,
                             r_squared=r_squared(values, value_targets))
            for key, val in summaries.items():
                summary.add_scalar(f"{self.name}/{key}",
                                   val,
                                   global_step=self.call_count)

        return value_loss
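The summaries above rely on an r_squared helper that is not shown in this listing. A minimal sketch, assuming it computes the usual coefficient of determination; the name comes from the calls above, while the body and argument order are assumptions (the examples pass the arguments in different orders):

import torch

def r_squared(targets, predictions):
    # Assumed implementation: 1 - Var(residuals) / Var(targets), i.e. the
    # fraction of the target variance explained by the predictions.
    residual_variance = torch.var(targets - predictions)
    target_variance = torch.var(targets)
    return 1.0 - residual_variance / target_variance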
Example 2
    def value_loss(self, trajectory, act=None):
        """ Computes value loss. """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        if "value_targets" not in trajectory:
            raise ValueError("trajectory does not contain 'value_targets'")

        value_targets = self.torch_from_numpy(trajectory["value_targets"])
        old_value_preds = self.torch_from_numpy(trajectory["values"])
        values = act["values"]

        if values.shape != value_targets.shape:
            raise ValueError("trajectory has mismatched shapes "
                             f"values.shape={values.shape} "
                             f"value_targets.shape={value_targets.shape}")

        value_loss = torch.pow(values - value_targets, 2)
        if self.cliprange is not None:
            values_clipped = old_value_preds + torch.clamp(
                values - old_value_preds, -self.cliprange, self.cliprange)
            value_loss_clipped = torch.pow(values_clipped - value_targets, 2)
            value_loss = torch.max(value_loss, value_loss_clipped)

        value_loss = torch.mean(value_loss)
        if summary.should_record():
            summaries = dict(value_loss=value_loss,
                             value_targets=torch.mean(value_targets),
                             value_preds=torch.mean(values),
                             r_squared=r_squared(value_targets, values))
            for key, val in summaries.items():
                summary.add_scalar(f"ppo/{key}",
                                   val,
                                   global_step=self.call_count)
        return value_loss
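A toy demonstration of the clipped value loss above, with made-up numbers and cliprange assumed to be 0.2:

import torch

values = torch.tensor([1.0, 2.5])           # current value predictions
old_value_preds = torch.tensor([1.0, 2.0])  # predictions stored in the trajectory
value_targets = torch.tensor([1.5, 2.6])
cliprange = 0.2

# Keep the new predictions within +/- cliprange of the old ones ...
values_clipped = old_value_preds + torch.clamp(
    values - old_value_preds, -cliprange, cliprange)
# ... and take the larger (more pessimistic) of the two squared errors, so a
# large jump of the value head is penalized even when it moves toward the
# target: per-sample losses are [0.25, 0.16], mean 0.205.
value_loss = torch.max(torch.pow(values - value_targets, 2),
                       torch.pow(values_clipped - value_targets, 2))
print(torch.mean(value_loss))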
Example 3
    def policy_loss(self, trajectory, act=None):
        """ Compute policiy loss including entropy regularization. """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        log_prob = act["distribution"].log_prob(
            self.torch_from_numpy(trajectory["actions"]))
        advantages = self.torch_from_numpy(trajectory["advantages"])

        if log_prob.shape != advantages.shape:
            raise ValueError("trajectory has mismatched shapes: "
                             f"log_prob.shape={log_prob.shape} "
                             f"advantages.shape={advantages.shape}")

        policy_loss = -torch.mean(log_prob * advantages)
        entropy = torch.mean(act["distribution"].entropy())

        if summary.should_record():
            summaries = dict(advantages=torch.mean(advantages),
                             entropy=entropy,
                             policy_loss=policy_loss)
            for key, val in summaries.items():
                summary.add_scalar(f"{self.name}/{key}",
                                   val,
                                   global_step=self.call_count)

        return policy_loss - self.entropy_coef * entropy
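These methods also assume a self.torch_from_numpy conversion helper. A standalone sketch of what such a helper might look like; the device argument is an assumption:

import numpy as np
import torch

def torch_from_numpy(arr, device="cpu"):
    # Assumed behaviour: wrap a NumPy array (or anything array-like)
    # as a tensor on the device the policy is trained on.
    return torch.as_tensor(np.asarray(arr), device=device)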
Example 4
    def __call__(self, data):
        obs, actions, rewards, resets, next_obs = (self.torch_from_numpy(
            data[k]) for k in ("observations", "actions", "rewards", "resets",
                               "next_observations"))

        qtargets = self.compute_targets(rewards, resets, next_obs)
        qvalues = self.make_predictions(obs, actions)
        if "update_priorities" in data:
            data["update_priorities"](
                torch.abs(qtargets - qvalues).cpu().detach().numpy())

        weights = None
        if "weights" in data:
            weights = self.torch_from_numpy(data["weights"])
        loss = huber_loss(qtargets, qvalues, weights=weights)

        if summary.should_record():
            summary.add_scalar(f"{self.name}/r_squared",
                               r_squared(qtargets, qvalues),
                               global_step=self.call_count)
            summary.add_scalar(f"{self.name}/loss",
                               loss,
                               global_step=self.call_count)
        self.call_count += 1
        return loss
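The huber_loss helper is not shown in this listing either. A sketch under the assumption that it is the standard Huber loss with optional per-sample weights (for example, importance weights from prioritized replay); the signature and delta default are assumptions:

import torch

def huber_loss(targets, predictions, weights=None, delta=1.0):
    # Quadratic for errors within +/- delta, linear beyond it.
    abs_error = torch.abs(targets - predictions)
    quadratic = torch.clamp(abs_error, max=delta)
    linear = abs_error - quadratic
    loss = 0.5 * quadratic ** 2 + delta * linear
    if weights is not None:
        loss = loss * weights  # e.g. prioritized-replay importance weights
    return torch.mean(loss)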
Example 5
 def __call__(self, data):
     act = self.policy.act(data, training=True)
     policy_loss = self.policy_loss(data, act)
     value_loss = self.value_loss(data, act)
     loss = policy_loss + self.value_loss_coef * value_loss
     if summary.should_record():
         summary.add_scalar("ppo/loss", loss, global_step=self.call_count)
     self.call_count += 1
     return loss
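An illustrative training step around this combined loss; loss_fn and optimizer are placeholder names, and the actual training loop in the source project may differ:

def train_step(loss_fn, optimizer, data):
    # data is a rollout dict with the keys that policy_loss / value_loss expect.
    optimizer.zero_grad()
    loss = loss_fn(data)  # policy_loss + value_loss_coef * value_loss
    loss.backward()
    optimizer.step()
    return loss.item()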
Example 6
 def preprocess_gradients(self, parameters, name):
     """ Applies gradient preprocessing. """
     grad_norm = None
     if self.max_grad_norm is not None:
         grad_norm = torch.nn.utils.clip_grad_norm_(parameters,
                                                    self.max_grad_norm)
     if summary.should_record():
         if grad_norm is None:
             grad_norm = total_norm(p.grad for p in parameters
                                    if p.grad is not None)
         summary.add_scalar(f"{name}/grad_norm",
                            grad_norm,
                            global_step=self.step_count)
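total_norm is assumed here to compute the global L2 norm over an iterable of gradient tensors, mirroring what clip_grad_norm_ returns; a minimal sketch:

import torch

def total_norm(gradients):
    # Global L2 norm of all gradients, as clip_grad_norm_ would report it.
    norms = [torch.norm(g) for g in gradients]
    return torch.norm(torch.stack(norms)) if norms else torch.tensor(0.0)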
Example 7
  def add_summaries(self):
    """ Writes summaries. """
    summaries = dict(
        total_reward=np.mean([q[-1] for q in self.reward_queues]),
        episode_length=np.mean(self.episode_lengths),
        min_reward=min(q[-1] for q in self.reward_queues),
        max_reward=max(q[-1] for q in self.reward_queues),
    )
    summaries[f"reward_mean_{self.reward_queues[0].maxlen}"] = (
        np.mean([np.mean(q) for q in self.reward_queues]))

    for key, val in summaries.items():
      summary.add_scalar(f"{self.prefix}/{key}", val,
                         global_step=self.step_count)
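A toy illustration of the bookkeeping these summaries assume: one bounded deque of per-episode returns for each environment (the values and maxlen below are made up):

from collections import deque

import numpy as np

reward_queues = [deque([1.0, 2.0, 3.0], maxlen=100),
                 deque([0.5, 4.0], maxlen=100)]

total_reward = np.mean([q[-1] for q in reward_queues])          # 3.5
reward_mean_100 = np.mean([np.mean(q) for q in reward_queues])  # 2.125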
Example 8
    def policy_loss(self, trajectory, act=None):
        """ Compute policy loss (including entropy regularization). """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        if "advantages" not in trajectory:
            raise ValueError("trajectory does not contain 'advantages'")

        old_log_prob = self.torch_from_numpy(trajectory["log_prob"])
        advantages = self.torch_from_numpy(trajectory["advantages"])
        actions = self.torch_from_numpy(trajectory["actions"])

        log_prob = act["distribution"].log_prob(actions)
        if log_prob.shape != old_log_prob.shape:
            raise ValueError("trajectory has mismatched shapes: "
                             f"log_prob.shape={log_prob.shape} "
                             f"old_log_prob.shape={old_log_prob.shape}")
        if log_prob.shape != advantages.shape:
            raise ValueError("trajectory has mismatched shapes: "
                             f"log_prob.shape={log_prob.shape} "
                             f"advantages.shape={advantages.shape}")

        ratio = torch.exp(log_prob - old_log_prob)
        policy_loss = -ratio * advantages
        if self.cliprange is not None:
            ratio_clipped = torch.clamp(ratio, 1. - self.cliprange,
                                        1. + self.cliprange)
            policy_loss_clipped = -ratio_clipped * advantages
            policy_loss = torch.max(policy_loss, policy_loss_clipped)

        policy_loss = torch.mean(policy_loss)
        entropy = torch.mean(act["distribution"].entropy())

        if summary.should_record():
            summaries = dict(advantages=torch.mean(advantages),
                             policy_loss=policy_loss,
                             entropy=entropy)
            for key, val in summaries.items():
                summary.add_scalar(f"ppo/{key}",
                                   val,
                                   global_step=self.call_count)

        return policy_loss - self.entropy_coef * entropy
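A small numeric check of the surrogate clipping above, assuming cliprange=0.2: where the probability ratio leaves [0.8, 1.2], torch.max keeps the clipped, more pessimistic per-sample loss:

import torch

ratio = torch.tensor([0.5, 1.5])        # new_prob / old_prob per sample
advantages = torch.tensor([-1.0, 1.0])
cliprange = 0.2

unclipped = -ratio * advantages
clipped = -torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange) * advantages
print(torch.max(unclipped, clipped))    # [0.8, -1.2]: the clipped terms win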
Example 9
 def summarize(self, global_step):
     """ Writes summary of the value for tensorboard. """
     summary.add_scalar(f"anneal/{self.name}",
                        self.get_tensor(),
                        global_step=global_step)
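For context, a hypothetical minimal annealed-parameter class that a summarize method like this could belong to; everything except the name attribute and the get_tensor interface used above is invented for illustration:

import torch

class LinearAnneal:
    """ Value annealed linearly from start to end over nsteps. """

    def __init__(self, name, start, end, nsteps):
        self.name = name
        self.start, self.end, self.nsteps = start, end, nsteps
        self.step = 0

    def get_tensor(self):
        frac = min(self.step / self.nsteps, 1.0)
        return torch.tensor(self.start + frac * (self.end - self.start))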