Example #1
    def update_from_imitation(self, experiences, take_grad_step, vmax):
        """Updates the Q values to match the reward to go and a margin loss.

            regression_loss = (G_t - Q(s_t, a_t))^2
            G_t = return of the episode from timestep t and onward

            margin_loss = max_a (Q(s, a) + l(a_e, a)) - Q(s, a_e)
            a_e is "expert" action from the experience
            l(a_e, a) = 0 if a_e = a,
                      = K otherwise. (following DQfD)

        Args:
            experiences (list[Experience]): batch of experiences, state and
              next_state may be LazyFrames or np.arrays
            take_grad_step (Callable(loss)): takes the loss and updates
              parameters
            vmax (float): assumed optimal return; the reward-to-go is
              computed as vmax minus the cumulative reward collected so far
    """
        states = [e.state for e in experiences]
        actions = GPUVariable(
            torch.LongTensor(
                np.array([np.array(e.action) for e in experiences])))

        # Reward-to-go, assuming trajectories achieve the optimal return vmax
        rewards_to_go = GPUVariable(
            torch.FloatTensor(
                np.array([vmax - e.state.goal.cum_reward
                          for e in experiences])))
        q_values = self._Q(states)
        # (batch_size,) Q-values of the expert actions
        expert_q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        regression_loss = torch.mean((rewards_to_go - expert_q_values)**2)

        # Margin l(a_e, a): 0 at the expert action, K = 0.5 elsewhere (DQfD)
        margin = 0.5 * torch.ones_like(q_values).scatter(
            1, actions.unsqueeze(1), 0.)
        max_margin_q_values, _ = torch.max(q_values + margin, dim=1)
        margin_loss = torch.mean(max_margin_q_values - expert_q_values)
        grad_norm = take_grad_step(regression_loss + margin_loss)[1]

        if self._debug_stats:
            self._margin_losses.append(margin_loss)
            self._regression_losses.append(regression_loss)
            self._imitation_grad_norms.append(grad_norm)
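
For reference, a minimal standalone sketch of the same regression-plus-margin loss, written against plain modern PyTorch tensors (no GPUVariable); the function name dqfd_imitation_loss and the toy inputs are made up for illustration, not part of the original code:

import torch

def dqfd_imitation_loss(q_values, expert_actions, rewards_to_go, K=0.5):
    # q_values: (batch, num_actions) float, expert_actions: (batch,) long,
    # rewards_to_go: (batch,) float
    expert_q = q_values.gather(1, expert_actions.unsqueeze(1)).squeeze(1)
    regression_loss = torch.mean((rewards_to_go - expert_q) ** 2)

    # l(a_e, a): 0 at the expert action, K for every other action
    margin = K * torch.ones_like(q_values).scatter(
        1, expert_actions.unsqueeze(1), 0.)
    max_margin_q, _ = torch.max(q_values + margin, dim=1)
    margin_loss = torch.mean(max_margin_q - expert_q)
    return regression_loss + margin_loss

# Toy usage with random values
loss = dqfd_imitation_loss(
    torch.randn(4, 3), torch.randint(0, 3, (4,)), torch.rand(4))

The scatter call zeroes the margin at the expert action, so Q(s, a_e) itself incurs no penalty; every other action must fall at least K below Q(s, a_e) to avoid contributing to the margin loss.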
Example #2
  def update_from_experiences(self,
                              experiences,
                              weights,
                              take_grad_step,
                              vmax=None,
                              vmin=None):
    """Updates parameters from a batch of experiences

        Minimizing the loss:

            (target - Q(s, a))^2

            target = r if done
                     r + \gamma * max_a' Q(s', a')

            target is clamped between [vmin, vmax - G_t] if vmin, vmax provided

        Args:
            experiences (list[Experience]): batch of experiences, state and
              next_state may be LazyFrames or np.arrays
            weights (list[float]): importance weights on each experience
            take_grad_step (Callable(loss)): takes the loss and updates
              parameters
            vmax (float | None): if None, no clamping
            vmin (float | None): if None, no clamping

        Returns:
            td_error (GPUVariable[FloatTensor]): (batch_size), error per
                experience
        """
    batch_size = len(experiences)
    states = [e.state for e in experiences]
    actions = GPUVariable(
        torch.LongTensor(np.array([np.array(e.action) for e in experiences])))
    next_states = [e.next_state for e in experiences]
    rewards = GPUVariable(
        torch.FloatTensor(np.array([e.reward for e in experiences])))

    # (batch_size,) 1 if the episode was not done, otherwise 0
    not_done_mask = GPUVariable(
        torch.FloatTensor(np.array([1 - e.done for e in experiences])))
    weights = GPUVariable(torch.FloatTensor(np.array(weights)))

    current_state_q_values = self._Q(states).gather(1, actions.unsqueeze(1))

    # Double DQN: choose the best next actions with the online network,
    # evaluate them with the target network
    next_state_q_values = self._Q(next_states)
    _, best_actions = torch.max(next_state_q_values, 1)
    best_actions = best_actions.unsqueeze(1)
    target_q_values = self._target_Q(next_states).gather(
        1, best_actions).squeeze(1)
    targets = rewards + self._gamma * (target_q_values * not_done_mask)

    if vmax is not None:
      # Clamp targets from above at vmax - G_t:
      #   targets <- targets - (targets - (vmax - G_t))_+
      max_reward_to_go = GPUVariable(
          torch.FloatTensor(
              np.array([vmax - e.state.goal.cum_reward for e in experiences])))
      clip_amount = torch.clamp(targets - max_reward_to_go, min=0.)
      targets = targets - clip_amount
    if vmin is not None:
      targets = torch.clamp(targets, min=vmin)

    targets.detach_()  # Don't backprop through targets
    td_error = current_state_q_values.squeeze() - targets
    loss = torch.mean((td_error**2) * weights)
    grad_norm = take_grad_step(loss)[1]

    if grad_norm > 100:
      logging.warning("Large grad norm: {}".format(grad_norm))
      logging.warning("TD Errors: {}".format(td_error))
      logging.warning("Predicted Q-values: {}".format(current_state_q_values))
      logging.warning("Targets: {}".format(targets))

    if self._debug_stats:
      max_target = torch.max(targets)[0]
      min_target = torch.min(targets)[0]
      self._max_target.append(max_target)
      self._min_target.append(min_target)
      self._td_losses.append(loss)
      self._grad_norms.append(grad_norm)
    return td_error
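
Similarly, a hedged sketch of the Double DQN target computation with the optional [vmin, vmax - G_t] clamping, on plain tensors; the name double_dqn_targets, the gamma default, and the toy inputs are assumptions for illustration only:

import torch

def double_dqn_targets(online_q_next, target_q_next, rewards, not_done,
                       reward_so_far, gamma=0.99, vmax=None, vmin=None):
    # online_q_next, target_q_next: (batch, num_actions); the rest: (batch,)
    # Double DQN: pick actions with the online net,
    # evaluate them with the target net
    best_actions = online_q_next.argmax(dim=1, keepdim=True)
    next_values = target_q_next.gather(1, best_actions).squeeze(1)
    targets = rewards + gamma * next_values * not_done

    if vmax is not None:
        # Clamp from above at vmax - G_t; equivalent to
        # targets - (targets - (vmax - G_t))_+
        targets = torch.min(targets, vmax - reward_so_far)
    if vmin is not None:
        targets = torch.clamp(targets, min=vmin)
    return targets.detach()  # Don't backprop through targets

# Toy usage with random values
targets = double_dqn_targets(
    torch.randn(4, 3), torch.randn(4, 3), torch.rand(4),
    torch.ones(4), torch.rand(4), vmax=1.0, vmin=0.0)

The torch.min form is just a compact rewrite of the subtract-the-positive-part clipping used in update_from_experiences; both cap the target at vmax - G_t while leaving smaller targets untouched.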