Example #1
    def update_from_imitation(self, experiences, take_grad_step, vmax):
        """Updates the Q values to match the reward to go and a margin loss.

            regression_loss = (G_t - Q(s_t, a_t))^2
            G_t = return of the episode from timestep t and onward

            margin_loss = max_a (Q(s, a) + l(a_e, a)) - Q(s, a_e)
            a_e is "expert" action from the experience
            l(a_e, a) = 0 if a_e = a,
                      = K otherwise. (following DQfD)

        Args:
            experiences (list[Experience]): batch of experiences, state and
              next_state may be LazyFrames or np.arrays
            take_grad_step (Callable(loss)): takes the loss and updates
              parameters
            vmax (float): assumed optimal return; the reward-to-go is
              computed as vmax minus the cumulative reward collected so far
    """
        states = [e.state for e in experiences]
        actions = GPUVariable(
            torch.LongTensor(
                np.array([np.array(e.action) for e in experiences])))

        # Reward-to-go, assuming trajectories achieve the optimal return vmax
        rewards_to_go = GPUVariable(
            torch.FloatTensor(
                np.array([vmax - e.state.goal.cum_reward
                          for e in experiences])))
        q_values = self._Q(states)
        # (batch_size,) Q-values of the expert actions
        expert_q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        regression_loss = torch.mean((rewards_to_go - expert_q_values)**2)

        # Margin l(a_e, a): 0 at the expert action, K = 0.5 elsewhere (DQfD)
        margin = 0.5 * torch.ones_like(q_values).scatter(
            1, actions.unsqueeze(1), 0.)
        max_margin_q_values, _ = torch.max(q_values + margin, dim=1)
        margin_loss = torch.mean(max_margin_q_values - expert_q_values)
        grad_norm = take_grad_step(regression_loss + margin_loss)[1]

        if self._debug_stats:
            self._margin_losses.append(margin_loss)
            self._regression_losses.append(regression_loss)
            self._imitation_grad_norms.append(grad_norm)
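
For reference, a minimal standalone sketch of the same regression-plus-margin loss, written against plain modern PyTorch tensors (no GPUVariable); the function name dqfd_imitation_loss and the toy inputs are made up for illustration, not part of the original code:

import torch

def dqfd_imitation_loss(q_values, expert_actions, rewards_to_go, K=0.5):
    # q_values: (batch, num_actions) float, expert_actions: (batch,) long,
    # rewards_to_go: (batch,) float
    expert_q = q_values.gather(1, expert_actions.unsqueeze(1)).squeeze(1)
    regression_loss = torch.mean((rewards_to_go - expert_q) ** 2)

    # l(a_e, a): 0 at the expert action, K for every other action
    margin = K * torch.ones_like(q_values).scatter(
        1, expert_actions.unsqueeze(1), 0.)
    max_margin_q, _ = torch.max(q_values + margin, dim=1)
    margin_loss = torch.mean(max_margin_q - expert_q)
    return regression_loss + margin_loss

# Toy usage with random values
loss = dqfd_imitation_loss(
    torch.randn(4, 3), torch.randint(0, 3, (4,)), torch.rand(4))

The scatter call zeroes the margin at the expert action, so Q(s, a_e) itself incurs no penalty; every other action must fall at least K below Q(s, a_e) to avoid contributing to the margin loss.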
Example #2
  def update_from_experiences(self,
                              experiences,
                              weights,
                              take_grad_step,
                              vmax=None,
                              vmin=None):
    """Updates parameters from a batch of experiences

        Minimizing the loss:

            (target - Q(s, a))^2

            target = r if done
                     r + \gamma * max_a' Q(s', a')

            target is clamped between [vmin, vmax - G_t] if vmin, vmax provided

        Args:
            experiences (list[Experience]): batch of experiences, state and
              next_state may be LazyFrames or np.arrays
            weights (list[float]): importance weights on each experience
            take_grad_step (Callable(loss)): takes the loss and updates
              parameters
            vmax (float | None): if None, no clamping
            vmin (float | None): if None, no clamping

        Returns:
            td_error (GPUVariable[FloatTensor]): (batch_size), error per
                experience
        """
    batch_size = len(experiences)
    states = [e.state for e in experiences]
    actions = GPUVariable(
        torch.LongTensor(np.array([np.array(e.action) for e in experiences])))
    next_states = [e.next_state for e in experiences]
    rewards = GPUVariable(
        torch.FloatTensor(np.array([e.reward for e in experiences])))

    # (batch_size,) 1 if the episode was not done, otherwise 0
    not_done_mask = GPUVariable(
        torch.FloatTensor(np.array([1 - e.done for e in experiences])))
    weights = GPUVariable(torch.FloatTensor(np.array(weights)))

    current_state_q_values = self._Q(states).gather(1, actions.unsqueeze(1))

    # Double DQN: choose the best next actions with the online network,
    # evaluate them with the target network
    next_state_q_values = self._Q(next_states)
    _, best_actions = torch.max(next_state_q_values, 1)
    best_actions = best_actions.unsqueeze(1)
    target_q_values = self._target_Q(next_states).gather(
        1, best_actions).squeeze(1)
    targets = rewards + self._gamma * (target_q_values * not_done_mask)

    if vmax is not None:
      # Clamp targets from above at vmax - G_t:
      #   targets <- targets - (targets - (vmax - G_t))_+
      max_reward_to_go = GPUVariable(
          torch.FloatTensor(
              np.array([vmax - e.state.goal.cum_reward for e in experiences])))
      clip_amount = torch.clamp(targets - max_reward_to_go, min=0.)
      targets = targets - clip_amount
    if vmin is not None:
      targets = torch.clamp(targets, min=vmin)

    targets.detach_()  # Don't backprop through targets
    td_error = current_state_q_values.squeeze() - targets
    loss = torch.mean((td_error**2) * weights)
    grad_norm = take_grad_step(loss)[1]

    if grad_norm > 100:
      logging.warning("Large grad norm: {}".format(grad_norm))
      logging.warning("TD Errors: {}".format(td_error))
      logging.warning("Predicted Q-values: {}".format(current_state_q_values))
      logging.warning("Targets: {}".format(targets))

    if self._debug_stats:
      max_target = torch.max(targets)[0]
      min_target = torch.min(targets)[0]
      self._max_target.append(max_target)
      self._min_target.append(min_target)
      self._td_losses.append(loss)
      self._grad_norms.append(grad_norm)
    return td_error
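
Similarly, a hedged sketch of the Double DQN target computation with the optional [vmin, vmax - G_t] clamping, on plain tensors; the name double_dqn_targets, the gamma default, and the toy inputs are assumptions for illustration only:

import torch

def double_dqn_targets(online_q_next, target_q_next, rewards, not_done,
                       reward_so_far, gamma=0.99, vmax=None, vmin=None):
    # online_q_next, target_q_next: (batch, num_actions); the rest: (batch,)
    # Double DQN: pick actions with the online net,
    # evaluate them with the target net
    best_actions = online_q_next.argmax(dim=1, keepdim=True)
    next_values = target_q_next.gather(1, best_actions).squeeze(1)
    targets = rewards + gamma * next_values * not_done

    if vmax is not None:
        # Clamp from above at vmax - G_t; equivalent to
        # targets - (targets - (vmax - G_t))_+
        targets = torch.min(targets, vmax - reward_so_far)
    if vmin is not None:
        targets = torch.clamp(targets, min=vmin)
    return targets.detach()  # Don't backprop through targets

# Toy usage with random values
targets = double_dqn_targets(
    torch.randn(4, 3), torch.randn(4, 3), torch.rand(4),
    torch.ones(4), torch.rand(4), vmax=1.0, vmin=0.0)

The torch.min form is just a compact rewrite of the subtract-the-positive-part clipping used in update_from_experiences; both cap the target at vmax - G_t while leaving smaller targets untouched.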