Example 1
    def agent_train_step(experience):
        # preprocess experience
        if image_aug_type == 'random_shifting':
            experience, cropped_frames = experience
            x0, x1, x0a, x1a, y0, y1 = cropped_frames
            experience = tf.nest.map_structure(
                lambda t: composite.slice_to(t, axis=1, end=2), experience)
            time_steps, actions, next_time_steps = (
                tf_agent.experience_to_transitions(experience))  # pylint: disable=protected-access
        elif image_aug_type is None:
            experience = tf.nest.map_structure(
                lambda t: composite.slice_to(t, axis=1, end=2), experience)
            time_steps, actions, next_time_steps = (
                tf_agent.experience_to_transitions(experience))  # pylint: disable=protected-access
            x0 = time_steps.observation['pixels']
            x1 = next_time_steps.observation['pixels']
            # No augmented frame pairs are available without image augmentation.
            x0a = x1a = None
        else:
            raise NotImplementedError

        tf_agent.train_pix(time_steps,
                           actions,
                           next_time_steps,
                           x0,
                           x1,
                           x0a=x0a if use_augmented_q else None,
                           x1a=x1a if use_augmented_q else None,
                           e_enc=e_enc,
                           e_enc_t=e_enc_t,
                           q_aug=use_augmented_q,
                           use_critic_grad=use_critic_grad)
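
For reference, a minimal illustration of what the preprocessing above does to a batched experience; the dict keys and shapes here are assumptions for the sketch, not values from the source. `composite.slice_to(t, axis=1, end=2)` keeps only the first two frames of every `[B, T, ...]` tensor, which `experience_to_transitions` then splits into a single `(time_steps, actions, next_time_steps)` transition.

import tensorflow as tf

# Toy [B, T, ...] experience; keys and shapes are illustrative only.
experience = {
    'pixels': tf.zeros([32, 4, 84, 84, 3]),   # [B, T, H, W, C]
    'action': tf.zeros([32, 4, 6]),           # [B, T, action_dim]
}

# Dense equivalent of composite.slice_to(t, axis=1, end=2): keep frames 0 and 1.
two_frames = tf.nest.map_structure(lambda t: t[:, :2], experience)
# two_frames['pixels'].shape == [32, 2, 84, 84, 3]
# two_frames['action'].shape == [32, 2, 6]
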
Example 2
    def testSliceTo(self):
        to_1 = composite.slice_to(self._x, axis=1, end=1)
        to_n1 = composite.slice_to(self._x, axis=1, end=-1)
        x, to_1, to_n1 = self.evaluate((self._x, to_1, to_n1))
        self.assertAllEqual(to_1, x[:, :1, :])
        self.assertAllEqual(to_n1, x[:, :-1, :])

        s_to_1 = _to_dense(composite.slice_to(self._sx, axis=1, end=1))
        s_to_n1 = _to_dense(composite.slice_to(self._sx, axis=1, end=-1))
        sx = _to_dense(self._sx)
        sx, s_to_1, s_to_n1 = self.evaluate((sx, s_to_1, s_to_n1))
        self.assertAllEqual(s_to_1, sx[:, :1, :])
        self.assertAllEqual(s_to_n1, sx[:, :-1, :])
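
The test above checks `composite.slice_to` against plain slicing for both dense and sparse inputs. The helper below is a minimal sketch of how such a function could be written (an assumption for illustration, not the tf_agents implementation): dense tensors use ordinary slicing, while `SparseTensor`s go through `tf.sparse.slice` with an explicit (start, size) window.

import tensorflow as tf

def slice_to_sketch(t, axis, end):
  """Returns t sliced up to `end` along `axis`, for dense or sparse `t`."""
  if isinstance(t, tf.sparse.SparseTensor):
    # Sparse tensors: express the slice as a (start, size) window.
    start = [0] * t.shape.rank
    size = tf.unstack(t.dense_shape)
    size[axis] = size[axis] + end if end < 0 else tf.constant(end, tf.int64)
    return tf.sparse.slice(t, start, tf.stack(size))
  # Dense tensors: ordinary Python slicing along `axis`.
  index = [slice(None)] * axis + [slice(None, end)]
  return t[tuple(index)]

# e.g. slice_to_sketch(x, axis=1, end=-1) matches x[:, :-1, :] for a rank-3 x,
# mirroring the assertions in testSliceTo above.
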
Example 3
def to_transition(
    trajectory: Trajectory,
    next_trajectory: Optional[Trajectory] = None
) -> Transition:
  """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.step_type = trajectory.step_type[:,:-1]
  time_steps.observation = trajectory.observation[:,:-1]
  next_time_steps.observation = trajectory.observation[:,1:]
  next_time_steps.step_type = trajectory.next_step_type[:,:-1]
  next_time_steps.reward = trajectory.reward[:,:-1]
  next_time_steps.discount = trajectory.discount[:,:-1]

  ```
  Notice that `reward` and `discount` for `time_steps` are undefined and are
  therefore filled with zeros.

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must have
      shape `[B, T, ...]` when next_trajectory is `None`.  `discount` is assumed
      to be a scalar float; hence the shape of `trajectory.discount` must
      be `[B, T]`.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced (please do not use them).

  Raises:
    ValueError: if `discount` rank is not within the range [1, 2].
  """
  _validate_rank(trajectory.discount, min_rank=1, max_rank=2)

  if next_trajectory is not None:
    _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2)

  if next_trajectory is None:
    next_trajectory = tf.nest.map_structure(
        lambda t: composite.slice_from(t, axis=1, start=1), trajectory)
    trajectory = tf.nest.map_structure(
        lambda t: composite.slice_to(t, axis=1, end=-1), trajectory)
  policy_steps = policy_step.PolicyStep(
      action=trajectory.action, state=(), info=trajectory.policy_info)
  # TODO(b/130244652): Consider replacing 0 rewards & discounts with ().
  time_steps = ts.TimeStep(
      trajectory.step_type,
      reward=tf.nest.map_structure(tf.zeros_like, trajectory.reward),  # unknown
      discount=tf.zeros_like(trajectory.discount),  # unknown
      observation=trajectory.observation)
  next_time_steps = ts.TimeStep(
      step_type=trajectory.next_step_type,
      reward=trajectory.reward,
      discount=trajectory.discount,
      observation=next_trajectory.observation)
  return Transition(time_steps, policy_steps, next_time_steps)
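
A hedged usage sketch: the toy shapes and values below are assumptions, and the field names follow `tf_agents.trajectories.trajectory.Trajectory`, where this function is available as `trajectory_lib.to_transition`.

import tensorflow as tf
from tf_agents.trajectories import trajectory as trajectory_lib

batch_size, num_frames = 2, 3  # T = 3 frames -> T - 1 = 2 transition steps.
traj = trajectory_lib.Trajectory(
    step_type=tf.zeros([batch_size, num_frames], tf.int32),
    observation=tf.random.uniform([batch_size, num_frames, 4]),
    action=tf.zeros([batch_size, num_frames, 1]),
    policy_info=(),
    next_step_type=tf.ones([batch_size, num_frames], tf.int32),
    reward=tf.random.uniform([batch_size, num_frames]),
    discount=tf.ones([batch_size, num_frames]))

time_steps, policy_steps, next_time_steps = trajectory_lib.to_transition(traj)
# time_steps.observation      == traj.observation[:, :-1]  -> shape [2, 2, 4]
# next_time_steps.observation == traj.observation[:, 1:]   -> shape [2, 2, 4]
# time_steps.reward and time_steps.discount are all zeros (undefined).
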
Example 4
def to_n_step_transition(
    trajectory: Trajectory,
    gamma: types.Float
) -> Transition:
  """Create an n-step transition from a trajectory with `T=N + 1` frames.

  **NOTE** Tensors of `trajectory` are sliced along their *second* (`time`)
  dimension, to pull out the appropriate fields for the n-step transitions.

  The output transition's `next_time_step.{reward, discount}` will contain
  N-step discounted reward and discount values calculated as:

  ```
  next_time_step.reward = r_t +
                          g^{1} * d_t * r_{t+1} +
                          g^{2} * d_t * d_{t+1} * r_{t+2} +
                          g^{3} * d_t * d_{t+1} * d_{t+2} * r_{t+3} +
                          ...
                          g^{N-1} * d_t * ... * d_{t+N-2} * r_{t+N-1}
  next_time_step.discount = g^{N-1} * d_t * d_{t+1} * ... * d_{t+N-1}
  ```

  In python notation:

  ```python
  discount = gamma**(N-1) * reduce_prod(trajectory.discount[:, :-1])
  reward = discounted_return(
      rewards=trajectory.reward[:, :-1],
      discounts=gamma * trajectory.discount[:, :-1])
  ```

  When `trajectory.discount[:, :-1]` is an all-ones tensor, this is equivalent
  to:

  ```python
  next_time_step.discount = (
      gamma**(N-1) * tf.ones_like(trajectory.discount[:, 0]))
  next_time_step.reward = (
      sum_{n=0}^{N-1} gamma**n * trajectory.reward[:, n])
  ```

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must have
      shape `[B, T, ...]`.  `discount` is assumed to be a scalar float,
      hence the shape of `trajectory.discount` must be `[B, T]`.
    gamma: A floating point scalar; the discount factor.

  Returns:
    An N-step `Transition` where `N = T - 1`.  The reward and discount in
    `time_step.{reward, discount}` are NaN.  The n-step discounted reward
    and final discount are stored in `next_time_step.{reward, discount}`.
    All tensors in the `Transition` have shape `[B, ...]` (no time dimension).

  Raises:
    ValueError: if `discount.shape.rank != 2`.
    ValueError: if `discount.shape[1] < 2`.
  """
  _validate_rank(trajectory.discount, min_rank=2, max_rank=2)

  # Use static values when available, so that we can use XLA when the time
  # dimension is fixed.
  time_dim = (tf.compat.dimension_value(trajectory.discount.shape[1])
              or tf.shape(trajectory.discount)[1])

  static_time_dim = tf.get_static_value(time_dim)
  if static_time_dim in (0, 1):
    raise ValueError(
        'Trajectory frame count must be at least 2, but saw {}.  Shape of '
        'trajectory.discount: {}'.format(static_time_dim,
                                         trajectory.discount.shape))

  n = time_dim - 1

  # Use composite calculations to ensure we properly handle SparseTensor etc in
  # the observations.

  # pylint: disable=g-long-lambda

  # Pull out x[:,0] for x in trajectory
  first_frame = tf.nest.map_structure(
      lambda t: composite.squeeze(
          composite.slice_to(t, axis=1, end=1),
          axis=1),
      trajectory)

  # Pull out x[:,-1] for x in trajectory
  final_frame = tf.nest.map_structure(
      lambda t: composite.squeeze(
          composite.slice_from(t, axis=1, start=-1),
          axis=1),
      trajectory)
  # pylint: enable=g-long-lambda

  # When computing discounted return, we need to throw out the last time
  # index of both reward and discount, which are filled with dummy values
  # to match the dimensions of the observation.
  reward = trajectory.reward[:, :-1]
  discount = trajectory.discount[:, :-1]

  policy_steps = policy_step.PolicyStep(
      action=first_frame.action, state=(), info=first_frame.policy_info)

  discounted_reward = value_ops.discounted_return(
      rewards=reward,
      discounts=gamma * discount,
      time_major=False,
      provide_all_returns=False)

  # NOTE: `final_discount` will have one less discount than `discount`.
  # This is so that when the learner/update uses an additional
  # discount (e.g. gamma) we don't apply it twice.
  final_discount = gamma**(n-1) * tf.math.reduce_prod(discount, axis=1)

  time_steps = ts.TimeStep(
      first_frame.step_type,
      # unknown
      reward=tf.nest.map_structure(
          lambda r: np.nan * tf.ones_like(r), first_frame.reward),
      # unknown
      discount=np.nan * tf.ones_like(first_frame.discount),
      observation=first_frame.observation)
  next_time_steps = ts.TimeStep(
      step_type=final_frame.step_type,
      reward=discounted_reward,
      discount=final_discount,
      observation=final_frame.observation)
  return Transition(time_steps, policy_steps, next_time_steps)
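
A hedged numeric check of the docstring's n-step reward and discount formulas, using the same `value_ops.discounted_return` call as the function body; the toy rewards and `gamma` are illustrative assumptions, not values from the source.

import tensorflow as tf
from tf_agents.utils import value_ops

gamma = 0.9
# trajectory.reward[:, :-1] and trajectory.discount[:, :-1] for T = 4 frames (N = 3).
reward = tf.constant([[1.0, 2.0, 3.0]])
discount = tf.ones([1, 3])

n_step_reward = value_ops.discounted_return(
    rewards=reward,
    discounts=gamma * discount,
    time_major=False,
    provide_all_returns=False)
# n_step_reward == 1 + 0.9 * 2 + 0.9**2 * 3 == 5.23
n_step_discount = gamma**(3 - 1) * tf.reduce_prod(discount, axis=1)
# n_step_discount == 0.81, i.e. gamma**(N-1) when all discounts are 1.
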