Code example #1
 def setUp(self):
   super(PengsQLambdaTest, self).setUp()
   # Tensor dimensions below: TxBxA (time, batch id, action).
   self.q_tm1 = tf.constant([[[1.1, 2.1], [2.1, 3.1]],
                             [[-1.1, 1.1], [-1.1, 0.1]],
                             [[3.1, -3.1], [-2.1, -1.1]]], dtype=tf.float32)
   self.q_t = tf.constant([[[1.2, 2.2], [4.2, 2.2]], [[-1.2, 0.2], [1.2, 1.2]],
                           [[2.2, -1.2], [-1.2, -2.2]]],
                          dtype=tf.float32)
   # Tensor dimensions below: TxB (time, batch id).
   self.a_tm1 = tf.constant([[0, 1], [1, 0], [0, 0]], dtype=tf.int32)
   self.pcont_t = tf.constant([[0.00, 0.88], [0.89, 1.00], [0.85, 0.83]],
                              dtype=tf.float32)
   self.r_t = tf.constant([[-1.3, 1.3], [-1.3, 5.3], [2.3, -3.3]],
                          dtype=tf.float32)
   self.lambda_scalar = 0.5
   self.lambda_ = tf.constant([[self.lambda_scalar, self.lambda_scalar],
                               [self.lambda_scalar, self.lambda_scalar],
                               [self.lambda_scalar, self.lambda_scalar]],
                              dtype=tf.float32)
   # Evaluate trusted values by defining lambda_ as a tensor.
   self.qlearning_reference = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                                         self.pcont_t, self.q_t, self.lambda_)
   # Evaluate values by defining lambda_ as a python number.
   self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t, self.pcont_t,
                               self.q_t, self.lambda_scalar)
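The two rl.qlambda calls above build the same op, first with lambda_ given as a [T, B] tensor and then with the equivalent Python scalar. A comparison test along the following lines (a sketch, not part of the original class; it assumes the op returns a namedtuple with a loss field and that the TF1 test_session helper is available) would check that the two parameterisations agree:

 def testScalarLambdaMatchesTensorLambda(self):
   # Hypothetical check: a scalar lambda_ should behave like a [T, B]
   # tensor filled with that value, so both setups should produce the
   # same loss.
   with self.test_session() as sess:
     loss_reference, loss = sess.run(
         [self.qlearning_reference.loss, self.qlearning.loss])
   self.assertAllClose(loss_reference, loss)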
Code example #2
 def setUp(self):
     super(PengsQLambdaTest, self).setUp()
     # Tensor dimensions below: TxBxA (time, batch id, action).
     self.q_tm1 = tf.constant(
         [[[1.1, 2.1], [2.1, 3.1]], [[-1.1, 1.1], [-1.1, 0.1]],
          [[3.1, -3.1], [-2.1, -1.1]]],
         dtype=tf.float32)
     self.q_t = tf.constant(
         [[[1.2, 2.2], [4.2, 2.2]], [[-1.2, 0.2], [1.2, 1.2]],
          [[2.2, -1.2], [-1.2, -2.2]]],
         dtype=tf.float32)
     # Tensor dimensions below: TxB (time, batch id).
     self.a_tm1 = tf.constant([[0, 1], [1, 0], [0, 0]], dtype=tf.int32)
     self.pcont_t = tf.constant([[0.00, 0.88], [0.89, 1.00], [0.85, 0.83]],
                                dtype=tf.float32)
     self.r_t = tf.constant([[-1.3, 1.3], [-1.3, 5.3], [2.3, -3.3]],
                            dtype=tf.float32)
     self.lambda_scalar = 0.5
     self.lambda_ = tf.constant([[self.lambda_scalar, self.lambda_scalar],
                                 [self.lambda_scalar, self.lambda_scalar],
                                 [self.lambda_scalar, self.lambda_scalar]],
                                dtype=tf.float32)
     # Evaluate trusted values by defining lambda_ as a tensor.
     self.qlearning_reference = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                                           self.pcont_t, self.q_t,
                                           self.lambda_)
     # Evaluate values by defining lambda_ as a python number.
     self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                                 self.pcont_t, self.q_t, self.lambda_scalar)
Code example #3
 def testCompatibilityCheck(self):
     r_t = tf.placeholder(tf.float32, [4, 2])
     with self.assertRaisesRegexp(
             ValueError,
             "QLambda: Error in rank and/or compatibility check"):
         self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, r_t,
                                     self.pcont_t, self.q_t, self.lambda_)
Code example #4
 def setUp(self):
     super(QLambdaTest, self).setUp()
     # Tensor dimensions below: TxBxA (time, batch id, action).
     self.q_tm1 = tf.constant(
         [[[1.1, 2.1], [2.1, 3.1]], [[-1.1, 1.1], [-1.1, 0.1]],
          [[3.1, -3.1], [-2.1, -1.1]]],
         dtype=tf.float32)
     self.q_t = tf.constant(
         [[[1.2, 2.2], [4.2, 2.2]], [[-1.2, 0.2], [1.2, 1.2]],
          [[2.2, -1.2], [-1.2, -2.2]]],
         dtype=tf.float32)
     # Tensor dimensions below: TxB (time, batch id).
     self.a_tm1 = tf.constant([[0, 1], [1, 0], [0, 0]], dtype=tf.int32)
     self.pcont_t = tf.constant([[0.00, 0.88], [0.89, 1.00], [0.85, 0.83]],
                                dtype=tf.float32)
     self.r_t = tf.constant([[-1.3, 1.3], [-1.3, 5.3], [2.3, -3.3]],
                            dtype=tf.float32)
     self.lambda_ = tf.constant([[0.67, 0.68], [0.65, 0.69], [0.66, 0.64]],
                                dtype=tf.float32)
     self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                                 self.pcont_t, self.q_t, self.lambda_)
     # Evaluate target Q-values used for testing.
     # t20 is Target for timestep 2, batch 0
     self.t20 = 2.2 * 0.85 + 2.3
     self.t10 = (self.t20 * 0.65 + 0.2 * (1 - 0.65)) * 0.89 - 1.3
     self.t00 = (self.t10 * 0.67 + 2.2 * (1 - 0.67)) * 0.00 - 1.3
     self.t21 = -1.2 * 0.83 - 3.3
     self.t11 = (self.t21 * 0.69 + 1.2 * (1 - 0.69)) * 1.00 + 5.3
     self.t01 = (self.t11 * 0.68 + 4.2 * (1 - 0.68)) * 0.88 + 1.3
Code example #5
 def setUp(self):
   super(QLambdaTest, self).setUp()
   # Tensor dimensions below: TxBxA (time, batch id, action).
   self.q_tm1 = tf.constant([[[1.1, 2.1], [2.1, 3.1]],
                             [[-1.1, 1.1], [-1.1, 0.1]],
                             [[3.1, -3.1], [-2.1, -1.1]]], dtype=tf.float32)
   self.q_t = tf.constant([[[1.2, 2.2], [4.2, 2.2]],
                           [[-1.2, 0.2], [1.2, 1.2]],
                           [[2.2, -1.2], [-1.2, -2.2]]], dtype=tf.float32)
   # Tensor dimensions below: TxB (time, batch id).
   self.a_tm1 = tf.constant([[0, 1],
                             [1, 0],
                             [0, 0]], dtype=tf.int32)
   self.pcont_t = tf.constant([[0.00, 0.88],
                               [0.89, 1.00],
                               [0.85, 0.83]], dtype=tf.float32)
   self.r_t = tf.constant([[-1.3, 1.3],
                           [-1.3, 5.3],
                           [2.3, -3.3]], dtype=tf.float32)
   self.lambda_ = tf.constant([[0.67, 0.68],
                               [0.65, 0.69],
                               [0.66, 0.64]], dtype=tf.float32)
   self.qlearning = rl.qlambda(self.q_tm1, self.a_tm1, self.r_t,
                               self.pcont_t, self.q_t, self.lambda_)
   # Evaluate target Q-values used for testing.
   # t20 is Target for timestep 2, batch 0
   self.t20 = 2.2 * 0.85 + 2.3
   self.t10 = (self.t20 * 0.65 + 0.2 * (1 - 0.65)) * 0.89 - 1.3
   self.t00 = (self.t10 * 0.67 + 2.2 * (1 - 0.67)) * 0.00 - 1.3
   self.t21 = -1.2 * 0.83 - 3.3
   self.t11 = (self.t21 * 0.69 + 1.2 * (1 - 0.69)) * 1.00 + 5.3
   self.t01 = (self.t11 * 0.68 + 4.2 * (1 - 0.68)) * 0.88 + 1.3
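The hand-computed targets above follow a backward recursion: the final step bootstraps on the greedy value, G_T = r_T + pcont_T * max_a q_T(a), and each earlier step mixes the next target with the greedy value, G_t = r_t + pcont_t * (lambda_t * G_{t+1} + (1 - lambda_t) * max_a q_t(a)). A standalone NumPy sketch (illustrative only; the arrays simply copy the constants from the setUp above) reproduces the same numbers:

import numpy as np

# Constants copied from the setUp above (only those the targets need).
q_t = np.array([[[1.2, 2.2], [4.2, 2.2]],
                [[-1.2, 0.2], [1.2, 1.2]],
                [[2.2, -1.2], [-1.2, -2.2]]])            # [T, B, A]
r_t = np.array([[-1.3, 1.3], [-1.3, 5.3], [2.3, -3.3]])  # [T, B]
pcont_t = np.array([[0.00, 0.88], [0.89, 1.00], [0.85, 0.83]])
lambda_ = np.array([[0.67, 0.68], [0.65, 0.69], [0.66, 0.64]])

q_max = q_t.max(axis=-1)  # [T, B]: greedy value max_a q_t(a).
targets = np.zeros_like(r_t)
targets[-1] = r_t[-1] + pcont_t[-1] * q_max[-1]
for t in reversed(range(len(r_t) - 1)):
  mixed = lambda_[t] * targets[t + 1] + (1 - lambda_[t]) * q_max[t]
  targets[t] = r_t[t] + pcont_t[t] * mixed

# targets[2, 0] matches t20, targets[1, 0] matches t10, and so on for the
# remaining entries.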
Code example #6
 def testRankCheck(self):
     q_tm1 = tf.placeholder(tf.float32, [None, 3])
     with self.assertRaisesRegexp(
             ValueError,
             "QLambda: Error in rank and/or compatibility check"):
         self.qlearning = rl.qlambda(q_tm1, self.a_tm1, self.r_t,
                                     self.pcont_t, self.q_t,
                                     self.lambda_scalar)
Code example #7
File: pixel_control_ops.py  Project: wmiao1769/trfl
def pixel_control_loss(
    observations, actions, action_values, cell_size, discount_factor,
    scale, crop_height_dim=(None, None), crop_width_dim=(None, None)):
  """Calculate n-step Q-learning loss for pixel control auxiliary task.

  For each pixel-based pseudo reward signal, the corresponding action-value
  function is trained off-policy, using Q(lambda). A discount of 0.9 is
  commonly used for learning the value functions.

  Note that, since pseudo rewards have a spatial structure, with neighbouring
  cells exhibiting strong correlations, it is convenient to predict the action
  values for all the cells through a deconvolutional head.

  See "Reinforcement Learning with Unsupervised Auxiliary Tasks" by Jaderberg,
  Mnih, Czarnecki et al. (https://arxiv.org/abs/1611.05397).

  Args:
    observations: A tensor of shape `[T+1,B, ...]`; `...` is the observation
      shape, `T` the sequence length, and `B` the batch size. `T` and `B` can
      be statically unknown for `observations`, `actions` and `action_values`.
    actions: A tensor, shape `[T,B]`, of the actions across each sequence.
    action_values: A tensor, shape `[T+1,B,H,W,N]` of pixel control action
      values, where `H`, `W` are the number of pixel control cells/tasks, and
      `N` is the number of actions.
    cell_size: size of the cells used to derive the pixel based pseudo-rewards.
    discount_factor: discount used for learning the value function associated
      to the pseudo rewards; must be a scalar or a Tensor of shape [T,B].
    scale: scale factor for pixels in `observations`.
    crop_height_dim: tuple (min_height, max_height) specifying how
      to crop the input observations before computing the pseudo-rewards.
    crop_width_dim: tuple (min_width, max_width) specifying how
      to crop the input observations before computing the pseudo-rewards.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `spatial_loss`: per-cell, per-timestep losses, shape [T,B,H,W].
        * `pseudo_rewards`: pixel control pseudo-rewards derived from the
          observations, shape [T,B,H,W].

  Raises:
    ValueError: if the shape of `action_values` is not compatible with that of
      the pseudo-rewards derived from the observations.
  """
  # Useful shapes.
  sequence_length, batch_size = base_ops.best_effort_shape(actions)
  num_actions = action_values.get_shape().as_list()[-1]
  height_width_q = action_values.get_shape().as_list()[2:-1]
  # Calculate rewards using the observations. Crop observations if appropriate.
  if crop_height_dim[0] is not None:
    h_low, h_high = crop_height_dim
    observations = observations[:, :, h_low:h_high, :]
  if crop_width_dim[0] is not None:
    w_low, w_high = crop_width_dim
    observations = observations[:, :, :, w_low:w_high]
  # Rescale observations by a constant factor.
  observations *= tf.constant(scale)
  # Compute pseudo-rewards and get their shape.
  pseudo_rewards = pixel_control_rewards(observations, cell_size)
  height_width = pseudo_rewards.get_shape().as_list()[2:]
  # Check that pseudo-rewards and Q-values are compatible in shape.
  if height_width != height_width_q:
    raise ValueError(
        "Pixel Control values are not compatible with the shape of the "
        "pseudo-rewards derived from the observation. Pseudo-rewards have "
        "shape {}, while Pixel Control values have shape {}".format(
            height_width, height_width_q))
  # We now have Q(s,a) and rewards, so can calculate the n-step loss. The
  # QLambda loss op expects inputs of shape [T,B,N] and [T,B], but our tensors
  # are in a variety of incompatible shapes. The state-action values have
  # shape [T,B,H,W,N] and rewards have shape [T,B,H,W]. We can think of the
  # [H,W] dimensions as extra batch dimensions for the purposes of the loss
  # calculation, so we first collapse [B,H,W] into a single dimension.
  q_tm1 = tf.reshape(
      action_values[:-1],  # [T,B,H,W,N].
      [sequence_length, -1, num_actions],
      name="q_tm1")  # [T,BHW,N].
  r_t = tf.reshape(
      pseudo_rewards,  # [T,B,H,W].
      [sequence_length, -1],
      name="r_t")  # [T,BHW].
  q_t = tf.reshape(
      action_values[1:],  # [T,B,H,W,N].
      [sequence_length, -1, num_actions],
      name="q_t")  # [T,BHW,N].
  # The actions tensor is of shape [T,B], and is the same for each H and W.
  # We thus expand it to be same shape as the reward tensor, [T,BHW].
  expanded_actions = tf.expand_dims(tf.expand_dims(actions, -1), -1)
  a_tm1 = tf.tile(
      expanded_actions, multiples=[1, 1] + height_width)  # [T,B,H,W].
  a_tm1 = tf.reshape(a_tm1, [sequence_length, -1])  # [T,BHW].
  # We similarly expand-and-tile the discount to [T,BHW].
  discount_factor = tf.convert_to_tensor(discount_factor)
  if discount_factor.shape.ndims == 0:
    pcont_t = tf.reshape(discount_factor, [1, 1])  # [1,1].
    pcont_t = tf.tile(pcont_t, tf.shape(a_tm1))  # [T,BHW].
  elif discount_factor.shape.ndims == 2:
    tiled_pcont = tf.tile(
        tf.expand_dims(tf.expand_dims(discount_factor, -1), -1),
        [1, 1] + height_width)
    pcont_t = tf.reshape(tiled_pcont, [sequence_length, -1])
  else:
    raise ValueError(
        "The discount_factor must be a scalar or a tensor of rank 2. "
        "Instead it is a tensor of shape {}".format(
            discount_factor.shape.as_list()))
  # Compute a QLambda loss of shape [T,BHW]
  loss, _ = action_value_ops.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=1)
  # Take sum over sequence, sum over cells.
  expanded_shape = [sequence_length, batch_size] + height_width
  spatial_loss = tf.reshape(loss, expanded_shape)  # [T,B,H,W].
  # Return.
  extra = PixelControlExtra(
      spatial_loss=spatial_loss, pseudo_rewards=pseudo_rewards)
  return base_ops.LossOutput(
      tf.reduce_sum(spatial_loss, axis=[0, 2, 3]), extra)  # [B]
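A typical call looks roughly as follows (a sketch only: the shape constants, the 84x84 frame size, the cell size of 4 and the 1/255 scale are illustrative assumptions, not values prescribed by the function). Note that action_values covers one more timestep than actions, and its spatial dimensions must equal the observation height and width divided by cell_size:

# Illustrative usage in TF1 graph mode, with hypothetical shape constants.
T, B, H, W, N, cell = 20, 32, 84, 84, 6, 4
observations = tf.placeholder(tf.float32, [T + 1, B, H, W, 3])
actions = tf.placeholder(tf.int32, [T, B])
action_values = tf.placeholder(tf.float32, [T + 1, B, H // cell, W // cell, N])
loss, extra = pixel_control_loss(
    observations, actions, action_values,
    cell_size=cell, discount_factor=0.9, scale=1.0 / 255.0)
# loss: shape [B]; extra.spatial_loss: shape [T, B, H // cell, W // cell].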
Code example #8
 def testCompatibilityCheck(self):
   a_tm1 = tf.placeholder(tf.int32, [5, 2])
   with self.assertRaisesRegexp(
       ValueError, "QLambda: Error in rank and/or compatibility check"):
     self.qlearning = rl.qlambda(self.q_tm1, a_tm1, self.r_t, self.pcont_t,
                                 self.q_t, self.lambda_scalar)
Code example #9
 def testRankCheck(self):
   q_tm1 = tf.placeholder(tf.float32, [None, 3])
   with self.assertRaisesRegexp(
       ValueError, "QLambda: Error in rank and/or compatibility check"):
     self.qlearning = rl.qlambda(q_tm1, self.a_tm1, self.r_t, self.pcont_t,
                                 self.q_t, self.lambda_scalar)
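For contrast with the deliberately wrong shapes in the rank and compatibility tests above, the sketch below (placeholder names and the scalar lambda value are illustrative) shows shapes that the checks accept, matching the setUp methods earlier in this listing (T=3, B=2, A=2):

# Shapes accepted by rl.qlambda, mirroring the setUp methods above.
q_tm1 = tf.placeholder(tf.float32, [3, 2, 2])   # [T, B, A]
q_t = tf.placeholder(tf.float32, [3, 2, 2])     # [T, B, A]
a_tm1 = tf.placeholder(tf.int32, [3, 2])        # [T, B]
r_t = tf.placeholder(tf.float32, [3, 2])        # [T, B]
pcont_t = tf.placeholder(tf.float32, [3, 2])    # [T, B]
loss, extra = rl.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=0.5)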