Example 1
def _materialised_conv_layer_dual_objective(w, b, padding, strides, lam_in,
                                            mu_out, lb, ub):
    """Materialised version of `conv_layer_dual_objective`."""
    # Flatten the inputs, as the materialised convolution will have no
    # spatial structure.
    mu_out_flat = snt.BatchFlatten(preserve_dims=2)(mu_out)

    # Materialise the convolution as a (sparse) fully connected linear layer.
    w_flat, b_flat = layer_utils.materialise_conv(w,
                                                  b,
                                                  lb.shape[1:].as_list(),
                                                  padding=padding,
                                                  strides=strides)

    activation_coeffs = -tf.tensordot(
        mu_out_flat, tf.transpose(w_flat), axes=1)
    dual_obj_bias = -tf.tensordot(mu_out_flat, b_flat, axes=1)

    # Flatten the inputs, as the materialised convolution will have no
    # spatial structure.
    if lam_in is not None:
        lam_in = snt.FlattenTrailingDimensions(2)(lam_in)
    lb = snt.BatchFlatten()(lb)
    ub = snt.BatchFlatten()(ub)

    return standard_layer_calcs.linear_dual_objective(lam_in,
                                                      activation_coeffs,
                                                      dual_obj_bias, lb, ub)
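
A minimal standalone sketch of the idea behind `layer_utils.materialise_conv` used above: since a convolution (ignoring the bias) is linear in its input, convolving the identity basis with `tf.nn.conv2d` recovers the equivalent dense weight matrix. The sizes, imports and session below are illustrative assumptions, not part of the library.

import numpy as np
import tensorflow as tf  # Assumes TensorFlow 1.x (graph mode), as above.

# Assumed toy sizes: 5x5 input with 2 channels, 3x3 kernel, 3 output channels.
h, w_in, c_in, c_out, k = 5, 5, 2, 3, 3
x = tf.constant(np.random.rand(1, h, w_in, c_in), dtype=tf.float32)
kernel = tf.constant(np.random.rand(k, k, c_in, c_out), dtype=tf.float32)

# Direct convolution: [1, h', w', c_out].
conv_out = tf.nn.conv2d(x, kernel, strides=[1, 1, 1, 1], padding="VALID")

# Materialise the same convolution as a dense matrix by convolving the
# identity basis: row i is the flattened image of the i-th unit input.
basis = tf.reshape(tf.eye(h * w_in * c_in), [h * w_in * c_in, h, w_in, c_in])
w_flat = tf.reshape(
    tf.nn.conv2d(basis, kernel, strides=[1, 1, 1, 1], padding="VALID"),
    [h * w_in * c_in, -1])  # [input_size, output_size].

# Applying the materialised matrix reproduces the convolution output.
matmul_out = tf.matmul(tf.reshape(x, [1, -1]), w_flat)

with tf.Session() as sess:
    direct, dense = sess.run([tf.reshape(conv_out, [1, -1]), matmul_out])
    print(np.allclose(direct, dense, atol=1e-5))  # True
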
Example 2
def pixel_control_rewards(observations, cell_size):
    """Calculates pixel control task rewards from observation sequence.

    The observations are first split into a grid of KxK cells. For each cell a
    distinct pseudo reward is computed as the average absolute change in pixel
    intensity for all pixels in the cell. The change in intensity is averaged
    across both pixels and channels (e.g. RGB).

    The `observations` provided to this function should be cropped suitably, to
    ensure that the observations' height and width are multiples of `cell_size`.
    The values of the `observations` tensor should be rescaled to [0, 1]. In the
    UNREAL agent observations are cropped to 80x80, and each cell is 4x4 in size.

    See "Reinforcement Learning with Unsupervised Auxiliary Tasks" by Jaderberg,
    Mnih, Czarnecki et al. (https://arxiv.org/abs/1611.05397).

    Args:
      observations: A tensor of shape `[T+1,B,H,W,C...]`, where
        * `T` is the sequence length, `B` is the batch size.
        * `H` is height, `W` is width.
        * `C...` is at least one channel dimension (e.g., colour, stack).
        * `T` and `B` can be statically unknown.
      cell_size: The size of each cell.

    Returns:
      A tensor of pixel control rewards calculated from the observation. The
      shape is `[T,B,H',W']`, where `H'` and `W'` are determined by the
      `cell_size`. If evenly divisible, `H' = H/cell_size`, and similarly for `W`.
    """
    # Calculate the absolute differences across the sequence.
    abs_observation_diff = tf.abs(observations[1:] - observations[:-1])
    # Average over cells. abs_observation_diff has shape [T,B,H,W,C...], e.g.,
    # [T,B,H,W,C] if we have a colour channel. We want to use the TF avg_pool
    # op, but it expects 4D inputs. We collapse T and B then collapse all channel
    # dimensions. After pooling, we can then undo the sequence/batch collapse.
    obs_shape = abs_observation_diff.get_shape().as_list()
    # Collapse sequence and batch into one: [TB,H,W,C...].
    abs_diff = tf.reshape(abs_observation_diff, [-1] + obs_shape[2:])
    # Merge remaining dimensions after W: [TB,H,W,C'].
    abs_diff = snt.FlattenTrailingDimensions(dim_from=3)(abs_diff)
    # Apply the averaging using average pooling and reducing over channel.
    avg_abs_diff = tf.nn.avg_pool(abs_diff,
                                  ksize=[1, cell_size, cell_size, 1],
                                  strides=[1, cell_size, cell_size, 1],
                                  padding="VALID")  # [TB, H', W', C'].
    avg_abs_diff = tf.reduce_mean(avg_abs_diff, axis=[3])  # [TB,H',W'].
    # Restore sequence and batch dimensions, and static shape info where possible.
    pseudo_rewards = tf.reshape(
        avg_abs_diff,
        [tf.shape(abs_observation_diff)[0],
         tf.shape(abs_observation_diff)[1],
         tf.shape(avg_abs_diff)[1],
         tf.shape(avg_abs_diff)[2]],
        name="pseudo_rewards")  # [T,B,H',W'].
    sequence_batch = abs_observation_diff.get_shape()[:2]
    new_height_width = avg_abs_diff.get_shape()[1:]
    pseudo_rewards.set_shape(sequence_batch.concatenate(new_height_width))
    return pseudo_rewards
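
A quick usage sketch for the function above, assuming `pixel_control_rewards` is in scope together with TensorFlow 1.x and Sonnet 1.x (`snt`); the tensor sizes follow the UNREAL-style 80x80 crop mentioned in the docstring and are otherwise arbitrary.

import numpy as np
import tensorflow as tf  # Assumes TensorFlow 1.x and Sonnet 1.x, as above.

# Assumed toy input: T+1 = 11 frames, batch of 2, 80x80 RGB rescaled to [0, 1].
observations = tf.constant(np.random.rand(11, 2, 80, 80, 3), dtype=tf.float32)
rewards = pixel_control_rewards(observations, cell_size=4)

with tf.Session() as sess:
    print(sess.run(tf.shape(rewards)))  # [10  2 20 20], i.e. [T,B,H',W'].
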
Example 3
    def _torso(self, input_):
        """Processing of all the visual and language inputs to the LSTM core."""

        # Extract the inputs
        last_action, env_output = input_
        last_reward, _, _, observation = env_output
        frame = observation[self._idx_frame]
        goal = observation[self._idx_goal]
        goal = tf.to_float(goal)

        # Convert the image to floats and normalise.
        frame = tf.to_float(frame)
        frame = snt.FlattenTrailingDimensions(dim_from=3)(frame)
        frame /= 255.0

        # Feed image through convnet.
        with tf.variable_scope('convnet'):
            # Convolutional layers.
            conv_out = self._convnet(frame)
            # Fully connected layer.
            conv_out = snt.BatchFlatten()(conv_out)
            conv_out = snt.Linear(256)(conv_out)
            conv_out = tf.nn.relu(conv_out)

        # Concatenate outputs of the visual and instruction pathways.
        if self._feed_action_and_reward:
            # Append clipped last reward and one hot last action.
            tf.logging.info('Append last reward clipped to: %f',
                            self._max_reward)
            clipped_last_reward = tf.expand_dims(
                tf.clip_by_value(last_reward, -self._max_reward,
                                 self._max_reward), -1)
            tf.logging.info('Append last action (one-hot of %d)',
                            self._num_actions)
            one_hot_last_action = tf.one_hot(last_action, self._num_actions)
            tf.logging.info('Append goal:')
            tf.logging.info(goal)
            action_and_reward = tf.concat(
                [clipped_last_reward, one_hot_last_action], axis=1)
        else:
            action_and_reward = tf.constant([0], dtype=tf.float32)
        return conv_out, action_and_reward, goal
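
A self-contained sketch of the action/reward conditioning performed in the `if self._feed_action_and_reward:` branch above, with assumed toy values (batch of 2, 4 possible actions, `self._max_reward` of 1.0):

import tensorflow as tf  # Assumes TensorFlow 1.x (graph mode), as above.

# Assumed toy values: batch of 2, 4 possible actions, reward clipped to [-1, 1].
last_reward = tf.constant([0.3, -5.0])
last_action = tf.constant([2, 0])

clipped_last_reward = tf.expand_dims(
    tf.clip_by_value(last_reward, -1.0, 1.0), -1)         # [2, 1]
one_hot_last_action = tf.one_hot(last_action, 4)          # [2, 4]
action_and_reward = tf.concat(
    [clipped_last_reward, one_hot_last_action], axis=1)   # [2, 5]

with tf.Session() as sess:
    print(sess.run(action_and_reward))
    # [[ 0.3  0.   0.   1.   0. ]
    #  [-1.   1.   0.   0.   0. ]]
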