Esempio n. 1
0
    def inject_additional_input(self, layer, inputs, name, mode="concat"):
        """Injects an additional input into a layer.

        Args:
          layer: layer that the input should be injected to.
          inputs: inputs to be injected. Outside "concat" mode they are
            reshaped to [-1, 1, 1, C], where C is their last dimension.
          name: TF name used for the created encoding / dense layers.
          mode: how the input should be added to the layer:
            "concat" encodes the input to the layer's shape and concatenates
              it on the last axis.
            "multiplicative" projects the input to the layer's channel count
              and multiplies it into the layer.
            "multi_additive" multiplies the layer by a sigmoid of one
              projection ("<name>_mul") and then adds a second projection
              ("<name>_add").

        Returns:
          The updated layer.

        Raises:
          ValueError: in case of unknown mode. The check runs before any op
            is created, so a bad mode leaves the graph untouched.
        """
        # Fail fast: reject an unknown mode before touching the graph.
        if mode not in ("concat", "multiplicative", "multi_additive"):
            raise ValueError("Unknown injection mode: %s" % mode)

        layer_shape = common_layers.shape_list(layer)
        if mode == "concat":
            emb = common_video.encode_to_shape(inputs, layer_shape, name)
            return tf.concat(values=[layer, emb], axis=-1)

        input_shape = common_layers.shape_list(inputs)
        filters = layer_shape[-1]
        if mode == "multiplicative":
            # The zeros tensor is only needed here: adding it expands the
            # projected input to the full layer shape before multiplying.
            zeros_mask = tf.zeros(layer_shape, dtype=tf.float32)
            input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
            input_mask = tf.layers.dense(input_reshaped, filters, name=name)
            input_broad = input_mask + zeros_mask
            return layer * input_broad

        # Remaining mode: "multi_additive".
        input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
        input_mul = tf.layers.dense(input_reshaped,
                                    filters,
                                    name=name + "_mul")
        layer *= tf.nn.sigmoid(input_mul)
        input_add = tf.layers.dense(input_reshaped,
                                    filters,
                                    name=name + "_add")
        layer += input_add
        return layer
Esempio n. 2
0
  def video_features(
      self, all_frames, all_actions, all_rewards, all_raw_frames):
    """Computes a video-wide latent and decodes it to the tower output shape.

    All frames are stacked along axis 1 and passed to
    `construct_latent_tower`; its mean and std outputs are concatenated on
    the last axis and only the shape of that tensor is reused for decoding.

    When not training, the latent is a tensor of uniformly sampled -1/+1
    values of shape [batch, bottleneck_bits]. When training, the tower
    output is projected to `bottleneck_bits` units, perturbed with truncated
    normal noise, squashed with tanh, sign-flipped wherever a uniform sample
    is not above `hparams.bottleneck_noise`, and then, per example, either
    replaced by its hard sign (gradient passing through the soft value) or
    kept soft.

    Args:
      all_frames: list of frame tensors; stacked along a new axis 1.
      all_actions: unused.
      all_rewards: unused.
      all_raw_frames: unused.

    Returns:
      A three-element list `[decoded_bits, None, None]`.
    """
    del all_actions, all_rewards, all_raw_frames

    hp = self.hparams
    stacked_frames = tf.stack(all_frames, axis=1)
    mean, std = self.construct_latent_tower(stacked_frames, time_axis=1)
    tower_out = tf.concat([mean, std], axis=-1)
    tower_shape = common_layers.shape_list(tower_out)
    num_examples = tower_shape[0]

    if self.is_training:
      soft_bits = tfl.flatten(tower_out)
      soft_bits = tfl.dense(soft_bits, hp.bottleneck_bits, name="bits_enc")
      bits_shape = common_layers.shape_list(soft_bits)
      jitter = tf.truncated_normal(bits_shape, mean=0.0, stddev=0.2)
      soft_bits = tf.tanh(soft_bits + jitter)
      # +1 where the uniform sample exceeds bottleneck_noise, -1 elsewhere.
      uniform = tf.random_uniform(bits_shape)
      sign_flip = 2.0 * tf.to_float(tf.less(hp.bottleneck_noise, uniform)) - 1.0
      soft_bits = soft_bits * sign_flip
      # Forward value is the hard sign; the gradient flows through soft_bits.
      hard_bits = 2.0 * tf.to_float(tf.less(0.0, soft_bits)) - 1.0
      discrete = soft_bits + tf.stop_gradient(hard_bits - soft_bits)
      # Per-example choice between discrete and soft values; the threshold
      # comes from inverse_lin_decay(discrete_warmup_steps).
      threshold = common_layers.inverse_lin_decay(hp.discrete_warmup_steps)
      pick_discrete = tf.less(tf.random_uniform([num_examples]), threshold)
      latent_bits = tf.where(pick_discrete, discrete, soft_bits)
    else:
      uniform = tf.random_uniform([num_examples, hp.bottleneck_bits])
      latent_bits = 2.0 * tf.to_float(tf.less(0.5, uniform)) - 1.0

    decoded_bits = common_video.encode_to_shape(
        latent_bits, tower_shape, "bits_dec")
    return [decoded_bits, None, None]
Esempio n. 3
0
 def inject_additional_input(self, layer, inputs, scope, concatenate=True):
     """Injects `inputs` into `layer` by concatenation or multiplication.

     Args:
       layer: layer that the input should be injected to.
       inputs: inputs to be injected.
       scope: TF name passed to the encoding / dense layer that is created.
       concatenate: if true, the inputs are encoded to the layer's shape and
         concatenated on the last axis; otherwise they are projected to the
         layer's channel count and multiplied into the layer.

     Returns:
       The updated layer.
     """
     target_dims = common_layers.shape_list(layer)
     source_dims = common_layers.shape_list(inputs)

     if concatenate:
         embedded = common_video.encode_to_shape(inputs, target_dims, scope)
         return tf.concat(values=[layer, embedded], axis=-1)

     num_channels = target_dims[-1]
     flat_inputs = tf.reshape(inputs, [-1, 1, 1, source_dims[-1]])
     gate = tf.layers.dense(flat_inputs, num_channels, name=scope)
     # Adding zeros of the layer's shape expands the gate to the full shape.
     full_gate = gate + tf.zeros(target_dims, dtype=tf.float32)
     return layer * full_gate
Esempio n. 4
0
def inject_additional_input(layer, inputs, name, mode="multi_additive"):
    """Injects the additional input into the layer.

    Args:
      layer: layer that the input should be injected to.
      inputs: inputs to be injected. Outside "concat" mode they are reshaped
        to [-1, 1, 1, C], where C is their last dimension.
      name: TF scope name.
      mode: how the input should be added to the layer:
        "concat" concats as additional channels.
        "multiplicative" broadcasts inputs and multiplies them into the
          channels.
        "multi_additive" broadcasts inputs, multiplies the channels by a
          sigmoid of one projection and adds a second projection.

    Returns:
      updated layer.

    Raises:
      ValueError: in case of unknown mode. The check runs before any op is
        created, so a bad mode leaves the graph untouched.
    """
    # Fail fast: reject an unknown mode before touching the graph.
    if mode not in ("concat", "multiplicative", "multi_additive"):
        raise ValueError("Unknown injection mode: %s" % mode)

    layer_shape = common_layers.shape_list(layer)
    if mode == "concat":
        emb = common_video.encode_to_shape(inputs, layer_shape, name)
        return tf.concat(values=[layer, emb], axis=-1)

    input_shape = common_layers.shape_list(inputs)
    filters = layer_shape[-1]
    if mode == "multiplicative":
        # The zeros tensor is only needed in this mode: adding it expands
        # the projected input to the full layer shape before multiplying.
        zeros_mask = tf.zeros(layer_shape, dtype=tf.float32)
        input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
        input_mask = tf.layers.dense(input_reshaped, filters, name=name)
        input_broad = input_mask + zeros_mask
        return layer * input_broad

    # Remaining mode: "multi_additive".
    input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
    input_mul = tf.layers.dense(input_reshaped,
                                filters,
                                name=name + "_mul")
    layer *= tf.nn.sigmoid(input_mul)
    input_add = tf.layers.dense(input_reshaped,
                                filters,
                                name=name + "_add")
    layer += input_add
    return layer
Esempio n. 5
0
    def bottom_part_tower(self,
                          input_image,
                          input_reward,
                          action,
                          latent,
                          lstm_state,
                          lstm_size,
                          conv_size,
                          concat_latent=False):
        """The bottom part of predictive towers.

        With the current (early) design, the main prediction tower and
        the reward prediction tower share the same architecture. TF Scope
        can be adjusted as required to either share or not share the weights
        between the two towers.

        The network is three stride-2 convolutions interleaved with five
        conv-LSTM cells and layer norms, followed by a 1x1 convolution that
        mixes in the optional action, reward and latent.

        Args:
          input_image: the current image.
          input_reward: the current reward, or None to skip it.
          action: the action taken by the agent, or None to skip it.
          latent: the latent vector, or None.
          lstm_state: the current internal states of conv lstms. Indexed
            0..4; each entry is overwritten in place with the updated state.
          lstm_size: the size of lstms. Indexed 0..4.
          conv_size: the size of convolutions. Only conv_size[0] is read
            here.
          concat_latent: whether or not to concatenate the latent at every
            step.

        Returns:
          - the output of the partial network.
          - intermediate outputs for skip connections, as the tuple
            (enc0, enc1).
        """
        lstm_func = common_video.conv_lstm_2d
        tile_and_concat = common_video.tile_and_concat

        # NOTE(review): make_even_size presumably pads spatial dims so the
        # stride-2 convolutions below divide cleanly -- confirm in
        # common_layers.
        input_image = common_layers.make_even_size(input_image)
        concat_input_image = tile_and_concat(input_image,
                                             latent,
                                             concat_latent=concat_latent)

        # First stride-2 convolution; enc0 is returned as a skip connection.
        enc0 = tfl.conv2d(concat_input_image,
                          conv_size[0], [5, 5],
                          strides=(2, 2),
                          activation=tf.nn.relu,
                          padding="SAME",
                          name="scale1_conv1")
        enc0 = tfcl.layer_norm(enc0, scope="layer_norm1")

        # Each lstm_func call returns the new state, which is stored back
        # into the caller's lstm_state container.
        hidden1, lstm_state[0] = lstm_func(enc0,
                                           lstm_state[0],
                                           lstm_size[0],
                                           name="state1")
        hidden1 = tile_and_concat(hidden1, latent, concat_latent=concat_latent)
        hidden1 = tfcl.layer_norm(hidden1, scope="layer_norm2")
        hidden2, lstm_state[1] = lstm_func(hidden1,
                                           lstm_state[1],
                                           lstm_size[1],
                                           name="state2")
        hidden2 = tfcl.layer_norm(hidden2, scope="layer_norm3")
        hidden2 = common_layers.make_even_size(hidden2)
        # Second stride-2 convolution keeps hidden2's channel count; enc1 is
        # returned as a skip connection (after the optional latent concat).
        enc1 = tfl.conv2d(hidden2,
                          hidden2.get_shape()[3], [3, 3],
                          strides=(2, 2),
                          padding="SAME",
                          activation=tf.nn.relu,
                          name="conv2")
        enc1 = tile_and_concat(enc1, latent, concat_latent=concat_latent)

        hidden3, lstm_state[2] = lstm_func(enc1,
                                           lstm_state[2],
                                           lstm_size[2],
                                           name="state3")
        hidden3 = tile_and_concat(hidden3, latent, concat_latent=concat_latent)
        hidden3 = tfcl.layer_norm(hidden3, scope="layer_norm4")
        hidden4, lstm_state[3] = lstm_func(hidden3,
                                           lstm_state[3],
                                           lstm_size[3],
                                           name="state4")
        hidden4 = tile_and_concat(hidden4, latent, concat_latent=concat_latent)
        hidden4 = tfcl.layer_norm(hidden4, scope="layer_norm5")
        hidden4 = common_layers.make_even_size(hidden4)
        # Third stride-2 convolution keeps hidden4's channel count.
        enc2 = tfl.conv2d(hidden4,
                          hidden4.get_shape()[3], [3, 3],
                          strides=(2, 2),
                          padding="SAME",
                          activation=tf.nn.relu,
                          name="conv3")

        # Pass in action if exists.
        if action is not None:
            emb_action = common_video.encode_to_shape(action, enc2.get_shape(),
                                                      "action_enc")
            enc2 = tf.concat(values=[enc2, emb_action], axis=3)

        # Pass in reward if exists.
        if input_reward is not None:
            emb_reward = common_video.encode_to_shape(input_reward,
                                                      enc2.get_shape(),
                                                      "reward_enc")
            enc2 = tf.concat(values=[enc2, emb_reward], axis=3)

        # When the latent is not concatenated at every step, it is joined
        # once here on the channel axis.
        if latent is not None and not concat_latent:
            with tf.control_dependencies([latent]):
                enc2 = tf.concat([enc2, latent], axis=3)

        # 1x1 convolution brings the concatenated features back to hidden4's
        # channel count.
        enc3 = tfl.conv2d(enc2,
                          hidden4.get_shape()[3], [1, 1],
                          strides=(1, 1),
                          padding="SAME",
                          activation=tf.nn.relu,
                          name="conv4")

        hidden5, lstm_state[4] = lstm_func(enc3,
                                           lstm_state[4],
                                           lstm_size[4],
                                           name="state5")  # last 8x8
        # Unlike the earlier stages, layer norm is applied before the latent
        # is concatenated here.
        hidden5 = tfcl.layer_norm(hidden5, scope="layer_norm6")
        hidden5 = tile_and_concat(hidden5, latent, concat_latent=concat_latent)
        return hidden5, (enc0, enc1)
Esempio n. 6
0
 def decode_bits(b):
     """Encodes `b` to `tower_output_shape` under the name "bits_dec".

     `tower_output_shape` is taken from the enclosing scope.
     """
     decoded = common_video.encode_to_shape(b, tower_output_shape, "bits_dec")
     return decoded