def inject_additional_input(self, layer, inputs, name, mode="concat"):
  """Injects the additional input into the layer.

  Args:
    layer: layer that the input should be injected to.
    inputs: inputs to be injected.
    name: TF scope name; also used as the name (or name prefix) of the dense
      projections created here.
    mode: how the input should be added to the layer:
      "concat" encodes the inputs to the layer's shape and concatenates them
        on the last axis.
      "multiplicative" projects the inputs to the layer's channel count and
        multiplies the layer by the result.
      "multi_additive" multiplies the layer by a sigmoid of one projection of
        the inputs and then adds a second projection.

  Returns:
    updated layer.

  Raises:
    ValueError: in case of unknown mode.
  """
  layer_shape = common_layers.shape_list(layer)

  if mode == "concat":
    emb = common_video.encode_to_shape(inputs, layer_shape, name)
    layer = tf.concat(values=[layer, emb], axis=-1)
  elif mode == "multiplicative":
    filters = layer_shape[-1]
    input_shape = common_layers.shape_list(inputs)
    # Only this mode needs the layer-sized zeros tensor, so it is built here
    # rather than unconditionally for every mode.
    zeros_mask = tf.zeros(layer_shape, dtype=tf.float32)
    input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
    input_mask = tf.layers.dense(input_reshaped, filters, name=name)
    # Adding zeros expands the [-1, 1, 1, filters] mask to the layer's shape.
    input_broad = input_mask + zeros_mask
    layer *= input_broad
  elif mode == "multi_additive":
    filters = layer_shape[-1]
    input_shape = common_layers.shape_list(inputs)
    input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
    # Gate the layer with a sigmoid of the first projection ...
    input_mul = tf.layers.dense(input_reshaped, filters, name=name + "_mul")
    layer *= tf.nn.sigmoid(input_mul)
    # ... then shift it with a second, independent projection.
    input_add = tf.layers.dense(input_reshaped, filters, name=name + "_add")
    layer += input_add
  else:
    raise ValueError("Unknown injection mode: %s" % mode)

  return layer
def video_features(
    self, all_frames, all_actions, all_rewards, all_raw_frames):
  """Video wide latent.

  Builds a bit-valued latent for the whole video and encodes it back to the
  shape of the latent tower output.

  Args:
    all_frames: list of frame tensors; they are stacked along axis 1.
    all_actions: unused.
    all_rewards: unused.
    all_raw_frames: unused.

  Returns:
    A three element list `[decoded_bits, None, None]`.
  """
  del all_actions, all_rewards, all_raw_frames

  def _to_signs(threshold, values):
    """Returns +1.0 where `threshold < values` and -1.0 elsewhere."""
    return 2.0 * tf.to_float(tf.less(threshold, values)) - 1.0

  hparams = self.hparams
  stacked_frames = tf.stack(all_frames, axis=1)
  mean, std = self.construct_latent_tower(stacked_frames, time_axis=1)
  tower_output = tf.concat([mean, std], axis=-1)
  tower_output_shape = common_layers.shape_list(tower_output)
  batch_size = tower_output_shape[0]

  if self.is_training:
    # Project the tower output down to `bottleneck_bits` pre-activations.
    logits = tfl.dense(
        tfl.flatten(tower_output), hparams.bottleneck_bits, name="bits_enc")
    logits_shape = common_layers.shape_list(logits)
    # Perturb with truncated normal noise before squashing.
    soft_bits = tf.tanh(
        logits + tf.truncated_normal(logits_shape, mean=0.0, stddev=0.2))
    # Multiply by -1 wherever a uniform sample is not above
    # `bottleneck_noise`, and by +1 elsewhere.
    sign_flips = _to_signs(
        hparams.bottleneck_noise, tf.random_uniform(logits_shape))
    soft_bits *= sign_flips
    # Forward value is the hard sign; the gradient flows through `soft_bits`
    # only, because the difference is wrapped in stop_gradient.
    hard_bits = _to_signs(0.0, soft_bits)
    discretized = soft_bits + tf.stop_gradient(hard_bits - soft_bits)
    # Per batch element, choose the discretized bits when a uniform sample
    # falls below `discrete_prob`, otherwise keep the soft values.
    # NOTE(review): presumably this probability ramps up over
    # `discrete_warmup_steps`; confirm against inverse_lin_decay.
    discrete_prob = common_layers.inverse_lin_decay(
        hparams.discrete_warmup_steps)
    use_discrete = tf.less(tf.random_uniform([batch_size]), discrete_prob)
    bits = tf.where(use_discrete, discretized, soft_bits)
  else:
    # Outside training, sample each bit as +1.0 or -1.0 from a uniform draw
    # thresholded at 0.5.
    uniform = tf.random_uniform([batch_size, hparams.bottleneck_bits])
    bits = _to_signs(0.5, uniform)

  decoded_bits = common_video.encode_to_shape(
      bits, tower_output_shape, "bits_dec")
  return [decoded_bits, None, None]
def inject_additional_input(self, layer, inputs, scope, concatenate=True):
  """Injects `inputs` into `layer` by concatenation or multiplication.

  Args:
    layer: tensor the inputs are injected into.
    inputs: tensor to inject.
    scope: name passed to the encoder (concatenate) or to the dense
      projection (multiply).
    concatenate: if True, encode `inputs` to the layer's shape and
      concatenate on the last axis; otherwise multiply the layer by a dense
      projection of `inputs`.

  Returns:
    The updated layer.
  """
  layer_dims = common_layers.shape_list(layer)
  inputs_dims = common_layers.shape_list(inputs)

  if not concatenate:
    num_channels = layer_dims[-1]
    # Reshape to [-1, 1, 1, channels] so the projection can be expanded
    # against the layer.
    reshaped = tf.reshape(inputs, [-1, 1, 1, inputs_dims[-1]])
    mask = tf.layers.dense(reshaped, num_channels, name=scope)
    # Adding a layer-shaped zeros tensor expands the mask to the full shape.
    expanded_mask = mask + tf.zeros(layer_dims, dtype=tf.float32)
    layer *= expanded_mask
    return layer

  encoded = common_video.encode_to_shape(inputs, layer_dims, scope)
  return tf.concat(values=[layer, encoded], axis=-1)
def inject_additional_input(layer, inputs, name, mode="multi_additive"):
  """Injects the additional input into the layer.

  Args:
    layer: layer that the input should be injected to.
    inputs: inputs to be injected.
    name: TF scope name; also used as the name (or name prefix) of the dense
      projections created here.
    mode: how the input should be added to the layer:
      "concat" encodes the inputs to the layer's shape and concatenates them
        as additional channels.
      "multiplicative" projects the inputs to the layer's channel count and
        multiplies them into the channels.
      "multi_additive" multiplies the channels by a sigmoid of one projection
        of the inputs and then adds a second projection.

  Returns:
    updated layer.

  Raises:
    ValueError: in case of unknown mode.
  """
  layer_shape = common_layers.shape_list(layer)

  if mode == "concat":
    emb = common_video.encode_to_shape(inputs, layer_shape, name)
    layer = tf.concat(values=[layer, emb], axis=-1)
  elif mode == "multiplicative":
    filters = layer_shape[-1]
    input_shape = common_layers.shape_list(inputs)
    # The layer-sized zeros tensor is only needed in this mode, so it is
    # created here instead of for every call.
    zeros_mask = tf.zeros(layer_shape, dtype=tf.float32)
    input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
    input_mask = tf.layers.dense(input_reshaped, filters, name=name)
    # Adding zeros expands the [-1, 1, 1, filters] mask to the layer's shape.
    input_broad = input_mask + zeros_mask
    layer *= input_broad
  elif mode == "multi_additive":
    filters = layer_shape[-1]
    input_shape = common_layers.shape_list(inputs)
    input_reshaped = tf.reshape(inputs, [-1, 1, 1, input_shape[-1]])
    # Multiplicative gate in (0, 1) from the first projection.
    input_mul = tf.layers.dense(input_reshaped, filters, name=name + "_mul")
    layer *= tf.nn.sigmoid(input_mul)
    # Additive term from a second, separately named projection.
    input_add = tf.layers.dense(input_reshaped, filters, name=name + "_add")
    layer += input_add
  else:
    raise ValueError("Unknown injection mode: %s" % mode)

  return layer
def bottom_part_tower(self, input_image, input_reward, action, latent,
                      lstm_state, lstm_size, conv_size, concat_latent=False):
  """The bottom part of predictive towers.

  With the current (early) design, the main prediction tower and the reward
  prediction tower share the same architecture. TF Scope can be adjusted as
  required to either share or not share the weights between the two towers.

  Args:
    input_image: the current image.
    input_reward: the current reward; skipped when None.
    action: the action taken by the agent; skipped when None.
    latent: the latent vector.
    lstm_state: the current internal states of conv lstms. Entries 0 through
      4 are overwritten in place with the updated states.
    lstm_size: the size of lstms.
    conv_size: the size of convolutions; only `conv_size[0]` is read here.
    concat_latent: whether or not to concatenate the latent at every step.

  Returns:
    - the output of the partial network.
    - intermediate outputs for skip connections.
  """

  def _with_latent(tensor):
    """Applies tile_and_concat with this call's latent settings."""
    return common_video.tile_and_concat(
        tensor, latent, concat_latent=concat_latent)

  def _lstm_step(index, tensor, name):
    """Runs conv LSTM `index` and stores its new state in `lstm_state`."""
    output, lstm_state[index] = common_video.conv_lstm_2d(
        tensor, lstm_state[index], lstm_size[index], name=name)
    return output

  def _strided_conv(tensor, name):
    """3x3 stride-2 ReLU conv that keeps the input's channel count."""
    return tfl.conv2d(
        tensor, tensor.get_shape()[3], [3, 3], strides=(2, 2),
        padding="SAME", activation=tf.nn.relu, name=name)

  # First scale: 5x5 stride-2 conv on the (optionally latent-augmented) image.
  even_image = common_layers.make_even_size(input_image)
  skip0 = tfl.conv2d(
      _with_latent(even_image), conv_size[0], [5, 5], strides=(2, 2),
      activation=tf.nn.relu, padding="SAME", name="scale1_conv1")
  skip0 = tfcl.layer_norm(skip0, scope="layer_norm1")

  net = _lstm_step(0, skip0, "state1")
  net = _with_latent(net)
  net = tfcl.layer_norm(net, scope="layer_norm2")
  net = _lstm_step(1, net, "state2")
  net = tfcl.layer_norm(net, scope="layer_norm3")
  net = common_layers.make_even_size(net)
  skip1 = _with_latent(_strided_conv(net, "conv2"))

  # Second scale.
  net = _lstm_step(2, skip1, "state3")
  net = _with_latent(net)
  net = tfcl.layer_norm(net, scope="layer_norm4")
  net = _lstm_step(3, net, "state4")
  net = _with_latent(net)
  net = tfcl.layer_norm(net, scope="layer_norm5")
  net = common_layers.make_even_size(net)
  # Channel count before the last downsampling; reused for the 1x1 conv below.
  pre_bottleneck_channels = net.get_shape()[3]
  bottleneck = _strided_conv(net, "conv3")

  # Pass in action if exists.
  if action is not None:
    action_emb = common_video.encode_to_shape(
        action, bottleneck.get_shape(), "action_enc")
    bottleneck = tf.concat(values=[bottleneck, action_emb], axis=3)

  # Pass in reward if exists.
  if input_reward is not None:
    reward_emb = common_video.encode_to_shape(
        input_reward, bottleneck.get_shape(), "reward_enc")
    bottleneck = tf.concat(values=[bottleneck, reward_emb], axis=3)

  # When the latent is not concatenated at every step, it is appended once
  # here instead.
  if latent is not None and not concat_latent:
    with tf.control_dependencies([latent]):
      bottleneck = tf.concat([bottleneck, latent], axis=3)

  # 1x1 conv back to the pre-bottleneck channel count.
  mixed = tfl.conv2d(
      bottleneck, pre_bottleneck_channels, [1, 1], strides=(1, 1),
      padding="SAME", activation=tf.nn.relu, name="conv4")

  top = _lstm_step(4, mixed, "state5")  # last 8x8
  top = tfcl.layer_norm(top, scope="layer_norm6")
  top = _with_latent(top)
  return top, (skip0, skip1)
def decode_bits(b):
  """Encodes `b` to `tower_output_shape` using the "bits_dec" scope.

  `tower_output_shape` is not a parameter; it is read from the enclosing or
  module scope at call time.
  """
  decoded = common_video.encode_to_shape(b, tower_output_shape, "bits_dec")
  return decoded