def __call__(self, inputs, state):
    # Advance the language LSTM on the current word embedding.
    l_outputs, l_next_state = self.language_lstm(inputs, state)
    # Flatten the [batch, height, width, depth] feature map to
    # [batch, height * width, depth] so attention runs over one axis.
    image_height = tf.shape(self.spatial_image_features)[1]
    image_width = tf.shape(self.spatial_image_features)[2]
    image_features = collapse_dims(self.spatial_image_features, [1, 2])
    # Score every spatial location against the LSTM output and pool.
    attn_inputs = tf.concat([
        image_features,
        tile_with_new_axis(l_outputs, [image_height * image_width], [1])
    ], 2)
    attended_features = tf.reduce_sum(
        image_features * self.attn_layer(attn_inputs), [1])
    return tf.concat([attended_features, l_outputs], 1), l_next_state
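# `tile_with_new_axis` and `collapse_dims` are small tensor utilities that
# are not defined in these snippets. The sketches below are plausible
# stand-ins inferred from the call sites; the originals may differ.

def tile_with_new_axis(tensor, repeats, axes):
    # Insert a new size-1 axis at each position in `axes`, then tile it
    # `repeats[i]` times, e.g. [batch, depth] -> [batch, n, depth].
    for n, axis in zip(repeats, axes):
        tensor = tf.expand_dims(tensor, axis)
        rank = tf.rank(tensor)
        multiples = tf.concat([
            tf.ones([axis], tf.int32), [n],
            tf.ones([rank - axis - 1], tf.int32)], 0)
        tensor = tf.tile(tensor, multiples)
    return tensor

def collapse_dims(tensor, axes):
    # Merge a contiguous, ascending run of axes into one, e.g.
    # [batch, height, width, depth] -> [batch, height * width, depth].
    shape = tf.shape(tensor)
    merged = tf.reduce_prod(shape[axes[0]:axes[-1] + 1])
    new_shape = tf.concat([shape[:axes[0]], [merged],
                           shape[axes[-1] + 1:]], 0)
    return tf.reshape(tensor, new_shape)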
def __call__(self, inputs, state):
    # Visual LSTM: sees the previous language state, the mean-pooled
    # image features, and the current word embedding.
    v_inputs = tf.concat([
        tf.concat(state.language, 1), self.mean_image_features, inputs], 1)
    v_outputs, v_next_state = self.visual_lstm(v_inputs, state.visual)
    # Attend over the detected object regions using the visual output.
    attention_inputs = tf.concat([
        self.mean_object_features,
        tile_with_new_axis(v_outputs,
                           [tf.shape(self.mean_object_features)[1]], [1])
    ], 2)
    attended_features = tf.reduce_sum(
        self.mean_object_features * self.attention_layer(attention_inputs), 1)
    # Language LSTM: consumes the visual output plus the attended features.
    l_inputs = tf.concat([v_outputs, attended_features], 1)
    l_outputs, l_next_state = self.language_lstm(l_inputs, state.language)
    return l_outputs, UpDownStateTuple(v_next_state, l_next_state)
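# `UpDownStateTuple` is also not defined here; given the accesses to
# `state.visual` and `state.language`, a two-field namedtuple over the
# visual and language LSTM states is a plausible stand-in.
import collections

UpDownStateTuple = collections.namedtuple(
    "UpDownStateTuple", ["visual", "language"])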
def __call__(self, inputs, state):
    # Score every spatial location against the full previous LSTM state.
    spatial_size = tf.shape(self.spatial_image_features)[1]
    attention_inputs = tf.concat([
        self.spatial_image_features,
        tile_with_new_axis(tf.concat(state, 1), [spatial_size], [1])
    ], 2)
    attended_features = tf.reduce_sum(
        self.spatial_image_features *
        self.attention_layer(attention_inputs), [1])
    # Feed the attended features and the word embedding to the language LSTM.
    l_inputs = tf.concat([attended_features, inputs], 1)
    l_outputs, l_next_state = self.language_lstm(l_inputs, state)
    return tf.concat([l_outputs, attended_features], 1), l_next_state
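# Each `__call__` above follows the `tf.nn.rnn_cell.RNNCell` contract
# (inputs and state in, outputs and next state out), so a cell like this
# can be unrolled with `tf.nn.dynamic_rnn`. A hypothetical sketch: the
# class name `SpatialAttentionCell` and both input tensors are
# illustrative, and the real class must define the usual `state_size`
# and `output_size` properties.
cell = SpatialAttentionCell(spatial_image_features=spatial_features,
                            num_units=512)
outputs, final_state = tf.nn.dynamic_rnn(
    cell,
    word_embeddings,  # [batch, time, embedding_depth]
    dtype=tf.float32)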
def __call__(self, inputs, state):
    # Language LSTM on the mean-pooled image features plus the word embedding.
    l_inputs = tf.concat([tf.reduce_mean(self.spatial_image_features, [1]),
                          inputs], 1)
    l_outputs, l_next_state = self.language_lstm(l_inputs, state)
    # Visual sentinel: a gated projection of the new memory cell that lets
    # the model attend to "nothing visual" when generating function words.
    sentinel_embeddings = self.sentinel_embeddings_layer(
        tf.nn.tanh(l_next_state.c) *
        self.sentinel_gate_layer(tf.concat([state.h, inputs], 1)))
    # Append the sentinel as one extra region before computing attention.
    spatial_size = tf.shape(self.spatial_image_features)[1]
    sentinel_image_features = tf.concat([
        self.spatial_image_features,
        tf.expand_dims(sentinel_embeddings, 1)], 1)
    attention_inputs = tf.nn.tanh(tf.concat([
        sentinel_image_features,
        tile_with_new_axis(l_outputs, [spatial_size + 1], [1])
    ], 2))
    attended_features = tf.reduce_sum(
        sentinel_image_features * self.attention_layer(attention_inputs), [1])
    return tf.concat([attended_features, l_outputs], 1), l_next_state
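# This cell mirrors the adaptive-attention ("visual sentinel") formulation
# of Lu et al., "Knowing When to Look" (2017). This is a reading of the
# code above, not a statement from the source:
#
#     g_t = sigma(W_g [h_{t-1}; x_t])                (sentinel gate)
#     s_t = g_t * tanh(c_t)                          (visual sentinel)
#     c_hat_t = sum_{i=1..k+1} alpha_{t,i} v~_i,     V~ = [v_1, ..., v_k, s_t]
#
# where the (k+1)-th candidate is the sentinel appended to the k image
# regions, so the attention itself decides when to look at the image.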
def __call__(self,
             image_features,
             caption_ids,
             previous_ids,
             indicators=None,
             pointer_ids=None):

    if indicators is None:
        indicators = tf.ones(tf.shape(caption_ids))

    # Bidirectional RNN sequence encoder over the partial caption.
    caption_embeddings = tf.nn.embedding_lookup(self.word_embeddings,
                                                caption_ids)
    lengths = tf.cast(tf.reduce_sum(indicators, axis=1), tf.int32)
    outputs_tuple, _ = tf.nn.bidirectional_dynamic_rnn(
        self.fw_cell,
        self.bw_cell,
        caption_embeddings,
        sequence_length=lengths,
        dtype=tf.float32)
    outputs = tf.concat(outputs_tuple, 2)
    # Each insertion slot pairs the encodings on either side of a boundary.
    slots = tf.concat([outputs[:, :-1, :], outputs[:, 1:, :]], 2)

    # Pointer network: score every slot against the previously
    # inserted word.
    previous_embeddings = tf.nn.embedding_lookup(self.word_embeddings,
                                                 previous_ids)
    num_slots = tf.shape(slots)[1]
    pointer_inputs = tf.concat(
        [slots,
         tile_with_new_axis(previous_embeddings, [num_slots], [1])], 2)
    # Squeeze only the trailing unit axis so a batch of one is not collapsed.
    pointer_logits = tf.squeeze(self.pointer_layer(pointer_inputs), 2)
    # Zero out slots that border padding on either side.
    pointer_logits = pointer_logits * indicators[:, :-1] * indicators[:, 1:]

    if pointer_ids is None:
        pointer_ids = tf.argmax(pointer_logits,
                                axis=1,
                                output_type=tf.int32)

    # Word prediction conditioned on the chosen slot and the image.
    batch_size = tf.shape(slots)[0]
    expansion_slots = tf.gather_nd(
        slots, tf.stack([tf.range(batch_size), pointer_ids], axis=1))
    word_inputs = tf.concat([expansion_slots, image_features], 1)
    word_logits = self.logits_layer(word_inputs)

    return pointer_logits, word_logits
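# The `tf.gather_nd` call above picks, for each batch element, the slot
# the pointer selected. A minimal, self-contained example of the pattern:
import tensorflow as tf

slots = tf.constant([[[1., 1.], [2., 2.], [3., 3.]],
                     [[4., 4.], [5., 5.], [6., 6.]]])   # [batch=2, slots=3, 2]
pointer_ids = tf.constant([2, 0])                       # chosen slot per example
indices = tf.stack([tf.range(2), pointer_ids], axis=1)  # [[0, 2], [1, 0]]
picked = tf.gather_nd(slots, indices)                   # [[3., 3.], [4., 4.]]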
def __call__(self, inputs, state):
    l_outputs, l_next_state = self.language_lstm(inputs, state)
    # Visual sentinel, as above, gating the new memory cell.
    sentinel_embeddings = self.sentinel_embeddings_layer(
        tf.nn.tanh(l_next_state.c) *
        self.sentinel_gate_layer(tf.concat([state.h, inputs], 1)))
    # Flatten the 2-D feature map, then append the sentinel as one
    # extra location before attending.
    image_height = tf.shape(self.spatial_image_features)[1]
    image_width = tf.shape(self.spatial_image_features)[2]
    image_features = collapse_dims(self.spatial_image_features, [1, 2])
    sentinel_image_features = tf.concat(
        [image_features,
         tf.expand_dims(sentinel_embeddings, 1)], 1)
    attn_inputs = tf.nn.tanh(
        tf.concat([
            sentinel_image_features,
            tile_with_new_axis(l_outputs, [image_height * image_width + 1],
                               [1])
        ], 2))
    attended_sif = tf.reduce_sum(
        sentinel_image_features * self.attn_layer(attn_inputs), [1])
    return tf.concat([attended_sif, l_outputs], 1), l_next_state
def __call__(self, inputs, state):
    # Region LSTM: sees the previous language and attribute states, the
    # mean-pooled image features, and the current word embedding.
    region_inputs = tf.concat([
        tf.concat(state.language, 1),
        tf.concat(state.attribute, 1), self.mean_image_features, inputs
    ], 1)
    region_outputs, region_next_state = self.region_lstm(
        region_inputs, state.region)
    # Gated sentinel lets the pointer choose "no region".
    region_sentinel = self.region_sentinel_embeddings_layer(
        tf.nn.tanh(region_next_state.c) * self.region_sentinel_gate_layer(
            tf.concat([state.region.h, inputs], 1)))
    # Candidate regions: detected objects, the whole image, and the sentinel.
    region_features = tf.concat([
        self.mean_object_features,
        tf.expand_dims(self.mean_image_features, 1),
        tf.expand_dims(region_sentinel, 1)
    ], 1)
    region_attention_inputs = tf.concat([
        region_features,
        tile_with_new_axis(region_outputs, [tf.shape(region_features)[1]],
                           [1])
    ], 2)
    region_attention_mask = self.region_attention_layer(
        region_attention_inputs)
    # Hard pointer: the index of the most-attended region.
    region_pointer_ids = tf.argmax(tf.squeeze(region_attention_mask, 2),
                                   1,
                                   output_type=tf.int32)
    attended_region_features = tf.reduce_sum(
        region_features * region_attention_mask, 1)
    # Attribute LSTM: conditioned on the region output, the previous
    # language state, and the attended region features.
    attribute_inputs = tf.concat([
        region_outputs,
        tf.concat(state.language, 1), attended_region_features
    ], 1)
    attribute_outputs, attribute_next_state = self.attribute_lstm(
        attribute_inputs, state.attribute)
    attribute_sentinel = self.attribute_sentinel_embeddings_layer(
        tf.nn.tanh(attribute_next_state.c) *
        self.attribute_sentinel_gate_layer(
            tf.concat([state.attribute.h, inputs], 1)))
    # Attribute candidates per region, with the sentinel tiled to match.
    all_attribute_features = tf.concat([
        self.object_attribute_features,
        tf.expand_dims(self.image_attribute_features, 1),
        tf.expand_dims(
            tile_with_new_axis(
                attribute_sentinel,
                [tf.shape(self.image_attribute_features)[1]], [1]), 1)
    ], 1)
    # Select the attribute set belonging to the pointed-at region.
    attribute_features = tf.gather_nd(
        all_attribute_features,
        tf.concat([
            tf.expand_dims(tf.range(tf.shape(region_pointer_ids)[0]), 1),
            tf.expand_dims(region_pointer_ids, 1)
        ], 1))
    attribute_attention_inputs = tf.concat([
        attribute_features,
        tile_with_new_axis(attribute_outputs,
                           [tf.shape(attribute_features)[1]], [1])
    ], 2)
    attribute_attention_mask = self.attribute_attention_layer(
        attribute_attention_inputs)
    attended_attribute_features = tf.reduce_sum(
        attribute_features * attribute_attention_mask, 1)
    # Language LSTM: produces the output word representation.
    language_inputs = tf.concat(
        [attribute_outputs, attended_attribute_features], 1)
    language_outputs, language_next_state = self.language_lstm(
        language_inputs, state.language)
    return language_outputs, GroundedAttributeStateTuple(
        region_next_state, attribute_next_state, language_next_state)
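# `GroundedAttributeStateTuple` is not shown either; from the accesses to
# `state.region`, `state.attribute`, and `state.language`, a three-field
# namedtuple over the three LSTM states is a plausible stand-in.
import collections

GroundedAttributeStateTuple = collections.namedtuple(
    "GroundedAttributeStateTuple", ["region", "attribute", "language"])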