Example #1
def matrix_attention_layer(input_ts,
                           context_ts,
                           att_dim,
                           scope_name,
                           weights_only=False):
    output_dim = int(input_ts.shape[1])  # time_step, L
    input_dim = int(input_ts.shape[2])  # video_dims, k
    context_dim = int(context_ts.shape[1])  # question_dims, c
    with tf.variable_scope(scope_name):
        # Broadcast the context vector across every time step:
        # (batch_size, context_dim) -> (batch_size, time_step, context_dim)
        tiled_context = tf.tile(tf.expand_dims(context_ts, 1),
                                tf.stack([1, output_dim, 1]))
        w_c = tf.get_variable(
            'w_c',
            shape=[context_dim, att_dim],
            dtype=tf.float32,
            # alternative: tf.random_normal_initializer(stddev=0.003)
            initializer=tf.contrib.layers.xavier_initializer())
        w_i = tf.get_variable(
            'w_i',
            shape=[input_dim, att_dim],
            dtype=tf.float32,
            # alternative: tf.random_normal_initializer(stddev=0.001)
            initializer=tf.contrib.layers.xavier_initializer())
        b_i = tf.get_variable(
            'b_i',
            shape=[att_dim],
            dtype=tf.float32,
            # alternative: tf.random_normal_initializer(stddev=0.01)
            initializer=tf.contrib.layers.xavier_initializer())
        # attention_input: (batch_size, time_step, att_dim)
        attention_input = tf.tanh(
            utils.tensor_matmul(input_ts, w_i) +
            utils.tensor_matmul(tiled_context, w_c) + b_i)
        w_a = tf.get_variable(
            'w_a',
            shape=[att_dim, 1],
            dtype=tf.float32,
            # alternative: tf.random_normal_initializer(stddev=0.01)
            initializer=tf.contrib.layers.xavier_initializer())
        # attention_score: (batch_size, time_step)
        attention_score = tf.nn.softmax(
            tf.squeeze(utils.tensor_matmul(attention_input, w_a), axis=[2]))
        if weights_only:
            return attention_score
        # attention_output: (batch_size, input_dim), the attention-weighted
        # sum of input_ts over the time dimension.
        attention_output = tf.reduce_sum(
            tf.multiply(input_ts, tf.expand_dims(attention_score, 2)), 1)
        return attention_output, attention_score
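
Both examples call a helper, utils.tensor_matmul, that is not shown on this page. From the shape comments, it contracts the last axis of a 3-D tensor with a 2-D weight matrix. Below is a minimal sketch of such a helper, assuming TensorFlow 1.x and statically known time and feature dimensions; the reshape strategy is an assumption, not the repository's actual implementation.

import tensorflow as tf

def tensor_matmul(tensor_3d, matrix_2d):
    # Assumed helper: (batch, time, d_in) x (d_in, d_out) -> (batch, time, d_out).
    # Equivalent to tf.einsum('btd,de->bte', tensor_3d, matrix_2d).
    time_steps = int(tensor_3d.shape[1])
    d_in = int(tensor_3d.shape[2])
    d_out = int(matrix_2d.shape[1])
    flat = tf.reshape(tensor_3d, [-1, d_in])  # (batch * time, d_in)
    return tf.reshape(tf.matmul(flat, matrix_2d),
                      [-1, time_steps, d_out])  # (batch, time, d_out)
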
Example #2
def bilinear_attention_layer(input_ts, context_ts, att_dim, scope_name):
    output_dim = int(input_ts.shape[1])  # time_step, L
    input_dim = int(input_ts.shape[2])  # video_dims, k
    context_dim = int(context_ts.shape[1])  # question_dims, c
    with tf.variable_scope(scope_name):
        reshaped_context = tf.reshape(context_ts, [-1, context_dim, 1])
        p = tf.get_variable(
            'p',
            shape=[input_dim, context_dim],
            dtype=tf.float32,
            # alternative: tf.random_normal_initializer(stddev=0.003)
            initializer=tf.contrib.layers.xavier_initializer())
        # (batch_size, time_step, video_dims) x (video_dims, context_dim)
        #   x (batch_size, context_dim, 1) -> (batch_size, time_step, 1)
        attention_input = tf.matmul(utils.tensor_matmul(input_ts, p),
                                    reshaped_context)
        # attention_score: (batch_size, time_step)
        attention_score = tf.nn.softmax(tf.squeeze(attention_input, axis=[2]))
        # attention_output: (batch_size, input_dim), the attention-weighted
        # sum of input_ts over the time dimension.
        attention_output = tf.reduce_sum(
            tf.multiply(input_ts, tf.expand_dims(attention_score, 2)), 1)
        return attention_output
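
    # Note: the method below appears to be lifted from the model class in the
    # same source file; it keeps its original class-level indentation and is
    # not part of bilinear_attention_layer.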
    def generate_answer_on_testing(self):
        with tf.variable_scope("decoder"):
            answer_test = []
            decoder_state = self.decoder_cell.zero_state(
                self.batch_size, tf.float32)
            loss = 0.0

            with tf.variable_scope("lstm") as scope:
                for i in range(self.max_n_a_words):
                    scope.reuse_variables()
                    if i == 0:
                        current_emb = self.decoder_input
                    else:
                        next_word_vec = tf.nn.embedding_lookup(
                            self.Wemb, max_prob_word)
                        current_emb = tf.nn.xw_plus_b(next_word_vec,
                                                      self.word_to_lstm_w,
                                                      self.word_to_lstm_b)

                    # Tile the decoder state and the question encoding across
                    # the frame dimension so they can be added to the per-frame
                    # features inside the attention MLP.
                    tiled_decoder_state_h = tf.tile(
                        tf.expand_dims(decoder_state, 1),
                        tf.stack([1, self.input_n_frames - 1, 1]))
                    tiled_q_last_state = tf.tile(
                        tf.expand_dims(self.q_last_state, 1),
                        tf.stack([1, self.input_n_frames - 1, 1]))
                    attention_input = tf.tanh(
                        utils.tensor_matmul(self.v_first_lstm_output,
                                            self.attention_w_x) +
                        utils.tensor_matmul(tiled_q_last_state,
                                            self.attention_w_q) +
                        utils.tensor_matmul(tiled_decoder_state_h,
                                            self.attention_w_h) +
                        self.attention_b)
                    attention_score = tf.nn.softmax(
                        tf.squeeze(utils.tensor_matmul(attention_input,
                                                       self.attention_a),
                                   axis=[2]))
                    attention_output = tf.reduce_sum(
                        tf.multiply(self.v_first_lstm_output,
                                    tf.expand_dims(attention_score, 2)), 1)
                    attention_decoder = tf.matmul(attention_output,
                                                  self.attention_to_decoder)

                    # decoder : GRU with attention
                    decoder_input = tf.concat(
                        [decoder_state, attention_decoder, current_emb],
                        axis=1)
                    decoder_r_t = tf.nn.sigmoid(
                        tf.matmul(decoder_input, self.decoder_r))
                    decoder_z_t = tf.nn.sigmoid(
                        tf.matmul(decoder_input, self.decoder_z))
                    decoder_middle = tf.concat([
                        tf.multiply(decoder_r_t, decoder_state),
                        tf.multiply(decoder_r_t, attention_decoder),
                        current_emb
                    ], axis=1)
                    decoder_state_ = tf.tanh(
                        tf.matmul(decoder_middle, self.decoder_w))
                    decoder_state = tf.multiply(
                        (1 - decoder_z_t), decoder_state) + tf.multiply(
                            decoder_z_t, decoder_state_)

                    output = decoder_state

                    # Ground truth: one-hot encode the i-th target word for
                    # every sequence in the batch.
                    labels = tf.expand_dims(self.y[:, i], 1)
                    indices = tf.expand_dims(tf.range(0, self.batch_size, 1),
                                             1)
                    concated = tf.concat([indices, labels], 1)
                    onehot_labels = tf.sparse_to_dense(
                        concated, tf.stack([self.batch_size, self.n_words]),
                        1.0, 0.0)

                    logit_words = tf.nn.xw_plus_b(output, self.embed_word_W,
                                                  self.embed_word_b)
                    max_prob_word = tf.argmax(logit_words, 1)
                    answer_test.append(max_prob_word)

                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                        labels=onehot_labels, logits=logit_words)
                    # cross_entropy = cross_entropy * self.reward
                    cross_entropy = cross_entropy * self.y_mask[:, i]
                    current_loss = tf.reduce_sum(cross_entropy)
                    loss = loss + current_loss

            loss = loss / tf.reduce_sum(self.y_mask[:, 1:])
            return answer_test, loss
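
For orientation, here is a usage sketch of the two attention layers above. The placeholder shapes and scope names are illustrative assumptions, not values taken from the original model; note that bilinear_attention_layer accepts att_dim only for interface symmetry, since its bilinear weight p is sized by the input and context dimensions.

import tensorflow as tf

# Hypothetical shapes: a batch of videos with 20 frames of 512-dim features
# and a 300-dim question encoding.
video_feats = tf.placeholder(tf.float32, [None, 20, 512])  # (batch, time_step, video_dims)
question_vec = tf.placeholder(tf.float32, [None, 300])     # (batch, question_dims)

# Additive (tanh) attention over the frames, conditioned on the question.
# att_out: (batch, 512); att_weights: (batch, 20).
att_out, att_weights = matrix_attention_layer(video_feats, question_vec,
                                              att_dim=256,
                                              scope_name='frame_attention')

# Bilinear attention over the same frames; returns only the pooled features.
bilinear_out = bilinear_attention_layer(video_feats, question_vec,
                                        att_dim=256,
                                        scope_name='bilinear_attention')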