Example #1
def vgg_fc7(input_batch, name, apply_dropout, reuse=None):
    pool5 = vgg_pool5(input_batch, name, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # layer 6
        fc6 = fc_relu('fc6', pool5, output_dim=4096)
        if apply_dropout: fc6 = drop(fc6, 0.5)
        # layer 7
        fc7 = fc_relu('fc7', fc6, output_dim=4096)
        if apply_dropout: fc7 = drop(fc7, 0.5)
        return fc7
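A minimal usage sketch (assuming TensorFlow 1.x variable-scope semantics and a 224x224 RGB input; vgg_pool5, fc_relu and drop come from the surrounding repo): passing reuse=True on a second call shares the fc6/fc7 weights created by the first.

import tensorflow as tf

train_images = tf.placeholder(tf.float32, [None, 224, 224, 3])
test_images = tf.placeholder(tf.float32, [None, 224, 224, 3])
feat_train = vgg_fc7(train_images, 'vgg', apply_dropout=True)             # creates fc6/fc7 variables
feat_test = vgg_fc7(test_images, 'vgg', apply_dropout=False, reuse=True)  # reuses the same variables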
Example #2
def vgg_fc7(input_batch, name, apply_dropout):
    pool5 = vgg_pool5(input_batch, name)
    with tf.variable_scope(name):
        # layer 6
        fc6 = fc_relu('fc6', pool5, output_dim=4096)
        if apply_dropout: fc6 = drop(fc6, 0.5)
        # layer 7
        fc7 = fc_relu('fc7', fc6, output_dim=4096)
        if apply_dropout: fc7 = drop(fc7, 0.5)
        return fc7
Example #3
def vgg_fc7_full_conv(input_batch, name, apply_dropout, reuse=None):
    pool5 = vgg_pool5(input_batch, name, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # layer 6
        fc6 = conv_relu('fc6', pool5, kernel_size=7, stride=1, output_dim=4096)
        if apply_dropout: fc6 = drop(fc6, 0.5)
        # layer 7
        fc7 = conv_relu('fc7', fc6, kernel_size=1, stride=1, output_dim=4096)
        if apply_dropout: fc7 = drop(fc7, 0.5)
        return fc7
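Because fc6 becomes a 7x7 convolution and fc7 a 1x1 convolution, this head slides over inputs larger than the original crop. A hedged sketch (the input size and the padding behavior of conv_relu are assumptions):

import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 512, 512, 3])
fc7_map = vgg_fc7_full_conv(images, 'vgg', apply_dropout=False)
# fc7_map is a spatial grid of 4096-d features, [N, H', W', 4096],
# rather than a single [N, 4096] vector per image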
Example #4
def deeplab_fc8(input_batch, name, apply_dropout=False):
    pool5a = deeplab_pool5(input_batch, name)
    with tf.variable_scope(name):
        fc6 = fc_relu('fc6', pool5a, output_dim=1024)
        if apply_dropout: fc6 = drop(fc6, 0.5)

        fc7 = fc_relu('fc7', fc6, output_dim=1024)
        if apply_dropout: fc7 = drop(fc7, 0.5)
        fc8 = fc('fc8', fc7, output_dim=1000)
        return fc8
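A hedged usage sketch (the input size is an assumption; deeplab_pool5 and the fc/fc_relu/drop helpers come from the surrounding repo):

import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits = deeplab_fc8(images, 'deeplab')  # [N, 1000] class scores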
Example #5
def vgg_fc7_full_conv(input_batch, name, apply_dropout):
    pool5 = vgg_pool5(input_batch, name)
    with tf.variable_scope(name):
        # layer 6
        fc6 = conv_relu('fc6', pool5, kernel_size=7, stride=1, output_dim=4096)
        if apply_dropout: fc6 = drop(fc6, 0.5)
        # layer 7
        fc7 = conv_relu('fc7', fc6, kernel_size=1, stride=1, output_dim=4096)
        if apply_dropout: fc7 = drop(fc7, 0.5)
        return fc7
Example #6
def vgg_roi_fc7_from_conv5(conv5, roi_batch, name, apply_dropout, reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        # ROI Pooling
        roi_pool5, _ = roi_pool(conv5,
                                roi_batch,
                                pooled_height=7,
                                pooled_width=7,
                                spatial_scale=1. / 16,
                                name='roi_pool5')
        # layer 6
        fc6 = fc_relu('fc6', roi_pool5, output_dim=4096)
        if apply_dropout: fc6 = drop(fc6, 0.5)
        # layer 7
        fc7 = fc_relu('fc7', fc6, output_dim=4096)
        if apply_dropout: fc7 = drop(fc7, 0.5)
    return fc7
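A hedged usage sketch: spatial_scale=1/16 implies conv5 is a stride-16 feature map, and the ROI row format ([batch_index, x1, y1, x2, y2] in image coordinates, as in Fast R-CNN) is an assumption about this repo's roi_pool.

import tensorflow as tf

conv5 = tf.placeholder(tf.float32, [None, 32, 32, 512])  # stride-16 VGG conv5 features
rois = tf.placeholder(tf.float32, [None, 5])             # assumed [batch_index, x1, y1, x2, y2]
roi_fc7 = vgg_roi_fc7_from_conv5(conv5, rois, 'vgg', apply_dropout=False)  # [num_rois, 4096]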
Example #7
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, deeplab_dropout,
                       mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    # Local image feature
    feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch,
                                             'deeplab',
                                             apply_dropout=deeplab_dropout)
    input_dim = 1
    for d in feat_vis.get_shape().as_list()[1:]:
        input_dim *= d
    feat_vis_flatten = tf.reshape(feat_vis, [-1, input_dim])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1,
                         values=[
                             tf.nn.l2_normalize(feat_lang, 1),
                             tf.nn.l2_normalize(feat_vis_flatten, 1),
                             spatial_batch
                         ])

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
Example #8
def my_fc_layer(input_batch, name, output_dim, apply_dropout=False):
    with tf.variable_scope(name):
        print("input_batch: ", input_batch)
        fc7 = fc('fc', input_batch, output_dim=output_dim)
        print("fc7: ", fc7)
    if apply_dropout: fc7 = drop(fc7, 0.5)
    return fc7
Example #9
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
    lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    feat_vis = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local',
        apply_dropout=vgg_dropout)

    # Reshape and tile LSTM top
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
        [1, featmap_H, featmap_W, 1])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat([tf.nn.l2_normalize(feat_lang, 3),
                          tf.nn.l2_normalize(feat_vis, 3),
                          spatial_batch], axis=3)

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
            output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l2
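The tile-and-concat pattern above attaches the sentence vector to every spatial position of the visual feature map before the 1x1-conv classifier. A minimal NumPy sketch of the shape logic (all dimensions are placeholders):

import numpy as np

N, H, W, D_text, D_vis = 2, 16, 16, 1000, 1000
feat_lang = np.random.randn(N, D_text)
feat_vis = np.random.randn(N, H, W, D_vis)

feat_lang_tiled = np.tile(feat_lang.reshape(N, 1, 1, D_text), (1, H, W, 1))
feat_all = np.concatenate([feat_lang_tiled, feat_vis], axis=3)
assert feat_all.shape == (N, H, W, D_text + D_vis)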
Example #10
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout,
                       mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]

    # Local image feature
    feat_vis = vgg_net.vgg_fc8(imcrop_batch,
                               'vgg_local',
                               apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1,
                         values=[
                             tf.nn.l2_normalize(feat_lang, 1),
                             tf.nn.l2_normalize(feat_vis, 1), spatial_batch
                         ])

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
Example #11
def attbilstm(text_seq_batch, name, num_vocab, embed_dim, lstm_dim,
    apply_dropout, reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        T = tf.shape(text_seq_batch)[0]
        N = tf.shape(text_seq_batch)[1]

        # 0. Word embedding
        embedding_mat = tf.get_variable("embedding_mat", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # 1. Encode the sentence into a vector representation, using the final
        # hidden states in a two-layer bidirectional LSTM network
        seq_length = tf.ones(to_T([N]), dtype=tf.int32)*T
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        outputs1_raw, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            embedded_seq, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm1")
        outputs1 = tf.concat(outputs1_raw, axis=2)
        outputs2_raw, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            outputs1, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm2")
        outputs2 = tf.concat(outputs2_raw, axis=2)
        # q_reshape has shape [T, N, lstm_dim*4]
        q_reshape = tf.concat([outputs1, outputs2], axis=2)
        if apply_dropout:
            q_reshape = drop(q_reshape, 0.5)

        # 2. Three attention units over the words in each sentence
        with tf.variable_scope("attention"):
            q_reshape_flat = tf.reshape(q_reshape, to_T([T*N, lstm_dim*4]))

            score_shape = to_T([T, N, 1])
            scores_obj1 = tf.reshape(fc('fc_scores_obj1', q_reshape_flat, output_dim=1), score_shape)
            scores_obj2 = tf.reshape(fc('fc_scores_obj2', q_reshape_flat, output_dim=1), score_shape)
            scores_rel = tf.reshape(fc('fc_scores_rel', q_reshape_flat, output_dim=1), score_shape)

            # 2.4 Compute probability and average BoW representation
            # probs_obj1, probs_obj2 and probs_rel have shape [T, N, 1]
            # Remove the probability over <pad> (<pad> is 0)
            is_not_pad = tf.cast(tf.not_equal(text_seq_batch, 0)[..., tf.newaxis], tf.float32)
            probs_obj1 = tf.nn.softmax(scores_obj1, dim=0)*is_not_pad
            probs_obj2 = tf.nn.softmax(scores_obj2, dim=0)*is_not_pad
            probs_rel = tf.nn.softmax(scores_rel, dim=0)*is_not_pad
            probs_obj1 = probs_obj1 / tf.reduce_sum(probs_obj1, 0, keep_dims=True)
            probs_obj2 = probs_obj2 / tf.reduce_sum(probs_obj2, 0, keep_dims=True)
            probs_rel = probs_rel / tf.reduce_sum(probs_rel, 0, keep_dims=True)

            tf.add_to_collection("attention_probs", (probs_obj1, probs_obj2, probs_rel))

            # BoW_obj1, BoW_obj2 and BoW_rel have shape [N, embed_dim]
            BoW_obj1 = tf.reduce_sum(probs_obj1*embedded_seq, reduction_indices=0)
            BoW_obj2 = tf.reduce_sum(probs_obj2*embedded_seq, reduction_indices=0)
            BoW_rel = tf.reduce_sum(probs_rel*embedded_seq, reduction_indices=0)
            BoW_obj1.set_shape([None, embed_dim])
            BoW_obj2.set_shape([None, embed_dim])
            BoW_rel.set_shape([None, embed_dim])

    return (BoW_obj1, BoW_obj2, BoW_rel)
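The attention above is a softmax over the time axis with the <pad> positions zeroed out and the remaining mass renormalized. A minimal NumPy sketch of that masked softmax (shapes and values are placeholders):

import numpy as np

scores = np.random.randn(5, 1, 1)  # [T, N, 1] attention scores
is_not_pad = np.array([1, 1, 1, 0, 0], np.float32).reshape(5, 1, 1)

probs = np.exp(scores - scores.max(axis=0, keepdims=True))
probs = probs / probs.sum(axis=0, keepdims=True)  # softmax over T
probs = probs * is_not_pad                        # remove <pad> mass
probs = probs / probs.sum(axis=0, keepdims=True)  # renormalize
assert np.allclose(probs.sum(axis=0), 1.0)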
Example #12
    def build_text_feature(self):
        """Generate text feature using bidirectional LSTM
        
        Outputs:
            self.text_bilstm_feat
            self.text_word_embed_feat
            self.word_is_not_pad
        """
        num_vocab = self.config.num_vocab
        embed_dim = self.config.embed_dim
        lstm_dim = self.config.lstm_dim

        text_seq = self.text_seqs

        with tf.variable_scope('lstm'):
            L = tf.shape(text_seq)[0]  #seq length
            N1 = tf.shape(text_seq)[1]  #batch size

            # Word embedding
            embedding_mat = tf.get_variable(name="embedding_mat",
                                            shape=[num_vocab, embed_dim])
            text_word_embed_feat = tf.nn.embedding_lookup(
                embedding_mat, text_seq)  # [L, N1, embed_dim]

            # Encode the sentence into a vector representation, using the final
            # hidden states in a two-layer bidirectional LSTM network
            seq_length = tf.ones(to_T([N1]), dtype=tf.int32) * L
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim,
                                                     state_is_tuple=True)
            outputs1_raw, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=lstm_cell,
                cell_bw=lstm_cell,
                inputs=text_word_embed_feat,
                sequence_length=seq_length,
                dtype=tf.float32,
                time_major=True,
                scope="bidirectional_lstm1")
            outputs1 = tf.concat(outputs1_raw, axis=2)
            lstm_cell2 = tf.contrib.rnn.BasicLSTMCell(lstm_dim,
                                                      state_is_tuple=True)
            outputs2_raw, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=lstm_cell2,
                cell_bw=lstm_cell2,
                inputs=outputs1,
                sequence_length=seq_length,
                dtype=tf.float32,
                time_major=True,
                scope="bidirectional_lstm2")
            outputs2 = tf.concat(outputs2_raw, axis=2)
            text_bilstm_feat = tf.concat([outputs1, outputs2], axis=2)
            if self.config.lstm_dropout:
                text_bilstm_feat = drop(text_bilstm_feat, 0.5)

            self.text_bilstm_feat = text_bilstm_feat
            self.text_word_embed_feat = text_word_embed_feat
            self.word_is_not_pad = tf.cast(
                tf.not_equal(text_seq, 0)[..., tf.newaxis], tf.float32)
Example #13
    def forward(self, imcrop_batch, text_seq_batch, is_training=True):
        num_vocab, embed_dim, lstm_dim, mlp_hidden_dims = self.num_vocab, self.embed_dim, self.lstm_dim, self.mlp_hidden_dims
        deeplab_dropout = self.kwargs.get('deeplab_dropout', False)
        mlp_dropout = self.kwargs.get('mlp_dropout', False)

        with tf.variable_scope(self.model_name):
            # Language feature (LSTM hidden state)
            feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                          lstm_dim)[0]

            # Local image feature
            feat_vis = deeplab.deeplab_fc8_full_conv(
                imcrop_batch, 'deeplab', apply_dropout=deeplab_dropout)

            # Reshape and tile LSTM top
            featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
            N, D_text = feat_lang.get_shape().as_list()
            feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                                [1, featmap_H, featmap_W, 1])

            # L2-normalize the features (except for spatial_batch)
            # and concatenate them along axis 3 (channel dimension)
            spatial_batch = tf.convert_to_tensor(
                generate_spatial_batch(N, featmap_H, featmap_W))
            feat_all = tf.concat(axis=3,
                                 values=[
                                     tf.nn.l2_normalize(feat_lang, 3),
                                     tf.nn.l2_normalize(feat_vis, 3),
                                     spatial_batch
                                 ])

            # MLP classifier over the concatenated features
            with tf.variable_scope('classifier'):
                mlp_l1 = conv_relu('mlp_l1',
                                   feat_all,
                                   kernel_size=1,
                                   stride=1,
                                   output_dim=mlp_hidden_dims)
                if mlp_dropout:
                    mlp_l1 = drop(mlp_l1, 0.5)
                mlp_l2 = conv('mlp_l2',
                              mlp_l1,
                              kernel_size=1,
                              stride=1,
                              output_dim=1)

                upsample8s = deconv('upsample8s',
                                    mlp_l2,
                                    kernel_size=16,
                                    stride=8,
                                    output_dim=1,
                                    bias_term=False)

        return upsample8s
Example #14
def vs_multilayer(input_batch,
                  name,
                  middle_layer_dim=1000,
                  reuse=False,
                  test=False):
    with tf.variable_scope(name):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if test:
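            # keep probability 1 makes dropout a no-op at test time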
            layer1 = drop(layer1, 1)
        else:
            layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=4)
    return outputs
Example #15
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=4)
    return outputs
Example #16
def question_prior_net(encoder_states,
                       num_choices,
                       qpn_dropout,
                       hidden_dim=500,
                       scope='question_prior_net',
                       reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # concate the LSTM states from all layers
        assert (isinstance(encoder_states, tuple))
        h_list = []
        for s in encoder_states:
            assert (isinstance(s, tf.contrib.rnn.LSTMStateTuple))
            h_list.append(s.h)
        # h_concat has shape [N, D_lstm1 + ... + D_lstm_n]
        h_concat = tf.concat(h_list, axis=1)

        if qpn_dropout:
            h_concat = drop(h_concat, 0.5)
        fc1 = fc_relu('fc1', h_concat, output_dim=hidden_dim)
        if qpn_dropout:
            fc1 = drop(fc1, 0.5)
        fc2 = fc('fc2', fc1, output_dim=num_choices)
        return fc2
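question_prior_net expects a tuple of LSTMStateTuples, one per layer. A hedged TF 1.x sketch of wiring it up (the layer sizes and num_choices are placeholders):

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 20, 300])  # [N, T, D], batch-major
cells = [tf.contrib.rnn.BasicLSTMCell(512) for _ in range(2)]
multi_cell = tf.contrib.rnn.MultiRNNCell(cells)
_, encoder_states = tf.nn.dynamic_rnn(multi_cell, inputs, dtype=tf.float32)
logits = question_prior_net(encoder_states, num_choices=4, qpn_dropout=False)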
Example #17
def vs_multilayer(input_batch,
                  name,
                  middle_layer_dim=1000,
                  output_layer_dim=21 * 3,
                  dropout=True,
                  reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if dropout:
            layer1 = drop(layer1, 0.5)
        sim_score = fc('layer2', layer1, output_dim=output_layer_dim)
    return sim_score
Example #18
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, deeplab_dropout,
                          mlp_dropout, is_training):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]

    #deeplab101
    net = deeplab101.DeepLabResNetModel({'data': imcrop_batch},
                                        is_training=is_training)
    feat_vis = net.layers['fc1_voc12']

    # # Local image feature
    # feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
    #     apply_dropout=deeplab_dropout)

    # Reshape and tile LSTM top
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                        [1, featmap_H, featmap_W, 1])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat(axis=3,
                         values=[
                             tf.nn.l2_normalize(feat_lang, 3),
                             tf.nn.l2_normalize(feat_vis, 3), spatial_batch
                         ])

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1',
                           feat_all,
                           kernel_size=1,
                           stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l2
Example #19
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
    embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    feat_vis = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat([tf.nn.l2_normalize(feat_lang, 1),
                          tf.nn.l2_normalize(feat_vis, 1),
                          spatial_batch], axis=1)

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
Example #20
def vs_multilayer(input_batch,
                  name,
                  middle_layer_dim=1000,
                  class_num=20,
                  dropout=False,
                  reuse=False):
    """This function is inherited from CBR project(https://github.com/jiyanggao/CBR)
    """
    print('--I am using vs_multilayer--')

    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if dropout:
            layer1 = drop(layer1, 0.5)
        sim_score = fc('layer2', layer1, output_dim=(class_num + 1) * 3)
    return sim_score
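A hedged usage sketch: the flat output packs 3 values for each of the class_num + 1 categories, so it can be viewed as [N, class_num + 1, 3] (the grouping order is an assumption carried over from the CBR layout):

import tensorflow as tf

feats = tf.placeholder(tf.float32, [None, 4096])
scores = vs_multilayer(feats, 'cbr', class_num=20, dropout=True)  # [N, 63]
scores_per_class = tf.reshape(scores, [-1, 21, 3])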
Example #21
def attbilstm_simple(text_seq_batch, name, num_vocab, embed_dim,
                     lstm_dim, apply_dropout, reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        T = tf.shape(text_seq_batch)[0]
        N = tf.shape(text_seq_batch)[1]

        # 0. Word embedding
        embedding_mat = tf.get_variable("embedding_mat", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # 1. Encode the sentence into a vector representation, using the final
        # hidden states in a bidirectional LSTM network
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        seq_length = tf.ones(to_T([N]), dtype=tf.int32)*T
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            embedded_seq, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_stm")
        q_reshape = tf.concat(outputs, axis=2)
        if apply_dropout:
            q_reshape = drop(q_reshape, 0.5)

        # 2. Three attention units over the words in each sentence
        with tf.variable_scope("attention"):
            # 2.1 Map the word embedding vectors to the same dimension as q
            embedded_seq_reshape = tf.reshape(embedded_seq, [-1, embed_dim])
            word_seq_embed = fc('attention_embed', embedded_seq_reshape, output_dim=lstm_dim*2)
            word_seq_embed = tf.reshape(word_seq_embed, to_T([T, N, lstm_dim*2]))

            # 2.2 Elementwise multiply word_seq_embed with q and l2-normalization
            eltwise_mult = tf.nn.l2_normalize(word_seq_embed * q_reshape, 2)

            # 2.3 Classification for attention scores
            eltwise_mult = tf.reshape(eltwise_mult, [-1, lstm_dim*2])
            # scores_obj1, scores_obj2 and scores_rel have shape [T, N, 1]
            score_shape = to_T([T, N, 1])
            scores_obj1 = tf.reshape(fc('fc_scores_obj1', eltwise_mult, output_dim=1), score_shape)
            scores_obj2 = tf.reshape(fc('fc_scores_obj2', eltwise_mult, output_dim=1), score_shape)
            scores_rel = tf.reshape(fc('fc_scores_rel', eltwise_mult, output_dim=1), score_shape)

            # 2.4 Compute probability and average BoW representation
            # probs_obj1, probs_obj2 and probs_rel have shape [T, N, 1]
            # Remove the probability over <pad> (<pad> is 0)
            is_not_pad = tf.cast(tf.not_equal(text_seq_batch, 0)[..., tf.newaxis], tf.float32)
            probs_obj1 = tf.nn.softmax(scores_obj1, dim=0)*is_not_pad
            probs_obj2 = tf.nn.softmax(scores_obj2, dim=0)*is_not_pad
            probs_rel = tf.nn.softmax(scores_rel, dim=0)*is_not_pad
            probs_obj1 = probs_obj1 / tf.reduce_sum(probs_obj1, 0, keep_dims=True)
            probs_obj2 = probs_obj2 / tf.reduce_sum(probs_obj2, 0, keep_dims=True)
            probs_rel = probs_rel / tf.reduce_sum(probs_rel, 0, keep_dims=True)

            # BoW_obj1, BoW_obj2 and BoW_rel have shape [N, embed_dim]
            BoW_obj1 = tf.reduce_sum(probs_obj1*embedded_seq, reduction_indices=0)
            BoW_obj2 = tf.reduce_sum(probs_obj2*embedded_seq, reduction_indices=0)
            BoW_rel = tf.reduce_sum(probs_rel*embedded_seq, reduction_indices=0)
            BoW_obj1.set_shape([None, embed_dim])
            BoW_obj2.set_shape([None, embed_dim])
            BoW_rel.set_shape([None, embed_dim])

            tf.add_to_collection("attention_probs", (probs_obj1, probs_obj2, probs_rel))

    return (BoW_obj1, BoW_obj2, BoW_rel)
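A minimal usage sketch (the masking above implies a time-major int32 [T, N] batch with 0 as the <pad> token; the hyperparameter values are placeholders):

import tensorflow as tf

text_seq = tf.placeholder(tf.int32, [None, None])  # [T, N], time-major
bow_obj1, bow_obj2, bow_rel = attbilstm_simple(
    text_seq, 'lstm', num_vocab=8000, embed_dim=300,
    lstm_dim=512, apply_dropout=False)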