Example #1
    def gated_exchange_fusion_lstm_2times(self, feat4, feat5, lang_feat, threshold=0.5):
        '''
        Fuse the exchanged features of level4 and level5.
        A convolutional LSTM fuses the exchanged features.
        :param feat4: [B, H, W, C]
        :param feat5: [B, H, W, C]
        :param lang_feat: [B, 1, 1, C]
        :param threshold: unused in this variant
        :return: fused feature map [B, H, W, mlp_dim]
        '''

        feat_exg4 = self.gated_exchange_module(feat4, feat5, lang_feat, 'c4')
        feat_exg4 = tf.nn.l2_normalize(feat_exg4, 3)
        feat_exg5 = self.gated_exchange_module(feat5, feat4, lang_feat, 'c5')
        feat_exg5 = tf.nn.l2_normalize(feat_exg5, 3)

        # Second time

        feat_exg4_2 = self.gated_exchange_module(feat_exg4, feat_exg5, lang_feat, 'c4_2')
        feat_exg4_2 = tf.nn.l2_normalize(feat_exg4_2, 3)
        feat_exg5_2 = self.gated_exchange_module(feat_exg5, feat_exg4, lang_feat, 'c5_2')
        feat_exg5_2 = tf.nn.l2_normalize(feat_exg5_2, 3)
        
        # Convolutional LSTM Fuse
        convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim, [1, 1])
        convlstm_input = tf.stack((feat_exg4_2, feat_exg5_2), axis=1)
        # convlstm_input is already a tensor, so no conversion is needed
        convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell,
                                                     convlstm_input,
                                                     dtype=tf.float32)
        fused_feat = convlstm_outputs[:,-1]
        print("Build Gated Fusion with ConvLSTM two times.")

        return fused_feat
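A note on the fusion step: tf.stack((feat_exg4_2, feat_exg5_2), axis=1) turns the two pyramid levels into a length-2 sequence, so tf.nn.dynamic_rnn treats the levels as time steps of the ConvLSTM. A minimal shape sketch with toy sizes (the dimensions are stand-ins, not taken from the example):

import tensorflow as tf  # assumes TF 1.x, as in the examples

# Toy stand-ins for the two pyramid levels: B=2, H=W=10, C=500.
feat4 = tf.zeros([2, 10, 10, 500])
feat5 = tf.zeros([2, 10, 10, 500])

# Stacking on axis=1 creates a length-2 "time" axis, so dynamic_rnn
# visits level 4 first and level 5 second.
convlstm_input = tf.stack((feat4, feat5), axis=1)
print(convlstm_input.shape)  # (2, 2, 10, 10, 500)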
Example #2
    def gated_exchange_fusion_lstm_2times(self,
                                          feat4,
                                          feat5,
                                          lang_feat,
                                          spatial,
                                          threshold=0.5):
        '''
        Fuse the exchanged features of level4 and level5.
        A convolutional LSTM fuses the exchanged features after a
        Mutan-style fusion with the language and spatial features.
        :param feat4: [B, H, W, C]
        :param feat5: [B, H, W, C]
        :param lang_feat: [B, 1, 1, C]
        :param spatial: [B, H, W, 8] spatial coordinate grid
        :param threshold: used only by the commented-out consistency-score gating
        :return: fused feature map [B, H, W, mlp_dim]
        '''
        #         feat_exg3 = self.gated_exchange_module(feat3, feat4, feat5, lang_feat, 'c3')
        #         feat_exg3 = tf.nn.l2_normalize(feat_exg3, 3)
        # feat5 = tf.cond(self.consitency_score > threshold,
        #                     lambda: feat5,
        #                     lambda: tf.identity(feat4))
        feat_exg4 = self.gated_exchange_module(feat4, feat5, lang_feat, 'c4')
        feat_exg4 = tf.nn.l2_normalize(feat_exg4, 3)
        feat_exg5 = self.gated_exchange_module(feat5, feat4, lang_feat, 'c5')
        feat_exg5 = tf.nn.l2_normalize(feat_exg5, 3)

        # Second time
        #         feat_exg3_2 = self.gated_exchange_module(feat_exg3, feat_exg4, feat_exg5, lang_feat, 'c3_2')
        #         feat_exg3_2 = tf.nn.l2_normalize(feat_exg3_2, 3)
        feat_exg4_2 = self.gated_exchange_module(feat_exg4, feat_exg5,
                                                 lang_feat, 'c4_2')
        feat_exg4_2 = tf.nn.l2_normalize(feat_exg4_2, 3)
        feat_exg5_2 = self.gated_exchange_module(feat_exg5, feat_exg4,
                                                 lang_feat, 'c5_2')
        feat_exg5_2 = tf.nn.l2_normalize(feat_exg5_2, 3)

        # Convolutional LSTM Fuse
        feat_exg4_mutan = self.mutan_fusion(lang_feat,
                                            spatial,
                                            feat_exg4_2,
                                            level='c4')
        feat_exg5_mutan = self.mutan_fusion(lang_feat,
                                            spatial,
                                            feat_exg5_2,
                                            level='c5')
        convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim,
                                     [1, 1])
        convlstm_input = tf.stack((feat_exg4_mutan, feat_exg5_mutan), axis=1)
        # convlstm_input = tf.cond(self.consitency_score > threshold,
        #                             lambda: tf.stack((feat_exg4_2, feat_exg5_2), axis=1),
        #                             lambda: tf.stack((feat_exg4_2, feat_exg4_2), axis=1))
        # convlstm_input is already a tensor, so no conversion is needed
        convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell,
                                                     convlstm_input,
                                                     dtype=tf.float32)
        fused_feat = convlstm_outputs[:, -1]
        print("Build Gated Fusion with ConvLSTM two times.")

        return fused_feat
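Example #2 differs from Example #1 by passing each exchanged feature through self.mutan_fusion (together with lang_feat and spatial) before the ConvLSTM. The body of mutan_fusion is not shown in these examples; the sketch below is a generic Mutan-style low-rank bilinear fusion for orientation only, with hypothetical names and without the spatial input:

import tensorflow as tf  # assumes TF 1.x, as in the examples

def mutan_like_fusion(lang_feat, vis_feat, out_dim, rank=5, scope='mutan_sketch'):
    # Hypothetical sketch: project each modality `rank` times, multiply
    # elementwise (lang_feat [B, 1, 1, C] broadcasts over H and W), and
    # sum the rank terms. This is NOT the mutan_fusion used above.
    with tf.variable_scope(scope):
        acc = 0.
        for r in range(rank):
            l = tf.layers.dense(lang_feat, out_dim, name='lang_%d' % r)
            v = tf.layers.dense(vis_feat, out_dim, name='vis_%d' % r)
            acc = acc + l * v
        return tf.tanh(acc)

# lang_feat: [B, 1, 1, C], vis_feat: [B, H, W, C] -> [B, H, W, out_dim]
fused = mutan_like_fusion(tf.zeros([1, 1, 1, 500]),
                          tf.zeros([1, 10, 10, 500]), 500)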
Example #3
    def build_graph(self):
        print("#" * 30)
        print("LSCM_model_p2345, function version")
        print("#" * 30)

        embedding_mat = tf.Variable(self.glove)
        embedded_seq = tf.nn.embedding_lookup(
            embedding_mat,
            tf.transpose(self.words))  # [num_step, batch_size, glove_emb]
        print("Build Glove Embedding.")

        rnn_cell_basic = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size,
                                                      state_is_tuple=False)
        if self.mode == 'train' and self.keep_prob_rnn < 1:
            rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
                rnn_cell_basic, output_keep_prob=self.keep_prob_rnn)
        cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] *
                                           self.num_rnn_layers,
                                           state_is_tuple=False)

        state = cell.zero_state(self.batch_size, tf.float32)
        state_shape = state.get_shape().as_list()
        state_shape[0] = self.batch_size
        state.set_shape(state_shape)

        words_feat_list = []

        def f1():
            # return tf.constant(0.), state
            return tf.zeros([self.batch_size, self.rnn_size]), state

        def f2():
            # Word input to embedding layer
            w_emb = embedded_seq[n, :, :]
            if self.mode == 'train' and self.keep_prob_emb < 1:
                w_emb = tf.nn.dropout(w_emb, self.keep_prob_emb)
            return cell(w_emb, state)

        with tf.variable_scope("RNN"):
            for n in range(self.num_steps):
                if n > 0:
                    tf.get_variable_scope().reuse_variables()

                # rnn_output, state = cell(w_emb, state)
                rnn_output, state = tf.cond(
                    tf.equal(self.words[0, n], tf.constant(0)), f1, f2)
                word_feat = tf.reshape(rnn_output,
                                       [self.batch_size, 1, self.rnn_size])
                words_feat_list.append(word_feat)

        # words_feat: [B, num_steps, rnn_size]
        words_feat = tf.concat(words_feat_list, 1)
        words_feat = tf.slice(
            words_feat, [0, self.valid_idx[0, 0], 0],
            [-1, self.num_steps - self.valid_idx[0, 0], -1])  # [B, T, C]

        lang_feat = tf.reduce_max(words_feat, 1)  # [B, rnn_size]
        lang_feat = tf.reshape(lang_feat,
                               [self.batch_size, 1, 1, self.rnn_size])
        lang_feat = tf.nn.l2_normalize(lang_feat, 3)  # [B, 1, 1, C]

        words_feat = tf.nn.l2_normalize(words_feat, 2)
        # words_feat: [B, 1, num_words, rnn_size]
        words_feat = tf.expand_dims(words_feat, 1)

        visual_feat_c5 = self._conv("c5_lateral", self.visual_feat_c5, 1,
                                    self.vf_dim, self.v_emb_dim, [1, 1, 1, 1])
        visual_feat_c5 = tf.nn.l2_normalize(visual_feat_c5, 3)
        visual_feat_c4 = self._conv("c4_lateral", self.visual_feat_c4, 1, 1024,
                                    self.v_emb_dim, [1, 1, 1, 1])
        visual_feat_c4 = tf.nn.l2_normalize(visual_feat_c4, 3)
        visual_feat_c3 = self._conv("c3_lateral", self.visual_feat_c3, 1, 512,
                                    self.v_emb_dim, [1, 1, 1, 1])
        visual_feat_c3 = tf.nn.l2_normalize(visual_feat_c3, 3)
        visual_feat_c2 = self._conv("c2_lateral", self.visual_feat_c2, 3, 256,
                                    self.v_emb_dim, [1, 2, 2, 1])
        visual_feat_c2 = tf.nn.l2_normalize(visual_feat_c2, 3)

        # Generate spatial grid
        spatial = tf.convert_to_tensor(
            generate_spatial_batch(self.batch_size, self.vf_h, self.vf_w))

        fusion_c5 = self.build_full_module(visual_feat_c5,
                                           words_feat,
                                           lang_feat,
                                           spatial,
                                           level="c5")
        fusion_c4 = self.build_full_module(visual_feat_c4,
                                           words_feat,
                                           lang_feat,
                                           spatial,
                                           level="c4")
        fusion_c3 = self.build_full_module(visual_feat_c3,
                                           words_feat,
                                           lang_feat,
                                           spatial,
                                           level="c3")
        fusion_c2 = self.build_full_module(visual_feat_c2,
                                           words_feat,
                                           lang_feat,
                                           spatial,
                                           level="c2")

        score_c5 = self._conv("score_c5", fusion_c5, 3, self.mlp_dim, 1,
                              [1, 1, 1, 1])
        self.up_c5 = tf.image.resize_bilinear(score_c5, [self.H, self.W])
        score_c4 = self._conv("score_c4", fusion_c4, 3, self.mlp_dim, 1,
                              [1, 1, 1, 1])
        self.up_c4 = tf.image.resize_bilinear(score_c4, [self.H, self.W])
        score_c3 = self._conv("score_c3", fusion_c3, 3, self.mlp_dim, 1,
                              [1, 1, 1, 1])
        self.up_c3 = tf.image.resize_bilinear(score_c3, [self.H, self.W])
        score_c2 = self._conv("score_c2", fusion_c2, 3, self.mlp_dim, 1,
                              [1, 1, 1, 1])
        self.up_c2 = tf.image.resize_bilinear(score_c2, [self.H, self.W])

        # Convolutional LSTM
        convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim,
                                     [1, 1])
        convlstm_outputs, states = tf.nn.dynamic_rnn(
            convlstm_cell,
            tf.convert_to_tensor([[
                fusion_c5[0], fusion_c4[0], fusion_c3[0], fusion_c2[0],
                fusion_c3[0], fusion_c4[0], fusion_c5[0]
            ]]),
            dtype=tf.float32)

        score = self._conv("score", convlstm_outputs[:, -1], 3, self.mlp_dim,
                           1, [1, 1, 1, 1])

        self.pred = score
        self.up = tf.image.resize_bilinear(self.pred, [self.H, self.W])
        self.sigm = tf.sigmoid(self.up)
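The ConvLSTM input above is built as tf.convert_to_tensor([[fusion_c5[0], ...]]), which indexes away the batch axis and re-adds it via a Python list, so it only works when batch_size == 1. A sketch of a batch-agnostic equivalent using tf.stack (toy shapes, same coarse-to-fine-and-back ordering):

import tensorflow as tf  # assumes TF 1.x, as in the examples

# Toy stand-ins: B=1, H=W=10, C=500.
fusion_c5 = tf.zeros([1, 10, 10, 500])
fusion_c4 = tf.zeros([1, 10, 10, 500])
fusion_c3 = tf.zeros([1, 10, 10, 500])
fusion_c2 = tf.zeros([1, 10, 10, 500])

# tf.stack on axis=1 keeps the batch dimension intact and builds the
# same [B, T=7, H, W, C] sequence for dynamic_rnn.
convlstm_inputs = tf.stack(
    [fusion_c5, fusion_c4, fusion_c3, fusion_c2,
     fusion_c3, fusion_c4, fusion_c5], axis=1)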
Example #4
    def build_graph(self):

        if self.weights == 'deeplab':
            visual_feat = self._conv("mlp0", self.visual_feat, 1, self.vf_dim,
                                     self.v_emb_dim, [1, 1, 1, 1])
        elif self.weights == 'resnet':
            visual_feat = self.visual_feat

        embedding_mat = tf.get_variable(
            "embedding", [self.vocab_size, self.w_emb_dim],
            initializer=tf.random_uniform_initializer(minval=-0.08,
                                                      maxval=0.08))
        embedded_seq = tf.nn.embedding_lookup(embedding_mat,
                                              tf.transpose(self.words))

        rnn_cell_basic = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size,
                                                      state_is_tuple=False)
        if self.mode == 'train' and self.keep_prob_rnn < 1:
            rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
                rnn_cell_basic, output_keep_prob=self.keep_prob_rnn)
        cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] *
                                           self.num_rnn_layers,
                                           state_is_tuple=False)

        state = cell.zero_state(self.batch_size, tf.float32)
        state_shape = state.get_shape().as_list()
        state_shape[0] = self.batch_size
        state.set_shape(state_shape)

        def f1():
            # Zeros shaped like the cell output keep both tf.cond branches
            # structurally consistent (the old scalar version left the
            # merged output shape undefined).
            # return tf.constant(0.), state
            return tf.zeros([self.batch_size, self.rnn_size]), state

        def f2():
            # Word input to embedding layer
            w_emb = embedded_seq[n, :, :]
            if self.mode == 'train' and self.keep_prob_emb < 1:
                w_emb = tf.nn.dropout(w_emb, self.keep_prob_emb)
            return cell(w_emb, state)

        with tf.variable_scope("RNN"):
            for n in range(self.num_steps):
                if n > 0:
                    tf.get_variable_scope().reuse_variables()

                # rnn_output, state = cell(w_emb, state)
                rnn_output, state = tf.cond(
                    tf.equal(self.words[0, n], tf.constant(0)), f1, f2)

        lang_feat = tf.reshape(rnn_output,
                               [self.batch_size, 1, 1, self.rnn_size])
        lang_feat = tf.nn.l2_normalize(lang_feat, 3)
        lang_feat = tf.tile(lang_feat, [1, self.vf_h, self.vf_w, 1])

        # Generate spatial grid
        visual_feat = tf.nn.l2_normalize(visual_feat, 3)
        spatial = tf.convert_to_tensor(
            generate_spatial_batch(self.batch_size, self.vf_h, self.vf_w))

        feat_all = tf.concat([visual_feat, lang_feat, spatial], 3)

        # RNN output to visual weights
        fusion = self._conv("fusion", feat_all, 1,
                            self.v_emb_dim + self.rnn_size + 8, self.mlp_dim,
                            [1, 1, 1, 1])
        fusion = tf.nn.relu(fusion)

        c5_lateral = self._conv("c5_lateral", self.visual_feat, 1, self.vf_dim,
                                self.mlp_dim, [1, 1, 1, 1])
        c5_lateral = tf.nn.relu(c5_lateral)

        c4_lateral = self._conv("c4_lateral", self.visual_feat_c4, 1, 1024,
                                self.mlp_dim, [1, 1, 1, 1])
        c4_lateral = tf.nn.relu(c4_lateral)

        c3_lateral = self._conv("c3_lateral", self.visual_feat_c3, 1, 512,
                                self.mlp_dim, [1, 1, 1, 1])
        c3_lateral = tf.nn.relu(c3_lateral)

        # Convolutional LSTM
        convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim,
                                     [1, 1])
        convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell,
                                                     tf.convert_to_tensor([[
                                                         fusion[0],
                                                         c5_lateral[0],
                                                         c4_lateral[0],
                                                         c3_lateral[0]
                                                     ]]),
                                                     dtype=tf.float32)

        score = self._conv("score", convlstm_outputs[:, -1], 3, self.mlp_dim,
                           1, [1, 1, 1, 1])

        self.pred = score
        self.up = tf.image.resize_bilinear(self.pred, [self.H, self.W])
        self.sigm = tf.sigmoid(self.up)
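Examples #3 and #4 skip padded words (id 0) inside the RNN loop with tf.cond. A minimal standalone sketch of the pattern, assuming TF 1.x and toy dimensions; returning zeros shaped like the cell output keeps both branches structurally identical, so the merged output shape stays fully defined for the reshape that follows:

import tensorflow as tf  # assumes TF 1.x, as in the examples

batch_size, rnn_size, w_emb_dim = 1, 1000, 300  # toy sizes
cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=False)
state = cell.zero_state(batch_size, tf.float32)
w_emb = tf.ones([batch_size, w_emb_dim])  # stand-in word embedding
word_id = tf.constant(0)                  # id 0 marks padding here

# Both tf.cond branches return (output, state) with matching shapes.
rnn_output, state = tf.cond(
    tf.equal(word_id, tf.constant(0)),
    lambda: (tf.zeros([batch_size, rnn_size]), state),
    lambda: cell(w_emb, state))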
Example #5
    def build_graph(self):
        # Obtain visual feature
        visual_feat = self._conv('mlp0', self.visual_feat, 1, self.vf_dim,
            self.v_emb_dim, [1, 1, 1, 1])

        # word embedding
        embed_mat = tf.get_variable('embedding', [self.vocab_size, self.w_emb_dim],
            initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
        embed_seq = tf.nn.embedding_lookup(embed_mat, tf.transpose(self.words))

        # LSTM cell for language feature extraction
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
        state = lstm_cell.zero_state(self.batch_size, tf.float32)

        def skip():
            # Zeros shaped like the LSTM output keep both tf.cond branches
            # structurally consistent.
            return tf.zeros([self.batch_size, self.rnn_size]), state

        def update_cell():
            return lstm_cell(embed_seq[n, :, :], state)

        with tf.variable_scope('RNN'):
            for n in range(self.num_steps):
                if n > 0:
                    tf.get_variable_scope().reuse_variables()
                rnn_output, state = tf.cond(
                    tf.equal(self.words[0, n], tf.constant(0)),
                    skip, update_cell)

        # Obtain language feature
        lang_feat = tf.reshape(rnn_output, [self.batch_size, 1, 1, self.rnn_size])
        lang_feat = tf.nn.l2_normalize(lang_feat, 3)
        lang_feat = tf.tile(lang_feat, [1, self.vf_h, self.vf_w, 1])

        # Generate spatial grid
        spatial_feat = tf.convert_to_tensor(generate_spatial_batch(
            self.batch_size, self.vf_h, self.vf_w))

        # Fuse all features
        feat_all = tf.concat([visual_feat, lang_feat, spatial_feat], 3)

        fusion = self._conv('fusion', feat_all, 1,
            self.v_emb_dim + self.rnn_size + 8, self.convlstm_dim, [1, 1, 1, 1])
        fusion = tf.nn.relu(fusion)

        # Hierarchical feature extraction
        c5_lateral = self._conv('c5_lateral', self.visual_feat_c5, 1, 2048,
            self.convlstm_dim, [1, 1, 1, 1])
        c5_lateral = tf.nn.relu(c5_lateral)

        c4_lateral = self._conv('c4_lateral', self.visual_feat_c4, 1, 1024,
            self.convlstm_dim, [1, 1, 1, 1])
        c4_lateral = tf.nn.relu(c4_lateral)

        c3_lateral = self._conv('c3_lateral', self.visual_feat_c3, 1, 512,
            self.convlstm_dim, [1, 1, 1, 1])
        c3_lateral = tf.nn.relu(c3_lateral)

        # Recurrent refinement via Convolutional LSTM
        convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.convlstm_dim, [1, 1])
        convlstm_inputs = tf.convert_to_tensor(
            [[fusion[0], c5_lateral[0], c4_lateral[0], c3_lateral[0]]])
        convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell,
            convlstm_inputs, dtype=tf.float32)

        # Obtain score and prediction
        self.score = self._conv('score', convlstm_outputs[:, -1], 3,
            self.convlstm_dim, 1, [1, 1, 1, 1])
        self.pred = tf.image.resize_bilinear(self.score, [self.H, self.W])
        if self.mode == 'test':
            self.sigm = tf.sigmoid(self.pred)
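At test time Example #5 exposes self.sigm, a per-pixel probability map at input resolution. A hypothetical post-processing step, not part of the example (the 0.5 threshold is an assumption):

import tensorflow as tf  # assumes TF 1.x, as in the examples

# Threshold the sigmoid map to get a binary segmentation mask.
sigm = tf.zeros([1, 320, 320, 1])      # stand-in for self.sigm
mask = tf.cast(sigm > 0.5, tf.uint8)   # [B, H, W, 1], values in {0, 1}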