def gated_exchange_fusion_lstm_2times(self, feat4, feat5, lang_feat, threshold=0.5):
    '''
    Fuse the exchanged features of level4 and level5.
    A convolutional LSTM is used to fuse the exchanged features.
    :param feat4: [B, H, W, C]
    :param feat5: [B, H, W, C]
    :param lang_feat: [B, 1, 1, C]
    :param threshold: unused here; kept for interface compatibility
    :return: fused feature map [B, H, W, mlp_dim]
    '''
    # First exchange: each level attends to the other, gated by language
    feat_exg4 = self.gated_exchange_module(feat4, feat5, lang_feat, 'c4')
    feat_exg4 = tf.nn.l2_normalize(feat_exg4, 3)
    feat_exg5 = self.gated_exchange_module(feat5, feat4, lang_feat, 'c5')
    feat_exg5 = tf.nn.l2_normalize(feat_exg5, 3)

    # Second exchange
    feat_exg4_2 = self.gated_exchange_module(feat_exg4, feat_exg5, lang_feat, 'c4_2')
    feat_exg4_2 = tf.nn.l2_normalize(feat_exg4_2, 3)
    feat_exg5_2 = self.gated_exchange_module(feat_exg5, feat_exg4, lang_feat, 'c5_2')
    feat_exg5_2 = tf.nn.l2_normalize(feat_exg5_2, 3)

    # Fuse the two exchanged maps as a length-2 sequence with a ConvLSTM
    convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim, [1, 1])
    convlstm_input = tf.stack((feat_exg4_2, feat_exg5_2), axis=1)
    convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell, convlstm_input,
                                                 dtype=tf.float32)
    fused_feat = convlstm_outputs[:, -1]
    print("Build Gated Fusion with ConvLSTM two times.")
    return fused_feat
def gated_exchange_fusion_lstm_2times(self, feat4, feat5, lang_feat, spatial,
                                      threshold=0.5):
    '''
    Fuse the exchanged features of level4 and level5.
    Each exchanged feature is combined with the language and spatial
    features via MUTAN fusion before a convolutional LSTM fuses the two.
    :param feat4: [B, H, W, C]
    :param feat5: [B, H, W, C]
    :param lang_feat: [B, 1, 1, C]
    :param spatial: [B, H, W, 8] spatial coordinate features
    :param threshold: only referenced by the commented-out consistency gating
    :return: fused feature map [B, H, W, mlp_dim]
    '''
    # feat_exg3 = self.gated_exchange_module(feat3, feat4, feat5, lang_feat, 'c3')
    # feat_exg3 = tf.nn.l2_normalize(feat_exg3, 3)
    # feat5 = tf.cond(self.consitency_score > threshold,
    #                 lambda: feat5,
    #                 lambda: tf.identity(feat4))

    # First exchange: each level attends to the other, gated by language
    feat_exg4 = self.gated_exchange_module(feat4, feat5, lang_feat, 'c4')
    feat_exg4 = tf.nn.l2_normalize(feat_exg4, 3)
    feat_exg5 = self.gated_exchange_module(feat5, feat4, lang_feat, 'c5')
    feat_exg5 = tf.nn.l2_normalize(feat_exg5, 3)

    # Second exchange
    # feat_exg3_2 = self.gated_exchange_module(feat_exg3, feat_exg4, feat_exg5, lang_feat, 'c3_2')
    # feat_exg3_2 = tf.nn.l2_normalize(feat_exg3_2, 3)
    feat_exg4_2 = self.gated_exchange_module(feat_exg4, feat_exg5, lang_feat, 'c4_2')
    feat_exg4_2 = tf.nn.l2_normalize(feat_exg4_2, 3)
    feat_exg5_2 = self.gated_exchange_module(feat_exg5, feat_exg4, lang_feat, 'c5_2')
    feat_exg5_2 = tf.nn.l2_normalize(feat_exg5_2, 3)

    # MUTAN fusion with language and spatial features, then ConvLSTM fuse
    feat_exg4_mutan = self.mutan_fusion(lang_feat, spatial, feat_exg4_2, level='c4')
    feat_exg5_mutan = self.mutan_fusion(lang_feat, spatial, feat_exg5_2, level='c5')
    convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim, [1, 1])
    convlstm_input = tf.stack((feat_exg4_mutan, feat_exg5_mutan), axis=1)
    # convlstm_input = tf.cond(self.consitency_score > threshold,
    #                          lambda: tf.stack((feat_exg4_2, feat_exg5_2), axis=1),
    #                          lambda: tf.stack((feat_exg4_2, feat_exg4_2), axis=1))
    convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell, convlstm_input,
                                                 dtype=tf.float32)
    fused_feat = convlstm_outputs[:, -1]
    print("Build Gated Fusion with ConvLSTM two times.")
    return fused_feat
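# --- Hedged sketch (not part of the model): the ConvLSTM fusion step above
# treats the two exchanged maps as a length-2 sequence and keeps the last
# output. The sketch below shows that step in isolation, assuming
# tf.contrib.rnn.ConvLSTMCell (TF 1.x) behaves like the repo's own
# ConvLSTMCell, which is an assumption about equivalent behavior.
import tensorflow as tf

def convlstm_fuse_sketch(feat_a, feat_b, out_channels):
    # feat_a, feat_b: [B, H, W, C]; stack along a new "time" axis.
    seq = tf.stack([feat_a, feat_b], axis=1)  # [B, 2, H, W, C]
    h, w, c = seq.get_shape().as_list()[2:]
    cell = tf.contrib.rnn.ConvLSTMCell(conv_ndims=2, input_shape=[h, w, c],
                                       output_channels=out_channels,
                                       kernel_shape=[1, 1])
    outputs, _ = tf.nn.dynamic_rnn(cell, seq, dtype=tf.float32)
    return outputs[:, -1]  # the last step carries the fused map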
def build_graph(self):
    print("#" * 30)
    print("LSCM_model_p2345, function version")
    print("#" * 30)

    # GloVe word embeddings
    embedding_mat = tf.Variable(self.glove)
    embedded_seq = tf.nn.embedding_lookup(embedding_mat, tf.transpose(self.words))
    # embedded_seq: [num_steps, batch_size, glove_emb]
    print("Build Glove Embedding.")

    rnn_cell_basic = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size, state_is_tuple=False)
    if self.mode == 'train' and self.keep_prob_rnn < 1:
        rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
            rnn_cell_basic, output_keep_prob=self.keep_prob_rnn)
    cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] * self.num_rnn_layers,
                                       state_is_tuple=False)

    state = cell.zero_state(self.batch_size, tf.float32)
    state_shape = state.get_shape().as_list()
    state_shape[0] = self.batch_size
    state.set_shape(state_shape)

    words_feat_list = []

    def f1():
        # Padding token: skip the cell and emit zeros of the output shape
        return tf.zeros([self.batch_size, self.rnn_size]), state

    def f2():
        # Word input to embedding layer
        w_emb = embedded_seq[n, :, :]
        if self.mode == 'train' and self.keep_prob_emb < 1:
            w_emb = tf.nn.dropout(w_emb, self.keep_prob_emb)
        return cell(w_emb, state)

    with tf.variable_scope("RNN"):
        for n in range(self.num_steps):
            if n > 0:
                tf.get_variable_scope().reuse_variables()
            rnn_output, state = tf.cond(
                tf.equal(self.words[0, n], tf.constant(0)), f1, f2)
            word_feat = tf.reshape(rnn_output, [self.batch_size, 1, self.rnn_size])
            words_feat_list.append(word_feat)

    # words_feat: [B, num_steps, rnn_size]
    words_feat = tf.concat(words_feat_list, 1)
    # Drop the leading padding steps
    words_feat = tf.slice(words_feat, [0, self.valid_idx[0, 0], 0],
                          [-1, self.num_steps - self.valid_idx[0, 0], -1])  # [B, T, C]

    # Sentence-level language feature via max pooling over time
    lang_feat = tf.reduce_max(words_feat, 1)  # [B, rnn_size]
    lang_feat = tf.reshape(lang_feat, [self.batch_size, 1, 1, self.rnn_size])
    lang_feat = tf.nn.l2_normalize(lang_feat, 3)  # [B, 1, 1, C]

    words_feat = tf.nn.l2_normalize(words_feat, 2)
    # words_feat: [B, 1, num_words, rnn_size]
    words_feat = tf.expand_dims(words_feat, 1)

    # Lateral convs project each backbone level to the embedding dimension
    visual_feat_c5 = self._conv("c5_lateral", self.visual_feat_c5, 1,
                                self.vf_dim, self.v_emb_dim, [1, 1, 1, 1])
    visual_feat_c5 = tf.nn.l2_normalize(visual_feat_c5, 3)
    visual_feat_c4 = self._conv("c4_lateral", self.visual_feat_c4, 1,
                                1024, self.v_emb_dim, [1, 1, 1, 1])
    visual_feat_c4 = tf.nn.l2_normalize(visual_feat_c4, 3)
    visual_feat_c3 = self._conv("c3_lateral", self.visual_feat_c3, 1,
                                512, self.v_emb_dim, [1, 1, 1, 1])
    visual_feat_c3 = tf.nn.l2_normalize(visual_feat_c3, 3)
    # c2 is downsampled with stride 2 to match the other levels
    visual_feat_c2 = self._conv("c2_lateral", self.visual_feat_c2, 3,
                                256, self.v_emb_dim, [1, 2, 2, 1])
    visual_feat_c2 = tf.nn.l2_normalize(visual_feat_c2, 3)

    # Generate spatial grid
    spatial = tf.convert_to_tensor(
        generate_spatial_batch(self.batch_size, self.vf_h, self.vf_w))

    fusion_c5 = self.build_full_module(visual_feat_c5, words_feat, lang_feat,
                                       spatial, level="c5")
    fusion_c4 = self.build_full_module(visual_feat_c4, words_feat, lang_feat,
                                       spatial, level="c4")
    fusion_c3 = self.build_full_module(visual_feat_c3, words_feat, lang_feat,
                                       spatial, level="c3")
    fusion_c2 = self.build_full_module(visual_feat_c2, words_feat, lang_feat,
                                       spatial, level="c2")

    # Per-level auxiliary scores, upsampled to input resolution
    score_c5 = self._conv("score_c5", fusion_c5, 3, self.mlp_dim, 1, [1, 1, 1, 1])
    self.up_c5 = tf.image.resize_bilinear(score_c5, [self.H, self.W])
    score_c4 = self._conv("score_c4", fusion_c4, 3, self.mlp_dim, 1, [1, 1, 1, 1])
    self.up_c4 = tf.image.resize_bilinear(score_c4, [self.H, self.W])
    score_c3 = self._conv("score_c3", fusion_c3, 3, self.mlp_dim, 1, [1, 1, 1, 1])
    self.up_c3 = tf.image.resize_bilinear(score_c3, [self.H, self.W])
    score_c2 = self._conv("score_c2", fusion_c2, 3, self.mlp_dim, 1, [1, 1, 1, 1])
    self.up_c2 = tf.image.resize_bilinear(score_c2, [self.H, self.W])

    # Convolutional LSTM over the coarse-to-fine-to-coarse level sequence
    # (assumes batch_size == 1, hence the [0] indexing)
    convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim, [1, 1])
    convlstm_outputs, states = tf.nn.dynamic_rnn(
        convlstm_cell,
        tf.convert_to_tensor([[fusion_c5[0], fusion_c4[0], fusion_c3[0],
                               fusion_c2[0], fusion_c3[0], fusion_c4[0],
                               fusion_c5[0]]]),
        dtype=tf.float32)
    score = self._conv("score", convlstm_outputs[:, -1], 3, self.mlp_dim, 1,
                       [1, 1, 1, 1])

    self.pred = score
    self.up = tf.image.resize_bilinear(self.pred, [self.H, self.W])
    self.sigm = tf.sigmoid(self.up)
def build_graph(self):
    if self.weights == 'deeplab':
        visual_feat = self._conv("mlp0", self.visual_feat, 1, self.vf_dim,
                                 self.v_emb_dim, [1, 1, 1, 1])
    elif self.weights == 'resnet':
        visual_feat = self.visual_feat

    embedding_mat = tf.get_variable(
        "embedding", [self.vocab_size, self.w_emb_dim],
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
    embedded_seq = tf.nn.embedding_lookup(embedding_mat, tf.transpose(self.words))

    rnn_cell_basic = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size, state_is_tuple=False)
    if self.mode == 'train' and self.keep_prob_rnn < 1:
        rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
            rnn_cell_basic, output_keep_prob=self.keep_prob_rnn)
    cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] * self.num_rnn_layers,
                                       state_is_tuple=False)

    state = cell.zero_state(self.batch_size, tf.float32)
    state_shape = state.get_shape().as_list()
    state_shape[0] = self.batch_size
    state.set_shape(state_shape)

    def f1():
        # Padding token: skip the cell and emit zeros of the output shape
        # (a scalar tf.constant(0.) would not match f2's output shape)
        return tf.zeros([self.batch_size, self.rnn_size]), state

    def f2():
        # Word input to embedding layer
        w_emb = embedded_seq[n, :, :]
        if self.mode == 'train' and self.keep_prob_emb < 1:
            w_emb = tf.nn.dropout(w_emb, self.keep_prob_emb)
        return cell(w_emb, state)

    with tf.variable_scope("RNN"):
        for n in range(self.num_steps):
            if n > 0:
                tf.get_variable_scope().reuse_variables()
            rnn_output, state = tf.cond(
                tf.equal(self.words[0, n], tf.constant(0)), f1, f2)

    # Sentence-level language feature, tiled over the spatial grid
    lang_feat = tf.reshape(rnn_output, [self.batch_size, 1, 1, self.rnn_size])
    lang_feat = tf.nn.l2_normalize(lang_feat, 3)
    lang_feat = tf.tile(lang_feat, [1, self.vf_h, self.vf_w, 1])

    # Generate spatial grid
    visual_feat = tf.nn.l2_normalize(visual_feat, 3)
    spatial = tf.convert_to_tensor(
        generate_spatial_batch(self.batch_size, self.vf_h, self.vf_w))

    # Concatenate visual, language, and spatial features, then fuse
    feat_all = tf.concat([visual_feat, lang_feat, spatial], 3)
    fusion = self._conv("fusion", feat_all, 1,
                        self.v_emb_dim + self.rnn_size + 8,
                        self.mlp_dim, [1, 1, 1, 1])
    fusion = tf.nn.relu(fusion)

    # Lateral connections from the backbone levels
    c5_lateral = self._conv("c5_lateral", self.visual_feat, 1, self.vf_dim,
                            self.mlp_dim, [1, 1, 1, 1])
    c5_lateral = tf.nn.relu(c5_lateral)
    c4_lateral = self._conv("c4_lateral", self.visual_feat_c4, 1, 1024,
                            self.mlp_dim, [1, 1, 1, 1])
    c4_lateral = tf.nn.relu(c4_lateral)
    c3_lateral = self._conv("c3_lateral", self.visual_feat_c3, 1, 512,
                            self.mlp_dim, [1, 1, 1, 1])
    c3_lateral = tf.nn.relu(c3_lateral)

    # Convolutional LSTM over the fusion and lateral features
    # (assumes batch_size == 1, hence the [0] indexing)
    convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.mlp_dim, [1, 1])
    convlstm_outputs, states = tf.nn.dynamic_rnn(
        convlstm_cell,
        tf.convert_to_tensor([[fusion[0], c5_lateral[0], c4_lateral[0],
                               c3_lateral[0]]]),
        dtype=tf.float32)
    score = self._conv("score", convlstm_outputs[:, -1], 3, self.mlp_dim, 1,
                       [1, 1, 1, 1])

    self.pred = score
    self.up = tf.image.resize_bilinear(self.pred, [self.H, self.W])
    self.sigm = tf.sigmoid(self.up)
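# --- Hedged sketch (standalone, illustrative shapes): the build_graph
# variants above walk the padded word sequence with a tf.cond that bypasses
# the LSTM on padding ids. Note the condition only inspects batch element 0,
# so it implicitly assumes identical padding across the batch.
import tensorflow as tf

def encode_words_sketch(words, embedded_seq, batch_size, rnn_size, num_steps):
    # words: [batch_size, num_steps] int ids, 0 = padding (leading).
    # embedded_seq: [num_steps, batch_size, emb_dim].
    cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
    state = cell.zero_state(batch_size, tf.float32)
    output = tf.zeros([batch_size, rnn_size])
    with tf.variable_scope("RNN_sketch"):
        for n in range(num_steps):
            if n > 0:
                tf.get_variable_scope().reuse_variables()
            # Both branches must return matching (output, state) structures.
            output, state = tf.cond(
                tf.equal(words[0, n], 0),
                lambda: (tf.zeros([batch_size, rnn_size]), state),
                lambda: cell(embedded_seq[n], state))
    return output  # output of the final (non-padding) step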
def build_graph(self):
    # Obtain visual feature
    visual_feat = self._conv('mlp0', self.visual_feat, 1, self.vf_dim,
                             self.v_emb_dim, [1, 1, 1, 1])

    # Word embedding
    embed_mat = tf.get_variable(
        'embedding', [self.vocab_size, self.w_emb_dim],
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
    embed_seq = tf.nn.embedding_lookup(embed_mat, tf.transpose(self.words))

    # LSTM cell for language feature extraction
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
    state = lstm_cell.zero_state(self.batch_size, tf.float32)

    def skip():
        # Padding token: emit zeros matching the cell's output shape
        # (a scalar tf.constant(0.) would not match update_cell's output)
        return tf.zeros([self.batch_size, self.rnn_size]), state

    def update_cell():
        return lstm_cell(embed_seq[n, :, :], state)

    with tf.variable_scope('RNN'):
        for n in range(self.num_steps):
            if n > 0:
                tf.get_variable_scope().reuse_variables()
            rnn_output, state = tf.cond(
                tf.equal(self.words[0, n], tf.constant(0)), skip, update_cell)

    # Obtain language feature, tiled over the spatial grid
    lang_feat = tf.reshape(rnn_output, [self.batch_size, 1, 1, self.rnn_size])
    lang_feat = tf.nn.l2_normalize(lang_feat, 3)
    lang_feat = tf.tile(lang_feat, [1, self.vf_h, self.vf_w, 1])

    # Generate spatial grid
    spatial_feat = tf.convert_to_tensor(
        generate_spatial_batch(self.batch_size, self.vf_h, self.vf_w))

    # Fuse all features
    feat_all = tf.concat([visual_feat, lang_feat, spatial_feat], 3)
    fusion = self._conv('fusion', feat_all, 1,
                        self.v_emb_dim + self.rnn_size + 8,
                        self.convlstm_dim, [1, 1, 1, 1])
    fusion = tf.nn.relu(fusion)

    # Hierarchical feature extraction
    c5_lateral = self._conv('c5_lateral', self.visual_feat_c5, 1, 2048,
                            self.convlstm_dim, [1, 1, 1, 1])
    c5_lateral = tf.nn.relu(c5_lateral)
    c4_lateral = self._conv('c4_lateral', self.visual_feat_c4, 1, 1024,
                            self.convlstm_dim, [1, 1, 1, 1])
    c4_lateral = tf.nn.relu(c4_lateral)
    c3_lateral = self._conv('c3_lateral', self.visual_feat_c3, 1, 512,
                            self.convlstm_dim, [1, 1, 1, 1])
    c3_lateral = tf.nn.relu(c3_lateral)

    # Recurrent refinement via convolutional LSTM
    # (assumes batch_size == 1, hence the [0] indexing)
    convlstm_cell = ConvLSTMCell([self.vf_h, self.vf_w], self.convlstm_dim, [1, 1])
    convlstm_inputs = tf.convert_to_tensor(
        [[fusion[0], c5_lateral[0], c4_lateral[0], c3_lateral[0]]])
    convlstm_outputs, states = tf.nn.dynamic_rnn(convlstm_cell, convlstm_inputs,
                                                 dtype=tf.float32)

    # Obtain score and prediction
    self.score = self._conv('score', convlstm_outputs[:, -1], 3,
                            self.convlstm_dim, 1, [1, 1, 1, 1])
    self.pred = tf.image.resize_bilinear(self.score, [self.H, self.W])
    if self.mode == 'test':
        self.sigm = tf.sigmoid(self.pred)
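# --- Hedged sketch (assumed implementation, not shown in this section): the
# "+ 8" in the fusion conv's input depth corresponds to the eight spatial
# coordinate channels from generate_spatial_batch. A common layout is
# normalized box corners, center, and inverse feature-map size per cell;
# the exact channel order here is an assumption.
import numpy as np

def generate_spatial_batch_sketch(N, featmap_H, featmap_W):
    # Returns [N, H, W, 8] float32: xmin, ymin, xmax, ymax, xctr, yctr,
    # 1/W, 1/H, with box coordinates in normalized [-1, 1] space.
    spatial = np.zeros((N, featmap_H, featmap_W, 8), dtype=np.float32)
    for h in range(featmap_H):
        for w in range(featmap_W):
            xmin = w / float(featmap_W) * 2 - 1
            xmax = (w + 1) / float(featmap_W) * 2 - 1
            ymin = h / float(featmap_H) * 2 - 1
            ymax = (h + 1) / float(featmap_H) * 2 - 1
            spatial[:, h, w, :] = [xmin, ymin, xmax, ymax,
                                   (xmin + xmax) / 2, (ymin + ymax) / 2,
                                   1.0 / featmap_W, 1.0 / featmap_H]
    return spatial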