def _build_encoder(self, input_seq_batch, seq_len_batch, scope='encoder',
                   reuse=None):
    lstm_dim = self.lstm_dim
    num_layers = self.num_layers
    apply_dropout = self.encoder_dropout

    with tf.variable_scope(scope, reuse=reuse):
        # T is taken from the static shape; N is dynamic
        T = input_seq_batch.shape.as_list()[0]
        N = tf.shape(input_seq_batch)[1]
        self.T_encoder = T
        self.N = N
        with tf.variable_scope(self.embed_scope, reuse=True):
            embedding_mat = tf.get_variable(
                'embed_mat', [self.encoder_num_vocab, self.encoder_embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D]
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, input_seq_batch)
        self.embedded_input_seq = embedded_seq

        # The RNN
        cell = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)

        # encoder_outputs has shape [T, N, lstm_dim]
        encoder_outputs, encoder_states = tf.nn.dynamic_rnn(
            cell, embedded_seq, seq_len_batch, dtype=tf.float32,
            time_major=True, scope='lstm')
        self.encoder_outputs = encoder_outputs
        self.encoder_states = encoder_states

        # transform either the word embeddings or the encoder outputs for
        # further attention alignments, depending on the word-vector flag
        if self.params['use_word_vectors']:
            encoder_h_transformed = fc(
                'encoder_h_transform',
                tf.reshape(embedded_seq, [-1, self.encoder_embed_dim]),
                output_dim=lstm_dim)
        else:
            encoder_h_transformed = fc(
                'encoder_h_transform',
                tf.reshape(encoder_outputs, [-1, lstm_dim]),
                output_dim=lstm_dim)
        # encoder_h_transformed has shape [T, N, lstm_dim]
        encoder_h_transformed = tf.reshape(encoder_h_transformed,
                                           to_T([T, N, lstm_dim]))
        self.encoder_h_transformed = encoder_h_transformed

        # seq_not_finished has shape [T, N, 1], where seq_not_finished[t, n]
        # is 1 iff sequence n is not finished at time t, and 0 otherwise
        seq_not_finished = tf.less(
            tf.range(T)[:, tf.newaxis, tf.newaxis],
            seq_len_batch[:, tf.newaxis])
        seq_not_finished = tf.cast(seq_not_finished, tf.float32)
        self.seq_not_finished = seq_not_finished
def EqualNumModule(self, input_0, input_1, time_idx, batch_idx,
                   scope='EqualNumModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

    # Mapping: att_grid x att_grid -> answer probs
    # Input:
    #   input_0: [N, H, W, 1]
    #   input_1: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. linear transform of the attention maps (also including max and min)
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            H, W = self.att_shape[1:3]
            att_all_0 = tf.reshape(input_0, to_T([-1, H * W]))
            att_min_0 = tf.reduce_min(input_0, axis=[1, 2])
            att_max_0 = tf.reduce_max(input_0, axis=[1, 2])
            att_all_1 = tf.reshape(input_1, to_T([-1, H * W]))
            att_min_1 = tf.reduce_min(input_1, axis=[1, 2])
            att_max_1 = tf.reduce_max(input_1, axis=[1, 2])
            # att_concat has shape [N, 2*(H*W + 2)]
            att_concat = tf.concat([att_all_0, att_min_0, att_max_0,
                                    att_all_1, att_min_1, att_max_1], axis=1)
            #scores = fc_relu('fc_scores', att_concat, output_dim=self.num_choices)
            scores = fc('fc_scores', att_concat, output_dim=self.num_choices)
    return scores
def localization_module_batch_score(vis_feat, spatial_feat, lang_feat,
                                    scope="localization_module", reuse=None):
    # Input:
    #   vis_feat: [N_batch, N_vis, D_vis]
    #   spatial_feat: [N_batch, N_vis, D_spatial]
    #   lang_feat: [N_batch, D_lang]
    # Output:
    #   localization_scores: [N_batch, N_vis, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.
    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial
        # feature linearly to the same dimension as the language feature
        N_batch = tf.shape(vis_feat)[0]
        N_vis = tf.shape(vis_feat)[1]
        D_vis = vis_feat.get_shape().as_list()[-1]
        D_spatial = spatial_feat.get_shape().as_list()[-1]
        D_lang = lang_feat.get_shape().as_list()[-1]

        # flatten the visual and spatial features and embed them to the same
        # dimension as the language feature
        vis_spatial_feat = tf.concat([vis_feat, spatial_feat], axis=2)
        vis_spatial_feat = tf.reshape(vis_spatial_feat, [-1, D_vis+D_spatial])
        vis_spatial_embed = fc('vis_spatial_embed', vis_spatial_feat,
                               output_dim=D_lang)

        # Reshape visual feature and language feature for broadcast
        # multiplication
        lang_feat = tf.reshape(lang_feat, [-1, 1, D_lang])
        vis_spatial_embed = tf.reshape(vis_spatial_embed,
                                       to_T([N_batch, -1, D_lang]))

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(vis_spatial_embed * lang_feat, 2)
        eltwise_mult = tf.reshape(eltwise_mult, [-1, D_lang])

        # Localization scores as linear classification over the l2-normalized
        # feature
        localization_scores = fc('localization_scores', eltwise_mult,
                                 output_dim=1)
        localization_scores = tf.reshape(localization_scores,
                                         to_T([N_batch, N_vis, 1]))
    return localization_scores
def SceneModule(self, time_idx, batch_idx, pos_val=3, scope='SceneModule',
                reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

    # Mapping: None -> att_grid
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   1. Just output a positive attention everywhere
    N = tf.shape(time_idx)[0]
    att_grid = pos_val * tf.ones(to_T([N] + self.att_shape[1:]))
    return att_grid
def build_input_unit(input_seq_batch, seq_length_batch, num_vocab,
                     scope='input_unit', reuse=None):
    """
    Preprocess the input sequence with a (single-layer) bidirectional LSTM.

    Input:
        input_seq_batch: [S, N], tf.int32
        seq_length_batch: [N], tf.int32
    Return:
        lstm_seq: [S, N, d], tf.float32
        q_encoding: [N, d], tf.float32
        embed_seq: [S, N, e], tf.float32
    """
    with tf.variable_scope(scope, reuse=reuse):
        # word embedding
        embed_dim = cfg.MODEL.EMBED_DIM
        if cfg.USE_FIXED_WORD_EMBED:
            embed_mat = to_T(np.load(cfg.FIXED_WORD_EMBED_FILE))
        else:
            embed_mat = tf.get_variable(
                'embed_mat', [num_vocab, embed_dim],
                initializer=tf.initializers.random_normal(
                    stddev=np.sqrt(1. / embed_dim)))
        embed_seq = tf.nn.embedding_lookup(embed_mat, input_seq_batch)

        # bidirectional LSTM
        lstm_dim = cfg.MODEL.LSTM_DIM
        assert lstm_dim % 2 == 0, \
            'lstm_dim is the dimension of [fw, bw] and must be a multiple of 2'
        cell_fw = tf.nn.rnn_cell.LSTMCell(lstm_dim // 2, name='basic_lstm_cell')
        cell_bw = tf.nn.rnn_cell.LSTMCell(lstm_dim // 2, name='basic_lstm_cell')
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, embed_seq, dtype=tf.float32,
            sequence_length=seq_length_batch, time_major=True)
        # concatenate the hidden states from the forward and backward LSTM
        lstm_seq = tf.concat(outputs, axis=2)
        # concatenate the final hidden states of the forward and backward LSTM
        # for the question representation
        q_encoding = tf.concat([states[0].h, states[1].h], axis=1)

    return lstm_seq, q_encoding, embed_seq
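# Illustrative sketch (not from the original code) of the bidirectional LSTM
# wiring in build_input_unit above. Assumes TensorFlow 1.x; the toy sizes
# S, N, E, D below are made up.
import numpy as np
import tensorflow as tf

S, N, E, D = 4, 2, 8, 16  # time steps, batch, embed dim, lstm_dim
inputs = tf.placeholder(tf.float32, [S, N, E])
seq_len = tf.placeholder(tf.int32, [N])
cell_fw = tf.nn.rnn_cell.LSTMCell(D // 2)  # D must be even: [fw, bw] halves
cell_bw = tf.nn.rnn_cell.LSTMCell(D // 2)
outputs, states = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, sequence_length=seq_len,
    dtype=tf.float32, time_major=True)
lstm_seq = tf.concat(outputs, axis=2)                       # [S, N, D]
q_encoding = tf.concat([states[0].h, states[1].h], axis=1)  # [N, D]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    seq, q = sess.run([lstm_seq, q_encoding],
                      {inputs: np.random.randn(S, N, E),
                       seq_len: [4, 2]})
    print(seq.shape, q.shape)  # (4, 2, 16) (2, 16)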
def _build_encoder(self, input_seq_batch, seq_length_batch, scope='encoder',
                   reuse=None):
    lstm_dim = self.lstm_dim
    num_layers = self.num_layers
    apply_dropout = self.encoder_dropout

    with tf.variable_scope(scope, reuse=reuse):
        self.T_encoder = tf.shape(input_seq_batch)[0]
        self.N = tf.shape(input_seq_batch)[1]

        # Step 1: Embed the input sequence
        embedding_mat = tf.get_variable(
            'embedding_mat', [self.encoder_num_vocab, self.encoder_embed_dim])
        # input_seq_batch has shape [T, N] and embedded_input_seq has
        # shape [T, N, D]
        self.embedded_input_seq = tf.nn.embedding_lookup(
            embedding_mat, input_seq_batch)

        # Step 2: Build the RNN (LSTM)
        cell_layers = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)
        # encoder_outputs has shape [T, N, lstm_dim]
        encoder_outputs, self.encoder_states = tf.nn.dynamic_rnn(
            cell_layers, self.embedded_input_seq, seq_length_batch,
            dtype=tf.float32, time_major=True, scope='lstm')
        self.encoder_outputs = encoder_outputs

        # Step 3: Flatten the outputs
        # transform the encoder outputs into batch-like data for decoder usage
        encoder_h_transformed = fc(
            'encoder_h_transform',
            tf.reshape(encoder_outputs, [-1, lstm_dim]), output_dim=lstm_dim)
        # reshape the flattened encoder outputs back to [T, N, lstm_dim]
        self.encoder_h_transformed = tf.reshape(
            encoder_h_transformed, to_T([self.T_encoder, self.N, lstm_dim]))

        # seq_not_finished is a [T, N, 1] tensor, where seq_not_finished[t, n]
        # is 1 iff sequence n is not finished at time t, and 0 otherwise
        seq_not_finished = tf.less(
            tf.range(self.T_encoder)[:, tf.newaxis, tf.newaxis],
            seq_length_batch[:, tf.newaxis])
        self.seq_not_finished = tf.cast(seq_not_finished, tf.float32)
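# Illustrative sketch (not from the original code): the broadcasting that
# produces the [T, N, 1] seq_not_finished mask above, in NumPy with toy sizes.
import numpy as np

T, N = 4, 2
seq_len = np.array([4, 2])
t = np.arange(T)[:, None, None]                   # [T, 1, 1]
mask = (t < seq_len[:, None]).astype(np.float32)  # broadcasts to [T, N, 1]
print(mask[:, :, 0])
# [[1. 1.]
#  [1. 1.]
#  [1. 0.]
#  [1. 0.]]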
def __init__(self, input_seq_batch, seq_length_batch, T_decoder,
             num_vocab_txt, embed_dim_txt, num_vocab_nmn, embed_dim_nmn,
             lstm_dim, num_layers, assembler, encoder_dropout,
             decoder_dropout, decoder_sampling, use_gt_layout=None,
             gt_layout_batch=None, scope='encoder_decoder', reuse=None):
    self.T_decoder = T_decoder
    self.encoder_num_vocab = num_vocab_txt
    self.encoder_embed_dim = embed_dim_txt
    self.decoder_num_vocab = num_vocab_nmn
    self.decoder_embed_dim = embed_dim_nmn
    self.lstm_dim = lstm_dim
    self.num_layers = num_layers
    self.EOS_token = assembler.EOS_idx

    # decoding transition variables
    self.P = to_T(assembler.P, dtype=tf.int32)
    self.W = to_T(assembler.W, dtype=tf.int32)
    self.b = to_T(assembler.b, dtype=tf.int32)

    self.encoder_dropout = encoder_dropout
    self.decoder_dropout = decoder_dropout
    self.decoder_sampling = decoder_sampling

    with tf.variable_scope(scope, reuse=reuse):
        self._build_encoder(input_seq_batch, seq_length_batch)
        self._build_decoder(use_gt_layout, gt_layout_batch)
def add_spatial_coord_map(image_feat_grid):
    image_feat_shape = tf.shape(image_feat_grid)
    N = image_feat_shape[0]
    # H and W are static dimensions
    H, W = image_feat_grid.shape.as_list()[1:3]
    x_map = tf.tile(
        tf.reshape(tf.linspace(-1., 1., W), [1, 1, -1, 1]),
        to_T([N, H, 1, 1]))
    y_map = tf.tile(
        tf.reshape(tf.linspace(-1., 1., H), [1, -1, 1, 1]),
        to_T([N, 1, W, 1]))
    # stop gradient on coords_map (needed to fix the tile grad error on
    # TF 1.0.0)
    coords_map = tf.stop_gradient(tf.concat([x_map, y_map], axis=3))
    image_feat_with_coords = tf.concat([image_feat_grid, coords_map], axis=3)
    # set shapes of the new feature maps
    image_feat_static_shape = image_feat_grid.get_shape().as_list()
    image_feat_static_shape[3] += 2
    image_feat_with_coords.set_shape(image_feat_static_shape)
    image_feat_static_shape[3] = 2
    coords_map.set_shape(image_feat_static_shape)
    return image_feat_with_coords, coords_map
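# Illustrative sketch (not from the original code): the [-1, 1] coordinate
# channels built above, in NumPy with made-up sizes.
import numpy as np

N, H, W = 1, 2, 3
x_map = np.tile(np.linspace(-1., 1., W).reshape(1, 1, W, 1), (N, H, 1, 1))
y_map = np.tile(np.linspace(-1., 1., H).reshape(1, H, 1, 1), (N, 1, W, 1))
coords = np.concatenate([x_map, y_map], axis=3)  # [N, H, W, 2]
print(coords[0, :, :, 0])  # x varies along W: [[-1. 0. 1.] [-1. 0. 1.]]
print(coords[0, :, :, 1])  # y varies along H: [[-1. -1. -1.] [1. 1. 1.]]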
def __init__(self, holders, use_gt_prog, assembler, params, reuse=None):
    self.T_decoder = params['max_dec_len']
    self.encoder_num_vocab = params['text_vocab_size']
    self.encoder_embed_dim = params['text_embed_size']
    self.decoder_num_vocab = params['prog_vocab_size']
    self.decoder_embed_dim = params['prog_embed_size']
    self.lstm_dim = params['lstm_size']
    self.num_layers = params['num_layers']
    self.EOS_token = assembler.EOS_idx
    self.embed_scope = params['embed_scope']
    self.temperature = params.get('temperature', 1)

    # whether to attend over the word vectors or over the LSTM outputs
    params['use_word_vectors'] = 'wv-att' in params['model']
    params['generator'] = params.get('generator', 'ques')
    self.params = params

    # decoding transition variables
    self.P = to_T(assembler.P, dtype=tf.int32)
    self.W = to_T(assembler.W, dtype=tf.int32)
    self.b = to_T(assembler.b, dtype=tf.int32)

    self.encoder_dropout = params['enc_dropout']
    self.decoder_dropout = params['dec_dropout']
    self.decoder_sampling = params['dec_sampling']

    # detect fake inputs
    if 'fake' in holders:
        scope = 'enc_dec_cap'
    else:
        scope = 'enc_dec'

    with tf.variable_scope(scope, reuse=reuse):
        # build a special memory encoder, if needed
        if 'fake' not in holders and params['generator'] == 'mem':
            self._build_memory_encoder(holders)
        else:
            # build a normal encoder
            self._build_encoder(holders['ques'], holders['ques_len'])
        self._build_decoder(use_gt_prog, holders['prog_gt'])
def FindModule(self, time_idx, batch_idx, map_dim=500, scope='FindModule',
               reuse=None):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    # Mapping: image_feat_grid x text_param -> att_grid
    # Input:
    #   image_feat_grid: [N, H, W, D_im]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   1. Elementwise multiplication between image_feat_grid and text_param
    #   2. L2-normalization
    #   3. Linear classification
    with tf.variable_scope(scope, reuse=reuse):
        image_shape = tf.shape(image_feat_grid)
        N = tf.shape(time_idx)[0]
        H = image_shape[1]
        W = image_shape[2]
        D_im = image_feat_grid.get_shape().as_list()[-1]
        D_txt = text_param.get_shape().as_list()[-1]

        # image_feat_mapped has shape [N, H, W, map_dim]
        image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                      output_dim=map_dim)
        text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
        text_param_mapped = tf.reshape(text_param_mapped,
                                       to_T([N, 1, 1, map_dim]))

        eltwise_mult = tf.nn.l2_normalize(
            image_feat_mapped * text_param_mapped, 3)
        att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        # Do we need to take an exponential over the scores?
        #   No.
        # Does the attention need to be normalized (sum up to 1)?
        #   No, since non-existence should be 0 everywhere
    return att_grid
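# Illustrative sketch (not from the original code): steps 1-3 of FindModule
# above (multiply, l2-normalize, linear score) in NumPy; the 1x1 convs become
# matmuls and all sizes are made up.
import numpy as np

N, H, W, map_dim = 1, 2, 2, 8
img = np.random.randn(N, H, W, map_dim)   # image features after conv_image
txt = np.random.randn(N, 1, 1, map_dim)   # text feature after fc_text
joint = img * txt                         # broadcast elementwise product
joint /= np.maximum(np.linalg.norm(joint, axis=3, keepdims=True), 1e-12)
w = np.random.randn(map_dim, 1)           # conv_eltwise as a matmul
att_grid = joint.reshape(-1, map_dim).dot(w).reshape(N, H, W, 1)
print(att_grid.shape)  # (1, 2, 2, 1)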
def FindModule(self, time_idx, batch_idx, map_dim=1024, scope='FindModule',
               reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    # Mapping: image_feat_grid x text_param -> att_grid
    # Input:
    #   image_feat_grid: [N, H, W, D_im]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   1. Elementwise multiplication between image_feat_grid and text_param
    #   2. L2-normalization
    #   3. Linear classification
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                          output_dim=map_dim)
            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                image_feat_mapped * text_param_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)
            att_grid.set_shape(self.att_shape)
    return att_grid
def bbox_offset_loss(self, bbox_ind_batch, bbox_offset_batch):
    if cfg.MODEL.BBOX_REG_AS_FCN:
        N = tf.shape(self.bbox_offset_fcn)[0]
        B = tf.shape(self.bbox_offset_fcn)[1]  # B = H*W
        bbox_offset_flat = tf.reshape(self.bbox_offset_fcn, to_T([N * B, 4]))
        slice_inds = tf.range(N) * B + bbox_ind_batch
        bbox_offset_sliced = tf.gather(bbox_offset_flat, slice_inds)
        loss_bbox_offset = tf.reduce_mean(
            tf.squared_difference(bbox_offset_sliced, bbox_offset_batch))
    else:
        loss_bbox_offset = tf.reduce_mean(
            tf.squared_difference(self.bbox_offset, bbox_offset_batch))
    return loss_bbox_offset
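# Illustrative sketch (not from the original code): the flatten-and-gather
# indexing trick above (range(N)*B + ind) picks one [4]-vector per example.
# NumPy stand-in with made-up sizes.
import numpy as np

N, B = 3, 5
offsets = np.random.randn(N, B, 4)       # per-location offsets, B = H*W
inds = np.array([0, 2, 4])               # chosen location per example
flat = offsets.reshape(N * B, 4)
picked = flat[np.arange(N) * B + inds]   # == offsets[i, inds[i]] for each i
assert np.allclose(picked, offsets[np.arange(N), inds])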
def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=3,
                    map_dim=500, scope='TransformModule', reuse=None):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    att_grid = input_0
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    # Mapping: att_grid x text_param -> att_grid
    # Input:
    #   att_grid: [N, H, W, 1]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid_transformed: [N, H, W, 1]
    #
    # Implementation:
    #   A convolutional layer that also involves text_param:
    #   a 'soft' convolutional kernel that is modulated by text_param
    with tf.variable_scope(scope, reuse=reuse):
        att_shape = tf.shape(att_grid)
        N = att_shape[0]
        H = att_shape[1]
        W = att_shape[2]

        att_maps = _conv('conv_maps', att_grid, kernel_size=kernel_size,
                         stride=1, output_dim=map_dim)
        text_param_mapped = fc('text_fc', text_param, output_dim=map_dim)
        text_param_mapped = tf.reshape(text_param_mapped,
                                       to_T([N, 1, 1, map_dim]))

        # manual l2-normalization along the channel axis, equivalent to
        # tf.nn.l2_normalize(att_maps * text_param_mapped, 3)
        x = att_maps * text_param_mapped
        square_sum = math_ops.reduce_sum(math_ops.square(x), 3,
                                         keep_dims=True)
        x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, 1e-12))
        eltwise_mult = math_ops.multiply(x, x_inv_norm)

        att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)
    return att_grid
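# Illustrative sketch (not from the original code): the square/rsqrt sequence
# above is an l2-normalization along the channel axis. NumPy, made-up sizes.
import numpy as np

x = np.random.randn(2, 3, 3, 4)
square_sum = np.square(x).sum(axis=3, keepdims=True)
x_inv_norm = 1.0 / np.sqrt(np.maximum(square_sum, 1e-12))  # rsqrt with guard
normalized = x * x_inv_norm
print(np.linalg.norm(normalized, axis=3))  # all ~1.0 along the channel axis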
def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=5,
                    map_dim=250, scope='TransformModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    # Mapping: att_grid x text_param -> att_grid
    # Input:
    #   input_0: [N, H, W, 1]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   A convolutional layer that also involves text_param:
    #   a 'soft' convolutional kernel that is modulated by text_param
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            att_shape = tf.shape(input_0)
            N = att_shape[0]
            H = att_shape[1]
            W = att_shape[2]

            att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,
                             stride=1, output_dim=map_dim)
            text_param_mapped = fc('text_fc', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                att_maps * text_param_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)
            att_grid.set_shape(self.att_shape)
    return att_grid
def _move_ptr_bw(stack_ptr):
    """
    Move the stack pointer backward (i.e. to pop from the stack).
    """
    # Note: in TF, conv1d is implemented as auto-correlation (instead of
    # mathematical convolution), so there is no flipping of the filter.
    filter_bw = to_T(np.array([0, 0, 1], np.float32).reshape((3, 1, 1)))
    new_stack_ptr = tf.squeeze(
        tf.nn.conv1d(stack_ptr[..., ax], filter_bw, 1, 'SAME'), axis=[2])
    # when the stack pointer is already at the stack bottom, keep the pointer
    # in the same location (otherwise the pointer will be all zero)
    if cfg.MODEL.NMN.STACK.GUARD_STACK_PTR:
        stack_len = cfg.MODEL.NMN.STACK.LENGTH
        stack_bottom_mask = tf.one_hot(0, stack_len)
        new_stack_ptr += stack_bottom_mask * stack_ptr
    return new_stack_ptr
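# Illustrative sketch (not from the original code): why cross-correlating a
# one-hot pointer with [0, 0, 1] moves it one slot toward the stack bottom.
# NumPy stand-in for a single [stack_len] pointer; names are made up.
import numpy as np

def move_ptr_bw_np(stack_ptr):
    # cross-correlation with [0, 0, 1]: new_ptr[i] = stack_ptr[i + 1]
    new_ptr = np.correlate(stack_ptr, np.array([0., 0., 1.]), mode='same')
    # bottom guard: keep the pointer in place instead of shifting it off
    new_ptr[0] += stack_ptr[0]
    return new_ptr

print(move_ptr_bw_np(np.array([0., 1., 0., 0.])))  # [1. 0. 0. 0.]
print(move_ptr_bw_np(np.array([1., 0., 0., 0.])))  # stays: [1. 0. 0. 0.]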
def SamePropertyModule(self, input_0, input_1, time_idx, batch_idx,
                       map_dim=250, scope='SamePropertyModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    # Mapping: att_grid x att_grid -> answer probs
    # Input:
    #   input_0: [N, H, W, 1]
    #   input_1: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. Extract visual features using both input attention maps, and
    #      linearly transform them to map_dim
    #   2. linearly transform the language features to map_dim
    #   3. Element-wise multiplication of the three, l2_normalize,
    #      linear transform.
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)

            att_softmax_0 = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H*W]))),
                to_T([N, H, W, 1]))
            att_softmax_1 = tf.reshape(
                tf.nn.softmax(tf.reshape(input_1, to_T([N, H*W]))),
                to_T([N, H, W, 1]))
            # att_feat_0 and att_feat_1 have shape [N, D_vis]
            att_feat_0 = tf.reduce_sum(image_feat_grid * att_softmax_0,
                                       axis=[1, 2])
            att_feat_1 = tf.reduce_sum(image_feat_grid * att_softmax_1,
                                       axis=[1, 2])
            att_feat_mapped_0 = tf.reshape(
                fc('fc_att_0', att_feat_0, output_dim=map_dim),
                to_T([N, map_dim]))
            att_feat_mapped_1 = tf.reshape(
                fc('fc_att_1', att_feat_1, output_dim=map_dim),
                to_T([N, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                att_feat_mapped_0 * text_param_mapped * att_feat_mapped_1, 1)
            scores = fc('fc_eltwise', eltwise_mult,
                        output_dim=self.num_choices)
    return scores
def empty_safe_1x1_conv(name, bottom, output_dim, reuse=None):
    # use this for 1x1 convolution in modules to avoid the crash on
    # zero-size batches: the convolution is implemented as a matmul over
    # the flattened pixels
    bottom_shape = tf.shape(bottom)
    input_dim = bottom.get_shape().as_list()[-1]

    # weights and biases variables
    with tf.variable_scope(name, reuse=reuse):
        # initialize the variables
        weights_initializer = tf.contrib.layers.xavier_initializer()
        biases_initializer = tf.constant_initializer(0.)
        weights = tf.get_variable('weights', [input_dim, output_dim],
                                  initializer=weights_initializer)
        biases = tf.get_variable('biases', output_dim,
                                 initializer=biases_initializer)

        conv_flat = tf.matmul(tf.reshape(bottom, [-1, input_dim]),
                              weights) + biases
        conv = tf.reshape(
            conv_flat,
            to_T([bottom_shape[0], bottom_shape[1], bottom_shape[2],
                  output_dim]))
    return conv
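# Illustrative sketch (not from the original code): a 1x1 convolution equals
# a per-pixel matmul, which also behaves gracefully for zero-size batches.
# NumPy, made-up sizes.
import numpy as np

N, H, W, C_in, C_out = 2, 3, 3, 4, 5
x = np.random.randn(N, H, W, C_in)
weights = np.random.randn(C_in, C_out)
biases = np.zeros(C_out)
out = (x.reshape(-1, C_in).dot(weights) + biases).reshape(N, H, W, C_out)
# the matmul path simply yields an empty output when N == 0
empty = np.zeros((0, H, W, C_in)).reshape(-1, C_in).dot(weights) + biases
print(out.shape, empty.shape)  # (2, 3, 3, 5) (0, 5)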
def refgoog_attbilstm_net(input_batch, bbox_batch, spatial_batch, expr_obj,
                          num_vocab, embed_dim, lstm_dim, vgg_dropout,
                          lstm_dropout):
    # bbox_batch has shape [N_box, 5],
    # spatial_batch has shape [N_box, D_spatial], and
    # expr_obj has shape [T, N_batch]
    N_batch = tf.shape(expr_obj)[1]
    N_box = tf.shape(spatial_batch)[0]

    # Extract visual features
    vis_feat = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, bbox_batch,
                                            "vgg_local",
                                            apply_dropout=vgg_dropout)
    D_vis = vis_feat.get_shape().as_list()[-1]

    # Extract representation using attention
    lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel = \
        lstm_net.attbilstm(expr_obj, "lstm", num_vocab=num_vocab,
                           embed_dim=embed_dim, lstm_dim=lstm_dim,
                           apply_dropout=lstm_dropout)

    # Score for each bounding box matching the first object
    # scores_obj1 has shape [N_batch, N_box, 1]
    scores_obj1 = modules.localization_module_grid_score(
        vis_feat, spatial_batch, lang_obj1)
    # Score for each bounding box matching the second object
    # scores_obj2 has shape [N_batch, N_box, 1]
    scores_obj2 = modules.localization_module_grid_score(
        vis_feat, spatial_batch, lang_obj2, reuse=True)

    # Scores for each pair of bounding boxes matching the relationship
    # Tile the scores by broadcasting add
    # scores_rel has shape [N_batch, N_box, N_box, 1]
    scores_rel = modules.relationship_module_spatial_only_grid_score(
        spatial_batch, scores_obj1, spatial_batch, scores_obj2,
        lang_relation, rescale_scores=True)
    tf.add_to_collection("s_pair", scores_rel)

    # marginal_scores has shape [N_batch, N_box, 1]
    marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2)
    final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1]))

    return final_scores, probs_obj1, probs_obj2, probs_rel
def instantiate_batch(self, inputs):
    """
    Inputs:
        visual attention for the example
        image feature for the example
        time id for the current module
    """
    vis_att, img_feat, _ = inputs
    encode_size = self._params['encode_size']
    with tf.variable_scope(self._module_scope):
        with tf.variable_scope(self._scope, reuse=self._reuse):
            H, W = img_feat.shape.as_list()[1:3]
            att_all = tf.reshape(vis_att, to_T([-1, H * W]))
            att_min = tf.reduce_min(vis_att, axis=[1, 2])
            att_max = tf.reduce_max(vis_att, axis=[1, 2])
            # att_concat has shape [N, H*W + 2]
            att_concat = tf.concat([att_all, att_min, att_max], axis=1)
            context = fc('fc_scores', att_concat, output_dim=encode_size)
    return [context]
def refgoog_retrieval_baseline(vis_feat, spatial_batch, expr_obj, num_vocab,
                               embed_dim, lstm_dim):
    N_batch = tf.shape(expr_obj)[1]
    N_box = tf.shape(spatial_batch)[0]
    D_vis = vis_feat.get_shape().as_list()[-1]

    lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel = \
        lstm_net.attbilstm(expr_obj, "lstm", num_vocab=num_vocab,
                           embed_dim=embed_dim, lstm_dim=lstm_dim,
                           apply_dropout=False)

    scores_obj1 = modules.localization_module_grid_score(
        vis_feat, spatial_batch, lang_obj1)
    scores_obj2 = modules.localization_module_grid_score(
        vis_feat, spatial_batch, lang_obj2, reuse=True)
    scores_rel = modules.relationship_module_spatial_only_grid_score(
        spatial_batch, scores_obj1, spatial_batch, scores_obj2,
        lang_relation, rescale_scores=True)

    marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2)
    final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1]))
    return final_scores
def CountModule(self, input_0, time_idx, batch_idx, scope='CountModule',
                kernel_size=5, map_dim=300, reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

    # Mapping: att_grid -> answer probs
    # Input:
    #   input_0: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. linear transform of the attention map (also including max and min)
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            H, W = self.att_shape[1:3]
            att_all = tf.reshape(input_0, to_T([-1, H * W]))
            att_min = tf.reduce_min(input_0, axis=[1, 2])
            att_max = tf.reduce_max(input_0, axis=[1, 2])
            # att_concat has shape [N, H*W + 2]
            att_concat = tf.concat([att_all, att_min, att_max], axis=1)
            scores = fc('fc_scores', att_concat, output_dim=self.num_choices)

            # alternative convolutional variant, kept for reference:
            # att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,
            #                  stride=1, output_dim=map_dim)
            # att_grid = _1x1_conv('conv_eltwise', att_maps, output_dim=1)
            # att_grid.set_shape(self.att_shape)
            # att_shape = tf.shape(att_grid)
            # H, W = self.att_shape[1:3]
            # att_all = tf.reshape(att_grid, to_T([-1, H*W]))
            # scores = fc('fc_scores', att_all, output_dim=self.num_choices)
    return scores
def __init__(self, image_feat_grid, word_vecs, num_choices):
    self.image_feat_grid = image_feat_grid
    self.word_vecs = word_vecs
    self.num_choices = num_choices
    # Capture the variable scope for creating all variables
    with tf.variable_scope('module_variables') as module_variable_scope:
        self.module_variable_scope = module_variable_scope

    # Flatten word vecs for efficient slicing
    # word_vecs has shape [T_decoder, N, D]
    word_vecs_shape = tf.shape(word_vecs)
    T_full = word_vecs_shape[0]
    self.N_full = word_vecs_shape[1]
    D_word = word_vecs.get_shape().as_list()[-1]
    self.word_vecs_flat = tf.reshape(
        word_vecs, to_T([T_full * self.N_full, D_word]))

    # create each dummy module here so that the weights won't get
    # initialized again
    att_shape = image_feat_grid.get_shape().as_list()[:-1] + [1]
    self.att_shape = att_shape
    input_att = tf.placeholder(tf.float32, att_shape)
    time_idx = tf.placeholder(tf.int32, [None])
    batch_idx = tf.placeholder(tf.int32, [None])
    self.SceneModule(time_idx, batch_idx, reuse=False)
    self.FindModule(time_idx, batch_idx, reuse=False)
    self.FindSamePropertyModule(input_att, time_idx, batch_idx, reuse=False)
    self.TransformModule(input_att, time_idx, batch_idx, reuse=False)
    self.AndModule(input_att, input_att, time_idx, batch_idx, reuse=False)
    self.FilterModule(input_att, time_idx, batch_idx, reuse=False)
    self.OrModule(input_att, input_att, time_idx, batch_idx, reuse=False)
    self.ExistModule(input_att, time_idx, batch_idx, reuse=False)
    self.CountModule(input_att, time_idx, batch_idx, reuse=False)
    self.EqualNumModule(input_att, input_att, time_idx, batch_idx,
                        reuse=False)
    self.MoreNumModule(input_att, input_att, time_idx, batch_idx,
                       reuse=False)
    self.LessNumModule(input_att, input_att, time_idx, batch_idx,
                       reuse=False)
    self.SamePropertyModule(input_att, input_att, time_idx, batch_idx,
                            reuse=False)
    self.DescribeModule(input_att, time_idx, batch_idx, reuse=False)
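# Illustrative sketch (not from the original code) of the create-once /
# reuse-later variable pattern behind the dummy module calls above. Assumes
# TensorFlow 1.x; the helper `linear` and scope names are made up.
import tensorflow as tf

def linear(x, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        w = tf.get_variable('w', [x.shape.as_list()[-1], 1])
        return tf.matmul(x, w)

x = tf.placeholder(tf.float32, [None, 4])
_ = linear(x, 'mod', reuse=False)  # dummy call creates mod/w once
y = linear(x, 'mod', reuse=True)   # every later call shares the same weight
print([v.name for v in tf.global_variables()])  # ['mod/w:0']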
def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=5,
                    map_dim=1024, scope='TransformModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    # Mapping: att_grid x text_param -> att_grid
    # Input:
    #   input_0: [N, H, W, 1]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation (same as FindSamePropertyModule):
    #   1. Extract visual features using the input attention map, and
    #      linearly transform them to map_dim
    #   2. linearly transform the language features to map_dim
    #   3. Convolve the image features to map_dim
    #   4. Element-wise multiplication of the three, l2_normalize,
    #      linear transform.
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                          output_dim=map_dim)
            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            att_softmax = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                to_T([N, H, W, 1]))
            # att_feat has shape [N, D_vis]
            att_feat = tf.reduce_sum(image_feat_grid * att_softmax,
                                     axis=[1, 2])
            att_feat_mapped = tf.reshape(
                fc('fc_att', att_feat, output_dim=map_dim),
                to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                image_feat_mapped * text_param_mapped * att_feat_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)
            att_grid.set_shape(self.att_shape)
    return att_grid
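# Illustrative sketch (not from the original code): the softmax-attention
# pooling used in step 1 above, in NumPy with made-up sizes.
import numpy as np

N, H, W, D = 1, 2, 2, 4
feat = np.random.randn(N, H, W, D)                    # image feature grid
att = np.random.randn(N, H, W, 1)                     # raw attention logits
e = np.exp(att - att.max(axis=(1, 2), keepdims=True))
att_softmax = e / e.sum(axis=(1, 2), keepdims=True)   # softmax over H*W
pooled = (feat * att_softmax).sum(axis=(1, 2))        # attended feature [N, D]
print(pooled.shape)  # (1, 4)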
def _build_decoder(self, gt_layout_batch, scope='decoder', reuse=None):
    # This function is for decoding only. It performs greedy search or
    # sampling. The first input is <go> (its embedding vector), and the
    # subsequent inputs are the outputs from the previous time step
    # (implementing attention).
    #
    # T_max is the maximum length of the decoded sequence (including <eos>)
    # num_vocab does not include <go>
    N = self.N
    encoder_states = self.encoder_states
    T_max = self.T_decoder
    lstm_dim = self.lstm_dim
    num_layers = self.num_layers
    apply_dropout = self.decoder_dropout
    EOS_token = self.EOS_token
    sampling = self.decoder_sampling

    with tf.variable_scope(scope, reuse=reuse):
        embedding_mat = tf.get_variable(
            'embedding_mat', [self.decoder_num_vocab, self.decoder_embed_dim])
        # Special embedding for <go>, which denotes sequence start
        go_embedding = tf.get_variable('go_embedding',
                                       [1, self.decoder_embed_dim])

        with tf.variable_scope('att_prediction'):
            v = tf.get_variable('v', [lstm_dim])
            W_a = tf.get_variable(
                'weights', [lstm_dim, lstm_dim],
                initializer=tf.contrib.layers.xavier_initializer())
            b_a = tf.get_variable('biases', lstm_dim,
                                  initializer=tf.constant_initializer(0.))

        # The parameters to predict the next token
        with tf.variable_scope('token_prediction'):
            W_y = tf.get_variable(
                'weights', [lstm_dim * 2, self.decoder_num_vocab],
                initializer=tf.contrib.layers.xavier_initializer())
            b_y = tf.get_variable('biases', self.decoder_num_vocab,
                                  initializer=tf.constant_initializer(0.))

        mask_range = tf.reshape(
            tf.range(self.decoder_num_vocab, dtype=tf.int32), [1, -1])
        all_eos_pred = EOS_token * tf.ones(to_T([N]), tf.int32)
        all_one_prob = tf.ones(to_T([N]), tf.float32)
        all_zero_entropy = tf.zeros(to_T([N]), tf.float32)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_output is None:  # time == 0
                next_cell_state = encoder_states
                next_input = tf.tile(go_embedding, to_T([N, 1]))
            else:  # time > 0
                next_cell_state = cell_state

                # compute the attention map over the input sequence
                # att_raw has shape [T, N, 1]
                att_raw = tf.reduce_sum(
                    tf.tanh(tf.nn.xw_plus_b(cell_output, W_a, b_a) +
                            self.encoder_h_transformed) * v,
                    axis=2, keep_dims=True)
                # softmax along the first dimension (T) over not finished
                # examples; att has shape [T, N, 1]
                att = tf.nn.softmax(att_raw, dim=0) * self.seq_not_finished
                att = att / tf.reduce_sum(att, axis=0, keep_dims=True)
                # d2 has shape [N, lstm_dim]
                d2 = tf.reduce_sum(att * self.encoder_outputs, axis=0)

                # token_scores has shape [N, num_vocab]
                token_scores = tf.nn.xw_plus_b(
                    tf.concat([cell_output, d2], axis=1), W_y, b_y)

                # predict the next token (behavior depending on parameters)
                if sampling:
                    # predicted_token has shape [N]
                    predicted_token = tf.cast(
                        tf.reshape(tf.multinomial(token_scores, 1), [-1]),
                        tf.int32)
                else:
                    # predicted_token has shape [N]
                    predicted_token = tf.cast(tf.argmax(token_scores, 1),
                                              tf.int32)
                # teacher forcing: override the prediction with the
                # ground-truth layout token for the current step
                predicted_token = gt_layout_batch[time - 1]

                # token_prob has shape [N], the probability of the predicted
                # token; although token_prob is not needed for predicting the
                # next token, it is needed in the output (for policy gradient
                # training)
                # mask has shape [N, num_vocab]
                mask = tf.equal(mask_range,
                                tf.reshape(predicted_token, [-1, 1]))
                all_token_probs = tl.activation.pixel_wise_softmax(
                    token_scores)
                token_prob = tf.reduce_sum(
                    all_token_probs * tf.cast(mask, tf.float32), axis=1)
                neg_entropy = tf.reduce_sum(
                    all_token_probs *
                    tf.log(tf.maximum(1e-5, all_token_probs)), axis=1)

                # is_eos_predicted is a [N] bool tensor, indicating whether
                # <eos> has already been predicted previously in each sequence
                is_eos_predicted = loop_state[2]
                predicted_token_old = predicted_token
                # if <eos> has already been predicted, now predict <eos>
                # with prob 1
                predicted_token = tf.where(is_eos_predicted, all_eos_pred,
                                           predicted_token)
                token_prob = tf.where(is_eos_predicted, all_one_prob,
                                      token_prob)
                neg_entropy = tf.where(is_eos_predicted, all_zero_entropy,
                                       neg_entropy)
                is_eos_predicted = tf.logical_or(
                    is_eos_predicted,
                    tf.equal(predicted_token_old, EOS_token))

                # the prediction is from the cell output of the last
                # timestep (t-1); feed it as input into timestep t
                next_input = tf.nn.embedding_lookup(embedding_mat,
                                                    predicted_token)

            elements_finished = tf.greater_equal(time, T_max)

            # loop_state is a 5-tuple, representing
            #   1) the predicted_tokens
            #   2) the prob of predicted_tokens
            #   3) whether <eos> has already been predicted
            #   4) the negative entropy of policy (accumulated across
            #      timesteps)
            #   5) the attention
            if loop_state is None:  # time == 0
                # Write the predicted token into the output
                predicted_token_array = tf.TensorArray(
                    dtype=tf.int32, size=T_max, infer_shape=False)
                token_prob_array = tf.TensorArray(
                    dtype=tf.float32, size=T_max, infer_shape=False)
                att_array = tf.TensorArray(
                    dtype=tf.float32, size=T_max, infer_shape=False)
                next_loop_state = (predicted_token_array,
                                   token_prob_array,
                                   tf.zeros(to_T([N]), dtype=tf.bool),
                                   tf.zeros(to_T([N]), dtype=tf.float32),
                                   att_array)
            else:  # time > 0
                t_write = time - 1
                next_loop_state = (loop_state[0].write(t_write,
                                                       predicted_token),
                                   loop_state[1].write(t_write, token_prob),
                                   is_eos_predicted,
                                   loop_state[3] + neg_entropy,
                                   loop_state[4].write(t_write, att))
            return (elements_finished, next_input, next_cell_state,
                    cell_output, next_loop_state)

        # The RNN
        cell_layers = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)
        _, _, decodes_ta = tf.nn.raw_rnn(cell_layers, loop_fn, scope='lstm')
        predicted_tokens = decodes_ta[0].stack()
        token_probs = decodes_ta[1].stack()
        neg_entropy = decodes_ta[3]
        # atts has shape [T_decoder, T_encoder, N, 1]
        self.atts = decodes_ta[4].stack()
        # word_vecs has shape [T_decoder, N, encoder_embed_dim]
        word_vecs = tf.reduce_sum(self.atts * self.embedded_input_seq, axis=1)

        predicted_tokens.set_shape([None, None])
        token_probs.set_shape([None, None])
        neg_entropy.set_shape([None])
        word_vecs.set_shape([None, None, self.encoder_embed_dim])

        self.predicted_tokens = predicted_tokens
        self.token_probs = token_probs
        self.neg_entropy = neg_entropy
        self.word_vecs = word_vecs
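# Illustrative sketch (not from the original code) of the tf.nn.raw_rnn
# loop_fn contract used in _build_decoder above: at time 0, loop_fn supplies
# the initial state and first input; afterwards it feeds each output back in.
# Assumes TensorFlow 1.x; the toy sizes and feedback rule are made up.
import tensorflow as tf

N, D, T_max = 2, 4, 3
cell = tf.nn.rnn_cell.LSTMCell(D)

def demo_loop_fn(time, cell_output, cell_state, loop_state):
    if cell_output is None:  # time == 0: supply initial state and input
        next_cell_state = cell.zero_state(N, tf.float32)
        next_input = tf.zeros([N, D])  # stands in for the <go> embedding
    else:  # time > 0: feed the previous output back as the next input
        next_cell_state = cell_state
        next_input = cell_output
    # scalar finished flag, matching the loop_fn above
    elements_finished = tf.greater_equal(time, T_max)
    return (elements_finished, next_input, next_cell_state, cell_output,
            loop_state)

outputs_ta, final_state, _ = tf.nn.raw_rnn(cell, demo_loop_fn)
outputs = outputs_ta.stack()  # [T_max, N, D]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(outputs).shape)  # (3, 2, 4)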
def visual7w_attbilstm_net(input_batch, bbox_batch1, spatial_batch1,
                           bbox_batch2, spatial_batch2, expr_obj, num_vocab,
                           embed_dim, lstm_dim, vgg_dropout, lstm_dropout):
    # a sentence is parsed into [expr_obj1, expr_relation, expr_obj2]
    # bbox_batch1 has shape [N_batch*N1, 5],
    # spatial_batch1 has shape [N_batch, N1, D_spatial],
    # bbox_batch2 has shape [N2, 5],
    # spatial_batch2 has shape [1, N2, D_spatial], and
    # expr_obj has shape [T, N_batch],
    # where N1 is the number of choices (= 4 in Visual 7W) and
    # N2 is the number of proposals (~ 300 for RPN in Faster RCNN)
    N_batch = tf.shape(spatial_batch1)[0]
    N1 = tf.shape(spatial_batch1)[1]
    N2 = tf.shape(spatial_batch2)[1]

    # Extract visual features
    vis_feat1 = fastrcnn_vgg_net.vgg_roi_fc7(
        input_batch, tf.reshape(bbox_batch1, [-1, 5]), "vgg_local",
        apply_dropout=vgg_dropout)
    D_vis = vis_feat1.get_shape().as_list()[-1]
    vis_feat1 = tf.reshape(vis_feat1, to_T([N_batch, N1, D_vis]))
    vis_feat1.set_shape([None, None, D_vis])

    # Reshape and tile vis_feat2 and spatial_batch2
    vis_feat2 = fastrcnn_vgg_net.vgg_roi_fc7(
        input_batch, tf.reshape(bbox_batch2, [-1, 5]), "vgg_local",
        apply_dropout=vgg_dropout, reuse=True)
    vis_feat2 = tf.reshape(vis_feat2, to_T([1, N2, D_vis]))
    vis_feat2 = tf.tile(vis_feat2, to_T([N_batch, 1, 1]))
    vis_feat2.set_shape([None, None, D_vis])
    spatial_batch2 = tf.tile(spatial_batch2, to_T([N_batch, 1, 1]))

    # Extract representation using attention
    lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel = \
        lstm_net.attbilstm(expr_obj, "lstm", num_vocab=num_vocab,
                           embed_dim=embed_dim, lstm_dim=lstm_dim,
                           apply_dropout=lstm_dropout)

    # Score for each bounding box matching the first object
    # scores_obj1 has shape [N_batch, N1, 1]
    scores_obj1 = modules.localization_module_batch_score(
        vis_feat1, spatial_batch1, lang_obj1)
    # Score for each bounding box matching the second object
    # scores_obj2 has shape [N_batch, N2, 1]
    scores_obj2 = modules.localization_module_batch_score(
        vis_feat2, spatial_batch2, lang_obj2, reuse=True)

    # Scores for each pair of bounding boxes matching the relationship
    # Tile the scores by broadcasting add
    # scores_rel has shape [N_batch, N1, N2, 1]
    scores_rel = modules.relationship_module_spatial_only_batch_score(
        spatial_batch1, scores_obj1, spatial_batch2, scores_obj2,
        lang_relation, rescale_scores=True)
    tf.add_to_collection("s_pair", scores_rel)

    # marginal_scores has shape [N_batch, N1, 1]
    marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2)
    final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1]))

    return final_scores
def DescribeModule(self, input_0, time_idx, batch_idx, map_dim=1024,
                   scope='DescribeModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)
    encoder_states = self._slice_encoder_states(batch_idx)

    # Mapping: att_grid -> answer probs
    # Input:
    #   input_0: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. Extract visual features using the input attention map, and
    #      linearly transform them to map_dim
    #   2. linearly transform the language features to map_dim
    #   3. Element-wise multiplication of the two, l2_normalize,
    #      linear transform.
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)

            att_softmax = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                to_T([N, H, W, 1]))
            # att_feat has shape [N, D_vis]
            att_feat = tf.reduce_sum(image_feat_grid * att_softmax,
                                     axis=[1, 2])
            att_feat_mapped = tf.reshape(
                fc('fc_att', att_feat, output_dim=map_dim),
                to_T([N, map_dim]))

            if encoder_states is not None:
                # Add the encoder states into the elementwise multiplication
                encoder_states_mapped = fc('fc_encoder_states',
                                           encoder_states,
                                           output_dim=map_dim)
                eltwise_mult = tf.nn.l2_normalize(
                    text_param_mapped * att_feat_mapped *
                    encoder_states_mapped, 1)
            else:
                eltwise_mult = tf.nn.l2_normalize(
                    text_param_mapped * att_feat_mapped, 1)
            scores = fc('fc_eltwise', eltwise_mult,
                        output_dim=self.num_choices)
    return scores
def build_output_unit_loc(q_encoding, kb_batch, att_last,
                          scope='output_unit_loc', reuse=None):
    """
    Apply a 1-layer convolution network to predict localization scores.
    Apply dropout if specified.

    Input:
        q_encoding: [N, d], tf.float32
        kb_batch: [N, H, W, d], tf.float32
        att_last: [N, H, W, 1], tf.float32
    Return:
        loc_scores: [N, H*W], tf.float32
        bbox_offset: [N, 4], tf.float32
    """
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.LOC_SCORES_POS_AFFINE:
            # make sure att signs do not flip
            w = tf.abs(tf.get_variable('loc_scores_affine_raw_w', []))
            b = tf.get_variable('loc_scores_affine_b', [])
            loc_scores = w * att_last + b
        else:
            loc_scores = conv('conv_loc', att_last, kernel_size=3, stride=1,
                              output_dim=1)
        loc_scores = tf.reshape(loc_scores,
                                [-1, cfg.MODEL.H_FEAT * cfg.MODEL.W_FEAT])

        # extract the attended features for bounding box regression
        if cfg.MODEL.BBOX_REG_AS_FCN:
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc('fc_q_mapped', q_encoding,
                              output_dim=cfg.MODEL.KB_DIM)
                bbox_offset_input = tf.nn.l2_normalize(
                    q_mapped[:, ax, ax, :] * kb_batch, axis=-1)
            else:
                bbox_offset_input = kb_batch
            bbox_offset_fcn = conv('conv_bbox_offset', bbox_offset_input,
                                   1, 1, output_dim=4)
            N = tf.shape(bbox_offset_fcn)[0]
            B = cfg.MODEL.H_FEAT * cfg.MODEL.W_FEAT  # B = H*W
            # bbox_offset_fcn [N, B, 4] is used for training
            bbox_offset_fcn = tf.reshape(bbox_offset_fcn, to_T([N, B, 4]))
            # bbox_offset [N, 4] is only used for prediction
            bbox_offset_flat = tf.reshape(bbox_offset_fcn, to_T([N * B, 4]))
            slice_inds = tf.range(N) * B + tf.argmax(
                loc_scores, axis=-1, output_type=tf.int32)
            bbox_offset = tf.gather(bbox_offset_flat, slice_inds)
        else:
            bbox_offset_fcn = None
            kb_loc = _extract_softmax_avg(kb_batch, att_last)
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc('fc_q_mapped', q_encoding,
                              output_dim=cfg.MODEL.KB_DIM)
                elt_prod = tf.nn.l2_normalize(q_mapped * kb_loc, axis=-1)
                bbox_offset = fc('fc_bbox_offset_with_q', elt_prod,
                                 output_dim=4)
            else:
                bbox_offset = fc('fc_bbox_offset', kb_loc, output_dim=4)
    return loc_scores, bbox_offset, bbox_offset_fcn
def _spatial_softmax(att_raw):
    att_shape = tf.shape(att_raw)
    N = att_shape[0]
    att_softmax = tf.nn.softmax(tf.reshape(att_raw, to_T([N, -1])), axis=1)
    att_softmax = tf.reshape(att_softmax, att_shape)
    return att_softmax
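# Illustrative sketch (not from the original code): the same softmax over all
# spatial positions of each attention map, in NumPy.
import numpy as np

def spatial_softmax_np(att_raw):
    flat = att_raw.reshape(att_raw.shape[0], -1)
    e = np.exp(flat - flat.max(axis=1, keepdims=True))  # stabilized
    return (e / e.sum(axis=1, keepdims=True)).reshape(att_raw.shape)

att = np.random.randn(2, 3, 3, 1)
print(spatial_softmax_np(att).sum(axis=(1, 2, 3)))  # [1. 1.]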
def relationship_module_spatial_only_batch_score(
        spatial_feat1, scores1, spatial_feat2, scores2, lang_feat,
        scope="relationship_module_spatial_only", rescale_scores=False,
        reuse=None):
    # Input shape:
    #   spatial_feat1, spatial_feat2: [N_batch, N1, D_spatial],
    #                                 [N_batch, N2, D_spatial]
    #   scores1, scores2: [N_batch, N1, 1], [N_batch, N2, 1]
    #   lang_feat: [N_batch, D_lang]
    # Output shape:
    #   relationship_scores: [N_batch, N1, N2, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.
    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the paired spatial features linearly
        # to the same dimension as the language feature
        N_batch = tf.shape(lang_feat)[0]
        D_lang = lang_feat.get_shape().as_list()[-1]
        N1 = tf.shape(spatial_feat1)[1]
        N2 = tf.shape(spatial_feat2)[1]
        D_spatial = spatial_feat1.get_shape().as_list()[-1]

        # Tiled spatial features of size [N_batch, N1, N2, D_spatial*2],
        # such that
        # spatial_feat_tiled[k, i, j] = [spatial_feat1[k, i], spatial_feat2[k, j]]
        # (concatenation along the feature axis pairs box i with box j)
        spatial_feat_tiled = tf.reshape(
            tf.concat([
                tf.tile(tf.reshape(spatial_feat1,
                                   to_T([N_batch, -1, 1, D_spatial])),
                        to_T([1, 1, N2, 1])),
                tf.tile(tf.reshape(spatial_feat2,
                                   to_T([N_batch, 1, -1, D_spatial])),
                        to_T([1, N1, 1, 1]))
            ], axis=3),
            [-1, D_spatial*2])

        # Embedded spatial feature of size [N_batch*N1*N2, D_lang]
        spatial_embed = fc('spatial_embed', spatial_feat_tiled,
                           output_dim=D_lang)

        # Elementwise multiplication with language feature and l2-normalization
        spatial_embed = tf.reshape(spatial_embed, to_T([N_batch, -1, D_lang]))
        lang_feat = tf.reshape(lang_feat, [-1, 1, D_lang])
        eltwise_mult = tf.nn.l2_normalize(spatial_embed * lang_feat, 2)
        # eltwise_mult has shape [N_batch*N1*N2, D_lang] after flattening
        eltwise_mult = tf.reshape(eltwise_mult, [-1, D_lang])

        # Relationship scores as linear classification over the l2-normalized
        # feature
        relationship_scores = fc('relationship_scores', eltwise_mult,
                                 output_dim=1)
        relationship_scores = tf.reshape(relationship_scores,
                                         to_T([N_batch, N1, N2, 1]))

        # Rescale the scores, if specified
        if rescale_scores:
            alpha_obj1 = tf.get_variable(
                "alpha_obj1", shape=[], dtype=tf.float32,
                initializer=tf.constant_initializer(1))
            alpha_obj2 = tf.get_variable(
                "alpha_obj2", shape=[], dtype=tf.float32,
                initializer=tf.constant_initializer(1))
            alpha_rel = tf.get_variable(
                "alpha_rel", shape=[], dtype=tf.float32,
                initializer=tf.constant_initializer(1))
            scores1 = tf.multiply(scores1, alpha_obj1)
            scores2 = tf.multiply(scores2, alpha_obj2)
            relationship_scores = tf.multiply(relationship_scores, alpha_rel)

        # final scores: broadcast-add the per-object scores and the pairwise
        # relationship scores
        final_scores = tf.add(
            tf.add(tf.reshape(scores1, to_T([N_batch, N1, 1, 1])),
                   tf.reshape(scores2, to_T([N_batch, 1, N2, 1]))),
            relationship_scores)
        final_scores.set_shape([None, None, None, 1])
    return final_scores
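# Illustrative sketch (not from the original code): pairing every box i with
# every box j by tiling and concatenating along the feature axis, in NumPy
# with made-up sizes.
import numpy as np

Nb, N1, N2, D = 1, 2, 3, 5
s1 = np.random.randn(Nb, N1, D)
s2 = np.random.randn(Nb, N2, D)
t1 = np.tile(s1.reshape(Nb, N1, 1, D), (1, 1, N2, 1))
t2 = np.tile(s2.reshape(Nb, 1, N2, D), (1, N1, 1, 1))
pairs = np.concatenate([t1, t2], axis=3)  # [Nb, N1, N2, 2*D]
assert np.allclose(pairs[0, 1, 2], np.concatenate([s1[0, 1], s2[0, 2]]))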