def encode(self, feature, length, scope=None):
  """Encodes sequence features into representation.

  Args:
    feature: A [batch, max_sequence_len, dims] float tensor.
    length: A [batch] int tensor.

  Returns:
    A [batch, dims] float tensor.
  """
  with tf.name_scope('avg_pooling_encoder'):
    mask = tf.sequence_mask(
        length, maxlen=utils.get_tensor_shape(feature)[-2], dtype=tf.float32)
    feature = utils.masked_avg_nd(data=feature, mask=mask, dim=1)
    return tf.squeeze(feature, axis=1)
def _average_encoding(sequence_feature, sequence_length):
  """Encodes sequence using average pooling.

  Args:
    sequence_feature: A [batch_sequence, max_sequence_length, feature_dimensions]
      float tensor.
    sequence_length: A [batch_sequence] int tensor.

  Returns:
    sequence_emb: A [batch_sequence, feature_dimensions] float tensor,
      representing the embedding vectors.
  """
  (_, max_sequence_length, _) = utils.get_tensor_shape(sequence_feature)
  mask = tf.sequence_mask(
      sequence_length, maxlen=max_sequence_length, dtype=tf.float32)
  sequence_emb = utils.masked_avg_nd(sequence_feature, mask, dim=1)
  sequence_emb = tf.squeeze(sequence_emb, axis=1)
  return sequence_emb
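# A hedged usage sketch for _average_encoding: padded sequences are pooled
# only over their valid (unpadded) steps. The placeholder shapes below are
# illustrative assumptions, not part of the original code.
import tensorflow as tf

sequence_feature = tf.placeholder(tf.float32, [None, None, 8])  # [batch, max_len, dims]
sequence_length = tf.placeholder(tf.int32, [None])              # [batch]
sequence_emb = _average_encoding(sequence_feature, sequence_length)  # [batch, 8]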
def test_masked_avg_nd(self):
  tf.reset_default_graph()

  data = tf.placeholder(tf.float32, shape=[None, None, None])
  mask = tf.placeholder(tf.float32, shape=[None, None])
  masked_avgs = utils.masked_avg_nd(data, mask)

  with self.test_session() as sess:
    result = sess.run(
        masked_avgs,
        feed_dict={
            data: [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
            mask: [[1, 0, 1], [0, 1, 0]]
        })
    self.assertAllClose(result, [[[3, 4]], [[9, 10]]])

    result = sess.run(
        masked_avgs,
        feed_dict={
            data: [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
            mask: [[0, 0, 0], [0, 0, 0]]
        })
    self.assertAllClose(result, [[[0, 0]], [[0, 0]]])
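# A minimal sketch of what utils.masked_avg_nd is assumed to compute, based
# on the test above: masked entries contribute zero, the reduced dimension is
# kept, and the denominator is clamped so an all-zero mask yields zeros rather
# than NaN. This is an illustrative re-implementation, not the project helper.
import tensorflow as tf

def masked_avg_nd_sketch(data, mask, dim=1, eps=1e-12):
  """data: [batch, len, dims] float tensor, mask: [batch, len] float in {0, 1}."""
  mask = tf.expand_dims(mask, axis=-1)                          # [batch, len, 1]
  total = tf.reduce_sum(data * mask, axis=dim, keepdims=True)   # masked sum
  count = tf.maximum(tf.reduce_sum(mask, axis=dim, keepdims=True), eps)
  return total / count                                          # [batch, 1, dims]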
def encode(self, feature, length, scope=None):
  """Encodes sequence features into representation.

  Args:
    feature: A [batch, max_sequence_len, dims] float tensor.
    length: A [batch] int tensor.
    scope: Optional variable scope for the fully connected projection.

  Returns:
    A [batch, dims] float tensor.
  """
  options = self._model_proto
  is_training = self._is_training

  mask = tf.sequence_mask(
      length, maxlen=utils.get_tensor_shape(feature)[1], dtype=tf.float32)
  feature = tf.contrib.layers.fully_connected(
      inputs=feature,
      num_outputs=feature.get_shape()[-1].value,
      activation_fn=None,
      scope=scope)
  feature = utils.masked_avg_nd(data=feature, mask=mask, dim=1)
  return tf.squeeze(feature, axis=1)
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction.

  Args:
    examples: dict of input tensors keyed by name.
    prediction_task: the specific prediction task.

  Returns:
    predictions: dict of prediction results keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  # Image CNN features.

  inputs = examples[InputDataFields.image]
  image_features = model_utils.calc_cnn_feature(
      inputs, options.cnn_options, is_training=is_training)

  with slim.arg_scope(
      build_hyperparams(options.image_fc_hyperparams, is_training)):
    image_features = slim.fully_connected(
        image_features,
        num_outputs=options.shared_dims,
        activation_fn=None,
        scope='image')

  # Text Global-Average-Pooling features.

  (image_id, num_captions, caption_strings,
   caption_lengths) = (examples[InputDataFields.image_id],
                       examples[InputDataFields.num_captions],
                       examples[InputDataFields.caption_strings],
                       examples[InputDataFields.caption_lengths])
  image_id = tf.string_to_number(image_id, out_type=tf.int64)

  (image_ids_gathered, caption_strings_gathered,
   caption_lengths_gathered) = model_utils.gather_in_batch_captions(
       image_id, num_captions, caption_strings, caption_lengths)

  (caption_token_ids_gathered,
   caption_features_gathered) = self._extract_text_feature(
       caption_strings_gathered,
       caption_lengths_gathered,
       vocabulary_list=self._open_vocabulary_list,
       initial_embedding=self._open_vocabulary_initial_embedding,
       embedding_dims=options.embedding_dims,
       trainable=options.train_word_embedding,
       max_norm=None)

  with slim.arg_scope(
      build_hyperparams(options.text_fc_hyperparams, is_training)):
    if visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
      attn = slim.fully_connected(
          caption_features_gathered,
          num_outputs=1,
          activation_fn=None,
          scope='caption_attn')
      attn = tf.squeeze(attn, axis=-1)
    caption_features_gathered = slim.fully_connected(
        caption_features_gathered,
        num_outputs=options.shared_dims,
        activation_fn=None,
        scope='caption')

  oov = len(self._open_vocabulary_list)
  caption_masks_gathered = tf.logical_not(
      tf.equal(caption_token_ids_gathered, oov))
  caption_masks_gathered = tf.to_float(caption_masks_gathered)

  if visual_w2v_model_pb2.VisualW2vModel.GAP == options.text_feature_extractor:
    caption_features_gathered = utils.masked_avg_nd(
        data=caption_features_gathered, mask=caption_masks_gathered, dim=1)
    caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
  elif visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
    attn = utils.masked_softmax(attn, mask=caption_masks_gathered, dim=-1)
    caption_features_gathered = tf.multiply(
        tf.expand_dims(attn, axis=-1), caption_features_gathered)
    caption_features_gathered = utils.masked_sum_nd(
        caption_features_gathered, mask=caption_masks_gathered, dim=1)
    caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
  else:
    raise ValueError('Invalid text feature extractor.')

  # Export token embeddings.
  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    _, token_embeddings = self._encode_tokens(
        tokens=tf.constant(self._open_vocabulary_list),
        embedding_dims=options.embedding_dims,
        vocabulary_list=self._open_vocabulary_list,
        initial_embedding=self._open_vocabulary_initial_embedding,
        trainable=options.train_word_embedding)
    with slim.arg_scope(
        build_hyperparams(options.text_fc_hyperparams, is_training)):
      token_embeddings = slim.fully_connected(
          token_embeddings,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='caption')

  var_to_assign = tf.get_variable(
      name='weights_proj',
      shape=[len(self._open_vocabulary_list), options.shared_dims])
  var_to_assign = tf.assign(var_to_assign, token_embeddings)
  tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, var_to_assign)

  tf.summary.histogram('token_embedding_proj', token_embeddings)

  # Compute similarity.

  similarity = model_utils.calc_pairwise_similarity(
      feature_a=image_features,
      feature_b=caption_features_gathered,
      l2_normalize=True,
      dropout_keep_prob=options.cross_modal_dropout_keep_prob,
      is_training=is_training)

  predictions = {
      VisualW2vPredictions.image_id: image_id,
      VisualW2vPredictions.image_ids_gathered: image_ids_gathered,
      VisualW2vPredictions.similarity: similarity,
      VisualW2vPredictions.word2vec: var_to_assign,
  }

  return predictions
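# A hedged sketch of what model_utils.calc_pairwise_similarity is assumed to
# compute when l2_normalize=True: the cosine-similarity matrix between every
# image feature and every gathered caption feature. Dropout handling and the
# real helper's details are omitted; this is an illustration, not project code.
import tensorflow as tf

def pairwise_cosine_similarity_sketch(feature_a, feature_b):
  """feature_a: [num_a, dims], feature_b: [num_b, dims] -> [num_a, num_b]."""
  a = tf.nn.l2_normalize(feature_a, -1)  # unit-normalize along the last axis
  b = tf.nn.l2_normalize(feature_b, -1)
  return tf.matmul(a, b, transpose_b=True)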
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction.

  Args:
    examples: dict of input tensors keyed by name.
    prediction_task: the specific prediction task.

  Returns:
    predictions: dict of prediction results keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  (inputs, num_proposals,
   proposals) = (examples[InputDataFields.image],
                 examples[InputDataFields.num_proposals],
                 examples[InputDataFields.proposals])

  predictions = {
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
  }

  # FRCNN.
  # `proposal_features` shape = [batch, max_num_proposals, feature_dims].
  # `proposal_masks` shape = [batch, max_num_proposals].

  proposal_features = self._extract_frcnn_feature(inputs, num_proposals,
                                                  proposals)

  batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
  proposal_masks = tf.sequence_mask(
      num_proposals, maxlen=max_num_proposals, dtype=tf.float32)

  # Build the SADDN predictions.
  # `logits_c_given_r` shape = [batch, max_num_proposals, num_classes].
  # `logits_r_given_c` shape = [batch, max_num_proposals, num_classes].

  with tf.variable_scope('SADDN'), \
      slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):
    logits_c_given_r = slim.fully_connected(
        proposal_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='proba_c_given_r')
    logits_r_given_c = slim.fully_connected(
        proposal_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='proba_r_given_c')

  proba_c_given_r = tf.nn.softmax(logits_c_given_r)
  proba_r_given_c = utils.masked_softmax(
      data=logits_r_given_c,
      mask=tf.expand_dims(proposal_masks, axis=-1),
      dim=1)
  proba_r_given_c = tf.multiply(
      tf.expand_dims(proposal_masks, axis=-1), proba_r_given_c)

  tf.summary.image('inputs', inputs, max_outputs=10)
  model_utils.visl_proposals(
      inputs, num_proposals, proposals, name='proposals', top_k=2000)

  # SADDN iterations.

  logits_at_0 = utils.masked_avg_nd(
      data=logits_c_given_r, mask=proposal_masks, dim=1)
  logits_at_0 = tf.squeeze(logits_at_0, axis=1)

  logits_at_i = logits_at_0
  for i in range(options.saddn_iterations):
    # Infer the proba_c.
    proba_c_at_i = tf.nn.softmax(logits_at_i)

    # Infer the proba_r.
    proba_r_at_i = tf.multiply(
        tf.expand_dims(proba_c_at_i, axis=1), proba_r_given_c)
    proba_r_at_i = tf.reduce_sum(proba_r_at_i, axis=-1, keepdims=True)

    # Infer the detection results at iter `i`.
    (num_detections_at_i, detection_boxes_at_i, detection_scores_at_i,
     detection_classes_at_i) = model_utils.post_process(
         proposals, proba_r_at_i * proba_c_given_r)

    (predictions[StackedAttnPredictions.logits + '_at_{}'.format(i)],
     predictions[DetectionResultFields.num_detections + '_at_{}'.format(i)],
     predictions[DetectionResultFields.detection_boxes + '_at_{}'.format(i)],
     predictions[DetectionResultFields.detection_scores + '_at_{}'.format(i)],
     predictions[DetectionResultFields.detection_classes +
                 '_at_{}'.format(i)]) = (logits_at_i, num_detections_at_i,
                                         detection_boxes_at_i,
                                         detection_scores_at_i,
                                         detection_classes_at_i)

    model_utils.visl_proposals_top_k(
        inputs,
        num_detections_at_i,
        detection_boxes_at_i,
        detection_scores_at_i,
        tf.gather(self._vocabulary_list,
                  tf.to_int32(detection_classes_at_i - 1)),
        name='detection_{}'.format(i))

    # `logits_at_i` for the next iteration.
    logits_at_i = tf.multiply(proba_r_at_i, logits_c_given_r)
    logits_at_i = tf.reduce_sum(logits_at_i, axis=1)

  return predictions
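# A small NumPy sketch of one SADDN refinement step as implemented above, on
# toy shapes: marginalize p(r) = sum_c p(c) * p(r|c) over classes, then
# re-score the image-level logits as sum_r p(r) * logits(c|r) over proposals.
# Shapes, values, and the plain softmax in place of masked_softmax are
# illustrative assumptions, not the project's code.
import numpy as np

batch, max_num_proposals, num_classes = 2, 3, 4
logits_c_given_r = np.random.randn(batch, max_num_proposals, num_classes)
proba_r_given_c = np.random.rand(batch, max_num_proposals, num_classes)
proba_r_given_c /= proba_r_given_c.sum(axis=1, keepdims=True)  # normalize over proposals

# Iteration 0: image-level logits are the average over (all valid) proposals.
logits_at_i = logits_c_given_r.mean(axis=1)                    # [batch, num_classes]

# One refinement step.
exp_logits = np.exp(logits_at_i - logits_at_i.max(axis=-1, keepdims=True))
proba_c_at_i = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
proba_r_at_i = (proba_c_at_i[:, None, :] * proba_r_given_c).sum(
    axis=-1, keepdims=True)                                    # [batch, props, 1]
logits_next = (proba_r_at_i * logits_c_given_r).sum(axis=1)    # [batch, num_classes]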