def _build_midn_network(self, num_proposals, proposal_features, num_classes=20):
  """Builds the Multiple Instance Detection Network (MIDN), an attention net.

  Two linear heads are applied to every proposal feature. The `r_given_c`
  head is normalized over the proposal axis with a masked softmax and used as
  attention weights; the `c_given_r` head supplies per-proposal class logits
  which are summed over proposals using those weights.

  Args:
    num_proposals: A [batch] int tensor.
    proposal_features: A [batch, max_num_proposals, features_dims] float
      tensor.
    num_classes: Number of classes.

  Returns:
    logits: A [batch, num_classes] float tensor.
    proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
  """
  with tf.name_scope('multi_instance_detection'):
    _, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)

    # mask shape = [batch, max_num_proposals, 1]; 1.0 marks a real proposal.
    mask = tf.sequence_mask(
        num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
    mask = tf.expand_dims(mask, axis=-1)

    # Attention score of proposal `r` given class `c`.
    # proba_r_given_c shape = [batch, max_num_proposals, num_classes].
    logits_r_given_c = slim.fully_connected(
        proposal_features,
        num_outputs=num_classes,
        activation_fn=None,
        scope='midn/proba_r_given_c')
    logits_r_given_c = tf.multiply(mask, logits_r_given_c)
    proba_r_given_c = utils.masked_softmax(
        data=logits_r_given_c, mask=mask, dim=1)
    # Zero the padded proposals explicitly after the softmax.
    proba_r_given_c = tf.multiply(mask, proba_r_given_c)
    tf.summary.histogram('midn/logits_r_given_c', logits_r_given_c)

    # Per-proposal class logits.
    # logits_c_given_r shape = [batch, max_num_proposals, num_classes].
    # Only the raw logits are aggregated below, so no softmax over classes is
    # built here.
    logits_c_given_r = slim.fully_connected(
        proposal_features,
        num_outputs=num_classes,
        activation_fn=None,
        scope='midn/proba_c_given_r')
    tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)

    # Aggregates the logits: attention-weighted sum over the proposal axis.
    # logits shape = [batch, num_classes].
    logits = tf.multiply(logits_c_given_r, proba_r_given_c)
    logits = tf.reduce_sum(logits, axis=1)
    tf.summary.histogram('midn/logits', logits)

  return logits, proba_r_given_c
def test_masked_softmax(self):
  """Checks `utils.masked_softmax` on uniform logits under two masks."""
  tf.reset_default_graph()

  data = tf.placeholder(tf.float32, shape=[None, None])
  mask = tf.placeholder(tf.float32, shape=[None, None])
  masked_softmax = utils.masked_softmax(data, mask)

  uniform_logits = [[1, 1, 1, 1], [1, 1, 1, 1]]

  # (mask fed to the graph, expected output) pairs.
  cases = [
      # Nothing masked: probability mass is spread evenly over each row.
      ([[1, 1, 1, 1], [1, 1, 1, 1]],
       [[0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25]]),
      # Half of each row masked: the mass is split among unmasked entries.
      ([[1, 1, 0, 0], [0, 0, 1, 1]],
       [[0.5, 0.5, 0.0, 0.0], [0.0, 0.0, 0.5, 0.5]]),
  ]

  with self.test_session() as sess:
    for mask_value, expected in cases:
      result = sess.run(
          masked_softmax,
          feed_dict={
              data: uniform_logits,
              mask: mask_value
          })
      self.assertAllClose(result, expected)
def encode(self, feature, length, scope=None):
  """Encodes sequence features into a single attention-pooled representation.

  A stack of `hidden_layers` fully connected layers followed by a one-unit
  linear layer scores every sequence position. The scores are normalized over
  the sequence axis with a masked softmax and used to compute a weighted sum
  of the input features.

  Args:
    feature: A [batch, max_sequence_len, dims] float tensor.
    length: A [batch] int tensor.
    scope: Optional variable scope name. When given, hidden layer `i` uses
      `scope + '/hidden_i'` and the scoring layer uses `scope`. When None,
      hidden layer `i` uses 'hidden_i' and the scoring layer falls back to the
      default scope chosen by `fully_connected`.

  Returns:
    A [batch, dims] float tensor.
  """
  options = self._model_proto

  mask = tf.sequence_mask(
      length, maxlen=utils.get_tensor_shape(feature)[1], dtype=tf.float32)

  # Compute attention distribution.
  node = feature
  for i in range(options.hidden_layers):
    # `scope` defaults to None; concatenating it with a string would raise a
    # TypeError, so the prefix is only added when a scope was supplied.
    hidden_scope = 'hidden_{}'.format(i)
    if scope is not None:
      hidden_scope = scope + '/' + hidden_scope
    node = tf.contrib.layers.fully_connected(
        inputs=node,
        num_outputs=feature.get_shape()[-1].value,
        scope=hidden_scope)

  # One score per sequence position.
  logits = tf.contrib.layers.fully_connected(
      inputs=node, num_outputs=1, activation_fn=None, scope=scope)
  probas = utils.masked_softmax(
      data=logits, mask=tf.expand_dims(mask, axis=-1), dim=1)

  # Attention-weighted sum over the sequence axis; the reduced axis is then
  # squeezed away.
  feature = utils.masked_sum_nd(data=feature * probas, mask=mask, dim=1)
  return tf.squeeze(feature, axis=1)
def create_graph(self, proposal_repr, slogan_repr, label_repr, dbpedia_repr,
                 proposal_mask, slogan_mask, label_mask, label_to_slogan_mask,
                 dbpedia_mask, dbpedia_to_slogan_mask):
  """Creates graph.

  Nodes are laid out along axis 1 in the order
  [sentinel, proposals, slogans, labels, dbpedia entries]. Two rounds of
  message passing are performed: level-0 builds an adjacency matrix whose
  edges to the sentinel are zero logits, level-1 re-scores the
  proposal/slogan-to-sentinel edges from the updated representations. The
  sentinel row of the final node representation is returned as the image
  representation.

  When `sparse_loss_weight` is set in the options, a sparse loss computed from
  the slogan self-loop values of the final adjacency matrix is added to the
  `tf.losses` collection.

  Args:
    proposal_repr: Proposal node representations; the first (batch) and last
      (embedding) dimensions are read from the static shape.
    slogan_repr: Slogan node representations.
    label_repr: Label node representations.
    dbpedia_repr: DBpedia node representations.
    proposal_mask: Mask over proposal nodes, concatenated along axis 1 with
      the other node masks.
    slogan_mask: Mask over slogan nodes.
    label_mask: Mask over label nodes.
    label_to_slogan_mask: Forwarded to `_create_access_matrix`.
    dbpedia_mask: Mask over dbpedia nodes.
    dbpedia_to_slogan_mask: Forwarded to `_create_access_matrix`.

  Returns:
    image_repr: `node_repr[:, 0, :]`, i.e. the sentinel node after level-1.
    adjacency: The level-1 masked-softmax adjacency matrix.
    node_to_node: The level-1 adjacency logits.
  """
  options = self._options
  is_training = self._is_training

  # batch_i and embedding_dims come from the static shape; the per-type node
  # counts come from `utils.get_tensor_shape`.
  (batch_i, embedding_dims, max_proposal_num, max_label_num, max_slogan_num,
   max_dbpedia_num) = (proposal_repr.get_shape()[0].value,
                       proposal_repr.get_shape()[-1].value,
                       utils.get_tensor_shape(proposal_repr)[1],
                       utils.get_tensor_shape(label_repr)[1],
                       utils.get_tensor_shape(slogan_repr)[1],
                       utils.get_tensor_shape(dbpedia_repr)[1])

  # Create access matrix.
  access_matrix = self._create_access_matrix(
      max_proposal_num, max_slogan_num, max_label_num, label_to_slogan_mask,
      max_dbpedia_num, dbpedia_to_slogan_mask, batch_i)
  tf.summary.histogram('histogram/access_matrix', access_matrix)

  # The sentinel is a single always-valid node with an all-zero
  # representation, placed at index 0 of the node axis.
  sentinel_mask = tf.ones([batch_i, 1])
  sentinel_repr = tf.zeros(
      [batch_i, 1, proposal_repr.get_shape()[-1].value])
  node_mask = tf.concat([
      sentinel_mask, proposal_mask, slogan_mask, label_mask, dbpedia_mask
  ], axis=1)

  # Layer level-0 inference.
  # NOTE(review): the extent of this `with` block was reconstructed; confirm
  # which statements belong under the variable scope.
  with tf.variable_scope('layer_lv0_inference'):
    # lv0 predictions.
    (lv0_proposal_scores, lv0_slogan_scores, lv0_label_to_proposal_scores,
     lv0_dbpedia_to_slogan_scores) = self._create_lv0_edge_scores(
         proposal_repr, slogan_repr, label_repr, dbpedia_repr, proposal_mask,
         slogan_mask, label_mask, dbpedia_mask)

    # Create lv0 graph, edges to sentinel are not updated.
    node_to_node = self._create_adjacency_matrix(
        proposal_to_sentinel=tf.zeros([batch_i, 1, max_proposal_num]),
        slogan_to_sentinel=tf.zeros([batch_i, 1, max_slogan_num]),
        proposal_to_proposal=tf.linalg.diag(lv0_proposal_scores),
        slogan_to_slogan=tf.linalg.diag(lv0_slogan_scores),
        label_to_proposal=lv0_label_to_proposal_scores,
        dbpedia_to_slogan=lv0_dbpedia_to_slogan_scores)
    # Softmax over the last axis, restricted to accessible and valid nodes.
    adjacency = utils.masked_softmax(node_to_node,
                                     mask=tf.multiply(
                                         access_matrix,
                                         tf.expand_dims(node_mask, axis=1)),
                                     dim=-1)
    # Zero every entry whose row node or column node is masked out.
    adjacency = tf.multiply(
        adjacency,
        tf.multiply(tf.expand_dims(node_mask, 1),
                    tf.expand_dims(node_mask, 2)))

    node_repr = tf.concat([
        sentinel_repr, proposal_repr, slogan_repr, label_repr, dbpedia_repr
    ], axis=1)
    node_repr = tf.matmul(adjacency, node_repr)

  # Layer level-1 inference.
  with tf.variable_scope('layer_lv1_inference'):
    # Update representation.
    # Proposals start at index 1 (after the sentinel); slogans follow them.
    proposal_repr = tf.slice(
        node_repr,
        begin=[0, 1, 0],
        size=[batch_i, max_proposal_num, embedding_dims])
    slogan_repr = tf.slice(
        node_repr,
        begin=[0, 1 + max_proposal_num, 0],
        size=[batch_i, max_slogan_num, embedding_dims])

    # lv1 predictions.
    (lv1_proposal_scores, lv1_slogan_scores) = self._create_lv1_edge_scores(
        proposal_repr, slogan_repr, proposal_mask, slogan_mask)

    # Create lv1 graph, update edges between nodes and the sentinel.
    # All other edges reuse the level-0 scores.
    node_to_node = self._create_adjacency_matrix(
        proposal_to_sentinel=tf.expand_dims(lv1_proposal_scores, 1),
        slogan_to_sentinel=tf.expand_dims(lv1_slogan_scores, 1),
        proposal_to_proposal=tf.linalg.diag(lv0_proposal_scores),
        slogan_to_slogan=tf.linalg.diag(lv0_slogan_scores),
        label_to_proposal=lv0_label_to_proposal_scores,
        dbpedia_to_slogan=lv0_dbpedia_to_slogan_scores)
    adjacency = utils.masked_softmax(node_to_node,
                                     mask=access_matrix *
                                     tf.expand_dims(node_mask, axis=1),
                                     dim=-1)
    adjacency = tf.multiply(
        adjacency,
        tf.multiply(tf.expand_dims(node_mask, 1),
                    tf.expand_dims(node_mask, 2)))

    # Label and dbpedia representations are the original inputs; proposal and
    # slogan representations are the level-0 outputs sliced above.
    node_repr = tf.concat([
        sentinel_repr, proposal_repr, slogan_repr, label_repr, dbpedia_repr
    ], axis=1)
    node_repr = tf.matmul(adjacency, node_repr)

  tf.summary.histogram('histogram/adjacency_logits', node_to_node)

  # Sparse loss.
  # Self-loop weights are the diagonal of the adjacency matrix; only the
  # slogan segment of that diagonal is used.
  self_loop_values = tf.linalg.diag_part(adjacency)
  slogan_values = tf.slice(self_loop_values,
                           begin=[0, 1 + max_proposal_num],
                           size=[batch_i, max_slogan_num])
  if options.HasField('sparse_loss_weight'):
    # Negative mean of the slogan self-loop values that are below 1; the
    # 1e-8 term guards against an empty selection.
    slogan_value_masks = tf.less(slogan_values, 1)
    sparse_loss = -tf.div(
        tf.reduce_sum(
            tf.boolean_mask(slogan_values, slogan_value_masks)),
        1e-8 + tf.reduce_sum(tf.to_float(slogan_value_masks)))
    tf.summary.scalar('loss/sparse_loss', sparse_loss)
    tf.losses.add_loss(
        tf.multiply(sparse_loss,
                    options.sparse_loss_weight,
                    name='sparse_loss'))

  # The sentinel node (index 0) carries the aggregated image representation.
  image_repr = node_repr[:, 0, :]
  return image_repr, adjacency, node_to_node
def create_graph(self, proposal_repr, slogan_repr, label_repr, dbpedia_repr,
                 proposal_mask, slogan_mask, label_mask, label_to_slogan_mask,
                 dbpedia_mask, dbpedia_to_slogan_mask):
  """Creates the node graph and propagates representations through it.

  Nodes are laid out along axis 1 as
  [sentinel, proposals, slogans, labels, dbpedia entries]. Edge logits come
  from `_create_edge_weights_helper`, are normalized with a masked softmax,
  and the resulting adjacency matrix is applied twice to the node
  representations.

  Returns:
    image_repr: The sentinel row (index 0) of the propagated representations.
    adjacency: The masked-softmax adjacency matrix.
    node_to_node: The adjacency logits before normalization.
  """
  options = self._options
  is_training = self._is_training

  # Static batch size / embedding size, then the per-type node counts.
  static_shape = proposal_repr.get_shape()
  batch_i = static_shape[0].value
  embedding_dims = static_shape[-1].value
  max_proposal_num = utils.get_tensor_shape(proposal_repr)[1]
  max_label_num = utils.get_tensor_shape(label_repr)[1]
  max_slogan_num = utils.get_tensor_shape(slogan_repr)[1]
  max_dbpedia_num = utils.get_tensor_shape(dbpedia_repr)[1]

  # Which node pairs are allowed to exchange information.
  access_matrix = self._create_access_matrix(
      max_proposal_num, max_slogan_num, max_label_num, label_to_slogan_mask,
      max_dbpedia_num, dbpedia_to_slogan_mask, batch_i)
  tf.summary.histogram('histogram/access_matrix', access_matrix)

  # A single always-valid sentinel node with an all-zero representation.
  sentinel_mask = tf.ones([batch_i, 1])
  sentinel_repr = tf.zeros([batch_i, 1, embedding_dims])

  # Edge logits, one group per edge type.
  proposal_to_sentinel = self._create_edge_weights_helper(
      sentinel_repr, proposal_repr, 'proposal_to_sentinel')
  slogan_to_sentinel = self._create_edge_weights_helper(
      sentinel_repr, slogan_repr, 'slogan_to_sentinel')
  proposal_to_proposal = self._create_edge_weights_helper(
      proposal_repr, proposal_repr, 'proposal_to_proposal')
  slogan_to_slogan = self._create_edge_weights_helper(
      slogan_repr, slogan_repr, 'slogan_to_slogan')
  label_to_proposal = self._create_edge_weights_helper(
      proposal_repr, label_repr, 'label_to_proposal')
  dbpedia_to_slogan = self._create_edge_weights_helper(
      slogan_repr, dbpedia_repr, 'dbpedia_to_slogan')

  node_to_node = self._create_adjacency_matrix(
      proposal_to_sentinel=proposal_to_sentinel,
      slogan_to_sentinel=slogan_to_sentinel,
      proposal_to_proposal=proposal_to_proposal,
      slogan_to_slogan=slogan_to_slogan,
      label_to_proposal=label_to_proposal,
      dbpedia_to_slogan=dbpedia_to_slogan)
  tf.summary.histogram('histogram/adjacency_logits', node_to_node)

  node_mask = tf.concat(
      [sentinel_mask, proposal_mask, slogan_mask, label_mask, dbpedia_mask],
      axis=1)
  node_repr = tf.concat(
      [sentinel_repr, proposal_repr, slogan_repr, label_repr, dbpedia_repr],
      axis=1)

  # Normalize over the last axis, restricted to accessible and valid nodes.
  softmax_mask = access_matrix * tf.expand_dims(node_mask, axis=1)
  adjacency = utils.masked_softmax(node_to_node, mask=softmax_mask, dim=-1)

  # Zero every entry whose row node or column node is masked out.
  pair_mask = tf.multiply(
      tf.expand_dims(node_mask, 1), tf.expand_dims(node_mask, 2))
  adjacency = tf.multiply(adjacency, pair_mask)

  # Two propagation steps through the same adjacency matrix.
  for _ in range(2):
    node_repr = tf.matmul(adjacency, node_repr)

  image_repr = node_repr[:, 0, :]
  return image_repr, adjacency, node_to_node
def _build_latent_network(self, num_proposals, proposal_features, num_classes=20, num_latent_factors=20, proba_h_given_c=None):
  """Builds the latent-factor variant of the attention network.

  Proposal attention is predicted per latent factor `h` and mixed into
  per-class attention through `proba_h_given_c`, which is either supplied by
  the caller or learned here.

  Args:
    num_proposals: A [batch] int tensor.
    proposal_features: A [batch, max_num_proposals, features_dims] float
      tensor.
    num_classes: Number of classes.
    num_latent_factors: Number of latent factors.
    proba_h_given_c: Optional [num_latent_factors, num_classes] float tensor.
      When None it is learned as a softmax (over the latent axis) of a linear
      layer applied to the identity matrix over classes.

  Returns:
    logits: A [batch, num_classes] float tensor.
    proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
    proba_h_given_c: A [num_latent_factors, num_classes] float tensor.
  """
  if proba_h_given_c is not None:
    assert proba_h_given_c.get_shape()[0].value == num_latent_factors

  with tf.name_scope('multi_instance_detection'):
    batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)

    # proposal_mask shape = [batch, max_num_proposals, 1].
    proposal_mask = tf.sequence_mask(
        num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
    proposal_mask = tf.expand_dims(proposal_mask, axis=-1)

    # logits_c_given_r shape = [batch, max_num_proposals, num_classes].
    # logits_r_given_h shape = [batch, max_num_proposals, num_latent_factors].
    # logits_h_given_c shape = [num_latent_factors, num_classes].
    with tf.variable_scope('midn'):
      logits_c_given_r = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='proba_c_given_r')
      logits_r_given_h = slim.fully_connected(
          proposal_features,
          num_outputs=num_latent_factors,
          activation_fn=None,
          scope='proba_r_given_h')

      if proba_h_given_c is None:
        # One one-hot row per class goes in; the transpose puts the latent
        # factors on axis 0.
        class_one_hot = tf.diag(tf.ones([num_classes]))
        logits_h_given_c = slim.fully_connected(
            class_one_hot,
            num_outputs=num_latent_factors,
            activation_fn=None,
            scope='proba_h_given_c')
        logits_h_given_c = tf.transpose(logits_h_given_c)
        proba_h_given_c = tf.nn.softmax(logits_h_given_c, axis=0)
        tf.summary.histogram('midn/logits_h_given_c', logits_h_given_c)

    # Marginalize `h` to get proba_r_given_c.
    flat_logits_r_given_h = tf.reshape(logits_r_given_h,
                                       [-1, num_latent_factors])
    logits_r_given_c = tf.matmul(flat_logits_r_given_h, proba_h_given_c)
    logits_r_given_c = tf.reshape(logits_r_given_c,
                                  [batch, max_num_proposals, num_classes])

    proba_r_given_c = utils.masked_softmax(
        data=logits_r_given_c, mask=proposal_mask, dim=1)
    proba_r_given_c = tf.multiply(proposal_mask, proba_r_given_c)

    # Attention-weighted sum of the class logits over the proposal axis.
    weighted_logits = tf.multiply(logits_c_given_r, proba_r_given_c)
    logits = tf.reduce_sum(weighted_logits, axis=1)

    tf.summary.histogram('midn/logits', logits)
    tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)
    tf.summary.histogram('midn/logits_r_given_h', logits_r_given_h)

  return logits, proba_r_given_c, proba_h_given_c
def _build_midn_network(self, num_proposals, proposal_features, num_classes=20):
  """Builds the Multiple Instance Detection Network (MIDN), an attention net.

  Args:
    num_proposals: A [batch] int tensor.
    proposal_features: A [batch, max_num_proposals, features_dims] float
      tensor.
    num_classes: Number of classes.

  Returns:
    class_logits: The attention-weighted class logits with axis 1 squeezed
      out; presumably [batch, num_classes] (depends on `utils.masked_sum`
      keeping the reduced axis).
    proposal_scores: sigmoid(class_logits) multiplied by proba_r_given_c.
    proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
  """
  with tf.name_scope('multi_instance_detection'):
    _, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)

    # proposal_mask shape = [batch, max_num_proposals, 1].
    proposal_mask = tf.sequence_mask(
        num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
    proposal_mask = tf.expand_dims(proposal_mask, axis=-1)

    # Both heads have shape [batch, max_num_proposals, num_classes].
    with tf.variable_scope('midn'):
      logits_r_given_c = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='proba_r_given_c')
      logits_c_given_r = slim.fully_connected(
          proposal_features,
          num_outputs=num_classes,
          activation_fn=None,
          scope='proba_c_given_r')

    # Attention over proposals; padded proposals are zeroed before and after
    # the masked softmax.
    masked_logits_r_given_c = tf.multiply(proposal_mask, logits_r_given_c)
    proba_r_given_c = utils.masked_softmax(
        data=masked_logits_r_given_c, mask=proposal_mask, dim=1)
    proba_r_given_c = tf.multiply(proposal_mask, proba_r_given_c)

    # Attention-weighted aggregation of the class logits over proposals.
    weighted_logits = tf.multiply(logits_c_given_r, proba_r_given_c)
    class_logits = utils.masked_sum(
        data=weighted_logits, mask=proposal_mask, dim=1)

    # Per-proposal scores: image-level class confidence times attention.
    class_confidence = tf.nn.sigmoid(class_logits)
    proposal_scores = tf.multiply(class_confidence, proba_r_given_c)

    tf.summary.histogram('midn/logits_r_given_c', logits_r_given_c)
    tf.summary.histogram('midn/logits_c_given_r', logits_c_given_r)
    tf.summary.histogram('midn/proposal_scores', proposal_scores)
    tf.summary.histogram('midn/class_logits', class_logits)

  return tf.squeeze(class_logits, axis=1), proposal_scores, proba_r_given_c
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction.

  Projects CNN image features and pooled caption features into a shared space
  of `options.shared_dims` dimensions and computes their pairwise similarity.
  The projected embedding of every vocabulary token is also written into the
  `weights_proj` variable through an assign op registered in `UPDATE_OPS`.

  Args:
    examples: dict of input tensors keyed by name.
    **kwargs: Not read by this method.

  Returns:
    predictions: dict of prediction results keyed by name.

  Raises:
    ValueError: If `options.text_feature_extractor` is neither GAP nor ATT.
  """
  options = self._model_proto
  is_training = self._is_training

  # Image CNN features.
  inputs = examples[InputDataFields.image]
  image_features = model_utils.calc_cnn_feature(
      inputs, options.cnn_options, is_training=is_training)

  with slim.arg_scope(
      build_hyperparams(options.image_fc_hyperparams, is_training)):
    image_features = slim.fully_connected(
        image_features,
        num_outputs=options.shared_dims,
        activation_fn=None,
        scope='image')

  # Text Global-Average-Pooling features.
  (image_id, num_captions, caption_strings,
   caption_lengths) = (examples[InputDataFields.image_id],
                       examples[InputDataFields.num_captions],
                       examples[InputDataFields.caption_strings],
                       examples[InputDataFields.caption_lengths])
  # Image ids arrive as strings and are converted to int64.
  image_id = tf.string_to_number(image_id, out_type=tf.int64)

  (image_ids_gathered, caption_strings_gathered,
   caption_lengths_gathered) = model_utils.gather_in_batch_captions(
       image_id, num_captions, caption_strings, caption_lengths)

  (caption_token_ids_gathered,
   caption_features_gathered) = self._extract_text_feature(
       caption_strings_gathered,
       caption_lengths_gathered,
       vocabulary_list=self._open_vocabulary_list,
       initial_embedding=self._open_vocabulary_initial_embedding,
       embedding_dims=options.embedding_dims,
       trainable=options.train_word_embedding,
       max_norm=None)

  with slim.arg_scope(
      build_hyperparams(options.text_fc_hyperparams, is_training)):
    # The attention logits are predicted from the unprojected token features,
    # one scalar per token.
    if visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
      attn = slim.fully_connected(
          caption_features_gathered,
          num_outputs=1,
          activation_fn=None,
          scope='caption_attn')
      attn = tf.squeeze(attn, axis=-1)

    # The 'caption' projection is shared with the token-embedding export
    # below, which reopens the same scope with reuse=True.
    caption_features_gathered = slim.fully_connected(
        caption_features_gathered,
        num_outputs=options.shared_dims,
        activation_fn=None,
        scope='caption')

  # Token id == len(vocabulary) is treated as out-of-vocabulary and masked
  # out of the pooling.
  oov = len(self._open_vocabulary_list)
  caption_masks_gathered = tf.logical_not(
      tf.equal(caption_token_ids_gathered, oov))
  caption_masks_gathered = tf.to_float(caption_masks_gathered)

  if visual_w2v_model_pb2.VisualW2vModel.GAP == options.text_feature_extractor:
    # Masked average over the token axis.
    caption_features_gathered = utils.masked_avg_nd(
        data=caption_features_gathered, mask=caption_masks_gathered, dim=1)
    caption_features_gathered = tf.squeeze(caption_features_gathered,
                                           axis=1)
  elif visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
    # Attention-weighted sum over the token axis.
    attn = utils.masked_softmax(attn, mask=caption_masks_gathered, dim=-1)
    caption_features_gathered = tf.multiply(
        tf.expand_dims(attn, axis=-1), caption_features_gathered)
    caption_features_gathered = utils.masked_sum_nd(
        caption_features_gathered, mask=caption_masks_gathered, dim=1)
    caption_features_gathered = tf.squeeze(caption_features_gathered,
                                           axis=1)
  else:
    raise ValueError('Invalid text feature extractor.')

  # Export token embeddings.
  # NOTE(review): the extent of the reuse=True scope was reconstructed;
  # `weights_proj` is assumed to be created outside it - confirm.
  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    _, token_embeddings = self._encode_tokens(
        tokens=tf.constant(self._open_vocabulary_list),
        embedding_dims=options.embedding_dims,
        vocabulary_list=self._open_vocabulary_list,
        initial_embedding=self._open_vocabulary_initial_embedding,
        trainable=options.train_word_embedding)
    with slim.arg_scope(
        build_hyperparams(options.text_fc_hyperparams, is_training)):
      token_embeddings = slim.fully_connected(
          token_embeddings,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='caption')

  # After the assign, `var_to_assign` names the assign op output rather than
  # the variable; it is registered as an update op and also returned.
  var_to_assign = tf.get_variable(
      name='weights_proj',
      shape=[len(self._open_vocabulary_list), options.shared_dims])
  var_to_assign = tf.assign(var_to_assign, token_embeddings)
  tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, var_to_assign)
  tf.summary.histogram('token_embedding_proj', token_embeddings)

  # Compute similarity.
  similarity = model_utils.calc_pairwise_similarity(
      feature_a=image_features,
      feature_b=caption_features_gathered,
      l2_normalize=True,
      dropout_keep_prob=options.cross_modal_dropout_keep_prob,
      is_training=is_training)

  predictions = {
      VisualW2vPredictions.image_id: image_id,
      VisualW2vPredictions.image_ids_gathered: image_ids_gathered,
      VisualW2vPredictions.similarity: similarity,
      VisualW2vPredictions.word2vec: var_to_assign,
  }
  return predictions
def _predict_similarity(self, examples):
  """Builds tf graph for prediction.

  Computes a region-by-word similarity tensor between projected image feature
  map cells and caption word embeddings, then reduces it to one score per
  (image, caption) pair. With `options.use_saliency_score` the reduction is
  weighted by predicted image/word attention; otherwise it is an average over
  regions and caption words.

  Args:
    examples: dict of input tensors keyed by name.

  Returns:
    predictions: dict of prediction results keyed by name.
  """
  options = self._model_proto
  is_training = self._is_training

  # Extracts input data fields.
  (image, image_id, num_captions, caption_strings,
   caption_lengths) = (examples[InputDataFields.image],
                       examples[InputDataFields.image_id],
                       examples[InputDataFields.num_captions],
                       examples[InputDataFields.caption_strings],
                       examples[InputDataFields.caption_lengths])

  image_feature = self._encode_images(
      image,
      cnn_name=options.cnn_name,
      cnn_trainable=options.cnn_trainable,
      cnn_weight_decay=options.cnn_weight_decay,
      cnn_feature_map=options.cnn_feature_map,
      cnn_dropout_keep_prob=options.cnn_dropout_keep_prob,
      cnn_checkpoint=options.cnn_checkpoint,
      cnn_scope=GAPVariableScopes.cnn,
      is_training=is_training)

  (image_ids_gathered, caption_strings_gathered,
   caption_lengths_gathered) = model_utils.gather_in_batch_captions(
       image_id, num_captions, caption_strings, caption_lengths)

  # Extract image feature, shape =
  # [batch, feature_height * feature_width, common_dimensions].
  with tf.name_scope(OperationNames.image_model):
    image_feature = self._project_images(
        image_feature,
        common_dimensions=options.common_dimensions,
        scope=GAPVariableScopes.image_proj,
        hyperparams=options.image_proj_hyperparams,
        is_training=is_training)

    # Flatten the spatial grid into a single region axis.
    (batch, feature_height, feature_width,
     common_dimensions) = utils.get_tensor_shape(image_feature)
    image_feature = tf.reshape(image_feature,
                               [batch, -1, common_dimensions])

  # Extract caption feature, shape =
  # [num_captions_in_batch, max_caption_length, common_dimensions].
  vocabulary_list = self._read_vocabulary(options.vocabulary_file)
  tf.logging.info("Read a vocabulary with %i words.", len(vocabulary_list))

  with tf.name_scope(OperationNames.text_model):
    caption_feature = self._encode_captions(
        caption_strings_gathered,
        vocabulary_list=vocabulary_list,
        common_dimensions=options.common_dimensions,
        scope=GAPVariableScopes.word_embedding,
        is_training=is_training)
    (num_captions_in_batch, max_caption_length,
     common_dimensions) = utils.get_tensor_shape(caption_feature)

  # Calculates similarity matrix, shape=[batch, num_captions_in_batch].
  # NOTE(review): the extent of this name scope was reconstructed; confirm
  # whether the saliency branch below belongs inside it.
  with tf.name_scope(OperationNames.calc_pairwise_similarity):
    # Compute dot-product similarity.
    similarity = self._calc_pairwise_similarity(
        image_feature=tf.nn.l2_normalize(image_feature, axis=-1),
        text_feature=tf.nn.l2_normalize(caption_feature, axis=-1),
        dropout_keep_prob=options.dropout_keep_prob,
        is_training=is_training)

    # Zero the similarity entries that correspond to padded caption words;
    # the mask is broadcast over two leading axes.
    word_mask = tf.sequence_mask(caption_lengths_gathered,
                                 maxlen=max_caption_length,
                                 dtype=tf.float32)
    similarity = similarity * tf.expand_dims(
        tf.expand_dims(word_mask, 0), 0)

    if options.use_saliency_score:
      # Predict saliency score.
      # image_saliency shape = [batch, num_regions].
      # caption_saliency shape = [num_captions_in_batch, max_caption_length].
      image_saliency = self._calc_saliency_score(
          image_feature,
          scope=GAPVariableScopes.image_saliency,
          hyperparams=options.image_saliency_hyperparams,
          is_training=is_training)

      if options.l2_norm_for_word_saliency:
        caption_feature = tf.nn.l2_normalize(caption_feature, axis=-1)
      caption_saliency = self._calc_saliency_score(
          caption_feature,
          scope=GAPVariableScopes.word_saliency,
          hyperparams=options.word_saliency_hyperparams,
          is_training=is_training)

      # Apply masked attention.
      # Image regions are never padded, so a plain softmax is used for them.
      image_attention = tf.nn.softmax(image_saliency, axis=-1)
      caption_attention = utils.masked_softmax(caption_saliency,
                                               word_mask,
                                               dim=-1)

      tf.summary.scalar(
          'loss/image_attention_max',
          tf.reduce_mean(tf.reduce_max(image_attention, axis=1)))
      tf.summary.scalar(
          'loss/image_attention_min',
          tf.reduce_mean(tf.reduce_min(image_attention, axis=1)))
      tf.summary.scalar(
          'loss/caption_attention_max',
          tf.reduce_mean(
              utils.masked_maximum(caption_attention, word_mask, dim=1)))
      tf.summary.scalar(
          'loss/caption_attention_min',
          tf.reduce_mean(
              utils.masked_minimum(caption_attention, word_mask, dim=1)))

      # Optional regularizers on the log of the attention values; the
      # attention is clamped from below before taking the log.
      if options.image_regularizer_weight > 0.0:
        log_image_attention = tf.log(
            tf.maximum(image_attention, _LOG_SMALL_NUMBER))
        loss = tf.multiply(
            options.image_regularizer_weight,
            tf.reduce_mean(
                tf.reduce_sum(log_image_attention, axis=1)))
        tf.losses.add_loss(loss)
        tf.summary.scalar('loss/image_attention_log_loss', loss)

      if options.text_regularizer_weight > 0.0:
        log_caption_attention = tf.log(
            tf.maximum(caption_attention, _LOG_SMALL_NUMBER))
        # Padded words are excluded from the sum via word_mask.
        loss = tf.multiply(
            options.text_regularizer_weight,
            tf.reduce_mean(
                tf.reduce_sum(log_caption_attention * word_mask, axis=1)))
        tf.losses.add_loss(loss)
        tf.summary.scalar('loss/caption_attention_log_loss', loss)

      # Combine the image and caption attention with the same pairwise helper
      # used for the feature similarity.
      saliency_mask = self._calc_pairwise_similarity(
          image_feature=tf.expand_dims(image_attention, -1),
          text_feature=tf.expand_dims(caption_attention, -1),
          dropout_keep_prob=options.dropout_keep_prob,
          is_training=is_training)

      # Compute weighted sum.
      similarity = tf.reduce_sum(similarity * saliency_mask, axis=[1, 3])

      self.visualize(
          image,
          tf.reshape(image_saliency, [-1, feature_height, feature_width]))

      tf.summary.histogram('image_saliency', image_saliency)
      tf.summary.histogram('text_saliency', caption_saliency)

    else:
      # Simple Global Average Pooling.
      # Divide the summed similarity by (number of regions * caption length);
      # _SMALL_NUMBER keeps the denominator non-zero.
      similarity = tf.div(
          tf.reduce_sum(similarity, axis=[1, 3]), _SMALL_NUMBER + tf.cast(
              feature_width * feature_height * caption_lengths_gathered,
              tf.float32))

  predictions = {
      GAPPredictions.image_id: image_id,
      GAPPredictions.image_ids_gathered: image_ids_gathered,
      GAPPredictions.similarity: similarity,
  }
  return predictions
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction.

  Extracts per-proposal features, predicts class logits per proposal
  (`c_given_r`) and proposal attention per class (`r_given_c`), then runs
  `options.saddn_iterations` refinement steps. Each step turns the current
  image-level logits into a class distribution, uses it to mix the per-class
  attention into one weight per proposal, emits detections, and re-aggregates
  the class logits with those weights for the next step.

  Args:
    examples: dict of input tensors keyed by name.
    **kwargs: Not read by this method.

  Returns:
    predictions: dict of prediction results keyed by name. Besides the
      proposal fields it holds, for every iteration `i`, the logits and the
      detection outputs under keys suffixed with '_at_{i}'.
  """
  options = self._model_proto
  is_training = self._is_training

  (inputs, num_proposals,
   proposals) = (examples[InputDataFields.image],
                 examples[InputDataFields.num_proposals],
                 examples[InputDataFields.proposals])

  predictions = {
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
  }

  # FRCNN.
  # `proposal_features` shape = [batch, max_num_proposals, feature_dims].
  # `proposal_masks` shape = [batch, max_num_proposals].
  proposal_features = self._extract_frcnn_feature(inputs, num_proposals,
                                                  proposals)
  _, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
  proposal_masks = tf.sequence_mask(num_proposals,
                                    maxlen=max_num_proposals,
                                    dtype=tf.float32)

  # Build the SADDN predictions.
  # `logits_c_given_r` shape = [batch, max_num_proposals, num_classes].
  # `logits_r_given_c` shape = [batch, max_num_proposals, num_classes].
  with tf.variable_scope('SADDN'), \
      slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):
    logits_c_given_r = slim.fully_connected(
        proposal_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='proba_c_given_r')
    logits_r_given_c = slim.fully_connected(
        proposal_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='proba_r_given_c')

    proba_c_given_r = tf.nn.softmax(logits_c_given_r)
    proba_r_given_c = utils.masked_softmax(
        data=logits_r_given_c,
        mask=tf.expand_dims(proposal_masks, axis=-1),
        dim=1)
    # Zero the padded proposals explicitly after the softmax.
    proba_r_given_c = tf.multiply(
        tf.expand_dims(proposal_masks, axis=-1), proba_r_given_c)

  tf.summary.image('inputs', inputs, max_outputs=10)
  model_utils.visl_proposals(inputs,
                             num_proposals,
                             proposals,
                             name='proposals',
                             top_k=2000)

  # SADDN iterations.
  # Iteration 0 starts from the masked average of the per-proposal logits.
  logits_at_0 = utils.masked_avg_nd(data=logits_c_given_r,
                                    mask=proposal_masks,
                                    dim=1)
  logits_at_0 = tf.squeeze(logits_at_0, axis=1)

  logits_at_i = logits_at_0
  for i in range(options.saddn_iterations):
    # Infer the proba_c.
    proba_c_at_i = tf.nn.softmax(logits_at_i)

    # Infer the proba_r: mix the per-class attention with the current class
    # distribution, leaving one weight per proposal (last axis kept as 1).
    proba_r_at_i = tf.multiply(tf.expand_dims(proba_c_at_i, axis=1),
                               proba_r_given_c)
    proba_r_at_i = tf.reduce_sum(proba_r_at_i, axis=-1, keepdims=True)

    # Infer the detection results at iter `i`.
    (num_detections_at_i, detection_boxes_at_i, detection_scores_at_i,
     detection_classes_at_i) = model_utils.post_process(
         proposals, proba_r_at_i * proba_c_given_r)

    suffix = '_at_{}'.format(i)
    predictions[StackedAttnPredictions.logits + suffix] = logits_at_i
    predictions[DetectionResultFields.num_detections +
                suffix] = num_detections_at_i
    predictions[DetectionResultFields.detection_boxes +
                suffix] = detection_boxes_at_i
    predictions[DetectionResultFields.detection_scores +
                suffix] = detection_scores_at_i
    predictions[DetectionResultFields.detection_classes +
                suffix] = detection_classes_at_i

    # Detection classes are shifted by one before indexing the vocabulary.
    model_utils.visl_proposals_top_k(
        inputs,
        num_detections_at_i,
        detection_boxes_at_i,
        detection_scores_at_i,
        tf.gather(self._vocabulary_list,
                  tf.to_int32(detection_classes_at_i - 1)),
        name='detection_{}'.format(i))

    # `logits_at_i` for the next iteration.
    logits_at_i = tf.multiply(proba_r_at_i, logits_c_given_r)
    logits_at_i = tf.reduce_sum(logits_at_i, axis=1)

  return predictions
def _build_midn_network(self, num_proposals, proposal_feature, num_classes=20, attention_normalizer=1.0, attention_tanh=False, attention_scale_factor=5.0):
  """Builds the Multiple Instance Detection Network (MIDN), an attention net.

  Two linear branches score every proposal. The first is normalized over the
  class axis, the second over the (masked) proposal axis; the proposal score
  is their element-wise product.

  Args:
    num_proposals: A [batch] int tensor.
    proposal_feature: A [batch, max_num_proposals, feature_dims] float tensor.
    num_classes: Number of classes.
    attention_normalizer: Divisor applied to both branch outputs before the
      softmax.
    attention_tanh: If True, both branches are passed through tanh and
      multiplied by `attention_scale_factor` before the softmax.
    attention_scale_factor: Multiplier applied after tanh; only used when
      `attention_tanh` is True.

  Returns:
    proposal_scores: A [batch, max_num_proposals, num_classes] float tensor.
  """
  with tf.name_scope('multi_instance_detection'):
    _, max_num_proposals, _ = utils.get_tensor_shape(proposal_feature)

    # branch1/branch2 shape = [batch, max_num_proposals, num_classes].
    branch1 = slim.fully_connected(
        proposal_feature,
        num_outputs=num_classes,
        activation_fn=None,
        scope='midn/branch1')
    branch2 = slim.fully_connected(
        proposal_feature,
        num_outputs=num_classes,
        activation_fn=None,
        scope='midn/branch2')

    # Temperature-like scaling of the logits.
    branch1 = branch1 / attention_normalizer
    branch2 = branch2 / attention_normalizer

    # Optionally bound the logits to
    # [-attention_scale_factor, attention_scale_factor].
    if attention_tanh:
      branch1 = attention_scale_factor * tf.nn.tanh(branch1)
      branch2 = attention_scale_factor * tf.nn.tanh(branch2)

    # Class distribution for each proposal.
    proba_c_given_r = tf.nn.softmax(branch1, axis=2)

    # Proposal distribution for each class, ignoring padded proposals.
    proposal_mask = tf.sequence_mask(
        num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
    proposal_mask = tf.expand_dims(proposal_mask, axis=-1)
    proba_r_given_c = utils.masked_softmax(
        data=branch2, mask=proposal_mask, dim=1)

    proposal_scores = tf.multiply(proba_c_given_r, proba_r_given_c)

    tf.summary.histogram('midn/branch1', branch1)
    tf.summary.histogram('midn/branch2', branch2)
    tf.summary.histogram('midn/proposal_scores', proposal_scores)

  return proposal_scores