def _build_prediction(self, examples, post_process=True):
  """Builds the tf graph that predicts class labels from captions.

  The concatenated caption is embedded token by token, encoded with
  `self._text_encoding_fn`, and mapped to per-class logits by a linear layer.

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.concat_caption_string` and
      `InputDataFields.concat_caption_length`.
    post_process: not referenced in this method body.

  Returns:
    predictions: dict of prediction results keyed by name, holding the logits
      and the rounded sigmoid of the logits.
  """
  model_options = self._model_proto
  training = self._is_training

  caption_strings = examples[InputDataFields.concat_caption_string]
  caption_lengths = examples[InputDataFields.concat_caption_length]

  # Only the per-token features are needed; the first output is discarded.
  _, token_features = self._extract_text_feature(
      caption_strings,
      caption_lengths,
      vocabulary_list=self._open_vocabulary_list,
      initial_embedding=self._open_vocabulary_initial_embedding,
      embedding_dims=model_options.embedding_dims,
      trainable=model_options.train_word_embedding)

  # NOTE(review): the output of this projection is never consumed, because
  # the encoder below is fed `token_features` directly. The call is kept so
  # that the set of layers built is unchanged -- confirm whether the encoder
  # was meant to take the projected features instead.
  with slim.arg_scope(
      build_hyperparams(model_options.fc_hyperparams, training)):
    _unused_projection = slim.fully_connected(
        token_features,
        num_outputs=model_options.projected_dims,
        activation_fn=None)

  encoded_caption = self._text_encoding_fn(
      token_features, caption_lengths, is_training=training)

  with slim.arg_scope(
      build_hyperparams(model_options.fc_hyperparams, training)):
    class_logits = slim.fully_connected(
        encoded_caption,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='caption_predicting_logits')

  # Labels are the sigmoid probabilities rounded to the nearest integer.
  class_labels = tf.round(tf.nn.sigmoid(class_logits))
  return {
      OPPredictions.caption_predicting_logits: class_logits,
      OPPredictions.caption_predicting_labels: class_labels,
  }
def _calc_spp_proposal_feature(self, image_feature_cropped):
  """Calculates the SPP feature and the hidden proposal feature.

  The cropped feature map goes through `self._calc_spp_feature`, then
  through `options.hidden_layers` rounds of fully-connected + dropout.

  Args:
    image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
      float tensor.

  Returns:
    spp_feature: A [batch, spp_feature_dims] float tensor, the output of
      `self._calc_spp_feature` before any hidden layer.
    proposal_feature: A [batch, proposal_feature_dims] float tensor, the
      output of the last hidden layer (equal to `spp_feature` when
      `options.hidden_layers` is 0).
  """
  model_options = self._model_proto
  training = self._is_training

  spp_feature = self._calc_spp_feature(
      image_feature_cropped, spp_bins=list(model_options.spp_bins))

  hidden = spp_feature
  for layer_number in range(1, model_options.hidden_layers + 1):
    with slim.arg_scope(
        build_hyperparams(model_options.fc_hyperparams, training)):
      hidden = slim.fully_connected(
          hidden,
          num_outputs=model_options.hidden_units,
          scope='fc_{}'.format(layer_number))
      hidden = slim.dropout(
          hidden,
          model_options.hidden_dropout_keep_prob,
          is_training=training)
  return spp_feature, hidden
def _calc_conv_proposal_feature(self, image_feature_cropped):
  """Calculates proposal feature with one 3x3 conv and spatial averaging.

  Args:
    image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
      float tensor.

  Returns:
    proposal_feature: A float tensor obtained by averaging the 64-channel
      convolution output over axes 1 and 2.
  """
  conv_arg_scope = build_hyperparams(self._model_proto.conv_hyperparams,
                                     self._is_training)
  with slim.arg_scope(conv_arg_scope):
    with tf.variable_scope('conv_layers'):
      tf.logging.info('layer input: %s', image_feature_cropped.get_shape())
      # `slim.repeat` with a count of 1 is kept as-is rather than replaced
      # by a direct `slim.conv2d` call.
      conv_output = slim.repeat(
          image_feature_cropped,
          1,
          slim.conv2d,
          64, [3, 3],
          padding='VALID',
          scope='conv2d_3x3')
      pooled = tf.reduce_mean(conv_output, axis=[1, 2])
      tf.logging.info('layer output: %s', pooled.get_shape())
  return pooled
def _project_images(self,
                    feature_map,
                    common_dimensions=300,
                    scope="image_proj",
                    hyperparams=None,
                    is_training=False):
  """Projects image features with a linear 1x1 convolution.

  Args:
    feature_map: [batch, feature_height, feature_width, feature_depth] float
      tensor, which is the CNN output.
    common_dimensions: depth of the image embedding, i.e. the number of
      output channels of the projection.
    scope: variable scope of the projection layer.
    hyperparams: an instance of hyperparams_pb2.Hyperparams, used for the
      conv2d projection layer.
    is_training: if True, training graph is built.

  Returns:
    The projected feature map, i.e. the output of the 1x1 convolution with
    `common_dimensions` output channels and no activation function.
  """
  projection_arg_scope = build_hyperparams(hyperparams, is_training)
  with slim.arg_scope(projection_arg_scope), tf.variable_scope(scope):
    projected = tf.contrib.layers.conv2d(
        inputs=feature_map,
        num_outputs=common_dimensions,
        kernel_size=[1, 1],
        activation_fn=None)
  return projected
def _calc_conv_proposal_feature(self, image_feature_cropped):
  """Calculates proposal feature using densely concatenated conv stages.

  Each of the `options.conv_layers` stages applies a 1x1 convolution with
  dropout, concatenates the result to the stage input along the last axis,
  and max-pools with a 2x2 window and stride 2. A final 3x3 VALID convolution
  with dropout follows, after which axes 1 and 2 are squeezed away.

  Args:
    image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
      float tensor.

  Returns:
    proposal_feature: A [batch, proposal_feature_dims] float tensor.
  """
  model_options = self._model_proto
  training = self._is_training
  num_stages = model_options.conv_layers

  with slim.arg_scope(
      build_hyperparams(model_options.conv_hyperparams, training)):
    features = image_feature_cropped

    with tf.variable_scope('conv_layers'):
      for stage in range(num_stages):
        with tf.variable_scope('layer_{}'.format(stage)):
          tf.logging.info('layer input: %s', features.get_shape())
          branch = slim.conv2d(
              features,
              model_options.conv_units, [1, 1],
              stride=1,
              padding='SAME',
              scope='conv2d_1x1')
          branch = slim.dropout(
              branch,
              model_options.conv_dropout_keep_prob,
              is_training=training)
          # Dense connection: keep the stage input next to the new channels.
          features = tf.concat([features, branch], axis=-1)
          features = slim.max_pool2d(
              features, [2, 2],
              stride=2,
              padding='VALID',
              scope='maxpool_2x2')
          tf.logging.info('layer output: %s', features.get_shape())

    # The closing layer gets the index right after the last dense stage.
    with tf.variable_scope('conv_layers'):
      with tf.variable_scope('layer_{}'.format(num_stages)):
        features = slim.conv2d(
            features,
            model_options.conv_units, [3, 3],
            stride=1,
            padding='VALID',
            scope='conv2d_3x3')
        features = slim.dropout(
            features,
            model_options.conv_dropout_keep_prob,
            is_training=training)

  # NOTE(review): squeezing axes 1 and 2 presumes the final conv output is
  # spatially 1x1 -- confirm against the configured crop size.
  proposal_feature = tf.squeeze(features, [1, 2])
  tf.logging.info('proposal_feture: %s', proposal_feature)
  return proposal_feature
def build_prediction(self, examples, **kwargs):
  """Builds the tf graph that classifies a caption by masked max-pooling.

  Every caption token is mapped to `self._num_classes` scores; the scores
  are then reduced over the token axis with `utils.masked_maximum`, ignoring
  tokens whose id equals the out-of-vocabulary id.

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.concat_caption_string` and
      `InputDataFields.concat_caption_length`.
    **kwargs: not referenced in this method body.

  Returns:
    predictions: dict of prediction results keyed by name, holding the
      vocabulary constant and the [batch, num_classes] logits.
  """
  model_options = self._model_proto
  training = self._is_training

  # Per-token ids and features of the concatenated caption.
  token_ids, token_features = self._extract_text_feature(
      examples[InputDataFields.concat_caption_string],
      examples[InputDataFields.concat_caption_length],
      vocabulary_list=self._open_vocabulary_list,
      initial_embedding=self._open_vocabulary_initial_embedding,
      embedding_dims=model_options.embedding_dims,
      trainable=model_options.train_word_embedding,
      max_norm=None)

  with slim.arg_scope(
      build_hyperparams(model_options.text_fc_hyperparams, training)):
    token_scores = slim.fully_connected(
        token_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='caption')

  # The id one past the open vocabulary is treated as out-of-vocabulary and
  # masked out (mask value 0.0) before pooling.
  oov_id = len(self._open_vocabulary_list)
  is_oov = tf.equal(token_ids, oov_id)
  token_mask = tf.to_float(tf.logical_not(is_oov))

  # Pool over the token axis, then drop it: shape = [batch, num_classes].
  pooled_scores = utils.masked_maximum(
      data=token_scores, mask=tf.expand_dims(token_mask, axis=-1), dim=1)
  class_logits = tf.squeeze(pooled_scores, axis=1)

  return {
      TextClassificationPredictions.vocab: tf.constant(self._vocabulary_list),
      TextClassificationPredictions.logits: class_logits,
  }
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction under the model-wide hyperparameters.

  Outside of training, the L2 regularizer weight stored in the model proto
  is set to 0 in place before the hyperparameters are built.

  Args:
    examples: dict of input tensors keyed by name.
    **kwargs: not referenced in this method body.

  Returns:
    predictions: dict of prediction results keyed by name, as returned by
      `self._build_prediction`.
  """
  training = self._is_training
  hyperparams = self._model_proto.hyperparams

  if not training:
    # This mutates the proto held by the model, not a copy.
    hyperparams.regularizer.l2_regularizer.weight = 0

  with slim.arg_scope(build_hyperparams(hyperparams, training)):
    predictions = self._build_prediction(examples)
  return predictions
def _calc_spp_proposal_feature(self, image_feature_cropped):
  """Calculates proposal feature using spp followed by hidden layers.

  The cropped feature map goes through `self._calc_spp_feature`, then
  through `options.hidden_layers` rounds of fully-connected + dropout.

  Args:
    image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
      float tensor.

  Returns:
    proposal_feature: A [batch, proposal_feature_dims] float tensor.
  """
  model_options = self._model_proto
  training = self._is_training

  hidden = self._calc_spp_feature(
      image_feature_cropped,
      spp_bins=list(model_options.spp_bins),
      max_pool=model_options.spp_max_pool)

  fc_arg_scope = build_hyperparams(model_options.fc_hyperparams, training)
  with slim.arg_scope(fc_arg_scope):
    for layer_number in range(1, model_options.hidden_layers + 1):
      hidden = slim.fully_connected(
          hidden,
          num_outputs=model_options.hidden_units,
          scope='hidden/fc_{}'.format(layer_number))
      hidden = slim.dropout(
          hidden,
          model_options.hidden_dropout_keep_prob,
          is_training=training)
  return hidden
def _calc_saliency_score(self,
                         inputs,
                         scope,
                         hyperparams=None,
                         is_training=False):
  """Calculates saliency score with a single-output linear layer.

  Args:
    inputs: input feature, a [..., feature_dimensions] float tensor.
    scope: variable scope of the fully-connected layer.
    hyperparams: an instance of hyperparams_pb2.Hyperparams, used for the
      fully-connected layer.
    is_training: if True, build training graph.

  Returns:
    saliency_score: saliency score, a float tensor with the feature dimension
      removed: the layer emits one value per position and the trailing
      size-1 axis is squeezed away.
  """
  with slim.arg_scope(build_hyperparams(hyperparams, is_training)):
    score_with_unit_axis = tf.contrib.layers.fully_connected(
        inputs, num_outputs=1, activation_fn=None, scope=scope)
    saliency_score = tf.squeeze(score_with_unit_axis, axis=-1)
  return saliency_score
def _project_image(self,
                   image_feature,
                   num_outputs=20,
                   kernel_size=1,
                   hyperparams=None,
                   is_training=False):
  """Adds a linear conv layer (1x1 by default) to project image features.

  Args:
    image_feature: [batch, feature_height, feature_width, feature_depth]
      float tensor, which is the CNN output.
    num_outputs: number of output channels.
    kernel_size: side length of the square convolution kernel.
    hyperparams: an instance of hyperparams_pb2.Hyperparams, used for the
      conv2d projection layer.
    is_training: if True, training graph is built.

  Returns:
    The output of the convolution, built without an activation function.
  """
  square_kernel = [kernel_size, kernel_size]
  projection_arg_scope = build_hyperparams(hyperparams, is_training)
  with slim.arg_scope(projection_arg_scope):
    projected = tf.contrib.layers.conv2d(
        inputs=image_feature,
        num_outputs=num_outputs,
        kernel_size=square_kernel,
        activation_fn=None)
  return projected
def _build_object_prediction_network(self, texts, text_lengths,
                                     open_vocabulary_list, embedding_dims):
  """Builds tf graph for predicting object labels from captions.

  Args:
    texts: A [batch, max_text_length] string tensor.
    text_lengths: A [batch] int tensor.
    open_vocabulary_list: A list of words.
    embedding_dims: Value forwarded to `self._extract_text_feature` as its
      `embedding_dims` argument.

  Returns:
    predicted_logits: A [batch, num_classes] float tensor.
  """
  model_options = self._model_proto
  training = self._is_training

  # Only the per-token features are needed; the first output is discarded.
  _, token_features = self._extract_text_feature(
      texts,
      text_lengths,
      vocabulary_list=open_vocabulary_list,
      initial_embedding=self._open_vocabulary_initial_embedding,
      embedding_dims=embedding_dims,
      trainable=model_options.train_word_embedding)

  encoded_text = self._text_encoding_fn(
      token_features, text_lengths, is_training=training)

  fc_arg_scope = build_hyperparams(model_options.fc_hyperparams, training)
  with slim.arg_scope(fc_arg_scope):
    predicted_logits = slim.fully_connected(
        encoded_text,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope="predicted_logits")

  tf.summary.histogram("object_prediction/predicted_logits", predicted_logits)
  return predicted_logits
def build_prediction(self,
                     examples,
                     prediction_task=OICRTasks.image_label,
                     **kwargs):
  """Builds tf graph for prediction (MIPN proposal ranking + MIDN + OICR).

  The image is run through the feature extractor, the MIPN network re-ranks
  the proposals, the top `options.mipn_max_num_proposals` of them are cropped
  from the first-stage feature map and turned into per-proposal features,
  which feed the MIDN network and `options.oicr_iterations` OICR heads.
  Detections are post-processed and visualized for every stage.

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.image`, `InputDataFields.num_proposals` and
      `InputDataFields.proposals`.
    prediction_task: the specific prediction task. Not referenced in this
      method body.
    **kwargs: not referenced in this method body.

  Returns:
    predictions: dict of prediction results keyed by name. Holds the
      proposal count and boxes, the MIDN and MIPN outputs, the OICR proposal
      scores suffixed `_at_{i}` for i >= 1, and the detection fields suffixed
      `_at_{i}` for i >= 0 (0 being the MIDN stage).
  """
  options = self._model_proto
  is_training = self._is_training

  (inputs, num_proposals,
   proposals) = (examples[InputDataFields.image],
                 examples[InputDataFields.num_proposals],
                 examples[InputDataFields.proposals])

  # Extract `features_to_crop` from the original image.
  # shape = [batch, feature_height, feature_width, feature_depth].

  preprocessed_inputs = self._feature_extractor.preprocess(inputs)

  (features_to_crop, _) = self._feature_extractor.extract_proposal_features(
      preprocessed_inputs, scope='first_stage_feature_extraction')

  # NOTE(review): the second-stage extractor is invoked here on the whole
  # feature map and again below on the cropped regions, both times under the
  # same scope -- presumably the weights are shared; confirm in the
  # feature extractor.
  (mipn_feature_map
  ) = self._feature_extractor.extract_box_classifier_features(
      features_to_crop, scope='second_stage_feature_extraction')

  with slim.arg_scope(
      build_hyperparams(options.conv_hyperparams, is_training)):
    (mipn_logits, mipn_num_proposals, mipn_proposals, mipn_proposal_scores,
     class_activation_map) = self._build_mipn_network(
         mipn_feature_map,
         num_proposals,
         proposals,
         kernel_size=options.mipn_conv_kernel_size,
         pooling=options.mipn_pooling)

  # Visualize the activation map plus the proposals before and after MIPN.
  self._visl_class_activation_map(inputs, class_activation_map)
  self._visl_proposals(
      inputs, num_proposals, proposals, name='proposals', top_k=200)
  self._visl_proposals(
      inputs,
      mipn_num_proposals,
      mipn_proposals,
      name='proposals_mipn',
      top_k=200)

  # Substitute the input proposals with the top-ranked MIPN proposals.

  num_proposals = tf.minimum(mipn_num_proposals,
                             options.mipn_max_num_proposals)
  proposals = mipn_proposals[:, :options.mipn_max_num_proposals, :]

  batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)

  # Crop `flattened_proposal_features_maps`.
  # shape = [batch*max_num_proposals, crop_size, crop_size, feature_depth].

  # One batch index per proposal, flattened in the same order as the boxes.
  box_ind = tf.expand_dims(tf.range(batch), axis=-1)
  box_ind = tf.tile(box_ind, [1, max_num_proposals])

  cropped_regions = tf.image.crop_and_resize(
      features_to_crop,
      boxes=tf.reshape(proposals, [-1, 4]),
      box_ind=tf.reshape(box_ind, [-1]),
      crop_size=[options.initial_crop_size, options.initial_crop_size])

  flattened_proposal_features_maps = slim.max_pool2d(
      cropped_regions,
      [options.maxpool_kernel_size, options.maxpool_kernel_size],
      stride=options.maxpool_stride)

  # Extract `proposal_features`,
  # shape = [batch, max_num_proposals, feature_dims].

  (box_classifier_features
  ) = self._feature_extractor.extract_box_classifier_features(
      flattened_proposal_features_maps,
      scope='second_stage_feature_extraction')

  # Average over the two spatial axes, then apply dropout.
  flattened_roi_pooled_features = tf.reduce_mean(
      box_classifier_features, [1, 2], name='AvgPool')
  flattened_roi_pooled_features = slim.dropout(
      flattened_roi_pooled_features,
      keep_prob=options.dropout_keep_prob,
      is_training=is_training)

  # Restore the [batch, max_num_proposals] leading axes.
  proposal_features = tf.reshape(flattened_roi_pooled_features,
                                 [batch, max_num_proposals, -1])

  # Assign weights from pre-trained checkpoint.

  tf.train.init_from_checkpoint(
      options.checkpoint_path,
      assignment_map={"/": "first_stage_feature_extraction/"})
  tf.train.init_from_checkpoint(
      options.checkpoint_path,
      assignment_map={"/": "second_stage_feature_extraction/"})

  # Build MIDN network.
  #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

  with slim.arg_scope(
      build_hyperparams(options.fc_hyperparams, is_training)):
    midn_logits, proba_r_given_c = self._build_midn_network(
        num_proposals, proposal_features, num_classes=self._num_classes)

  # Build the OICR network.
  #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
  #   See `Multiple Instance Detection Network with OICR`.

  oicr_proposal_scores_list = []
  with slim.arg_scope(
      build_hyperparams(options.fc_hyperparams, is_training)):
    with tf.name_scope('online_instance_classifier_refinement'):
      for i in range(options.oicr_iterations):
        oicr_proposal_scores_at_i = slim.fully_connected(
            proposal_features,
            num_outputs=1 + self._num_classes,
            activation_fn=None,
            scope='oicr/iter{}'.format(i + 1))
        oicr_proposal_scores_list.append(oicr_proposal_scores_at_i)

  predictions = {
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
      OICRPredictions.midn_logits: midn_logits,
      OICRPredictions.midn_proba_r_given_c: proba_r_given_c,
      OICRPredictions.mipn_logits: mipn_logits,
  }

  # Post process to get the final detections.

  # Stage 0 scores: softmax of the MIDN logits, broadcast over the proposal
  # axis and multiplied by `proba_r_given_c`.
  midn_proposal_scores = tf.multiply(
      tf.expand_dims(tf.nn.softmax(midn_logits), axis=1), proba_r_given_c)

  (predictions[DetectionResultFields.num_detections + '_at_{}'.format(0)],
   predictions[DetectionResultFields.detection_boxes + '_at_{}'.format(0)],
   predictions[DetectionResultFields.detection_scores + '_at_{}'.format(0)],
   predictions[DetectionResultFields.detection_classes +
               '_at_{}'.format(0)]) = self._post_process(
                   proposals, midn_proposal_scores)

  for i, oicr_proposal_scores_at_i in enumerate(oicr_proposal_scores_list):
    predictions[OICRPredictions.oicr_proposal_scores +
                '_at_{}'.format(i + 1)] = oicr_proposal_scores_at_i

    # The slice drops the first of the `1 + num_classes` channels
    # (presumably the background class -- TODO confirm).
    (predictions[DetectionResultFields.num_detections +
                 '_at_{}'.format(i + 1)],
     predictions[DetectionResultFields.detection_boxes +
                 '_at_{}'.format(i + 1)],
     predictions[DetectionResultFields.detection_scores +
                 '_at_{}'.format(i + 1)],
     predictions[DetectionResultFields.detection_classes +
                 '_at_{}'.format(i + 1)]) = self._post_process(
                     proposals,
                     tf.nn.softmax(oicr_proposal_scores_at_i,
                                   axis=-1)[:, :, 1:])

  # Visualize the detections of every stage (MIDN first, then each OICR
  # iteration).
  for i in range(1 + options.oicr_iterations):
    num_detections, detection_boxes, detection_scores, detection_classes = (
        predictions[DetectionResultFields.num_detections +
                    '_at_{}'.format(i)],
        predictions[DetectionResultFields.detection_boxes +
                    '_at_{}'.format(i)],
        predictions[DetectionResultFields.detection_scores +
                    '_at_{}'.format(i)],
        predictions[DetectionResultFields.detection_classes +
                    '_at_{}'.format(i)])
    # NOTE(review): `detection_classes - 1` assumes class ids are 1-based
    # indices into the vocabulary -- confirm against `_post_process`.
    self._visl_proposals_top_k(
        inputs,
        num_detections,
        detection_boxes,
        detection_scores,
        tf.gather(self._vocabulary_list, tf.to_int32(detection_classes - 1)),
        name='detection_{}'.format(i))

  return predictions
def _build_prediction(self, examples, post_process=True):
  """Builds the tf graph for prediction (FRCNN features + MIDN + OICR).

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.image`, `InputDataFields.num_proposals` and
      `InputDataFields.proposals`.
    post_process: if True, the output of `self._post_process` is merged into
      the returned dict.

  Returns:
    predictions: dict of prediction results keyed by name.

  Raises:
    ValueError: if `options.attention_type` is not `PER_CLASS`.
  """
  model_options = self._model_proto
  training = self._is_training

  inputs = examples[InputDataFields.image]
  num_proposals = examples[InputDataFields.num_proposals]
  proposals = examples[InputDataFields.proposals]

  tf.summary.image('inputs', inputs, max_outputs=10)
  model_utils.visl_proposals(
      inputs, num_proposals, proposals, name='proposals', top_k=100)

  # FRCNN features, one vector per proposal.
  proposal_features = self._extract_frcnn_feature(inputs, num_proposals,
                                                  proposals)

  # MIDN network; only the per-class attention variant is supported.
  with slim.arg_scope(
      build_hyperparams(model_options.fc_hyperparams, training)):
    if model_options.attention_type != nod_model_pb2.NODModel.PER_CLASS:
      raise ValueError('Invalid attention type.')
    midn_class_scores, midn_proposal_scores = self._build_midn_network(
        num_proposals, proposal_features, num_classes=self._num_classes)

  # The MIDN proposal scores are stored as OICR iteration 0.
  predictions = {
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
      NODPredictions.midn_class_scores: midn_class_scores,
      NODPredictions.oicr_proposal_scores + '_at_0': midn_proposal_scores,
  }

  # OICR heads: one linear layer per iteration over the proposal features,
  # each emitting `1 + num_classes` scores per proposal.
  with slim.arg_scope(
      build_hyperparams(model_options.fc_hyperparams, training)):
    for iteration in range(1, model_options.oicr_iterations + 1):
      scores_key = (
          NODPredictions.oicr_proposal_scores + '_at_{}'.format(iteration))
      predictions[scores_key] = slim.fully_connected(
          proposal_features,
          num_outputs=1 + self._num_classes,
          activation_fn=None,
          scope='oicr/iter{}'.format(iteration))

  if post_process:
    final_fields = self._post_process(inputs, predictions)
    predictions.update(final_fields)

  return predictions
def build_prediction(self,
                     examples,
                     prediction_task=OICRTasks.image_label,
                     **kwargs):
  """Builds tf graph for prediction (dilated VGG16 + MIDN + OICR).

  The image is run through a dilated VGG16, every proposal is cropped from
  the resulting feature map and turned into a proposal feature by the
  configured extractor, and the features feed the MIDN network and
  `options.oicr_iterations` OICR heads. Detections are post-processed and
  visualized for every stage.

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.image`, `InputDataFields.num_proposals`,
      `InputDataFields.proposals` and `InputDataFields.caption_strings`.
    prediction_task: the specific prediction task. Not referenced in this
      method body.
    **kwargs: not referenced in this method body.

  Returns:
    predictions: dict of prediction results keyed by name. Holds the
      proposal count and boxes, the MIDN outputs, the OICR proposal scores
      suffixed `_at_{i}` for i >= 1, and the detection fields suffixed
      `_at_{i}` for i >= 0 (0 being the MIDN stage).

  Raises:
    ValueError: if `options.feature_extractor` is not SPP, VGG or CONV.
  """
  options = self._model_proto
  is_training = self._is_training

  (image, num_proposals,
   proposals) = (examples[InputDataFields.image],
                 examples[InputDataFields.num_proposals],
                 examples[InputDataFields.proposals])

  # Use the CNN to extract feature.
  #   image_feature shape=[batch, feature_height, feature_width, feature_dims]

  image_feature = model_utils.dilated_vgg16_conv(
      image, options.cnn, is_training=is_training)

  # Crop image feature from the CNN output.
  #   image_feature_cropped_and_flattened
  #   shape=[batch*max_num_proposals, crop_size, crop_size, feature_dims]

  batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)
  # One batch index per proposal, flattened in the same order as the boxes.
  box_ind = tf.expand_dims(tf.range(batch), axis=-1)
  box_ind = tf.tile(box_ind, [1, max_num_proposals])

  crop_size = options.feature_crop_size
  image_feature_cropped = tf.image.crop_and_resize(
      image_feature,
      boxes=tf.reshape(proposals, [-1, 4]),
      box_ind=tf.reshape(box_ind, [-1]),
      crop_size=[crop_size, crop_size],
      method='bilinear')

  # Get the multi-resolutional feature.
  #   proposal_feature shape=[batch, max_num_proposals, hidden_units].

  # Only the SPP extractor yields a separate `spp_feature`; for VGG and CONV
  # both names refer to the same tensor.
  if options.feature_extractor == oicr_model_pb2.OICRModel.SPP:
    spp_feature, proposal_feature = self._calc_spp_proposal_feature(
        image_feature_cropped)
  elif options.feature_extractor == oicr_model_pb2.OICRModel.VGG:
    spp_feature = proposal_feature = self._calc_vgg_proposal_feature(
        image_feature_cropped)
  elif options.feature_extractor == oicr_model_pb2.OICRModel.CONV:
    spp_feature = proposal_feature = self._calc_conv_proposal_feature(
        image_feature_cropped)
  else:
    raise ValueError('Invalid feature extractor')

  # Restore the [batch, max_num_proposals] leading axes.
  spp_feature = tf.reshape(spp_feature, [batch, max_num_proposals, -1])
  proposal_feature = tf.reshape(proposal_feature,
                                [batch, max_num_proposals, -1])

  tf.summary.histogram('midn/proposal_feature', proposal_feature)

  # Build the MIDN network.
  #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].
  #   See `Multiple Instance Detection Network with OICR`.

  with slim.arg_scope(
      build_hyperparams(options.fc_hyperparams, is_training)):
    # The second argument is the SPP feature or the proposal feature,
    # depending on `options.use_spp_to_calc_logits`.
    midn_logits, proba_r_given_c = self._build_midn_network(
        num_proposals,
        spp_feature if options.use_spp_to_calc_logits else proposal_feature,
        proposal_feature,
        num_classes=self._num_classes,
        attention_normalizer=options.attention_normalizer,
        attention_tanh=options.attention_tanh,
        attention_scale_factor=options.attention_scale_factor)

  # Build the OICR network.
  #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
  #   See `Multiple Instance Detection Network with OICR`.

  oicr_proposal_scores_list = []
  with slim.arg_scope(
      build_hyperparams(options.fc_hyperparams, is_training)):
    with tf.name_scope('online_instance_classifier_refinement'):
      for i in range(options.oicr_iterations):
        oicr_proposal_scores_at_i = slim.fully_connected(
            proposal_feature,
            num_outputs=1 + self._num_classes,
            activation_fn=None,
            scope='oicr/iter{}'.format(i + 1))
        oicr_proposal_scores_list.append(oicr_proposal_scores_at_i)

  predictions = {
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
      OICRPredictions.midn_proba_r_given_c: proba_r_given_c,
      OICRPredictions.midn_logits: midn_logits,
  }

  # Post process to get the final detections.

  # Stage 0 scores: `proba_r_given_c` multiplied by the class labels
  # extracted from the captions, broadcast over the proposal axis.
  # NOTE(review): this consumes caption-derived labels while building
  # predictions -- confirm that captions are available wherever this graph
  # is evaluated.
  labels = self._extract_class_label(
      class_texts=examples[InputDataFields.caption_strings],
      vocabulary_list=self._vocabulary_list)
  midn_proposal_scores = tf.multiply(proba_r_given_c,
                                     tf.expand_dims(labels, axis=1))

  (predictions[DetectionResultFields.num_detections + '_at_{}'.format(0)],
   predictions[DetectionResultFields.detection_boxes + '_at_{}'.format(0)],
   predictions[DetectionResultFields.detection_scores + '_at_{}'.format(0)],
   predictions[DetectionResultFields.detection_classes +
               '_at_{}'.format(0)]) = self._post_process(
                   proposals, midn_proposal_scores)

  for i, oicr_proposal_scores_at_i in enumerate(oicr_proposal_scores_list):
    predictions[OICRPredictions.oicr_proposal_scores +
                '_at_{}'.format(i + 1)] = oicr_proposal_scores_at_i

    # The slice drops the first of the `1 + num_classes` channels
    # (presumably the background class -- TODO confirm).
    (predictions[DetectionResultFields.num_detections +
                 '_at_{}'.format(i + 1)],
     predictions[DetectionResultFields.detection_boxes +
                 '_at_{}'.format(i + 1)],
     predictions[DetectionResultFields.detection_scores +
                 '_at_{}'.format(i + 1)],
     predictions[DetectionResultFields.detection_classes +
                 '_at_{}'.format(i + 1)]) = self._post_process(
                     proposals,
                     tf.nn.softmax(oicr_proposal_scores_at_i,
                                   axis=-1)[:, :, 1:])

  self._visl_proposals(
      image, num_proposals, proposals, name='proposals', top_k=2000)

  # Visualize the detections of every stage (MIDN first, then each OICR
  # iteration).
  for i in range(1 + options.oicr_iterations):
    num_detections, detection_boxes, detection_scores, detection_classes = (
        predictions[DetectionResultFields.num_detections +
                    '_at_{}'.format(i)],
        predictions[DetectionResultFields.detection_boxes +
                    '_at_{}'.format(i)],
        predictions[DetectionResultFields.detection_scores +
                    '_at_{}'.format(i)],
        predictions[DetectionResultFields.detection_classes +
                    '_at_{}'.format(i)])
    # NOTE(review): `detection_classes - 1` assumes class ids are 1-based
    # indices into the vocabulary -- confirm against `_post_process`.
    self._visl_proposals_top_k(
        image,
        num_detections,
        detection_boxes,
        detection_scores,
        tf.gather(self._vocabulary_list, tf.to_int32(detection_classes - 1)),
        name='detection_{}'.format(i))

  return predictions
def _build_prediction(self, examples):
  """Builds the tf graph for prediction (Fast-RCNN features + MIDN + OICR).

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.image`, `InputDataFields.num_proposals` and
      `InputDataFields.proposals`.

  Returns:
    predictions: dict of prediction results keyed by name. Holds the class
      labels, the proposal count and boxes, the MIDN class logits and
      `proba_r_given_c`, and the OICR proposal scores suffixed `_at_{i}`,
      where `_at_0` is the MIDN proposal score and `_at_1` onwards are the
      OICR iterations.
  """
  predictions = {}

  options = self._model_proto
  is_training = self._is_training

  (inputs, num_proposals,
   proposals) = (examples[InputDataFields.image],
                 examples[InputDataFields.num_proposals],
                 examples[InputDataFields.proposals])

  # Fast-RCNN.

  proposal_features = model_utils.extract_frcnn_feature(
      inputs, num_proposals, proposals, options.frcnn_options, is_training)

  # Read once; every head below is sized from the same class count.
  num_classes = self._label_extractor.num_classes

  # Build MIDN network.
  #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

  with slim.arg_scope(
      build_hyperparams(options.fc_hyperparams, is_training)):
    (midn_class_logits, midn_proposal_scores,
     midn_proba_r_given_c) = self._build_midn_network(
         num_proposals, proposal_features, num_classes=num_classes)

  # Build the OICR network.
  #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
  #   See `Multiple Instance Detection Network with OICR`.

  with slim.arg_scope(
      build_hyperparams(options.fc_hyperparams, is_training)):
    for i in range(options.oicr_iterations):
      # The scores are stored straight into `predictions`; the original
      # also bound them to a local that was never read.
      predictions[Cap2DetPredictions.oicr_proposal_scores +
                  '_at_{}'.format(i + 1)] = slim.fully_connected(
                      proposal_features,
                      num_outputs=1 + num_classes,
                      activation_fn=None,
                      scope='oicr/iter{}'.format(i + 1))

  # Set the predictions.

  predictions.update({
      DetectionResultFields.class_labels:
          tf.constant(self._label_extractor.classes),
      DetectionResultFields.num_proposals:
          num_proposals,
      DetectionResultFields.proposal_boxes:
          proposals,
      Cap2DetPredictions.midn_class_logits:
          midn_class_logits,
      Cap2DetPredictions.midn_proba_r_given_c:
          midn_proba_r_given_c,
      Cap2DetPredictions.oicr_proposal_scores + '_at_0':
          midn_proposal_scores
  })

  return predictions
def _build_prediction(self, examples, post_process=True):
  """Builds the tf graph for prediction (FRCNN features + OICR + MIDN).

  The OICR heads are built first. When post-processing is requested and
  `pcl_preprocess` is configured, the last OICR head's scores are used to
  re-select proposals (and their features) before the MIDN network is built.

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.image`, `InputDataFields.num_proposals` and
      `InputDataFields.proposals`.
    post_process: if True, proposals may be preprocessed as described above
      and the output of `self._post_process` is merged into the result.

  Returns:
    predictions: dict of prediction results keyed by name.

  Raises:
    ValueError: if `options.attention_type` is neither `PER_CLASS` nor
      `PER_CLASS_TANH`.
  """
  model_options = self._model_proto
  training = self._is_training

  inputs = examples[InputDataFields.image]
  num_proposals = examples[InputDataFields.num_proposals]
  proposals = examples[InputDataFields.proposals]

  tf.summary.image('inputs', inputs, max_outputs=10)
  model_utils.visl_proposals(
      inputs, num_proposals, proposals, name='proposals', top_k=100)

  # FRCNN features, one vector per proposal.
  proposal_features = self._extract_frcnn_feature(inputs, num_proposals,
                                                  proposals)

  # OICR heads: one linear layer per iteration over the proposal features,
  # each emitting `1 + num_classes` scores per proposal.
  predictions = {}
  with slim.arg_scope(
      build_hyperparams(model_options.fc_hyperparams, training)):
    for iteration in range(1, model_options.oicr_iterations + 1):
      proposal_scores = slim.fully_connected(
          proposal_features,
          num_outputs=1 + self._num_classes,
          activation_fn=None,
          scope='oicr/iter{}'.format(iteration))
      scores_key = (
          NOD2Predictions.oicr_proposal_scores + '_at_{}'.format(iteration))
      predictions[scores_key] = proposal_scores

  if post_process and model_options.HasField('pcl_preprocess'):
    # NOTE(review): `proposal_scores` is only bound when
    # `oicr_iterations >= 1`; with zero iterations this branch fails.
    # Gradients are blocked, and the slice drops the first of the
    # `1 + num_classes` channels (presumably background -- TODO confirm).
    detached_scores = tf.stop_gradient(proposal_scores)
    proposal_scores = tf.nn.softmax(detached_scores, axis=-1)[:, :, 1:]
    preprocessed = self._pcl_preprocess_fn(
        proposals, proposal_scores, {'proposal_features': proposal_features})
    num_proposals, proposals, _, _, additional_fields = preprocessed
    proposal_features = additional_fields['proposal_features']

  # MIDN network.
  #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].
  with slim.arg_scope(
      build_hyperparams(model_options.fc_hyperparams, training)):
    attention_type = model_options.attention_type
    if attention_type == nod2_model_pb2.NOD2Model.PER_CLASS:
      build_midn = self._build_midn_network
    elif attention_type == nod2_model_pb2.NOD2Model.PER_CLASS_TANH:
      build_midn = self._build_midn_network_tanh
    else:
      raise ValueError('Invalid attention type.')

    (midn_class_logits, midn_proposal_scores,
     midn_proba_r_given_c) = build_midn(
         num_proposals, proposal_features, num_classes=self._num_classes)

  # The MIDN proposal scores are stored as OICR iteration 0.
  midn_fields = {
      DetectionResultFields.class_labels: tf.constant(self._vocabulary_list),
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
      NOD2Predictions.midn_class_logits: midn_class_logits,
      NOD2Predictions.midn_proba_r_given_c: midn_proba_r_given_c,
      NOD2Predictions.oicr_proposal_scores + '_at_0': midn_proposal_scores,
  }
  predictions.update(midn_fields)

  if post_process:
    final_fields = self._post_process(inputs, predictions)
    predictions.update(final_fields)

  return predictions
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction (image / caption similarity).

  Images and captions are projected to `options.shared_dims` dimensions and
  compared with `model_utils.calc_pairwise_similarity`. The projected
  embedding of every open-vocabulary token is additionally written into a
  `weights_proj` variable through an assign op registered in `UPDATE_OPS`.

  Args:
    examples: dict of input tensors keyed by name. This method reads
      `InputDataFields.image`, `InputDataFields.image_id`,
      `InputDataFields.num_captions`, `InputDataFields.caption_strings` and
      `InputDataFields.caption_lengths`.
    **kwargs: not referenced in this method body.

  Returns:
    predictions: dict of prediction results keyed by name, holding the image
      ids, the gathered image ids, the similarity and the assign op of the
      projected token embeddings.

  Raises:
    ValueError: if `options.text_feature_extractor` is neither GAP nor ATT.
  """
  options = self._model_proto
  is_training = self._is_training

  # Image CNN features.

  inputs = examples[InputDataFields.image]
  image_features = model_utils.calc_cnn_feature(
      inputs, options.cnn_options, is_training=is_training)

  with slim.arg_scope(
      build_hyperparams(options.image_fc_hyperparams, is_training)):
    image_features = slim.fully_connected(
        image_features,
        num_outputs=options.shared_dims,
        activation_fn=None,
        scope='image')

  # Text Global-Average-Pooling features.

  (image_id, num_captions, caption_strings,
   caption_lengths) = (examples[InputDataFields.image_id],
                       examples[InputDataFields.num_captions],
                       examples[InputDataFields.caption_strings],
                       examples[InputDataFields.caption_lengths])
  # Image ids arrive as strings and are parsed to int64.
  image_id = tf.string_to_number(image_id, out_type=tf.int64)

  (image_ids_gathered, caption_strings_gathered,
   caption_lengths_gathered) = model_utils.gather_in_batch_captions(
       image_id, num_captions, caption_strings, caption_lengths)

  (caption_token_ids_gathered,
   caption_features_gathered) = self._extract_text_feature(
       caption_strings_gathered,
       caption_lengths_gathered,
       vocabulary_list=self._open_vocabulary_list,
       initial_embedding=self._open_vocabulary_initial_embedding,
       embedding_dims=options.embedding_dims,
       trainable=options.train_word_embedding,
       max_norm=None)

  with slim.arg_scope(
      build_hyperparams(options.text_fc_hyperparams, is_training)):
    # The attention logits are only built for the ATT extractor and are
    # computed from the features before the `caption` projection.
    if visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
      attn = slim.fully_connected(
          caption_features_gathered,
          num_outputs=1,
          activation_fn=None,
          scope='caption_attn')
      attn = tf.squeeze(attn, axis=-1)
    caption_features_gathered = slim.fully_connected(
        caption_features_gathered,
        num_outputs=options.shared_dims,
        activation_fn=None,
        scope='caption')

  # Tokens whose id equals the open-vocabulary size are masked out
  # (mask value 0.0).
  oov = len(self._open_vocabulary_list)
  caption_masks_gathered = tf.logical_not(
      tf.equal(caption_token_ids_gathered, oov))
  caption_masks_gathered = tf.to_float(caption_masks_gathered)

  if visual_w2v_model_pb2.VisualW2vModel.GAP == options.text_feature_extractor:
    # Masked average over the token axis, which is then squeezed away.
    caption_features_gathered = utils.masked_avg_nd(
        data=caption_features_gathered, mask=caption_masks_gathered, dim=1)
    caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
  elif visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
    # Attention-weighted masked sum over the token axis.
    attn = utils.masked_softmax(attn, mask=caption_masks_gathered, dim=-1)
    caption_features_gathered = tf.multiply(
        tf.expand_dims(attn, axis=-1), caption_features_gathered)
    caption_features_gathered = utils.masked_sum_nd(
        caption_features_gathered, mask=caption_masks_gathered, dim=1)
    caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
  else:
    raise ValueError('Invalid text feature extractor.')

  # Export token embeddings.

  # The current variable scope is re-entered with `reuse=True` and the same
  # `caption` layer scope as above is used for the projection.
  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    _, token_embeddings = self._encode_tokens(
        tokens=tf.constant(self._open_vocabulary_list),
        embedding_dims=options.embedding_dims,
        vocabulary_list=self._open_vocabulary_list,
        initial_embedding=self._open_vocabulary_initial_embedding,
        trainable=options.train_word_embedding)
    with slim.arg_scope(
        build_hyperparams(options.text_fc_hyperparams, is_training)):
      token_embeddings = slim.fully_connected(
          token_embeddings,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='caption')

  # `weights_proj` is obtained outside the reuse scope above.
  var_to_assign = tf.get_variable(
      name='weights_proj',
      shape=[len(self._open_vocabulary_list), options.shared_dims])
  # From here on `var_to_assign` refers to the result of `tf.assign`, which
  # is also what ends up in the predictions dict.
  var_to_assign = tf.assign(var_to_assign, token_embeddings)
  tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, var_to_assign)

  tf.summary.histogram('token_embedding_proj', token_embeddings)

  # Compute similarity.

  similarity = model_utils.calc_pairwise_similarity(
      feature_a=image_features,
      feature_b=caption_features_gathered,
      l2_normalize=True,
      dropout_keep_prob=options.cross_modal_dropout_keep_prob,
      is_training=is_training)

  predictions = {
      VisualW2vPredictions.image_id: image_id,
      VisualW2vPredictions.image_ids_gathered: image_ids_gathered,
      VisualW2vPredictions.similarity: similarity,
      VisualW2vPredictions.word2vec: var_to_assign,
  }
  return predictions
def _build_prediction(self, examples, post_process=True):
  """Builds tf graph for prediction.

  Args:
    examples: dict of input tensors keyed by name.
    post_process: if True, apply the optional `pcl_preprocess` step (when the
      model proto sets that field) and merge the output of
      `self._post_process` into the returned predictions.

  Returns:
    predictions: dict of prediction results keyed by name.

  Raises:
    ValueError: if `options.attention_type` is not `PER_CLASS`, or if
      `pcl_preprocess` is requested while no OICR iteration was built.
  """
  options = self._model_proto
  is_training = self._is_training

  # Gather image and proposals.
  inputs = examples[InputDataFields.image]
  num_proposals = examples[InputDataFields.num_proposals]
  proposals = examples[InputDataFields.proposals]

  tf.summary.image('inputs', inputs, max_outputs=10)
  model_utils.visl_proposals(
      inputs, num_proposals, proposals, name='proposals', top_k=100)

  # Gather in-batch captions.
  image_id = examples[InputDataFields.image_id]
  num_captions = examples[InputDataFields.num_captions]
  caption_strings = examples[InputDataFields.caption_strings]
  caption_lengths = examples[InputDataFields.caption_lengths]
  image_id = tf.string_to_number(image_id, out_type=tf.int64)

  # Select one caption per example: pair each example index with a random
  # caption index, taken modulo that example's `num_captions`.
  batch = utils.get_tensor_shape(image_id)[0]
  caption_indices0 = tf.range(batch, dtype=tf.int32)
  caption_indices1 = tf.mod(
      tf.random_uniform([batch], maxval=9999, dtype=tf.int32), num_captions)
  caption_indices = tf.stack([caption_indices0, caption_indices1], axis=-1)
  caption_strings = tf.gather_nd(caption_strings, caption_indices)
  caption_lengths = tf.gather_nd(caption_lengths, caption_indices)

  # Word embedding
  caption_features = self._extract_text_feature(
      caption_strings,
      caption_lengths,
      vocabulary_list=self._open_vocabulary_list,
      embedding_dims=options.embedding_dims,
      trainable=options.train_word_embedding)

  # FRCNN.
  proposal_features = self._extract_frcnn_feature(inputs, num_proposals,
                                                  proposals)

  # Build the OICR network.
  #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
  #   See `Multiple Instance Detection Network with OICR`.
  predictions = {}
  # Stays None when `oicr_iterations` is 0, so the branch below can report a
  # clear error instead of failing on an unbound name.
  proposal_scores = None
  with slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):
    for i in range(options.oicr_iterations):
      proposal_scores = slim.fully_connected(
          proposal_features,
          num_outputs=1 + self._num_classes,
          activation_fn=None,
          scope='oicr/iter{}'.format(i + 1))
      predictions[NOD3Predictions.oicr_proposal_scores +
                  '_at_{}'.format(i + 1)] = proposal_scores

  if post_process and options.HasField('pcl_preprocess'):
    if proposal_scores is None:
      raise ValueError(
          'pcl_preprocess requires at least one OICR iteration.')
    # Use the scores of the last OICR iteration; gradients are stopped and
    # the first (index 0) score column is dropped before preprocessing.
    proposal_scores = tf.nn.softmax(
        tf.stop_gradient(proposal_scores), axis=-1)[:, :, 1:]
    (num_proposals, proposals, _, _,
     additional_fields) = self._pcl_preprocess_fn(
         proposals, proposal_scores,
         {'proposal_features': proposal_features})
    proposal_features = additional_fields['proposal_features']

  # Build MIDN network, for both image and text.
  #   class_logits shape = [batch, num_classes]
  #   proposal_scores shape = [batch, max_num_proposals, num_classes].
  #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].
  # Raise explicitly rather than `assert`, which is stripped under `-O`.
  if options.attention_type != nod3_model_pb2.NOD3Model.PER_CLASS:
    raise ValueError('Only PER_CLASS attention type is supported.')

  with slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):
    (midn_class_logits, midn_proposal_scores,
     midn_proba_r_given_c) = self._build_midn_network(
         num_proposals,
         proposal_features,
         num_classes=self._num_classes,
         name_scope='image_midn',
         var_scope='image_midn')

  # The text branch treats caption tokens like proposals; only its class
  # logits are used below.
  with slim.arg_scope(
      build_hyperparams(options.text_fc_hyperparams, is_training)):
    (text_class_logits, _, _) = self._build_midn_network(
        caption_lengths,
        caption_features,
        num_classes=self._num_classes,
        name_scope='text_midn',
        var_scope='text_midn')

  # Compute image-text similarity.
  tf.summary.histogram('triplet/image_logits', midn_class_logits)
  tf.summary.histogram('triplet/text_logits', text_class_logits)

  with tf.name_scope('calc_cross_modal_similarity'):
    similarity = model_utils.calc_pairwise_similarity(
        feature_a=midn_class_logits,
        feature_b=text_class_logits,
        l2_normalize=True,
        dropout_keep_prob=options.cross_modal_dropout_keep_prob,
        is_training=is_training)

  predictions.update({
      DetectionResultFields.class_labels:
          tf.constant(self._vocabulary_list),
      DetectionResultFields.num_proposals:
          num_proposals,
      DetectionResultFields.proposal_boxes:
          proposals,
      NOD3Predictions.midn_class_logits:
          midn_class_logits,
      NOD3Predictions.midn_proba_r_given_c:
          midn_proba_r_given_c,
      NOD3Predictions.oicr_proposal_scores + '_at_0':
          midn_proposal_scores,
      NOD3Predictions.training_only_caption_strings:
          caption_strings,
      NOD3Predictions.training_only_caption_lengths:
          caption_lengths,
      NOD3Predictions.image_id:
          image_id,
      NOD3Predictions.similarity:
          similarity,
  })

  # Post process to get final predictions.
  if post_process:
    predictions.update(self._post_process(inputs, predictions))

  return predictions
def build_prediction(self, examples, **kwargs):
  """Builds tf graph for prediction.

  Args:
    examples: dict of input tensors keyed by name.
    **kwargs: extra keyword arguments; accepted but not used by this method.

  Returns:
    predictions: dict of prediction results keyed by name. Holds the number
      of proposals and the proposal boxes, plus, for every SADDN iteration
      `i`, the logits and the detection results stored under keys suffixed
      with `_at_{i}`.
  """
  options = self._model_proto
  is_training = self._is_training

  inputs = examples[InputDataFields.image]
  num_proposals = examples[InputDataFields.num_proposals]
  proposals = examples[InputDataFields.proposals]

  predictions = {
      DetectionResultFields.num_proposals: num_proposals,
      DetectionResultFields.proposal_boxes: proposals,
  }

  # FRCNN.
  #   `proposal_features` shape = [batch, max_num_proposals, feature_dims].
  #   `proposal_masks` shape = [batch, max_num_proposals].
  proposal_features = self._extract_frcnn_feature(inputs, num_proposals,
                                                  proposals)
  _, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
  proposal_masks = tf.sequence_mask(
      num_proposals, maxlen=max_num_proposals, dtype=tf.float32)

  # Build the SADDN predictions.
  #   `logits_c_given_r` shape = [batch, max_num_proposals, num_classes].
  #   `logits_r_given_c` shape = [batch, max_num_proposals, num_classes].
  with tf.variable_scope('SADDN'), \
      slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):
    logits_c_given_r = slim.fully_connected(
        proposal_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='proba_c_given_r')
    logits_r_given_c = slim.fully_connected(
        proposal_features,
        num_outputs=self._num_classes,
        activation_fn=None,
        scope='proba_r_given_c')

  # Class distribution per proposal, and a masked softmax over the proposal
  # axis (dim=1) per class; the extra multiply zeroes out padded proposals.
  proba_c_given_r = tf.nn.softmax(logits_c_given_r)
  proba_r_given_c = utils.masked_softmax(
      data=logits_r_given_c,
      mask=tf.expand_dims(proposal_masks, axis=-1),
      dim=1)
  proba_r_given_c = tf.multiply(
      tf.expand_dims(proposal_masks, axis=-1), proba_r_given_c)

  tf.summary.image('inputs', inputs, max_outputs=10)
  model_utils.visl_proposals(
      inputs, num_proposals, proposals, name='proposals', top_k=2000)

  # SADDN iterations.
  # Initial image-level logits: masked average of the per-proposal logits.
  logits_at_0 = utils.masked_avg_nd(
      data=logits_c_given_r, mask=proposal_masks, dim=1)
  logits_at_0 = tf.squeeze(logits_at_0, axis=1)

  logits_at_i = logits_at_0
  for i in range(options.saddn_iterations):
    suffix = '_at_{}'.format(i)

    # Infer the proba_c.
    proba_c_at_i = tf.nn.softmax(logits_at_i)

    # Infer the proba_r: weight `proba_r_given_c` by the class probabilities
    # and sum over the class axis, keeping a trailing dimension of size 1.
    proba_r_at_i = tf.multiply(
        tf.expand_dims(proba_c_at_i, axis=1), proba_r_given_c)
    proba_r_at_i = tf.reduce_sum(proba_r_at_i, axis=-1, keepdims=True)

    # Infer the detection results at iter `i`.
    (num_detections_at_i, detection_boxes_at_i, detection_scores_at_i,
     detection_classes_at_i) = model_utils.post_process(
         proposals, proba_r_at_i * proba_c_given_r)

    predictions[StackedAttnPredictions.logits + suffix] = logits_at_i
    predictions[DetectionResultFields.num_detections +
                suffix] = num_detections_at_i
    predictions[DetectionResultFields.detection_boxes +
                suffix] = detection_boxes_at_i
    predictions[DetectionResultFields.detection_scores +
                suffix] = detection_scores_at_i
    predictions[DetectionResultFields.detection_classes +
                suffix] = detection_classes_at_i

    # Class ids are shifted by one before the vocabulary lookup.
    model_utils.visl_proposals_top_k(
        inputs,
        num_detections_at_i,
        detection_boxes_at_i,
        detection_scores_at_i,
        tf.gather(self._vocabulary_list,
                  tf.to_int32(detection_classes_at_i - 1)),
        name='detection_{}'.format(i))

    # `logits_at_i` for the next iteration: per-proposal logits weighted by
    # `proba_r_at_i` and summed over the proposal axis.
    logits_at_i = tf.multiply(proba_r_at_i, logits_c_given_r)
    logits_at_i = tf.reduce_sum(logits_at_i, axis=1)

  return predictions
def _calc_conv_proposal_feature(self, image_feature_cropped):
  """Calculates proposal feature using a stack of conv / max-pool layers.

  A 1x1 convolution first reduces the channels to 96. Each of the
  `options.hidden_layers` layers then concatenates a stride-2 3x3 convolution
  branch with a stride-2 3x3 max-pool branch along the channel axis.

  Args:
    image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
      float tensor.

  Returns:
    proposal_feature: A [batch, proposal_feature_dims] float tensor.
  """
  options = self._model_proto
  is_training = self._is_training
  keep_prob = options.hidden_dropout_keep_prob

  features = image_feature_cropped
  conv_arg_scope = build_hyperparams(options.conv_hyperparams, is_training)

  with slim.arg_scope(conv_arg_scope), \
      tf.variable_scope('conv_hidden_layers'):
    # Channel reduction.
    with tf.variable_scope('reduce'):
      features = slim.conv2d(
          features, num_outputs=96, kernel_size=[1, 1], padding='SAME')
      features = slim.dropout(features, keep_prob, is_training=is_training)

    # Down-sampling layers.
    for layer_index in range(options.hidden_layers):
      with tf.variable_scope('layer_{}'.format(layer_index)):
        with tf.variable_scope('branch_0'):
          conv_branch = slim.conv2d(
              features,
              64, [3, 3],
              stride=2,
              padding='VALID',
              scope='conv2d_3x3')
          conv_branch = slim.dropout(
              conv_branch, keep_prob, is_training=is_training)
        with tf.variable_scope('branch_1'):
          pool_branch = slim.max_pool2d(
              features, [3, 3],
              stride=2,
              padding='VALID',
              scope='maxpool_3x3')
        features = tf.concat([conv_branch, pool_branch], axis=-1)
        tf.logging.info(features)

    # Squeezing axes 1 and 2 only works if both have been reduced to size 1.
    proposal_feature = tf.squeeze(features, [1, 2])
    tf.logging.info('proposal_feture: %s', proposal_feature)

  return proposal_feature