Example #1
    def _build_prediction(self, examples, post_process=True):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      post_process: if True, post-processes the predictions.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        _, caption_features = self._extract_text_feature(
            examples[InputDataFields.concat_caption_string],
            examples[InputDataFields.concat_caption_length],
            vocabulary_list=self._open_vocabulary_list,
            initial_embedding=self._open_vocabulary_initial_embedding,
            embedding_dims=options.embedding_dims,
            trainable=options.train_word_embedding)

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            caption_text_features = slim.fully_connected(
                caption_features,
                num_outputs=options.projected_dims,
                activation_fn=None)

        # Encode the projected caption features.
        caption_text_features = self._text_encoding_fn(
            caption_text_features,
            examples[InputDataFields.concat_caption_length],
            is_training=is_training)

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            caption_predicting_logits = slim.fully_connected(
                caption_text_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='caption_predicting_logits')

        predictions = {
            OPPredictions.caption_predicting_logits:
            caption_predicting_logits,
            OPPredictions.caption_predicting_labels:
            tf.round(tf.nn.sigmoid(caption_predicting_logits)),
        }

        return predictions
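
The encoder behind `self._text_encoding_fn` is not shown in these examples. Below is a minimal sketch of one plausible implementation, assuming it is a masked average pooling over the token dimension; the function name and interface details are assumptions, not code from this repository.

import tensorflow as tf

def average_text_encoding_fn(token_features, text_lengths, is_training=False):
  """Averages token features over time, ignoring padded positions.

  Args:
    token_features: A [batch, max_text_length, dims] float tensor.
    text_lengths: A [batch] int tensor.
    is_training: Unused; kept for interface compatibility.

  Returns:
    A [batch, dims] float tensor.
  """
  max_length = tf.shape(token_features)[1]
  # Build a [batch, max_text_length, 1] mask of valid (non-padded) tokens.
  masks = tf.sequence_mask(text_lengths, maxlen=max_length, dtype=tf.float32)
  masks = tf.expand_dims(masks, axis=-1)
  # Sum the valid token features and divide by the number of valid tokens.
  summed = tf.reduce_sum(token_features * masks, axis=1)
  return summed / tf.maximum(tf.reduce_sum(masks, axis=1), 1e-12)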
Example #2
    def _calc_spp_proposal_feature(self, image_feature_cropped):
        """Calculates proposal feature using spp.

    Args:
      image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
        float tensor.

    Returns:
      spp_feature: A [batch, spp_feature_dims] float tensor.
      proposal_feature: A [batch, proposal_feature_dims] float tensor.
    """
        options = self._model_proto
        is_training = self._is_training

        net = image_feature_cropped

        spp_feature = net = self._calc_spp_feature(
            net, spp_bins=list(options.spp_bins))

        for i in range(options.hidden_layers):
            with slim.arg_scope(
                    build_hyperparams(options.fc_hyperparams, is_training)):
                net = slim.fully_connected(net,
                                           num_outputs=options.hidden_units,
                                           scope='fc_{}'.format(i + 1))
                net = slim.dropout(net,
                                   options.hidden_dropout_keep_prob,
                                   is_training=is_training)
        return spp_feature, net
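
`_calc_spp_feature` itself is not included in these examples. Here is a hedged sketch of what it plausibly computes, assuming classic spatial pyramid pooling: pool the cropped map at each bin resolution and concatenate the flattened results. Whether max or average pooling is used is configurable in Example #8 (`options.spp_max_pool`), so both are sketched.

import tensorflow as tf
import tensorflow.contrib.slim as slim

def calc_spp_feature(net, spp_bins, max_pool=True):
  """Pools `net` into a pyramid of bins and concatenates the results.

  Args:
    net: A [batch, size, size, dims] float tensor; `size` should be divisible
      by every entry of `spp_bins`.
    spp_bins: A list of ints, e.g. [1, 2, 4], giving the bins per side.
    max_pool: If True, use max pooling; otherwise, average pooling.

  Returns:
    A [batch, sum(b * b for b in spp_bins) * dims] float tensor.
  """
  _, size, _, _ = net.get_shape().as_list()
  outputs = []
  for bins in spp_bins:
    kernel = size // bins
    pool_fn = slim.max_pool2d if max_pool else slim.avg_pool2d
    # Each pyramid level yields a [batch, bins, bins, dims] map.
    pooled = pool_fn(net, [kernel, kernel], stride=kernel, padding='VALID')
    outputs.append(slim.flatten(pooled))
  return tf.concat(outputs, axis=-1)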
Example #3
    def _calc_conv_proposal_feature(self, image_feature_cropped):
        """Calculates proposal feature using spp.

    Args:
      image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
        float tensor.

    Returns:
      proposal_feature: A [batch, proposal_feature_dims] float tensor.
    """
        options = self._model_proto
        is_training = self._is_training

        with slim.arg_scope(
                build_hyperparams(options.conv_hyperparams, is_training)):

            net = image_feature_cropped

            with tf.variable_scope('conv_layers'):
                tf.logging.info('layer input: %s', net.get_shape())
                net = slim.repeat(net,
                                  1,
                                  slim.conv2d,
                                  64, [3, 3],
                                  padding='VALID',
                                  scope='conv2d_3x3')
                net = tf.reduce_mean(net, axis=[1, 2])
                tf.logging.info('layer output: %s', net.get_shape())

        return net
Example #4
    def _project_images(self,
                        feature_map,
                        common_dimensions=300,
                        scope="image_proj",
                        hyperparams=None,
                        is_training=False):
        """Adds additional 1x1 conv layer to project image features.

    Args:
      feature_map: [batch, feature_height, feature_width, feature_depth] float
        tensor, which is the CNN output.
      common_dimensions: depth of the image embedding.
      scope: variable scope of the projection layer.
      hyperparams: an instance of hyperparams_pb2.Hyperparams, used for the
        conv2d projection layer.
      is_training: if True, training graph is built.

    Returns:
      feature_map: [batch, feature_height, feature_width, common_dimensions]
        float tensor of projected image features.
    """
        with slim.arg_scope(build_hyperparams(hyperparams, is_training)):
            with tf.variable_scope(scope):
                feature_map = tf.contrib.layers.conv2d(
                    inputs=feature_map,
                    num_outputs=common_dimensions,
                    kernel_size=[1, 1],
                    activation_fn=None)
        return feature_map
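
A self-contained usage sketch of the same projection, with slim's defaults standing in for the `build_hyperparams` arg scope; the shapes are hypothetical.

import tensorflow as tf
import tensorflow.contrib.slim as slim

# Project a 2048-d CNN feature map into a 300-d shared embedding space
# using a 1x1 convolution, as `_project_images` does internally.
feature_map = tf.placeholder(tf.float32, [8, 14, 14, 2048])
with tf.variable_scope('image_proj'):
  projected = slim.conv2d(feature_map,
                          num_outputs=300,
                          kernel_size=[1, 1],
                          activation_fn=None)
# `projected` shape = [8, 14, 14, 300].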
Example #5
    def _calc_conv_proposal_feature(self, image_feature_cropped):
        """Calculates proposal feature using spp.

    Args:
      image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
        float tensor.

    Returns:
      proposal_feature: A [batch, proposal_feature_dims] float tensor.
    """
        options = self._model_proto
        is_training = self._is_training

        with slim.arg_scope(
                build_hyperparams(options.conv_hyperparams, is_training)):

            net = image_feature_cropped

            with tf.variable_scope('conv_layers'):
                for i in range(options.conv_layers):
                    with tf.variable_scope('layer_{}'.format(i)):
                        tf.logging.info('layer input: %s', net.get_shape())
                        net_cat = slim.conv2d(net,
                                              options.conv_units, [1, 1],
                                              stride=1,
                                              padding='SAME',
                                              scope='conv2d_1x1')
                        net_cat = slim.dropout(net_cat,
                                               options.conv_dropout_keep_prob,
                                               is_training=is_training)

                        net = tf.concat([net, net_cat], axis=-1)
                        net = slim.max_pool2d(net, [2, 2],
                                              stride=2,
                                              padding='VALID',
                                              scope='maxpool_2x2')

                        tf.logging.info('layer output: %s', net.get_shape())

            with tf.variable_scope('conv_layers'):
                with tf.variable_scope('layer_{}'.format(options.conv_layers)):
                    net = slim.conv2d(net,
                                      options.conv_units, [3, 3],
                                      stride=1,
                                      padding='VALID',
                                      scope='conv2d_3x3')
                    net = slim.dropout(net,
                                       options.conv_dropout_keep_prob,
                                       is_training=is_training)

            proposal_feature = tf.squeeze(net, [1, 2])

        tf.logging.info('proposal_feature: %s', proposal_feature)
        return proposal_feature
Example #6
    def build_prediction(self, examples, **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: additional keyword arguments, e.g., the specific prediction
        task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        # Text Global-Maximum-Pooling features.

        (caption_string,
         caption_length) = (examples[InputDataFields.concat_caption_string],
                            examples[InputDataFields.concat_caption_length])

        (caption_token_ids, caption_features) = self._extract_text_feature(
            caption_string,
            caption_length,
            vocabulary_list=self._open_vocabulary_list,
            initial_embedding=self._open_vocabulary_initial_embedding,
            embedding_dims=options.embedding_dims,
            trainable=options.train_word_embedding,
            max_norm=None)

        with slim.arg_scope(
                build_hyperparams(options.text_fc_hyperparams, is_training)):
            caption_features = slim.fully_connected(
                caption_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='caption')

        oov = len(self._open_vocabulary_list)
        caption_masks = tf.to_float(
            tf.logical_not(tf.equal(caption_token_ids, oov)))

        # logits shape = [batch, num_classes].

        logits = utils.masked_maximum(data=caption_features,
                                      mask=tf.expand_dims(caption_masks,
                                                          axis=-1),
                                      dim=1)
        logits = tf.squeeze(logits, axis=1)

        predictions = {
            TextClassificationPredictions.vocab:
            tf.constant(self._vocabulary_list),
            TextClassificationPredictions.logits: logits,
        }
        return predictions
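
`utils.masked_maximum` is used above but not defined in these examples. A minimal sketch, assuming the common trick of shifting by the per-row minimum so that masked-out entries can never win the max; the real helper may differ in details.

import tensorflow as tf

def masked_maximum(data, mask, dim=1):
  """Computes the maximum over `dim`, considering only entries with mask=1.

  Args:
    data: A [batch, n, d] float tensor.
    mask: A [batch, n, 1] float tensor of 0/1 values.
    dim: The dimension to reduce over.

  Returns:
    A float tensor with `dim` kept as a size-1 dimension.
  """
  # Shift by the row minimum; masked entries collapse to the minimum and
  # therefore cannot exceed any valid entry.
  axis_minimums = tf.reduce_min(data, dim, keepdims=True)
  masked_maximums = tf.reduce_max(
      tf.multiply(data - axis_minimums, mask), dim, keepdims=True)
  return masked_maximums + axis_minimums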
Example #7
    def build_prediction(self, examples, **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: additional keyword arguments, e.g., the specific prediction
        task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        if not self._is_training:
            self._model_proto.hyperparams.regularizer.l2_regularizer.weight = 0
        with slim.arg_scope(
                build_hyperparams(self._model_proto.hyperparams,
                                  self._is_training)):
            return self._build_prediction(examples)
Example #8
    def _calc_spp_proposal_feature(self, image_feature_cropped):
        """Calculates proposal feature using spp.

    Args:
      image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
        float tensor.

    Returns:
      proposal_feature: A [batch, proposal_feature_dims] float tensor.
    """
        options = self._model_proto
        is_training = self._is_training

        net = image_feature_cropped

        net = self._calc_spp_feature(net,
                                     spp_bins=list(options.spp_bins),
                                     max_pool=options.spp_max_pool)

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            for i in range(options.hidden_layers):
                net = slim.fully_connected(net,
                                           num_outputs=options.hidden_units,
                                           scope='hidden/fc_{}'.format(i + 1))
                net = slim.dropout(net,
                                   options.hidden_dropout_keep_prob,
                                   is_training=is_training)
        return net
Example #9
    def _calc_saliency_score(self,
                             inputs,
                             scope,
                             hyperparams=None,
                             is_training=False):
        """Calculates saliency score.

    Args:
      inputs: input feature, a [..., feature_dimensions] float tensor.
      scope: variable scope.
      hyperparams: an instance of hyperparams_pb2.Hyperparams, used for the
        fully connected layer.
      is_training: if True, build training graph.

    Returns:
      saliency_score: saliency score, a [...] float tensor with the last
        (size-1) dimension squeezed out.
    """
        with slim.arg_scope(build_hyperparams(hyperparams, is_training)):
            saliency_score = tf.contrib.layers.fully_connected(
                inputs, num_outputs=1, activation_fn=None, scope=scope)
        return tf.squeeze(saliency_score, axis=-1)
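
A self-contained usage sketch (hypothetical shapes, slim defaults in place of the `build_hyperparams` arg scope): score each proposal with a shared linear layer, then normalize the scores across proposals.

import tensorflow as tf
import tensorflow.contrib.slim as slim

proposal_features = tf.placeholder(tf.float32, [8, 300, 1536])
saliency_logits = slim.fully_connected(proposal_features,
                                       num_outputs=1,
                                       activation_fn=None,
                                       scope='proposal_saliency')
saliency_score = tf.squeeze(saliency_logits, axis=-1)  # Shape = [8, 300].
attention = tf.nn.softmax(saliency_score, axis=-1)     # Sums to 1 per image.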
Example #10
    def _project_image(self,
                       image_feature,
                       num_outputs=20,
                       kernel_size=1,
                       hyperparams=None,
                       is_training=False):
        """Adds additional 1x1 conv layer to project image features.

    Args:
      image_feature: [batch, feature_height, feature_width, feature_depth] float
        tensor, which is the CNN output.
      num_outputs: number of output channels.
      hyperparams: an instance of hyperparams_pb2.Hyperparams, used for the
        conv2d projection layer.
      is_training: if True, training graph is built.
    """
        with slim.arg_scope(build_hyperparams(hyperparams, is_training)):
            output = tf.contrib.layers.conv2d(
                inputs=image_feature,
                num_outputs=num_outputs,
                kernel_size=[kernel_size, kernel_size],
                activation_fn=None)
        return output
Example #11
  def _build_object_prediction_network(self, texts, text_lengths,
                                       open_vocabulary_list, embedding_dims):
    """Builds tf graph for predicting object labels from captions.

    Args:
      texts: A [batch, max_text_length] string tensor.
      text_lengths: A [batch] int tensor.
      open_vocabulary_list: A list of words.
      embedding_dims: dimensions of the word embedding.

    Returns:
      predicted_logits: A [batch, num_classes] float tensor.
    """
    options = self._model_proto
    is_training = self._is_training

    _, caption_features = self._extract_text_feature(
        texts,
        text_lengths,
        vocabulary_list=open_vocabulary_list,
        initial_embedding=self._open_vocabulary_initial_embedding,
        embedding_dims=embedding_dims,
        trainable=options.train_word_embedding)

    caption_text_features = self._text_encoding_fn(
        caption_features, text_lengths, is_training=is_training)

    with slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):
      predicted_logits = slim.fully_connected(
          caption_text_features,
          num_outputs=self._num_classes,
          activation_fn=None,
          scope="predicted_logits")

    tf.summary.histogram("object_prediction/predicted_logits", predicted_logits)

    return predicted_logits
Example #12
    def build_prediction(self,
                         examples,
                         prediction_task=OICRTasks.image_label,
                         **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      prediction_task: the specific prediction task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        # Extract `features_to_crop` from the original image.
        #   shape = [batch, feature_height, feature_width, feature_depth].

        preprocessed_inputs = self._feature_extractor.preprocess(inputs)

        (features_to_crop,
         _) = self._feature_extractor.extract_proposal_features(
             preprocessed_inputs, scope='first_stage_feature_extraction')
        (mipn_feature_map
         ) = self._feature_extractor.extract_box_classifier_features(
             features_to_crop, scope='second_stage_feature_extraction')

        with slim.arg_scope(
                build_hyperparams(options.conv_hyperparams, is_training)):
            (mipn_logits, mipn_num_proposals, mipn_proposals,
             mipn_proposal_scores,
             class_activation_map) = self._build_mipn_network(
                 mipn_feature_map,
                 num_proposals,
                 proposals,
                 kernel_size=options.mipn_conv_kernel_size,
                 pooling=options.mipn_pooling)

        self._visl_class_activation_map(inputs, class_activation_map)

        self._visl_proposals(inputs,
                             num_proposals,
                             proposals,
                             name='proposals',
                             top_k=200)
        self._visl_proposals(inputs,
                             mipn_num_proposals,
                             mipn_proposals,
                             name='proposals_mipn',
                             top_k=200)

        # Substitute in the top-ranked proposals.

        num_proposals = tf.minimum(mipn_num_proposals,
                                   options.mipn_max_num_proposals)
        proposals = mipn_proposals[:, :options.mipn_max_num_proposals, :]

        batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)

        # Crop `flattened_proposal_features_maps`.
        #   shape = [batch*max_num_proposals, crop_size, crop_size, feature_depth].

        box_ind = tf.expand_dims(tf.range(batch), axis=-1)
        box_ind = tf.tile(box_ind, [1, max_num_proposals])

        cropped_regions = tf.image.crop_and_resize(
            features_to_crop,
            boxes=tf.reshape(proposals, [-1, 4]),
            box_ind=tf.reshape(box_ind, [-1]),
            crop_size=[options.initial_crop_size, options.initial_crop_size])

        flattened_proposal_features_maps = slim.max_pool2d(
            cropped_regions,
            [options.maxpool_kernel_size, options.maxpool_kernel_size],
            stride=options.maxpool_stride)

        # Extract `proposal_features`,
        #   shape = [batch, max_num_proposals, feature_dims].

        (box_classifier_features
         ) = self._feature_extractor.extract_box_classifier_features(
             flattened_proposal_features_maps,
             scope='second_stage_feature_extraction')

        flattened_roi_pooled_features = tf.reduce_mean(box_classifier_features,
                                                       [1, 2],
                                                       name='AvgPool')
        flattened_roi_pooled_features = slim.dropout(
            flattened_roi_pooled_features,
            keep_prob=options.dropout_keep_prob,
            is_training=is_training)

        proposal_features = tf.reshape(flattened_roi_pooled_features,
                                       [batch, max_num_proposals, -1])

        # Assign weights from pre-trained checkpoint.

        tf.train.init_from_checkpoint(
            options.checkpoint_path,
            assignment_map={"/": "first_stage_feature_extraction/"})
        tf.train.init_from_checkpoint(
            options.checkpoint_path,
            assignment_map={"/": "second_stage_feature_extraction/"})

        # Build MIDN network.
        #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            midn_logits, proba_r_given_c = self._build_midn_network(
                num_proposals,
                proposal_features,
                num_classes=self._num_classes)

        # Build the OICR network.
        #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        oicr_proposal_scores_list = []
        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            with tf.name_scope('online_instance_classifier_refinement'):
                for i in range(options.oicr_iterations):
                    oicr_proposal_scores_at_i = slim.fully_connected(
                        proposal_features,
                        num_outputs=1 + self._num_classes,
                        activation_fn=None,
                        scope='oicr/iter{}'.format(i + 1))
                    oicr_proposal_scores_list.append(oicr_proposal_scores_at_i)

        predictions = {
            DetectionResultFields.num_proposals: num_proposals,
            DetectionResultFields.proposal_boxes: proposals,
            OICRPredictions.midn_logits: midn_logits,
            OICRPredictions.midn_proba_r_given_c: proba_r_given_c,
            OICRPredictions.mipn_logits: mipn_logits,
        }

        # Post process to get the final detections.

        midn_proposal_scores = tf.multiply(
            tf.expand_dims(tf.nn.softmax(midn_logits), axis=1),
            proba_r_given_c)

        (predictions[DetectionResultFields.num_detections +
                     '_at_{}'.format(0)],
         predictions[DetectionResultFields.detection_boxes +
                     '_at_{}'.format(0)],
         predictions[DetectionResultFields.detection_scores +
                     '_at_{}'.format(0)],
         predictions[DetectionResultFields.detection_classes +
                     '_at_{}'.format(0)]) = self._post_process(
                         proposals, midn_proposal_scores)

        for i, oicr_proposal_scores_at_i in enumerate(
                oicr_proposal_scores_list):
            predictions[OICRPredictions.oicr_proposal_scores +
                        '_at_{}'.format(i + 1)] = oicr_proposal_scores_at_i

            (predictions[DetectionResultFields.num_detections +
                         '_at_{}'.format(i + 1)],
             predictions[DetectionResultFields.detection_boxes +
                         '_at_{}'.format(i + 1)],
             predictions[DetectionResultFields.detection_scores +
                         '_at_{}'.format(i + 1)],
             predictions[DetectionResultFields.detection_classes +
                         '_at_{}'.format(i + 1)]) = self._post_process(
                             proposals,
                             tf.nn.softmax(oicr_proposal_scores_at_i,
                                           axis=-1)[:, :, 1:])

        for i in range(1 + options.oicr_iterations):
            num_detections, detection_boxes, detection_scores, detection_classes = (
                predictions[DetectionResultFields.num_detections +
                            '_at_{}'.format(i)],
                predictions[DetectionResultFields.detection_boxes +
                            '_at_{}'.format(i)],
                predictions[DetectionResultFields.detection_scores +
                            '_at_{}'.format(i)],
                predictions[DetectionResultFields.detection_classes +
                            '_at_{}'.format(i)])
            self._visl_proposals_top_k(inputs,
                                       num_detections,
                                       detection_boxes,
                                       detection_scores,
                                       tf.gather(
                                           self._vocabulary_list,
                                           tf.to_int32(detection_classes - 1)),
                                       name='detection_{}'.format(i))

        return predictions
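
`_build_midn_network` is referenced throughout these examples but never shown. Below is a hedged sketch in the spirit of the two-stream WSDDN/OICR head the comments describe: one stream scores classes per proposal, the other turns per-class logits into an attention over proposals (`proba_r_given_c`), and the image-level logits aggregate the two. All names and the masking details are assumptions.

import tensorflow as tf
import tensorflow.contrib.slim as slim

def build_midn_network(num_proposals, proposal_features, num_classes):
  """Builds a two-stream multiple-instance detection (MIDN) head.

  Args:
    num_proposals: A [batch] int tensor.
    proposal_features: A [batch, max_num_proposals, dims] float tensor.
    num_classes: Number of classes.

  Returns:
    class_logits: A [batch, num_classes] float tensor.
    proba_r_given_c: A [batch, max_num_proposals, num_classes] float tensor.
  """
  max_num_proposals = tf.shape(proposal_features)[1]
  masks = tf.sequence_mask(
      num_proposals, maxlen=max_num_proposals, dtype=tf.float32)
  masks = tf.expand_dims(masks, axis=-1)

  # Classification stream: per-proposal, per-class scores.
  logits_c_given_r = slim.fully_connected(
      proposal_features, num_classes, activation_fn=None, scope='midn/class')

  # Detection stream: per-class attention over proposals; padded proposals
  # are pushed towards -inf so the softmax ignores them.
  logits_r_given_c = slim.fully_connected(
      proposal_features, num_classes, activation_fn=None, scope='midn/detect')
  proba_r_given_c = tf.nn.softmax(
      logits_r_given_c + (masks - 1.0) * 1e9, axis=1)

  # Image-level logits: attention-weighted sum over the proposals.
  class_logits = tf.reduce_sum(logits_c_given_r * proba_r_given_c, axis=1)
  return class_logits, proba_r_given_c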
Example #13
    def _build_prediction(self, examples, post_process=True):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      post_process: if True, post-processes the predictions.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        tf.summary.image('inputs', inputs, max_outputs=10)
        model_utils.visl_proposals(inputs,
                                   num_proposals,
                                   proposals,
                                   name='proposals',
                                   top_k=100)

        # FRCNN.

        proposal_features = self._extract_frcnn_feature(
            inputs, num_proposals, proposals)

        # Build MIDN network.
        #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            if options.attention_type == nod_model_pb2.NODModel.PER_CLASS:
                midn_class_scores, midn_proposal_scores = self._build_midn_network(
                    num_proposals,
                    proposal_features,
                    num_classes=self._num_classes)
            else:
                raise ValueError('Invalid attention type.')

        predictions = {
            DetectionResultFields.num_proposals: num_proposals,
            DetectionResultFields.proposal_boxes: proposals,
            NODPredictions.midn_class_scores: midn_class_scores,
        }

        # Build the OICR network.
        #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        predictions[NODPredictions.oicr_proposal_scores +
                    '_at_0'] = midn_proposal_scores

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            for i in range(options.oicr_iterations):
                predictions[NODPredictions.oicr_proposal_scores +
                            '_at_{}'.format(i + 1)] = slim.fully_connected(
                                proposal_features,
                                num_outputs=1 + self._num_classes,
                                activation_fn=None,
                                scope='oicr/iter{}'.format(i + 1))

        # Post process to get final predictions.

        if post_process:
            predictions.update(self._post_process(inputs, predictions))

        return predictions
Example #14
    def build_prediction(self,
                         examples,
                         prediction_task=OICRTasks.image_label,
                         **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      prediction_task: the specific prediction task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (image, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        # Use the CNN to extract feature.
        #   image_feature shape=[batch, feature_height, feature_width, feature_dims]

        image_feature = model_utils.dilated_vgg16_conv(image,
                                                       options.cnn,
                                                       is_training=is_training)

        # Crop image feature from the CNN output.
        #   image_feature_cropped_and_flattened
        #   shape=[batch*max_num_proposals, crop_size, crop_size, feature_dims]

        batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)
        box_ind = tf.expand_dims(tf.range(batch), axis=-1)
        box_ind = tf.tile(box_ind, [1, max_num_proposals])

        crop_size = options.feature_crop_size
        image_feature_cropped = tf.image.crop_and_resize(
            image_feature,
            boxes=tf.reshape(proposals, [-1, 4]),
            box_ind=tf.reshape(box_ind, [-1]),
            crop_size=[crop_size, crop_size],
            method='bilinear')

        # Get the multi-resolutional feature.
        #   proposal_feature shape=[batch, max_num_proposals, hidden_units].

        if options.feature_extractor == oicr_model_pb2.OICRModel.SPP:
            spp_feature, proposal_feature = self._calc_spp_proposal_feature(
                image_feature_cropped)
        elif options.feature_extractor == oicr_model_pb2.OICRModel.VGG:
            spp_feature = proposal_feature = self._calc_vgg_proposal_feature(
                image_feature_cropped)
        elif options.feature_extractor == oicr_model_pb2.OICRModel.CONV:
            spp_feature = proposal_feature = self._calc_conv_proposal_feature(
                image_feature_cropped)
        else:
            raise ValueError('Invalid feature extractor')

        spp_feature = tf.reshape(spp_feature, [batch, max_num_proposals, -1])
        proposal_feature = tf.reshape(proposal_feature,
                                      [batch, max_num_proposals, -1])

        tf.summary.histogram('midn/proposal_feature', proposal_feature)

        # Build the MIDN network.
        #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            midn_logits, proba_r_given_c = self._build_midn_network(
                num_proposals,
                spp_feature
                if options.use_spp_to_calc_logits else proposal_feature,
                proposal_feature,
                num_classes=self._num_classes,
                attention_normalizer=options.attention_normalizer,
                attention_tanh=options.attention_tanh,
                attention_scale_factor=options.attention_scale_factor)

        # Build the OICR network.
        #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        oicr_proposal_scores_list = []
        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            with tf.name_scope('online_instance_classifier_refinement'):
                for i in range(options.oicr_iterations):
                    oicr_proposal_scores_at_i = slim.fully_connected(
                        proposal_feature,
                        num_outputs=1 + self._num_classes,
                        activation_fn=None,
                        scope='oicr/iter{}'.format(i + 1))
                    oicr_proposal_scores_list.append(oicr_proposal_scores_at_i)

        predictions = {
            DetectionResultFields.num_proposals: num_proposals,
            DetectionResultFields.proposal_boxes: proposals,
            OICRPredictions.midn_proba_r_given_c: proba_r_given_c,
            OICRPredictions.midn_logits: midn_logits,
        }

        # Post process to get the final detections.
        labels = self._extract_class_label(
            class_texts=examples[InputDataFields.caption_strings],
            vocabulary_list=self._vocabulary_list)

        midn_proposal_scores = tf.multiply(proba_r_given_c,
                                           tf.expand_dims(labels, axis=1))

        (predictions[DetectionResultFields.num_detections +
                     '_at_{}'.format(0)],
         predictions[DetectionResultFields.detection_boxes +
                     '_at_{}'.format(0)],
         predictions[DetectionResultFields.detection_scores +
                     '_at_{}'.format(0)],
         predictions[DetectionResultFields.detection_classes +
                     '_at_{}'.format(0)]) = self._post_process(
                         proposals, midn_proposal_scores)

        for i, oicr_proposal_scores_at_i in enumerate(
                oicr_proposal_scores_list):
            predictions[OICRPredictions.oicr_proposal_scores +
                        '_at_{}'.format(i + 1)] = oicr_proposal_scores_at_i

            (predictions[DetectionResultFields.num_detections +
                         '_at_{}'.format(i + 1)],
             predictions[DetectionResultFields.detection_boxes +
                         '_at_{}'.format(i + 1)],
             predictions[DetectionResultFields.detection_scores +
                         '_at_{}'.format(i + 1)],
             predictions[DetectionResultFields.detection_classes +
                         '_at_{}'.format(i + 1)]) = self._post_process(
                             proposals,
                             tf.nn.softmax(oicr_proposal_scores_at_i,
                                           axis=-1)[:, :, 1:])

        self._visl_proposals(image,
                             num_proposals,
                             proposals,
                             name='proposals',
                             top_k=2000)
        for i in range(1 + options.oicr_iterations):
            num_detections, detection_boxes, detection_scores, detection_classes = (
                predictions[DetectionResultFields.num_detections +
                            '_at_{}'.format(i)],
                predictions[DetectionResultFields.detection_boxes +
                            '_at_{}'.format(i)],
                predictions[DetectionResultFields.detection_scores +
                            '_at_{}'.format(i)],
                predictions[DetectionResultFields.detection_classes +
                            '_at_{}'.format(i)])
            self._visl_proposals_top_k(image,
                                       num_detections,
                                       detection_boxes,
                                       detection_scores,
                                       tf.gather(
                                           self._vocabulary_list,
                                           tf.to_int32(detection_classes - 1)),
                                       name='detection_{}'.format(i))

        return predictions
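
`self._post_process` is also not shown in these examples. A hedged sketch of a plausible single-image version, assuming class-wise non-max suppression over the proposal scores; the thresholds and the batching/padding of the real implementation are assumptions. Classes are 1-based, matching the `detection_classes - 1` vocabulary lookups above.

import tensorflow as tf

def post_process_single_image(proposals, proposal_scores,
                              score_thresh=0.01, iou_thresh=0.4,
                              max_detections=100):
  """Applies per-class NMS to the proposals of a single image.

  Args:
    proposals: A [max_num_proposals, 4] float tensor of boxes.
    proposal_scores: A [max_num_proposals, num_classes] float tensor.

  Returns:
    num_detections, detection_boxes, detection_scores, detection_classes.
  """
  boxes, scores, classes = [], [], []
  num_classes = proposal_scores.get_shape()[-1].value
  for c in range(num_classes):
    class_scores = proposal_scores[:, c]
    keep = tf.image.non_max_suppression(proposals, class_scores,
                                        max_output_size=max_detections,
                                        iou_threshold=iou_thresh,
                                        score_threshold=score_thresh)
    boxes.append(tf.gather(proposals, keep))
    scores.append(tf.gather(class_scores, keep))
    classes.append(tf.fill(tf.shape(keep), float(c + 1)))
  boxes = tf.concat(boxes, axis=0)
  scores = tf.concat(scores, axis=0)
  classes = tf.concat(classes, axis=0)
  # Keep the overall top-scoring detections.
  k = tf.minimum(max_detections, tf.size(scores))
  detection_scores, top_indices = tf.nn.top_k(scores, k=k)
  return (k, tf.gather(boxes, top_indices), detection_scores,
          tf.gather(classes, top_indices))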
Example #15
    def _build_prediction(self, examples):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        predictions = {}
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        # Fast-RCNN.

        proposal_features = model_utils.extract_frcnn_feature(
            inputs, num_proposals, proposals, options.frcnn_options,
            is_training)

        # Build MIDN network.
        #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            (midn_class_logits, midn_proposal_scores,
             midn_proba_r_given_c) = self._build_midn_network(
                 num_proposals,
                 proposal_features,
                 num_classes=self._label_extractor.num_classes)

        # Build the OICR network.
        #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            for i in range(options.oicr_iterations):
                predictions[Cap2DetPredictions.oicr_proposal_scores +
                            '_at_{}'.format(
                                i +
                                1)] = proposal_scores = slim.fully_connected(
                                    proposal_features,
                                    num_outputs=1 +
                                    self._label_extractor.num_classes,
                                    activation_fn=None,
                                    scope='oicr/iter{}'.format(i + 1))

        # Set the predictions.

        predictions.update({
            DetectionResultFields.class_labels:
            tf.constant(self._label_extractor.classes),
            DetectionResultFields.num_proposals:
            num_proposals,
            DetectionResultFields.proposal_boxes:
            proposals,
            Cap2DetPredictions.midn_class_logits:
            midn_class_logits,
            Cap2DetPredictions.midn_proba_r_given_c:
            midn_proba_r_given_c,
            Cap2DetPredictions.oicr_proposal_scores + '_at_0':
            midn_proposal_scores
        })

        return predictions
Example #16
    def _build_prediction(self, examples, post_process=True):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      post_process: if True, post-processes the predictions.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        tf.summary.image('inputs', inputs, max_outputs=10)
        model_utils.visl_proposals(inputs,
                                   num_proposals,
                                   proposals,
                                   name='proposals',
                                   top_k=100)

        # FRCNN.

        proposal_features = self._extract_frcnn_feature(
            inputs, num_proposals, proposals)

        # Build the OICR network.
        #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        predictions = {}
        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            for i in range(options.oicr_iterations):
                predictions[NOD2Predictions.oicr_proposal_scores +
                            '_at_{}'.format(
                                i +
                                1)] = proposal_scores = slim.fully_connected(
                                    proposal_features,
                                    num_outputs=1 + self._num_classes,
                                    activation_fn=None,
                                    scope='oicr/iter{}'.format(i + 1))

        if post_process and options.HasField('pcl_preprocess'):
            proposal_scores = tf.nn.softmax(tf.stop_gradient(proposal_scores),
                                            axis=-1)[:, :, 1:]
            (num_proposals, proposals, _, _,
             additional_fields) = self._pcl_preprocess_fn(
                 proposals, proposal_scores,
                 {'proposal_features': proposal_features})
            proposal_features = additional_fields['proposal_features']

        # Build MIDN network.
        #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            if options.attention_type == nod2_model_pb2.NOD2Model.PER_CLASS:
                (midn_class_logits, midn_proposal_scores,
                 midn_proba_r_given_c) = self._build_midn_network(
                     num_proposals,
                     proposal_features,
                     num_classes=self._num_classes)
            elif options.attention_type == nod2_model_pb2.NOD2Model.PER_CLASS_TANH:
                (midn_class_logits, midn_proposal_scores,
                 midn_proba_r_given_c) = self._build_midn_network_tanh(
                     num_proposals,
                     proposal_features,
                     num_classes=self._num_classes)
            else:
                raise ValueError('Invalid attention type.')

        predictions.update({
            DetectionResultFields.class_labels:
            tf.constant(self._vocabulary_list),
            DetectionResultFields.num_proposals:
            num_proposals,
            DetectionResultFields.proposal_boxes:
            proposals,
            NOD2Predictions.midn_class_logits:
            midn_class_logits,
            NOD2Predictions.midn_proba_r_given_c:
            midn_proba_r_given_c,
            NOD2Predictions.oicr_proposal_scores + '_at_0':
            midn_proposal_scores
        })

        # Post process to get final predictions.

        if post_process:
            predictions.update(self._post_process(inputs, predictions))

        return predictions
Example #17
  def build_prediction(self, examples, **kwargs):
    """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: additional keyword arguments, e.g., the specific prediction
        task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
    options = self._model_proto
    is_training = self._is_training

    # Image CNN features.

    inputs = examples[InputDataFields.image]
    image_features = model_utils.calc_cnn_feature(
        inputs, options.cnn_options, is_training=is_training)

    with slim.arg_scope(
        build_hyperparams(options.image_fc_hyperparams, is_training)):
      image_features = slim.fully_connected(
          image_features,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='image')

    # Text Global-Average-Pooling features.

    (image_id, num_captions, caption_strings,
     caption_lengths) = (examples[InputDataFields.image_id],
                         examples[InputDataFields.num_captions],
                         examples[InputDataFields.caption_strings],
                         examples[InputDataFields.caption_lengths])
    image_id = tf.string_to_number(image_id, out_type=tf.int64)

    (image_ids_gathered, caption_strings_gathered,
     caption_lengths_gathered) = model_utils.gather_in_batch_captions(
         image_id, num_captions, caption_strings, caption_lengths)

    (caption_token_ids_gathered,
     caption_features_gathered) = self._extract_text_feature(
         caption_strings_gathered,
         caption_lengths_gathered,
         vocabulary_list=self._open_vocabulary_list,
         initial_embedding=self._open_vocabulary_initial_embedding,
         embedding_dims=options.embedding_dims,
         trainable=options.train_word_embedding,
         max_norm=None)

    with slim.arg_scope(
        build_hyperparams(options.text_fc_hyperparams, is_training)):
      if visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
        attn = slim.fully_connected(
            caption_features_gathered,
            num_outputs=1,
            activation_fn=None,
            scope='caption_attn')
        attn = tf.squeeze(attn, axis=-1)
      caption_features_gathered = slim.fully_connected(
          caption_features_gathered,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='caption')

    oov = len(self._open_vocabulary_list)
    caption_masks_gathered = tf.logical_not(
        tf.equal(caption_token_ids_gathered, oov))
    caption_masks_gathered = tf.to_float(caption_masks_gathered)

    if visual_w2v_model_pb2.VisualW2vModel.GAP == options.text_feature_extractor:
      caption_features_gathered = utils.masked_avg_nd(
          data=caption_features_gathered, mask=caption_masks_gathered, dim=1)
      caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
    elif visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
      attn = utils.masked_softmax(attn, mask=caption_masks_gathered, dim=-1)
      caption_features_gathered = tf.multiply(
          tf.expand_dims(attn, axis=-1), caption_features_gathered)
      caption_features_gathered = utils.masked_sum_nd(
          caption_features_gathered, mask=caption_masks_gathered, dim=1)
      caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
    else:
      raise ValueError('Invalid text feature extractor.')

    # Export token embeddings.

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      _, token_embeddings = self._encode_tokens(
          tokens=tf.constant(self._open_vocabulary_list),
          embedding_dims=options.embedding_dims,
          vocabulary_list=self._open_vocabulary_list,
          initial_embedding=self._open_vocabulary_initial_embedding,
          trainable=options.train_word_embedding)
      with slim.arg_scope(
          build_hyperparams(options.text_fc_hyperparams, is_training)):
        token_embeddings = slim.fully_connected(
            token_embeddings,
            num_outputs=options.shared_dims,
            activation_fn=None,
            scope='caption')
    var_to_assign = tf.get_variable(
        name='weights_proj',
        shape=[len(self._open_vocabulary_list), options.shared_dims])
    var_to_assign = tf.assign(var_to_assign, token_embeddings)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, var_to_assign)

    tf.summary.histogram('token_embedding_proj', token_embeddings)

    # Compute similarity.

    similarity = model_utils.calc_pairwise_similarity(
        feature_a=image_features,
        feature_b=caption_features_gathered,
        l2_normalize=True,
        dropout_keep_prob=options.cross_modal_dropout_keep_prob,
        is_training=is_training)

    predictions = {
        VisualW2vPredictions.image_id: image_id,
        VisualW2vPredictions.image_ids_gathered: image_ids_gathered,
        VisualW2vPredictions.similarity: similarity,
        VisualW2vPredictions.word2vec: var_to_assign,
    }
    return predictions
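
A minimal sketch of `model_utils.calc_pairwise_similarity`, assuming it computes the cosine similarity between every image feature and every gathered caption feature, with optional dropout on both inputs; the real helper may differ.

import tensorflow as tf

def calc_pairwise_similarity(feature_a, feature_b, l2_normalize=True,
                             dropout_keep_prob=1.0, is_training=False):
  """Returns a [num_a, num_b] float similarity matrix.

  Args:
    feature_a: A [num_a, dims] float tensor.
    feature_b: A [num_b, dims] float tensor.
    l2_normalize: If True, L2-normalize rows so the dot product is cosine.
    dropout_keep_prob: Keep probability applied to both inputs in training.
    is_training: If True, apply dropout.
  """
  if l2_normalize:
    feature_a = tf.nn.l2_normalize(feature_a, axis=-1)
    feature_b = tf.nn.l2_normalize(feature_b, axis=-1)
  if is_training and dropout_keep_prob < 1.0:
    feature_a = tf.nn.dropout(feature_a, keep_prob=dropout_keep_prob)
    feature_b = tf.nn.dropout(feature_b, keep_prob=dropout_keep_prob)
  return tf.matmul(feature_a, feature_b, transpose_b=True)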
Example #18
    def _build_prediction(self, examples, post_process=True):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      post_process: if True, post-processes the predictions.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        # Gather image and proposals.

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        tf.summary.image('inputs', inputs, max_outputs=10)
        model_utils.visl_proposals(inputs,
                                   num_proposals,
                                   proposals,
                                   name='proposals',
                                   top_k=100)

        # Gather in-batch captions.

        (image_id, num_captions, caption_strings,
         caption_lengths) = (examples[InputDataFields.image_id],
                             examples[InputDataFields.num_captions],
                             examples[InputDataFields.caption_strings],
                             examples[InputDataFields.caption_lengths])
        image_id = tf.string_to_number(image_id, out_type=tf.int64)

        batch = utils.get_tensor_shape(image_id)[0]
        caption_indices0 = tf.range(batch, dtype=tf.int32)
        caption_indices1 = tf.mod(
            tf.random_uniform([batch], maxval=9999, dtype=tf.int32),
            num_captions)
        caption_indices = tf.stack([caption_indices0, caption_indices1],
                                   axis=-1)

        (caption_strings,
         caption_lengths) = (tf.gather_nd(caption_strings, caption_indices),
                             tf.gather_nd(caption_lengths, caption_indices))

        # Word embedding

        caption_features = self._extract_text_feature(
            caption_strings,
            caption_lengths,
            vocabulary_list=self._open_vocabulary_list,
            embedding_dims=options.embedding_dims,
            trainable=options.train_word_embedding)

        # FRCNN.

        proposal_features = self._extract_frcnn_feature(
            inputs, num_proposals, proposals)

        # Build the OICR network.
        #   proposal_scores shape = [batch, max_num_proposals, 1 + num_classes].
        #   See `Multiple Instance Detection Network with OICR`.

        predictions = {}
        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            for i in range(options.oicr_iterations):
                predictions[NOD3Predictions.oicr_proposal_scores +
                            '_at_{}'.format(
                                i +
                                1)] = proposal_scores = slim.fully_connected(
                                    proposal_features,
                                    num_outputs=1 + self._num_classes,
                                    activation_fn=None,
                                    scope='oicr/iter{}'.format(i + 1))

        if post_process and options.HasField('pcl_preprocess'):
            proposal_scores = tf.nn.softmax(tf.stop_gradient(proposal_scores),
                                            axis=-1)[:, :, 1:]
            (num_proposals, proposals, _, _,
             additional_fields) = self._pcl_preprocess_fn(
                 proposals, proposal_scores,
                 {'proposal_features': proposal_features})
            proposal_features = additional_fields['proposal_features']

        # Build MIDN network, for both image and text.
        #   class_logits shape = [batch, num_classes]
        #   proposal_scores shape = [batch, max_num_proposals, num_classes].
        #   proba_r_given_c shape = [batch, max_num_proposals, num_classes].

        assert options.attention_type == nod3_model_pb2.NOD3Model.PER_CLASS

        with slim.arg_scope(
                build_hyperparams(options.fc_hyperparams, is_training)):
            (midn_class_logits, midn_proposal_scores,
             midn_proba_r_given_c) = self._build_midn_network(
                 num_proposals,
                 proposal_features,
                 num_classes=self._num_classes,
                 name_scope='image_midn',
                 var_scope='image_midn')

        with slim.arg_scope(
                build_hyperparams(options.text_fc_hyperparams, is_training)):
            (text_class_logits, text_proposal_scores,
             text_proba_r_given_c) = self._build_midn_network(
                 caption_lengths,
                 caption_features,
                 num_classes=self._num_classes,
                 name_scope='text_midn',
                 var_scope='text_midn')

        # Compute image-text similarity.

        tf.summary.histogram('triplet/image_logits', midn_class_logits)
        tf.summary.histogram('triplet/text_logits', text_class_logits)

        with tf.name_scope('calc_cross_modal_similarity'):
            similarity = model_utils.calc_pairwise_similarity(
                feature_a=midn_class_logits,
                feature_b=text_class_logits,
                l2_normalize=True,
                dropout_keep_prob=options.cross_modal_dropout_keep_prob,
                is_training=is_training)

        predictions.update({
            DetectionResultFields.class_labels:
            tf.constant(self._vocabulary_list),
            DetectionResultFields.num_proposals:
            num_proposals,
            DetectionResultFields.proposal_boxes:
            proposals,
            NOD3Predictions.midn_class_logits:
            midn_class_logits,
            NOD3Predictions.midn_proba_r_given_c:
            midn_proba_r_given_c,
            NOD3Predictions.oicr_proposal_scores + '_at_0':
            midn_proposal_scores,
            NOD3Predictions.training_only_caption_strings:
            caption_strings,
            NOD3Predictions.training_only_caption_lengths:
            caption_lengths,
            NOD3Predictions.image_id:
            image_id,
            NOD3Predictions.similarity:
            similarity,
        })

        # Post process to get final predictions.

        if post_process:
            predictions.update(self._post_process(inputs, predictions))

        return predictions
Example #19
    def build_prediction(self, examples, **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: additional keyword arguments, e.g., the specific prediction
        task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        predictions = {
            DetectionResultFields.num_proposals: num_proposals,
            DetectionResultFields.proposal_boxes: proposals,
        }

        # FRCNN.
        #   `proposal_features` shape = [batch, max_num_proposals, feature_dims].
        #   `proposal_masks` shape = [batch, max_num_proposals].

        proposal_features = self._extract_frcnn_feature(
            inputs, num_proposals, proposals)

        batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
        proposal_masks = tf.sequence_mask(num_proposals,
                                          maxlen=max_num_proposals,
                                          dtype=tf.float32)

        # Build the SADDN predictions.
        #   `logits_c_given_r` shape = [batch, max_num_proposals, num_classes].
        #   `logits_r_given_c` shape = [batch, max_num_proposals, num_classes].

        with tf.variable_scope('SADDN'), \
            slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):

            logits_c_given_r = slim.fully_connected(
                proposal_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='proba_c_given_r')
            logits_r_given_c = slim.fully_connected(
                proposal_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='proba_r_given_c')

            proba_c_given_r = tf.nn.softmax(logits_c_given_r)
            proba_r_given_c = utils.masked_softmax(
                data=logits_r_given_c,
                mask=tf.expand_dims(proposal_masks, axis=-1),
                dim=1)
            proba_r_given_c = tf.multiply(
                tf.expand_dims(proposal_masks, axis=-1), proba_r_given_c)

        tf.summary.image('inputs', inputs, max_outputs=10)
        model_utils.visl_proposals(inputs,
                                   num_proposals,
                                   proposals,
                                   name='proposals',
                                   top_k=2000)

        # SADDN iterations.

        logits_at_0 = utils.masked_avg_nd(data=logits_c_given_r,
                                          mask=proposal_masks,
                                          dim=1)
        logits_at_0 = tf.squeeze(logits_at_0, axis=1)

        logits_at_i = logits_at_0
        for i in range(options.saddn_iterations):
            # Infer proba_c at iteration `i`.

            proba_c_at_i = tf.nn.softmax(logits_at_i)

            # Infer proba_r by marginalizing proba_r_given_c over classes.
            proba_r_at_i = tf.multiply(tf.expand_dims(proba_c_at_i, axis=1),
                                       proba_r_given_c)
            proba_r_at_i = tf.reduce_sum(proba_r_at_i, axis=-1, keepdims=True)

            # Infer the detection results at iter `i`.

            (num_detections_at_i, detection_boxes_at_i, detection_scores_at_i,
             detection_classes_at_i) = model_utils.post_process(
                 proposals, proba_r_at_i * proba_c_given_r)

            (predictions[StackedAttnPredictions.logits + '_at_{}'.format(i)],
             predictions[DetectionResultFields.num_detections +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_boxes +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_scores +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_classes +
                         '_at_{}'.format(i)]) = (logits_at_i,
                                                 num_detections_at_i,
                                                 detection_boxes_at_i,
                                                 detection_scores_at_i,
                                                 detection_classes_at_i)

            model_utils.visl_proposals_top_k(
                inputs,
                num_detections_at_i,
                detection_boxes_at_i,
                detection_scores_at_i,
                tf.gather(self._vocabulary_list,
                          tf.to_int32(detection_classes_at_i - 1)),
                name='detection_{}'.format(i))

            # `logits_at_i` for the next iteration.

            logits_at_i = tf.multiply(proba_r_at_i, logits_c_given_r)
            logits_at_i = tf.reduce_sum(logits_at_i, axis=1)

        return predictions
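
Hedged sketches of the two `utils` helpers this example leans on, with shapes inferred from the call sites above; the actual implementations may differ.

import tensorflow as tf

def masked_softmax(data, mask, dim=1):
  """Softmax over `dim`; masked-out entries get near-zero probability.

  `mask` must broadcast against `data`, e.g. data of shape [batch, n, c]
  with mask of shape [batch, n, 1], as in the call above.
  """
  return tf.nn.softmax(data + (mask - 1.0) * 1e9, axis=dim)

def masked_avg_nd(data, mask, dim=1):
  """Masked mean over `dim`, keeping the reduced dimension as size 1.

  Args:
    data: A [batch, n, c] float tensor.
    mask: A [batch, n] float tensor of 0/1 values.
  """
  mask = tf.expand_dims(mask, axis=-1)
  summed = tf.reduce_sum(data * mask, axis=dim, keepdims=True)
  count = tf.maximum(tf.reduce_sum(mask, axis=dim, keepdims=True), 1e-12)
  return summed / count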
Example #20
    def _calc_conv_proposal_feature(self, image_feature_cropped):
        """Calculates proposal feature using spp.

    Args:
      image_feature_cropped: A [batch, crop_size, crop_size, feature_dims]
        float tensor.

    Returns:
      proposal_feature: A [batch, proposal_feature_dims] float tensor.
    """
        options = self._model_proto
        is_training = self._is_training

        with slim.arg_scope(
                build_hyperparams(options.conv_hyperparams, is_training)):
            net = image_feature_cropped

            with tf.variable_scope('conv_hidden_layers'):

                with tf.variable_scope('reduce'):
                    net = slim.conv2d(net,
                                      num_outputs=96,
                                      kernel_size=[1, 1],
                                      padding='SAME')
                    net = slim.dropout(net,
                                       options.hidden_dropout_keep_prob,
                                       is_training=is_training)

                for i in range(options.hidden_layers):

                    with tf.variable_scope('layer_{}'.format(i)):
                        with tf.variable_scope('branch_0'):
                            branch_0 = slim.conv2d(net,
                                                   64, [3, 3],
                                                   stride=2,
                                                   padding='VALID',
                                                   scope='conv2d_3x3')
                            branch_0 = slim.dropout(
                                branch_0,
                                options.hidden_dropout_keep_prob,
                                is_training=is_training)
                        with tf.variable_scope('branch_1'):
                            branch_1 = slim.max_pool2d(net, [3, 3],
                                                       stride=2,
                                                       padding='VALID',
                                                       scope='maxpool_3x3')
                        net = tf.concat([branch_0, branch_1], axis=-1)
                        tf.logging.info('layer output: %s', net.get_shape())

            proposal_feature = tf.squeeze(net, [1, 2])

        tf.logging.info('proposal_feature: %s', proposal_feature)
        return proposal_feature