Example no. 1
0
  def _serving_model_graph(features, params):
    """Constructs the inference-mode model graph used for serving.

    Closure over the `output_*` flags of the enclosing scope, which select
    the optional extra outputs.

    Args:
      features: dict of input tensors; 'source_id' and 'image_info' are read
        when the corresponding output flags are set.
      params: dict of model hyperparameters forwarded to the model builder.

    Returns:
      A dict of model output tensors, optionally augmented with 'source_id',
      'image_info' and 'detection_features', with 'detection_boxes' rewritten
      to normalized coordinates when requested.
    """
    outputs = mask_rcnn_model.build_model_graph(
        features, labels=None, is_training=False, params=params)

    if output_source_id:
      outputs['source_id'] = features['source_id']

    if output_image_info:
      outputs['image_info'] = features['image_info']

    # Keep a handle on the boxes before they are possibly rewritten below.
    detected_boxes = outputs['detection_boxes']

    if output_box_features:
      # Re-pool ROI features for the final boxes and run them through the box
      # head to expose per-detection feature vectors.
      roi_feats = spatial_transform_ops.multilevel_crop_and_resize(
          outputs['fpn_feats'], outputs['detection_boxes'], output_size=7)
      _, _, box_feats = heads.box_head(
          roi_feats, num_classes=params['num_classes'],
          mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
      outputs['detection_features'] = box_feats

    if output_normalized_coordinates:
      # image_info[:, 0] / [:, 1] are presumably the image height / width
      # used for normalization — confirm against the input pipeline.
      outputs['detection_boxes'] = box_utils.to_normalized_coordinates(
          detected_boxes,
          tf.expand_dims(features['image_info'][:, 0], axis=-1),
          tf.expand_dims(features['image_info'][:, 1], axis=-1))

    return outputs
Example no. 2
0
def build_model_graph(features, labels, is_training, params):
    """Builds the forward model graph.

    Runs the full Mask R-CNN forward pass: backbone (ResNet or MnasNet) ->
    FPN -> RPN -> proposal sampling (training only) -> box head -> either
    detection post-processing (inference) or target encoding (training),
    and, when params['include_mask'] is set, the mask branch.

    Args:
        features: dict of input tensors; reads 'images' and 'image_info'.
            'source_ids' is filled with -1s when absent.
        labels: dict with 'gt_boxes', 'gt_classes' and 'cropped_gt_masks';
            only read when is_training is True.
        is_training: Python bool selecting training vs inference branches.
        params: dict of hyperparameters (backbone choice, FPN levels,
            anchor/RPN/NMS settings, head dimensions, ...).

    Returns:
        model_outputs: dict of output tensors. Inference mode: detection
            outputs ('num_detections', 'detection_boxes', ...) plus
            'detection_masks' when include_mask. Training mode: RPN/box
            outputs with their encoded targets, plus mask outputs/targets
            when include_mask.
    """
    # Batched NMS is only used off-TPU.
    use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
    # GPU inference delays integer casts (floats are preferred); several ops
    # below take is_gpu_inference to honor that.
    is_gpu_inference = (not is_training and use_batched_nms)
    model_outputs = {}

    if is_training:
        if params['transpose_input']:
            # NOTE(review): permutation [2, 0, 1, 3] implies a transposed
            # input layout from the input pipeline — confirm against input_fn.
            features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
    batch_size, image_height, image_width, _ = (
        features['images'].get_shape().as_list())

    # Handles space-to-depth transform.
    conv0_space_to_depth_block_size = 0
    if is_training:
        conv0_space_to_depth_block_size = params[
            'conv0_space_to_depth_block_size']
        # Space-to-depth shrinks each spatial dim by the block size, so the
        # static shape read above is rescaled back to the true image size
        # for anchor generation.
        image_height *= conv0_space_to_depth_block_size
        image_width *= conv0_space_to_depth_block_size

    if 'source_ids' not in features:
        # Dummy ids so downstream code can rely on the key being present.
        features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)

    all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  (image_height, image_width))

    # Backbone: ResNet or MnasNet, selected by name.
    if 'resnet' in params['backbone']:
        with tf.variable_scope(params['backbone']):
            resnet_fn = resnet.resnet_v1(
                params['backbone'],
                conv0_kernel_size=params['conv0_kernel_size'],
                conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
                num_batch_norm_group=params['num_batch_norm_group'])
            backbone_feats = resnet_fn(
                features['images'], (params['is_training_bn'] and is_training))
    elif 'mnasnet' in params['backbone']:
        with tf.variable_scope(params['backbone']):
            _, endpoints = mnasnet_models.build_mnasnet_base(
                features['images'],
                params['backbone'],
                training=(params['is_training_bn'] and is_training),
                override_params={'use_keras': False})

            # Map MnasNet reduction endpoints to feature levels 2..5.
            backbone_feats = {
                2: endpoints['reduction_2'],
                3: endpoints['reduction_3'],
                4: endpoints['reduction_4'],
                5: endpoints['reduction_5'],
            }
    else:
        raise ValueError('Not a valid backbone option: %s' %
                         params['backbone'])

    fpn_feats = fpn.fpn(backbone_feats, params['min_level'],
                        params['max_level'])
    model_outputs.update({
        'fpn_features': fpn_feats,
    })

    # RPN head. Note len(list * n) == len(list) * n, i.e. the number of
    # anchors per location.
    rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
        fpn_feats, params['min_level'], params['max_level'],
        len(params['aspect_ratios'] * params['num_scales']))

    if is_training:
        rpn_pre_nms_topn = params['rpn_pre_nms_topn']
        rpn_post_nms_topn = params['rpn_post_nms_topn']
    else:
        rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
        rpn_post_nms_topn = params['test_rpn_post_nms_topn']

    rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
        rpn_score_outputs,
        rpn_box_outputs,
        all_anchors,
        features['image_info'],
        rpn_pre_nms_topn,
        rpn_post_nms_topn,
        params['rpn_nms_threshold'],
        params['rpn_min_size'],
        bbox_reg_weights=None,
        use_batched_nms=use_batched_nms)
    rpn_box_rois = tf.to_float(rpn_box_rois)
    if is_training:
        # Proposals are treated as fixed inputs to the second stage; do not
        # backprop through the RPN outputs here.
        rpn_box_rois = tf.stop_gradient(rpn_box_rois)
        rpn_box_scores = tf.stop_gradient(rpn_box_scores)

    if is_training:
        # Sampling
        box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
            training_ops.proposal_label_op(
                rpn_box_rois,
                labels['gt_boxes'],
                labels['gt_classes'],
                features['image_info'],
                batch_size_per_im=params['batch_size_per_im'],
                fg_fraction=params['fg_fraction'],
                fg_thresh=params['fg_thresh'],
                bg_thresh_hi=params['bg_thresh_hi'],
                bg_thresh_lo=params['bg_thresh_lo']))

    # Performs multi-level RoIAlign.
    box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_feats,
        rpn_box_rois,
        output_size=7,
        is_gpu_inference=is_gpu_inference)

    class_outputs, box_outputs, _ = heads.box_head(
        box_roi_features,
        num_classes=params['num_classes'],
        mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

    if not is_training:
        # Detection generation differs by device: batched-NMS GPU path vs
        # the TPU implementation.
        if is_gpu_inference:
            generate_detections_fn = postprocess_ops.generate_detections_gpu
        else:
            generate_detections_fn = postprocess_ops.generate_detections_tpu
        detections = generate_detections_fn(
            class_outputs, box_outputs, rpn_box_rois, features['image_info'],
            params['test_rpn_post_nms_topn'],
            params['test_detections_per_image'], params['test_nms'],
            params['bbox_reg_weights'])

        model_outputs.update({
            'num_detections': detections[0],
            'detection_boxes': detections[1],
            'detection_classes': detections[2],
            'detection_scores': detections[3],
        })
    else:
        # Encode sampled proposals against their matched ground-truth boxes
        # to form regression targets.
        encoded_box_targets = training_ops.encode_box_targets(
            rpn_box_rois, box_targets, class_targets,
            params['bbox_reg_weights'])
        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
            'class_targets': class_targets,
            'box_targets': encoded_box_targets,
            'box_rois': rpn_box_rois,
        })

    # Faster-RCNN mode.
    if not params['include_mask']:
        return model_outputs

    # Mask sampling
    if not is_training:
        selected_box_rois = model_outputs['detection_boxes']
        class_indices = model_outputs['detection_classes']
        # If using GPU for inference, delay the cast until when Gather ops show up
        # since GPU inference supports float point better.
        # TODO(laigd): revisit this when newer versions of GPU libraries is
        # released.
        if not is_gpu_inference:
            class_indices = tf.to_int32(class_indices)
    else:
        # Restrict the mask head to foreground proposals only.
        (selected_class_targets, selected_box_targets, selected_box_rois,
         proposal_to_label_map) = (training_ops.select_fg_for_masks(
             class_targets,
             box_targets,
             rpn_box_rois,
             proposal_to_label_map,
             max_num_fg=int(params['batch_size_per_im'] *
                            params['fg_fraction'])))
        class_indices = tf.to_int32(selected_class_targets)

    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_feats,
        selected_box_rois,
        output_size=14,
        is_gpu_inference=is_gpu_inference)
    mask_outputs = heads.mask_head(mask_roi_features,
                                   class_indices,
                                   num_classes=params['num_classes'],
                                   mrcnn_resolution=params['mrcnn_resolution'],
                                   is_gpu_inference=is_gpu_inference)

    if is_training:
        mask_targets = training_ops.get_mask_targets(
            selected_box_rois, proposal_to_label_map, selected_box_targets,
            labels['cropped_gt_masks'], params['mrcnn_resolution'])
        model_outputs.update({
            'mask_outputs': mask_outputs,
            'mask_targets': mask_targets,
            'selected_class_targets': selected_class_targets,
        })
    else:
        # Raw mask logits -> per-pixel probabilities for serving.
        model_outputs.update({
            'detection_masks': tf.nn.sigmoid(mask_outputs),
        })

    return model_outputs
Example no. 3
0
def build_model_graph(features, labels, is_training, params):
    """Builds the forward model graph.

    ResNet backbone -> FPN -> RPN -> proposal sampling (training only) ->
    box head -> packed detections (inference) or target encoding (training),
    plus the mask branch when params['include_mask'] is set. Device split
    (TPU vs GPU) is driven by params['use_tpu'].

    Args:
        features: dict of input tensors; reads 'images' and 'image_info'.
            'source_ids' is filled with -1s when absent.
        labels: dict with 'gt_boxes', 'gt_classes' and 'cropped_gt_masks';
            only read when is_training is True.
        is_training: Python bool selecting training vs inference branches.
        params: dict of hyperparameters (resnet depth, FPN levels,
            anchor/RPN/NMS settings, head dimensions, ...).

    Returns:
        model_outputs: dict of output tensors. Inference mode: a packed
            'detections' tensor (and 'box_features' when requested);
            training mode: RPN/box outputs with targets; plus mask
            outputs/targets when include_mask.
    """
    model_outputs = {}

    if params['transpose_input'] and is_training:
        # NOTE(review): permutation [3, 0, 1, 2] implies a transposed input
        # layout from the input pipeline — confirm against input_fn.
        features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
    batch_size, image_height, image_width, _ = (
        features['images'].get_shape().as_list())
    if 'source_ids' not in features:
        # Dummy ids so downstream code can rely on the key being present.
        features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)

    all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  (image_height, image_width))

    # ResNet backbone; batch norm only trains when both flags allow it.
    with tf.variable_scope('resnet%s' % params['resnet_depth']):
        resnet_fn = resnet.resnet_v1(
            params['resnet_depth'],
            num_batch_norm_group=params['num_batch_norm_group'])
        backbone_feats = resnet_fn(features['images'],
                                   (params['is_training_bn'] and is_training))

    fpn_feats = fpn.fpn(backbone_feats, params['min_level'],
                        params['max_level'])

    # RPN head. Note len(list * n) == len(list) * n, i.e. the number of
    # anchors per location.
    rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
        fpn_feats, params['min_level'], params['max_level'],
        len(params['aspect_ratios'] * params['num_scales']))

    if is_training:
        rpn_pre_nms_topn = params['rpn_pre_nms_topn']
        rpn_post_nms_topn = params['rpn_post_nms_topn']
    else:
        rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
        rpn_post_nms_topn = params['test_rpn_post_nms_topn']

    rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
        rpn_score_outputs,
        rpn_box_outputs,
        all_anchors,
        features['image_info'],
        rpn_pre_nms_topn,
        rpn_post_nms_topn,
        params['rpn_nms_threshold'],
        params['rpn_min_size'],
        bbox_reg_weights=None,
        use_tpu=params['use_tpu'])
    rpn_box_rois = tf.to_float(rpn_box_rois)
    if is_training:
        # Proposals are treated as fixed inputs to the second stage; do not
        # backprop through the RPN outputs here.
        rpn_box_rois = tf.stop_gradient(rpn_box_rois)
        rpn_box_scores = tf.stop_gradient(rpn_box_scores)

    if is_training:
        # Sampling
        box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
            training_ops.proposal_label_op(
                rpn_box_rois,
                labels['gt_boxes'],
                labels['gt_classes'],
                features['image_info'],
                batch_size_per_im=params['batch_size_per_im'],
                fg_fraction=params['fg_fraction'],
                fg_thresh=params['fg_thresh'],
                bg_thresh_hi=params['bg_thresh_hi'],
                bg_thresh_lo=params['bg_thresh_lo']))

    # Performs multi-level RoIAlign.
    box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_feats, rpn_box_rois, output_size=7)

    class_outputs, box_outputs, _ = heads.box_head(
        box_roi_features,
        num_classes=params['num_classes'],
        mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

    if not is_training:
        # Same arguments, device-specific implementations.
        if params['use_tpu']:
            detections = postprocess_ops.generate_detections_tpu(
                class_outputs, box_outputs, rpn_box_rois,
                features['source_ids'], features['image_info'],
                params['test_rpn_post_nms_topn'],
                params['test_detections_per_image'], params['test_nms'],
                params['bbox_reg_weights'])
        else:
            detections = postprocess_ops.generate_detections_gpu(
                class_outputs, box_outputs, rpn_box_rois,
                features['source_ids'], features['image_info'],
                params['test_rpn_post_nms_topn'],
                params['test_detections_per_image'], params['test_nms'],
                params['bbox_reg_weights'])

        # Named identity gives the output node a stable name ('Detections').
        model_outputs.update({
            'detections':
            tf.identity(detections, 'Detections'),
        })
        if params['output_box_features']:
            # Columns 1:5 of each detection row are presumably the box
            # coordinates (per the generate_detections_* layout — confirm).
            final_box_rois = detections[:, :, 1:5]
            final_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
                fpn_feats, final_box_rois, output_size=7)
            _, _, final_box_features = heads.box_head(
                final_roi_features,
                num_classes=params['num_classes'],
                mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
            model_outputs.update({
                'box_features':
                tf.identity(final_box_features, 'BoxFeatures'),
            })
    else:
        # Encode sampled proposals against their matched ground-truth boxes
        # to form regression targets.
        encoded_box_targets = training_ops.encode_box_targets(
            rpn_box_rois, box_targets, class_targets,
            params['bbox_reg_weights'])
        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
            'class_targets': class_targets,
            'box_targets': encoded_box_targets,
            'box_rois': rpn_box_rois,
        })

    # Faster-RCNN mode.
    if not params['include_mask']:
        return model_outputs

    # Mask sampling
    if not is_training:
        # Columns 1:5 are the boxes, column 6 the class id (packed layout
        # shared with the box-features branch above — confirm).
        selected_box_rois = detections[:, :, 1:5]
        class_indices = tf.to_int32(detections[:, :, 6])
    else:
        # Restrict the mask head to foreground proposals only.
        (selected_class_targets, selected_box_targets, selected_box_rois,
         proposal_to_label_map) = (training_ops.select_fg_for_masks(
             class_targets,
             box_targets,
             rpn_box_rois,
             proposal_to_label_map,
             max_num_fg=int(params['batch_size_per_im'] *
                            params['fg_fraction'])))
        class_indices = tf.to_int32(selected_class_targets)

    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_feats, selected_box_rois, output_size=14)
    mask_outputs = heads.mask_head(mask_roi_features,
                                   class_indices,
                                   num_classes=params['num_classes'],
                                   mrcnn_resolution=params['mrcnn_resolution'])

    model_outputs.update({
        'mask_outputs': mask_outputs,
    })

    if is_training:
        mask_targets = training_ops.get_mask_targets(
            selected_box_rois, proposal_to_label_map, selected_box_targets,
            labels['cropped_gt_masks'], params['mrcnn_resolution'])
        model_outputs.update({
            'mask_targets': mask_targets,
            'selected_class_targets': selected_class_targets,
        })
    else:
        # Logits -> probabilities, with a stable node name ('Masks').
        model_outputs['mask_outputs'] = tf.identity(
            tf.nn.sigmoid(model_outputs['mask_outputs']), 'Masks')

    return model_outputs
Example no. 4
0
    def _model_outputs():
        """Generates outputs from the model.

        Closure over `mode`, `params`, `features`, `labels` and
        `all_anchors` from the enclosing scope. Builds ResNet backbone ->
        FPN -> RPN -> box head, then per-image detection generation
        (non-train modes) or target encoding (train), plus the mask branch
        when params['include_mask'] is set.

        Returns:
            model_outputs: dict of output tensors ('detections' in eval
                mode; RPN/box outputs and targets in train mode; plus mask
                outputs/targets when include_mask).
        """

        model_outputs = {}

        # ResNet backbone; note batch norm trainability follows only
        # params['is_training_bn'] here (not the mode).
        with tf.variable_scope('resnet%s' % params['resnet_depth']):
            resnet_fn = resnet.resnet_v1(
                params['resnet_depth'],
                num_batch_norm_group=params['num_batch_norm_group'])
            backbone_feats = resnet_fn(features['images'],
                                       params['is_training_bn'])

        fpn_feats = fpn.fpn(backbone_feats, params['min_level'],
                            params['max_level'])

        # RPN head. Note len(list * n) == len(list) * n, i.e. the number of
        # anchors per location.
        rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
            fpn_feats, params['min_level'], params['max_level'],
            len(params['aspect_ratios'] * params['num_scales']))

        if mode == tf.estimator.ModeKeys.TRAIN:
            rpn_pre_nms_topn = params['rpn_pre_nms_topn']
            rpn_post_nms_topn = params['rpn_post_nms_topn']
        else:
            rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
            rpn_post_nms_topn = params['test_rpn_post_nms_topn']

        # RPN scores are discarded here; only the proposal boxes are used.
        _, rpn_box_rois = mask_rcnn_architecture.proposal_op(
            rpn_score_outputs, rpn_box_outputs, all_anchors,
            features['image_info'], rpn_pre_nms_topn, rpn_post_nms_topn,
            params['rpn_nms_threshold'], params['rpn_min_size'])
        rpn_box_rois = tf.to_float(rpn_box_rois)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # Sampling
            box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
                mask_rcnn_architecture.proposal_label_op(
                    rpn_box_rois,
                    labels['gt_boxes'],
                    labels['gt_classes'],
                    features['image_info'],
                    batch_size_per_im=params['batch_size_per_im'],
                    fg_fraction=params['fg_fraction'],
                    fg_thresh=params['fg_thresh'],
                    bg_thresh_hi=params['bg_thresh_hi'],
                    bg_thresh_lo=params['bg_thresh_lo']))

        # Performs multi-level RoIAlign.
        box_roi_features = ops.multilevel_crop_and_resize(fpn_feats,
                                                          rpn_box_rois,
                                                          output_size=7)

        class_outputs, box_outputs = heads.box_head(
            box_roi_features,
            num_classes=params['num_classes'],
            mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

        if mode != tf.estimator.ModeKeys.TRAIN:
            # Generate detections one image at a time (no batched op here),
            # then stack back into a batch.
            batch_size, _, _ = class_outputs.get_shape().as_list()
            detections = []
            softmax_class_outputs = tf.nn.softmax(class_outputs)
            for i in range(batch_size):
                detections.append(
                    anchors.generate_detections_per_image_op(
                        softmax_class_outputs[i], box_outputs[i],
                        rpn_box_rois[i], features['source_ids'][i],
                        features['image_info'][i],
                        params['test_detections_per_image'],
                        params['test_rpn_post_nms_topn'], params['test_nms'],
                        params['bbox_reg_weights']))
            detections = tf.stack(detections, axis=0)
            model_outputs.update({
                'detections': detections,
            })
        else:
            # Encode sampled proposals against their matched ground-truth
            # boxes to form regression targets.
            encoded_box_targets = mask_rcnn_architecture.encode_box_targets(
                rpn_box_rois, box_targets, class_targets,
                params['bbox_reg_weights'])
            model_outputs.update({
                'rpn_score_outputs': rpn_score_outputs,
                'rpn_box_outputs': rpn_box_outputs,
                'class_outputs': class_outputs,
                'box_outputs': box_outputs,
                'class_targets': class_targets,
                'box_targets': encoded_box_targets,
                'box_rois': rpn_box_rois,
            })

        # Faster-RCNN mode.
        if not params['include_mask']:
            return model_outputs

        # Mask sampling
        if mode != tf.estimator.ModeKeys.TRAIN:
            # Columns 1:5 of each detection row are presumably the boxes,
            # column 6 the class id (per generate_detections_per_image_op's
            # packed layout — confirm).
            selected_box_rois = detections[:, :, 1:5]
            class_indices = tf.to_int32(detections[:, :, 6])
        else:
            # Restrict the mask head to foreground proposals only.
            (selected_class_targets, selected_box_targets, selected_box_rois,
             proposal_to_label_map) = (
                 mask_rcnn_architecture.select_fg_for_masks(
                     class_targets,
                     box_targets,
                     rpn_box_rois,
                     proposal_to_label_map,
                     max_num_fg=int(params['batch_size_per_im'] *
                                    params['fg_fraction'])))
            class_indices = tf.to_int32(selected_class_targets)

        mask_roi_features = ops.multilevel_crop_and_resize(fpn_feats,
                                                           selected_box_rois,
                                                           output_size=14)
        mask_outputs = heads.mask_head(
            mask_roi_features,
            class_indices,
            num_classes=params['num_classes'],
            mrcnn_resolution=params['mrcnn_resolution'])

        model_outputs.update({
            'mask_outputs': mask_outputs,
        })

        if mode == tf.estimator.ModeKeys.TRAIN:
            mask_targets = mask_rcnn_architecture.get_mask_targets(
                selected_box_rois, proposal_to_label_map, selected_box_targets,
                labels['cropped_gt_masks'], params['mrcnn_resolution'])
            model_outputs.update({
                'mask_targets':
                mask_targets,
                'selected_class_targets':
                selected_class_targets,
            })

        return model_outputs
Example no. 5
0
  def _model_outputs():
    """Generates outputs from the model.

    Closure over `mode`, `params`, `features`, `labels` and `all_anchors`
    from the enclosing scope. Builds ResNet backbone -> FPN -> RPN ->
    box head, then device-specific detection generation (non-train modes)
    or target encoding (train), plus the mask branch when
    params['include_mask'] is set.

    Returns:
        model_outputs: dict of output tensors ('detections' and optionally
            'box_features' in eval mode; RPN/box outputs and targets in
            train mode; plus mask outputs/targets when include_mask).
    """

    model_outputs = {}

    # ResNet backbone; note batch norm trainability follows only
    # params['is_training_bn'] here (not the mode).
    with tf.variable_scope('resnet%s' % params['resnet_depth']):
      resnet_fn = resnet.resnet_v1(
          params['resnet_depth'],
          num_batch_norm_group=params['num_batch_norm_group'])
      backbone_feats = resnet_fn(features['images'], params['is_training_bn'])

    fpn_feats = fpn.fpn(
        backbone_feats, params['min_level'], params['max_level'])

    # RPN head. Note len(list * n) == len(list) * n, i.e. the number of
    # anchors per location.
    rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
        fpn_feats,
        params['min_level'], params['max_level'],
        len(params['aspect_ratios'] * params['num_scales']))

    if mode == tf.estimator.ModeKeys.TRAIN:
      rpn_pre_nms_topn = params['rpn_pre_nms_topn']
      rpn_post_nms_topn = params['rpn_post_nms_topn']
    else:
      rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
      rpn_post_nms_topn = params['test_rpn_post_nms_topn']

    rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
        rpn_score_outputs,
        rpn_box_outputs,
        all_anchors,
        features['image_info'],
        rpn_pre_nms_topn,
        rpn_post_nms_topn,
        params['rpn_nms_threshold'],
        params['rpn_min_size'],
        bbox_reg_weights=None,
        use_tpu=params['use_tpu'])
    rpn_box_rois = tf.to_float(rpn_box_rois)
    if mode == tf.estimator.ModeKeys.TRAIN:
      # Proposals are treated as fixed inputs to the second stage; do not
      # backprop through the RPN outputs here.
      rpn_box_rois = tf.stop_gradient(rpn_box_rois)
      rpn_box_scores = tf.stop_gradient(rpn_box_scores)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # Sampling
      box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
          training_ops.proposal_label_op(
              rpn_box_rois,
              labels['gt_boxes'],
              labels['gt_classes'],
              features['image_info'],
              batch_size_per_im=params['batch_size_per_im'],
              fg_fraction=params['fg_fraction'],
              fg_thresh=params['fg_thresh'],
              bg_thresh_hi=params['bg_thresh_hi'],
              bg_thresh_lo=params['bg_thresh_lo']))

    # Performs multi-level RoIAlign.
    box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_feats, rpn_box_rois, output_size=7)

    class_outputs, box_outputs, _ = heads.box_head(
        box_roi_features, num_classes=params['num_classes'],
        mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

    if mode != tf.estimator.ModeKeys.TRAIN:
      # Same arguments, device-specific implementations.
      if params['use_tpu']:
        detections = postprocess_ops.generate_detections_tpu(
            class_outputs,
            box_outputs,
            rpn_box_rois,
            features['source_ids'],
            features['image_info'],
            params['test_rpn_post_nms_topn'],
            params['test_detections_per_image'],
            params['test_nms'],
            params['bbox_reg_weights'])
      else:
        detections = postprocess_ops.generate_detections_gpu(
            class_outputs,
            box_outputs,
            rpn_box_rois,
            features['source_ids'],
            features['image_info'],
            params['test_rpn_post_nms_topn'],
            params['test_detections_per_image'],
            params['test_nms'],
            params['bbox_reg_weights'])

      # Named identity gives the output node a stable name ('Detections').
      detections = tf.identity(detections, 'Detections')
      model_outputs.update({
          'detections': detections,
      })
      if params['output_box_features']:
        # Columns 1:5 of each detection row are presumably the box
        # coordinates (per the generate_detections_* layout — confirm).
        final_box_rois = detections[:, :, 1:5]
        final_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_feats, final_box_rois, output_size=7)
        _, _, final_box_features = heads.box_head(
            final_roi_features, num_classes=params['num_classes'],
            mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
        final_box_features = tf.identity(final_box_features, 'BoxFeatures')
        model_outputs.update({
            'box_features': final_box_features,
        })
    else:
      # Encode sampled proposals against their matched ground-truth boxes
      # to form regression targets.
      encoded_box_targets = training_ops.encode_box_targets(
          rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
      model_outputs.update({
          'rpn_score_outputs': rpn_score_outputs,
          'rpn_box_outputs': rpn_box_outputs,
          'class_outputs': class_outputs,
          'box_outputs': box_outputs,
          'class_targets': class_targets,
          'box_targets': encoded_box_targets,
          'box_rois': rpn_box_rois,
      })

    # Faster-RCNN mode.
    if not params['include_mask']:
      return model_outputs

    # Mask sampling
    if mode != tf.estimator.ModeKeys.TRAIN:
      # Columns 1:5 are the boxes, column 6 the class id (packed layout
      # shared with the box-features branch above — confirm).
      selected_box_rois = detections[:, :, 1:5]
      class_indices = tf.to_int32(detections[:, :, 6])
    else:
      # Restrict the mask head to foreground proposals only.
      (selected_class_targets, selected_box_targets, selected_box_rois,
       proposal_to_label_map) = (
           training_ops.select_fg_for_masks(
               class_targets, box_targets, rpn_box_rois,
               proposal_to_label_map,
               max_num_fg=int(
                   params['batch_size_per_im'] * params['fg_fraction'])))
      class_indices = tf.to_int32(selected_class_targets)

    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_feats, selected_box_rois, output_size=14)
    mask_outputs = heads.mask_head(
        mask_roi_features,
        class_indices,
        num_classes=params['num_classes'],
        mrcnn_resolution=params['mrcnn_resolution'])

    # Note: raw logits are exported under the node name 'Masks' here (no
    # sigmoid in this variant).
    mask_outputs = tf.identity(mask_outputs, 'Masks')
    model_outputs.update({
        'mask_outputs': mask_outputs,
    })

    if mode == tf.estimator.ModeKeys.TRAIN:
      mask_targets = training_ops.get_mask_targets(
          selected_box_rois, proposal_to_label_map, selected_box_targets,
          labels['cropped_gt_masks'], params['mrcnn_resolution'])
      model_outputs.update({
          'mask_targets': mask_targets,
          'selected_class_targets': selected_class_targets,
      })

    return model_outputs