Ejemplo n.º 1
0
 def testAnchorGeneration(self, min_level, max_level, num_scales,
                          aspect_ratios, anchor_size, expected_boxes):
     """Checks the boxes produced by `anchor.Anchor` for a [64, 64] image size.

     The image size is supplied as a plain Python list. The generated boxes
     are converted to nested lists and compared with `expected_boxes`.
     """
     generator = anchor.Anchor(
         min_level,
         max_level,
         num_scales,
         aspect_ratios,
         anchor_size,
         [64, 64])
     actual_boxes = generator.boxes.numpy().tolist()
     self.assertEqual(expected_boxes, actual_boxes)
Ejemplo n.º 2
0
 def testAnchorGenerationWithImageSizeAsTensor(self, min_level, max_level,
                                               num_scales, aspect_ratios,
                                               anchor_size, expected_boxes):
     """Checks `anchor.Anchor` when the image size is an int32 tensor.

     Same check as the list-based variant, except that the [64, 64] image size
     is wrapped in a `tf.constant` before being handed to the generator.
     """
     size_tensor = tf.constant([64, 64], tf.int32)
     generator = anchor.Anchor(
         min_level,
         max_level,
         num_scales,
         aspect_ratios,
         anchor_size,
         size_tensor)
     actual_boxes = generator.boxes.numpy().tolist()
     self.assertEqual(expected_boxes, actual_boxes)
  def test_num_params(self,
                      num_classes,
                      min_level,
                      max_level,
                      num_scales,
                      aspect_ratios,
                      resnet_model_id,
                      use_separable_conv,
                      fpn_num_filters,
                      head_num_convs,
                      head_num_filters,
                      expected_num_params):
    """Builds a RetinaNet and verifies its total parameter count.

    A ResNet backbone, an FPN decoder and a RetinaNet head are assembled from
    the parameterized arguments. The model is called once in training mode on
    random images, after which `model.count_params()` is compared with
    `expected_num_params`.
    """
    batch_size = 2
    side = 384
    images = np.random.rand(batch_size, side, side, 3)
    image_shape = np.array([[side, side]] * batch_size)
    level_range = dict(min_level=min_level, max_level=max_level)

    # Anchors are generated for one image, then replicated across the batch.
    anchor_boxes = anchor.Anchor(
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3,
        image_size=(side, side),
        **level_range).multilevel_boxes
    for level in anchor_boxes:
      with_batch_dim = tf.expand_dims(anchor_boxes[level], axis=0)
      anchor_boxes[level] = tf.tile(with_batch_dim, [batch_size, 1, 1, 1])

    backbone = resnet.ResNet(model_id=resnet_model_id)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv,
        **level_range)
    head = dense_prediction_heads.RetinaNetHead(
        num_classes=num_classes,
        num_anchors_per_location=num_scales * len(aspect_ratios),
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters,
        **level_range)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator)

    # The forward-pass result itself is not inspected.
    model(images, image_shape, anchor_boxes, training=True)
    self.assertEqual(expected_num_params, model.count_params())
Ejemplo n.º 4
0
def construct_model_and_anchors(image_size, use_gt_boxes_for_masks):
    """Builds a small DeepMaskRCNN model together with its multilevel anchors.

    Args:
        image_size: forwarded to `anchor.Anchor` as `image_size`.
        use_gt_boxes_for_masks: forwarded unchanged to `DeepMaskRCNNModel`.

    Returns:
        A `(model, anchor_boxes)` tuple, where `anchor_boxes` is the
        `multilevel_boxes` attribute of the generated anchors.
    """
    num_classes = 3
    num_scales = 3
    aspect_ratios = [1.0]
    levels = {'min_level': 3, 'max_level': 4}

    anchor_boxes = anchor.Anchor(
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3,
        image_size=image_size,
        **levels).multilevel_boxes

    # The backbone accepts any batch size and spatial size, 3 channels.
    backbone = resnet.ResNet(
        model_id=50,
        input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]))
    decoder = fpn.FPN(input_specs=backbone.output_specs, **levels)
    rpn_head = dense_prediction_heads.RPNHead(
        num_anchors_per_location=len(aspect_ratios) * num_scales, **levels)
    detection_head = instance_heads.DetectionHead(num_classes=num_classes)
    box_components = (
        roi_generator.MultilevelROIGenerator(),
        roi_sampler.ROISampler(),
        roi_aligner.MultilevelROIAligner(),
        detection_generator.DetectionGenerator(),
    )
    mask_components = (
        deep_instance_heads.DeepMaskHead(
            num_classes=num_classes, upsample_factor=2),
        mask_sampler.MaskSampler(mask_target_size=28, num_sampled_masks=1),
        roi_aligner.MultilevelROIAligner(crop_size=14),
    )

    # Positional order: backbone, decoder, RPN head, detection head, the four
    # box components, then the three mask components.
    model = maskrcnn_model.DeepMaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        *box_components,
        *mask_components,
        use_gt_boxes_for_masks=use_gt_boxes_for_masks)

    return model, anchor_boxes
Ejemplo n.º 5
0
    def testEquivalentResult(self, min_level, max_level, aspect_ratios,
                             num_scales, anchor_size, image_size):
        """Checks `build_anchor_generator` against `anchor.Anchor`.

        For every level produced by `anchor.Anchor(...).multilevel_boxes`, the
        boxes returned by the callable from `anchor.build_anchor_generator`
        must be close to the reference boxes.
        """
        generator_fn = anchor.build_anchor_generator(
            min_level=min_level,
            max_level=max_level,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size)
        actual = generator_fn(image_size)

        reference = anchor.Anchor(
            min_level, max_level, num_scales, aspect_ratios, anchor_size,
            image_size).multilevel_boxes
        for level, reference_boxes in reference.items():
            self.assertAllClose(reference_boxes, actual[level])
Ejemplo n.º 6
0
    def test_build_model(self, include_mask, use_separable_conv,
                         build_anchor_boxes, is_training):
        """Smoke test: builds a Mask R-CNN and runs a single forward pass.

        Nothing is asserted on the outputs here; they are checked in
        `test_forward`. The test passes if construction and the call succeed.
        """
        batch_size = 2
        side = 384
        num_classes = 3
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        levels = dict(min_level=3, max_level=7)

        images = np.random.rand(batch_size, side, side, 3)
        image_shape = np.array([[side, side]] * batch_size)

        # Optionally pre-compute anchors and replicate them over the batch;
        # otherwise None is passed to the model.
        anchor_boxes = None
        if build_anchor_boxes:
            anchor_boxes = anchor.Anchor(
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=3,
                image_size=(side, side),
                **levels).multilevel_boxes
            for level in anchor_boxes:
                with_batch_dim = tf.expand_dims(anchor_boxes[level], axis=0)
                anchor_boxes[level] = tf.tile(with_batch_dim,
                                              [batch_size, 1, 1, 1])

        backbone = resnet.ResNet(model_id=50)
        decoder = fpn.FPN(input_specs=backbone.output_specs,
                          use_separable_conv=use_separable_conv,
                          **levels)
        rpn_head = dense_prediction_heads.RPNHead(
            num_anchors_per_location=num_scales * len(aspect_ratios),
            num_convs=1,
            **levels)
        detection_head = instance_heads.DetectionHead(num_classes=num_classes)
        roi_generator_obj = roi_generator.MultilevelROIGenerator()
        roi_sampler_obj = roi_sampler.ROISampler()
        roi_aligner_obj = roi_aligner.MultilevelROIAligner()
        detection_generator_obj = detection_generator.DetectionGenerator()

        # The mask branch is only instantiated when requested.
        mask_head = mask_sampler_obj = mask_roi_aligner_obj = None
        if include_mask:
            mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                                upsample_factor=2)
            mask_sampler_obj = mask_sampler.MaskSampler(mask_target_size=28,
                                                        num_sampled_masks=1)
            mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
                crop_size=14)

        model = maskrcnn_model.MaskRCNNModel(
            backbone,
            decoder,
            rpn_head,
            detection_head,
            roi_generator_obj,
            roi_sampler_obj,
            roi_aligner_obj,
            detection_generator_obj,
            mask_head,
            mask_sampler_obj,
            mask_roi_aligner_obj,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size,
            **levels)

        # Rows of -1 fill the unused ground-truth slots.
        unused_box = [-1, -1, -1, -1]
        gt_boxes = np.array(
            [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], unused_box],
             [[100, 100, 150, 150], unused_box, unused_box]],
            dtype=np.float32)
        gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
        gt_masks = np.ones((2, 3, 100, 100)) if include_mask else None

        model(images,
              image_shape,
              anchor_boxes,
              gt_boxes,
              gt_classes,
              gt_masks,
              training=is_training)
Ejemplo n.º 7
0
    def test_forward(self, strategy, include_mask, build_anchor_boxes,
                     training, use_cascade_heads):
        """Runs a (Cascade) Mask R-CNN forward pass and checks the output keys.

        Args:
            strategy: object whose `scope()` context wraps model construction
                and the forward call.
            include_mask: whether to build the mask head, mask sampler and
                mask RoI aligner and to feed ground-truth masks.
            build_anchor_boxes: whether to pre-compute anchors with
                `anchor.Anchor`; otherwise `None` is passed to the model.
            training: forwarded to the model as `training`; also selects which
                output keys are asserted.
            use_cascade_heads: whether to configure an additional RoI sampler
                (IoU threshold 0.6), class-agnostic box prediction and
                cascade class ensembling.
        """
        num_classes = 3
        min_level = 3
        max_level = 4
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        if use_cascade_heads:
            cascade_iou_thresholds = [0.6]
            class_agnostic_bbox_pred = True
            cascade_class_ensemble = True
        else:
            cascade_iou_thresholds = None
            class_agnostic_bbox_pred = False
            cascade_class_ensemble = False

        image_size = (256, 256)
        images = np.random.rand(2, image_size[0], image_size[1], 3)
        image_shape = np.array([[224, 100], [100, 224]])
        with strategy.scope():
            if build_anchor_boxes:
                anchor_boxes = anchor.Anchor(
                    min_level=min_level,
                    max_level=max_level,
                    num_scales=num_scales,
                    aspect_ratios=aspect_ratios,
                    anchor_size=anchor_size,
                    image_size=image_size).multilevel_boxes
            else:
                anchor_boxes = None
            num_anchors_per_location = len(aspect_ratios) * num_scales

            input_specs = tf.keras.layers.InputSpec(
                shape=[None, None, None, 3])
            backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
            decoder = fpn.FPN(min_level=min_level,
                              max_level=max_level,
                              input_specs=backbone.output_specs)
            rpn_head = dense_prediction_heads.RPNHead(
                min_level=min_level,
                max_level=max_level,
                num_anchors_per_location=num_anchors_per_location)
            detection_head = instance_heads.DetectionHead(
                num_classes=num_classes,
                class_agnostic_bbox_pred=class_agnostic_bbox_pred)
            roi_generator_obj = roi_generator.MultilevelROIGenerator()

            # One default sampler, plus one extra sampler per cascade IoU
            # threshold when cascade heads are enabled.
            roi_sampler_cascade = [roi_sampler.ROISampler()]
            for iou in cascade_iou_thresholds or []:
                roi_sampler_cascade.append(
                    roi_sampler.ROISampler(
                        mix_gt_boxes=False,
                        foreground_iou_threshold=iou,
                        background_iou_high_threshold=iou,
                        background_iou_low_threshold=0.0,
                        skip_subsampling=True))
            roi_aligner_obj = roi_aligner.MultilevelROIAligner()
            detection_generator_obj = detection_generator.DetectionGenerator()
            if include_mask:
                mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                                    upsample_factor=2)
                mask_sampler_obj = mask_sampler.MaskSampler(
                    mask_target_size=28, num_sampled_masks=1)
                mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
                    crop_size=14)
            else:
                mask_head = None
                mask_sampler_obj = None
                mask_roi_aligner_obj = None
            # Pass the whole sampler list. Previously only the most recently
            # created sampler was handed over, so the list built above was
            # dead code and the cascade configuration was never exercised.
            # NOTE(review): assumes MaskRCNNModel accepts a list of samplers
            # (its box logic iterates over `self.roi_sampler`) -- confirm.
            model = maskrcnn_model.MaskRCNNModel(
                backbone,
                decoder,
                rpn_head,
                detection_head,
                roi_generator_obj,
                roi_sampler_cascade,
                roi_aligner_obj,
                detection_generator_obj,
                mask_head,
                mask_sampler_obj,
                mask_roi_aligner_obj,
                class_agnostic_bbox_pred=class_agnostic_bbox_pred,
                cascade_class_ensemble=cascade_class_ensemble,
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size)

            # Rows of -1 fill the unused ground-truth slots.
            gt_boxes = np.array(
                [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
                 [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
                dtype=np.float32)
            gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
            if include_mask:
                gt_masks = np.ones((2, 3, 100, 100))
            else:
                gt_masks = None

            results = model(images,
                            image_shape,
                            anchor_boxes,
                            gt_boxes,
                            gt_classes,
                            gt_masks,
                            training=training)

        # RPN outputs are expected in both modes.
        self.assertIn('rpn_boxes', results)
        self.assertIn('rpn_scores', results)
        if training:
            self.assertIn('class_targets', results)
            self.assertIn('box_targets', results)
            self.assertIn('class_outputs', results)
            self.assertIn('box_outputs', results)
            if include_mask:
                self.assertIn('mask_outputs', results)
        else:
            self.assertIn('detection_boxes', results)
            self.assertIn('detection_scores', results)
            self.assertIn('detection_classes', results)
            self.assertIn('num_detections', results)
            if include_mask:
                self.assertIn('detection_masks', results)
Ejemplo n.º 8
0
    def _call_box_outputs(
        self,
        images: tf.Tensor,
        image_shape: tf.Tensor,
        anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
        gt_boxes: Optional[tf.Tensor] = None,
        gt_classes: Optional[tf.Tensor] = None,
        training: Optional[bool] = None
    ) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
        """Implementation of the Faster-RCNN logic for boxes.

        Runs feature extraction, the RPN head, RoI generation and one
        detection-head pass per entry in `self.roi_sampler` (the cascade).
        When `training` is falsy, the detection generator is also run on the
        outputs of the last cascade stage.

        Args:
          images: the input images; passed to the backbone/decoder and used to
            derive the anchor image size and batch size when `anchor_boxes`
            is None.
          image_shape: passed to the RoI generator, to box clipping and to the
            detection generator.
          anchor_boxes: optional per-level anchors; generated from
            `self._config_dict` when None.
          gt_boxes: forwarded to `_run_frcnn_head`.
          gt_classes: forwarded to `_run_frcnn_head`.
          training: forwarded to the RoI generator and `_run_frcnn_head`;
            also selects whether detections are generated.

        Returns:
          A `(model_outputs, intermediate_outputs)` tuple. `model_outputs`
          holds the backbone/decoder features and RPN outputs, whatever
          `_run_frcnn_head` adds to it, and (inference only) 'cls_outputs',
          'box_outputs' plus either the 'detection_*'/'num_detections' keys
          or the 'decoded_*' keys. `intermediate_outputs` holds the matched
          ground truth and RoIs returned by the last cascade stage.
        """
        model_outputs = {}

        # Feature extraction.
        (backbone_features,
         decoder_features) = self._get_backbone_and_decoder_features(images)

        # Region proposal network.
        rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

        model_outputs.update({
            'backbone_features': backbone_features,
            'decoder_features': decoder_features,
            'rpn_boxes': rpn_boxes,
            'rpn_scores': rpn_scores
        })

        # Generate anchor boxes for this batch if not provided.
        if anchor_boxes is None:
            # The static shape is used here.
            # NOTE(review): height/width would be None if they are not
            # statically known -- confirm Anchor is never reached that way.
            _, image_height, image_width, _ = images.get_shape().as_list()
            anchor_boxes = anchor.Anchor(
                min_level=self._config_dict['min_level'],
                max_level=self._config_dict['max_level'],
                num_scales=self._config_dict['num_scales'],
                aspect_ratios=self._config_dict['aspect_ratios'],
                anchor_size=self._config_dict['anchor_size'],
                image_size=(image_height, image_width)).multilevel_boxes
            # Add a leading batch dimension and repeat the anchors once per
            # image, using the dynamic batch size.
            for l in anchor_boxes:
                anchor_boxes[l] = tf.tile(
                    tf.expand_dims(anchor_boxes[l], axis=0),
                    [tf.shape(images)[0], 1, 1, 1])

        # Generate RoIs.
        current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores,
                                             anchor_boxes, image_shape,
                                             training)

        next_rois = current_rois
        # Class outputs of every cascade stage, kept for optional ensembling.
        all_class_outputs = []
        for cascade_num in range(len(self.roi_sampler)):
            # In cascade RCNN we want the higher layers to have different regression
            # weights as the predicted deltas become smaller and smaller.
            regression_weights = self._cascade_layer_to_weights[cascade_num]
            current_rois = next_rois

            # `model_outputs` is passed in and rebound to the returned value,
            # so the head may add entries to it.
            (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
             matched_gt_classes, matched_gt_indices,
             current_rois) = self._run_frcnn_head(
                 features=decoder_features,
                 rois=current_rois,
                 gt_boxes=gt_boxes,
                 gt_classes=gt_classes,
                 training=training,
                 model_outputs=model_outputs,
                 cascade_num=cascade_num,
                 regression_weights=regression_weights)
            all_class_outputs.append(class_outputs)

            # Generate ROIs for the next cascade head if there is any.
            if cascade_num < len(self.roi_sampler) - 1:
                next_rois = box_ops.decode_boxes(tf.cast(
                    box_outputs, tf.float32),
                                                 current_rois,
                                                 weights=regression_weights)
                next_rois = box_ops.clip_boxes(
                    next_rois, tf.expand_dims(image_shape, axis=1))

        # From here on, `class_outputs`, `box_outputs`, `current_rois` and
        # `regression_weights` hold the values of the last cascade stage.
        if not training:
            if self._config_dict['cascade_class_ensemble']:
                # Average the class outputs over all cascade stages.
                class_outputs = tf.add_n(all_class_outputs) / len(
                    all_class_outputs)

            detections = self.detection_generator(
                box_outputs,
                class_outputs,
                current_rois,
                image_shape,
                regression_weights,
                bbox_per_class=(
                    not self._config_dict['class_agnostic_bbox_pred']))
            model_outputs.update({
                'cls_outputs': class_outputs,
                'box_outputs': box_outputs,
            })
            # The detection generator's own config decides which result keys
            # are copied into the outputs.
            if self.detection_generator.get_config()['apply_nms']:
                model_outputs.update({
                    'detection_boxes':
                    detections['detection_boxes'],
                    'detection_scores':
                    detections['detection_scores'],
                    'detection_classes':
                    detections['detection_classes'],
                    'num_detections':
                    detections['num_detections']
                })
            else:
                model_outputs.update({
                    'decoded_boxes':
                    detections['decoded_boxes'],
                    'decoded_box_scores':
                    detections['decoded_box_scores']
                })

        # Values from the last cascade stage, returned alongside the outputs.
        intermediate_outputs = {
            'matched_gt_boxes': matched_gt_boxes,
            'matched_gt_indices': matched_gt_indices,
            'matched_gt_classes': matched_gt_classes,
            'current_rois': current_rois,
        }
        return (model_outputs, intermediate_outputs)
    def test_forward(self, include_mask, training):
        """Runs a Mask R-CNN forward pass and checks the returned keys.

        In training mode the targets and raw head outputs must be present; in
        inference mode the detection results must be present. Mask keys are
        only expected when `include_mask` is set.
        """
        num_classes = 3
        num_scales = 3
        aspect_ratios = [1.0]
        levels = dict(min_level=3, max_level=4)
        height, width = 256, 256

        images = np.random.rand(2, height, width, 3)
        image_shape = np.array([[224, 100], [100, 224]])
        anchor_boxes = anchor.Anchor(
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=3,
            image_size=(height, width),
            **levels).multilevel_boxes

        backbone = resnet.ResNet(
            model_id=50,
            input_specs=tf.keras.layers.InputSpec(
                shape=[None, None, None, 3]))
        decoder = fpn.FPN(input_specs=backbone.output_specs, **levels)
        rpn_head = dense_prediction_heads.RPNHead(
            num_anchors_per_location=len(aspect_ratios) * num_scales,
            **levels)
        detection_head = instance_heads.DetectionHead(num_classes=num_classes)
        roi_generator_obj = roi_generator.MultilevelROIGenerator()
        roi_sampler_obj = roi_sampler.ROISampler()
        roi_aligner_obj = roi_aligner.MultilevelROIAligner()
        detection_generator_obj = detection_generator.DetectionGenerator()

        # The mask branch is only instantiated when requested.
        mask_head = mask_sampler_obj = mask_roi_aligner_obj = None
        if include_mask:
            mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                                upsample_factor=2)
            mask_sampler_obj = mask_sampler.MaskSampler(mask_target_size=28,
                                                        num_sampled_masks=1)
            mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
                crop_size=14)

        model = maskrcnn_model.MaskRCNNModel(
            backbone,
            decoder,
            rpn_head,
            detection_head,
            roi_generator_obj,
            roi_sampler_obj,
            roi_aligner_obj,
            detection_generator_obj,
            mask_head,
            mask_sampler_obj,
            mask_roi_aligner_obj)

        # Rows of -1 fill the unused ground-truth slots.
        unused_box = [-1, -1, -1, -1]
        gt_boxes = np.array(
            [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], unused_box],
             [[100, 100, 150, 150], unused_box, unused_box]],
            dtype=np.float32)
        gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
        gt_masks = np.ones((2, 3, 100, 100)) if include_mask else None

        results = model(images,
                        image_shape,
                        anchor_boxes,
                        gt_boxes,
                        gt_classes,
                        gt_masks,
                        training=training)

        # Collect the keys expected for this mode, then assert them in order.
        expected_keys = ['rpn_boxes', 'rpn_scores']
        if training:
            expected_keys += [
                'class_targets', 'box_targets', 'class_outputs', 'box_outputs'
            ]
            mask_key = 'mask_outputs'
        else:
            expected_keys += [
                'detection_boxes', 'detection_scores', 'detection_classes',
                'num_detections'
            ]
            mask_key = 'detection_masks'
        if include_mask:
            expected_keys.append(mask_key)

        for key in expected_keys:
            self.assertIn(key, results)
Ejemplo n.º 10
0
    def test_build_model(self, use_separable_conv, build_anchor_boxes,
                         is_training, has_att_heads):
        """Smoke test: builds a RetinaNet and runs a single forward pass.

        Covers optional pre-computed anchors and an optional 'depth'
        regression attribute head. No assertion is made on the outputs.
        """
        batch_size = 2
        side = 384
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        levels = dict(min_level=3, max_level=7)

        images = np.random.rand(batch_size, side, side, 3)
        image_shape = np.array([[side, side]] * batch_size)

        # Optionally pre-compute anchors and replicate them over the batch;
        # otherwise None is passed to the model.
        anchor_boxes = None
        if build_anchor_boxes:
            anchor_boxes = anchor.Anchor(
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size,
                image_size=(side, side),
                **levels).multilevel_boxes
            for level in anchor_boxes:
                with_batch_dim = tf.expand_dims(anchor_boxes[level], axis=0)
                anchor_boxes[level] = tf.tile(with_batch_dim,
                                              [batch_size, 1, 1, 1])

        attribute_heads = None
        if has_att_heads:
            attribute_heads = [
                {'name': 'depth', 'type': 'regression', 'size': 1}
            ]

        backbone = resnet.ResNet(model_id=50)
        decoder = fpn.FPN(input_specs=backbone.output_specs,
                          num_filters=256,
                          use_separable_conv=use_separable_conv,
                          **levels)
        head = dense_prediction_heads.RetinaNetHead(
            num_classes=3,
            attribute_heads=attribute_heads,
            num_anchors_per_location=num_scales * len(aspect_ratios),
            use_separable_conv=use_separable_conv,
            num_convs=4,
            num_filters=256,
            **levels)
        generator = detection_generator.MultilevelDetectionGenerator(
            max_num_detections=10)
        model = retinanet_model.RetinaNetModel(
            backbone=backbone,
            decoder=decoder,
            head=head,
            detection_generator=generator,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size,
            **levels)

        model(images, image_shape, anchor_boxes, training=is_training)
Ejemplo n.º 11
0
    def call(self,
             images: tf.Tensor,
             image_shape: Optional[tf.Tensor] = None,
             anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
             training: bool = None) -> Mapping[str, tf.Tensor]:
        """Forward pass of the RetinaNet model.

        Args:
          images: `Tensor`, the input batched images, whose shape is
            [batch, height, width, 3].
          image_shape: `Tensor`, the actual shape of the input images, whose
            shape is [batch, 2] where the last dimension is [height, width].
            This is the image shape excluding paddings; images in a batch may
            have been resized to different shapes before being padded to the
            fixed size.
          anchor_boxes: a dict of tensors with the multilevel anchors, keyed
            by level (`str`). Only used at inference; generated from the
            model config when None.
          training: `bool`, indicating whether it is in training mode.

        Returns:
          A dict of outputs. It always contains:
            - 'cls_outputs': the raw scores returned by the head.
            - 'box_outputs': the raw boxes returned by the head.
          When the head returns attributes, it also contains
          'attribute_outputs'. At inference it additionally contains either
          'detection_boxes', 'detection_scores', 'detection_classes' and
          'num_detections' (when the detection generator config has
          `apply_nms` set) or 'decoded_boxes' and 'decoded_box_scores', plus
          'detection_attributes' when attributes are present.
        """
        # Feature extraction; the decoder is optional.
        features = self.backbone(images)
        if self.decoder:
            features = self.decoder(features)

        # Dense prediction. `raw_attributes` can be empty.
        raw_scores, raw_boxes, raw_attributes = self.head(features)
        outputs = {
            'cls_outputs': raw_scores,
            'box_outputs': raw_boxes,
        }

        if training:
            # Training returns the raw head outputs only.
            if raw_attributes:
                outputs['attribute_outputs'] = raw_attributes
            return outputs

        # Generate anchor boxes for this batch if not provided.
        if anchor_boxes is None:
            _, height, width, _ = images.get_shape().as_list()
            config = self._config_dict
            anchor_boxes = anchor.Anchor(
                min_level=config['min_level'],
                max_level=config['max_level'],
                num_scales=config['num_scales'],
                aspect_ratios=config['aspect_ratios'],
                anchor_size=config['anchor_size'],
                image_size=(height, width)).multilevel_boxes
            # Replicate every level's anchors once per image in the batch.
            batch_size = tf.shape(images)[0]
            for level in anchor_boxes:
                with_batch_dim = tf.expand_dims(anchor_boxes[level], axis=0)
                anchor_boxes[level] = tf.tile(with_batch_dim,
                                              [batch_size, 1, 1, 1])

        # Post-processing.
        final_results = self.detection_generator(raw_boxes, raw_scores,
                                                 anchor_boxes, image_shape,
                                                 raw_attributes)

        # The generator's config decides which result keys are exposed.
        if self.detection_generator.get_config()['apply_nms']:
            result_keys = ('detection_boxes', 'detection_scores',
                           'detection_classes', 'num_detections')
        else:
            result_keys = ('decoded_boxes', 'decoded_box_scores')
        for key in result_keys:
            outputs[key] = final_results[key]

        if raw_attributes:
            outputs['attribute_outputs'] = raw_attributes
            outputs['detection_attributes'] = final_results[
                'detection_attributes']
        return outputs
Ejemplo n.º 12
0
    def call(self,
             images: tf.Tensor,
             image_shape: tf.Tensor,
             anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
             gt_boxes: tf.Tensor = None,
             gt_classes: tf.Tensor = None,
             gt_masks: tf.Tensor = None,
             training: bool = None) -> Mapping[str, tf.Tensor]:
        """Runs the RPN, the cascade of detection heads and the mask branch.

        Args:
          images: Batch of input images. When `anchor_boxes` is omitted, the
            static shape is unpacked as (batch, height, width, channels).
          image_shape: Per-image shape information, forwarded to the RoI
            generator, the box clipping step and the detection generator.
          anchor_boxes: Optional mapping of anchor box tensors. When `None`,
            anchors are built from the model config and tiled along a new
            leading batch axis.
          gt_boxes: Groundtruth boxes, forwarded to `_run_frcnn_head`.
          gt_classes: Groundtruth classes, forwarded to `_run_frcnn_head`.
          gt_masks: Groundtruth masks, consumed by the mask sampler when
            training with the mask branch enabled.
          training: Falsy values (including `None`) select the inference path.

        Returns:
          The outputs dict. It starts with 'rpn_boxes' and 'rpn_scores' and is
          then replaced by the dict returned from `_run_frcnn_head` at every
          cascade stage. Inference adds 'detection_boxes', 'detection_scores',
          'detection_classes' and 'num_detections'. With the mask branch
          enabled, training adds 'mask_class_targets', 'mask_targets' and
          'mask_outputs', while inference adds 'detection_masks'.
        """
        # Feature extraction.
        features = self.backbone(images)
        if self.decoder:
            features = self.decoder(features)

        # Region proposal network.
        rpn_scores, rpn_boxes = self.rpn_head(features)
        model_outputs = {
            'rpn_boxes': rpn_boxes,
            'rpn_scores': rpn_scores,
        }

        # Build anchors for this batch when the caller did not supply them.
        if anchor_boxes is None:
            _, height, width, _ = images.get_shape().as_list()
            config = self._config_dict
            anchor_boxes = anchor.Anchor(
                min_level=config['min_level'],
                max_level=config['max_level'],
                num_scales=config['num_scales'],
                aspect_ratios=config['aspect_ratios'],
                anchor_size=config['anchor_size'],
                image_size=(height, width)).multilevel_boxes
            # Values are replaced in place so the mapping type is unchanged.
            for level in anchor_boxes:
                with_batch_axis = tf.expand_dims(anchor_boxes[level], axis=0)
                anchor_boxes[level] = tf.tile(
                    with_batch_axis, [tf.shape(images)[0], 1, 1, 1])

        # Generate RoIs.
        current_rois, _ = self.roi_generator(
            rpn_boxes, rpn_scores, anchor_boxes, image_shape, training)

        next_rois = current_rois
        all_class_outputs = []
        num_cascades = len(self.roi_sampler)
        for cascade_num in range(num_cascades):
            # Each cascade stage uses its own regression weights because the
            # predicted deltas shrink from one stage to the next.
            regression_weights = self._cascade_layer_to_weights[cascade_num]
            current_rois = next_rois

            head_results = self._run_frcnn_head(
                features=features,
                rois=current_rois,
                gt_boxes=gt_boxes,
                gt_classes=gt_classes,
                training=training,
                model_outputs=model_outputs,
                layer_num=cascade_num,
                regression_weights=regression_weights)
            (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
             matched_gt_classes, matched_gt_indices,
             current_rois) = head_results
            all_class_outputs.append(class_outputs)

            # Every stage but the last feeds refined boxes to its successor.
            if cascade_num != num_cascades - 1:
                decoded_rois = box_ops.decode_boxes(
                    tf.cast(box_outputs, tf.float32),
                    current_rois,
                    weights=regression_weights)
                next_rois = box_ops.clip_boxes(
                    decoded_rois, tf.expand_dims(image_shape, axis=1))

        if not training:
            if self._config_dict['cascade_class_ensemble']:
                # Average the class predictions of all cascade stages.
                num_stages = len(all_class_outputs)
                class_outputs = tf.add_n(all_class_outputs) / num_stages

            per_class_boxes = not self._config_dict['class_agnostic_bbox_pred']
            detections = self.detection_generator(
                box_outputs,
                class_outputs,
                current_rois,
                image_shape,
                regression_weights,
                bbox_per_class=per_class_boxes)
            detection_keys = ('detection_boxes', 'detection_scores',
                              'detection_classes', 'num_detections')
            model_outputs.update(
                {key: detections[key] for key in detection_keys})

        if not self._include_mask:
            return model_outputs

        if training:
            current_rois, roi_classes, roi_masks = self.mask_sampler(
                current_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, gt_masks)
            roi_masks = tf.stop_gradient(roi_masks)
            model_outputs.update({
                'mask_class_targets': roi_classes,
                'mask_targets': roi_masks,
            })
        else:
            # At inference the masks are predicted for the final detections.
            current_rois = model_outputs['detection_boxes']
            roi_classes = model_outputs['detection_classes']

        # Mask RoI align followed by the mask head.
        mask_roi_features = self.mask_roi_aligner(features, current_rois)
        raw_masks = self.mask_head([mask_roi_features, roi_classes])

        if training:
            model_outputs['mask_outputs'] = raw_masks
        else:
            model_outputs['detection_masks'] = tf.math.sigmoid(raw_masks)
        return model_outputs
    def test_build_model(self,
                         use_separable_conv,
                         build_anchor_boxes,
                         shared_backbone,
                         shared_decoder,
                         is_training=True):
        """Builds a PanopticMaskRCNNModel and runs one forward pass.

        This is a smoke test: the model outputs are discarded here and are
        checked in `test_forward`. The arguments are presumably supplied by a
        parameterized-test decorator that is outside this view.

        Args:
          use_separable_conv: Forwarded to the FPN decoder.
          build_anchor_boxes: Whether to pass precomputed anchors to the
            model. Anchors are also built whenever `is_training` is false.
          shared_backbone: When true, no separate segmentation backbone is
            created.
          shared_decoder: When true (and the backbone is shared), no separate
            segmentation decoder is created.
          is_training: Value passed as `training` to the model call.
        """
        num_classes = 3
        min_level = 2
        max_level = 6
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        resnet_model_id = 50
        segmentation_resnet_model_id = 50
        aspp_dilation_rates = [6, 12, 18]
        aspp_decoder_level = 2
        fpn_decoder_level = 2
        num_anchors_per_location = num_scales * len(aspect_ratios)
        image_size = 128
        batch_size = 2
        images = tf.random.normal([batch_size, image_size, image_size, 3])
        per_image_info = [[image_size, image_size],
                          [image_size, image_size], [1, 1], [0, 0]]
        image_info = tf.convert_to_tensor([per_image_info] * batch_size)
        # A decoder can only be shared when the backbone is shared as well.
        shared_decoder = shared_decoder and shared_backbone
        if build_anchor_boxes or not is_training:
            # Use the same anchor_size that the model is configured with, so
            # the precomputed anchors cannot drift from the model config.
            anchor_boxes = anchor.Anchor(
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size,
                image_size=(image_size, image_size)).multilevel_boxes
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], axis=0),
                    [batch_size, 1, 1, 1])
        else:
            anchor_boxes = None

        backbone = resnet.ResNet(model_id=resnet_model_id)
        decoder = fpn.FPN(input_specs=backbone.output_specs,
                          min_level=min_level,
                          max_level=max_level,
                          use_separable_conv=use_separable_conv)
        rpn_head = dense_prediction_heads.RPNHead(
            min_level=min_level,
            max_level=max_level,
            num_anchors_per_location=num_anchors_per_location,
            num_convs=1)
        detection_head = instance_heads.DetectionHead(num_classes=num_classes)
        roi_generator_obj = roi_generator.MultilevelROIGenerator()
        roi_sampler_obj = roi_sampler.ROISampler()
        roi_aligner_obj = roi_aligner.MultilevelROIAligner()
        detection_generator_obj = detection_generator.DetectionGenerator()
        panoptic_segmentation_generator_obj = (
            panoptic_segmentation_generator.PanopticSegmentationGenerator(
                output_size=[image_size, image_size],
                max_num_detections=100,
                stuff_classes_offset=90))
        mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                            upsample_factor=2)
        mask_sampler_obj = mask_sampler.MaskSampler(mask_target_size=28,
                                                    num_sampled_masks=1)
        mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)

        if shared_backbone:
            segmentation_backbone = None
        else:
            segmentation_backbone = resnet.ResNet(
                model_id=segmentation_resnet_model_id)
        if shared_decoder:
            feature_fusion = 'panoptic_fpn_fusion'
            level = fpn_decoder_level
            segmentation_decoder = None
        else:
            feature_fusion = 'deeplabv3plus'
            level = aspp_decoder_level
            segmentation_decoder = aspp.ASPP(
                level=level, dilation_rates=aspp_dilation_rates)
        segmentation_head = segmentation_heads.SegmentationHead(
            num_classes=2,  # stuff and common class for things,
            level=level,
            feature_fusion=feature_fusion,
            decoder_min_level=min_level,
            decoder_max_level=max_level,
            num_convs=2)

        model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
            backbone,
            decoder,
            rpn_head,
            detection_head,
            roi_generator_obj,
            roi_sampler_obj,
            roi_aligner_obj,
            detection_generator_obj,
            panoptic_segmentation_generator_obj,
            mask_head,
            mask_sampler_obj,
            mask_roi_aligner_obj,
            segmentation_backbone=segmentation_backbone,
            segmentation_decoder=segmentation_decoder,
            segmentation_head=segmentation_head,
            min_level=min_level,
            max_level=max_level,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size)

        gt_boxes = tf.convert_to_tensor(
            [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
             [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
            dtype=tf.float32)
        gt_classes = tf.convert_to_tensor([[2, 1, -1], [1, -1, -1]],
                                          dtype=tf.int32)
        gt_masks = tf.ones((2, 3, 100, 100))

        # Results will be checked in test_forward.
        _ = model(images,
                  image_info,
                  anchor_boxes,
                  gt_boxes,
                  gt_classes,
                  gt_masks,
                  training=is_training)
    def test_forward(self, strategy, training, shared_backbone, shared_decoder,
                     generate_panoptic_masks):
        """Runs a forward pass and checks the output keys and shapes.

        The arguments are presumably supplied by a parameterized-test
        decorator that is outside this view.

        Args:
          strategy: Object whose `scope()` context wraps model construction
            and the forward pass.
          training: Value passed as `training` to the model call; selects
            which output keys are expected.
          shared_backbone: When true, no separate segmentation backbone is
            created.
          shared_decoder: When true (and the backbone is shared), no separate
            segmentation decoder is created.
          generate_panoptic_masks: Whether a panoptic segmentation generator
            is attached, and hence whether 'panoptic_outputs' is expected at
            inference.
        """
        num_classes = 3
        min_level = 2
        max_level = 6
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        segmentation_resnet_model_id = 101
        aspp_dilation_rates = [6, 12, 18]
        aspp_decoder_level = 2
        fpn_decoder_level = 2

        class_agnostic_bbox_pred = False
        cascade_class_ensemble = False

        image_size = (256, 256)
        images = tf.random.normal([2, image_size[0], image_size[1], 3])
        per_image_info = [[224, 100], [224, 100], [1, 1], [0, 0]]
        image_info = tf.convert_to_tensor([per_image_info, per_image_info])
        # A decoder can only be shared when the backbone is shared as well.
        shared_decoder = shared_decoder and shared_backbone
        with strategy.scope():

            anchor_boxes = anchor.Anchor(
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size,
                image_size=image_size).multilevel_boxes

            num_anchors_per_location = len(aspect_ratios) * num_scales

            input_specs = tf.keras.layers.InputSpec(
                shape=[None, None, None, 3])
            backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
            decoder = fpn.FPN(min_level=min_level,
                              max_level=max_level,
                              input_specs=backbone.output_specs)
            rpn_head = dense_prediction_heads.RPNHead(
                min_level=min_level,
                max_level=max_level,
                num_anchors_per_location=num_anchors_per_location)
            detection_head = instance_heads.DetectionHead(
                num_classes=num_classes,
                class_agnostic_bbox_pred=class_agnostic_bbox_pred)
            roi_generator_obj = roi_generator.MultilevelROIGenerator()

            roi_sampler_obj = roi_sampler.ROISampler()
            roi_aligner_obj = roi_aligner.MultilevelROIAligner()
            detection_generator_obj = detection_generator.DetectionGenerator()

            if generate_panoptic_masks:
                panoptic_segmentation_generator_obj = (
                    panoptic_segmentation_generator
                    .PanopticSegmentationGenerator(
                        output_size=list(image_size),
                        max_num_detections=100,
                        stuff_classes_offset=90))
            else:
                panoptic_segmentation_generator_obj = None

            mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                                upsample_factor=2)
            mask_sampler_obj = mask_sampler.MaskSampler(mask_target_size=28,
                                                        num_sampled_masks=1)
            mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
                crop_size=14)

            if shared_backbone:
                segmentation_backbone = None
            else:
                segmentation_backbone = resnet.ResNet(
                    model_id=segmentation_resnet_model_id)
            if shared_decoder:
                feature_fusion = 'panoptic_fpn_fusion'
                level = fpn_decoder_level
                segmentation_decoder = None
            else:
                feature_fusion = 'deeplabv3plus'
                level = aspp_decoder_level
                segmentation_decoder = aspp.ASPP(
                    level=level, dilation_rates=aspp_dilation_rates)
            segmentation_head = segmentation_heads.SegmentationHead(
                num_classes=2,  # stuff and common class for things,
                level=level,
                feature_fusion=feature_fusion,
                decoder_min_level=min_level,
                decoder_max_level=max_level,
                num_convs=2)

            model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
                backbone,
                decoder,
                rpn_head,
                detection_head,
                roi_generator_obj,
                roi_sampler_obj,
                roi_aligner_obj,
                detection_generator_obj,
                panoptic_segmentation_generator_obj,
                mask_head,
                mask_sampler_obj,
                mask_roi_aligner_obj,
                segmentation_backbone=segmentation_backbone,
                segmentation_decoder=segmentation_decoder,
                segmentation_head=segmentation_head,
                class_agnostic_bbox_pred=class_agnostic_bbox_pred,
                cascade_class_ensemble=cascade_class_ensemble,
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size)

            gt_boxes = tf.convert_to_tensor(
                [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
                 [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
                dtype=tf.float32)
            gt_classes = tf.convert_to_tensor([[2, 1, -1], [1, -1, -1]],
                                              dtype=tf.int32)
            gt_masks = tf.ones((2, 3, 100, 100))

            results = model(images,
                            image_info,
                            anchor_boxes,
                            gt_boxes,
                            gt_classes,
                            gt_masks,
                            training=training)

        for key in ('rpn_boxes', 'rpn_scores'):
            self.assertIn(key, results)

        if training:
            for key in ('class_targets', 'box_targets', 'class_outputs',
                        'box_outputs', 'mask_outputs'):
                self.assertIn(key, results)
        else:
            for key in ('detection_boxes', 'detection_scores',
                        'detection_classes', 'num_detections',
                        'detection_masks', 'segmentation_outputs'):
                self.assertIn(key, results)

            # Segmentation logits are emitted at stride 2**level, 2 classes.
            stride = 2**level
            self.assertAllEqual(
                [2, image_size[0] // stride, image_size[1] // stride, 2],
                results['segmentation_outputs'].numpy().shape)

            if generate_panoptic_masks:
                self.assertIn('panoptic_outputs', results)
                panoptic_outputs = results['panoptic_outputs']
                for mask_name in ('category_mask', 'instance_mask'):
                    self.assertIn(mask_name, panoptic_outputs)
                self.assertAllEqual(
                    [2, image_size[0], image_size[1]],
                    panoptic_outputs['category_mask'].numpy().shape)
                self.assertAllEqual(
                    [2, image_size[0], image_size[1]],
                    panoptic_outputs['instance_mask'].numpy().shape)
            else:
                self.assertNotIn('panoptic_outputs', results)
Ejemplo n.º 15
0
    def test_build_model(self,
                         use_separable_conv,
                         build_anchor_boxes,
                         shared_backbone,
                         shared_decoder,
                         is_training=True):
        """Builds a PanopticMaskRCNNModel and runs one forward pass.

        This is a smoke test: the model outputs are discarded here and are
        checked in `test_forward`. The arguments are presumably supplied by a
        parameterized-test decorator that is outside this view.

        Args:
          use_separable_conv: Forwarded to the FPN decoder.
          build_anchor_boxes: Whether to pass precomputed anchors to the
            model instead of `None`.
          shared_backbone: When true, no separate segmentation backbone is
            created.
          shared_decoder: When true (and the backbone is shared), no separate
            segmentation decoder is created.
          is_training: Value passed as `training` to the model call.
        """
        num_classes = 3
        min_level = 3
        max_level = 7
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        resnet_model_id = 50
        segmentation_resnet_model_id = 50
        segmentation_output_stride = 16
        aspp_dilation_rates = [6, 12, 18]
        # `np.math` was an alias of the stdlib `math` module that NumPy 2.0
        # removed; `np.log2` gives the same level for the stride.
        aspp_decoder_level = int(np.log2(segmentation_output_stride))
        fpn_decoder_level = 3
        num_anchors_per_location = num_scales * len(aspect_ratios)
        image_size = 128
        batch_size = 2
        images = np.random.rand(batch_size, image_size, image_size, 3)
        image_shape = np.array([[image_size, image_size]] * batch_size)
        # A decoder can only be shared when the backbone is shared as well.
        shared_decoder = shared_decoder and shared_backbone
        if build_anchor_boxes:
            # Use the same anchor_size that the model is configured with, so
            # the precomputed anchors cannot drift from the model config.
            anchor_boxes = anchor.Anchor(
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size,
                image_size=(image_size, image_size)).multilevel_boxes
            for level_key in anchor_boxes:
                anchor_boxes[level_key] = tf.tile(
                    tf.expand_dims(anchor_boxes[level_key], axis=0),
                    [batch_size, 1, 1, 1])
        else:
            anchor_boxes = None

        backbone = resnet.ResNet(model_id=resnet_model_id)
        decoder = fpn.FPN(input_specs=backbone.output_specs,
                          min_level=min_level,
                          max_level=max_level,
                          use_separable_conv=use_separable_conv)
        rpn_head = dense_prediction_heads.RPNHead(
            min_level=min_level,
            max_level=max_level,
            num_anchors_per_location=num_anchors_per_location,
            num_convs=1)
        detection_head = instance_heads.DetectionHead(num_classes=num_classes)
        roi_generator_obj = roi_generator.MultilevelROIGenerator()
        roi_sampler_obj = roi_sampler.ROISampler()
        roi_aligner_obj = roi_aligner.MultilevelROIAligner()
        detection_generator_obj = detection_generator.DetectionGenerator()
        mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                            upsample_factor=2)
        mask_sampler_obj = mask_sampler.MaskSampler(mask_target_size=28,
                                                    num_sampled_masks=1)
        mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)

        if shared_backbone:
            segmentation_backbone = None
        else:
            segmentation_backbone = resnet.ResNet(
                model_id=segmentation_resnet_model_id)
        if shared_decoder:
            level = fpn_decoder_level
            segmentation_decoder = None
        else:
            level = aspp_decoder_level
            segmentation_decoder = aspp.ASPP(
                level=level, dilation_rates=aspp_dilation_rates)
        segmentation_head = segmentation_heads.SegmentationHead(
            num_classes=2,  # stuff and common class for things,
            level=level,
            num_convs=2)

        model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
            backbone,
            decoder,
            rpn_head,
            detection_head,
            roi_generator_obj,
            roi_sampler_obj,
            roi_aligner_obj,
            detection_generator_obj,
            mask_head,
            mask_sampler_obj,
            mask_roi_aligner_obj,
            segmentation_backbone=segmentation_backbone,
            segmentation_decoder=segmentation_decoder,
            segmentation_head=segmentation_head,
            min_level=min_level,
            max_level=max_level,
            num_scales=num_scales,
            aspect_ratios=aspect_ratios,
            anchor_size=anchor_size)

        gt_boxes = np.array(
            [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
             [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
            dtype=np.float32)
        gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
        gt_masks = np.ones((2, 3, 100, 100))

        # Results will be checked in test_forward.
        _ = model(images,
                  image_shape,
                  anchor_boxes,
                  gt_boxes,
                  gt_classes,
                  gt_masks,
                  training=is_training)
Ejemplo n.º 16
0
    def test_forward(self, strategy, training, shared_backbone,
                     shared_decoder):
        """Runs a forward pass and checks the output keys and shapes.

        The arguments are presumably supplied by a parameterized-test
        decorator that is outside this view.

        Args:
          strategy: Object whose `scope()` context wraps model construction
            and the forward pass.
          training: Value passed as `training` to the model call; selects
            which output keys are expected.
          shared_backbone: When true, no separate segmentation backbone is
            created.
          shared_decoder: When true (and the backbone is shared), no separate
            segmentation decoder is created.
        """
        num_classes = 3
        min_level = 3
        max_level = 4
        num_scales = 3
        aspect_ratios = [1.0]
        anchor_size = 3
        segmentation_resnet_model_id = 101
        segmentation_output_stride = 16
        aspp_dilation_rates = [6, 12, 18]
        # `np.math` was an alias of the stdlib `math` module that NumPy 2.0
        # removed; `np.log2` gives the same level for the stride.
        aspp_decoder_level = int(np.log2(segmentation_output_stride))
        fpn_decoder_level = 3

        class_agnostic_bbox_pred = False
        cascade_class_ensemble = False

        image_size = (256, 256)
        images = np.random.rand(2, image_size[0], image_size[1], 3)
        image_shape = np.array([[224, 100], [100, 224]])
        # A decoder can only be shared when the backbone is shared as well.
        shared_decoder = shared_decoder and shared_backbone
        with strategy.scope():

            anchor_boxes = anchor.Anchor(
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size,
                image_size=image_size).multilevel_boxes

            num_anchors_per_location = len(aspect_ratios) * num_scales

            input_specs = tf.keras.layers.InputSpec(
                shape=[None, None, None, 3])
            backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
            decoder = fpn.FPN(min_level=min_level,
                              max_level=max_level,
                              input_specs=backbone.output_specs)
            rpn_head = dense_prediction_heads.RPNHead(
                min_level=min_level,
                max_level=max_level,
                num_anchors_per_location=num_anchors_per_location)
            detection_head = instance_heads.DetectionHead(
                num_classes=num_classes,
                class_agnostic_bbox_pred=class_agnostic_bbox_pred)
            roi_generator_obj = roi_generator.MultilevelROIGenerator()

            roi_sampler_obj = roi_sampler.ROISampler()
            roi_aligner_obj = roi_aligner.MultilevelROIAligner()
            detection_generator_obj = detection_generator.DetectionGenerator()
            mask_head = instance_heads.MaskHead(num_classes=num_classes,
                                                upsample_factor=2)
            mask_sampler_obj = mask_sampler.MaskSampler(mask_target_size=28,
                                                        num_sampled_masks=1)
            mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(
                crop_size=14)

            if shared_backbone:
                segmentation_backbone = None
            else:
                segmentation_backbone = resnet.ResNet(
                    model_id=segmentation_resnet_model_id)
            if shared_decoder:
                level = fpn_decoder_level
                segmentation_decoder = None
            else:
                level = aspp_decoder_level
                segmentation_decoder = aspp.ASPP(
                    level=level, dilation_rates=aspp_dilation_rates)
            segmentation_head = segmentation_heads.SegmentationHead(
                num_classes=2,  # stuff and common class for things,
                level=level,
                num_convs=2)

            model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
                backbone,
                decoder,
                rpn_head,
                detection_head,
                roi_generator_obj,
                roi_sampler_obj,
                roi_aligner_obj,
                detection_generator_obj,
                mask_head,
                mask_sampler_obj,
                mask_roi_aligner_obj,
                segmentation_backbone=segmentation_backbone,
                segmentation_decoder=segmentation_decoder,
                segmentation_head=segmentation_head,
                class_agnostic_bbox_pred=class_agnostic_bbox_pred,
                cascade_class_ensemble=cascade_class_ensemble,
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=anchor_size)

            gt_boxes = np.array(
                [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
                 [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
                dtype=np.float32)
            gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
            gt_masks = np.ones((2, 3, 100, 100))

            results = model(images,
                            image_shape,
                            anchor_boxes,
                            gt_boxes,
                            gt_classes,
                            gt_masks,
                            training=training)

        for key in ('rpn_boxes', 'rpn_scores'):
            self.assertIn(key, results)

        if training:
            for key in ('class_targets', 'box_targets', 'class_outputs',
                        'box_outputs', 'mask_outputs'):
                self.assertIn(key, results)
        else:
            for key in ('detection_boxes', 'detection_scores',
                        'detection_classes', 'num_detections',
                        'detection_masks', 'segmentation_outputs'):
                self.assertIn(key, results)
            # Segmentation logits are emitted at stride 2**level, 2 classes.
            self.assertAllEqual([
                2, image_size[0] // (2**level), image_size[1] // (2**level), 2
            ], results['segmentation_outputs'].numpy().shape)