def testAnchorGeneration(self, min_level, max_level, num_scales, aspect_ratios,
                         anchor_size, expected_boxes):
  image_size = [64, 64]
  anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                          anchor_size, image_size)
  boxes = anchors.boxes.numpy()
  self.assertEqual(expected_boxes, boxes.tolist())
def testAnchorGenerationWithImageSizeAsTensor(self, min_level, max_level,
                                              num_scales, aspect_ratios,
                                              anchor_size, expected_boxes):
  image_size = tf.constant([64, 64], tf.int32)
  anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                          anchor_size, image_size)
  boxes = anchors.boxes.numpy()
  self.assertEqual(expected_boxes, boxes.tolist())
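# Illustrative sketch (not part of the test suite above): how the
# `anchor.Anchor` API exercised by these tests might be used directly. The
# particular levels, scales and the example shapes in the comments are
# assumptions, not values asserted by the tests.
example_anchors = anchor.Anchor(
    min_level=3,
    max_level=4,
    num_scales=1,
    aspect_ratios=[1.0],
    anchor_size=3,
    image_size=[64, 64])
# `boxes` flattens all levels into a single [num_anchors, 4] tensor, while
# `multilevel_boxes` keeps a dict keyed by pyramid level, e.g.
# {'3': [8, 8, 4], '4': [4, 4, 4]} for a 64x64 image with one anchor per cell.
flat_boxes = example_anchors.boxes
per_level_boxes = example_anchors.multilevel_boxes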
def construct_model_and_anchors(image_size, use_gt_boxes_for_masks):
  num_classes = 3
  min_level = 3
  max_level = 4
  num_scales = 3
  aspect_ratios = [1.0]

  anchor_boxes = anchor.Anchor(
      min_level=min_level,
      max_level=max_level,
      num_scales=num_scales,
      aspect_ratios=aspect_ratios,
      anchor_size=3,
      image_size=image_size).multilevel_boxes
  num_anchors_per_location = len(aspect_ratios) * num_scales

  input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
  backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
  decoder = fpn.FPN(
      min_level=min_level,
      max_level=max_level,
      input_specs=backbone.output_specs)
  rpn_head = dense_prediction_heads.RPNHead(
      min_level=min_level,
      max_level=max_level,
      num_anchors_per_location=num_anchors_per_location)
  detection_head = instance_heads.DetectionHead(num_classes=num_classes)
  roi_generator_obj = roi_generator.MultilevelROIGenerator()
  roi_sampler_obj = roi_sampler.ROISampler()
  roi_aligner_obj = roi_aligner.MultilevelROIAligner()
  detection_generator_obj = detection_generator.DetectionGenerator()
  mask_head = deep_instance_heads.DeepMaskHead(
      num_classes=num_classes, upsample_factor=2)
  mask_sampler_obj = mask_sampler.MaskSampler(
      mask_target_size=28, num_sampled_masks=1)
  mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)

  model = maskrcnn_model.DeepMaskRCNNModel(
      backbone,
      decoder,
      rpn_head,
      detection_head,
      roi_generator_obj,
      roi_sampler_obj,
      roi_aligner_obj,
      detection_generator_obj,
      mask_head,
      mask_sampler_obj,
      mask_roi_aligner_obj,
      use_gt_boxes_for_masks=use_gt_boxes_for_masks)
  return model, anchor_boxes
def testEquivalentResult(self, min_level, max_level, aspect_ratios, num_scales,
                         anchor_size, image_size):
  anchor_gen = anchor.build_anchor_generator(
      min_level=min_level,
      max_level=max_level,
      num_scales=num_scales,
      aspect_ratios=aspect_ratios,
      anchor_size=anchor_size)
  anchors = anchor_gen(image_size)
  expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
                                      aspect_ratios, anchor_size, image_size)
  expected_anchors = expected_anchor_gen.multilevel_boxes
  for k in expected_anchors.keys():
    self.assertAllClose(expected_anchors[k], anchors[k])
def test_build_model(self, use_separable_conv, build_anchor_boxes, is_training,
                     has_att_heads):
  num_classes = 3
  min_level = 3
  max_level = 7
  num_scales = 3
  aspect_ratios = [1.0]
  anchor_size = 3
  fpn_num_filters = 256
  head_num_convs = 4
  head_num_filters = 256
  num_anchors_per_location = num_scales * len(aspect_ratios)
  image_size = 384
  images = np.random.rand(2, image_size, image_size, 3)
  image_shape = np.array([[image_size, image_size], [image_size, image_size]])

  if build_anchor_boxes:
    anchor_boxes = anchor.Anchor(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size,
        image_size=(image_size, image_size)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
  else:
    anchor_boxes = None

  if has_att_heads:
    attribute_heads = [dict(name='depth', type='regression', size=1)]
  else:
    attribute_heads = None

  backbone = resnet.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs,
      min_level=min_level,
      max_level=max_level,
      num_filters=fpn_num_filters,
      use_separable_conv=use_separable_conv)
  head = dense_prediction_heads.RetinaNetHead(
      min_level=min_level,
      max_level=max_level,
      num_classes=num_classes,
      attribute_heads=attribute_heads,
      num_anchors_per_location=num_anchors_per_location,
      use_separable_conv=use_separable_conv,
      num_convs=head_num_convs,
      num_filters=head_num_filters)
  generator = detection_generator.MultilevelDetectionGenerator(
      max_num_detections=10)
  model = retinanet_model.RetinaNetModel(
      backbone=backbone,
      decoder=decoder,
      head=head,
      detection_generator=generator,
      min_level=min_level,
      max_level=max_level,
      num_scales=num_scales,
      aspect_ratios=aspect_ratios,
      anchor_size=anchor_size)

  _ = model(images, image_shape, anchor_boxes, training=is_training)
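# The build-model tests in this file all repeat the same pattern of tiling
# per-image multilevel anchors to the batch dimension. A small helper like the
# sketch below could factor that out; the name `tile_anchors_to_batch` is
# hypothetical and not part of the library.
def tile_anchors_to_batch(multilevel_boxes, batch_size):
  """Expands per-image anchors to [batch, height_l, width_l, 4 * anchors]."""
  return {
      level: tf.tile(tf.expand_dims(boxes, axis=0), [batch_size, 1, 1, 1])
      for level, boxes in multilevel_boxes.items()
  }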
def test_build_model(self, use_separable_conv, build_anchor_boxes,
                     shared_backbone, shared_decoder, is_training=True):
  num_classes = 3
  min_level = 2
  max_level = 6
  num_scales = 3
  aspect_ratios = [1.0]
  anchor_size = 3
  resnet_model_id = 50
  segmentation_resnet_model_id = 50
  aspp_dilation_rates = [6, 12, 18]
  aspp_decoder_level = 2
  fpn_decoder_level = 2
  num_anchors_per_location = num_scales * len(aspect_ratios)
  image_size = 128
  images = tf.random.normal([2, image_size, image_size, 3])
  image_info = tf.convert_to_tensor(
      [[[image_size, image_size], [image_size, image_size], [1, 1], [0, 0]],
       [[image_size, image_size], [image_size, image_size], [1, 1], [0, 0]]])
  shared_decoder = shared_decoder and shared_backbone

  if build_anchor_boxes or not is_training:
    anchor_boxes = anchor.Anchor(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3,
        image_size=(image_size, image_size)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
  else:
    anchor_boxes = None

  backbone = resnet.ResNet(model_id=resnet_model_id)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs,
      min_level=min_level,
      max_level=max_level,
      use_separable_conv=use_separable_conv)
  rpn_head = dense_prediction_heads.RPNHead(
      min_level=min_level,
      max_level=max_level,
      num_anchors_per_location=num_anchors_per_location,
      num_convs=1)
  detection_head = instance_heads.DetectionHead(num_classes=num_classes)
  roi_generator_obj = roi_generator.MultilevelROIGenerator()
  roi_sampler_obj = roi_sampler.ROISampler()
  roi_aligner_obj = roi_aligner.MultilevelROIAligner()
  detection_generator_obj = detection_generator.DetectionGenerator()
  panoptic_segmentation_generator_obj = (
      panoptic_segmentation_generator.PanopticSegmentationGenerator(
          output_size=[image_size, image_size],
          max_num_detections=100,
          stuff_classes_offset=90))
  mask_head = instance_heads.MaskHead(
      num_classes=num_classes, upsample_factor=2)
  mask_sampler_obj = mask_sampler.MaskSampler(
      mask_target_size=28, num_sampled_masks=1)
  mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)

  if shared_backbone:
    segmentation_backbone = None
  else:
    segmentation_backbone = resnet.ResNet(
        model_id=segmentation_resnet_model_id)
  if not shared_decoder:
    feature_fusion = 'deeplabv3plus'
    level = aspp_decoder_level
    segmentation_decoder = aspp.ASPP(
        level=level, dilation_rates=aspp_dilation_rates)
  else:
    feature_fusion = 'panoptic_fpn_fusion'
    level = fpn_decoder_level
    segmentation_decoder = None
  segmentation_head = segmentation_heads.SegmentationHead(
      num_classes=2,  # Stuff classes and a common class for things.
      level=level,
      feature_fusion=feature_fusion,
      decoder_min_level=min_level,
      decoder_max_level=max_level,
      num_convs=2)

  model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
      backbone,
      decoder,
      rpn_head,
      detection_head,
      roi_generator_obj,
      roi_sampler_obj,
      roi_aligner_obj,
      detection_generator_obj,
      panoptic_segmentation_generator_obj,
      mask_head,
      mask_sampler_obj,
      mask_roi_aligner_obj,
      segmentation_backbone=segmentation_backbone,
      segmentation_decoder=segmentation_decoder,
      segmentation_head=segmentation_head,
      min_level=min_level,
      max_level=max_level,
      num_scales=num_scales,
      aspect_ratios=aspect_ratios,
      anchor_size=anchor_size)

  gt_boxes = tf.convert_to_tensor(
      [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
       [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
      dtype=tf.float32)
  gt_classes = tf.convert_to_tensor([[2, 1, -1], [1, -1, -1]], dtype=tf.int32)
  gt_masks = tf.ones((2, 3, 100, 100))

  # Results will be checked in test_forward.
  _ = model(
      images,
      image_info,
      anchor_boxes,
      gt_boxes,
      gt_classes,
      gt_masks,
      training=is_training)
def test_forward(self, strategy, training, shared_backbone, shared_decoder,
                 generate_panoptic_masks):
  num_classes = 3
  min_level = 2
  max_level = 6
  num_scales = 3
  aspect_ratios = [1.0]
  anchor_size = 3
  segmentation_resnet_model_id = 101
  aspp_dilation_rates = [6, 12, 18]
  aspp_decoder_level = 2
  fpn_decoder_level = 2
  class_agnostic_bbox_pred = False
  cascade_class_ensemble = False
  image_size = (256, 256)
  images = tf.random.normal([2, image_size[0], image_size[1], 3])
  image_info = tf.convert_to_tensor([[[224, 100], [224, 100], [1, 1], [0, 0]],
                                     [[224, 100], [224, 100], [1, 1], [0, 0]]])
  shared_decoder = shared_decoder and shared_backbone

  with strategy.scope():
    anchor_boxes = anchor.Anchor(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size,
        image_size=image_size).multilevel_boxes
    num_anchors_per_location = len(aspect_ratios) * num_scales

    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=min_level,
        max_level=max_level,
        input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=min_level,
        max_level=max_level,
        num_anchors_per_location=num_anchors_per_location)
    detection_head = instance_heads.DetectionHead(
        num_classes=num_classes,
        class_agnostic_bbox_pred=class_agnostic_bbox_pred)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()

    roi_sampler_cascade = []
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_sampler_cascade.append(roi_sampler_obj)

    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if generate_panoptic_masks:
      panoptic_segmentation_generator_obj = (
          panoptic_segmentation_generator.PanopticSegmentationGenerator(
              output_size=list(image_size),
              max_num_detections=100,
              stuff_classes_offset=90))
    else:
      panoptic_segmentation_generator_obj = None

    mask_head = instance_heads.MaskHead(
        num_classes=num_classes, upsample_factor=2)
    mask_sampler_obj = mask_sampler.MaskSampler(
        mask_target_size=28, num_sampled_masks=1)
    mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)

    if shared_backbone:
      segmentation_backbone = None
    else:
      segmentation_backbone = resnet.ResNet(
          model_id=segmentation_resnet_model_id)
    if not shared_decoder:
      feature_fusion = 'deeplabv3plus'
      level = aspp_decoder_level
      segmentation_decoder = aspp.ASPP(
          level=level, dilation_rates=aspp_dilation_rates)
    else:
      feature_fusion = 'panoptic_fpn_fusion'
      level = fpn_decoder_level
      segmentation_decoder = None
    segmentation_head = segmentation_heads.SegmentationHead(
        num_classes=2,  # Stuff classes and a common class for things.
        level=level,
        feature_fusion=feature_fusion,
        decoder_min_level=min_level,
        decoder_max_level=max_level,
        num_convs=2)

    model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        panoptic_segmentation_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        segmentation_backbone=segmentation_backbone,
        segmentation_decoder=segmentation_decoder,
        segmentation_head=segmentation_head,
        class_agnostic_bbox_pred=class_agnostic_bbox_pred,
        cascade_class_ensemble=cascade_class_ensemble,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    gt_boxes = tf.convert_to_tensor(
        [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
         [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
        dtype=tf.float32)
    gt_classes = tf.convert_to_tensor([[2, 1, -1], [1, -1, -1]],
                                      dtype=tf.int32)
    gt_masks = tf.ones((2, 3, 100, 100))

    results = model(
        images,
        image_info,
        anchor_boxes,
        gt_boxes,
        gt_classes,
        gt_masks,
        training=training)

  self.assertIn('rpn_boxes', results)
  self.assertIn('rpn_scores', results)
  if training:
    self.assertIn('class_targets', results)
    self.assertIn('box_targets', results)
    self.assertIn('class_outputs', results)
    self.assertIn('box_outputs', results)
    self.assertIn('mask_outputs', results)
  else:
    self.assertIn('detection_boxes', results)
    self.assertIn('detection_scores', results)
    self.assertIn('detection_classes', results)
    self.assertIn('num_detections', results)
    self.assertIn('detection_masks', results)
    self.assertIn('segmentation_outputs', results)
    self.assertAllEqual(
        [2, image_size[0] // (2**level), image_size[1] // (2**level), 2],
        results['segmentation_outputs'].numpy().shape)

    if generate_panoptic_masks:
      self.assertIn('panoptic_outputs', results)
      self.assertIn('category_mask', results['panoptic_outputs'])
      self.assertIn('instance_mask', results['panoptic_outputs'])
      self.assertAllEqual(
          [2, image_size[0], image_size[1]],
          results['panoptic_outputs']['category_mask'].numpy().shape)
      self.assertAllEqual(
          [2, image_size[0], image_size[1]],
          results['panoptic_outputs']['instance_mask'].numpy().shape)
    else:
      self.assertNotIn('panoptic_outputs', results)
def test_build_model(self, include_mask, use_separable_conv,
                     build_anchor_boxes, is_training):
  num_classes = 3
  min_level = 3
  max_level = 7
  num_scales = 3
  aspect_ratios = [1.0]
  anchor_size = 3
  resnet_model_id = 50
  num_anchors_per_location = num_scales * len(aspect_ratios)
  image_size = 384
  images = np.random.rand(2, image_size, image_size, 3)
  image_shape = np.array([[image_size, image_size], [image_size, image_size]])

  if build_anchor_boxes:
    anchor_boxes = anchor.Anchor(
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3,
        image_size=(image_size, image_size)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
  else:
    anchor_boxes = None

  backbone = resnet.ResNet(model_id=resnet_model_id)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs,
      min_level=min_level,
      max_level=max_level,
      use_separable_conv=use_separable_conv)
  rpn_head = dense_prediction_heads.RPNHead(
      min_level=min_level,
      max_level=max_level,
      num_anchors_per_location=num_anchors_per_location,
      num_convs=1)
  detection_head = instance_heads.DetectionHead(num_classes=num_classes)
  roi_generator_obj = roi_generator.MultilevelROIGenerator()
  roi_sampler_obj = roi_sampler.ROISampler()
  roi_aligner_obj = roi_aligner.MultilevelROIAligner()
  detection_generator_obj = detection_generator.DetectionGenerator()
  if include_mask:
    mask_head = instance_heads.MaskHead(
        num_classes=num_classes, upsample_factor=2)
    mask_sampler_obj = mask_sampler.MaskSampler(
        mask_target_size=28, num_sampled_masks=1)
    mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
  else:
    mask_head = None
    mask_sampler_obj = None
    mask_roi_aligner_obj = None
  model = maskrcnn_model.MaskRCNNModel(
      backbone,
      decoder,
      rpn_head,
      detection_head,
      roi_generator_obj,
      roi_sampler_obj,
      roi_aligner_obj,
      detection_generator_obj,
      mask_head,
      mask_sampler_obj,
      mask_roi_aligner_obj,
      min_level=min_level,
      max_level=max_level,
      num_scales=num_scales,
      aspect_ratios=aspect_ratios,
      anchor_size=anchor_size)

  gt_boxes = np.array(
      [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
       [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
      dtype=np.float32)
  gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
  if include_mask:
    gt_masks = np.ones((2, 3, 100, 100))
  else:
    gt_masks = None

  # Results will be checked in test_forward.
  _ = model(
      images,
      image_shape,
      anchor_boxes,
      gt_boxes,
      gt_classes,
      gt_masks,
      training=is_training)
def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
                 use_cascade_heads):
  num_classes = 3
  min_level = 3
  max_level = 4
  num_scales = 3
  aspect_ratios = [1.0]
  anchor_size = 3
  if use_cascade_heads:
    cascade_iou_thresholds = [0.6]
    class_agnostic_bbox_pred = True
    cascade_class_ensemble = True
  else:
    cascade_iou_thresholds = None
    class_agnostic_bbox_pred = False
    cascade_class_ensemble = False

  image_size = (256, 256)
  images = np.random.rand(2, image_size[0], image_size[1], 3)
  image_shape = np.array([[224, 100], [100, 224]])

  with strategy.scope():
    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=image_size).multilevel_boxes
    else:
      anchor_boxes = None
    num_anchors_per_location = len(aspect_ratios) * num_scales

    input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
    backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
    decoder = fpn.FPN(
        min_level=min_level,
        max_level=max_level,
        input_specs=backbone.output_specs)
    rpn_head = dense_prediction_heads.RPNHead(
        min_level=min_level,
        max_level=max_level,
        num_anchors_per_location=num_anchors_per_location)
    detection_head = instance_heads.DetectionHead(
        num_classes=num_classes,
        class_agnostic_bbox_pred=class_agnostic_bbox_pred)
    roi_generator_obj = roi_generator.MultilevelROIGenerator()

    roi_sampler_cascade = []
    roi_sampler_obj = roi_sampler.ROISampler()
    roi_sampler_cascade.append(roi_sampler_obj)
    if cascade_iou_thresholds:
      for iou in cascade_iou_thresholds:
        roi_sampler_obj = roi_sampler.ROISampler(
            mix_gt_boxes=False,
            foreground_iou_threshold=iou,
            background_iou_high_threshold=iou,
            background_iou_low_threshold=0.0,
            skip_subsampling=True)
        roi_sampler_cascade.append(roi_sampler_obj)

    roi_aligner_obj = roi_aligner.MultilevelROIAligner()
    detection_generator_obj = detection_generator.DetectionGenerator()
    if include_mask:
      mask_head = instance_heads.MaskHead(
          num_classes=num_classes, upsample_factor=2)
      mask_sampler_obj = mask_sampler.MaskSampler(
          mask_target_size=28, num_sampled_masks=1)
      mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
    else:
      mask_head = None
      mask_sampler_obj = None
      mask_roi_aligner_obj = None
    model = maskrcnn_model.MaskRCNNModel(
        backbone,
        decoder,
        rpn_head,
        detection_head,
        roi_generator_obj,
        roi_sampler_obj,
        roi_aligner_obj,
        detection_generator_obj,
        mask_head,
        mask_sampler_obj,
        mask_roi_aligner_obj,
        class_agnostic_bbox_pred=class_agnostic_bbox_pred,
        cascade_class_ensemble=cascade_class_ensemble,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    gt_boxes = np.array(
        [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
         [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
        dtype=np.float32)
    gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
    if include_mask:
      gt_masks = np.ones((2, 3, 100, 100))
    else:
      gt_masks = None

    results = model(
        images,
        image_shape,
        anchor_boxes,
        gt_boxes,
        gt_classes,
        gt_masks,
        training=training)

  self.assertIn('rpn_boxes', results)
  self.assertIn('rpn_scores', results)
  if training:
    self.assertIn('class_targets', results)
    self.assertIn('box_targets', results)
    self.assertIn('class_outputs', results)
    self.assertIn('box_outputs', results)
    if include_mask:
      self.assertIn('mask_outputs', results)
  else:
    self.assertIn('detection_boxes', results)
    self.assertIn('detection_scores', results)
    self.assertIn('detection_classes', results)
    self.assertIn('num_detections', results)
    if include_mask:
      self.assertIn('detection_masks', results)
def _call_box_outputs(
    self,
    images: tf.Tensor,
    image_shape: tf.Tensor,
    anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
    gt_boxes: Optional[tf.Tensor] = None,
    gt_classes: Optional[tf.Tensor] = None,
    training: Optional[bool] = None
) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
  """Implementation of the Faster-RCNN logic for boxes."""
  model_outputs = {}

  # Feature extraction.
  (backbone_features,
   decoder_features) = self._get_backbone_and_decoder_features(images)

  # Region proposal network.
  rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

  model_outputs.update({
      'backbone_features': backbone_features,
      'decoder_features': decoder_features,
      'rpn_boxes': rpn_boxes,
      'rpn_scores': rpn_scores
  })

  # Generate anchor boxes for this batch if not provided.
  if anchor_boxes is None:
    _, image_height, image_width, _ = images.get_shape().as_list()
    anchor_boxes = anchor.Anchor(
        min_level=self._config_dict['min_level'],
        max_level=self._config_dict['max_level'],
        num_scales=self._config_dict['num_scales'],
        aspect_ratios=self._config_dict['aspect_ratios'],
        anchor_size=self._config_dict['anchor_size'],
        image_size=(image_height, image_width)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0),
          [tf.shape(images)[0], 1, 1, 1])

  # Generate RoIs.
  current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                       image_shape, training)

  next_rois = current_rois
  all_class_outputs = []
  for cascade_num in range(len(self.roi_sampler)):
    # In cascade RCNN we want the higher layers to have different regression
    # weights as the predicted deltas become smaller and smaller.
    regression_weights = self._cascade_layer_to_weights[cascade_num]
    current_rois = next_rois

    (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
     matched_gt_classes, matched_gt_indices, current_rois) = (
         self._run_frcnn_head(
             features=decoder_features,
             rois=current_rois,
             gt_boxes=gt_boxes,
             gt_classes=gt_classes,
             training=training,
             model_outputs=model_outputs,
             cascade_num=cascade_num,
             regression_weights=regression_weights))
    all_class_outputs.append(class_outputs)

    # Generate ROIs for the next cascade head if there is any.
    if cascade_num < len(self.roi_sampler) - 1:
      next_rois = box_ops.decode_boxes(
          tf.cast(box_outputs, tf.float32),
          current_rois,
          weights=regression_weights)
      next_rois = box_ops.clip_boxes(next_rois,
                                     tf.expand_dims(image_shape, axis=1))

  if not training:
    if self._config_dict['cascade_class_ensemble']:
      class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

    detections = self.detection_generator(
        box_outputs,
        class_outputs,
        current_rois,
        image_shape,
        regression_weights,
        bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
    model_outputs.update({
        'cls_outputs': class_outputs,
        'box_outputs': box_outputs,
    })
    if self.detection_generator.get_config()['apply_nms']:
      model_outputs.update({
          'detection_boxes': detections['detection_boxes'],
          'detection_scores': detections['detection_scores'],
          'detection_classes': detections['detection_classes'],
          'num_detections': detections['num_detections']
      })
    else:
      model_outputs.update({
          'decoded_boxes': detections['decoded_boxes'],
          'decoded_box_scores': detections['decoded_box_scores']
      })

  intermediate_outputs = {
      'matched_gt_boxes': matched_gt_boxes,
      'matched_gt_indices': matched_gt_indices,
      'matched_gt_classes': matched_gt_classes,
      'current_rois': current_rois,
  }
  return (model_outputs, intermediate_outputs)
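# Illustrative sketch of the per-cascade ROI refinement step used in the loop
# above: predicted box deltas are decoded against the current ROIs with
# stage-specific regression weights, then clipped to the image bounds. The
# helper name `refine_rois` and the default weights are assumptions for
# illustration; the model reads its weights from `self._cascade_layer_to_weights`.
def refine_rois(box_outputs, rois, image_shape,
                weights=(10.0, 10.0, 5.0, 5.0)):
  decoded = box_ops.decode_boxes(
      tf.cast(box_outputs, tf.float32), rois, weights=weights)
  return box_ops.clip_boxes(decoded, tf.expand_dims(image_shape, axis=1))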
def call(self,
         images: Union[tf.Tensor, Sequence[tf.Tensor]],
         image_shape: Optional[tf.Tensor] = None,
         anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
         output_intermediate_features: bool = False,
         training: bool = None) -> Mapping[str, tf.Tensor]:
  """Forward pass of the RetinaNet model.

  Args:
    images: `Tensor` or a sequence of `Tensor`, the input batched images to
      the backbone network, whose shape(s) is [batch, height, width, 3]. If it
      is a sequence of `Tensor`, we will assume the anchors are generated
      based on the shape of the first image(s).
    image_shape: `Tensor`, the actual shape of the input images, whose shape
      is [batch, 2] where the last dimension is [height, width]. Note that
      this is the actual image shape excluding paddings. For example, images
      in the batch may be resized into different shapes before padding to the
      fixed size.
    anchor_boxes: a dict of tensors which includes multilevel anchors.
      - key: `str`, the level of the multilevel predictions.
      - values: `Tensor`, the anchor coordinates of a particular feature
        level, whose shape is [height_l, width_l, num_anchors_per_location].
    output_intermediate_features: `bool` indicating whether to return the
      intermediate feature maps generated by backbone and decoder.
    training: `bool`, indicating whether it is in training mode.

  Returns:
    scores: a dict of tensors which includes scores of the predictions.
      - key: `str`, the level of the multilevel predictions.
      - values: `Tensor`, the box scores predicted from a particular feature
        level, whose shape is
        [batch, height_l, width_l, num_classes * num_anchors_per_location].
    boxes: a dict of tensors which includes coordinates of the predictions.
      - key: `str`, the level of the multilevel predictions.
      - values: `Tensor`, the box coordinates predicted from a particular
        feature level, whose shape is
        [batch, height_l, width_l, 4 * num_anchors_per_location].
    attributes: a dict of (attribute_name, attribute_predictions). Each
      attribute prediction is a dict that includes:
      - key: `str`, the level of the multilevel predictions.
      - values: `Tensor`, the attribute predictions from a particular feature
        level, whose shape is
        [batch, height_l, width_l, att_size * num_anchors_per_location].
  """
  outputs = {}

  # Feature extraction.
  features = self.backbone(images)
  if output_intermediate_features:
    outputs.update(
        {'backbone_{}'.format(k): v for k, v in features.items()})
  if self.decoder:
    features = self.decoder(features)
  if output_intermediate_features:
    outputs.update(
        {'decoder_{}'.format(k): v for k, v in features.items()})

  # Dense prediction. `raw_attributes` can be empty.
  raw_scores, raw_boxes, raw_attributes = self.head(features)

  if training:
    outputs.update({
        'cls_outputs': raw_scores,
        'box_outputs': raw_boxes,
    })
    if raw_attributes:
      outputs.update({'attribute_outputs': raw_attributes})
    return outputs
  else:
    # Generate anchor boxes for this batch if not provided.
    if anchor_boxes is None:
      if isinstance(images, Sequence):
        primary_images = images[0]
      elif isinstance(images, tf.Tensor):
        primary_images = images
      else:
        raise ValueError(
            'Input should be a tf.Tensor or a sequence of tf.Tensor, '
            'not {}.'.format(type(images)))

      _, image_height, image_width, _ = primary_images.get_shape().as_list()
      anchor_boxes = anchor.Anchor(
          min_level=self._config_dict['min_level'],
          max_level=self._config_dict['max_level'],
          num_scales=self._config_dict['num_scales'],
          aspect_ratios=self._config_dict['aspect_ratios'],
          anchor_size=self._config_dict['anchor_size'],
          image_size=(image_height, image_width)).multilevel_boxes
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0),
            [tf.shape(primary_images)[0], 1, 1, 1])

    # Post-processing.
    final_results = self.detection_generator(raw_boxes, raw_scores,
                                             anchor_boxes, image_shape,
                                             raw_attributes)
    outputs.update({
        'cls_outputs': raw_scores,
        'box_outputs': raw_boxes,
    })
    if self.detection_generator.get_config()['apply_nms']:
      outputs.update({
          'detection_boxes': final_results['detection_boxes'],
          'detection_scores': final_results['detection_scores'],
          'detection_classes': final_results['detection_classes'],
          'num_detections': final_results['num_detections']
      })
    else:
      outputs.update({
          'decoded_boxes': final_results['decoded_boxes'],
          'decoded_box_scores': final_results['decoded_box_scores']
      })
    if raw_attributes:
      outputs.update({
          'attribute_outputs': raw_attributes,
          'detection_attributes': final_results['detection_attributes'],
      })
    return outputs
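# Minimal inference-time sketch for the `call` method above, assuming `model`
# is a built `retinanet_model.RetinaNetModel` (as constructed in the tests
# earlier) whose detection generator applies NMS. The input sizes and the
# shape comments are illustrative assumptions only.
images = tf.random.normal([2, 384, 384, 3])
image_shape = tf.constant([[384, 384], [384, 384]], tf.float32)
outputs = model(images, image_shape, training=False)
# With NMS applied, post-processed detections are returned alongside the raw
# per-level `cls_outputs` and `box_outputs`.
boxes = outputs['detection_boxes']    # e.g. [batch, max_num_detections, 4]
scores = outputs['detection_scores']  # e.g. [batch, max_num_detections]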