Example #1
  def _build_anchor_boxes(self):
    """Builds and returns anchor boxes."""
    model_params = self.params.task.model
    input_anchor = anchor.build_anchor_generator(
        min_level=model_params.min_level,
        max_level=model_params.max_level,
        num_scales=model_params.anchor.num_scales,
        aspect_ratios=model_params.anchor.aspect_ratios,
        anchor_size=model_params.anchor.anchor_size)
    return input_anchor(image_size=(self._input_image_size[0],
                                    self._input_image_size[1]))
    def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
                         anchor_size, has_attribute):
        input_size = [512, 512]
        ground_truth_class_id = 2
        attribute_name = 'depth'
        ground_truth_depth = 3.0

        # The matched anchors are the anchors used as ground truth and the anchors
        # at the next octave scale on the same location.
        expected_anchor_locations = [[0, 0, 0], [0, 0, 1]]
        anchor_gen = anchor.build_anchor_generator(min_level, max_level,
                                                   num_scales, aspect_ratios,
                                                   anchor_size)
        anchor_boxes = anchor_gen(input_size)
        anchor_labeler = anchor.AnchorLabeler()

        # Uses the first anchors as ground truth. The ground truth should map to
        # two anchors with two intermediate scales at the same location.
        gt_boxes = anchor_boxes['3'][0:1, 0, 0:4]
        gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32)
        gt_attributes = {
            attribute_name: tf.constant([[ground_truth_depth]],
                                        dtype=tf.float32)
        } if has_attribute else {}

        (cls_targets, box_targets, att_targets, _,
         box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes,
                                                     gt_classes, gt_attributes)

        for k, v in cls_targets.items():
            cls_targets[k] = v.numpy()
        for k, v in box_targets.items():
            box_targets[k] = v.numpy()
        box_weights = box_weights.numpy()

        anchor_locations = np.vstack(
            np.where(cls_targets[str(min_level)] > -1)).transpose()
        self.assertAllClose(expected_anchor_locations, anchor_locations)
        # Two anchor boxes on min_level got matched to the gt_boxes.
        self.assertAllClose(tf.reduce_sum(box_weights), 2)

        if has_attribute:
            self.assertIn(attribute_name, att_targets)
            for k, v in att_targets[attribute_name].items():
                att_targets[attribute_name][k] = v.numpy()
            anchor_locations = np.vstack(
                np.where(att_targets[attribute_name][str(min_level)] > 0.0)
            ).transpose()
            self.assertAllClose(expected_anchor_locations, anchor_locations)
        else:
            self.assertEmpty(att_targets)
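
The labeler above is built with default thresholds; the parser examples below pass explicit ones. A hedged sketch of the same call, reusing the tensors defined in the test above (the threshold values are hypothetical):

# Positional args mirror the parsers below: (match_threshold, unmatched_threshold).
labeler = anchor.AnchorLabeler(0.5, 0.5)
(cls_targets, box_targets, att_targets, cls_weights,
 box_weights) = labeler.label_anchors(
     anchor_boxes, gt_boxes, gt_classes, gt_attributes)
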
    def testEquivalentResult(self, min_level, max_level, aspect_ratios,
                             num_scales, anchor_size, image_size):
        anchor_gen = anchor.build_anchor_generator(min_level=min_level,
                                                   max_level=max_level,
                                                   num_scales=num_scales,
                                                   aspect_ratios=aspect_ratios,
                                                   anchor_size=anchor_size)
        anchors = anchor_gen(image_size)
        expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales,
                                            aspect_ratios, anchor_size,
                                            image_size)

        expected_anchors = expected_anchor_gen.multilevel_boxes
        for k in expected_anchors.keys():
            self.assertAllClose(expected_anchors[k], anchors[k])
    def test_forward(self, strategy, image_size, training, has_att_heads,
                     output_intermediate_features, soft_nms_sigma):
        """Test for creation of a R50-FPN RetinaNet."""
        tf.keras.backend.set_image_data_format('channels_last')
        num_classes = 3
        min_level = 3
        max_level = 7
        num_scales = 3
        aspect_ratios = [1.0]
        num_anchors_per_location = num_scales * len(aspect_ratios)

        images = np.random.rand(2, image_size[0], image_size[1], 3)
        image_shape = np.array([[image_size[0], image_size[1]],
                                [image_size[0], image_size[1]]])

        with strategy.scope():
            anchor_gen = anchor.build_anchor_generator(
                min_level=min_level,
                max_level=max_level,
                num_scales=num_scales,
                aspect_ratios=aspect_ratios,
                anchor_size=3)
            anchor_boxes = anchor_gen(image_size)
            for l in anchor_boxes:
                anchor_boxes[l] = tf.tile(
                    tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])

            backbone = resnet.ResNet(model_id=50)
            decoder = fpn.FPN(input_specs=backbone.output_specs,
                              min_level=min_level,
                              max_level=max_level)

            if has_att_heads:
                attribute_heads = [
                    dict(name='depth', type='regression', size=1)
                ]
            else:
                attribute_heads = None
            head = dense_prediction_heads.RetinaNetHead(
                min_level=min_level,
                max_level=max_level,
                num_classes=num_classes,
                attribute_heads=attribute_heads,
                num_anchors_per_location=num_anchors_per_location)
            generator = detection_generator.MultilevelDetectionGenerator(
                max_num_detections=10,
                nms_version='v1',
                use_cpu_nms=soft_nms_sigma is not None,
                soft_nms_sigma=soft_nms_sigma)
            model = retinanet_model.RetinaNetModel(
                backbone=backbone,
                decoder=decoder,
                head=head,
                detection_generator=generator)

            model_outputs = model(
                images,
                image_shape,
                anchor_boxes,
                output_intermediate_features=output_intermediate_features,
                training=training)

        if training:
            cls_outputs = model_outputs['cls_outputs']
            box_outputs = model_outputs['box_outputs']
            for level in range(min_level, max_level + 1):
                self.assertIn(str(level), cls_outputs)
                self.assertIn(str(level), box_outputs)
                self.assertAllEqual([
                    2, image_size[0] // 2**level, image_size[1] // 2**level,
                    num_classes * num_anchors_per_location
                ], cls_outputs[str(level)].numpy().shape)
                self.assertAllEqual([
                    2, image_size[0] // 2**level, image_size[1] // 2**level,
                    4 * num_anchors_per_location
                ], box_outputs[str(level)].numpy().shape)
                if has_att_heads:
                    att_outputs = model_outputs['attribute_outputs']
                    for att in att_outputs.values():
                        self.assertAllEqual([
                            2, image_size[0] // 2**level, image_size[1] //
                            2**level, 1 * num_anchors_per_location
                        ], att[str(level)].numpy().shape)
        else:
            self.assertIn('detection_boxes', model_outputs)
            self.assertIn('detection_scores', model_outputs)
            self.assertIn('detection_classes', model_outputs)
            self.assertIn('num_detections', model_outputs)
            self.assertAllEqual([2, 10, 4],
                                model_outputs['detection_boxes'].numpy().shape)
            self.assertAllEqual(
                [2, 10], model_outputs['detection_scores'].numpy().shape)
            self.assertAllEqual(
                [2, 10], model_outputs['detection_classes'].numpy().shape)
            self.assertAllEqual([
                2,
            ], model_outputs['num_detections'].numpy().shape)
            if has_att_heads:
                self.assertIn('detection_attributes', model_outputs)
                self.assertAllEqual([2, 10, 1],
                                    model_outputs['detection_attributes']
                                    ['depth'].numpy().shape)
        if output_intermediate_features:
            for l in range(2, 6):
                self.assertIn('backbone_{}'.format(l), model_outputs)
                self.assertAllEqual([
                    2, image_size[0] // 2**l, image_size[1] // 2**l,
                    backbone.output_specs[str(l)].as_list()[-1]
                ], model_outputs['backbone_{}'.format(l)].numpy().shape)
            for l in range(min_level, max_level + 1):
                self.assertIn('decoder_{}'.format(l), model_outputs)
                self.assertAllEqual([
                    2, image_size[0] // 2**l, image_size[1] // 2**l,
                    decoder.output_specs[str(l)].as_list()[-1]
                ], model_outputs['decoder_{}'.format(l)].numpy().shape)
    def testDetectionsOutputShape(self, nms_version, has_att_heads,
                                  use_cpu_nms, soft_nms_sigma,
                                  use_regular_nms):
        min_level = 4
        max_level = 6
        num_scales = 2
        max_num_detections = 10
        aspect_ratios = [1.0, 2.0]
        anchor_scale = 2.0
        output_size = [64, 64]
        num_classes = 4
        pre_nms_top_k = 5000
        pre_nms_score_threshold = 0.01
        batch_size = 1
        tflite_post_processing_config = {
            'max_detections': max_num_detections,
            'max_classes_per_detection': 1,
            'use_regular_nms': use_regular_nms,
            'nms_score_threshold': 0.01,
            'nms_iou_threshold': 0.5
        }
        kwargs = {
            'apply_nms': True,
            'pre_nms_top_k': pre_nms_top_k,
            'pre_nms_score_threshold': pre_nms_score_threshold,
            'nms_iou_threshold': 0.5,
            'max_num_detections': max_num_detections,
            'nms_version': nms_version,
            'use_cpu_nms': use_cpu_nms,
            'soft_nms_sigma': soft_nms_sigma,
            'tflite_post_processing_config': tflite_post_processing_config
        }

        input_anchor = anchor.build_anchor_generator(min_level, max_level,
                                                     num_scales, aspect_ratios,
                                                     anchor_scale)
        anchor_boxes = input_anchor(output_size)
        cls_outputs_all = (np.random.rand(84, num_classes) -
                           0.5) * 3  # random 84 x num_classes outputs.
        box_outputs_all = np.random.rand(84, 4)  # random 84 boxes.
        class_outputs = {
            '4':
            tf.reshape(
                tf.convert_to_tensor(cls_outputs_all[0:64], dtype=tf.float32),
                [1, 8, 8, num_classes]),
            '5':
            tf.reshape(
                tf.convert_to_tensor(cls_outputs_all[64:80], dtype=tf.float32),
                [1, 4, 4, num_classes]),
            '6':
            tf.reshape(
                tf.convert_to_tensor(cls_outputs_all[80:84], dtype=tf.float32),
                [1, 2, 2, num_classes]),
        }
        box_outputs = {
            '4':
            tf.reshape(
                tf.convert_to_tensor(box_outputs_all[0:64], dtype=tf.float32),
                [1, 8, 8, 4]),
            '5':
            tf.reshape(
                tf.convert_to_tensor(box_outputs_all[64:80], dtype=tf.float32),
                [1, 4, 4, 4]),
            '6':
            tf.reshape(
                tf.convert_to_tensor(box_outputs_all[80:84], dtype=tf.float32),
                [1, 2, 2, 4]),
        }
        if has_att_heads:
            att_outputs_all = np.random.rand(84, 1)  # random attributes.
            att_outputs = {
                'depth': {
                    '4':
                    tf.reshape(
                        tf.convert_to_tensor(att_outputs_all[0:64],
                                             dtype=tf.float32), [1, 8, 8, 1]),
                    '5':
                    tf.reshape(
                        tf.convert_to_tensor(att_outputs_all[64:80],
                                             dtype=tf.float32), [1, 4, 4, 1]),
                    '6':
                    tf.reshape(
                        tf.convert_to_tensor(att_outputs_all[80:84],
                                             dtype=tf.float32), [1, 2, 2, 1]),
                }
            }
        else:
            att_outputs = None
        image_info = tf.constant(
            [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]], dtype=tf.float32)
        generator = detection_generator.MultilevelDetectionGenerator(**kwargs)
        results = generator(box_outputs, class_outputs, anchor_boxes,
                            image_info[:, 1, :], att_outputs)
        boxes = results['detection_boxes']
        classes = results['detection_classes']
        scores = results['detection_scores']
        valid_detections = results['num_detections']

        if nms_version == 'tflite':
            # When nms_version is `tflite`, all output tensors are empty as the actual
            # post-processing happens in the TFLite model.
            self.assertEqual(boxes.numpy().shape, ())
            self.assertEqual(scores.numpy().shape, ())
            self.assertEqual(classes.numpy().shape, ())
            self.assertEqual(valid_detections.numpy().shape, ())
        else:
            self.assertEqual(boxes.numpy().shape,
                             (batch_size, max_num_detections, 4))
            self.assertEqual(scores.numpy().shape, (
                batch_size,
                max_num_detections,
            ))
            self.assertEqual(classes.numpy().shape, (
                batch_size,
                max_num_detections,
            ))
            self.assertEqual(valid_detections.numpy().shape, (batch_size, ))
            if has_att_heads:
                for att in results['detection_attributes'].values():
                    self.assertEqual(att.numpy().shape,
                                     (batch_size, max_num_detections, 1))
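
The image_info constant used above follows the same four-row convention that the parser docstrings below describe: original size, scaled size, scale, and offset. A small annotated sketch (the row meanings are inferred from how the slices are used in these examples):

image_info = tf.constant(
    [[[1000, 1000],   # row 0: original (height, width)
      [100, 100],     # row 1: scaled (height, width)  -> image_info[:, 1, :] above
      [0.1, 0.1],     # row 2: (y_scale, x_scale)      -> image_info[2, :] in the parsers
      [0, 0]]],       # row 3: (y_offset, x_offset)    -> image_info[3, :] in the parsers
    dtype=tf.float32)
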
Example #6
  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        image: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for training. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
            [y_scale, x_scale], [y_offset, x_offset]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensors
            with shape [height_l, width_l, anchors_per_location * 4]
            representing the anchor boxes at each level.
    """
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape)

    # Computes anchor boxes.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))

    labels = {
        'image_info': image_info,
        'anchor_boxes': anchor_boxes,
    }

    groundtruths = {
        'source_id': data['source_id'],
        'height': data['height'],
        'width': data['width'],
        'num_detections': tf.shape(data['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': data['groundtruth_classes'],
        'areas': data['groundtruth_area'],
        'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
    }
    groundtruths['source_id'] = utils.process_source_id(
        groundtruths['source_id'])
    groundtruths = utils.pad_groundtruths_to_fixed_size(
        groundtruths, self._max_num_instances)
    labels['groundtruths'] = groundtruths
    return image, labels
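
For context, _parse_eval_data is typically mapped over decoded TF Examples. A hypothetical wiring sketch: TfExampleDecoder is named in the docstring above, while the tf_example_decoder import, the parser instance, eval_files, and the batch size are placeholders:

decoder = tf_example_decoder.TfExampleDecoder()  # module import assumed

def parse_fn(serialized_example):
  data = decoder.decode(serialized_example)
  return parser._parse_eval_data(data)

dataset = (tf.data.TFRecordDataset(eval_files)
           .map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(8))
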
Example #7
  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, anchors_per_location * 4] representing the
          anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
           in [y1, x1, y2, x2] format. The coordinates are w.r.t. the scaled
           image that is fed to the network. The tensor is padded with -1 to
           the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth class annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(
            image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfers boxes back to the original image space and normalizes them.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': anchor_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
        'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
            boxes, self._max_num_instances, -1),
        'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
            classes, self._max_num_instances, -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return image, labels
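
The box geometry above (denormalize to pixels, then scale and offset into the resized image) can be sanity-checked with concrete numbers. A small worked sketch with hypothetical values:

# A normalized box on a 200x200 image, resized by a factor of 0.5 with no crop offset.
image_shape = tf.constant([200.0, 200.0])
norm_boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]])
pixel_boxes = box_ops.denormalize_boxes(norm_boxes, image_shape)
# pixel_boxes -> [[20., 20., 100., 100.]] in the original image frame.
scaled_boxes = preprocess_ops.resize_and_crop_boxes(
    pixel_boxes, tf.constant([0.5, 0.5]), tf.constant([100.0, 100.0]),
    tf.constant([0.0, 0.0]))
# scaled_boxes is expected to be [[10., 10., 50., 50.]] w.r.t. the scaled image.
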
    def _parse_eval_data(self, data):
        """Parses data for training and evaluation."""
        groundtruths = {}
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
        # `ground_truth` of each attribute is assumed to have shape [N, attribute_size].
        # TODO(xianzhi): support parsing attributes weights.
        attributes = data.get('groundtruth_attributes', {})

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = preprocess_ops.normalize_image(image)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_ops.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = preprocess_ops.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=preprocess_ops.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                     image_info[1, :], offset)
        # Filters out ground truth boxes that are all zeros.
        indices = box_ops.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        for k, v in attributes.items():
            attributes[k] = tf.gather(v, indices)

        # Assigns anchors.
        input_anchor = anchor.build_anchor_generator(
            min_level=self._min_level,
            max_level=self._max_level,
            num_scales=self._num_scales,
            aspect_ratios=self._aspect_ratios,
            anchor_size=self._anchor_size)
        anchor_boxes = input_anchor(image_size=(image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets, att_targets, cls_weights,
         box_weights) = anchor_labeler.label_anchors(
             anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

        # Casts input image to desired data type.
        image = tf.cast(image, dtype=self._dtype)

        # Sets up groundtruth data for evaluation.
        groundtruths = {
            'source_id': data['source_id'],
            'height': data['height'],
            'width': data['width'],
            'num_detections': tf.shape(data['groundtruth_classes']),
            'image_info': image_info,
            'boxes': box_ops.denormalize_boxes(data['groundtruth_boxes'],
                                               image_shape),
            'classes': data['groundtruth_classes'],
            'areas': data['groundtruth_area'],
            'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        if 'groundtruth_attributes' in data:
            groundtruths['attributes'] = data['groundtruth_attributes']
        groundtruths['source_id'] = utils.process_source_id(
            groundtruths['source_id'])
        groundtruths = utils.pad_groundtruths_to_fixed_size(
            groundtruths, self._max_num_instances)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': anchor_boxes,
            'cls_weights': cls_weights,
            'box_weights': box_weights,
            'image_info': image_info,
            'groundtruths': groundtruths,
        }
        if att_targets:
            labels['attribute_targets'] = att_targets
        return image, labels
    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        # If not empty, `attributes` is a dict of (name, ground_truth) pairs.
        # `ground_truth` of each attribute is assumed to have shape [N, attribute_size].
        # TODO(xianzhi): support parsing attributes weights.
        attributes = data.get('groundtruth_attributes', {})
        is_crowds = data['groundtruth_is_crowd']

        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training:
            num_groundtruths = tf.shape(input=classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    pred=tf.greater(tf.size(input=is_crowds), 0),
                    true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    false_fn=lambda: tf.cast(
                        tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            for k, v in attributes.items():
                attributes[k] = tf.gather(v, indices)

        # Gets original image.
        image = data['image']

        # Apply autoaug or randaug.
        if self._augmenter is not None:
            image, boxes = self._augmenter.distort_with_boxes(image, boxes)
        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = preprocess_ops.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes, _ = preprocess_ops.random_horizontal_flip(
                image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_ops.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = preprocess_ops.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=preprocess_ops.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
                                                     image_info[1, :], offset)
        # Filters out ground truth boxes that are all zeros.
        indices = box_ops.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        for k, v in attributes.items():
            attributes[k] = tf.gather(v, indices)

        # Assigns anchors.
        input_anchor = anchor.build_anchor_generator(
            min_level=self._min_level,
            max_level=self._max_level,
            num_scales=self._num_scales,
            aspect_ratios=self._aspect_ratios,
            anchor_size=self._anchor_size)
        anchor_boxes = input_anchor(image_size=(image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets, att_targets, cls_weights,
         box_weights) = anchor_labeler.label_anchors(
             anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes)

        # Casts input image to desired data type.
        image = tf.cast(image, dtype=self._dtype)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': anchor_boxes,
            'cls_weights': cls_weights,
            'box_weights': box_weights,
            'image_info': image_info,
        }
        if att_targets:
            labels['attribute_targets'] = att_targets
        return image, labels
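
Putting the RetinaNet training parser together, the per-level structure of the returned targets can be inspected. A hedged sketch, where parser and data are placeholders for an instance of the class above and a decoded example:

image, labels = parser._parse_train_data(data)
for level in labels['cls_targets']:
  # Level keys are strings; class targets are roughly
  # [height_l, width_l, anchors_per_location] and box targets
  # [height_l, width_l, anchors_per_location * 4], as suggested by
  # testLabelAnchors and the RPN docstring above.
  print(level, labels['cls_targets'][level].shape,
        labels['box_targets'][level].shape)
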