Example #1
    def _build_outputs(self, images, labels, mode):
        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._anchor_params.min_level, self._anchor_params.max_level,
                self._anchor_params.num_scales,
                self._anchor_params.aspect_ratios,
                self._anchor_params.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

        backbone_features = self._backbone_fn(
            images, is_training=(mode == mode_keys.TRAIN))
        fpn_features = self._fpn_fn(backbone_features,
                                    is_training=(mode == mode_keys.TRAIN))
        cls_outputs, box_outputs = self._head_fn(
            fpn_features, is_training=(mode == mode_keys.TRAIN))
        model_outputs = {
            'cls_outputs': cls_outputs,
            'box_outputs': box_outputs,
        }

        if mode != mode_keys.TRAIN:
            detection_results = self._generate_detections_fn(
                box_outputs, cls_outputs, anchor_boxes,
                labels['image_info'][:, 1:2, :])
            model_outputs.update(detection_results)
        return model_outputs
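Note that this variant uses the per-image anchor boxes directly, while later examples (#4 and #6) tile them across the batch before detection generation. A minimal sketch of that tiling step, assuming Anchor.multilevel_boxes yields a dict of per-level [num_anchors_l, 4] tensors:

import tensorflow as tf

def tile_anchor_boxes_to_batch(multilevel_boxes, batch_size):
    """Tiles per-image anchor boxes to shape [batch_size, num_anchors_l, 4]."""
    return {
        level: tf.tile(tf.expand_dims(boxes, 0), [batch_size, 1, 1])
        for level, boxes in multilevel_boxes.items()
    }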
Example #2
    def _serving_model_graph(features, params):
        """Build the model graph for serving."""
        images = features['images']
        _, height, width, _ = images.get_shape().as_list()

        input_anchor = anchor.Anchor(params.anchor.min_level,
                                     params.anchor.max_level,
                                     params.anchor.num_scales,
                                     params.anchor.aspect_ratios,
                                     params.anchor.anchor_size,
                                     (height, width))

        model_fn = factory.model_generator(params)
        model_outputs = model_fn.build_outputs(
            features['images'],
            labels={'anchor_boxes': input_anchor.multilevel_boxes},
            mode=mode_keys.PREDICT)

        if cast_num_detections_to_float:
            model_outputs['num_detections'] = tf.cast(
                model_outputs['num_detections'], dtype=tf.float32)

        if output_image_info:
            model_outputs.update({
                'image_info': features['image_info'],
            })

        if output_normalized_coordinates:
            model_outputs['detection_boxes'] = box_utils.normalize_boxes(
                model_outputs['detection_boxes'],
                features['image_info'][:, 1:2, :])

        return model_outputs
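The slice features['image_info'][:, 1:2, :] picks out the scaled image size. Restating the row layout from the parser docstrings later on this page:

# image_info row layout (from the parser docstrings below):
#   image_info[0, :] -> [original_height, original_width]
#   image_info[1, :] -> [scaled_height, scaled_width]
#   image_info[2, :] -> [y_scale, x_scale]
#   image_info[3, :] -> [y_offset, x_offset]
scaled_size = features['image_info'][:, 1:2, :]  # shape [batch, 1, 2]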
Example #3
  def _serving_model_graph(features, params):
    """Build the model graph for serving."""
    images = features['images']
    batch_size, height, width, _ = images.get_shape().as_list()

    input_anchor = anchor.Anchor(
        params.anchor.min_level, params.anchor.max_level,
        params.anchor.num_scales, params.anchor.aspect_ratios,
        params.anchor.anchor_size, (height, width))

    multilevel_boxes = {}
    for k, v in six.iteritems(input_anchor.multilevel_boxes):
      multilevel_boxes[k] = tf.tile(
          tf.expand_dims(v, 0), [batch_size, 1, 1])

    model_fn = factory.model_generator(params)
    model_outputs = model_fn.build_outputs(
        features['images'],
        labels={
            'anchor_boxes': multilevel_boxes,
            'image_info': features['image_info'],
        },
        mode=mode_keys.PREDICT)

    if cast_num_detections_to_float:
      model_outputs['num_detections'] = tf.cast(
          model_outputs['num_detections'], dtype=tf.float32)

    if output_image_info:
      model_outputs.update({
          'image_info': features['image_info'],
      })

    if output_normalized_coordinates:
      model_outputs['detection_boxes'] = box_utils.normalize_boxes(
          model_outputs['detection_boxes'],
          features['image_info'][:, 1:2, :])

    predictions = {
        'num_detections': tf.identity(
            model_outputs['num_detections'], 'NumDetections'),
        'detection_boxes': tf.identity(
            model_outputs['detection_boxes'], 'DetectionBoxes'),
        'detection_classes': tf.identity(
            model_outputs['detection_classes'], 'DetectionClasses'),
        'detection_scores': tf.identity(
            model_outputs['detection_scores'], 'DetectionScores'),
    }
    if 'detection_masks' in model_outputs:
      predictions.update({
          'detection_masks':
              tf.identity(model_outputs['detection_masks'], 'DetectionMasks'),
      })

    if output_image_info:
      predictions['image_info'] = tf.identity(
          model_outputs['image_info'], 'ImageInfo')

    return predictions
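The tf.identity wrappers above give the outputs stable, human-readable names so serving clients can fetch them by name. A minimal consumer sketch, assuming a TF1-style graph exported from this code (the graph handle, input batch, and placeholder name are hypothetical, not from the original snippet):

import tensorflow.compat.v1 as tf

with tf.Session(graph=serving_graph) as sess:
    num_dets, det_boxes = sess.run(
        ['NumDetections:0', 'DetectionBoxes:0'],
        feed_dict={'Placeholder:0': image_batch})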
Example #4
    def _build_outputs(self, images, labels, mode):
        batch_size = tf.shape(images)[0]
        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._params.architecture.min_level,
                self._params.architecture.max_level,
                self._params.anchor.num_scales,
                self._params.anchor.aspect_ratios,
                self._params.anchor.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(
            images, is_training=(mode == mode_keys.TRAIN))
        fpn_features = self._fpn_fn(backbone_features,
                                    is_training=(mode == mode_keys.TRAIN))

        if self._params.architecture.output_flat_fpn_features:
            flat_fpn_features_list = []
            for level in range(self._params.architecture.min_level,
                               self._params.architecture.max_level + 1):
                flat_fpn_features_list.append(
                    tf.reshape(fpn_features[level], [batch_size, -1]))
            flat_fpn_features = tf.concat(flat_fpn_features_list, axis=1)
            flat_fpn_features = tf.identity(flat_fpn_features,
                                            'RawFpnFeatures')

        cls_outputs, box_outputs = self._head_fn(
            fpn_features, is_training=(mode == mode_keys.TRAIN))
        model_outputs = {
            'cls_outputs': cls_outputs,
            'box_outputs': box_outputs,
        }

        tf.logging.info('Computing number of FLOPs before NMS...')
        static_batch_size = images.get_shape().as_list()[0]
        if static_batch_size:
            _, _ = benchmark_utils.compute_model_statistics(static_batch_size)

        if mode != mode_keys.TRAIN:
            detection_results = self._generate_detections_fn(
                box_outputs, cls_outputs, anchor_boxes,
                labels['image_info'][:, 1:2, :])
            model_outputs.update(detection_results)
        return model_outputs
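The guard around compute_model_statistics matters because images.get_shape().as_list()[0] is the static batch size and can be None at graph-construction time, while tf.shape(images)[0] is always available as a runtime tensor. A standalone illustration of the distinction:

import tensorflow.compat.v1 as tf

images = tf.placeholder(tf.float32, shape=[None, 640, 640, 3])
static_batch = images.get_shape().as_list()[0]  # None at graph-build time
dynamic_batch = tf.shape(images)[0]             # scalar tensor at run time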
Example #5
def parse_single_example(serialized_example, params):
    """Parses a single serialized TFExample string."""
    decoder = tf_example_decoder.TfExampleDecoder()
    data = decoder.decode(serialized_example)
    image = data['image']
    source_id = data['source_id']
    source_id = dataloader_utils.process_source_id(source_id)
    height = data['height']
    width = data['width']
    boxes = data['groundtruth_boxes']
    boxes = box_utils.denormalize_boxes(boxes, tf.shape(image)[:2])
    classes = data['groundtruth_classes']
    is_crowds = data['groundtruth_is_crowd']
    areas = data['groundtruth_area']

    image = input_utils.normalize_image(image)
    image, image_info = input_utils.resize_and_crop_image(
        image,
        params.retinanet_parser.output_size,
        padded_size=input_utils.compute_padded_size(
            params.retinanet_parser.output_size, 2**params.anchor.max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    anchors = anchor.Anchor(params.anchor.min_level, params.anchor.max_level,
                            params.anchor.num_scales,
                            params.anchor.aspect_ratios,
                            params.anchor.anchor_size,
                            image.get_shape().as_list()[:2])

    labels = {
        'anchor_boxes': anchors.multilevel_boxes,
        'image_info': image_info,
    }
    groundtruths = {
        'source_id': source_id,
        'height': height,
        'width': width,
        'num_detections': tf.shape(classes),
        'boxes': boxes,
        'classes': classes,
        'areas': areas,
        'is_crowds': tf.cast(is_crowds, tf.int32),
    }
    return image, labels, groundtruths
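A plausible way to wire this parser into an input pipeline (the file list and params object are placeholders, not part of the original code):

import tensorflow as tf

dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(
    lambda serialized: parse_single_example(serialized, params),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)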
Example #6
    def _build_outputs(self, images, labels, mode):
        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._anchor_params.min_level, self._anchor_params.max_level,
                self._anchor_params.num_scales,
                self._anchor_params.aspect_ratios,
                self._anchor_params.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

            batch_size = tf.shape(images)[0]
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(
            images, is_training=(mode == mode_keys.TRAIN))
        fpn_features = self._fpn_fn(backbone_features,
                                    is_training=(mode == mode_keys.TRAIN))
        cls_outputs, box_outputs = self._head_fn(
            fpn_features, is_training=(mode == mode_keys.TRAIN))
        model_outputs = {
            'cls_outputs': cls_outputs,
            'box_outputs': box_outputs,
        }

        tf.logging.info('Computing number of FLOPs before NMS...')
        _, _ = benchmark_utils.compute_model_statistics(
            images.get_shape().as_list()[0])

        if mode != mode_keys.TRAIN:
            detection_results = self._generate_detections_fn(
                box_outputs, cls_outputs, anchor_boxes,
                labels['image_info'][:, 1:2, :])
            model_outputs.update(detection_results)
        return model_outputs
Example #7
    def _parse_train_data_v2(self, data):
        """Parses data for training.

        Args:
          data: the decoded tensor dictionary from TfExampleDecoder.

        Returns:
          image: image tensor that is preprocessed to have normalized value and
            dimension [output_size[0], output_size[1], 3]
          labels: a dictionary of tensors used for training. The following describes
            {key: value} pairs in the dictionary.
            image_info: a 2D `Tensor` that encodes the information of the image and
              the applied preprocessing. It is in the format of
              [[original_height, original_width], [scaled_height, scaled_width],
              [y_scale, x_scale], [y_offset, x_offset]].
            anchor_boxes: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor with
              shape [height_l, width_l, 4] representing anchor boxes at each level.
            rpn_score_targets: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor with
              shape [height_l, width_l, anchors_per_location]. The height_l and
              width_l represent the dimension of class logits at l-th level.
            rpn_box_targets: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor with
              shape [height_l, width_l, anchors_per_location * 4]. The height_l and
              width_l represent the dimension of bounding box regression output at
              l-th level.
            gt_boxes: Groundtruth bounding box annotations. The box is represented
               in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
               image that is fed to the network. The tensor is padded with -1 to
               the fixed dimension [self._max_num_instances, 4].
            gt_classes: Groundtruth classes annotations. The tensor is padded
              with -1 to the fixed dimension [self._max_num_instances].
            gt_masks: Groundtruth masks cropped by the bounding box and
              resized to a fixed size determined by mask_crop_size.
        """
        if self._use_autoaugment:
            try:
                from utils import autoaugment_utils  # pylint: disable=g-import-not-at-top
            except ImportError as e:
                logging.exception("Autoaugment is not supported in TF 2.x.")
                raise e

        classes = data["groundtruth_classes"]
        boxes = data["groundtruth_boxes"]
        masks = None
        attributes = None

        if self._include_mask:
            masks = data["groundtruth_instance_masks"]

        if self._num_attributes:
            attributes = data["groundtruth_attributes"]

        is_crowds = data["groundtruth_is_crowd"]
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(input=classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    pred=tf.greater(tf.size(input=is_crowds), 0),
                    true_fn=lambda: tf.compat.v1.where(
                        tf.logical_not(is_crowds))[:, 0],
                    false_fn=lambda: tf.cast(
                        tf.range(num_groundtruths), tf.int64),
                )
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

            if self._include_mask:
                masks = tf.gather(masks, indices)

            if self._num_attributes:
                attributes = tf.gather(attributes, indices)

        # Gets original image and its size.
        image = data["image"]

        # NOTE: The autoaugment method works best when used alongside the standard
        # horizontal flipping of images along with size jittering and normalization.
        if self._use_autoaugment and not self._apply_autoaugment_after_resizing:
            image, boxes, masks = (
                autoaugment_utils.distort_image_and_masks_with_autoaugment(
                    image, boxes, masks, self._autoaugment_policy_name))

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, boxes, masks = input_utils.random_horizontal_flip(
                    image, boxes, masks)
            else:
                image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Resizes and crops image.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max,
        )

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        orig_image_shape = image_info[0]
        boxes = box_utils.denormalize_boxes(boxes, orig_image_shape)

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        rescaled_image_shape = tf.shape(input=image)[:2]
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  rescaled_image_shape, offset)

        # Filters out ground truth boxes that are all zeros.
        boxes, classes, masks, attributes = self._remove_empty_boxes(
            boxes, classes, masks, attributes)

        # apply the autoaugment after resizing
        if self._use_autoaugment and self._apply_autoaugment_after_resizing:
            # prepare image and boxes for the autoaugment
            image = tf.image.convert_image_dtype(image, dtype=tf.uint8)
            boxes = box_utils.normalize_boxes(boxes, rescaled_image_shape)

            # prepare masks for the autoaugment
            masks = tf.expand_dims(masks, axis=-1)
            scaled_mask_size = tf.cast(
                tf.round(orig_image_shape * image_scale), tf.int32)
            scaled_masks = tf.image.resize(
                masks, scaled_mask_size, method=tf.image.ResizeMethod.BILINEAR)
            offset_int = tf.cast(offset, tf.int32)
            masks = scaled_masks[
                :,
                offset_int[0]:offset_int[0] + rescaled_image_shape[0],
                offset_int[1]:offset_int[1] + rescaled_image_shape[1],
            ]
            masks = tf.squeeze(masks, axis=-1)
            masks = tf.cast(tf.greater(masks, 0.5), tf.float32)

            # apply the autoaugment
            image, boxes, masks = (
                autoaugment_utils.distort_image_and_masks_with_autoaugment(
                    image, boxes, masks, self._autoaugment_policy_name))

            # convert the image back to float32 and denormalize bboxes
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            boxes = box_utils.denormalize_boxes(boxes, rescaled_image_shape)

            # filters out empty bboxes
            boxes, classes, masks, attributes = self._remove_empty_boxes(
                boxes, classes, masks, attributes)

        if self._include_mask:
            if self._use_autoaugment and self._apply_autoaugment_after_resizing:
                # don't rescale boxes as masks were already resized
                cropped_boxes = box_utils.normalize_boxes(
                    boxes, rescaled_image_shape)
            else:
                # transfer boxes to the original image space
                cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0),
                                                [1, 2])
                cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0),
                                         [1, 2])
                cropped_boxes = box_utils.normalize_boxes(
                    cropped_boxes, orig_image_shape)

            num_masks = tf.shape(input=masks)[0]
            masks = tf.image.crop_and_resize(
                tf.expand_dims(masks, axis=-1),
                cropped_boxes,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method="bilinear",
            )
            masks = tf.squeeze(masks, axis=-1)

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # pad the image
        padded_size = input_utils.compute_padded_size(self._output_size,
                                                      2**self._max_level)
        image = tf.image.pad_to_bounding_box(image, 0, 0, padded_size[0],
                                             padded_size[1])

        image_height, image_width, _ = image.get_shape().as_list()

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(
            self._min_level,
            self._max_level,
            self._num_scales,
            self._aspect_ratios,
            self._anchor_size,
            (image_height, image_width),
        )
        anchor_labeler = anchor.RpnAnchorLabeler(
            input_anchor,
            self._rpn_match_threshold,
            self._rpn_unmatched_threshold,
            self._rpn_batch_size_per_im,
            self._rpn_fg_fraction,
        )
        rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
            boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            "anchor_boxes": input_anchor.multilevel_boxes,
            "image_info": image_info,
            "rpn_score_targets": rpn_score_targets,
            "rpn_box_targets": rpn_box_targets,
        }
        labels["gt_boxes"] = input_utils.pad_to_fixed_size(
            boxes, self._max_num_instances, -1)
        labels["gt_classes"] = input_utils.pad_to_fixed_size(
            classes, self._max_num_instances, -1)

        if self._include_mask:
            labels["gt_masks"] = input_utils.pad_to_fixed_size(
                masks, self._max_num_instances, -1)

        if self._num_attributes:
            labels["gt_attributes"] = input_utils.pad_to_fixed_size(
                attributes, self._max_num_instances, -1)

        return image, labels
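The _remove_empty_boxes helper called above is not shown in this snippet. A sketch consistent with the gather pattern used by the other examples on this page (an assumption, not the repo implementation; box_utils is the same module used throughout these examples):

def remove_empty_boxes(boxes, classes, masks=None, attributes=None):
    # Assumed behavior, mirroring box_utils.get_non_empty_box_indices usage
    # in the other examples here.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if masks is not None:
        masks = tf.gather(masks, indices)
    if attributes is not None:
        attributes = tf.gather(attributes, indices)
    return boxes, classes, masks, attributes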
Example #8
    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

        # Gets original image and its size.
        image = data['image']

        # NOTE: The autoaugment method works best when used alongside the standard
        # horizontal flipping of images along with size jittering and normalization.
        if self._use_autoaugment:
            try:
                from utils import autoaugment_utils  # pylint: disable=g-import-not-at-top
            except ImportError as e:
                logging.exception('Autoaugment is not supported in TF 2.x.')
                raise e

            image, boxes = autoaugment_utils.distort_image_with_autoaugment(
                image, boxes, self._autoaugment_policy_name)

        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  (image_height, image_width),
                                                  offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }
        return image, labels
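Padding to a multiple of 2**max_level ensures every FPN level divides the input evenly, since level l downsamples by 2**l. A minimal reimplementation of the padding rule (an assumption about what input_utils.compute_padded_size does, matching how it is called above):

import math

def compute_padded_size(desired_size, stride):
    return [int(math.ceil(d / stride) * stride) for d in desired_size]

compute_padded_size([800, 1333], 2**7)  # -> [896, 1408]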
Example #9
    def _parse_predict_data(self, data):
        """Parses data for prediction.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        images: image tensor that is preprocessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for prediction. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
            [y_scale, x_scale], [y_offset, x_offset]].
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, 4] representing anchor boxes at each
            level.
    """
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Compute Anchor boxes.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        labels = {
            'source_id': dataloader_utils.process_source_id(data['source_id']),
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
        }

        if self._mode == ModeKeys.PREDICT_WITH_GT:
            # Converts boxes from normalized coordinates to pixel coordinates.
            boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                                image_shape)
            groundtruths = {
                'source_id': data['source_id'],
                'height': data['height'],
                'width': data['width'],
                'num_detections': tf.shape(data['groundtruth_classes']),
                'boxes': boxes,
                'classes': data['groundtruth_classes'],
                'areas': data['groundtruth_area'],
                'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
            }
            groundtruths['source_id'] = dataloader_utils.process_source_id(
                groundtruths['source_id'])
            groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
                groundtruths, self._max_num_instances)
            labels['groundtruths'] = groundtruths

        return {
            'images': image,
            'labels': labels,
        }
Example #10
    def _parse_train_data(self, data):
        """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
           in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
           image that is fed to the network. The tensor is padded with -1 to
           the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
    """
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        if self._include_mask:
            masks = data['groundtruth_instance_masks']

        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            if self._include_mask:
                masks = tf.gather(masks, indices)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, boxes, masks = input_utils.random_horizontal_flip(
                    image, boxes, masks)
            else:
                image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        if self._include_mask:
            masks = tf.gather(masks, indices)
            # Transfer boxes to the original image space and do normalization.
            cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0),
                                            [1, 2])
            cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0),
                                     [1, 2])
            cropped_boxes = box_utils.normalize_boxes(cropped_boxes,
                                                      image_shape)
            num_masks = tf.shape(masks)[0]
            masks = tf.image.crop_and_resize(
                tf.expand_dims(masks, axis=-1),
                cropped_boxes,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method='bilinear')
            masks = tf.squeeze(masks, axis=-1)

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.RpnAnchorLabeler(input_anchor,
                                                 self._rpn_match_threshold,
                                                 self._rpn_unmatched_threshold,
                                                 self._rpn_batch_size_per_im,
                                                 self._rpn_fg_fraction)
        rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
            boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
            'rpn_score_targets': rpn_score_targets,
            'rpn_box_targets': rpn_box_targets,
        }
        labels['gt_boxes'] = input_utils.clip_or_pad_to_fixed_size(
            boxes, self._max_num_instances, -1)
        labels['gt_classes'] = input_utils.clip_or_pad_to_fixed_size(
            classes, self._max_num_instances, -1)
        if self._include_mask:
            labels['gt_masks'] = input_utils.clip_or_pad_to_fixed_size(
                masks, self._max_num_instances, -1)

        return image, labels
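The cropped_boxes arithmetic above undoes the resize-and-crop: adding the tiled offset and dividing by the tiled scale maps a box from scaled-image pixels back to original-image pixels. A small numeric check with illustrative values:

import tensorflow as tf

offset = tf.constant([12.0, 20.0])       # [y_offset, x_offset]
image_scale = tf.constant([0.5, 0.5])    # [y_scale, x_scale]
scaled_box = tf.constant([[100.0, 150.0, 200.0, 250.0]])
orig_box = ((scaled_box + tf.tile(tf.expand_dims(offset, 0), [1, 2]))
            / tf.tile(tf.expand_dims(image_scale, 0), [1, 2]))
# orig_box -> [[224., 340., 424., 540.]]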
Example #11
    def _build_outputs(self, images, labels, mode):
        is_training = (mode == mode_keys.TRAIN)

        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._params.architecture.min_level,
                self._params.architecture.max_level,
                self._params.anchor.num_scales,
                self._params.anchor.aspect_ratios,
                self._params.anchor.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

            batch_size = tf.shape(images)[0]
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(images, is_training=is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training=is_training)
        cls_outputs, box_outputs = self._retinanet_head_fn(
            fpn_features, is_training=is_training)
        # Shapemask mask prediction.
        if is_training:
            boxes = labels['mask_boxes']
            outer_boxes = labels['mask_outer_boxes']
            classes = labels['mask_classes']
        else:
            detection_results = self._generate_detections_fn(
                box_outputs, cls_outputs, anchor_boxes,
                labels['image_info'][:, 1:2, :])
            boxes = detection_results['detection_boxes']
            scores = detection_results['detection_scores']
            classes = detection_results['detection_classes']
            valid_detections = detection_results['num_detections']

            # Use list as input to avoid segmentation fault on TPU.
            image_size = images.get_shape().as_list()[1:3]
            outer_boxes = box_utils.compute_outer_boxes(
                tf.reshape(boxes, [-1, 4]),
                image_size,
                scale=self._outer_box_scale)
            outer_boxes = tf.reshape(outer_boxes, tf.shape(boxes))
            classes = tf.cast(classes, tf.int32)

        instance_features, prior_masks = self._shape_prior_head_fn(
            fpn_features, boxes, outer_boxes, classes, is_training)
        coarse_mask_logits = self._coarse_mask_fn(instance_features,
                                                  prior_masks, classes,
                                                  is_training)
        fine_mask_logits = self._fine_mask_fn(instance_features,
                                              coarse_mask_logits, classes,
                                              is_training)
        model_outputs = {
            'cls_outputs': cls_outputs,
            'box_outputs': box_outputs,
            'fine_mask_logits': fine_mask_logits,
            'coarse_mask_logits': coarse_mask_logits,
            'prior_masks': prior_masks,
            'fpn_features': fpn_features,
        }

        if not is_training:
            model_outputs.update({
                'num_detections': valid_detections,
                'detection_boxes': boxes,
                'detection_outer_boxes': outer_boxes,
                'detection_masks': tf.sigmoid(fine_mask_logits),
                'detection_classes': tf.cast(classes, dtype=tf.int32),
                'detection_scores': scores,
            })
        return model_outputs
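box_utils.compute_outer_boxes is used here but not shown. A plausible implementation consistent with its call sites on this page, offered as an assumption rather than the repo code (box_utils.clip_boxes is the same helper used in Example #15):

import tensorflow as tf

def compute_outer_boxes(boxes, image_size, scale=1.25):
    # Assumed semantics: enlarge each box about its center by `scale`,
    # then clip to the image bounds.
    y1, x1, y2, x2 = tf.unstack(boxes, axis=-1)
    cy, cx = (y1 + y2) / 2.0, (x1 + x2) / 2.0
    h, w = (y2 - y1) * scale, (x2 - x1) * scale
    outer = tf.stack(
        [cy - h / 2.0, cx - w / 2.0, cy + h / 2.0, cx + w / 2.0], axis=-1)
    return box_utils.clip_boxes(outer, image_size)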
Example #12
    def _parse_train_data(self, data):
        """Parse data for ShapeMask training."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        masks = data['groundtruth_instance_masks']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            masks = tf.gather(masks, indices)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # If not using category, makes all categories with id = 0.
        if not self._use_category:
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            self._output_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_scale = image_info[2, :]
        offset = image_info[3, :]

        # Resizes and crops boxes and masks.
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  self._output_size, offset)
        masks = input_utils.resize_and_crop_masks(
            tf.expand_dims(masks, axis=-1), image_scale, self._output_size,
            offset)
        masks = tf.squeeze(masks, axis=-1)

        # Filters out ground truth boxes that are all zeros.
        indices = input_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        masks = tf.gather(masks, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, self._output_size)
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # Sample groundtruth masks/boxes/classes for mask branch.
        num_masks = tf.shape(masks)[0]
        mask_shape = tf.shape(masks)[1:3]

        # Pad sampled boxes/masks/classes to a constant batch size.
        padded_boxes = input_utils.pad_to_fixed_size(boxes,
                                                     self._num_sampled_masks)
        padded_classes = input_utils.pad_to_fixed_size(classes,
                                                       self._num_sampled_masks)
        padded_masks = input_utils.pad_to_fixed_size(masks,
                                                     self._num_sampled_masks)

        # Randomly sample groundtruth masks for mask branch training. For the image
        # without groundtruth masks, it will sample the dummy padded tensors.
        rand_indices = tf.random.uniform([self._num_sampled_masks],
                                         minval=0,
                                         maxval=tf.maximum(num_masks, 1),
                                         dtype=tf.dtypes.int32)
        sampled_boxes = tf.gather(padded_boxes, rand_indices)
        sampled_classes = tf.gather(padded_classes, rand_indices)
        sampled_masks = tf.gather(padded_masks, rand_indices)

        # Compute mask targets in feature crop. A feature crop fully contains a
        # sampled box.
        mask_outer_boxes = box_utils.compute_outer_boxes(
            sampled_boxes, mask_shape, scale=self._outer_box_scale)
        norm_mask_outer_boxes = box_utils.normalize_boxes(
            mask_outer_boxes, mask_shape)

        # Set sampled_masks shape to [batch_size, height, width, 1].
        sampled_masks = tf.expand_dims(sampled_masks, axis=-1)
        mask_targets = tf.image.crop_and_resize(
            sampled_masks,
            norm_mask_outer_boxes,
            box_ind=tf.range(self._num_sampled_masks),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear',
            extrapolation_value=0,
            name='train_mask_targets')
        mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                                tf.ones_like(mask_targets),
                                tf.zeros_like(mask_targets))
        mask_targets = tf.squeeze(mask_targets, axis=-1)

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
            # For ShapeMask.
            'mask_boxes': sampled_boxes,
            'mask_outer_boxes': mask_outer_boxes,
            'mask_targets': mask_targets,
            'mask_classes': sampled_classes,
            'mask_is_valid': tf.cast(tf.not_equal(num_masks, 0), tf.int32)
        }
        return image, labels
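Two different mask-sampling strategies appear on this page: this example samples indices uniformly with replacement, while Example #15 below shuffles and takes a modulo so indices repeat only when there are fewer masks than samples. A side-by-side sketch with concrete, illustrative sizes:

import tensorflow as tf

num_masks = 3    # groundtruth masks present in this image
num_sampled = 8  # plays the role of self._num_sampled_masks
# Sampling with replacement, as in this example:
idx_a = tf.random.uniform(
    [num_sampled], minval=0, maxval=max(num_masks, 1), dtype=tf.int32)
# Shuffle-then-modulo, as in Example #15:
idx_b = tf.random.shuffle(tf.range(max(num_masks, num_sampled)))
idx_b = tf.math.mod(idx_b, max(num_masks, 1))[:num_sampled]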
Example #13
  def build_outputs(self, inputs, mode):
    is_training = mode == mode_keys.TRAIN
    model_outputs = {}

    image = inputs['image']
    _, image_height, image_width, _ = image.get_shape().as_list()
    backbone_features = self._backbone_fn(image, is_training)
    fpn_features = self._fpn_fn(backbone_features, is_training)

    rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
        fpn_features, is_training)
    model_outputs.update({
        'rpn_score_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  rpn_score_outputs),
        'rpn_box_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  rpn_box_outputs),
    })
    input_anchor = anchor.Anchor(self._params.anchor.min_level,
                                 self._params.anchor.max_level,
                                 self._params.anchor.num_scales,
                                 self._params.anchor.aspect_ratios,
                                 self._params.anchor.anchor_size,
                                 (image_height, image_width))
    rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
                                         input_anchor.multilevel_boxes,
                                         inputs['image_info'][:, 1, :],
                                         is_training)
    if is_training:
      rpn_rois = tf.stop_gradient(rpn_rois)

      # Sample proposals.
      rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
                               inputs['gt_classes']))

      # Create bounding box training targets.
      box_targets = box_utils.encode_boxes(
          matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
      # If the target is background, the box target is set to all 0s.
      box_targets = tf.where(
          tf.tile(
              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
              [1, 1, 4]),
          tf.zeros_like(box_targets),
          box_targets)
      model_outputs.update({
          'class_targets': matched_gt_classes,
          'box_targets': box_targets,
      })

    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_features, rpn_rois, output_size=7)

    class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)

    model_outputs.update({
        'class_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  class_outputs),
        'box_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  box_outputs),
    })

    # Add this output to train to make the checkpoint loadable in predict mode.
    # If we skip it in train mode, the heads will be out-of-order and checkpoint
    # loading will fail.
    boxes, scores, classes, valid_detections = self._generate_detections_fn(
        box_outputs, class_outputs, rpn_rois, inputs['image_info'][:, 1:2, :])
    model_outputs.update({
        'num_detections': valid_detections,
        'detection_boxes': boxes,
        'detection_classes': classes,
        'detection_scores': scores,
    })

    if not self._include_mask:
      return model_outputs

    if is_training:
      rpn_rois, classes, mask_targets = self._sample_masks_fn(
          rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
          inputs['gt_masks'])
      mask_targets = tf.stop_gradient(mask_targets)

      classes = tf.cast(classes, dtype=tf.int32)

      model_outputs.update({
          'mask_targets': mask_targets,
          'sampled_class_targets': classes,
      })
    else:
      rpn_rois = boxes
      classes = tf.cast(classes, dtype=tf.int32)

    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_features, rpn_rois, output_size=14)

    mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

    if is_training:
      model_outputs.update({
          'mask_outputs':
              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                    mask_outputs),
      })
    else:
      model_outputs.update({
          'detection_masks': tf.nn.sigmoid(mask_outputs)
      })

    return model_outputs
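The weights=[10.0, 10.0, 5.0, 5.0] passed to encode_boxes are the standard Faster R-CNN box-coder scale factors. A self-contained sketch of that encoding, assumed to match what box_utils.encode_boxes computes:

import tensorflow as tf

def encode_box(gt, anc, weights=(10.0, 10.0, 5.0, 5.0)):
    # Standard Faster R-CNN target encoding of a groundtruth box against an
    # anchor/proposal, both in [y1, x1, y2, x2] order.
    gy, gx = (gt[0] + gt[2]) / 2.0, (gt[1] + gt[3]) / 2.0
    gh, gw = gt[2] - gt[0], gt[3] - gt[1]
    ay, ax = (anc[0] + anc[2]) / 2.0, (anc[1] + anc[3]) / 2.0
    ah, aw = anc[2] - anc[0], anc[3] - anc[1]
    return tf.stack([weights[0] * (gy - ay) / ah,
                     weights[1] * (gx - ax) / aw,
                     weights[2] * tf.math.log(gh / ah),
                     weights[3] * tf.math.log(gw / aw)])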
Example #14
    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

        # Gets original image and its size.
        image = data['image']

        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  (image_height, image_width),
                                                  offset)
        # Filters out ground truth boxes that are all zeros.
        indices = input_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }
        return image, labels
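The num_positives entry packed into labels is typically consumed by the loss to normalize per-anchor losses by the number of positive anchors. A hedged sketch of that usual RetinaNet normalization (the actual loss code is outside this snippet):

import tensorflow as tf

def normalize_losses(total_cls_loss, total_box_loss, num_positives):
    normalizer = tf.reduce_sum(num_positives) + 1.0  # +1 avoids divide-by-zero
    return total_cls_loss / normalizer, total_box_loss / normalizer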
Example #15
    def parse_train_data(self, data):
        """Parse data for ShapeMask training."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        masks = data['groundtruth_instance_masks']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            masks = tf.gather(masks, indices)

        # If not using category, makes all categories with id = 0.
        if not self._use_category:
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        image = self.get_normalized_image(data)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)

        # Converts boxes from normalized coordinates to pixel coordinates.
        image_shape = tf.shape(image)[0:2]
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            self._output_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        self._train_image_scale = image_info[2, :]
        self._train_offset = image_info[3, :]

        # Resizes and crops boxes and masks.
        boxes = input_utils.resize_and_crop_boxes(boxes,
                                                  self._train_image_scale,
                                                  image_info[1, :],
                                                  self._train_offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        masks = tf.gather(masks, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, self._output_size)
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # Sample groundtruth masks/boxes/classes for mask branch.
        num_masks = tf.shape(masks)[0]
        mask_shape = tf.shape(masks)[1:3]

        # Pad sampled boxes/masks/classes to a constant batch size.
        padded_boxes = input_utils.pad_to_fixed_size(boxes,
                                                     self._num_sampled_masks)
        padded_classes = input_utils.pad_to_fixed_size(classes,
                                                       self._num_sampled_masks)
        padded_masks = input_utils.pad_to_fixed_size(masks,
                                                     self._num_sampled_masks)

        # Randomly sample groundtruth masks for mask branch training. For images
        # without groundtruth masks, the dummy padded tensors are sampled instead.
        rand_indices = tf.random.shuffle(
            tf.range(tf.maximum(num_masks, self._num_sampled_masks)))
        rand_indices = tf.mod(rand_indices, tf.maximum(num_masks, 1))
        rand_indices = rand_indices[0:self._num_sampled_masks]
        rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks])

        sampled_boxes = tf.gather(padded_boxes, rand_indices)
        sampled_classes = tf.gather(padded_classes, rand_indices)
        sampled_masks = tf.gather(padded_masks, rand_indices)
        # Jitter the sampled boxes to mimic the noisy detections.
        sampled_boxes = box_utils.jitter_boxes(
            sampled_boxes, noise_scale=self._box_jitter_scale)
        sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size)
        # Compute mask targets in feature crop. A feature crop fully contains a
        # sampled box.
        mask_outer_boxes = box_utils.compute_outer_boxes(
            sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
        mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes,
                                                self._output_size)
        # Undo the resize offset and scale to map mask_outer_boxes back to the
        # original image coordinates.
        mask_outer_boxes_ori = mask_outer_boxes
        mask_outer_boxes_ori += tf.tile(
            tf.expand_dims(self._train_offset, axis=0), [1, 2])
        mask_outer_boxes_ori /= tf.tile(
            tf.expand_dims(self._train_image_scale, axis=0), [1, 2])
        norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
            mask_outer_boxes_ori, mask_shape)

        # Set sampled_masks shape to [num_sampled_masks, height, width, 1].
        sampled_masks = tf.cast(tf.expand_dims(sampled_masks, axis=-1),
                                tf.float32)
        mask_targets = tf.image.crop_and_resize(
            sampled_masks,
            norm_mask_outer_boxes_ori,
            box_ind=tf.range(self._num_sampled_masks),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear',
            extrapolation_value=0,
            name='train_mask_targets')
        mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                                tf.ones_like(mask_targets),
                                tf.zeros_like(mask_targets))
        mask_targets = tf.squeeze(mask_targets, axis=-1)
        if self._up_sample_factor > 1:
            fine_mask_targets = tf.image.crop_and_resize(
                sampled_masks,
                norm_mask_outer_boxes_ori,
                box_ind=tf.range(self._num_sampled_masks),
                crop_size=[
                    self._mask_crop_size * self._up_sample_factor,
                    self._mask_crop_size * self._up_sample_factor
                ],
                method='bilinear',
                extrapolation_value=0,
                name='train_mask_targets')
            fine_mask_targets = tf.where(
                tf.greater_equal(fine_mask_targets, 0.5),
                tf.ones_like(fine_mask_targets),
                tf.zeros_like(fine_mask_targets))
            fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
        else:
            fine_mask_targets = mask_targets

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
        if self._mask_train_class == 'all':
            mask_is_valid = valid_image * tf.ones_like(sampled_classes,
                                                       tf.int32)
        else:
            # Get the intersection of sampled classes with training splits.
            mask_valid_classes = tf.cast(
                tf.expand_dims(
                    class_utils.coco_split_class_ids(self._mask_train_class),
                    1), sampled_classes.dtype)
            match = tf.reduce_any(
                tf.equal(tf.expand_dims(sampled_classes, 0),
                         mask_valid_classes), 0)
            mask_is_valid = valid_image * tf.cast(match, tf.int32)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
            # For ShapeMask.
            'mask_boxes': sampled_boxes,
            'mask_outer_boxes': mask_outer_boxes,
            'mask_targets': mask_targets,
            'fine_mask_targets': fine_mask_targets,
            'mask_classes': sampled_classes,
            'mask_is_valid': mask_is_valid,
        }
        return image, labels
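
The shuffle-then-modulo trick used above to pick `num_sampled_masks` mask indices is easy to miss. Here is a small standalone sketch of the same logic (function and argument names are ours, not the library's):

import numpy as np

def sample_mask_indices(num_masks, num_sampled_masks, rng=np.random):
    # Shuffle a range at least num_sampled_masks long, then wrap with a modulo
    # so images with fewer masks than needed re-sample existing ones; the
    # max(num_masks, 1) keeps the modulo well defined when there are no masks,
    # in which case only the dummy padded entry at index 0 is selected.
    indices = rng.permutation(max(num_masks, num_sampled_masks))
    indices = indices % max(num_masks, 1)
    return indices[:num_sampled_masks]

print(sample_mask_indices(num_masks=3, num_sampled_masks=8))
# e.g. [2 0 1 0 2 1 1 0] -- all values < 3
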
    def parse_predict_data(self, data):
        """Parse data for ShapeMask training."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        masks = data['groundtruth_instance_masks']

        # If not using category, collapses all classes into a single
        # foreground class (id = 1).
        if not self._use_category:
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        image = self.get_normalized_image(data)

        # Converts boxes from normalized coordinates to pixel coordinates.
        image_shape = tf.shape(image)[0:2]
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            self._output_size,
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_scale = image_info[2, :]
        offset = image_info[3, :]

        # Resizes and crops boxes and masks.
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)
        masks = input_utils.resize_and_crop_masks(
            tf.expand_dims(masks, axis=-1), image_scale, self._output_size,
            offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, self._output_size)
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
        }
        if self._mode == ModeKeys.PREDICT_WITH_GT:
            # Converts boxes from normalized coordinates to pixel coordinates.
            groundtruths = {
                'source_id':
                data['source_id'],
                'height':
                data['height'],
                'width':
                data['width'],
                'num_detections':
                tf.shape(data['groundtruth_classes']),
                'boxes':
                box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                            image_shape),
                'classes':
                data['groundtruth_classes'],
                # 'masks': tf.squeeze(masks, axis=-1),
                'areas':
                data['groundtruth_area'],
                'is_crowds':
                tf.cast(data['groundtruth_is_crowd'], tf.int32),
            }
            groundtruths['source_id'] = dataloader_utils.process_source_id(
                groundtruths['source_id'])
            groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
                groundtruths, self._max_num_instances)
            # Computes training labels.
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(
                 boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
            # Packs labels for model_fn outputs.
            labels.update({
                'cls_targets': cls_targets,
                'box_targets': box_targets,
                'num_positives': num_positives,
                'groundtruths': groundtruths,
            })
        return {
            'images': image,
            'labels': labels,
        }
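
For completeness, a hedged sketch of how a parse function like `parse_predict_data` is typically wired into a `tf.data` pipeline; `file_pattern`, `decoder`, and `parser` here are placeholders, not names from this codebase:

import tensorflow as tf

def build_dataset(file_pattern, decoder, parser, batch_size):
    # list_files -> TFRecordDataset -> decode -> parse -> batch: the usual
    # input-pipeline shape for these detection parsers.
    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=8)
    dataset = dataset.map(
        lambda serialized: parser(decoder.decode(serialized)),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # drop_remainder keeps batch dimensions static, which TPU input fns need.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)
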
Beispiel #17
0
    def _build_outputs(self, images, labels, mode):
        is_training = mode == mode_keys.TRAIN
        model_outputs = {}

        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._params.architecture.min_level,
                self._params.architecture.max_level,
                self._params.anchor.num_scales,
                self._params.anchor.aspect_ratios,
                self._params.anchor.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

            batch_size = tf.shape(images)[0]
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(images, is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training)

        rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
            fpn_features, is_training)
        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
        })
        rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs,
                                             rpn_score_outputs, anchor_boxes,
                                             labels['image_info'][:, 1, :],
                                             is_training)

        if is_training:
            rpn_rois = tf.stop_gradient(rpn_rois)

            # Sample proposals.
            rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
                self._sample_rois_fn(rpn_rois, labels['gt_boxes'],
                                     labels['gt_classes']))

            self.add_scalar_summary(
                'fg_bg_ratio_{}'.format(0),
                tf.reduce_sum(
                    tf.cast(tf.greater(matched_gt_classes, 0), tf.float32)) /
                tf.reduce_sum(
                    tf.cast(tf.greater_equal(matched_gt_classes, 0),
                            tf.float32)))

            # Create bounding box training targets.
            box_targets = box_utils.encode_boxes(
                matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
            # If the target is background, the box target is set to all 0s.
            box_targets = tf.where(
                tf.tile(
                    tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
                    [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
            model_outputs.update({
                'class_targets': matched_gt_classes,
                'box_targets': box_targets,
            })

        _, num_rois_before_cat, _ = rpn_rois.get_shape().as_list()

        if is_training and self._feat_distill:
            tf.logging.info(f'rois before concat distill boxes: {rpn_rois}')
            rpn_rois = tf.concat([rpn_rois, labels['roi_boxes']], axis=1)
            # [batch_size, num_rois+max_distill_rois, 4]
            tf.logging.info(f'rois after concat distill boxes: {rpn_rois}')

        roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=7)

        if is_training and self._feat_distill:
            tf.logging.info(f'rois before split: {rpn_rois}')
            rpn_rois, _ = tf.split(
                rpn_rois, [num_rois_before_cat, self._max_distill_rois],
                axis=1)
            tf.logging.info(f'rois after split: {rpn_rois}')

        (class_outputs, box_outputs, distill_feat_outputs,
         distill_class_outputs) = self._frcnn_head_fn(roi_features,
                                                      is_training)
        model_outputs.update({
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
        })
        if is_training and self._feat_distill:
            model_outputs.update(
                {'distill_feat_outputs': distill_feat_outputs})

        if not is_training:
            detection_results = self._generate_detections_fn(
                box_outputs,
                class_outputs,
                rpn_rois,
                labels['image_info'][:, 1:2, :],
                bbox_per_class=(
                    not self._params.frcnn_head.class_agnostic_bbox_pred),
                distill_class_outputs=distill_class_outputs,
            )
            model_outputs.update(detection_results)

        if not self._include_mask:
            return model_outputs

        if is_training:
            rpn_rois, classes, mask_targets = self._sample_masks_fn(
                rpn_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, labels['gt_masks'])
            mask_targets = tf.stop_gradient(mask_targets)

            classes = tf.cast(classes, dtype=tf.int32)

            model_outputs.update({
                'mask_targets': mask_targets,
                'sampled_class_targets': classes,
            })
        else:
            rpn_rois = detection_results['detection_boxes']
            classes = tf.cast(detection_results['detection_classes'],
                              dtype=tf.int32)

        mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=14)

        mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes,
                                           is_training)

        if is_training:
            model_outputs.update({
                'mask_outputs': mask_outputs,
            })
        else:
            model_outputs.update(
                {'detection_masks': tf.nn.sigmoid(mask_outputs)})

        return model_outputs
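
The `_feat_distill` branch above does some easily-missed bookkeeping: distillation boxes are concatenated onto the proposals before feature cropping (so both share one crop op) and split off again before the detection heads. A toy sketch of just that concat/split, with made-up shapes:

import tensorflow as tf

batch_size, num_rois, max_distill_rois = 2, 5, 3
rpn_rois = tf.zeros([batch_size, num_rois, 4])
distill_rois = tf.ones([batch_size, max_distill_rois, 4])

all_rois = tf.concat([rpn_rois, distill_rois], axis=1)     # [2, 8, 4]
# ... multilevel_crop_and_resize(fpn_features, all_rois) would go here ...
rois_only, _ = tf.split(all_rois, [num_rois, max_distill_rois], axis=1)
print(rois_only.shape)  # (2, 5, 4) -- heads see only the original proposals
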
Beispiel #18
0
    def build_outputs(self, features, labels, mode):
        is_training = mode == mode_keys.TRAIN
        model_outputs = {}

        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._anchor_params.min_level, self._anchor_params.max_level,
                self._anchor_params.num_scales,
                self._anchor_params.aspect_ratios,
                self._anchor_params.anchor_size,
                features.get_shape().as_list()[1:3]).multilevel_boxes

        backbone_features = self._backbone_fn(features, is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training)

        rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
            fpn_features, is_training)
        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
        })
        rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs,
                                             rpn_score_outputs, anchor_boxes,
                                             labels['image_info'][:, 1, :],
                                             is_training)

        if is_training:
            rpn_rois = tf.stop_gradient(rpn_rois)

            # Sample proposals.
            rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
                self._sample_rois_fn(rpn_rois, labels['gt_boxes'],
                                     labels['gt_classes']))

            # Create bounding box training targets.
            box_targets = box_utils.encode_boxes(
                matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
            # If the target is background, the box target is set to all 0s.
            box_targets = tf.where(
                tf.tile(
                    tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
                    [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
            model_outputs.update({
                'class_targets': matched_gt_classes,
                'box_targets': box_targets,
            })

        roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=7)

        class_outputs, box_outputs = self._frcnn_head_fn(
            roi_features, is_training)
        model_outputs.update({
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
        })

        if not is_training:
            detection_results = self._generate_detections_fn(
                box_outputs, class_outputs, rpn_rois,
                labels['image_info'][:, 1:2, :])
            model_outputs.update(detection_results)

        if not self._include_mask:
            self._log_model_statistics(features)
            return model_outputs

        if is_training:
            rpn_rois, classes, mask_targets = self._sample_masks_fn(
                rpn_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, labels['gt_masks'])
            mask_targets = tf.stop_gradient(mask_targets)

            classes = tf.cast(classes, dtype=tf.int32)

            model_outputs.update({
                'mask_targets': mask_targets,
                'sampled_class_targets': classes,
            })
        else:
            rpn_rois = detection_results['detection_boxes']
            classes = tf.cast(detection_results['detection_classes'],
                              dtype=tf.int32)

        mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=14)

        mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes,
                                           is_training)

        if is_training:
            model_outputs.update({
                'mask_outputs': mask_outputs,
            })
        else:
            model_outputs.update(
                {'detection_masks': tf.nn.sigmoid(mask_outputs)})

        self._log_model_statistics(features)
        return model_outputs
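
Several of the `_build_outputs` variants in this file tile unbatched anchors across the batch dimension before generating ROIs. A one-line sketch of that broadcast, since it is the only shape manipulation the anchor path needs at inference time:

import tensorflow as tf

anchors_l = tf.constant([[0., 0., 8., 8.], [0., 0., 16., 16.]])  # [2, 4]
batch_size = 3
# [num_anchors_l, 4] -> [batch_size, num_anchors_l, 4]
tiled = tf.tile(tf.expand_dims(anchors_l, 0), [batch_size, 1, 1])
print(tiled.shape)  # (3, 2, 4)
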
    def _build_outputs(self, images, labels, mode):
        is_training = mode == mode_keys.TRAIN
        model_outputs = {}

        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._params.architecture.min_level,
                self._params.architecture.max_level,
                self._params.anchor.num_scales,
                self._params.anchor.aspect_ratios,
                self._params.anchor.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

            batch_size = tf.shape(images)[0]
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(images, is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training)

        rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
            fpn_features, is_training)
        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
        })
        # Run the RPN layer to get bbox coordinates for first frcnn layer.
        current_rois, _ = self._generate_rois_fn(rpn_box_outputs,
                                                 rpn_score_outputs,
                                                 anchor_boxes,
                                                 labels['image_info'][:, 1, :],
                                                 is_training)

        cascade_ious = [-1]
        if self._cascade_iou_thresholds is not None:
            cascade_ious = cascade_ious + self._cascade_iou_thresholds
        next_rois = current_rois
        # Stores the class predictions for each RCNN head.
        all_class_outputs = []
        for cascade_num, iou_threshold in enumerate(cascade_ious):
            # In cascade R-CNN the later stages use different regression
            # weights, since the predicted deltas become progressively smaller.
            regression_weights = self._cascade_layer_to_weights[cascade_num]
            current_rois = next_rois
            (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
             matched_gt_classes, matched_gt_indices,
             current_rois) = self._run_frcnn_head(fpn_features, current_rois,
                                                  labels, is_training,
                                                  model_outputs, cascade_num,
                                                  iou_threshold,
                                                  regression_weights)
            all_class_outputs.append(class_outputs)

            # Generate the next rois if we are running another cascade.
            # Since bboxes are predicted for every class
            # (if `class_agnostic_bbox_pred` is false) this takes the best class
            # bbox and converts it to the correct format to be used for roi
            # operations.
            if is_training:
                correct_class = matched_gt_classes
            else:
                correct_class = tf.argmax(class_outputs, axis=-1)

            next_rois = self._box_outputs_to_rois(
                box_outputs, current_rois, correct_class,
                labels['image_info'][:, 1:2, :], regression_weights)

        if not is_training:
            tf.logging.info('(self._class_agnostic_bbox_pred): {}'.format(
                self._class_agnostic_bbox_pred))
            if self._cascade_class_ensemble:
                class_outputs = tf.add_n(all_class_outputs) / len(
                    all_class_outputs)
            # Post-processing/NMS is done here for the final boxes. Note that
            # NMS was already applied earlier when generating proposals from
            # the RPN head outputs. The background class is also removed here.
            detection_results = self._generate_detections_fn(
                box_outputs,
                class_outputs,
                current_rois,
                labels['image_info'][:, 1:2, :],
                regression_weights,
                bbox_per_class=(not self._class_agnostic_bbox_pred))
            model_outputs.update(detection_results)

        if not self._include_mask:
            return model_outputs

        if is_training:
            current_rois, classes, mask_targets = self._sample_masks_fn(
                current_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, labels['gt_masks'])
            mask_targets = tf.stop_gradient(mask_targets)

            classes = tf.cast(classes, dtype=tf.int32)

            model_outputs.update({
                'mask_targets': mask_targets,
                'sampled_class_targets': classes,
            })
        else:
            current_rois = detection_results['detection_boxes']
            classes = tf.cast(detection_results['detection_classes'],
                              dtype=tf.int32)

        mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, current_rois, output_size=14)
        mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes,
                                           is_training)

        if is_training:
            model_outputs.update({
                'mask_outputs': mask_outputs,
            })
        else:
            model_outputs.update(
                {'detection_masks': tf.nn.sigmoid(mask_outputs)})

        return model_outputs
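
In the cascade loop above, `_box_outputs_to_rois` has to pick one box per ROI when boxes are predicted per class. A sketch of that selection with `tf.gather(..., batch_dims=2)` (TF 1.14+/TF2); the tensor layout is our assumption, not necessarily the library's:

import tensorflow as tf

batch_size, num_rois, num_classes = 2, 3, 4
class_outputs = tf.random.normal([batch_size, num_rois, num_classes])
box_outputs = tf.random.normal([batch_size, num_rois, num_classes * 4])

best_class = tf.argmax(class_outputs, axis=-1, output_type=tf.int32)  # [2, 3]
boxes_per_class = tf.reshape(box_outputs,
                             [batch_size, num_rois, num_classes, 4])
best_boxes = tf.gather(boxes_per_class, best_class, batch_dims=2)     # [2, 3, 4]
print(best_boxes.shape)
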
    def _parse_eval_data(self, data):
        """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        groundtruths:
          source_id: Groundtruth source id.
          height: Original image height.
          width: Original image width.
          boxes: Groundtruth bounding box annotations. The box is represented
             in [y1, x1, y2, x2] format. The coordinates are w.r.t. the scaled
             image that is fed to the network. The tensor is padded with -1 to
             the fixed dimension [self._max_num_instances, 4].
          classes: Groundtruth classes annotations. The tensor is padded
            with -1 to the fixed dimension [self._max_num_instances].
          areas: Box area or mask area depending on whether mask is present.
          is_crowds: Whether the ground truth label is a crowd label.
          num_groundtruths: Number of ground truths in the image.
    """
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # Assigns anchor targets.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Sets up groundtruth data for evaluation.
        groundtruths = {
            'source_id':
            data['source_id'],
            'height':
            data['height'],
            'width':
            data['width'],
            'num_groundtruths':
            tf.shape(data['groundtruth_classes']),
            'boxes':
            box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                        image_shape),
            'classes':
            data['groundtruth_classes'],
            'areas':
            data['groundtruth_area'],
            'is_crowds':
            tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        # TODO(b/143766089): Add ground truth masks for segmentation metrics.
        groundtruths['source_id'] = dataloader_utils.process_source_id(
            groundtruths['source_id'])
        groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
            groundtruths, self._max_num_instances)

        # Packs labels for model_fn outputs.
        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
            'groundtruths': groundtruths,
        }

        return image, labels
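
The padding contract in the docstring above (pad with -1 to `[max_num_instances, ...]`) is simple enough to sketch; the real `dataloader_utils.pad_groundtruths_to_fixed_size` may differ in details such as truncation behavior:

import tensorflow as tf

def pad_to_fixed_size_sketch(tensor, size, pad_value=-1):
    # Pads (or truncates) the leading dimension to `size` with `pad_value`.
    num = tf.shape(tensor)[0]
    pad_len = tf.maximum(size - num, 0)
    pad_shape = tf.concat([[pad_len], tf.shape(tensor)[1:]], axis=0)
    padding = pad_value * tf.ones(pad_shape, dtype=tensor.dtype)
    return tf.concat([tensor[:size], padding], axis=0)

boxes = tf.constant([[0., 0., 10., 10.]])
print(pad_to_fixed_size_sketch(boxes, 3))
# [[ 0.  0. 10. 10.] [-1. -1. -1. -1.] [-1. -1. -1. -1.]]
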
Beispiel #21
0
    def _parse_eval_data(self, data):
        """Parses data for training and evaluation."""
        groundtruths = {}
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)
        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Sets up groundtruth data for evaluation.
        groundtruths = {
            'source_id':
            data['source_id'],
            'num_groundtruths':
            tf.shape(data['groundtruth_classes']),
            'image_info':
            image_info,
            'boxes':
            box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                        image_shape),
            'classes':
            data['groundtruth_classes'],
            'areas':
            data['groundtruth_area'],
            'is_crowds':
            tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        groundtruths['source_id'] = process_source_id(
            groundtruths['source_id'])
        groundtruths = pad_groundtruths_to_fixed_size(groundtruths,
                                                      self._max_num_instances)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
            'groundtruths': groundtruths,
        }
        return image, labels
Beispiel #22
0
def build_predictions(features,
                      params,
                      output_image_info,
                      output_normalized_coordinates,
                      cast_num_detections_to_float,
                      cast_detection_classes_to_float=False):
    """Builds the model graph for serving.

  Args:
    features: features to be passed to the serving model graph
    params: hyperparameters to be passed to the serving model graph
    output_image_info: bool, whether to output the image_info node.
    output_normalized_coordinates: bool, whether box outputs are in
      normalized coordinates.
    cast_num_detections_to_float: bool, whether to cast the number of
      detections to float type.
    cast_detection_classes_to_float: bool, whether to cast the detection
      classes to float type.

  Returns:
    predictions: model outputs for serving.
    model_outputs: a dict of model output tensors.
  """
    images = features['images']
    batch_size, height, width, _ = images.get_shape().as_list()

    input_anchor = anchor.Anchor(params.architecture.min_level,
                                 params.architecture.max_level,
                                 params.anchor.num_scales,
                                 params.anchor.aspect_ratios,
                                 params.anchor.anchor_size, (height, width))

    multilevel_boxes = {}
    for k, v in six.iteritems(input_anchor.multilevel_boxes):
        multilevel_boxes[k] = tf.tile(tf.expand_dims(v, 0), [batch_size, 1, 1])

    model_fn = factory.model_generator(params)
    model_outputs = model_fn.build_outputs(features['images'],
                                           labels={
                                               'anchor_boxes':
                                               multilevel_boxes,
                                               'image_info':
                                               features['image_info'],
                                           },
                                           mode=mode_keys.PREDICT)

    # Return flattened raw outputs.
    if not params.postprocess.apply_nms:
        predictions = {
            'raw_boxes': tf.identity(model_outputs['raw_boxes'], 'RawBoxes'),
            'raw_scores': tf.identity(model_outputs['raw_scores'],
                                      'RawScores'),
        }
        return predictions, model_outputs

    if cast_num_detections_to_float:
        model_outputs['num_detections'] = tf.cast(
            model_outputs['num_detections'], dtype=tf.float32)

    if cast_detection_classes_to_float:
        model_outputs['detection_classes'] = tf.cast(
            model_outputs['detection_classes'], dtype=tf.float32)

    if output_image_info:
        model_outputs.update({
            'image_info': features['image_info'],
        })

    if output_normalized_coordinates:
        detection_boxes = (
            model_outputs['detection_boxes'] /
            tf.tile(features['image_info'][:, 2:3, :], [1, 1, 2]))
        model_outputs['detection_boxes'] = box_utils.normalize_boxes(
            detection_boxes, features['image_info'][:, 0:1, :])

    predictions = {
        'num_detections':
        tf.identity(model_outputs['num_detections'], 'NumDetections'),
        'detection_boxes':
        tf.identity(model_outputs['detection_boxes'], 'DetectionBoxes'),
        'detection_classes':
        tf.identity(model_outputs['detection_classes'], 'DetectionClasses'),
        'detection_scores':
        tf.identity(model_outputs['detection_scores'], 'DetectionScores'),
    }
    if 'detection_masks' in model_outputs:
        predictions.update({
            'detection_masks':
            tf.identity(model_outputs['detection_masks'], 'DetectionMasks'),
        })
        if 'detection_outer_boxes' in model_outputs:
            predictions.update({
                'detection_outer_boxes':
                tf.identity(model_outputs['detection_outer_boxes'],
                            'DetectionOuterBoxes'),
            })

    if output_image_info:
        predictions['image_info'] = tf.identity(model_outputs['image_info'],
                                                'ImageInfo')

    return predictions, model_outputs
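
The normalized-coordinates branch above first divides by the resize scale (`image_info[:, 2:3, :]`) and then normalizes by the original size (`image_info[:, 0:1, :]`, which is what `box_utils.normalize_boxes` divides by). A worked example with one box, using the image_info row layout assumed throughout this file:

import tensorflow as tf

# image_info rows: [original size], [scaled size], [y/x scale], [y/x offset].
image_info = tf.constant([[[800., 1000.], [640., 640.],
                           [0.8, 0.64], [0., 0.]]])        # [1, 4, 2]
boxes = tf.constant([[[80., 64., 160., 128.]]])            # scaled-image pixels

boxes_orig = boxes / tf.tile(image_info[:, 2:3, :], [1, 1, 2])   # undo scale
normalized = boxes_orig / tf.tile(image_info[:, 0:1, :], [1, 1, 2])
print(normalized)  # [[[0.125 0.1 0.25 0.2]]]
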
    def _build_outputs(self, images, labels, mode):
        is_training = mode == mode_keys.TRAIN
        model_outputs = {}

        if "anchor_boxes" in labels:
            anchor_boxes = labels["anchor_boxes"]
        else:
            anchor_boxes = anchor.Anchor(
                self._params.architecture.min_level,
                self._params.architecture.max_level,
                self._params.anchor.num_scales,
                self._params.anchor.aspect_ratios,
                self._params.anchor.anchor_size,
                images.get_shape().as_list()[1:3],
            ).multilevel_boxes

            batch_size = tf.shape(input=images)[0]
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(images, is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training)

        rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
            fpn_features, is_training)
        model_outputs.update({
            "rpn_score_outputs": rpn_score_outputs,
            "rpn_box_outputs": rpn_box_outputs,
        })
        rpn_rois, _ = self._generate_rois_fn(
            rpn_box_outputs,
            rpn_score_outputs,
            anchor_boxes,
            labels["image_info"][:, 1, :],
            is_training,
        )

        if is_training:
            rpn_rois = tf.stop_gradient(rpn_rois)

            # Sample proposals.
            (
                rpn_rois,
                matched_gt_boxes,
                matched_gt_classes,
                matched_gt_indices,
            ) = self._sample_rois_fn(rpn_rois, labels["gt_boxes"],
                                     labels["gt_classes"])

            # Create bounding box training targets.
            box_targets = box_utils.encode_boxes(
                matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
            # If the target is background, the box target is set to all 0s.
            box_targets = tf.compat.v1.where(
                tf.tile(
                    tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
                    [1, 1, 4]),
                tf.zeros_like(box_targets),
                box_targets,
            )
            model_outputs.update({
                "class_targets": matched_gt_classes,
                "box_targets": box_targets,
            })

        roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=7)

        class_outputs, box_outputs = self._frcnn_head_fn(
            roi_features, is_training)
        model_outputs.update({
            "class_outputs": class_outputs,
            "box_outputs": box_outputs,
        })

        if not is_training:
            detection_results = self._generate_detections_fn(
                box_outputs, class_outputs, rpn_rois,
                labels["image_info"][:, 1:2, :])
            model_outputs.update(detection_results)

        if not self._include_mask:
            return model_outputs

        if is_training:
            (
                rpn_rois,
                classes,
                mask_targets,
                gather_nd_gt_indices,
            ) = self._sample_masks_fn(
                rpn_rois,
                matched_gt_boxes,
                matched_gt_classes,
                matched_gt_indices,
                labels["gt_masks"],
            )
            mask_targets = tf.stop_gradient(mask_targets)

            classes = tf.cast(classes, dtype=tf.int32)

            model_outputs.update({
                "mask_targets": mask_targets,
                "sampled_class_targets": classes,
            })
        else:
            rpn_rois = detection_results["detection_boxes"]
            classes = tf.cast(detection_results["detection_classes"],
                              dtype=tf.int32)

        mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=14)

        mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes,
                                           is_training)

        if is_training:
            model_outputs.update({
                "mask_outputs": mask_outputs,
            })
        else:
            model_outputs.update(
                {"detection_masks": tf.nn.sigmoid(mask_outputs)})

        if not self._include_attributes:
            return model_outputs

        attribute_outputs = self._attributes_head_fn(mask_roi_features,
                                                     is_training)

        if is_training:
            attribute_targets = tf.gather_nd(
                labels["gt_attributes"],
                gather_nd_gt_indices)  # [batch, K, num_attributes]

            model_outputs.update({
                "attribute_outputs": attribute_outputs,
                "attribute_targets": attribute_targets,
            })
        else:
            model_outputs["detection_attributes"] = tf.nn.sigmoid(
                attribute_outputs)

        return model_outputs
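
The `tf.gather_nd` used for `attribute_targets` above pairs every sampled ROI with its matched groundtruth via (batch_index, gt_index) tuples. A toy example with fabricated shapes:

import tensorflow as tf

# gt_attributes: [batch, num_gt, num_attributes]
gt_attributes = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])
# Each entry is a (batch_index, gt_index) pair for one sampled ROI.
gather_nd_gt_indices = tf.constant([[[0, 2], [0, 0]],
                                    [[1, 1], [1, 2]]])     # [batch, K, 2]
attribute_targets = tf.gather_nd(gt_attributes, gather_nd_gt_indices)
print(attribute_targets.shape)  # (2, 2, 4): [batch, K, num_attributes]
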
Beispiel #24
0
    def _parse_predict_data(self, data):
        """Parses data for prediction."""
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Compute Anchor boxes.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
        }
        # If mode is PREDICT_WITH_GT, returns groundtruths and training targets
        # in labels.
        if self._mode == ModeKeys.PREDICT_WITH_GT:
            # Converts boxes from normalized coordinates to pixel coordinates.
            boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                                image_shape)
            groundtruths = {
                'source_id': data['source_id'],
                'height': data['height'],
                'width': data['width'],
                'num_detections': tf.shape(data['groundtruth_classes']),
                'boxes': boxes,
                'classes': data['groundtruth_classes'],
                'areas': data['groundtruth_area'],
                'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
            }
            groundtruths['source_id'] = dataloader_utils.process_source_id(
                groundtruths['source_id'])
            groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
                groundtruths, self._max_num_instances)
            labels['groundtruths'] = groundtruths

            # Computes training objective for evaluation loss.
            classes = data['groundtruth_classes']

            image_scale = image_info[2, :]
            offset = image_info[3, :]
            boxes = input_utils.resize_and_crop_boxes(
                boxes, image_scale, (image_height, image_width), offset)
            # Filters out ground truth boxes that are all zeros.
            indices = box_utils.get_non_empty_box_indices(boxes)
            boxes = tf.gather(boxes, indices)
            classes = tf.gather(classes, indices)

            # Assigns anchors.
            anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                                  self._match_threshold,
                                                  self._unmatched_threshold)
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(
                 boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
            labels['cls_targets'] = cls_targets
            labels['box_targets'] = box_targets
            labels['num_positives'] = num_positives
        return {
            'images': image,
            'labels': labels,
        }
    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(input=classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    pred=tf.greater(tf.size(input=is_crowds), 0),
                    true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    false_fn=lambda: tf.cast(
                        tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

        # Gets original image and its size.
        image = data['image']

        # NOTE: The autoaugment method works best when used alongside the standard
        # horizontal flipping of images along with size jittering and normalization.
        if self._use_autoaugment:
            image, boxes = autoaugment_utils.distort_image_with_autoaugment(
                image, boxes, self._autoaugment_policy_name)

        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)
        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }
        # Flattens the per-level targets into single tensors and appends
        # num_positives, so the targets can be consumed as a (cls, box) tuple.
        num_anchors = input_anchor.anchors_per_location
        mlvl_cls_targets = tf.concat([
            tf.reshape(cls_targets[lv], [-1, num_anchors])
            for lv in range(self._min_level, self._max_level + 1)
        ], axis=0)
        mlvl_box_targets = tf.concat([
            tf.reshape(box_targets[lv], [-1, num_anchors * 4])
            for lv in range(self._min_level, self._max_level + 1)
        ], axis=0)
        num_positives_expand = tf.ones_like(
            mlvl_box_targets[..., 0:1]) * num_positives
        mlvl_cls_targets_wp = tf.concat(
            [mlvl_cls_targets,
             tf.cast(num_positives_expand, dtype=tf.int32)],
            axis=-1)
        mlvl_box_targets_wp = tf.concat(
            [mlvl_box_targets, num_positives_expand], axis=-1)
        return image, (mlvl_cls_targets_wp, mlvl_box_targets_wp)
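
Since this parser returns a `(cls, box)` tuple rather than the labels dict, the loss side has to slice `num_positives` back off. A hedged sketch of that unpacking, mirroring the concatenation above (function name is ours):

import tensorflow as tf

def unpack_targets(mlvl_cls_targets_wp, mlvl_box_targets_wp, num_anchors):
    # Inverse of the packing above: the last column of each tensor carries
    # num_positives broadcast across rows.
    cls_targets = mlvl_cls_targets_wp[..., :num_anchors]
    box_targets = mlvl_box_targets_wp[..., :num_anchors * 4]
    num_positives = mlvl_box_targets_wp[..., -1:]
    return cls_targets, box_targets, num_positives
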