Ejemplo n.º 1
0
 def clip_boxes(self, boxes):
     """Clamp box coordinates so that they lie inside the output image.

     Every coordinate below zero is raised to zero, and every coordinate
     above its upper limit is lowered to that limit. The upper limits form a
     4-element vector built from `self._output_size[0] - 1` and
     `self._output_size[1] - 1`; the order of the two entries is swapped
     depending on whether `self._scaled_height` is smaller than
     `self._scaled_width`.

     Args:
       boxes: A float tensor of box coordinates whose last dimension
         broadcasts against the 4-element limit vector (presumably
         [N, 4] -- confirm against callers).

     Returns:
       A tensor with the same shape as `boxes` holding the clamped values.
     """
     # Lower limit: replace negative coordinates with zero.
     non_negative = tf.where(
         tf.less(boxes, 0), tf.zeros_like(boxes), boxes)

     # Largest valid index along each of the two output dimensions.
     last_index_0 = self._output_size[0] - 1
     last_index_1 = self._output_size[1] - 1

     limits_when_height_short = tf.convert_to_tensor(
         [last_index_0, last_index_1, last_index_0, last_index_1],
         dtype=tf.float32)
     limits_otherwise = tf.convert_to_tensor(
         [last_index_1, last_index_0, last_index_1, last_index_0],
         dtype=tf.float32)

     # Choose the limit ordering from the orientation of the scaled image.
     height_is_short_side = tf.less(self._scaled_height, self._scaled_width)
     upper_limits = tf.where(
         height_is_short_side, limits_when_height_short, limits_otherwise)

     # Upper limit: broadcast the limits to the box shape, then pick them
     # wherever a coordinate exceeds its limit.
     exceeds_limit = tf.greater(non_negative, upper_limits)
     broadcast_limits = upper_limits * tf.ones_like(non_negative)
     return tf.where(exceeds_limit, broadcast_limits, non_negative)
Ejemplo n.º 2
0
        def _dataset_parser(value):
            """Parse one record into a fixed-size image and learning targets.

            Args:
              value: The raw record handed to `example_decoder.decode`
                (presumably a serialized example holding an image and its
                groundtruth annotations -- confirm against the decoder).

            Returns:
              In PREDICT mode, a single dictionary with the keys 'images',
              'image_info' and 'source_ids' described below.

              In every other mode, a tuple `(features, labels)`:

              features: A dictionary with the following {key: value} pairs.
                images: The image tensor after normalization and
                  resize-and-crop; cast to bfloat16 when
                  params['use_bfloat16'] is set.
                image_info: The tensor returned by
                  `input_processor.get_image_info()`. This function reads
                  index 2 as a multiplier for the boxes and compares index 3
                  against index 4 as height versus width (presumably
                  original height/width, scale, scaled height/width --
                  confirm against `get_image_info`).
                source_ids: Source image id converted to a number. An empty
                  source id string is replaced by -1.
              labels: A dictionary with the following {key: value} pairs.
                score_targets_<level>: One entry for each level in
                  [min_level, ..., max_level] (presumably of shape
                  [height_l, width_l, num_anchors] -- confirm against the
                  anchor labeler).
                box_targets_<level>: One entry for each level in
                  [min_level, ..., max_level] (presumably of shape
                  [height_l, width_l, num_anchors * 4] -- confirm against
                  the anchor labeler).
                gt_boxes: Groundtruth boxes multiplied by image_info[2] and
                  padded with -1 to the fixed dimension
                  [self._max_num_instances, 4] (presumably in
                  [y1, x1, y2, x2] format -- confirm).
                gt_classes: Groundtruth classes as float32, padded with -1
                  to the fixed dimension [self._max_num_instances, 1].
                cropped_gt_masks: Groundtruth masks from
                  `input_processor.crop_gt_masks`, padded with -1 and shaped
                  [self._max_num_instances, params['gt_mask_size'] + 4,
                  params['gt_mask_size'] + 4].
            """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)

                image = data['image']
                source_id = data['source_id']
                # Substitute '-1' for an empty id so the string-to-number
                # conversion below always receives a numeric string.
                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                # Prediction needs only the preprocessed image and its
                # metadata, so return before any groundtruth is touched.
                if self._mode == tf.estimator.ModeKeys.PREDICT:
                    input_processor = InstanceSegmentationInputProcessor(
                        image, image_size, params['short_side_image_size'],
                        params['long_side_max_image_size'])
                    input_processor.normalize_image()
                    input_processor.set_scale_factors_to_mlperf_reference_size(
                    )
                    image = input_processor.resize_and_crop_image()
                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    image_info = input_processor.get_image_info()
                    return {
                        'images': image,
                        'image_info': image_info,
                        'source_ids': source_id
                    }

                # The following part is for training.
                instance_masks = data['groundtruth_instance_masks']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                # Classes become a float32 column vector of shape [N, 1].
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                # Without categories, every positive class id collapses to
                # 1.0 and everything else to 0.0.
                if not params['use_category']:
                    classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

                # Optionally drop crowd annotations; this applies only in
                # TRAIN mode, and removes the matching classes, boxes and
                # masks together.
                if (params['skip_crowd_during_training']
                        and self._mode == tf.estimator.ModeKeys.TRAIN):
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)
                    instance_masks = tf.gather_nd(instance_masks, indices)

                input_processor = InstanceSegmentationInputProcessor(
                    image, image_size, params['short_side_image_size'],
                    params['long_side_max_image_size'], boxes, classes,
                    instance_masks)
                input_processor.normalize_image()
                # The optional flip is applied before the scale factors are
                # set and before the image is resized and cropped.
                if params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()

                input_processor.set_scale_factors_to_mlperf_reference_size()
                image = input_processor.resize_and_crop_image()

                boxes, classes = input_processor.resize_and_crop_boxes()
                cropped_gt_masks = input_processor.crop_gt_masks(
                    params['gt_mask_size'])

                image_info = input_processor.get_image_info()
                # Assign anchors.
                # Pick the anchor labeler by image orientation: image_info[3]
                # is compared against image_info[4] as height versus width.
                is_height_short_side = tf.less(image_info[3], image_info[4])
                score_targets, box_targets = tf.cond(
                    is_height_short_side,
                    lambda: anchor_labeler.label_anchors(boxes, classes),
                    lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long

                # Pad groundtruth data.
                # Boxes are multiplied by image_info[2] first (presumably the
                # image scale -- confirm against get_image_info()).
                boxes *= image_info[2]
                boxes = pad_to_fixed_size(boxes, -1,
                                          [self._max_num_instances, 4])
                classes = pad_to_fixed_size(classes, -1,
                                            [self._max_num_instances, 1])
                # Pads cropped_gt_masks.
                # Each mask is flattened to one row so it can be padded along
                # the instance dimension, then restored to a square of side
                # gt_mask_size + 4.
                cropped_gt_masks = tf.reshape(
                    cropped_gt_masks, [-1, (params['gt_mask_size'] + 4)**2])
                cropped_gt_masks = pad_to_fixed_size(
                    cropped_gt_masks, -1,
                    [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
                cropped_gt_masks = tf.reshape(cropped_gt_masks, [
                    self._max_num_instances, params['gt_mask_size'] + 4,
                    params['gt_mask_size'] + 4
                ])
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)

                features = {}
                features['images'] = image
                features['image_info'] = image_info
                features['source_ids'] = source_id

                labels = {}
                # Flatten the per-level targets into individually keyed
                # entries, one pair per level.
                for level in range(params['min_level'],
                                   params['max_level'] + 1):
                    labels['score_targets_%d' % level] = score_targets[level]
                    labels['box_targets_%d' % level] = box_targets[level]
                labels['gt_boxes'] = boxes
                labels['gt_classes'] = classes
                labels['cropped_gt_masks'] = cropped_gt_masks
                return features, labels
Ejemplo n.º 3
0
 def clip_boxes(self, boxes):
     """Clamp box coordinates to the closed range [0, output_size - 1].

     Args:
       boxes: A tensor of box coordinates; `self._output_size - 1` must
         broadcast against it.

     Returns:
       A tensor with the same shape as `boxes` in which values below zero
       are replaced by zero and values above `self._output_size - 1` are
       replaced by `self._output_size - 1`.
     """
     # Lower limit: replace negative coordinates with zero.
     non_negative = tf.where(
         tf.less(boxes, 0), tf.zeros_like(boxes), boxes)

     # Upper limit: the largest valid coordinate in the output image.
     upper_limit = self._output_size - 1
     exceeds_limit = tf.greater(non_negative, upper_limit)
     broadcast_limit = upper_limit * tf.ones_like(non_negative)
     return tf.where(exceeds_limit, broadcast_limit, non_negative)