Example 1
def det_post_process(params: Dict[Any, Any],
                     cls_outputs: Dict[int, tf.Tensor],
                     box_outputs: Dict[int, tf.Tensor],
                     scales: List[float],
                     min_score_thresh,
                     max_boxes_to_draw):
  """Post preprocessing the box/class predictions.

  Args:
    params: a parameter dictionary that includes `min_level`, `max_level`,
      `batch_size`, and `num_classes`.
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4].
    scales: a list of float values indicating image scale.
    min_score_thresh: A float representing the threshold for deciding when to
      remove boxes based on score.
    max_boxes_to_draw: Max number of boxes to draw.

  Returns:
    detections_batch: a batch of detection results. Each detection is a tensor
      with each row representing [image_id, x, y, width, height, score, class].
  """
  # TODO(tanmingxing): refactor the code to make it more explicit.
  outputs = {
      'cls_outputs_all': [None],
      'box_outputs_all': [None],
      'indices_all': [None],
      'classes_all': [None]
  }
  det_model_fn.add_metric_fn_inputs(
      params, cls_outputs, box_outputs, outputs, -1)

  # Create anchor_label for picking top-k predictions.
  eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                 params['num_scales'], params['aspect_ratios'],
                                 params['anchor_scale'], params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes'])

  # Add all detections for each input image.
  detections_batch = []
  for index in range(params['batch_size']):
    cls_outputs_per_sample = outputs['cls_outputs_all'][index]
    box_outputs_per_sample = outputs['box_outputs_all'][index]
    indices_per_sample = outputs['indices_all'][index]
    classes_per_sample = outputs['classes_all'][index]
    detections = anchor_labeler.generate_detections(
        cls_outputs_per_sample,
        box_outputs_per_sample,
        indices_per_sample,
        classes_per_sample,
        image_id=[index],
        image_scale=[scales[index]],
        min_score_thresh=min_score_thresh,
        max_boxes_to_draw=max_boxes_to_draw,
        disable_pyfun=params.get('disable_pyfun'))
    detections_batch.append(detections)
  return tf.stack(detections_batch, name='detections')
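A minimal invocation sketch may help orient readers; every value in the params dict below is an illustrative assumption, not a default from any particular codebase:

# Hypothetical parameter values, for illustration only.
params = {
    'min_level': 3,
    'max_level': 7,
    'num_scales': 3,
    'aspect_ratios': [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)],
    'anchor_scale': 4.0,
    'image_size': 640,
    'batch_size': 8,
    'num_classes': 90,
    'disable_pyfun': True,
}
# cls_outputs and box_outputs are the per-level dicts produced by the model.
detections = det_post_process(
    params, cls_outputs, box_outputs,
    scales=[1.0] * params['batch_size'],
    min_score_thresh=0.05,
    max_boxes_to_draw=100)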
Example 2
    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      batch_size = params['batch_size']
      eval_anchors = anchors.Anchors(params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        coco_metrics = coco_metric_fn(
            batch_size,
            anchor_labeler,
            params['val_json_file'],
            testdev_dir=params['testdev_dir'],
            disable_pyfun=params.get('disable_pyfun', None),
            **kwargs)
      else:
        logging.info('Eval val with groundtruths %s.', params['val_json_file'])
        coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                      params['val_json_file'], **kwargs)

      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics
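For context, a metric_fn of this shape is consumed by the TF 1.x TPUEstimator as an (fn, tensors) pair; Example 12 below shows the full wiring, and a minimal sketch is:

# metric_fn_inputs is a dict of per-batch tensors whose keys match
# metric_fn's keyword arguments (see Example 12 for how it is assembled).
eval_metrics = (metric_fn, metric_fn_inputs)
spec = tpu_estimator.TPUEstimatorSpec(
    mode=mode, loss=total_loss, eval_metrics=eval_metrics)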
Example 3
 def metric_fn(**kwargs):
   """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
   eval_anchors = anchors.Anchors(params['min_level'],
                                  params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
   anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                          params['num_classes'])
   cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
   box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
   # Gather per-level class and box outputs.
   cls_outputs = {}
   box_outputs = {}
   for level in range(params['min_level'], params['max_level'] + 1):
     cls_outputs[level] = kwargs['cls_outputs_%d' % level]
     box_outputs[level] = kwargs['box_outputs_%d' % level]
   detections = anchor_labeler.generate_detections(
       cls_outputs, box_outputs, kwargs['source_ids'])
   eval_metric = coco_metric.EvaluationMetric(params['val_json_file'])
   coco_metrics = eval_metric.estimator_metric_fn(detections,
                                                  kwargs['image_scales'])
   # Add metrics to output.
   output_metrics = {
       'cls_loss': cls_loss,
       'box_loss': box_loss,
   }
   output_metrics.update(coco_metrics)
   return output_metrics
Example 4
def get_pred_results(cls_outputs_dict, box_outputs_dict, params):
    input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                    params['num_scales'],
                                    params['aspect_ratios'],
                                    params['anchor_scale'],
                                    (params['image_size'] - 5))
    anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])

    return tf.map_fn(anchor_labeler.generate_detections,
                     (cls_outputs_dict, box_outputs_dict),
                     dtype=tf.float32)
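tf.map_fn accepts a nested structure as elems, slicing every leaf tensor along axis 0 and passing the same structure of slices to the mapped function, so the tuple of dicts above is valid as long as generate_detections accepts it. A self-contained TF 1.x illustration of that mechanism:

import tensorflow as tf

# Each leaf is unstacked along axis 0; the lambda receives one
# (dict, dict) slice per batch element.
elems = ({'cls': tf.ones([4, 2])}, {'box': tf.zeros([4, 3])})
summed = tf.map_fn(
    lambda e: tf.reduce_sum(e[0]['cls']) + tf.reduce_sum(e[1]['box']),
    elems,
    dtype=tf.float32)  # required when the output structure differs from elems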
Example 5
 def __init__(self, params):
     self._max_num_instances = MAX_NUM_INSTANCES
     self._image_size = params["image_size"]
     self._num_classes = params["num_classes"]
     input_anchors = anchors.Anchors(params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     (params['image_size'] - 5))
     self.anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                                 params['num_classes'])
Example 6
        def metric_fn(**kwargs):
            """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            # Gather per-level class and box outputs.
            cls_outputs = {}
            box_outputs = {}
            detections_bs = []
            for index in range(batch_size):
                for level in range(params['min_level'],
                                   params['max_level'] + 1):
                    _, h, w, c = kwargs['cls_outputs_%d' %
                                        level].get_shape().as_list()
                    cls_outputs[level] = tf.slice(
                        kwargs['cls_outputs_%d' % level], [index, 0, 0, 0],
                        [1, h, w, c])
                    _, h, w, c = kwargs['box_outputs_%d' %
                                        level].get_shape().as_list()
                    box_outputs[level] = tf.slice(
                        kwargs['box_outputs_%d' % level], [index, 0, 0, 0],
                        [1, h, w, c])
                detections = anchor_labeler.generate_detections(
                    cls_outputs, box_outputs,
                    tf.slice(kwargs['source_ids'], [index], [1]),
                    tf.slice(kwargs['image_scales'], [index], [1]))
                detections_bs.append(detections)
            eval_metric = coco_metric.EvaluationMetric(params['val_json_file'])
            coco_metrics = eval_metric.estimator_metric_fn(
                detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics
Example 7
    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      batch_size = params['batch_size']
      eval_anchors = anchors.Anchors(
          params['min_level'], params['max_level'], params['num_scales'],
          params['aspect_ratios'], params['anchor_scale'], params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
      coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                    params['val_json_file'], **kwargs)

      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics
Example 8
def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
                     box_outputs: Dict[int, tf.Tensor], scales: List[float]):

    outputs = {
        'cls_outputs_all': [None],
        'box_outputs_all': [None],
        'indices_all': [None],
        'classes_all': [None]
    }

    add_metric_fn_inputs(params, cls_outputs, box_outputs, outputs)
    # Create anchor_label for picking top-k predictions.
    eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                   params['num_scales'],
                                   params['aspect_ratios'],
                                   params['anchor_scale'],
                                   params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes'])
    # Add all detections for each input image.
    detections_batch = []
    for index in range(params['batch_size']):
        # shape: [MAX_DETECTION_POINTS]; per-anchor scores.
        cls_outputs_per_sample = outputs['cls_outputs_all'][index]
        # shape: [MAX_DETECTION_POINTS, 4]; box targets (ty, tx, th, tw).
        box_outputs_per_sample = outputs['box_outputs_all'][index]
        # shape: [MAX_DETECTION_POINTS].
        indices_per_sample = outputs['indices_all'][index]
        # shape: [MAX_DETECTION_POINTS].
        classes_per_sample = outputs['classes_all'][index]
        detections = anchor_labeler.generate_detections(
            cls_outputs_per_sample,
            box_outputs_per_sample,
            indices_per_sample,
            classes_per_sample,
            image_id=[index],
            image_scale=[scales[index]],
            disable_pyfun=False)
        detections_batch.append(detections)
    # shape: [batch, M, 7]; each row is [image_id, x, y, width, height, score, class].
    return tf.stack(detections_batch, name='detections')
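The MAX_DETECTION_POINTS mentioned in the comments is likewise a module-level constant of the surrounding codebase: the number of top-scoring anchors kept per image before NMS. A plausible definition:

MAX_DETECTION_POINTS = 5000  # assumed pre-NMS top-k cap per image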
Example 9
    def __call__(self, params):
        image_size = (params['image_size'], params['image_size'])
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'], image_size)
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'],
                                               params['rpn_positive_overlap'],
                                               params['rpn_negative_overlap'],
                                               params['rpn_batch_size_per_im'],
                                               params['rpn_fg_fraction'])

        example_decoder = tf_example_decoder.TfExampleDecoder(
            use_instance_mask=self._use_instance_mask)

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary that contains an image and groundtruth annotations.

      Returns:
        features: a dictionary that contains the image and auxiliary
          information. The following describes {key: value} pairs in the
          dictionary.
          image: Image tensor that is preprocessed to have normalized value and
            fixed dimension [image_size, image_size, 3]
          image_info: image information that includes the original height and
            width, the scale of the processed image to the original image, and
            the scaled height and width.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
        labels: a dictionary that contains auxiliary information plus (optional)
          labels. The following describes {key: value} pairs in the dictionary.
          `labels` is only for training.
          score_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, num_anchors]. The height_l and width_l
            represent the dimension of objectness score at l-th level.
          box_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, num_anchors * 4]. The height_l and
            width_l represent the dimension of bounding box regression output at
            l-th level.
          gt_boxes: Groundtruth bounding box annotations. The box is represented
             in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
             fixed dimension [self._max_num_instances, 4].
          gt_classes: Groundtruth classes annotations. The tensor is padded
            with -1 to the fixed dimension [self._max_num_instances].
          cropped_gt_masks: groundtruth masks cropped by the bounding box and
            resized to a fixed size determined by params['gt_mask_size'].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                image = data['image']
                source_id = data['source_id']
                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                if self._mode == tf.estimator.ModeKeys.PREDICT:
                    input_processor = InstanceSegmentationInputProcessor(
                        image, image_size)
                    input_processor.normalize_image()
                    input_processor.set_scale_factors_to_output_size()
                    image = input_processor.resize_and_crop_image()
                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    image_info = input_processor.get_image_info()
                    return {
                        'images': image,
                        'image_info': image_info,
                        'source_ids': source_id
                    }

                elif self._mode == tf.estimator.ModeKeys.TRAIN:
                    instance_masks = None
                    if self._use_instance_mask:
                        instance_masks = data['groundtruth_instance_masks']
                    boxes = data['groundtruth_boxes']
                    classes = data['groundtruth_classes']
                    classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                         [-1, 1])
                    if not params['use_category']:
                        classes = tf.cast(tf.greater(classes, 0),
                                          dtype=tf.float32)

                    if (params['skip_crowd_during_training']
                            and self._mode == tf.estimator.ModeKeys.TRAIN):
                        indices = tf.where(
                            tf.logical_not(data['groundtruth_is_crowd']))
                        classes = tf.gather_nd(classes, indices)
                        boxes = tf.gather_nd(boxes, indices)
                        if self._use_instance_mask:
                            instance_masks = tf.gather_nd(
                                instance_masks, indices)

                    input_processor = InstanceSegmentationInputProcessor(
                        image, image_size, boxes, classes, instance_masks)
                    input_processor.normalize_image()
                    if params['input_rand_hflip']:
                        input_processor.random_horizontal_flip()

                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'])
                    image = input_processor.resize_and_crop_image()

                    boxes, classes = input_processor.resize_and_crop_boxes()
                    if self._use_instance_mask:
                        instance_masks = input_processor.resize_and_crop_masks(
                        )
                        cropped_gt_masks = input_processor.crop_gt_masks(
                            instance_masks, boxes, params['gt_mask_size'],
                            image_size)

                    # Assign anchors.
                    score_targets, box_targets = anchor_labeler.label_anchors(
                        boxes, classes)

                    # Pad groundtruth data.
                    image_info = input_processor.get_image_info()
                    boxes *= image_info[2]
                    boxes = pad_to_fixed_size(boxes, -1,
                                              [self._max_num_instances, 4])
                    classes = pad_to_fixed_size(classes, -1,
                                                [self._max_num_instances, 1])

                    # Pads cropped_gt_masks.
                    if self._use_instance_mask:
                        cropped_gt_masks = tf.reshape(
                            cropped_gt_masks, [self._max_num_instances, -1])
                        cropped_gt_masks = pad_to_fixed_size(
                            cropped_gt_masks, -1, [
                                self._max_num_instances,
                                (params['gt_mask_size'] + 4)**2
                            ])
                        cropped_gt_masks = tf.reshape(cropped_gt_masks, [
                            self._max_num_instances, params['gt_mask_size'] +
                            4, params['gt_mask_size'] + 4
                        ])

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    features = {}
                    features['images'] = image
                    features['image_info'] = image_info
                    features['source_ids'] = source_id
                    labels = {}
                    for level in range(params['min_level'],
                                       params['max_level'] + 1):
                        labels['score_targets_%d' %
                               level] = score_targets[level]
                        labels['box_targets_%d' % level] = box_targets[level]
                    labels['gt_boxes'] = boxes
                    labels['gt_classes'] = classes
                    if self._use_instance_mask:
                        labels['cropped_gt_masks'] = cropped_gt_masks
                    return (features, labels)

        batch_size = params['batch_size'] if 'batch_size' in params else 1
        dataset = tf.data.Dataset.list_files(
            self._file_pattern,
            shuffle=(self._mode == tf.estimator.ModeKeys.TRAIN))
        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(
                _prefetch_dataset,
                cycle_length=32,
                sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.shuffle(64)

        # Parse the fetched records to input tensors for model function.
        dataset = dataset.apply(
            tf.contrib.data.map_and_batch(_dataset_parser,
                                          batch_size=batch_size,
                                          num_parallel_batches=64,
                                          drop_remainder=True))

        # Transposes images for TPU performance.
        # Given the batch size, the batch dimension (N) goes to either the minor
        # ((H, W, C, N) when N > C) or the second-minor ((H, W, N, C) when N < C)
        # dimension. Here, we assume N is 4 or 8 and C is 3, so we use
        # (H, W, C, N).
        if (params['transpose_input']
                and self._mode == tf.estimator.ModeKeys.TRAIN):

            def _transpose_images(features, labels):
                features['images'] = tf.transpose(features['images'],
                                                  [1, 2, 3, 0])
                return features, labels

            dataset = dataset.map(_transpose_images, num_parallel_calls=64)

        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

        if self._num_examples > 0:
            dataset = dataset.take(self._num_examples)
        if self._use_fake_data:
            # Turn this dataset into a semi-fake dataset which always loop at the
            # first batch. This reduces variance in performance and is useful in
            # testing.
            dataset = dataset.take(1).cache().repeat()
        return dataset
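The tf.contrib.data ops used above were later relocated; on newer TF 1.x releases the same pipeline can be written with the tf.data.experimental equivalents. A sketch of the drop-in renames:

dataset = dataset.apply(
    tf.data.experimental.parallel_interleave(
        _prefetch_dataset, cycle_length=32,
        sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
dataset = dataset.apply(
    tf.data.experimental.map_and_batch(
        _dataset_parser, batch_size=batch_size,
        num_parallel_batches=64, drop_remainder=True))
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)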
Example 10
        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary that contains an image and groundtruth annotations.

      Returns:
        features: a dictionary that contains the image and auxiliary
          information. The following describes {key: value} pairs in the
          dictionary.
          image: Image tensor that is preprocessed to have normalized value and
            fixed dimension [image_size, image_size, 3]
          image_info: image information that includes the original height and
            width, the scale of the processed image to the original image, and
            the scaled height and width.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
        labels: a dictionary that contains auxiliary information plus (optional)
          labels. The following describes {key: value} pairs in the dictionary.
          `labels` is only for training.
          score_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, num_anchors]. The height_l and width_l
            represent the dimension of objectness score at l-th level.
          box_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, num_anchors * 4]. The height_l and
            width_l represent the dimension of bounding box regression output at
            l-th level.
          gt_boxes: Groundtruth bounding box annotations. The box is represented
             in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
             fixed dimension [self._max_num_instances, 4].
          gt_classes: Groundtruth classes annotations. The tensor is padded
            with -1 to the fixed dimension [self._max_num_instances].
          cropped_gt_masks: groundtruth masks cropped by the bounding box and
            resized to a fixed size determined by params['gt_mask_size'].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                data['groundtruth_is_crowd'] = tf.cond(
                    tf.greater(tf.size(data['groundtruth_is_crowd']),
                               0), lambda: data['groundtruth_is_crowd'],
                    lambda: tf.zeros_like(data['groundtruth_classes'],
                                          dtype=tf.bool))
                image = data['image']
                image = tf.image.convert_image_dtype(image, dtype=tf.float32)
                orig_image = image
                source_id = data['source_id']
                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                if self._mode == tf.estimator.ModeKeys.PREDICT:
                    image = preprocess_ops.normalize_image(image)
                    image, image_info, _, _, _ = preprocess_ops.resize_crop_pad(
                        image, params['image_size'], 2**params['max_level'])
                    if params['precision'] == 'bfloat16':
                        image = tf.cast(image, dtype=tf.bfloat16)

                    features = {
                        'images': image,
                        'image_info': image_info,
                        'source_ids': source_id,
                    }
                    if params['visualize_images_summary']:
                        resized_image = tf.image.resize_images(
                            orig_image, params['image_size'])
                        features['orig_images'] = resized_image
                    if params['include_groundtruth_in_features']:
                        labels = _prepare_labels_for_eval(
                            data,
                            target_num_instances=self._max_num_instances,
                            target_polygon_list_len=self.
                            _max_num_polygon_list_len,
                            use_instance_mask=params['include_mask'])
                        return {'features': features, 'labels': labels}
                    else:
                        return {'features': features}

                elif (self._mode == tf.estimator.ModeKeys.TRAIN
                      or self._mode == tf.estimator.ModeKeys.EVAL):
                    instance_masks = None
                    if self._use_instance_mask:
                        instance_masks = data['groundtruth_instance_masks']
                    boxes = data['groundtruth_boxes']
                    classes = data['groundtruth_classes']
                    classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                         [-1, 1])
                    if not params['use_category']:
                        classes = tf.cast(tf.greater(classes, 0),
                                          dtype=tf.float32)

                    if (params['skip_crowd_during_training']
                            and self._mode == tf.estimator.ModeKeys.TRAIN):
                        indices = tf.where(
                            tf.logical_not(data['groundtruth_is_crowd']))
                        classes = tf.gather_nd(classes, indices)
                        boxes = tf.gather_nd(boxes, indices)
                        if self._use_instance_mask:
                            instance_masks = tf.gather_nd(
                                instance_masks, indices)

                    image = preprocess_ops.normalize_image(image)
                    # Random flipping for training only.
                    if (self._mode == tf.estimator.ModeKeys.TRAIN
                            and params['input_rand_hflip']):
                        flipped_results = (
                            preprocess_ops.random_horizontal_flip(
                                image, boxes=boxes, masks=instance_masks))
                        if self._use_instance_mask:
                            image, boxes, instance_masks = flipped_results
                        else:
                            image, boxes = flipped_results
                    # Scaling, jittering and padding.
                    image, image_info, boxes, classes, cropped_gt_masks = (
                        preprocess_ops.resize_crop_pad(
                            image,
                            params['image_size'],
                            2**params['max_level'],
                            aug_scale_min=params['aug_scale_min'],
                            aug_scale_max=params['aug_scale_max'],
                            boxes=boxes,
                            classes=classes,
                            masks=instance_masks,
                            crop_mask_size=params['gt_mask_size']))
                    if cropped_gt_masks is not None:
                        cropped_gt_masks = tf.pad(cropped_gt_masks,
                                                  paddings=tf.constant([[
                                                      0,
                                                      0,
                                                  ], [
                                                      2,
                                                      2,
                                                  ], [2, 2]]),
                                                  mode='CONSTANT',
                                                  constant_values=0.)

                    padded_height, padded_width, _ = image.get_shape().as_list(
                    )
                    padded_image_size = (padded_height, padded_width)
                    input_anchors = anchors.Anchors(params['min_level'],
                                                    params['max_level'],
                                                    params['num_scales'],
                                                    params['aspect_ratios'],
                                                    params['anchor_scale'],
                                                    padded_image_size)
                    anchor_labeler = anchors.AnchorLabeler(
                        input_anchors, params['num_classes'],
                        params['rpn_positive_overlap'],
                        params['rpn_negative_overlap'],
                        params['rpn_batch_size_per_im'],
                        params['rpn_fg_fraction'])

                    # Assign anchors.
                    score_targets, box_targets = anchor_labeler.label_anchors(
                        boxes, classes)

                    # Pad groundtruth data.
                    boxes = preprocess_ops.pad_to_fixed_size(
                        boxes, -1, [self._max_num_instances, 4])
                    classes = preprocess_ops.pad_to_fixed_size(
                        classes, -1, [self._max_num_instances, 1])

                    # Pads cropped_gt_masks.
                    if self._use_instance_mask:
                        cropped_gt_masks = tf.reshape(
                            cropped_gt_masks,
                            tf.stack([tf.shape(cropped_gt_masks)[0], -1]))
                        cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
                            cropped_gt_masks, -1, [
                                self._max_num_instances,
                                (params['gt_mask_size'] + 4)**2
                            ])
                        cropped_gt_masks = tf.reshape(cropped_gt_masks, [
                            self._max_num_instances, params['gt_mask_size'] +
                            4, params['gt_mask_size'] + 4
                        ])

                    if params['precision'] == 'bfloat16':
                        image = tf.cast(image, dtype=tf.bfloat16)

                    features = {
                        'images': image,
                        'image_info': image_info,
                        'source_ids': source_id,
                    }
                    labels = {}
                    for level in range(params['min_level'],
                                       params['max_level'] + 1):
                        labels['score_targets_%d' %
                               level] = score_targets[level]
                        labels['box_targets_%d' % level] = box_targets[level]
                    labels['gt_boxes'] = boxes
                    labels['gt_classes'] = classes
                    if self._use_instance_mask:
                        labels['cropped_gt_masks'] = cropped_gt_masks
                    return features, labels
Example 11
  def __call__(self, params):
    input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                    params['num_scales'],
                                    params['aspect_ratios'],
                                    params['anchor_scale'],
                                    params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
      """Parse data to a fixed dimension input image and learning targets."""
      with tf.name_scope('parser'):
        data = example_decoder.decode(value)

        source_id = data['source_id']
        # For the xView dataset only: source ids are file names such as
        # '122.tif'; the commented lines below would strip the '.tif' suffix so
        # the id can be parsed as the number 122.
        # len = tf.size(tf.string_split([data['source_id']],""))
        # source_id = tf.substr(data['source_id'],0,len - 4)

        image = data['image']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
        # Handle crowd annotations. As crowd annotations are not large
        # instances, the model ignores them in training.
        if params['skip_crowd']:
          indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
          classes = tf.gather_nd(classes, indices)
          boxes = tf.gather_nd(boxes, indices)

        # the image normalization is identical to Cloud TPU ResNet-50
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = _normalize_image(image)

        if params['input_rand_hflip']:
          image, boxes = preprocessor.random_horizontal_flip(image, boxes=boxes)
        image_original_shape = tf.shape(image)
        image, _ = preprocessor.resize_to_range(
            image,
            min_dimension=params['image_size'],
            max_dimension=params['image_size'])
        image_scale = tf.to_float(image_original_shape[0]) / tf.to_float(
            tf.shape(image)[0])
        image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
            image, boxes, keypoints=None)

        image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'],
                                             params['image_size'])
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(boxes, classes)
        source_id = tf.string_to_number(source_id, out_type=tf.float32)

        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)
        row = (image, cls_targets, box_targets, num_positives, source_id,
               image_scale)
        return row

    batch_size = params['batch_size']

    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)

    dataset = dataset.shuffle(buffer_size=1024)
    if self._is_training:
      dataset = dataset.repeat()

    def prefetch_dataset(filename):
      dataset = tf.data.TFRecordDataset(filename).prefetch(1)
      return dataset

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            prefetch_dataset, cycle_length=32, sloppy=True))
    dataset = dataset.shuffle(20)

    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(batch_size))
    dataset = dataset.prefetch(1)


    (images, cls_targets, box_targets, num_positives, source_ids,
     image_scales) = dataset.make_one_shot_iterator().get_next()
    labels = {}
    # count num_positives in a batch
    num_positives_batch = tf.reduce_mean(num_positives)
    labels['mean_num_positives'] = tf.reshape(
        tf.tile(tf.expand_dims(num_positives_batch, 0), [
            batch_size,
        ]), [batch_size, 1])

    for level in range(params['min_level'], params['max_level'] + 1):
      labels['cls_targets_%d' % level] = cls_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    labels['source_ids'] = source_ids
    labels['image_scales'] = image_scales
    # from tensorflow.python.data.ops import dataset_ops
    # return dataset_ops.Dataset.zip((images, labels))
    return images, labels
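As a usage sketch: TPUEstimator invokes an input_fn with a params dict that carries 'batch_size', so a reader like this is typically constructed with a file pattern and handed to train(). The class and flag names below are assumptions for illustration:

# Hypothetical wiring; InputReader is assumed to wrap the __call__ above.
train_input_fn = InputReader(FLAGS.training_file_pattern, is_training=True)
estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)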
Example 12
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Model defination for the RetinaNet model based on ResNet.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the model hyperparameters. The default
      settings are in the default_hparams function in this file.
    model: the RetinaNet model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
  """
  def _model_outputs():
    return model(
        features,
        min_level=params['min_level'],
        max_level=params['max_level'],
        num_classes=params['num_classes'],
        num_anchors=len(params['aspect_ratios'] * params['num_scales']),
        resnet_depth=params['resnet_depth'],
        is_training_bn=params['is_training_bn'])

  if params['use_bfloat16']:
    with bfloat16.bfloat16_scope():
      cls_outputs, box_outputs = _model_outputs()
      levels = cls_outputs.keys()
      for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
  else:
    cls_outputs, box_outputs = _model_outputs()
    levels = cls_outputs.keys()

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    # print("entering PREDICT mode")
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]

    eval_anchors = anchors.Anchors(params['min_level'],
                                   params['max_level'],
                                   params['num_scales'],
                                   params['aspect_ratios'],
                                   params['anchor_scale'],
                                   params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                           params['num_classes'])
    detections = anchor_labeler.generate_detections(
        cls_outputs, box_outputs, image_id=100)
    predictions['detections'] = detections
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Load pretrained model from checkpoint.
  if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      tf.train.init_from_checkpoint(params['resnet_checkpoint'], {
          '/': 'resnet%s/' % params['resnet_depth'],
      })
      return tf.train.Scaffold()
  else:
    scaffold_fn = None

  # Set up training loss and learning rate.
  global_step = tf.train.get_global_step()
  learning_rate = _learning_rate_schedule(
      params['learning_rate'], params['lr_warmup_init'],
      params['lr_warmup_step'], params['lr_drop_step'], global_step)
  # cls_loss and box_loss are for logging. only total_loss is optimized.
  total_loss, cls_loss, box_loss = _detection_loss(cls_outputs, box_outputs,
                                                   labels, params)

  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum=params['momentum'])
    if params['use_tpu']:
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = variable_filter_fn(
        tf.trainable_variables(),
        params['resnet_depth']) if variable_filter_fn else None
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(total_loss, global_step, var_list=var_list)
  else:
    train_op = None

  # Evaluation only works on GPU/CPU host and batch_size=1
  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
      eval_anchors = anchors.Anchors(params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
      # Gather per-level class and box outputs.
      cls_outputs = {}
      box_outputs = {}
      for level in range(params['min_level'], params['max_level'] + 1):
        cls_outputs[level] = kwargs['cls_outputs_%d' % level]
        box_outputs[level] = kwargs['box_outputs_%d' % level]
      detections = anchor_labeler.generate_detections(
          cls_outputs, box_outputs, kwargs['source_ids'])
      eval_metric = coco_metric.EvaluationMetric(params['val_json_file'])
      coco_metrics = eval_metric.estimator_metric_fn(detections,
                                                     kwargs['image_scales'])
      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    batch_size = params['batch_size']
    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [
            batch_size,
        ]), [batch_size, 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [
            batch_size,
        ]), [batch_size, 1])
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'source_ids': labels['source_ids'],
        'image_scales': labels['image_scales'],
    }
    for level in range(params['min_level'], params['max_level'] + 1):
      metric_fn_inputs['cls_outputs_%d' % level] = cls_outputs[level]
      metric_fn_inputs['box_outputs_%d' % level] = box_outputs[level]
    eval_metrics = (metric_fn, metric_fn_inputs)

  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      scaffold_fn=scaffold_fn)
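A sketch of how a model_fn like this is typically bound and handed to the TF 1.x TPUEstimator; retinanet_model, run_config, and hparams are assumed names, not part of the snippet above:

import functools

model_fn = functools.partial(
    _model_fn, model=retinanet_model, variable_filter_fn=None)
# TPUEstimator injects params['batch_size'] itself, so the dict passed as
# params must not already contain that key.
estimator = tpu_estimator.TPUEstimator(
    model_fn=model_fn,
    use_tpu=hparams['use_tpu'],
    train_batch_size=64,
    config=run_config,
    params={k: v for k, v in hparams.items() if k != 'batch_size'})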
Example 13
  def __call__(self, params=None):
    if params is None:
      params = self._params
    input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                    params['num_scales'],
                                    params['aspect_ratios'],
                                    params['anchor_scale'],
                                    params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
      """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary contains an image and groundtruth annotations.

      Returns:
        image: Image tensor that is preprocessed to have normalized value and
          fixed dimension [image_size, image_size, 3]
        cls_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of class logits at l-th level.
        box_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        num_positives: Number of positive anchors in the image.
        source_id: Source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        image_scale: Scale of the processed image to the original image.
        boxes: Groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
          dimension [self._max_num_instances, 4].
        is_crowds: Groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tensor is
          padded with 0 to the fixed dimension [self._max_num_instances].
        areas: Groundtruth areas annotations. The tensor is padded with -1
          to the fixed dimension [self._max_num_instances].
        classes: Groundtruth classes annotations. The tensor is padded with -1
          to the fixed dimension [self._max_num_instances].
      """
      with tf.name_scope('parser'):
        data = example_decoder.decode(value)
        source_id = data['source_id']
        image = data['image']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
        areas = data['groundtruth_area']
        is_crowds = data['groundtruth_is_crowd']

        if params['skip_crowd_during_training'] and self._is_training:
          indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
          classes = tf.gather_nd(classes, indices)
          boxes = tf.gather_nd(boxes, indices)

        # NOTE: The autoaugment method works best when used alongside the
        # standard horizontal flipping of images along with size jittering
        # and normalization.
        if params.get('autoaugment_policy', None) and self._is_training:
          from aug import autoaugment  # pylint: disable=g-import-not-at-top
          image, boxes = autoaugment.distort_image_with_autoaugment(
              image, boxes, params['autoaugment_policy'])

        input_processor = DetectionInputProcessor(
            image, params['image_size'], boxes, classes)
        input_processor.normalize_image()
        if self._is_training and params['input_rand_hflip']:
          input_processor.random_horizontal_flip()
        if self._is_training:
          input_processor.set_training_random_scale_factors(
              params['train_scale_min'], params['train_scale_max'])
        else:
          input_processor.set_scale_factors_to_output_size()
        image = input_processor.resize_and_crop_image()
        boxes, classes = input_processor.resize_and_crop_boxes()

        # Assign anchors.
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(boxes, classes)

        source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                             source_id)
        source_id = tf.string_to_number(source_id)

        # Pad groundtruth data for evaluation.
        image_scale = input_processor.image_scale_to_original
        boxes *= image_scale
        is_crowds = tf.cast(is_crowds, dtype=tf.float32)
        boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
        is_crowds = pad_to_fixed_size(is_crowds, 0,
                                      [self._max_num_instances, 1])
        areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
        classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)
        return (image, cls_targets, box_targets, num_positives, source_id,
                image_scale, boxes, is_crowds, areas, classes)

    dataset = tf.data.Dataset.list_files(
        self._file_pattern, shuffle=self._is_training)

    if horovod_enabled() and self._is_training:  # multi-card eval is not supported yet
      # Shard the input files evenly across workers, one shard per GPU.
      dataset = dataset.shard(hvd.size(), hvd.rank())

    if self._is_training:
      dataset = dataset.repeat()

    # Prefetch data from files.
    def _prefetch_dataset(filename):
      dataset = tf.data.TFRecordDataset(filename).prefetch(1)
      return dataset

    cycle_length = 1 if self._is_deterministic else 32
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            _prefetch_dataset, cycle_length=cycle_length, sloppy=self._is_training))
    if self._is_training:
      dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    num_parallel_calls = 1 if self._is_deterministic else 64
    dataset = dataset.map(_dataset_parser, num_parallel_calls=num_parallel_calls)
    batch_size = params['batch_size']
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    def _process_example(images, cls_targets, box_targets, num_positives,
                         source_ids, image_scales, boxes, is_crowds, areas,
                         classes):
      """Processes one batch of data."""
      labels = {}
      # Count num_positives in a batch.
      num_positives_batch = tf.reduce_mean(num_positives)
      labels['mean_num_positives'] = tf.reshape(
          tf.tile(tf.expand_dims(num_positives_batch, 0), [
              batch_size,
          ]), [batch_size, 1])

      for level in range(params['min_level'], params['max_level'] + 1):
        labels['cls_targets_%d' % level] = cls_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]
      # Concatenate groundtruth annotations to a tensor.
      groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
      labels['source_ids'] = source_ids
      labels['groundtruth_data'] = groundtruth_data
      labels['image_scales'] = image_scales
      return images, labels

    dataset = dataset.map(_process_example)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    if self._use_fake_data:
      # Turn this dataset into a semi-fake dataset which always loop at the
      # first batch. This reduces variance in performance and is useful in
      # testing.
      dataset = dataset.take(1).cache().repeat()
    return dataset
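The hvd.size()/hvd.rank() sharding above presumes the standard Horovod bootstrap earlier in the program; a minimal sketch, assuming horovod.tensorflow is available:

import horovod.tensorflow as hvd

hvd.init()  # one process per GPU, launched via horovodrun or mpirun
# After init, hvd.size() is the number of workers and hvd.rank() this
# worker's index; dataset.shard() above uses them to split the input files.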
Example 14
    def __call__(self, params):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder()

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets."""
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                source_id = data['source_id']
                image = data['image']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                # Handle crowd annotations. As crowd annotations are not large
                # instances, the model ignores them in training.
                if params['skip_crowd']:
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)

                # the image normalization is identical to Cloud TPU ResNet-50
                image = tf.image.convert_image_dtype(image, dtype=tf.float32)
                image = _normalize_image(image)

                if params['input_rand_hflip']:
                    image, boxes = preprocessor.random_horizontal_flip(
                        image, boxes=boxes)
                image_original_shape = tf.shape(image)
                image, _ = preprocessor.resize_to_range(
                    image,
                    min_dimension=params['image_size'],
                    max_dimension=params['image_size'])
                image_scale = tf.to_float(
                    image_original_shape[0]) / tf.to_float(tf.shape(image)[0])
                image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
                    image, boxes, keypoints=None)

                image = tf.image.pad_to_bounding_box(image, 0, 0,
                                                     params['image_size'],
                                                     params['image_size'])
                (cls_targets, cls_weights, box_targets, box_weights,
                 num_positives, num_negatives,
                 num_ignored) = anchor_labeler.label_anchors(boxes, classes)

                source_id = tf.string_to_number(source_id, out_type=tf.float32)
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                row = (image, cls_targets, cls_weights, box_targets,
                       box_weights, num_positives, num_negatives, num_ignored,
                       source_id, image_scale)
                return row

        # batch_size = params['batch_size']
        batch_size = self._batch_size

        dataset = tf.data.Dataset.list_files(self._file_pattern)

        dataset = dataset.shuffle(buffer_size=1024)
        if self._is_training:
            dataset = dataset.repeat()

        def prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename,
                                              buffer_size=8 * 1000 * 1000)
            return dataset

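        # With cycle_length=1 the interleave reads files one at a time;
        # record-level mixing is provided by the shuffle(buffer_size=3072)
        # below.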
        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(prefetch_dataset,
                                                cycle_length=1,
                                                sloppy=True))
        dataset = dataset.shuffle(buffer_size=3072)

        dataset = dataset.map(_dataset_parser, num_parallel_calls=12)
        dataset = dataset.prefetch(32)
        dataset = dataset.apply(
            tf.contrib.data.batch_and_drop_remainder(batch_size))
        dataset = dataset.prefetch(2)

        (images, cls_targets, cls_weights, box_targets, box_weights,
         num_positives, num_negatives, num_ignored, source_ids,
         image_scales) = dataset.make_one_shot_iterator().get_next()
        labels = {}
        # count num_positives in a batch
        num_positives_batch = tf.reduce_mean(num_positives)
        labels['mean_num_positives'] = tf.reshape(
            tf.tile(tf.expand_dims(num_positives_batch, 0), [
                batch_size,
            ]), [batch_size, 1])

        num_negatives_batch = tf.reduce_mean(num_negatives)
        labels['mean_num_negatives'] = tf.reshape(
            tf.tile(tf.expand_dims(num_negatives_batch, 0), [
                batch_size,
            ]), [batch_size, 1])

        num_ignored_batch = tf.reduce_mean(num_ignored)
        labels['mean_num_ignored'] = tf.reshape(
            tf.tile(tf.expand_dims(num_ignored_batch, 0), [batch_size]),
            [batch_size, 1])

        for level in range(params['min_level'], params['max_level'] + 1):
            labels['cls_targets_%d' % level] = cls_targets[level]
            labels['cls_weights_%d' % level] = cls_weights[level]
            labels['box_targets_%d' % level] = box_targets[level]
            labels['box_weights_%d' % level] = box_weights[level]
        labels['source_ids'] = source_ids
        labels['image_scales'] = image_scales
        return images, labels
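
For context, a minimal sketch of how an input function like the one above is typically consumed in a TF 1.x graph-mode setup. The `InputReader` constructor and the parameter values are illustrative assumptions, not part of the snippet:

# Hedged usage sketch (TF 1.x); names and values below are assumptions.
import tensorflow as tf

params = {
    'min_level': 3, 'max_level': 7, 'num_scales': 3,
    'aspect_ratios': [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)],
    'anchor_scale': 4.0, 'image_size': 640, 'num_classes': 90,
    'skip_crowd': True, 'input_rand_hflip': True, 'use_bfloat16': False,
}

# Hypothetical constructor: file pattern, training flag, and batch size.
reader = InputReader('/path/to/train-*.tfrecord', is_training=True,
                     batch_size=64)
images, labels = reader(params)  # builds the pipeline, returns graph tensors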
    def __call__(self, params):
        image_size = params['dynamic_image_size'] if params[
            'dynamic_input_shapes'] else (params['image_size'],
                                          params['image_size'])
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'], image_size)
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'],
                                               params['rpn_positive_overlap'],
                                               params['rpn_negative_overlap'],
                                               params['rpn_batch_size_per_im'],
                                               params['rpn_fg_fraction'])

        if params['dynamic_input_shapes']:
            height_long_side_image_size = image_size[::-1]
            height_long_side_input_anchors = anchors.Anchors(
                params['min_level'], params['max_level'], params['num_scales'],
                params['aspect_ratios'], params['anchor_scale'],
                height_long_side_image_size)
            height_long_side_anchor_labeler = anchors.AnchorLabeler(
                height_long_side_input_anchors, params['num_classes'],
                params['rpn_positive_overlap'], params['rpn_negative_overlap'],
                params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

        example_decoder = tf_example_decoder.TfExampleDecoder(
            use_instance_mask=True)

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary contains an image and groundtruth annotations.

      Returns:
        image: Image tensor that is preproessed to have normalized value and
          fixed dimension [image_size, image_size, 3]
        cls_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of class logits at l-th level.
        box_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        num_positives: Number of positive anchors in the image.
        source_id: Source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        image_scale: Scale of the proccessed image to the original image.
        boxes: Groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tennsor is padded with -1 to the fixed
          dimension [self._max_num_instances, 4].
        is_crowds: Groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tennsor is
          padded with 0 to the fixed dimension [self._max_num_instances].
        areas: Groundtruth areas annotations. The tennsor is padded with -1
          to the fixed dimension [self._max_num_instances].
        classes: Groundtruth classes annotations. The tennsor is padded with -1
          to the fixed dimension [self._max_num_instances].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                source_id = data['source_id']
                image = data['image']
                instance_masks = data['groundtruth_instance_masks']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                areas = data['groundtruth_area']
                is_crowds = data['groundtruth_is_crowd']
                if not params['use_category']:
                    classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

                if (params['skip_crowd_during_training']
                        and self._mode == tf.estimator.ModeKeys.TRAIN):
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)
                    instance_masks = tf.gather_nd(instance_masks, indices)

                input_processor = InstanceSegmentationInputProcessor(
                    image, image_size, params['short_side_image_size'],
                    params['long_side_max_image_size'], boxes, classes,
                    instance_masks)
                input_processor.normalize_image()
                if (self._mode == tf.estimator.ModeKeys.TRAIN
                        and params['input_rand_hflip']):
                    input_processor.random_horizontal_flip()
                if self._mode == tf.estimator.ModeKeys.TRAIN:
                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'])
                else:
                    input_processor.set_scale_factors_to_mlperf_reference_size(
                    )
                image = input_processor.resize_and_crop_image()
                boxes, classes = input_processor.resize_and_crop_boxes()
                instance_masks = input_processor.resize_and_crop_masks()
                cropped_gt_masks = input_processor.crop_gt_masks(
                    instance_masks, boxes, params['gt_mask_size'], image_size)

                # Assign anchors.
                if params['dynamic_input_shapes']:
                    is_height_short_side = tf.less(
                        input_processor._scaled_height,  # pylint: disable=protected-access
                        input_processor._scaled_width)  # pylint: disable=protected-access
                    score_targets, box_targets = tf.cond(
                        is_height_short_side,
                        lambda: anchor_labeler.label_anchors(boxes, classes),
                        lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long
                else:
                    score_targets, box_targets = anchor_labeler.label_anchors(
                        boxes, classes)

                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                image_scale = input_processor.image_scale_to_original
                scaled_height = input_processor.get_height_length()
                scaled_width = input_processor.get_width_length()
                image_info = tf.stack([
                    tf.to_float(scaled_height),
                    tf.to_float(scaled_width),
                    image_scale,
                    tf.to_float(input_processor.get_original_height),
                    tf.to_float(input_processor.get_original_width),
                ])
                # Pad groundtruth data for evaluation.
                boxes *= image_scale
                is_crowds = tf.cast(is_crowds, dtype=tf.float32)
                boxes = pad_to_fixed_size(boxes, -1,
                                          [self._max_num_instances, 4])
                is_crowds = pad_to_fixed_size(is_crowds, 0,
                                              [self._max_num_instances, 1])
                areas = pad_to_fixed_size(areas, -1,
                                          [self._max_num_instances, 1])
                classes = pad_to_fixed_size(classes, -1,
                                            [self._max_num_instances, 1])
                # Pads cropped_gt_masks.
                cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                              [self._max_num_instances, -1])
                cropped_gt_masks = pad_to_fixed_size(
                    cropped_gt_masks, -1,
                    [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
                cropped_gt_masks = tf.reshape(cropped_gt_masks, [
                    self._max_num_instances, params['gt_mask_size'] + 4,
                    params['gt_mask_size'] + 4
                ])

                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                return (image, score_targets, box_targets, source_id,
                        image_info, boxes, is_crowds, areas, classes,
                        cropped_gt_masks)

        batch_size = params.get('batch_size', 1)
        dataset = tf.data.Dataset.list_files(
            self._file_pattern,
            shuffle=(self._mode == tf.estimator.ModeKeys.TRAIN))
        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(
                _prefetch_dataset,
                cycle_length=32,
                sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.shuffle(64)

        # Parse the fetched records to input tensors for model function.
        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

        if params['dynamic_input_shapes']:

            def key_func(image, *args):
                del args
                return tf.cast(tf.shape(image)[0], dtype=tf.int64)

            def reduce_func(unused_key, dataset):
                return dataset.batch(batch_size, drop_remainder=True)

            dataset = dataset.apply(
                tf.contrib.data.group_by_window(
                    key_func=key_func,
                    reduce_func=reduce_func,
                    window_size=params['global_batch_size']))
        else:
            dataset = dataset.prefetch(batch_size)
            dataset = dataset.batch(batch_size, drop_remainder=True)

        def _process_example(images, score_targets, box_targets, source_ids,
                             image_info, boxes, is_crowds, areas, classes,
                             cropped_gt_masks):
            """Processes one batch of data."""
            # Transposes images from (N, H, W, C) to (H, W, N, C). Since the
            # batch size is less than 8, the batch dimension goes to the
            # second-minor dimension.
            if (params['transpose_input']
                    and self._mode == tf.estimator.ModeKeys.TRAIN):
                images = tf.transpose(images, [1, 2, 0, 3])

            labels = {}
            for level in range(params['min_level'], params['max_level'] + 1):
                labels['score_targets_%d' % level] = score_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
            # Concatenate groundtruth annotations to a tensor.
            groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                         axis=2)
            labels['source_ids'] = source_ids
            labels['groundtruth_data'] = groundtruth_data
            labels['image_info'] = image_info
            labels['cropped_gt_masks'] = cropped_gt_masks
            if self._mode == tf.estimator.ModeKeys.PREDICT:
                features = dict(images=images,
                                image_info=image_info,
                                groundtruth_data=groundtruth_data,
                                source_ids=source_ids)
                return features
            elif params['dynamic_input_shapes']:
                # For dynamic input shapes, we have two TPU programs. A tf.cond
                # op runs on the host side to decide which TPU program to
                # launch. Since data is prefetched on the device side, the
                # shape needed for that decision has to be sent back from
                # device to host. We therefore return the `images` shape
                # explicitly to avoid copying the entire `images` tensor back.
                return tf.shape(images), images, labels
            else:
                return images, labels

        dataset = dataset.map(_process_example)
        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
        return dataset
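
The group_by_window transform above is what keeps each batch shape-homogeneous under dynamic input shapes: elements are keyed by image height and batched within a key. A minimal, self-contained sketch of the same mechanism, using tf.data.experimental (the successor of the tf.contrib.data call above):

import tensorflow as tf

# Group elements by parity and batch within each group, so every emitted
# batch holds elements that share a single key.
ds = tf.data.Dataset.range(10)
ds = ds.apply(
    tf.data.experimental.group_by_window(
        key_func=lambda x: x % 2,               # int64 bucket id per element
        reduce_func=lambda key, d: d.batch(4),  # batch within one bucket
        window_size=4))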
Example no. 16
    def __call__(self, params):
        image_size = (params['image_size'], params['image_size'])
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'], image_size)
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'],
                                               params['rpn_positive_overlap'],
                                               params['rpn_negative_overlap'],
                                               params['rpn_batch_size_per_im'],
                                               params['rpn_fg_fraction'])

        example_decoder = tf_example_decoder.TfExampleDecoder(
            use_instance_mask=True)

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary contains an image and groundtruth annotations.

      Returns:
        image: Image tensor that is preproessed to have normalized value and
          fixed dimension [image_size, image_size, 3]
        cls_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of class logits at l-th level.
        box_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        num_positives: Number of positive anchors in the image.
        source_id: Source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        image_scale: Scale of the proccessed image to the original image.
        boxes: Groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tennsor is padded with -1 to the fixed
          dimension [self._max_num_instances, 4].
        is_crowds: Groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tennsor is
          padded with 0 to the fixed dimension [self._max_num_instances].
        areas: Groundtruth areas annotations. The tennsor is padded with -1
          to the fixed dimension [self._max_num_instances].
        classes: Groundtruth classes annotations. The tennsor is padded with -1
          to the fixed dimension [self._max_num_instances].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                source_id = data['source_id']
                image = data['image']
                instance_masks = data['groundtruth_instance_masks']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                areas = data['groundtruth_area']
                is_crowds = data['groundtruth_is_crowd']
                if not params['use_category']:
                    classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

                if (params['skip_crowd_during_training']
                        and self._mode == tf.estimator.ModeKeys.TRAIN):
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)
                    instance_masks = tf.gather_nd(instance_masks, indices)

                input_processor = InstanceSegmentationInputProcessor(
                    image, image_size, boxes, classes, instance_masks)
                input_processor.normalize_image()
                if (self._mode == tf.estimator.ModeKeys.TRAIN
                        and params['input_rand_hflip']):
                    input_processor.random_horizontal_flip()
                if self._mode == tf.estimator.ModeKeys.TRAIN:
                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'])
                else:
                    input_processor.set_scale_factors_to_output_size()
                image = input_processor.resize_and_crop_image()
                boxes, classes = input_processor.resize_and_crop_boxes()
                instance_masks = input_processor.resize_and_crop_masks()
                cropped_gt_masks = input_processor.crop_gt_masks(
                    instance_masks, boxes, params['gt_mask_size'], image_size)

                # Assign anchors.
                score_targets, box_targets = anchor_labeler.label_anchors(
                    boxes, classes)

                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                image_scale = input_processor.image_scale_to_original
                scaled_height = input_processor.get_height_length()
                scaled_width = input_processor.get_width_length()
                image_info = tf.stack([
                    tf.to_float(scaled_height),
                    tf.to_float(scaled_width),
                    image_scale,
                    tf.to_float(input_processor.get_original_height),
                    tf.to_float(input_processor.get_original_width),
                ])
                # Pad groundtruth data for evaluation.
                boxes *= image_scale
                is_crowds = tf.cast(is_crowds, dtype=tf.float32)
                boxes = pad_to_fixed_size(boxes, -1,
                                          [self._max_num_instances, 4])
                is_crowds = pad_to_fixed_size(is_crowds, 0,
                                              [self._max_num_instances, 1])
                areas = pad_to_fixed_size(areas, -1,
                                          [self._max_num_instances, 1])
                classes = pad_to_fixed_size(classes, -1,
                                            [self._max_num_instances, 1])
                # Pads cropped_gt_masks.
                cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                              [self._max_num_instances, -1])
                cropped_gt_masks = pad_to_fixed_size(
                    cropped_gt_masks, -1,
                    [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
                cropped_gt_masks = tf.reshape(cropped_gt_masks, [
                    self._max_num_instances, params['gt_mask_size'] + 4,
                    params['gt_mask_size'] + 4
                ])
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                return (image, score_targets, box_targets, source_id,
                        image_info, boxes, is_crowds, areas, classes,
                        cropped_gt_masks)

        batch_size = params.get('batch_size', 1)
        dataset = tf.data.Dataset.list_files(
            self._file_pattern,
            shuffle=(self._mode == tf.estimator.ModeKeys.TRAIN))
        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(
                _prefetch_dataset,
                cycle_length=32,
                sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.shuffle(64)

        # Parse the fetched records to input tensors for model function.
        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        def _process_example(images, score_targets, box_targets, source_ids,
                             image_info, boxes, is_crowds, areas, classes,
                             cropped_gt_masks):
            """Processes one batch of data."""
            # Transposes images for TPU performance.
            # Given the batch size, the batch dimension (N) goes to either the
            # minor dimension ((H, W, C, N) when N > C) or the second-minor
            # dimension ((H, W, N, C) when N < C). Here we assume N is 4 or 8
            # and C is 3, so we use (H, W, C, N).
            if (params['transpose_input']
                    and self._mode == tf.estimator.ModeKeys.TRAIN):
                images = tf.transpose(images, [1, 2, 3, 0])

            labels = {}
            for level in range(params['min_level'], params['max_level'] + 1):
                labels['score_targets_%d' % level] = score_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
            # Concatenate groundtruth annotations to a tensor.
            groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                         axis=2)
            labels['source_ids'] = source_ids
            labels['groundtruth_data'] = groundtruth_data
            labels['image_info'] = image_info
            labels['cropped_gt_masks'] = cropped_gt_masks
            if self._mode == tf.estimator.ModeKeys.PREDICT:
                features = dict(images=images,
                                image_info=image_info,
                                groundtruth_data=groundtruth_data,
                                source_ids=source_ids)
                return features
            else:
                return images, labels

        dataset = dataset.map(_process_example)
        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

        if self._num_examples > 0:
            dataset = dataset.take(self._num_examples)
        if self._use_fake_data:
            # Turn this dataset into a semi-fake dataset that always loops over
            # the first batch. This reduces variance in performance and is
            # useful for testing.
            dataset = dataset.take(1).cache().repeat()
        return dataset
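
When transpose_input is enabled, the (H, W, C, N) layout produced above must be undone on device before the model consumes the batch. A minimal sketch of the inverse transform, assuming the layout described in the comments:

import tensorflow as tf

def untranspose_images(images):
    # (H, W, C, N) -> (N, H, W, C): the inverse of
    # tf.transpose(images, [1, 2, 3, 0]) applied in _process_example.
    return tf.transpose(images, [3, 0, 1, 2])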
Example no. 17
    def __call__(self, params):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder()

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary contains an image and groundtruth annotations.

      Returns:
        image: Image tensor that is preproessed to have normalized value and
          fixed dimension [image_size, image_size, 3]
        cls_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of class logits at l-th level.
        box_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        num_positives: Number of positive anchors in the image.
        source_id: Source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        image_scale: Scale of the proccessed image to the original image.
        boxes: Groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tennsor is padded with -1 to the fixed
          dimension [self._max_num_instances, 4].
        is_crowds: Groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tennsor is
          padded with 0 to the fixed dimension [self._max_num_instances].
        areas: Groundtruth areas annotations. The tennsor is padded with -1
          to the fixed dimension [self._max_num_instances].
        classes: Groundtruth classes annotations. The tennsor is padded with -1
          to the fixed dimension [self._max_num_instances].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                source_id = data['source_id']
                image = data['image']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                areas = data['groundtruth_area']
                is_crowds = data['groundtruth_is_crowd']

                if params['skip_crowd_during_training'] and self._is_training:
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)

                input_processor = DetectionInputProcessor(
                    image, params['image_size'], boxes, classes)
                input_processor.normalize_image()
                if self._is_training and params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()
                if self._is_training:
                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'])
                else:
                    input_processor.set_scale_factors_to_output_size()
                image = input_processor.resize_and_crop_image()
                boxes, classes = input_processor.resize_and_crop_boxes()

                # Assign anchors.
                (cls_targets, box_targets,
                 num_positives) = anchor_labeler.label_anchors(boxes, classes)

                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                # Pad groundtruth data for evaluation.
                image_scale = input_processor.image_scale_to_original
                boxes *= image_scale
                is_crowds = tf.cast(is_crowds, dtype=tf.float32)
                boxes = pad_to_fixed_size(boxes, -1,
                                          [self._max_num_instances, 4])
                is_crowds = pad_to_fixed_size(is_crowds, 0,
                                              [self._max_num_instances, 1])
                areas = pad_to_fixed_size(areas, -1,
                                          [self._max_num_instances, 1])
                classes = pad_to_fixed_size(classes, -1,
                                            [self._max_num_instances, 1])
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                return (image, cls_targets, box_targets, num_positives,
                        source_id, image_scale, boxes, is_crowds, areas,
                        classes)

        batch_size = params['batch_size']
        dataset = tf.data.Dataset.list_files(self._file_pattern,
                                             shuffle=self._is_training,
                                             seed=int(time.time() * 1e9))
        if self._is_training:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(_prefetch_dataset,
                                                cycle_length=32,
                                                sloppy=self._is_training))
        if self._is_training:
            dataset = dataset.shuffle(64)

        # Parse the fetched records to input tensors for model function.
        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        def _process_example(images, cls_targets, box_targets, num_positives,
                             source_ids, image_scales, boxes, is_crowds, areas,
                             classes):
            """Processes one batch of data."""
            labels = {}
            # Count num_positives in a batch.
            num_positives_batch = tf.reduce_mean(num_positives)
            labels['mean_num_positives'] = tf.reshape(
                tf.tile(tf.expand_dims(num_positives_batch, 0), [
                    batch_size,
                ]), [batch_size, 1])

            for level in range(params['min_level'], params['max_level'] + 1):
                labels['cls_targets_%d' % level] = cls_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
            # Concatenate groundtruth annotations to a tensor.
            groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                         axis=2)
            labels['source_ids'] = source_ids
            labels['groundtruth_data'] = groundtruth_data
            labels['image_scales'] = image_scales
            return images, labels

        dataset = dataset.map(_process_example)
        dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
        return dataset
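
Because `groundtruth_data` above packs [boxes(4) | is_crowds(1) | areas(1) | classes(1)] along the last axis, downstream evaluation code can slice the fields back out. A sketch of that unpacking (the helper name is an assumption):

import tensorflow as tf

def unpack_groundtruth_data(groundtruth_data):
    # groundtruth_data: [batch, max_num_instances, 7].
    boxes = groundtruth_data[:, :, 0:4]
    is_crowds = groundtruth_data[:, :, 4:5]
    areas = groundtruth_data[:, :, 5:6]
    classes = groundtruth_data[:, :, 6:7]
    return boxes, is_crowds, areas, classes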
Example no. 18
    def __call__(self, params):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder()

        def get_dataset_for_mode(data_dir, is_training):
            """Return the location of input samples for a given mode."""
            if is_training:
                return '%s/coco_train2017_nocrowd-*' % data_dir
            return '%s/coco_val2017-*' % data_dir

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets."""
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                source_id = data['source_id']
                image = data['image']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])

                # the image normalization is identical to Cloud TPU ResNet-50
                image = tf.image.convert_image_dtype(image, dtype=tf.float32)
                image = _normalize_image(image)

                if params['input_rand_hflip']:
                    image, boxes = preprocessor.random_horizontal_flip(
                        image, boxes=boxes)
                image_original_shape = tf.shape(image)
                image, _ = preprocessor.resize_to_range(
                    image,
                    min_dimension=params['image_size'],
                    max_dimension=params['image_size'])
                image_scale = tf.to_float(
                    image_original_shape[0]) / tf.to_float(tf.shape(image)[0])
                image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
                    image, boxes, keypoints=None)

                image = tf.image.pad_to_bounding_box(image, 0, 0,
                                                     params['image_size'],
                                                     params['image_size'])
                (cls_targets, box_targets,
                 num_positives) = anchor_labeler.label_anchors(boxes, classes)

                source_id = tf.string_to_number(source_id, out_type=tf.float32)
                row = (image, cls_targets, box_targets, num_positives,
                       source_id, image_scale)
                return row

        batch_size = params['batch_size']

        data_file_pattern = get_dataset_for_mode(self._data_dir,
                                                 self._is_training)
        dataset = tf.data.Dataset.list_files(data_file_pattern)

        dataset = dataset.shuffle(buffer_size=1024)
        if self._is_training:
            dataset = dataset.repeat()

        def prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(prefetch_dataset,
                                                cycle_length=32,
                                                sloppy=True))
        dataset = dataset.shuffle(20)

        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.apply(
            tf.contrib.data.batch_and_drop_remainder(batch_size))
        dataset = dataset.prefetch(1)

        (images, cls_targets, box_targets, num_positives, source_ids,
         image_scales) = dataset.make_one_shot_iterator().get_next()
        labels = {}
        # count num_positives in a batch
        num_positives_batch = tf.reduce_mean(num_positives)
        labels['mean_num_positives'] = tf.reshape(
            tf.tile(tf.expand_dims(num_positives_batch, 0), [
                batch_size,
            ]), [batch_size, 1])

        for level in range(params['min_level'], params['max_level'] + 1):
            labels['cls_targets_%d' % level] = cls_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]
        labels['source_ids'] = source_ids
        labels['image_scales'] = image_scales
        return images, labels
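
The mean_num_positives construction above tiles a scalar batch mean out to [batch_size, 1] so it can travel with the per-example labels. An equivalent, arguably clearer formulation of the same broadcast:

import tensorflow as tf

def broadcast_batch_mean(values, batch_size):
    # Scalar mean over the batch, replicated to shape [batch_size, 1].
    return tf.fill([batch_size, 1], tf.reduce_mean(values))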
  def __call__(self, params, num_examples=0):
    image_size = params['image_size']
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], image_size)
    anchor_labeler = anchors.AnchorLabeler(
        input_anchors, params['num_classes'], params['rpn_positive_overlap'],
        params['rpn_negative_overlap'], params['rpn_batch_size_per_im'],
        params['rpn_fg_fraction'])

    height_long_side_image_size = image_size[::-1]
    height_long_side_input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'],
        height_long_side_image_size)
    height_long_side_anchor_labeler = anchors.AnchorLabeler(
        height_long_side_input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

    example_decoder = tf_example_decoder.TfExampleDecoder(
        use_instance_mask=True)

    def _dataset_parser(value):
      """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: a serialized tf.Example containing an image and groundtruth
          annotations.

      Returns:
        features: A dictionary that contains the image and auxiliary
          information. The following describes {key: value} pairs in the
          dictionary.
          image: An image tensor that is preprocessed to have normalized value
            and fixed dimension [image_size, image_size, 3].
          image_info: Image information that includes the original height and
            width, the scale of the processed image to the original image, and
            the scaled height and width.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
        labels: (only for training) A dictionary that contains groundtruth
          labels. The following describes {key: value} pairs in the dictionary.
          score_targets_dict: An ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensors
            with shape [height_l, width_l, num_anchors]. The height_l and
            width_l represent the dimension of the objectness score at the
            l-th level.
          box_targets_dict: An ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensors
            with shape [height_l, width_l, num_anchors * 4]. The height_l and
            width_l represent the dimension of bounding box regression output
            at the l-th level.
          gt_boxes: Groundtruth bounding box annotations. The box is
            represented in [y1, x1, y2, x2] format. The tensor is padded with
            -1 to the fixed dimension [self._max_num_instances, 4].
          gt_classes: Groundtruth classes annotations. The tensor is padded
            with -1 to the fixed dimension [self._max_num_instances].
          cropped_gt_masks: Groundtruth masks cropped by the bounding box and
            resized to a fixed size determined by params['gt_mask_size'].
      """
      with tf.name_scope('parser'):
        data = example_decoder.decode(value)

        image = data['image']
        source_id = data['source_id']
        source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                             source_id)
        source_id = tf.string_to_number(source_id)

        if self._mode == tf.estimator.ModeKeys.PREDICT:
          input_processor = InstanceSegmentationInputProcessor(
              image, image_size, params['short_side_image_size'],
              params['long_side_max_image_size'])
          input_processor.normalize_image()
          input_processor.set_scale_factors_to_mlperf_reference_size()
          image = input_processor.resize_and_crop_image()
          if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)

          image_info = input_processor.get_image_info()
          return {'images': image, 'image_info': image_info,
                  'source_ids': source_id}

        # The following part is for training.
        instance_masks = data['groundtruth_instance_masks']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
        if not params['use_category']:
          classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        if (params['skip_crowd_during_training'] and
            self._mode == tf.estimator.ModeKeys.TRAIN):
          indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
          classes = tf.gather_nd(classes, indices)
          boxes = tf.gather_nd(boxes, indices)
          instance_masks = tf.gather_nd(instance_masks, indices)

        input_processor = InstanceSegmentationInputProcessor(
            image, image_size, params['short_side_image_size'],
            params['long_side_max_image_size'], boxes, classes,
            instance_masks)
        input_processor.normalize_image()
        if params['input_rand_hflip']:
          input_processor.random_horizontal_flip()

        input_processor.set_scale_factors_to_mlperf_reference_size()
        image = input_processor.resize_and_crop_image()

        boxes, classes = input_processor.resize_and_crop_boxes()
        cropped_gt_masks = input_processor.crop_gt_masks(
            params['gt_mask_size'])

        image_info = input_processor.get_image_info()
        # Assign anchors.
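        # image_info = [scaled_height, scaled_width, scale, original_height,
        # original_width]; pick the anchor labeler matching the image
        # orientation.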
        is_height_short_side = tf.less(image_info[3], image_info[4])
        score_targets, box_targets = tf.cond(
            is_height_short_side,
            lambda: anchor_labeler.label_anchors(boxes, classes),
            lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long

        # Pad groundtruth data.
        boxes *= image_info[2]
        boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
        classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
        # Pads cropped_gt_masks.
        cropped_gt_masks = tf.reshape(
            cropped_gt_masks, [-1, (params['gt_mask_size'] + 4) ** 2])
        cropped_gt_masks = pad_to_fixed_size(
            cropped_gt_masks, -1,
            [self._max_num_instances, (params['gt_mask_size'] + 4) ** 2])
        cropped_gt_masks = tf.reshape(
            cropped_gt_masks,
            [self._max_num_instances, params['gt_mask_size'] + 4,
             params['gt_mask_size'] + 4])
        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)

        features = {}
        features['images'] = image
        features['image_info'] = image_info
        features['source_ids'] = source_id

        labels = {}
        for level in range(params['min_level'], params['max_level'] + 1):
          labels['score_targets_%d' % level] = score_targets[level]
          labels['box_targets_%d' % level] = box_targets[level]
        labels['gt_boxes'] = boxes
        labels['gt_classes'] = classes
        labels['cropped_gt_masks'] = cropped_gt_masks
        return features, labels

    batch_size = params.get('batch_size', 1)
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    if self._mode == tf.estimator.ModeKeys.TRAIN:
      # Shard and shuffle the image files so that each shard gets a
      # distinctive and random set of images.
      # To improve model convergence under a large number of hosts, multiple
      # hosts may share the same dataset shard. This gives each host more
      # training images.
      if 'dataset_num_shards' in params:
        train_actual_num_shards = int(params['dataset_num_shards'] //
                                      params['hosts_per_dataset_shard'])
        dataset = dataset.shard(
            train_actual_num_shards,
            int(params['dataset_shard_id'] //
                params['hosts_per_dataset_shard']))
        dataset = dataset.shuffle(tf.to_int64(256 // train_actual_num_shards))

    # Prefetch data from files.
    def _prefetch_dataset(filename):
      dataset = tf.data.TFRecordDataset(filename).prefetch(1)
      return dataset

    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            _prefetch_dataset, cycle_length=32,
            sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
    if self._mode == tf.estimator.ModeKeys.TRAIN:
      # Cache the raw images and shuffle them with a reasonably large buffer.
      dataset = dataset.cache().shuffle(params['shuffle_buffer_size']).repeat()

    if self._distributed_eval:
      dataset = dataset.shard(params['dataset_num_shards'],
                              params['dataset_shard_id'])

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

    def horizontal_image(*args):
      image_info = args[0]['image_info']
      return tf.less(image_info[3], image_info[4])

    def vertical_image(*args):
      return tf.logical_not(horizontal_image(*args))

    # Pad the dataset to the desired size and mark whether each element is
    # padding. During PREDICT, if batch_size_per_shard * num_shards > 5000,
    # the original dataset size won't be evenly divisible by the number of
    # shards. Note that 5000 is the number of eval samples in the COCO
    # dataset. In this case, the eval dataset takes
    # (batch_size_per_shard * num_shards - 5000) extra samples from the
    # original dataset, marks them as `is_padding`, and marks the original
    # data as `is_not_padding`. This ensures correctness of evaluation on
    # only the 5000 real samples.
    # Appends the dataset padding to the original dataset (only in PREDICT).
    if (self._mode == tf.estimator.ModeKeys.PREDICT and
        num_examples > params['eval_samples']):
      def _mark_is_padding(features):
        features[mask_rcnn_params.IS_PADDING] = tf.constant(
            True, dtype=tf.bool, shape=[1])
        return features

      def _mark_is_not_padding(features):
        features[mask_rcnn_params.IS_PADDING] = tf.constant(
            False, dtype=tf.bool, shape=[1])
        return features
      dataset_padding = dataset
      # Pad an equal number of horizontal and vertical images and interleave
      # them.
      pad_size = int(math.ceil(num_examples - params['eval_samples']))
      dataset_padding_hor = dataset_padding.filter(horizontal_image).map(
          _mark_is_padding).take(pad_size)
      dataset_padding_ver = dataset_padding.filter(vertical_image).map(
          _mark_is_padding).take(pad_size)
      interleaved_dataset_padding = tf.data.experimental.choose_from_datasets(
          [dataset_padding_hor, dataset_padding_ver],
          tf.data.Dataset.range(2).repeat(pad_size))
      if self._distributed_eval:
        dataset = dataset.map(_mark_is_not_padding).take(
            int(
                math.ceil(params['eval_samples'] /
                          params['dataset_num_shards'])))
      else:
        dataset = dataset.map(_mark_is_not_padding).take(params['eval_samples'])
      dataset = dataset.concatenate(interleaved_dataset_padding)

    def key_func(*args):
      return tf.cast(horizontal_image(*args), dtype=tf.int64)

    def reduce_func(unused_key, dataset):
      return dataset.batch(batch_size, drop_remainder=True)

    dataset = dataset.apply(
        tf.data.experimental.group_by_window(
            key_func=key_func,
            reduce_func=reduce_func,
            window_size=(params['batch_size'] *
                         params['replicas_per_worker'])))

    dataset = dataset.map(
        functools.partial(self._transform_images, params),
        num_parallel_calls=16)

    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    if (self._mode == tf.estimator.ModeKeys.TRAIN and
        num_examples > 0):
      dataset = dataset.take(num_examples)
    # Repeat the eval dataset to avoid re-initializing it on every epoch.
    if self._distributed_eval:
      dataset = dataset.take(
          int(num_examples / params['dataset_num_shards'] /
              params['batch_size'])).cache().repeat()
    if self._use_fake_data:
      # Turn this dataset into a semi-fake dataset that always loops over the
      # first batch. This reduces variance in performance and is useful for
      # testing.
      dataset = dataset.take(1).cache().repeat()

    options = tf.data.Options()
    options.experimental_threading.max_intra_op_parallelism = 1
    dataset = dataset.with_options(options)

    return dataset
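
Downstream evaluation is expected to drop results whose IS_PADDING flag is set before computing metrics. A minimal sketch of that filtering, assuming predictions arrive as a dictionary of stacked per-example arrays keyed consistently with the pipeline above (the key name is an assumption):

import numpy as np

def drop_padded_predictions(predictions, is_padding_key='is_padding'):
    # Keep only rows that are real eval samples rather than shape padding.
    keep = ~np.asarray(predictions[is_padding_key]).reshape(-1)
    return {k: np.asarray(v)[keep] for k, v in predictions.items()}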