Example #1
 def __init__(self, iou_loss_type, min_level, max_level, num_scales,
              aspect_ratios, anchor_scale, image_size, **kwargs):
     super().__init__(**kwargs)
     self.iou_loss_type = iou_loss_type
     self.input_anchors = anchors.Anchors(min_level, max_level, num_scales,
                                          aspect_ratios, anchor_scale,
                                          image_size)
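For orientation, here is a minimal, hedged sketch of constructing the anchor set this initializer wraps. It assumes the automl/efficientdet `anchors` module is importable, and the argument values are illustrative EfficientDet-D0-style defaults, not values from the source:

import anchors  # assumed: the automl/efficientdet anchors module

# Positional args: min_level, max_level, num_scales, aspect_ratios,
# anchor_scale, image_size (values are illustrative only).
input_anchors = anchors.Anchors(3, 7, 3, [1.0, 2.0, 0.5], 4.0, 512)
print(input_anchors.boxes.shape)  # (total_num_anchors, 4), as used below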
Example #2
    def __call__(self, params, input_context=None, batch_size=None):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder(
            include_mask='segmentation' in params['heads'],
            regenerate_source_id=params['regenerate_source_id'])

        batch_size = batch_size or params['batch_size']
        seed = params['tf_random_seed'] if self._debug else None
        dataset = tf.data.Dataset.list_files(self._file_pattern,
                                             shuffle=self._is_training,
                                             seed=seed)
        if input_context:
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)
        # Prefetch data from files.
        def _prefetch_dataset(filename):
            if params.get('dataset_type', None) == 'sstable':
                # SSTable handling is elided in this example; raise explicitly
                # rather than fall through with `dataset` unbound.
                raise ValueError('dataset_type "sstable" is not handled here.')
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.interleave(_prefetch_dataset,
                                     num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.with_options(self.dataset_options)
        if self._is_training:
            dataset = dataset.shuffle(64, seed=seed)

        # Parse the fetched records into input tensors for the model function.
        # pylint: disable=g-long-lambda
        if params.get('dataset_type', None) == 'sstable':
            map_fn = lambda key, value: self.dataset_parser(
                value, example_decoder, anchor_labeler, params)
        else:
            map_fn = lambda value: self.dataset_parser(value, example_decoder,
                                                       anchor_labeler, params)
        # pylint: enable=g-long-lambda
        dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.batch(batch_size,
                                drop_remainder=params['drop_remainder'])
        dataset = dataset.map(
            lambda *args: self.process_example(params, batch_size, *args))
        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        if self._is_training:
            dataset = dataset.repeat()
        if self._use_fake_data:
            # Turn this dataset into a semi-fake dataset that always loops
            # over the first batch. This reduces variance in performance and
            # is useful for testing.
            dataset = dataset.take(1).cache().repeat()
        return dataset
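A hedged sketch of driving this input pipeline. `InputReader` and the param keys are taken from the surrounding examples (the `(file_pattern, is_training)` signature mirrors the test in Example #5 below); the concrete values and file pattern are placeholders:

import dataloader  # assumed: the automl/efficientdet dataloader module

params = {
    'min_level': 3, 'max_level': 7, 'num_scales': 3,
    'aspect_ratios': [1.0, 2.0, 0.5], 'anchor_scale': 4.0, 'image_size': 512,
    'num_classes': 90, 'heads': ['object_detection'],
    'regenerate_source_id': False, 'batch_size': 8, 'drop_remainder': True,
    'tf_random_seed': 111111,
}
reader = dataloader.InputReader('/tmp/train-*.tfrecord', True)
dataset = reader(params)  # invokes __call__ above
images, labels = next(iter(dataset))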
Example #3
    def __call__(self, params):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder(
            include_mask='segmentation' in params['heads'],
            regenerate_source_id=params['regenerate_source_id'])

        batch_size = params['batch_size']
        dataset = tf.data.Dataset.list_files(self._file_pattern,
                                             shuffle=self._is_training)
        if self._is_training:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.interleave(
            _prefetch_dataset,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        options = tf.data.Options()
        options.experimental_deterministic = not self._is_training
        dataset = dataset.with_options(options)
        if self._is_training:
            dataset = dataset.shuffle(64)

        # Parse the fetched records into input tensors for the model function.
        dataset = dataset.map(
            lambda value: self.dataset_parser(  # pylint: disable=g-long-lambda
                value, example_decoder, anchor_labeler, params),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.map(
            lambda *args: self.process_example(params, batch_size, *args))
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        if self._use_fake_data:
            # Turn this dataset into a semi-fake dataset that always loops
            # over the first batch. This reduces variance in performance and
            # is useful for testing.
            dataset = dataset.take(1).cache().repeat()
        return dataset
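One detail worth isolating from the pipeline above: `experimental_deterministic` is disabled during training, letting interleave and map return elements out of order for throughput, and re-enabled at eval time for reproducibility. A minimal self-contained sketch of the same toggle:

import tensorflow as tf

options = tf.data.Options()
# Nondeterministic ordering trades reproducibility for speed; this mirrors
# `not self._is_training` above with is_training=True.
options.experimental_deterministic = False
dataset = tf.data.Dataset.range(10).with_options(options)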
Example #4
def tflite_pre_nms(params, cls_outputs, box_outputs):
    """Pre-NMS that is compatible with TFLite's custom NMS op.

  For details, see tensorflow/lite/kernels/detection_postprocess.cc

  Args:
    params: a dict of parameters.
    cls_outputs: a list of tensors for classes, each tensor denotes a level of
      logits with shape [1, H, W, num_class * num_anchors].
    box_outputs: a list of tensors for boxes, each tensor denotes a level of
      boxes with shape [1, H, W, 4 * num_anchors]. Each box format is [y_min,
      x_min, y_max, x_max].

  Returns:
    boxes: boxes encoded as {y_center, x_center, height, width}
    scores: scores converted from `cls_outputs` logits using sigmoid
    anchors: normalized anchors encoded as {y_center, x_center, height, width}
  """
    cls_outputs = to_list(cls_outputs)
    box_outputs = to_list(box_outputs)
    cls_outputs, box_outputs = merge_class_box_level_outputs(
        params, cls_outputs, box_outputs)
    eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                   params['num_scales'],
                                   params['aspect_ratios'],
                                   params['anchor_scale'],
                                   params['image_size'])

    # TODO(b/175166514): Consider computing Top-K boxes & anchors here. We don't
    # do this currently since the resultant graph does not support TFLite
    # delegates well. `topk_class_boxes` won't work as-is, since the outputs
    # will need to be modified appropriately for TFLite op's consumption.

    # TFLite's object detection APIs require normalized anchors.
    height, width = utils.parse_image_size(params['image_size'])
    normalize_factor = tf.constant([height, width, height, width],
                                   dtype=tf.float32)
    normalized_anchors = eval_anchors.boxes / normalize_factor
    decoded_anchors = anchors.decode_anchors_to_centersize(
        box_outputs, normalized_anchors)

    # convert logits to scores.
    scores = tf.math.sigmoid(cls_outputs)

    return box_outputs, scores, decoded_anchors
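For reference, a hedged sketch of the {y_center, x_center, height, width} encoding that TFLite's detection postprocess op expects. This is the standalone conversion math only, not the `decode_anchors_to_centersize` call itself:

import tensorflow as tf

def boxes_to_centersize(boxes):
    # boxes: [..., 4] in [y_min, x_min, y_max, x_max] order.
    ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=-1)
    return tf.stack([(ymin + ymax) / 2.0, (xmin + xmax) / 2.0,
                     ymax - ymin, xmax - xmin], axis=-1)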
Example #5
 def test_parser(self):
     tf.random.set_seed(111111)
     params = hparams_config.get_detection_config(
         'efficientdet-d0').as_dict()
     input_anchors = anchors.Anchors(params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
     anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                            params['num_classes'])
     example_decoder = tf_example_decoder.TfExampleDecoder(
         regenerate_source_id=params['regenerate_source_id'])
     tfrecord_path = self._make_fake_tfrecord()
     dataset = tf.data.TFRecordDataset([tfrecord_path])
     value = next(iter(dataset))
     reader = dataloader.InputReader(tfrecord_path, True)
     result = reader.dataset_parser(value, example_decoder, anchor_labeler,
                                    params)
     self.assertEqual(len(result), 10)
Example #6
def pre_nms(params, cls_outputs, box_outputs, topk=True):
    """Detection post processing before nms.

  It takes the multi-level class and box predictions from the network, merges
  them into unified tensors, and computes boxes, scores, and classes.

  Args:
    params: a dict of parameters.
    cls_outputs: a list of tensors for classes, each tensor denotes a level of
      logits with shape [N, H, W, num_class * num_anchors].
    box_outputs: a list of tensors for boxes, each tensor denotes a level of
      boxes with shape [N, H, W, 4 * num_anchors].
    topk: if True, select top-k boxes before NMS (mainly to speed up NMS).

  Returns:
    A tuple of (boxes, scores, classes).
  """
    # Get boxes by applying bounding box regression to anchors.
    eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                   params['num_scales'],
                                   params['aspect_ratios'],
                                   params['anchor_scale'],
                                   params['image_size'])

    cls_outputs, box_outputs = merge_class_box_level_outputs(
        params, cls_outputs, box_outputs)

    if topk:
        # Select top-k purely based on scores before NMS, in order to speed it up.
        cls_outputs, box_outputs, classes, indices = topk_class_boxes(
            params, cls_outputs, box_outputs)
        anchor_boxes = tf.gather(eval_anchors.boxes, indices)
    else:
        anchor_boxes = eval_anchors.boxes
        classes = None

    boxes = anchors.decode_box_outputs(box_outputs, anchor_boxes)
    # convert logits to scores.
    scores = tf.math.sigmoid(cls_outputs)
    return boxes, scores, classes
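What typically follows `pre_nms` is per-image NMS over the decoded boxes. A hedged, class-agnostic sketch using stock TensorFlow ops; the thresholds are illustrative, and the real pipeline uses its own postprocess helpers:

import tensorflow as tf

def simple_nms(boxes, scores, max_outputs=100, iou_thresh=0.5,
               score_thresh=0.4):
    # boxes: [num_boxes, 4] for a single image; scores: [num_boxes,
    # num_classes] sigmoid outputs. Reduce to the best class per box.
    per_box_scores = tf.reduce_max(scores, axis=-1)
    keep = tf.image.non_max_suppression(
        boxes, per_box_scores, max_outputs,
        iou_threshold=iou_thresh, score_threshold=score_thresh)
    return tf.gather(boxes, keep), tf.gather(per_box_scores, keep)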
Example #7
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

  Computes total detection loss including box and class loss from all levels.
  Args:
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4].
    labels: the dictionary returned from the dataloader that includes
      groundtruth targets.
    params: the dictionary including training parameters specified in the
      default_hparams function in this file.

  Returns:
    total_loss: a float tensor representing the total loss reduced from
      class and box losses from all levels.
    cls_loss: a float tensor representing the total class loss.
    box_loss: a float tensor representing the total box regression loss.
    box_iou_loss: a float tensor representing the total box IoU loss.
  """
    # Sum all positives in a batch for normalization, and avoid a zero
    # num_positives_sum, which would lead to inf loss during training.
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    positives_momentum = params.get('positives_momentum', None) or 0
    if positives_momentum > 0:
        # normalize the num_positive_examples for training stability.
        moving_normalizer_var = tf.Variable(
            0.0,
            name='moving_normalizer',
            dtype=tf.float32,
            synchronization=tf.VariableSynchronization.ON_READ,
            trainable=False,
            aggregation=tf.VariableAggregation.MEAN)
        num_positives_sum = tf.keras.backend.moving_average_update(
            moving_normalizer_var,
            num_positives_sum,
            momentum=params['positives_momentum'])
    elif positives_momentum < 0:
        num_positives_sum = utils.cross_replica_mean(num_positives_sum)

    levels = cls_outputs.keys()
    cls_losses = []
    box_losses = []
    for level in levels:
        # Onehot encoding for classification labels.
        cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level],
                                          params['num_classes'])

        if params['data_format'] == 'channels_first':
            bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list()
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list()
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]

        cls_loss = focal_loss(cls_outputs[level],
                              cls_targets_at_level,
                              params['alpha'],
                              params['gamma'],
                              normalizer=num_positives_sum,
                              label_smoothing=params['label_smoothing'])

        if params['data_format'] == 'channels_first':
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])
        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2),
                           -1), tf.float32)
        cls_losses.append(tf.clip_by_value(tf.reduce_sum(cls_loss), 0.0, 2.0))

        if params['box_loss_weight']:
            box_losses.append(
                _box_loss(box_outputs[level],
                          box_targets_at_level,
                          num_positives_sum,
                          delta=params['delta']))

    if params['iou_loss_type']:
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        box_output_list = [tf.reshape(box_outputs[i], [-1, 4]) for i in levels]
        box_outputs = tf.concat(box_output_list, axis=0)
        box_target_list = [
            tf.reshape(labels['box_targets_%d' % level], [-1, 4])
            for level in levels
        ]
        box_targets = tf.concat(box_target_list, axis=0)
        anchor_boxes = tf.tile(input_anchors.boxes, [params['batch_size'], 1])
        box_outputs = anchors.decode_box_outputs(box_outputs, anchor_boxes)
        box_targets = anchors.decode_box_outputs(box_targets, anchor_boxes)
        box_iou_loss = _box_iou_loss(box_outputs, box_targets,
                                     num_positives_sum,
                                     params['iou_loss_type'])

    else:
        box_iou_loss = 0

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses) if box_losses else 0

    total_loss = (cls_loss + params['box_loss_weight'] * box_loss +
                  params['iou_loss_weight'] * box_iou_loss)

    return total_loss, cls_loss, box_loss, box_iou_loss
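The class loss above delegates to `focal_loss`. As a reference, a hedged sketch of the standard sigmoid focal loss (Lin et al.) that such a function conventionally computes, here without the label-smoothing argument:

import tensorflow as tf

def focal_loss_sketch(logits, targets, alpha=0.25, gamma=1.5,
                      normalizer=1.0):
    # targets are one-hot floats, as built above.
    # FL = -alpha_t * (1 - p_t)**gamma * log(p_t), per element.
    ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
    pred_prob = tf.sigmoid(logits)
    p_t = targets * pred_prob + (1 - targets) * (1 - pred_prob)
    alpha_t = targets * alpha + (1 - targets) * (1 - alpha)
    return alpha_t * tf.pow(1.0 - p_t, gamma) * ce / normalizer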
Example #8
    def __call__(self, params):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder(
            regenerate_source_id=params['regenerate_source_id'])

        @tf.autograph.experimental.do_not_convert
        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary containing an image and groundtruth annotations.

      Returns:
        image: Image tensor that is preprocessed to have normalized values and
          a fixed dimension [image_height, image_width, 3]
        cls_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of class logits at the l-th level.
        box_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          the l-th level.
        num_positives: Number of positive anchors in the image.
        source_id: Source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        image_scale: Scale of the processed image to the original image.
        boxes: Groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
          dimension [self._max_instances_per_image, 4].
        is_crowds: Groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tensor is
          padded with 0 to the fixed dimension [self._max_instances_per_image].
        areas: Groundtruth areas annotations. The tensor is padded with -1
          to the fixed dimension [self._max_instances_per_image].
        classes: Groundtruth classes annotations. The tensor is padded with -1
          to the fixed dimension [self._max_instances_per_image].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                source_id = data['source_id']
                image = data['image']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                areas = data['groundtruth_area']
                is_crowds = data['groundtruth_is_crowd']

                if params['skip_crowd_during_training'] and self._is_training:
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)

                # NOTE: The autoaugment method works best when used alongside the
                # standard horizontal flipping of images along with size jittering
                # and normalization.
                if params.get('autoaugment_policy',
                              None) and self._is_training:
                    from aug import autoaugment  # pylint: disable=g-import-not-at-top
                    image, boxes = autoaugment.distort_image_with_autoaugment(
                        image, boxes, params['autoaugment_policy'],
                        params['use_augmix'], *params['augmix_params'])

                input_processor = DetectionInputProcessor(
                    image, params['image_size'], boxes, classes)
                input_processor.normalize_image()
                if self._is_training and params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()
                if self._is_training:
                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'],
                        params.get('target_size', None))
                else:
                    input_processor.set_scale_factors_to_output_size()
                image = input_processor.resize_and_crop_image()
                boxes, classes = input_processor.resize_and_crop_boxes()

                # Assign anchors.
                (cls_targets, box_targets,
                 num_positives) = anchor_labeler.label_anchors(boxes, classes)

                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.strings.to_number(source_id)

                # Pad groundtruth data for evaluation.
                image_scale = input_processor.image_scale_to_original
                boxes *= image_scale
                is_crowds = tf.cast(is_crowds, dtype=tf.float32)
                boxes = pad_to_fixed_size(boxes, -1,
                                          [self._max_instances_per_image, 4])
                is_crowds = pad_to_fixed_size(
                    is_crowds, 0, [self._max_instances_per_image, 1])
                areas = pad_to_fixed_size(areas, -1,
                                          [self._max_instances_per_image, 1])
                classes = pad_to_fixed_size(classes, -1,
                                            [self._max_instances_per_image, 1])
                return (image, cls_targets, box_targets, num_positives,
                        source_id, image_scale, boxes, is_crowds, areas,
                        classes)

        batch_size = params['batch_size']
        dataset = tf.data.Dataset.list_files(self._file_pattern,
                                             shuffle=self._is_training)
        if self._is_training:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            tf.data.experimental.parallel_interleave(_prefetch_dataset,
                                                     cycle_length=32,
                                                     sloppy=self._is_training))
        if self._is_training:
            dataset = dataset.shuffle(64)

        # Parse the fetched records into input tensors for the model function.
        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        @tf.autograph.experimental.do_not_convert
        def _process_example(images, cls_targets, box_targets, num_positives,
                             source_ids, image_scales, boxes, is_crowds, areas,
                             classes):
            """Processes one batch of data."""
            labels = {}
            # Average num_positives over the batch.
            num_positives_batch = tf.reduce_mean(num_positives)
            labels['mean_num_positives'] = tf.reshape(
                tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
                [batch_size, 1])

            if params['data_format'] == 'channels_first':
                images = tf.transpose(images, [0, 3, 1, 2])

            for level in range(params['min_level'], params['max_level'] + 1):
                labels['cls_targets_%d' % level] = cls_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
                if params['data_format'] == 'channels_first':
                    labels['cls_targets_%d' % level] = tf.transpose(
                        labels['cls_targets_%d' % level], [0, 3, 1, 2])
                    labels['box_targets_%d' % level] = tf.transpose(
                        labels['box_targets_%d' % level], [0, 3, 1, 2])
            # Concatenate groundtruth annotations to a tensor.
            groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                         axis=2)
            labels['source_ids'] = source_ids
            labels['groundtruth_data'] = groundtruth_data
            labels['image_scales'] = image_scales
            return images, labels

        dataset = dataset.map(_process_example)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        if self._use_fake_data:
            # Turn this dataset into a semi-fake dataset that always loops
            # over the first batch. This reduces variance in performance and
            # is useful for testing.
            dataset = dataset.take(1).cache().repeat()
        return dataset
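The parser above pads variable-length groundtruth with `pad_to_fixed_size`. A hedged sketch of what such a helper conventionally does, namely clip to the row budget and then pad with a fill value:

import tensorflow as tf

def pad_to_fixed_size_sketch(data, pad_value, output_shape):
    # output_shape: [max_rows, num_cols]; data is reshaped to num_cols columns.
    max_rows, num_cols = output_shape
    data = tf.reshape(tf.cast(data, tf.float32), [-1, num_cols])[:max_rows]
    pad_rows = max_rows - tf.shape(data)[0]
    return tf.pad(data, [[0, pad_rows], [0, 0]], constant_values=pad_value)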
Example #9
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

  Computes total detection loss including box and class loss from all levels.
  Args:
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4].
    labels: the dictionary returned from the dataloader that includes
      groundtruth targets.
    params: the dictionary including training parameters specified in the
      default_hparams function in this file.

  Returns:
    total_loss: a float tensor representing the total loss reduced from
      class and box losses from all levels.
    cls_loss: a float tensor representing the total class loss.
    box_loss: a float tensor representing the total box regression loss.
    box_iou_loss: a float tensor representing the total box IoU loss.
  """
    # Sum all positives in a batch for normalization, and avoid a zero
    # num_positives_sum, which would lead to inf loss during training.
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    levels = cls_outputs.keys()

    cls_losses = []
    box_losses = []

    for level in levels:
        # Onehot encoding for classification labels.
        cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level],
                                          params['num_classes'])

        if params['data_format'] == 'channels_first':
            bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list()
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list()
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]

        cls_loss = focal_loss(cls_outputs[level],
                              cls_targets_at_level,
                              params['alpha'],
                              params['gamma'],
                              normalizer=num_positives_sum,
                              label_smoothing=params['label_smoothing'])

        if params['data_format'] == 'channels_first':
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])
        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2),
                           -1), tf.float32)
        cls_losses.append(tf.reduce_sum(cls_loss))

        if params['box_loss_weight']:
            box_losses.append(
                _box_loss(box_outputs[level],
                          box_targets_at_level,
                          num_positives_sum,
                          delta=params['delta']))

    if params['iou_loss_type']:
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        box_coder = FasterRcnnBoxCoder()
        input_anchors = BoxList(
            tf.tile(input_anchors.boxes, [params['batch_size'], 1]))
        box_outputs = tf.concat(
            [tf.reshape(v, [-1, 4]) for v in list(box_outputs.values())],
            axis=0)
        box_targets = tf.concat([
            tf.reshape(labels['box_targets_%d' % level], [-1, 4])
            for level in levels
        ],
                                axis=0)
        box_outputs = box_coder.decode(box_outputs, input_anchors)
        box_targets = box_coder.decode(box_targets, input_anchors)
        box_iou_loss = _box_iou_loss(box_outputs.data['boxes'],
                                     box_targets.data['boxes'],
                                     num_positives_sum,
                                     params['iou_loss_type'])
    else:
        box_iou_loss = 0

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses) if box_losses else 0

    total_loss = (cls_loss + params['box_loss_weight'] * box_loss +
                  params['iou_loss_weight'] * box_iou_loss)

    return total_loss, cls_loss, box_loss, box_iou_loss
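Example #9 decodes with `FasterRcnnBoxCoder` instead of `anchors.decode_box_outputs`. For reference, a hedged sketch of the classic Faster R-CNN decoding math such a coder implements (unit scale factors assumed; this is not the library call itself):

import tensorflow as tf

def decode_boxes_sketch(rel_codes, anchor_boxes):
    # rel_codes: [N, 4] as [ty, tx, th, tw]; anchor_boxes: [N, 4] as
    # [y_min, x_min, y_max, x_max].
    ymin, xmin, ymax, xmax = tf.unstack(anchor_boxes, axis=-1)
    ha, wa = ymax - ymin, xmax - xmin
    ycenter_a, xcenter_a = ymin + 0.5 * ha, xmin + 0.5 * wa
    ty, tx, th, tw = tf.unstack(rel_codes, axis=-1)
    h, w = tf.exp(th) * ha, tf.exp(tw) * wa
    ycenter, xcenter = ty * ha + ycenter_a, tx * wa + xcenter_a
    return tf.stack([ycenter - 0.5 * h, xcenter - 0.5 * w,
                     ycenter + 0.5 * h, xcenter + 0.5 * w], axis=-1)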
Example #10
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

  Computes total detection loss including box and class loss from all levels.
  Args:
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4].
    labels: the dictionary returned from the dataloader that includes
      groundtruth targets.
    params: the dictionary including training parameters specified in the
      default_hparams function in this file.

  Returns:
    total_loss: a float tensor representing the total loss reduced from
      class and box losses from all levels.
    cls_loss: a float tensor representing the total class loss.
    box_loss: a float tensor representing the total box regression loss.
    box_iou_loss: a float tensor representing the total box IoU loss.
  """
    # Sum all positives in a batch for normalization, and avoid a zero
    # num_positives_sum, which would lead to inf loss during training.
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    levels = cls_outputs.keys()

    cls_losses = []
    box_losses = []
    sumrule = {}
    if params.get('sumrule'):
        sumrule = params['sumrule']
        # The dataloader applies `cls_targets -= 1` (the background class
        # becomes -1 and real classes start from 0), so shift the sumrule
        # indices by 1 as well.
        _sumrule = {}
        for k, v in sumrule.items():
            _sumrule[k - 1] = [vv - 1 for vv in v]
        sumrule = _sumrule

    def table_lookup(values, old_onehot, cls_targets_at_level):
        for val in values:
            if sumrule.get(val):
                new_val = sumrule[val]
                # With sigmoid cross entropy each child target is 0.5; with
                # softmax it should be 1.0 / len(new_val) instead.
                prob = 0.5
                if len(new_val) == 1:
                    # Leaf node: target probability 1.0.
                    prob = 1.0
                _matching_onehot = old_onehot[np.where(
                    cls_targets_at_level == val)]
                _matching_onehot[:, new_val] = prob
                _matching_onehot[:, val] = 0
                old_onehot[np.where(
                    cls_targets_at_level == val)] = _matching_onehot
        return old_onehot

    for level in levels:
        # Onehot encoding for classification labels.
        _cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level],
                                           params['num_classes'])
        if params.get('sumrule'):
            unique_labels, _ = tf.unique(
                tf.reshape(labels['cls_targets_%d' % level], [-1]))
            # Refine one-hot labels so that each label maps to its finest leaves.
            cls_targets_at_level = tf.numpy_function(
                table_lookup, [
                    unique_labels, _cls_targets_at_level,
                    labels['cls_targets_%d' % level]
                ], _cls_targets_at_level.dtype)
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              _cls_targets_at_level.shape)
        else:
            cls_targets_at_level = _cls_targets_at_level

        if params['data_format'] == 'channels_first':
            bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list()
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list()
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]

        cls_loss = focal_loss(cls_outputs[level],
                              cls_targets_at_level,
                              params['alpha'],
                              params['gamma'],
                              normalizer=num_positives_sum,
                              label_smoothing=params['label_smoothing'])

        if params['data_format'] == 'channels_first':
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])
        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2),
                           -1), tf.float32)
        cls_losses.append(tf.reduce_sum(cls_loss))

        if params['box_loss_weight']:
            box_losses.append(
                _box_loss(box_outputs[level],
                          box_targets_at_level,
                          num_positives_sum,
                          delta=params['delta']))

    if params['iou_loss_type']:
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        box_output_list = [tf.reshape(box_outputs[i], [-1, 4]) for i in levels]
        box_outputs = tf.concat(box_output_list, axis=0)
        box_target_list = [
            tf.reshape(labels['box_targets_%d' % level], [-1, 4])
            for level in levels
        ]
        box_targets = tf.concat(box_target_list, axis=0)
        anchor_boxes = tf.tile(input_anchors.boxes, [params['batch_size'], 1])
        box_outputs = anchors.decode_box_outputs(box_outputs, anchor_boxes)
        box_targets = anchors.decode_box_outputs(box_targets, anchor_boxes)
        box_iou_loss = _box_iou_loss(box_outputs, box_targets,
                                     num_positives_sum,
                                     params['iou_loss_type'])

    else:
        box_iou_loss = 0

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses) if box_losses else 0

    total_loss = (cls_loss + params['box_loss_weight'] * box_loss +
                  params['iou_loss_weight'] * box_iou_loss)

    return total_loss, cls_loss, box_loss, box_iou_loss
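To make the `sumrule` expansion above concrete, here is a hedged, self-contained toy example of redistributing a parent class's one-hot mass to its leaf classes (the labels and hierarchy are invented for illustration):

import numpy as np

# Toy hierarchy, already 0-based as after the -1 shift above:
# class 0 ('animal') expands to leaves 1 ('cat') and 2 ('dog').
sumrule = {0: [1, 2]}
cls_targets = np.array([0, 1, 2])
onehot = np.eye(3, dtype=np.float32)[cls_targets]
for val, leaves in sumrule.items():
    prob = 1.0 if len(leaves) == 1 else 0.5  # sigmoid-style targets, as above
    rows = np.where(cls_targets == val)[0]
    onehot[rows[:, None], leaves] = prob
    onehot[rows, val] = 0.0
print(onehot)
# [[0.  0.5 0.5]   <- 'animal' mass moved onto both leaves
#  [0.  1.  0. ]
#  [0.  0.  1. ]]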