Code example #1
def graph_fn():
    # Two examples with two keypoints each, in normalized (y, x) coordinates.
    keypoints = tf.constant([[[0.25, 0.5], [0.75, 0.75]],
                             [[0.5, 0.0], [1.0, 1.0]]])
    # The window is [ymin, xmin, ymax, xmax], also in normalized coordinates.
    window = tf.constant([0.25, 0.25, 0.75, 0.75])

    expected_keypoints = tf.constant([[[0, 0.5], [1.0, 1.0]],
                                      [[0.5, -0.5], [1.5, 1.5]]])
    output = keypoint_ops.change_coordinate_frame(keypoints, window)
    return output, expected_keypoints
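For reference, change_coordinate_frame re-expresses each normalized keypoint (y, x) relative to the window [ymin, xmin, ymax, xmax] as ((y - ymin) / (ymax - ymin), (x - xmin) / (xmax - xmin)). A minimal NumPy sketch of that arithmetic (change_coordinate_frame_np is a hypothetical helper, not the library function):

import numpy as np

def change_coordinate_frame_np(keypoints, window):
    # keypoints: array of (y, x) pairs in normalized image coordinates.
    # window: [ymin, xmin, ymax, xmax] in the same coordinates.
    ymin, xmin, ymax, xmax = window
    return (keypoints - [ymin, xmin]) / [ymax - ymin, xmax - xmin]

# Reproduces the first expected row above: [0.25, 0.5] -> [0.0, 0.5] and
# [0.75, 0.75] -> [1.0, 1.0] for the window [0.25, 0.25, 0.75, 0.75].
print(change_coordinate_frame_np(
    np.array([[0.25, 0.5], [0.75, 0.75]]),
    np.array([0.25, 0.25, 0.75, 0.75])))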
Code example #2
def _scale_keypoints_to_normalized_true_image(args):
  """Scale the keypoint coordinates to be relative to the true image shape."""
  keypoints, true_image_shape = args
  true_image_shape = tf.cast(true_image_shape, tf.float32)
  true_height, true_width = true_image_shape[0], true_image_shape[1]
  # image_height and image_width (the padded input dimensions) are captured
  # from the enclosing scope.
  normalized_window = tf.stack(
      [0.0, 0.0, true_height / image_height, true_width / image_width])
  return keypoint_ops.change_coordinate_frame(keypoints, normalized_window)
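This helper is meant to be mapped over a batch, one example at a time. A hedged usage sketch, assuming batched_keypoints has shape [batch, num_instances, num_keypoints, 2] and true_image_shapes has shape [batch, 3] (both names are illustrative, not from the original file):

# Hedged usage sketch; see the assumptions in the paragraph above.
scaled_keypoints = tf.map_fn(
    _scale_keypoints_to_normalized_true_image,
    elems=[batched_keypoints, true_image_shapes],
    dtype=tf.float32)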
Code example #3
    def test_change_coordinate_frame(self):
        keypoints = tf.constant([[[0.25, 0.5], [0.75, 0.75]],
                                 [[0.5, 0.0], [1.0, 1.0]]])
        window = tf.constant([0.25, 0.25, 0.75, 0.75])

        expected_keypoints = tf.constant([[[0, 0.5], [1.0, 1.0]],
                                          [[0.5, -0.5], [1.5, 1.5]]])
        output = keypoint_ops.change_coordinate_frame(keypoints, window)

        with self.test_session() as sess:
            output_, expected_keypoints_ = sess.run(
                [output, expected_keypoints])
            self.assertAllClose(output_, expected_keypoints_)
Code example #4
File: keypoint_ops_test.py Project: ALISCIFP/models
  def test_change_coordinate_frame(self):
    keypoints = tf.constant([
        [[0.25, 0.5], [0.75, 0.75]],
        [[0.5, 0.0], [1.0, 1.0]]
    ])
    window = tf.constant([0.25, 0.25, 0.75, 0.75])

    expected_keypoints = tf.constant([
        [[0, 0.5], [1.0, 1.0]],
        [[0.5, -0.5], [1.5, 1.5]]
    ])
    output = keypoint_ops.change_coordinate_frame(keypoints, window)

    with self.test_session() as sess:
      output_, expected_keypoints_ = sess.run([output, expected_keypoints])
      self.assertAllClose(output_, expected_keypoints_)
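Note that the expected values include -0.5 and 1.5: change_coordinate_frame does not clip, so keypoints outside the window simply land outside [0, 1] in the new frame. When such keypoints must be discarded, keypoint_ops also provides prune_outside_window, which replaces them with NaN (example #6 below uses it after cropping). A short sketch:

# NaN-out keypoints that left the unit window after the frame change.
pruned = keypoint_ops.prune_outside_window(output, [0.0, 0.0, 1.0, 1.0])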
Code example #5
File: inputs.py Project: yangyongjx/lpcvc
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_multiclass_scores=False,
                         use_bfloat16=False,
                         retain_original_image_additional_channels=False):
    """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  5. one_hot_encoding: applied to classes tensor in tensor_dict.
  6. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D preprocess
      float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along with
      the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth boxes
      and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.
    use_multiclass_scores: whether to use multiclass scores as class targets
      instead of one-hot encoding of `groundtruth_classes`. When
      this is True and multiclass_scores is empty, one-hot encoding of
      `groundtruth_classes` is used as a fallback.
    use_bfloat16: (optional) a bool, whether to use bfloat16 in training.
    retain_original_image_additional_channels: (optional) Whether to retain
      original image additional channels in the output dictionary.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors obtained
    after applying all the transformations.
  """
    out_tensor_dict = tensor_dict.copy()
    if fields.InputDataFields.multiclass_scores in out_tensor_dict:
        out_tensor_dict[
            fields.InputDataFields.
            multiclass_scores] = _multiclass_scores_or_one_hot_labels(
                out_tensor_dict[fields.InputDataFields.multiclass_scores],
                out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
                out_tensor_dict[fields.InputDataFields.groundtruth_classes],
                num_classes)

    if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
        out_tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
            out_tensor_dict)
        out_tensor_dict = util_ops.filter_unrecognized_classes(out_tensor_dict)

    if retain_original_image:
        out_tensor_dict[fields.InputDataFields.original_image] = tf.cast(
            image_resizer_fn(out_tensor_dict[fields.InputDataFields.image],
                             None)[0], tf.uint8)

    if fields.InputDataFields.image_additional_channels in out_tensor_dict:
        channels = out_tensor_dict[
            fields.InputDataFields.image_additional_channels]
        out_tensor_dict[fields.InputDataFields.image] = tf.concat(
            [out_tensor_dict[fields.InputDataFields.image], channels], axis=2)
        if retain_original_image_additional_channels:
            out_tensor_dict[
                fields.InputDataFields.image_additional_channels] = tf.cast(
                    image_resizer_fn(channels, None)[0], tf.uint8)

    # Apply data augmentation ops.
    if data_augmentation_fn is not None:
        out_tensor_dict = data_augmentation_fn(out_tensor_dict)

    # Apply model preprocessing ops and resize instance masks.
    image = out_tensor_dict[fields.InputDataFields.image]
    preprocessed_resized_image, true_image_shape = model_preprocess_fn(
        tf.expand_dims(tf.cast(image, dtype=tf.float32), axis=0))

    preprocessed_shape = tf.shape(preprocessed_resized_image)
    new_height, new_width = preprocessed_shape[1], preprocessed_shape[2]

    im_box = tf.stack([
        0.0, 0.0,
        tf.to_float(new_height) / tf.to_float(true_image_shape[0, 0]),
        tf.to_float(new_width) / tf.to_float(true_image_shape[0, 1])
    ])

    if fields.InputDataFields.groundtruth_boxes in tensor_dict:
        bboxes = out_tensor_dict[fields.InputDataFields.groundtruth_boxes]
        boxlist = box_list.BoxList(bboxes)
        realigned_bboxes = box_list_ops.change_coordinate_frame(
            boxlist, im_box)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_boxes] = realigned_bboxes.get()

    if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
        keypoints = out_tensor_dict[
            fields.InputDataFields.groundtruth_keypoints]
        realigned_keypoints = keypoint_ops.change_coordinate_frame(
            keypoints, im_box)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_keypoints] = realigned_keypoints

    if use_bfloat16:
        preprocessed_resized_image = tf.cast(preprocessed_resized_image,
                                             tf.bfloat16)
    out_tensor_dict[fields.InputDataFields.image] = tf.squeeze(
        preprocessed_resized_image, axis=0)
    out_tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
        true_image_shape, axis=0)
    if fields.InputDataFields.groundtruth_instance_masks in out_tensor_dict:
        masks = out_tensor_dict[
            fields.InputDataFields.groundtruth_instance_masks]
        _, resized_masks, _ = image_resizer_fn(image, masks)
        if use_bfloat16:
            resized_masks = tf.cast(resized_masks, tf.bfloat16)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_instance_masks] = resized_masks

    label_offset = 1
    zero_indexed_groundtruth_classes = out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] - label_offset
    if use_multiclass_scores:
        out_tensor_dict[
            fields.InputDataFields.groundtruth_classes] = out_tensor_dict[
                fields.InputDataFields.multiclass_scores]
    else:
        out_tensor_dict[
            fields.InputDataFields.groundtruth_classes] = tf.one_hot(
                zero_indexed_groundtruth_classes, num_classes)
    out_tensor_dict.pop(fields.InputDataFields.multiclass_scores, None)

    if fields.InputDataFields.groundtruth_confidences in out_tensor_dict:
        groundtruth_confidences = out_tensor_dict[
            fields.InputDataFields.groundtruth_confidences]
        # Map the confidences to the one-hot encoding of classes
        out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
            tf.reshape(groundtruth_confidences, [-1, 1]) *
            out_tensor_dict[fields.InputDataFields.groundtruth_classes])
    else:
        groundtruth_confidences = tf.ones_like(
            zero_indexed_groundtruth_classes, dtype=tf.float32)
        out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
            out_tensor_dict[fields.InputDataFields.groundtruth_classes])

    if merge_multiple_boxes:
        merged_boxes, merged_classes, merged_confidences, _ = (
            util_ops.merge_boxes_with_multiple_labels(
                out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
                zero_indexed_groundtruth_classes, groundtruth_confidences,
                num_classes))
        merged_classes = tf.cast(merged_classes, tf.float32)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_boxes] = merged_boxes
        out_tensor_dict[
            fields.InputDataFields.groundtruth_classes] = merged_classes
        out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
            merged_confidences)
    if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
        out_tensor_dict[
            fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
                out_tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

    return out_tensor_dict
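The im_box computed above drives the realignment: groundtruth coordinates are normalized with respect to the true (unpadded) image content, which after resizing and padding occupies only the top-left true_height/new_height by true_width/new_width fraction of the canvas, so dividing by the window [0, 0, new_height/true_height, new_width/true_width] rescales them onto the padded canvas. A small numeric check with made-up shapes:

# Illustrative numbers only: a 480x640 image padded to a 640x640 canvas.
# A box ymax of 1.0 (bottom of the true content) should map to
# 480 / 640 = 0.75 on the padded canvas.
true_h, true_w, new_h, new_w = 480.0, 640.0, 640.0, 640.0
im_box = [0.0, 0.0, new_h / true_h, new_w / true_w]  # [0, 0, 1.333..., 1.0]
ymax_padded = (1.0 - im_box[0]) / (im_box[2] - im_box[0])  # -> 0.75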
Code example #6
File: rscps.py Project: Kevinkald/asset_tracking
def random_scale_crop_and_pad_to_square(
        image,
        label_weights=None,
        masks=None,
        boxes=None,
        keypoints=None,
        label_confidences=None,
        scale_min=0.1,
        scale_max=2.0,
        output_size=512,
        resize_method=tf.image.ResizeMethod.BILINEAR,
        seed=None):
    """Randomly scale, crop, and then pad an image to fixed square dimensions.

  Randomly scale, crop, and then pad an image to the desired square output
  dimensions. Specifically, this method first samples a random_scale factor
  from a uniform distribution between scale_min and scale_max, and then resizes
  the image such that its maximum dimension is (output_size * random_scale).
  Second, a square output_size crop is extracted from the resized image
  (note, this will only occur when random_scale > 1.0). Lastly, the cropped
  region is padded to the desired square output_size by filling with zeros.
  The augmentation is borrowed from [1].
  [1]: https://arxiv.org/abs/1911.09070

  Args:
    image: rank 3 float32 tensor containing 1 image ->
      [height, width, channels].
    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. Boxes
      are in normalized form meaning their coordinates vary between [0, 1]. Each
      row is in the form of [ymin, xmin, ymax, xmax]. Boxes on the crop boundary
      are clipped to the boundary and boxes falling outside the crop are
      ignored.
    label_weights: float32 tensor of shape [num_instances] representing the
      weight for each box.
    masks: (optional) rank 3 float32 tensor with shape [num_instances, height,
      width] containing instance masks. The masks are of the same height, width
      as the input `image`.
    keypoints: (optional) rank 3 float32 tensor with shape [num_instances,
      num_keypoints, 2]. The keypoints are in y-x normalized coordinates.
    label_confidences: (optional) float32 tensor of shape [num_instance]
      representing the confidence for each box.
    scale_min: float, the minimum value for the random scale factor.
    scale_max: float, the maximum value for the random scale factor.
    output_size: int, the desired (square) output image size.
    resize_method: tf.image.ResizeMethod, resize method to use when scaling the
      input images.
    seed: random seed.

  Returns:
    image: image which is the same rank as input image.
    boxes: boxes which is the same rank as input boxes.
           Boxes are in normalized form.
    label_weights: rank 1 float32 tensor with shape [num_instances].
    masks: rank 3 float32 tensor with shape [num_instances, height, width]
           containing instance masks.
    keypoints: rank 3 float32 tensor with shape [num_instances, num_keypoints,
           2], in the normalized coordinate frame of the output image.
    label_confidences: confidences for retained boxes.
  """
    img_shape = tf.shape(image)
    input_height, input_width = img_shape[0], img_shape[1]
    random_scale = tf.random_uniform([], scale_min, scale_max, seed=seed)

    # Compute the scaled height and width from the random scale.
    max_input_dim = tf.cast(tf.maximum(input_height, input_width), tf.float32)
    input_ar_y = tf.cast(input_height, tf.float32) / max_input_dim
    input_ar_x = tf.cast(input_width, tf.float32) / max_input_dim
    scaled_height = tf.cast(random_scale * output_size * input_ar_y, tf.int32)
    scaled_width = tf.cast(random_scale * output_size * input_ar_x, tf.int32)

    # Compute the offsets:
    offset_y = tf.cast(scaled_height - output_size, tf.float32)
    offset_x = tf.cast(scaled_width - output_size, tf.float32)
    offset_y = tf.maximum(0.0, offset_y) * tf.random_uniform(
        [], 0, 1, seed=seed)
    offset_x = tf.maximum(0.0, offset_x) * tf.random_uniform(
        [], 0, 1, seed=seed)
    offset_y = tf.cast(offset_y, tf.int32)
    offset_x = tf.cast(offset_x, tf.int32)

    # Scale, crop, and pad the input image.
    scaled_image = tf.image.resize_images(image, [scaled_height, scaled_width],
                                          method=resize_method)
    scaled_image = scaled_image[offset_y:offset_y + output_size,
                                offset_x:offset_x + output_size, :]
    output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
                                                output_size, output_size)

    # Update the boxes.
    new_window = tf.cast(tf.stack(
        [offset_y, offset_x, offset_y + output_size, offset_x + output_size]),
                         dtype=tf.float32)
    new_window /= tf.cast(tf.stack(
        [scaled_height, scaled_width, scaled_height, scaled_width]),
                          dtype=tf.float32)
    # `boxes` is required here: the pruning step produces the `indices` used
    # to gather the masks, keypoints, and confidences below.
    boxlist = box_list.BoxList(boxes)
    boxlist = box_list_ops.change_coordinate_frame(boxlist, new_window)
    boxlist, indices = box_list_ops.prune_completely_outside_window(
        boxlist, [0.0, 0.0, 1.0, 1.0])
    boxlist = box_list_ops.clip_to_window(
        boxlist, [0.0, 0.0, 1.0, 1.0], filter_nonoverlapping=False)

    # `return_values` must be a list so the optional outputs can be appended.
    return_values = [output_image, boxlist.get()]
    if label_weights is not None:
        return_values.append(tf.gather(label_weights, indices))

    if masks is not None:
        new_masks = tf.expand_dims(masks, -1)
        new_masks = tf.image.resize_images(new_masks,
                                           [scaled_height, scaled_width],
                                           method=resize_method)
        new_masks = new_masks[:, offset_y:offset_y + output_size,
                              offset_x:offset_x + output_size, :]
        new_masks = tf.image.pad_to_bounding_box(new_masks, 0, 0, output_size,
                                                 output_size)
        new_masks = tf.squeeze(new_masks, [-1])
        return_values.append(tf.gather(new_masks, indices))

    if keypoints is not None:
        keypoints = tf.gather(keypoints, indices)
        keypoints = keypoint_ops.change_coordinate_frame(keypoints, new_window)
        keypoints = keypoint_ops.prune_outside_window(keypoints,
                                                      [0.0, 0.0, 1.0, 1.0])
        return_values.append(keypoints)

    if label_confidences is not None:
        return_values.append(tf.gather(label_confidences, indices))

    return return_values
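Here new_window expresses the square crop in the resized image's normalized coordinates, so change_coordinate_frame maps keypoints into the crop frame and prune_outside_window NaN-fills whatever the crop discarded. A numeric sketch of the window arithmetic with made-up values:

# Illustrative values: output_size=512, random_scale=2.0 on a square input,
# so scaled_height = scaled_width = 1024, with offsets sampled as (256, 0).
offset_y, offset_x, output_size = 256.0, 0.0, 512.0
scaled_h = scaled_w = 1024.0
new_window = [offset_y / scaled_h, offset_x / scaled_w,
              (offset_y + output_size) / scaled_h,
              (offset_x + output_size) / scaled_w]
# -> [0.25, 0.0, 0.75, 0.5]: the crop covers rows 25%-75% and columns 0%-50%
# of the resized image.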
Code example #7
File: inputs.py Project: jiapei100/TensorflowModels
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_multiclass_scores=False,
                         use_bfloat16=False,
                         retain_original_image_additional_channels=False,
                         keypoint_type_weight=None):
    """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. keypoint_type_weight (optional): If groundtruth keypoints are in
     the tensor dictionary, per-keypoint weights are produced. These weights are
     initialized by `keypoint_type_weight` (or ones if left None).
     Then, for all keypoints that are not visible, the weights are set to 0 (to
     avoid penalizing the model in a loss function).
  5. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  6. one_hot_encoding: applied to classes tensor in tensor_dict.
  7. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D preprocess
      float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along with
      the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth boxes
      and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.
    use_multiclass_scores: whether to use multiclass scores as class targets
      instead of one-hot encoding of `groundtruth_classes`. When
      this is True and multiclass_scores is empty, one-hot encoding of
      `groundtruth_classes` is used as a fallback.
    use_bfloat16: (optional) a bool, whether to use bfloat16 in training.
    retain_original_image_additional_channels: (optional) Whether to retain
      original image additional channels in the output dictionary.
    keypoint_type_weight: A list (of length num_keypoints) containing
      groundtruth loss weights to use for each keypoint. If None, will use a
      weight of 1.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors obtained
    after applying all the transformations.
  """
    out_tensor_dict = tensor_dict.copy()

    labeled_classes_field = fields.InputDataFields.groundtruth_labeled_classes
    if labeled_classes_field in out_tensor_dict:
        # tf_example_decoder casts unrecognized labels to -1. Remove these
        # unrecognized labels before converting labeled_classes to k-hot vector.
        out_tensor_dict[labeled_classes_field] = _remove_unrecognized_classes(
            out_tensor_dict[labeled_classes_field], unrecognized_label=-1)
        out_tensor_dict[
            labeled_classes_field] = _convert_labeled_classes_to_k_hot(
                out_tensor_dict[labeled_classes_field], num_classes)

    if fields.InputDataFields.multiclass_scores in out_tensor_dict:
        out_tensor_dict[
            fields.InputDataFields.
            multiclass_scores] = _multiclass_scores_or_one_hot_labels(
                out_tensor_dict[fields.InputDataFields.multiclass_scores],
                out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
                out_tensor_dict[fields.InputDataFields.groundtruth_classes],
                num_classes)

    if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
        out_tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
            out_tensor_dict)
        out_tensor_dict = util_ops.filter_unrecognized_classes(out_tensor_dict)

    if retain_original_image:
        out_tensor_dict[fields.InputDataFields.original_image] = tf.cast(
            image_resizer_fn(out_tensor_dict[fields.InputDataFields.image],
                             None)[0], tf.uint8)

    if fields.InputDataFields.image_additional_channels in out_tensor_dict:
        channels = out_tensor_dict[
            fields.InputDataFields.image_additional_channels]
        out_tensor_dict[fields.InputDataFields.image] = tf.concat(
            [out_tensor_dict[fields.InputDataFields.image], channels], axis=2)
        if retain_original_image_additional_channels:
            out_tensor_dict[
                fields.InputDataFields.image_additional_channels] = tf.cast(
                    image_resizer_fn(channels, None)[0], tf.uint8)

    # Apply data augmentation ops.
    if data_augmentation_fn is not None:
        out_tensor_dict = data_augmentation_fn(out_tensor_dict)

    # Apply model preprocessing ops and resize instance masks.
    image = out_tensor_dict[fields.InputDataFields.image]
    preprocessed_resized_image, true_image_shape = model_preprocess_fn(
        tf.expand_dims(tf.cast(image, dtype=tf.float32), axis=0))

    preprocessed_shape = tf.shape(preprocessed_resized_image)
    new_height, new_width = preprocessed_shape[1], preprocessed_shape[2]

    im_box = tf.stack([
        0.0, 0.0,
        tf.to_float(new_height) / tf.to_float(true_image_shape[0, 0]),
        tf.to_float(new_width) / tf.to_float(true_image_shape[0, 1])
    ])

    if fields.InputDataFields.groundtruth_boxes in tensor_dict:
        bboxes = out_tensor_dict[fields.InputDataFields.groundtruth_boxes]
        boxlist = box_list.BoxList(bboxes)
        realigned_bboxes = box_list_ops.change_coordinate_frame(
            boxlist, im_box)

        realigned_boxes_tensor = realigned_bboxes.get()
        valid_boxes_tensor = assert_or_prune_invalid_boxes(
            realigned_boxes_tensor)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_boxes] = valid_boxes_tensor

    if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
        keypoints = out_tensor_dict[
            fields.InputDataFields.groundtruth_keypoints]
        realigned_keypoints = keypoint_ops.change_coordinate_frame(
            keypoints, im_box)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_keypoints] = realigned_keypoints
        flds_gt_kpt = fields.InputDataFields.groundtruth_keypoints
        flds_gt_kpt_vis = fields.InputDataFields.groundtruth_keypoint_visibilities
        flds_gt_kpt_weights = fields.InputDataFields.groundtruth_keypoint_weights
        if flds_gt_kpt_vis not in out_tensor_dict:
            out_tensor_dict[flds_gt_kpt_vis] = tf.ones_like(
                out_tensor_dict[flds_gt_kpt][:, :, 0], dtype=tf.bool)
        out_tensor_dict[flds_gt_kpt_weights] = (
            keypoint_ops.keypoint_weights_from_visibilities(
                out_tensor_dict[flds_gt_kpt_vis], keypoint_type_weight))

    if use_bfloat16:
        preprocessed_resized_image = tf.cast(preprocessed_resized_image,
                                             tf.bfloat16)
        if fields.InputDataFields.context_features in out_tensor_dict:
            out_tensor_dict[fields.InputDataFields.context_features] = tf.cast(
                out_tensor_dict[fields.InputDataFields.context_features],
                tf.bfloat16)
    out_tensor_dict[fields.InputDataFields.image] = tf.squeeze(
        preprocessed_resized_image, axis=0)
    out_tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
        true_image_shape, axis=0)
    if fields.InputDataFields.groundtruth_instance_masks in out_tensor_dict:
        masks = out_tensor_dict[
            fields.InputDataFields.groundtruth_instance_masks]
        _, resized_masks, _ = image_resizer_fn(image, masks)
        if use_bfloat16:
            resized_masks = tf.cast(resized_masks, tf.bfloat16)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_instance_masks] = resized_masks

    zero_indexed_groundtruth_classes = out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] - _LABEL_OFFSET
    if use_multiclass_scores:
        out_tensor_dict[
            fields.InputDataFields.groundtruth_classes] = out_tensor_dict[
                fields.InputDataFields.multiclass_scores]
    else:
        out_tensor_dict[
            fields.InputDataFields.groundtruth_classes] = tf.one_hot(
                zero_indexed_groundtruth_classes, num_classes)
    out_tensor_dict.pop(fields.InputDataFields.multiclass_scores, None)

    if fields.InputDataFields.groundtruth_confidences in out_tensor_dict:
        groundtruth_confidences = out_tensor_dict[
            fields.InputDataFields.groundtruth_confidences]
        # Map the confidences to the one-hot encoding of classes
        out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
            tf.reshape(groundtruth_confidences, [-1, 1]) *
            out_tensor_dict[fields.InputDataFields.groundtruth_classes])
    else:
        groundtruth_confidences = tf.ones_like(
            zero_indexed_groundtruth_classes, dtype=tf.float32)
        out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
            out_tensor_dict[fields.InputDataFields.groundtruth_classes])

    if merge_multiple_boxes:
        merged_boxes, merged_classes, merged_confidences, _ = (
            util_ops.merge_boxes_with_multiple_labels(
                out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
                zero_indexed_groundtruth_classes, groundtruth_confidences,
                num_classes))
        merged_classes = tf.cast(merged_classes, tf.float32)
        out_tensor_dict[
            fields.InputDataFields.groundtruth_boxes] = merged_boxes
        out_tensor_dict[
            fields.InputDataFields.groundtruth_classes] = merged_classes
        out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
            merged_confidences)
    if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
        out_tensor_dict[
            fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
                out_tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

    return out_tensor_dict
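Step 4 of the docstring (per-keypoint weights) is handled by keypoint_ops.keypoint_weights_from_visibilities: visible keypoints receive the matching keypoint_type_weight entry (1.0 when that argument is None) and non-visible keypoints receive 0.0. A hedged sketch of the expected behavior:

# Two instances with two keypoints each; the second keypoint of the first
# instance is not visible, so its loss weight becomes 0.0.
visibilities = tf.constant([[True, False], [True, True]])
weights = keypoint_ops.keypoint_weights_from_visibilities(visibilities)
# With keypoint_type_weight=None -> [[1.0, 0.0], [1.0, 1.0]]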