Beispiel #1
0
    def _dataset_parser(value):
      """Parse data to a fixed dimension input image and learning targets."""
      with tf.name_scope('parser'):
        data = example_decoder.decode(value)
        source_id = data['source_id']
        image = data['image']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])

        # the image normalization is identical to Cloud TPU ResNet-50
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = _normalize_image(image)

        if params['input_rand_hflip']:
          image, boxes = preprocessor.random_horizontal_flip(image, boxes=boxes)
        image_original_shape = tf.shape(image)
        image, _ = preprocessor.resize_to_range(
            image,
            min_dimension=params['image_size'],
            max_dimension=params['image_size'])
        image_scale = tf.to_float(image_original_shape[0]) / tf.to_float(
            tf.shape(image)[0])
        image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
            image, boxes, keypoints=None)

        image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'],
                                             params['image_size'])
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(boxes, classes)

        source_id = tf.string_to_number(source_id, out_type=tf.float32)
        row = (image, cls_targets, box_targets, num_positives, source_id,
               image_scale)
        return row
 def _preprocess(example):
     """
     example: a string tensor holding serialized tf example proto
     """
     data = example_decoder.decode(example)
     image = data["image"]
     boxes = data["groundtruth_boxes"]
     labels = data["groundtruth_classes"]
     true_image_width = data["image/width"]
     true_image_height = data["image/height"]
     # augmentation
     if is_training:
         image = preprocessor.random_adjust_brightness(image)
         image = preprocessor.random_adjust_saturation(image)
         image = tf.image.convert_image_dtype(image, dtype=tf.float32)
         image, boxes = preprocessor.random_horizontal_flip(image, boxes)
         image, boxes, labels = preprocessor.random_crop_image(
             image, boxes, labels)
     else:
         image = tf.image.convert_image_dtype(image, dtype=tf.float32)
     # resize image
     image = tf.image.resize_images(image, size=params["image_size"])
     image = (image - 0.5) * 2
     boxes = pad_to_fixed_size(boxes, [params["max_instance"], 4], -1)
     labels = pad_to_fixed_size(labels, [params["max_instance"], 1], -1)
     image_shape = tf.stack([true_image_height, true_image_width], axis=0)
     return (image, boxes, labels, image_shape)
 def random_horizontal_flip(self):
   """Randomly flip input image and bounding boxes."""
   results = preprocessor.random_horizontal_flip(
       self._image, boxes=self._boxes, masks=self._masks)
   self._image = results[0]
   self._boxes = results[1]
   if self._masks is not None:
     self._masks = results[2]
Beispiel #4
0
    def _dataset_parser(value):
      """Parse data to a fixed dimension input image and learning targets."""
      with tf.name_scope('parser'):
        data = example_decoder.decode(value)

        source_id = data['source_id']
        # for xView dataset only; basically the original name is 122.tif and we will change it to number 122 later on.
        # len = tf.size(tf.string_split([data['source_id']],""))
        # source_id = tf.substr(data['source_id'],0,len - 4)

        image = data['image']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
        # Handle crowd annotations. As crowd annotations are not large
        # instances, the model ignores them in training.
        if params['skip_crowd']:
          indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
          classes = tf.gather_nd(classes, indices)
          boxes = tf.gather_nd(boxes, indices)

        # the image normalization is identical to Cloud TPU ResNet-50
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = _normalize_image(image)

        if params['input_rand_hflip']:
          image, boxes = preprocessor.random_horizontal_flip(image, boxes=boxes)
        image_original_shape = tf.shape(image)
        image, _ = preprocessor.resize_to_range(
            image,
            min_dimension=params['image_size'],
            max_dimension=params['image_size'])
        image_scale = tf.to_float(image_original_shape[0]) / tf.to_float(
            tf.shape(image)[0])
        image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
            image, boxes, keypoints=None)

        image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'],
                                             params['image_size'])
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(boxes, classes)
        #
        # sess = tf.get_default_session()
        # print("source id is", sess.run(source_id))

        source_id = tf.string_to_number(source_id, out_type=tf.float32)

        # sess = tf.get_default_session()
        # print("after conversion, source id is", sess.run(source_id))

        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)
        row = (image, cls_targets, box_targets, num_positives, source_id,
               image_scale)
        return row
def random_horizontal_flip(image, boxes=None, masks=None):
  """Random horizontal flip the image, boxes, and masks.

  Args:
    image: a tensor of shape [height, width, 3] representing the image.
    boxes: (Optional) a tensor of shape [num_boxes, 4] represneting the box
      corners in normalized coordinates.
    masks: (Optional) a tensor of shape [num_masks, height, width]
      representing the object masks. Note that the size of the mask is the
      same as the image.

  Returns:
    image: the processed image tensor after being randomly flipped.
    boxes: None or the processed box tensor after being randomly flipped.
    masks: None or the processed mask tensor after being randomly flipped.
  """
  return preprocessor.random_horizontal_flip(image, boxes, masks)
Beispiel #6
0
 def random_horizontal_flip(self):
   """Randomly flip input image and bounding boxes."""
   self._image, self._boxes = preprocessor.random_horizontal_flip(
       self._image, boxes=self._boxes)
Beispiel #7
0
 def random_horizontal_flip(self):
     """Randomly flip input image and segmentation label."""
     self._label = tf.expand_dims(self._label, 0)
     self._image, self._label = preprocessor.random_horizontal_flip(
         self._image, masks=self._label)
     self._label = self._label[0, :, :]
Beispiel #8
0
        def _parse_example(data):
            with tf.name_scope('augmentation'):
                source_id = data['source_id']
                image = tf.image.convert_image_dtype(data['image'],
                                                     dtype=tf.float32)
                raw_shape = tf.shape(image)
                boxes = data['groundtruth_boxes']
                classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

                # Only 80 of the 90 COCO classes are used.
                class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
                classes = tf.gather(class_map, classes)
                classes = tf.cast(classes, dtype=tf.float32)

                if self._is_training:
                    image, boxes, classes = ssd_crop(image, boxes, classes)

                    # random_horizontal_flip() is hard coded to flip with 50% chance.
                    mlperf_log.ssd_print(
                        key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
                    image, boxes = preprocessor.random_horizontal_flip(
                        image=image, boxes=boxes)

                    # TODO(shibow): Investigate the parameters for color jitter.
                    image = color_jitter(image,
                                         brightness=0.125,
                                         contrast=0.5,
                                         saturation=0.5,
                                         hue=0.05)
                    image = normalize_image(image)

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                        boxes, classes)

                    # TODO(taylorrobie): Check that this cast is valid.
                    encoded_classes = tf.cast(encoded_classes, tf.int32)

                    labels = {
                        ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                        ssd_constants.BOXES: encoded_boxes,
                        ssd_constants.CLASSES: encoded_classes,
                    }
                    # This is for dataloader visualization; actual model doesn't use this.
                    if params['visualize_dataloader']:
                        box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                            scale_factors=ssd_constants.BOX_CODER_SCALES)
                        decoded_boxes = tf.expand_dims(box_coder.decode(
                            rel_codes=tf.squeeze(encoded_boxes),
                            anchors=box_list.BoxList(
                                tf.convert_to_tensor(
                                    DefaultBoxes()('ltrb')))).get(),
                                                       axis=0)
                        labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

                    return image, labels

                else:
                    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE,
                                         value=ssd_constants.IMAGE_SIZE)
                    image = tf.image.resize_images(
                        image[tf.newaxis, :, :, :],
                        size=(ssd_constants.IMAGE_SIZE,
                              ssd_constants.IMAGE_SIZE))[0, :, :, :]

                    image = normalize_image(image)

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    def trim_and_pad(inp_tensor, dim_1):
                        """Limit the number of boxes, and pad if necessary."""
                        inp_tensor = inp_tensor[:ssd_constants.
                                                MAX_NUM_EVAL_BOXES]
                        num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(
                            inp_tensor)[0]
                        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                        return tf.reshape(
                            inp_tensor,
                            [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

                    boxes, classes = trim_and_pad(boxes,
                                                  4), trim_and_pad(classes, 1)

                    return {
                        ssd_constants.IMAGE:
                        image,
                        ssd_constants.BOXES:
                        boxes,
                        ssd_constants.CLASSES:
                        classes,
                        ssd_constants.SOURCE_ID:
                        tf.string_to_number(source_id, tf.int32),
                        ssd_constants.RAW_SHAPE:
                        raw_shape,
                    }
Beispiel #9
0
        def _parse_example(data):
            with tf.name_scope('augmentation'):
                source_id = data['source_id']
                image = data['image']  # dtype uint8
                raw_shape = tf.shape(image)
                boxes = data['groundtruth_boxes']
                classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

                # Only 80 of the 90 COCO classes are used.
                class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
                classes = tf.gather(class_map, classes)
                classes = tf.cast(classes, dtype=tf.float32)

                if self._is_training:
                    image, boxes, classes = ssd_crop(image, boxes, classes)
                    # ssd_crop resizes and returns image of dtype float32 and does not
                    # change its range (i.e., value in between 0--255). Divide by 255.
                    # converts it to [0, 1] range. Not doing this before cropping to
                    # avoid dtype cast (which incurs additional memory copy).
                    image /= 255.0

                    # random_horizontal_flip() is hard coded to flip with 50% chance.
                    image, boxes = preprocessor.random_horizontal_flip(
                        image=image, boxes=boxes)

                    # TODO(shibow): Investigate the parameters for color jitter.
                    image = color_jitter(image,
                                         brightness=0.125,
                                         contrast=0.5,
                                         saturation=0.5,
                                         hue=0.05)

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                        boxes, classes)

                    # TODO(taylorrobie): Check that this cast is valid.
                    encoded_classes = tf.cast(encoded_classes, tf.int32)

                    labels = {
                        ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                        ssd_constants.BOXES: encoded_boxes,
                        ssd_constants.CLASSES: tf.squeeze(encoded_classes,
                                                          axis=1),
                    }
                    # This is for dataloader visualization; actual model doesn't use this.
                    if params['visualize_dataloader']:
                        box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                            scale_factors=ssd_constants.BOX_CODER_SCALES)
                        decoded_boxes = tf.expand_dims(box_coder.decode(
                            rel_codes=tf.squeeze(encoded_boxes),
                            anchors=box_list.BoxList(
                                tf.convert_to_tensor(
                                    DefaultBoxes()('ltrb')))).get(),
                                                       axis=0)
                        labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

                    return image, labels

                else:
                    image = tf.image.resize_images(
                        image,
                        size=(ssd_constants.IMAGE_SIZE,
                              ssd_constants.IMAGE_SIZE))
                    # resize_image returns image of dtype float32 and does not change its
                    # range. Divide by 255 to convert image to [0, 1] range.
                    image /= 255.

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    def trim_and_pad(inp_tensor, dim_1):
                        """Limit the number of boxes, and pad if necessary."""
                        inp_tensor = inp_tensor[:ssd_constants.
                                                MAX_NUM_EVAL_BOXES]
                        num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(
                            inp_tensor)[0]
                        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                        return tf.reshape(
                            inp_tensor,
                            [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

                    boxes, classes = trim_and_pad(boxes,
                                                  4), trim_and_pad(classes, 1)

                    sample = {
                        ssd_constants.IMAGE:
                        image,
                        ssd_constants.BOXES:
                        boxes,
                        ssd_constants.CLASSES:
                        classes,
                        ssd_constants.SOURCE_ID:
                        tf.string_to_number(source_id, tf.int32),
                        ssd_constants.RAW_SHAPE:
                        raw_shape,
                    }

                    if not self._is_training and self._count > params[
                            'eval_samples']:
                        sample[ssd_constants.IS_PADDED] = data[
                            ssd_constants.IS_PADDED]
                    return sample
Beispiel #10
0
 def random_horizontal_flip(self):
     """Randomly flip input image and bounding boxes."""
     self._label = tf.expand_dims(self._label, 0)
     self._image, self._boxes, self._label = preprocessor.random_horizontal_flip(
         self._image, boxes=self._boxes, masks=self._label)
     self._label = self._label[0, :, :]