def _parse_single_example(self, example):
  """Parses a single serialized tf.Example proto.

  Args:
    example: a serialized tf.Example proto string.

  Returns:
    A dictionary of groundtruth with the following fields:
      source_id: a scalar tensor of int64 representing the image source_id.
      height: a scalar tensor of int64 representing the image height.
      width: a scalar tensor of int64 representing the image width.
      num_detections: a scalar tensor of int32 representing the number of
        groundtruth instances.
      boxes: a float tensor of shape [K, 4], representing the groundtruth
        boxes in absolute coordinates with respect to the original image
        size.
      classes: an int64 tensor of shape [K], representing the class label of
        each instance.
      is_crowds: a bool tensor of shape [K], indicating whether each
        instance is a crowd.
      areas: a float tensor of shape [K], indicating the area of each
        instance.
      masks: a string tensor of shape [K], containing the bytes of the PNG
        mask of each instance.
  """
  decoder = tf_example_decoder.TfExampleDecoder(
      include_mask=self._include_mask)
  decoded_tensors = decoder.decode(example)

  image = decoded_tensors['image']
  image_size = tf.shape(image)[0:2]
  boxes = box_utils.denormalize_boxes(
      decoded_tensors['groundtruth_boxes'], image_size)
  groundtruths = {
      'source_id': tf.string_to_number(
          decoded_tensors['source_id'], out_type=tf.int64),
      'height': decoded_tensors['height'],
      'width': decoded_tensors['width'],
      'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
      'boxes': boxes,
      'classes': decoded_tensors['groundtruth_classes'],
      'is_crowds': decoded_tensors['groundtruth_is_crowd'],
      'areas': decoded_tensors['groundtruth_area'],
  }
  if self._include_mask:
    groundtruths.update({
        'masks': decoded_tensors['groundtruth_instance_masks_png'],
    })
  return groundtruths
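# --- Usage sketch (illustrative, not part of the original file) ---
# Assuming this method lives on a groundtruth-generator class that sets an
# `_include_mask` attribute, it can be mapped over a TFRecord dataset of
# serialized tf.Examples to materialize evaluation groundtruths. The function
# and variable names below are hypothetical.
import tensorflow as tf


def build_groundtruth_dataset(tfrecord_file, generator):
  """Maps _parse_single_example over a file of serialized tf.Examples."""
  dataset = tf.data.TFRecordDataset(tfrecord_file)
  # Each element is a serialized tf.Example string, as the method expects.
  return dataset.map(generator._parse_single_example)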
def __init__(self,
             output_size,
             min_level,
             max_level,
             num_scales,
             aspect_ratios,
             anchor_size,
             match_threshold=0.5,
             unmatched_threshold=0.5,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             use_autoaugment=False,
             autoaugment_policy_name='v0',
             skip_crowd_during_training=True,
             max_num_instances=100,
             use_bfloat16=True,
             mode=None):
  """Initializes parameters for parsing annotations in the dataset.

  Args:
    output_size: `Tensor` or `list` for [height, width] of the output image.
      The output_size should be divisible by the largest feature stride
      2^max_level.
    min_level: `int` number of minimum level of the output feature pyramid.
    max_level: `int` number of maximum level of the output feature pyramid.
    num_scales: `int` number representing intermediate scales added on each
      level. For instance, num_scales=2 adds one additional intermediate
      anchor scale [2^0, 2^0.5] on each level.
    aspect_ratios: `list` of float numbers representing the aspect ratios of
      anchors added on each level. Each number indicates the ratio of width
      to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
      anchors on each scale level.
    anchor_size: `float` number representing the scale of size of the base
      anchor to the feature stride 2^level.
    match_threshold: `float` number between 0 and 1 representing the
      lower-bound threshold to assign positive labels for anchors. An anchor
      with a score over the threshold is labeled positive.
    unmatched_threshold: `float` number between 0 and 1 representing the
      upper-bound threshold to assign negative labels for anchors. An anchor
      with a score below the threshold is labeled negative.
    aug_rand_hflip: `bool`, if True, augment training with random horizontal
      flip.
    aug_scale_min: `float`, the minimum scale applied to `output_size` for
      data augmentation during training.
    aug_scale_max: `float`, the maximum scale applied to `output_size` for
      data augmentation during training.
    use_autoaugment: `bool`, if True, use the AutoAugment augmentation
      policy during training.
    autoaugment_policy_name: `string` that specifies the name of the
      AutoAugment policy that will be used during training.
    skip_crowd_during_training: `bool`, if True, skip annotations labeled
      with `is_crowd` equal to 1 during training.
    max_num_instances: `int` maximum number of instances in an image. The
      groundtruth data will be padded to `max_num_instances`.
    use_bfloat16: `bool`, if True, cast the output image to tf.bfloat16.
    mode: a ModeKeys. Specifies if this is training, evaluation, prediction
      or prediction with groundtruths in the outputs.
  """
  self._mode = mode
  self._max_num_instances = max_num_instances
  self._skip_crowd_during_training = skip_crowd_during_training
  self._is_training = (mode == ModeKeys.TRAIN)

  self._example_decoder = tf_example_decoder.TfExampleDecoder(
      include_mask=False)

  # Anchor.
  self._output_size = output_size
  self._min_level = min_level
  self._max_level = max_level
  self._num_scales = num_scales
  self._aspect_ratios = aspect_ratios
  self._anchor_size = anchor_size
  self._match_threshold = match_threshold
  self._unmatched_threshold = unmatched_threshold

  # Data augmentation.
  self._aug_rand_hflip = aug_rand_hflip
  self._aug_scale_min = aug_scale_min
  self._aug_scale_max = aug_scale_max

  # Data augmentation with AutoAugment.
  self._use_autoaugment = use_autoaugment
  self._autoaugment_policy_name = autoaugment_policy_name

  # Device.
  self._use_bfloat16 = use_bfloat16

  # Data is parsed depending on the model's ModeKeys.
  if mode == ModeKeys.TRAIN:
    self._parse_fn = self._parse_train_data
  elif mode == ModeKeys.EVAL:
    self._parse_fn = self._parse_eval_data
  elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
    self._parse_fn = self._parse_predict_data
  else:
    raise ValueError('mode is not defined.')
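# --- Usage sketch (illustrative, not part of the original file) ---
# Assuming this __init__ belongs to the RetinaNet-style `Parser` class, a
# training-time instance might be constructed as follows. The values below
# are typical examples, not taken from this file.
parser = Parser(
    output_size=[640, 640],      # 640 is divisible by 2^max_level = 128.
    min_level=3,
    max_level=7,
    num_scales=3,                # anchor scales [2^0, 2^(1/3), 2^(2/3)].
    aspect_ratios=[1.0, 2.0, 0.5],
    anchor_size=4.0,
    aug_rand_hflip=True,
    mode=ModeKeys.TRAIN)
# After construction, parser._parse_fn is bound to _parse_train_data.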
def __init__(self,
             output_size,
             min_level,
             max_level,
             num_scales,
             aspect_ratios,
             anchor_size,
             rpn_match_threshold=0.7,
             rpn_unmatched_threshold=0.3,
             rpn_batch_size_per_im=256,
             rpn_fg_fraction=0.5,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             skip_crowd_during_training=True,
             max_num_instances=100,
             include_mask=False,
             mask_crop_size=112,
             use_bfloat16=True,
             mode=None):
  """Initializes parameters for parsing annotations in the dataset.

  Args:
    output_size: `Tensor` or `list` for [height, width] of the output image.
      The output_size should be divisible by the largest feature stride
      2^max_level.
    min_level: `int` number of minimum level of the output feature pyramid.
    max_level: `int` number of maximum level of the output feature pyramid.
    num_scales: `int` number representing intermediate scales added on each
      level. For instance, num_scales=2 adds one additional intermediate
      anchor scale [2^0, 2^0.5] on each level.
    aspect_ratios: `list` of float numbers representing the aspect ratios of
      anchors added on each level. Each number indicates the ratio of width
      to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
      anchors on each scale level.
    anchor_size: `float` number representing the scale of size of the base
      anchor to the feature stride 2^level.
    rpn_match_threshold: `float` number between 0 and 1 representing the
      lower-bound IoU threshold to assign positive labels to anchors for
      the RPN.
    rpn_unmatched_threshold: `float` number between 0 and 1 representing the
      upper-bound IoU threshold to assign negative labels to anchors for
      the RPN.
    rpn_batch_size_per_im: `int` number of anchors sampled per image for
      computing the RPN loss.
    rpn_fg_fraction: `float` fraction of the sampled RPN anchors that should
      be foreground (positive).
    aug_rand_hflip: `bool`, if True, augment training with random horizontal
      flip.
    aug_scale_min: `float`, the minimum scale applied to `output_size` for
      data augmentation during training.
    aug_scale_max: `float`, the maximum scale applied to `output_size` for
      data augmentation during training.
    skip_crowd_during_training: `bool`, if True, skip annotations labeled
      with `is_crowd` equal to 1 during training.
    max_num_instances: `int` maximum number of instances in an image. The
      groundtruth data will be padded to `max_num_instances`.
    include_mask: a `bool` to indicate whether to parse mask groundtruth.
    mask_crop_size: the size to which the groundtruth mask is cropped.
    use_bfloat16: `bool`, if True, cast the output image to tf.bfloat16.
    mode: a ModeKeys. Specifies if this is training, evaluation, prediction
      or prediction with groundtruths in the outputs.
  """
  self._mode = mode
  self._max_num_instances = max_num_instances
  self._skip_crowd_during_training = skip_crowd_during_training
  self._is_training = (mode == ModeKeys.TRAIN)

  self._example_decoder = tf_example_decoder.TfExampleDecoder(
      include_mask=include_mask)

  # Anchor.
  self._output_size = output_size
  self._min_level = min_level
  self._max_level = max_level
  self._num_scales = num_scales
  self._aspect_ratios = aspect_ratios
  self._anchor_size = anchor_size

  # Target assigning.
  self._rpn_match_threshold = rpn_match_threshold
  self._rpn_unmatched_threshold = rpn_unmatched_threshold
  self._rpn_batch_size_per_im = rpn_batch_size_per_im
  self._rpn_fg_fraction = rpn_fg_fraction

  # Data augmentation.
  self._aug_rand_hflip = aug_rand_hflip
  self._aug_scale_min = aug_scale_min
  self._aug_scale_max = aug_scale_max

  # Mask.
  self._include_mask = include_mask
  self._mask_crop_size = mask_crop_size

  # Device.
  self._use_bfloat16 = use_bfloat16

  # Data is parsed depending on the model's ModeKeys.
  if mode == ModeKeys.TRAIN:
    self._parse_fn = self._parse_train_data
  elif mode == ModeKeys.EVAL:
    self._parse_fn = self._parse_eval_data
  elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
    self._parse_fn = self._parse_predict_data
  else:
    raise ValueError('mode is not defined.')
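# --- Worked example (illustrative, not part of the original file) ---
# With the defaults above, RPN target sampling selects at most
# rpn_batch_size_per_im * rpn_fg_fraction = 256 * 0.5 = 128 positive anchors
# per image (anchors whose IoU with some groundtruth box exceeds
# rpn_match_threshold=0.7); the remainder of the 256-anchor batch is filled
# with negatives (anchors whose IoU with every groundtruth box falls below
# rpn_unmatched_threshold=0.3).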
def __init__(self,
             output_size,
             min_level,
             max_level,
             num_scales,
             aspect_ratios,
             anchor_size,
             use_category=True,
             outer_box_scale=1.0,
             box_jitter_scale=0.025,
             num_sampled_masks=8,
             mask_crop_size=32,
             mask_min_level=3,
             mask_max_level=5,
             upsample_factor=4,
             match_threshold=0.5,
             unmatched_threshold=0.5,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             skip_crowd_during_training=True,
             max_num_instances=100,
             use_bfloat16=True,
             mask_train_class='all',
             mode=None):
  """Initializes parameters for parsing annotations in the dataset.

  Args:
    output_size: `Tensor` or `list` for [height, width] of the output image.
      The output_size should be divisible by the largest feature stride
      2^max_level.
    min_level: `int` number of minimum level of the output feature pyramid.
    max_level: `int` number of maximum level of the output feature pyramid.
    num_scales: `int` number representing intermediate scales added on each
      level. For instance, num_scales=2 adds one additional intermediate
      anchor scale [2^0, 2^0.5] on each level.
    aspect_ratios: `list` of float numbers representing the aspect ratios of
      anchors added on each level. Each number indicates the ratio of width
      to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
      anchors on each scale level.
    anchor_size: `float` number representing the scale of size of the base
      anchor to the feature stride 2^level.
    use_category: if `False`, treat all objects in all classes as one
      foreground category.
    outer_box_scale: `float` number in the range [1.0, inf) representing the
      scale from the object box to the outer box. The mask branch predicts
      the instance mask enclosed in the outer box.
    box_jitter_scale: `float` number representing the noise magnitude used
      to jitter the training groundtruth boxes for the mask branch.
    num_sampled_masks: `int` number of sampled masks for training.
    mask_crop_size: `int` size of the output training masks; masks are
      cropped and resized to [mask_crop_size, mask_crop_size].
    mask_min_level: `int` number indicating the minimum feature level to
      obtain instance features.
    mask_max_level: `int` number indicating the maximum feature level to
      obtain instance features.
    upsample_factor: `int` factor by which to upsample the fine mask
      predictions.
    match_threshold: `float` number between 0 and 1 representing the
      lower-bound threshold to assign positive labels for anchors. An anchor
      with a score over the threshold is labeled positive.
    unmatched_threshold: `float` number between 0 and 1 representing the
      upper-bound threshold to assign negative labels for anchors. An anchor
      with a score below the threshold is labeled negative.
    aug_rand_hflip: `bool`, if True, augment training with random horizontal
      flip.
    aug_scale_min: `float`, the minimum scale applied to `output_size` for
      data augmentation during training.
    aug_scale_max: `float`, the maximum scale applied to `output_size` for
      data augmentation during training.
    skip_crowd_during_training: `bool`, if True, skip annotations labeled
      with `is_crowd` equal to 1 during training.
    max_num_instances: `int` maximum number of instances in an image. The
      groundtruth data will be padded to `max_num_instances`.
    use_bfloat16: `bool`, if True, cast the output image to tf.bfloat16.
    mask_train_class: a string of experiment mode: `all`, `voc` or `nonvoc`.
    mode: a ModeKeys. Specifies if this is training, evaluation, prediction
      or prediction with groundtruths in the outputs.
  """
  self._mode = mode
  self._mask_train_class = mask_train_class
  self._max_num_instances = max_num_instances
  self._skip_crowd_during_training = skip_crowd_during_training
  self._is_training = (mode == ModeKeys.TRAIN)

  self._example_decoder = tf_example_decoder.TfExampleDecoder(
      include_mask=True)

  # Anchor.
  self._output_size = output_size
  self._min_level = min_level
  self._max_level = max_level
  self._num_scales = num_scales
  self._aspect_ratios = aspect_ratios
  self._anchor_size = anchor_size
  self._match_threshold = match_threshold
  self._unmatched_threshold = unmatched_threshold

  # Data augmentation.
  self._aug_rand_hflip = aug_rand_hflip
  self._aug_scale_min = aug_scale_min
  self._aug_scale_max = aug_scale_max

  # Device.
  self._use_bfloat16 = use_bfloat16

  # ShapeMask specific.
  # Control of which category to use.
  self._use_category = use_category
  self._num_sampled_masks = num_sampled_masks
  self._mask_crop_size = mask_crop_size
  self._mask_min_level = mask_min_level
  self._mask_max_level = mask_max_level
  self._outer_box_scale = outer_box_scale
  self._box_jitter_scale = box_jitter_scale
  self._up_sample_factor = upsample_factor

  # Data is parsed depending on the model's ModeKeys.
  if mode == ModeKeys.TRAIN:
    self._parse_fn = self._parse_train_data
  elif mode == ModeKeys.EVAL:
    self._parse_fn = self._parse_eval_data
  elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
    self._parse_fn = self._parse_predict_data
  else:
    raise ValueError('mode is not defined.')
def test_result_shape(self, image_height, image_width, num_instances):
  decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)

  image = _encode_image(
      np.uint8(np.random.rand(image_height, image_width, 3) * 255),
      fmt='JPEG')
  if num_instances == 0:
    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    labels = []
    areas = []
    is_crowds = []
    masks = []
  else:
    xmins = list(np.random.rand(num_instances))
    xmaxs = list(np.random.rand(num_instances))
    ymins = list(np.random.rand(num_instances))
    ymaxs = list(np.random.rand(num_instances))
    labels = list(np.random.randint(100, size=num_instances))
    areas = [(xmax - xmin) * (ymax - ymin)
             for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
    is_crowds = [0] * num_instances
    masks = []
    for _ in range(num_instances):
      mask = _encode_image(
          np.uint8(np.random.rand(image_height, image_width) * 255),
          fmt='PNG')
      masks.append(mask)

  serialized_example = tf.train.Example(
      features=tf.train.Features(
          feature={
              'image/encoded': (
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(value=[image]))),
              # BytesList requires bytes values in Python 3.
              'image/source_id': (
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(value=[b'123']))),
              'image/height': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=[image_height]))),
              'image/width': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=[image_width]))),
              'image/object/bbox/xmin': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=xmins))),
              'image/object/bbox/xmax': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=xmaxs))),
              'image/object/bbox/ymin': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=ymins))),
              'image/object/bbox/ymax': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=ymaxs))),
              'image/object/class/label': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=labels))),
              'image/object/is_crowd': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=is_crowds))),
              'image/object/area': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=areas))),
              'image/object/mask': (
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(value=masks))),
          })).SerializeToString()
  decoded_tensors = decoder.decode(
      tf.convert_to_tensor(value=serialized_example))
  results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

  self.assertAllEqual(
      (image_height, image_width, 3), results['image'].shape)
  # String tensors come back as bytes under .numpy().
  self.assertEqual(b'123', results['source_id'])
  self.assertEqual(image_height, results['height'])
  self.assertEqual(image_width, results['width'])
  self.assertAllEqual(
      (num_instances,), results['groundtruth_classes'].shape)
  self.assertAllEqual(
      (num_instances,), results['groundtruth_is_crowd'].shape)
  self.assertAllEqual(
      (num_instances,), results['groundtruth_area'].shape)
  self.assertAllEqual(
      (num_instances, 4), results['groundtruth_boxes'].shape)
  self.assertAllEqual(
      (num_instances, image_height, image_width),
      results['groundtruth_instance_masks'].shape)
  self.assertAllEqual(
      (num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_handling_missing_fields(self):
  decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)

  image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
                   [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                   [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                   [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
  image = _encode_image(np.uint8(image_content), fmt='PNG')
  image_height = 4
  image_width = 4
  num_instances = 2
  xmins = [0, 0.25]
  xmaxs = [0.5, 1.0]
  ymins = [0, 0]
  ymaxs = [0.5, 1.0]
  labels = [3, 1]
  mask_content = [[[255, 255, 0, 0],
                   [255, 255, 0, 0],
                   [0, 0, 0, 0],
                   [0, 0, 0, 0]],
                  [[0, 255, 255, 255],
                   [0, 255, 255, 255],
                   [0, 255, 255, 255],
                   [0, 255, 255, 255]]]
  masks = [_encode_image(np.uint8(m), fmt='PNG') for m in mask_content]

  # 'image/object/is_crowd' and 'image/object/area' are intentionally
  # omitted to exercise the decoder's default handling.
  serialized_example = tf.train.Example(
      features=tf.train.Features(
          feature={
              'image/encoded': (
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(value=[image]))),
              # BytesList requires bytes values in Python 3.
              'image/source_id': (
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(value=[b'123']))),
              'image/height': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=[image_height]))),
              'image/width': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=[image_width]))),
              'image/object/bbox/xmin': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=xmins))),
              'image/object/bbox/xmax': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=xmaxs))),
              'image/object/bbox/ymin': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=ymins))),
              'image/object/bbox/ymax': (
                  tf.train.Feature(
                      float_list=tf.train.FloatList(value=ymaxs))),
              'image/object/class/label': (
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(value=labels))),
              'image/object/mask': (
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(value=masks))),
          })).SerializeToString()
  decoded_tensors = decoder.decode(
      tf.convert_to_tensor(serialized_example))
  results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)

  self.assertAllEqual(
      (image_height, image_width, 3), results['image'].shape)
  self.assertAllEqual(image_content, results['image'])
  # String tensors come back as bytes under .numpy().
  self.assertEqual(b'123', results['source_id'])
  self.assertEqual(image_height, results['height'])
  self.assertEqual(image_width, results['width'])
  self.assertAllEqual(
      (num_instances,), results['groundtruth_classes'].shape)
  self.assertAllEqual(
      (num_instances,), results['groundtruth_is_crowd'].shape)
  self.assertAllEqual(
      (num_instances,), results['groundtruth_area'].shape)
  self.assertAllEqual(
      (num_instances, 4), results['groundtruth_boxes'].shape)
  self.assertAllEqual(
      (num_instances, image_height, image_width),
      results['groundtruth_instance_masks'].shape)
  self.assertAllEqual(
      (num_instances,), results['groundtruth_instance_masks_png'].shape)
  self.assertAllEqual(
      [3, 1], results['groundtruth_classes'])
  self.assertAllEqual(
      [False, False], results['groundtruth_is_crowd'])
  self.assertNDArrayNear(
      [0.25, 0.75], results['groundtruth_area'], 1e-4)
  self.assertNDArrayNear(
      [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
      results['groundtruth_boxes'], 1e-4)
  self.assertNDArrayNear(
      mask_content, results['groundtruth_instance_masks'], 1e-4)
  self.assertAllEqual(
      masks, results['groundtruth_instance_masks_png'])
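# --- Note (illustrative, not part of the original file) ---
# With 'image/object/is_crowd' and 'image/object/area' absent from the
# tf.Example, the assertions above indicate the decoder falls back to
# defaults: is_crowd becomes False for every instance, and area is derived
# from the normalized box coordinates as (xmax - xmin) * (ymax - ymin):
#   instance 0: (0.5 - 0.0) * (0.5 - 0.0) = 0.25
#   instance 1: (1.0 - 0.25) * (1.0 - 0.0) = 0.75
# matching the [0.25, 0.75] expectation asserted above.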