def parse_single_example(serialized_example, params):
    """Decodes one serialized TFExample into an image plus label dictionaries.

    Args:
        serialized_example: a serialized TFExample string.
        params: configuration object. If `retinanet_parser` is listed in
            `dir(params)`, that sub-config is used together with a default
            `TfExampleDecoder`; otherwise `params.maskrcnn_parser` is used and
            the decoder is created with `include_mask=True`. The value
            `2 ** params.architecture.max_level` is passed to
            `compute_padded_size`.

    Returns:
        A tuple `(image, labels, groundtruths)` where
        image: the normalized image after `resize_and_crop_image`; both
            scale-augmentation bounds are fixed to 1.0.
        labels: a dict holding `image_info` as returned by
            `resize_and_crop_image`.
        groundtruths: a dict with `source_id`, `height`, `width`,
            `num_detections` (the value of `tf.shape(classes)`), `boxes`
            (denormalized with the decoded image's height and width),
            `classes`, `areas`, `is_crowds` (cast to int32) and, only when
            the decoder produced `groundtruth_instance_masks_png`, `masks`.
    """
    if 'retinanet_parser' in dir(params):
        parser_cfg = params.retinanet_parser
        example_decoder = tf_example_decoder.TfExampleDecoder()
    else:
        parser_cfg = params.maskrcnn_parser
        example_decoder = tf_example_decoder.TfExampleDecoder(
            include_mask=True)
    decoded = example_decoder.decode(serialized_example)

    raw_image = decoded['image']
    source_id = dataloader_utils.process_source_id(decoded['source_id'])
    # Boxes are scaled with the height/width of the decoded image, i.e.
    # before any resizing happens below.
    abs_boxes = box_utils.denormalize_boxes(
        decoded['groundtruth_boxes'], tf.shape(raw_image)[:2])
    classes = decoded['groundtruth_classes']
    # Absent unless the decoder emitted instance masks.
    instance_masks = decoded.get('groundtruth_instance_masks_png', None)

    normalized = input_utils.normalize_image(raw_image)
    target_size = parser_cfg.output_size
    padded_size = input_utils.compute_padded_size(
        target_size, 2 ** params.architecture.max_level)
    resized_image, image_info = input_utils.resize_and_crop_image(
        normalized,
        target_size,
        padded_size=padded_size,
        aug_scale_min=1.0,
        aug_scale_max=1.0)

    labels = {'image_info': image_info}
    groundtruths = {
        'source_id': source_id,
        'height': decoded['height'],
        'width': decoded['width'],
        'num_detections': tf.shape(classes),
        'boxes': abs_boxes,
        'classes': classes,
        'areas': decoded['groundtruth_area'],
        'is_crowds': tf.cast(decoded['groundtruth_is_crowd'], tf.int32),
    }
    if instance_masks is not None:
        groundtruths['masks'] = instance_masks
    return resized_image, labels, groundtruths
def _parse_single_example(self, example):
    """Parses a single serialized tf.Example proto.

    Args:
        example: a serialized tf.Example proto string.

    Returns:
        A dictionary of groundtruth with the following fields:
            source_id: a scalar tensor of int64 representing the image
                source_id.
            height: a scalar tensor of int64 representing the image height.
            width: a scalar tensor of int64 representing the image width.
            num_detections: the first entry of the shape of the decoded
                groundtruth classes, i.e. K below.
            boxes: a float tensor of shape [K, 4], representing the
                groundtruth boxes in absolute coordinates with respect to
                the original image size.
            classes: a int64 tensor of shape [K], representing the class
                labels of each instances.
            is_crowds: a bool tensor of shape [K], indicating whether the
                instance is crowd.
            areas: a float tensor of shape [K], indicating the area of each
                instance.
            masks: a string tensor of shape [K], containing the bytes of the
                png mask of each instance. Only present when
                `self._include_mask` is true.
    """
    decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=self._include_mask)
    decoded_tensors = decoder.decode(example)

    # Boxes are denormalized with the height/width of the decoded image.
    image = decoded_tensors['image']
    image_size = tf.shape(image)[0:2]
    boxes = box_utils.denormalize_boxes(
        decoded_tensors['groundtruth_boxes'], image_size)

    # `tf.string_to_number` is a TF1-only alias that is not exported from the
    # TF2 top-level namespace. `tf.strings.to_number` takes the same
    # arguments and is available in both TF 1.x and TF 2.x.
    source_id = tf.strings.to_number(
        decoded_tensors['source_id'], out_type=tf.int64)

    groundtruths = {
        'source_id': source_id,
        'height': decoded_tensors['height'],
        'width': decoded_tensors['width'],
        'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
        'boxes': boxes,
        'classes': decoded_tensors['groundtruth_classes'],
        'is_crowds': decoded_tensors['groundtruth_is_crowd'],
        'areas': decoded_tensors['groundtruth_area'],
    }
    if self._include_mask:
        groundtruths['masks'] = (
            decoded_tensors['groundtruth_instance_masks_png'])
    return groundtruths
def __init__(self,
             output_size,
             min_level,
             max_level,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             skip_crowd_during_training=True,
             include_mask=False,
             mask_crop_size=112):
    """Stores the dataset-parsing configuration on the instance.

    The parse function is always bound to `self._parse_train_data`.

    Args:
        output_size: `Tensor` or `list` for [height, width] of output image.
            The output_size should be divisible by the largest feature
            stride 2^max_level.
        min_level: `int` number of minimum level of the output feature
            pyramid.
        max_level: `int` number of maximum level of the output feature
            pyramid.
        aug_rand_hflip: `bool`, if True, augment training with random
            horizontal flip.
        aug_scale_min: `float`, the minimum scale applied to `output_size`
            for data augmentation during training.
        aug_scale_max: `float`, the maximum scale applied to `output_size`
            for data augmentation during training.
        skip_crowd_during_training: `bool`, if True, skip annotations
            labeled with `is_crowd` equals to 1.
        include_mask: a bool to indicate whether to parse mask groundtruth;
            also forwarded to the `TfExampleDecoder`.
        mask_crop_size: the size which groundtruth mask is cropped to.
    """
    # Output geometry and feature-pyramid range.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level

    # Augmentation switches.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    self._skip_crowd_during_training = skip_crowd_during_training

    # Mask handling; the decoder only yields masks when asked to.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size
    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=include_mask)

    self._parse_fn = self._parse_train_data
def parse_single_example(serialized_example, params):
    """Decodes one serialized TFExample into an image, labels, groundtruths.

    Args:
        serialized_example: a serialized TFExample string.
        params: configuration object. `params.retinanet_parser.output_size`
            is the resize target, and `params.anchor` supplies `min_level`,
            `max_level`, `num_scales`, `aspect_ratios` and `anchor_size`.

    Returns:
        A tuple `(image, labels, groundtruths)` where
        image: the normalized image after `resize_and_crop_image`; both
            scale-augmentation bounds are fixed to 1.0.
        labels: a dict with `anchor_boxes` (the `multilevel_boxes` of an
            `anchor.Anchor` built for the static height/width of `image`)
            and `image_info` as returned by `resize_and_crop_image`.
        groundtruths: a dict with `source_id`, `height`, `width`,
            `num_detections` (the value of `tf.shape(classes)`), `boxes`
            (denormalized with the decoded image's height and width),
            `classes`, `areas` and `is_crowds` (cast to int32).
    """
    decoded = tf_example_decoder.TfExampleDecoder().decode(serialized_example)

    raw_image = decoded['image']
    source_id = dataloader_utils.process_source_id(decoded['source_id'])
    # Boxes are scaled with the height/width of the decoded image, i.e.
    # before any resizing happens below.
    abs_boxes = box_utils.denormalize_boxes(
        decoded['groundtruth_boxes'], tf.shape(raw_image)[:2])
    classes = decoded['groundtruth_classes']

    normalized = input_utils.normalize_image(raw_image)
    target_size = params.retinanet_parser.output_size
    padded_size = input_utils.compute_padded_size(
        target_size, 2**params.anchor.max_level)
    resized_image, image_info = input_utils.resize_and_crop_image(
        normalized,
        target_size,
        padded_size=padded_size,
        aug_scale_min=1.0,
        aug_scale_max=1.0)

    # Anchors are generated for the static spatial shape of the resized image.
    anchor_cfg = params.anchor
    input_anchor = anchor.Anchor(
        anchor_cfg.min_level,
        anchor_cfg.max_level,
        anchor_cfg.num_scales,
        anchor_cfg.aspect_ratios,
        anchor_cfg.anchor_size,
        resized_image.get_shape().as_list()[:2])

    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
    }
    groundtruths = {
        'source_id': source_id,
        'height': decoded['height'],
        'width': decoded['width'],
        'num_detections': tf.shape(classes),
        'boxes': abs_boxes,
        'classes': classes,
        'areas': decoded['groundtruth_area'],
        'is_crowds': tf.cast(decoded['groundtruth_is_crowd'], tf.int32),
    }
    return resized_image, labels, groundtruths
def __init__(self,
             output_size,
             min_level,
             max_level,
             num_scales,
             aspect_ratios,
             anchor_size,
             match_threshold=0.5,
             unmatched_threshold=0.5,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             use_autoaugment=False,
             autoaugment_policy_name='v0',
             skip_crowd_during_training=True,
             max_num_instances=100,
             use_bfloat16=True,
             regenerate_source_id=False,
             mode=None):
    """Stores the dataset-parsing configuration and picks the parse function.

    Args:
        output_size: `Tensor` or `list` for [height, width] of output image.
            The output_size should be divisible by the largest feature
            stride 2^max_level.
        min_level: `int` number of minimum level of the output feature
            pyramid.
        max_level: `int` number of maximum level of the output feature
            pyramid.
        num_scales: `int` number representing intermediate scales added on
            each level. For instance, num_scales=2 adds one additional
            intermediate anchor scales [2^0, 2^0.5] on each level.
        aspect_ratios: `list` of float numbers representing the aspect
            ratio anchors added on each level. The number indicates the
            ratio of width to height. For instance,
            aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale
            level.
        anchor_size: `float` number representing the scale of size of the
            base anchor to the feature stride 2^level.
        match_threshold: `float` number between 0 and 1 representing the
            lower-bound threshold to assign positive labels for anchors. An
            anchor with a score over the threshold is labeled positive.
        unmatched_threshold: `float` number between 0 and 1 representing the
            upper-bound threshold to assign negative labels for anchors. An
            anchor with a score below the threshold is labeled negative.
        aug_rand_hflip: `bool`, if True, augment training with random
            horizontal flip.
        aug_scale_min: `float`, the minimum scale applied to `output_size`
            for data augmentation during training.
        aug_scale_max: `float`, the maximum scale applied to `output_size`
            for data augmentation during training.
        use_autoaugment: `bool`, if True, use the AutoAugment augmentation
            policy during training.
        autoaugment_policy_name: `string` that specifies the name of the
            AutoAugment policy that will be used during training.
        skip_crowd_during_training: `bool`, if True, skip annotations
            labeled with `is_crowd` equals to 1.
        max_num_instances: `int` number of maximum number of instances in an
            image. The groundtruth data will be padded to
            `max_num_instances`.
        use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
        regenerate_source_id: `bool`, if True TFExampleParser will use
            hashed value of `image/encoded` for `image/source_id`.
        mode: a ModeKeys. Specifies if this is training, evaluation,
            prediction or prediction with groundtruths in the outputs.

    Raises:
        ValueError: if `mode` is none of `ModeKeys.TRAIN`, `ModeKeys.EVAL`,
            `ModeKeys.PREDICT` or `ModeKeys.PREDICT_WITH_GT` (this includes
            the default `None`).
    """
    # Run mode.
    self._mode = mode
    self._is_training = (mode == ModeKeys.TRAIN)

    # Groundtruth handling.
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=False, regenerate_source_id=regenerate_source_id)

    # Output geometry and anchor configuration.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Augmentation, including the optional AutoAugment policy.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name

    # Output dtype control.
    self._use_bfloat16 = use_bfloat16

    # Bind the parse function last, so every attribute above is already set
    # when an unsupported mode raises.
    if mode == ModeKeys.TRAIN:
        self._parse_fn = self._parse_train_data
    elif mode == ModeKeys.EVAL:
        self._parse_fn = self._parse_eval_data
    elif mode in (ModeKeys.PREDICT, ModeKeys.PREDICT_WITH_GT):
        self._parse_fn = self._parse_predict_data
    else:
        raise ValueError('mode is not defined.')
def __init__(self,
             output_size,
             min_level,
             max_level,
             num_scales,
             aspect_ratios,
             anchor_size,
             rpn_match_threshold=0.7,
             rpn_unmatched_threshold=0.3,
             rpn_batch_size_per_im=256,
             rpn_fg_fraction=0.5,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             skip_crowd_during_training=True,
             max_num_instances=100,
             include_mask=False,
             mask_crop_size=112,
             use_bfloat16=True,
             mode=None):
    """Stores the dataset-parsing configuration and picks the parse function.

    Args:
        output_size: `Tensor` or `list` for [height, width] of output image.
            The output_size should be divisible by the largest feature
            stride 2^max_level.
        min_level: `int` number of minimum level of the output feature
            pyramid.
        max_level: `int` number of maximum level of the output feature
            pyramid.
        num_scales: `int` number representing intermediate scales added on
            each level. For instance, num_scales=2 adds one additional
            intermediate anchor scales [2^0, 2^0.5] on each level.
        aspect_ratios: `list` of float numbers representing the aspect
            ratio anchors added on each level. The number indicates the
            ratio of width to height. For instance,
            aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale
            level.
        anchor_size: `float` number representing the scale of size of the
            base anchor to the feature stride 2^level.
        rpn_match_threshold: stored for RPN target assignment.
            NOTE(review): presumably the lower-bound overlap for labeling an
            anchor positive; confirm against the code that reads
            `self._rpn_match_threshold`.
        rpn_unmatched_threshold: stored for RPN target assignment.
            NOTE(review): presumably the upper-bound overlap for labeling an
            anchor negative; confirm against the consumer.
        rpn_batch_size_per_im: stored for RPN target assignment.
            NOTE(review): presumably the number of anchors sampled per
            image; confirm against the consumer.
        rpn_fg_fraction: stored for RPN target assignment.
            NOTE(review): presumably the foreground share of the sampled
            anchors; confirm against the consumer.
        aug_rand_hflip: `bool`, if True, augment training with random
            horizontal flip.
        aug_scale_min: `float`, the minimum scale applied to `output_size`
            for data augmentation during training.
        aug_scale_max: `float`, the maximum scale applied to `output_size`
            for data augmentation during training.
        skip_crowd_during_training: `bool`, if True, skip annotations
            labeled with `is_crowd` equals to 1.
        max_num_instances: `int` number of maximum number of instances in an
            image. The groundtruth data will be padded to
            `max_num_instances`.
        include_mask: a bool to indicate whether to parse mask groundtruth;
            also forwarded to the `TfExampleDecoder`.
        mask_crop_size: the size which groundtruth mask is cropped to.
        use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
        mode: a ModeKeys. Specifies if this is training, evaluation,
            prediction or prediction with groundtruths in the outputs.

    Raises:
        ValueError: if `mode` is none of `ModeKeys.TRAIN`, `ModeKeys.EVAL`,
            `ModeKeys.PREDICT` or `ModeKeys.PREDICT_WITH_GT` (this includes
            the default `None`).
    """
    # Run mode.
    self._mode = mode
    self._is_training = (mode == ModeKeys.TRAIN)

    # Groundtruth handling.
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=include_mask)

    # Output geometry and anchor configuration.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size

    # RPN target assignment settings.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

    # Augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Mask handling.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size

    # Output dtype control.
    self._use_bfloat16 = use_bfloat16

    # Bind the parse function last, so every attribute above is already set
    # when an unsupported mode raises.
    if mode == ModeKeys.TRAIN:
        self._parse_fn = self._parse_train_data
    elif mode == ModeKeys.EVAL:
        self._parse_fn = self._parse_eval_data
    elif mode in (ModeKeys.PREDICT, ModeKeys.PREDICT_WITH_GT):
        self._parse_fn = self._parse_predict_data
    else:
        raise ValueError('mode is not defined.')
def __init__(self,
             output_size,
             min_level,
             max_level,
             num_scales,
             aspect_ratios,
             anchor_size,
             use_category=True,
             outer_box_scale=1.0,
             box_jitter_scale=0.025,
             num_sampled_masks=8,
             mask_crop_size=32,
             mask_min_level=3,
             mask_max_level=5,
             upsample_factor=4,
             match_threshold=0.5,
             unmatched_threshold=0.5,
             aug_rand_hflip=False,
             aug_scale_min=1.0,
             aug_scale_max=1.0,
             skip_crowd_during_training=True,
             max_num_instances=100,
             use_bfloat16=True,
             mask_train_class='all',
             mode=None):
    """Stores the dataset-parsing configuration and picks the parse function.

    The example decoder is always created with `include_mask=True`.

    Args:
        output_size: `Tensor` or `list` for [height, width] of output image.
            The output_size should be divisible by the largest feature
            stride 2^max_level.
        min_level: `int` number of minimum level of the output feature
            pyramid.
        max_level: `int` number of maximum level of the output feature
            pyramid.
        num_scales: `int` number representing intermediate scales added on
            each level. For instance, num_scales=2 adds one additional
            intermediate anchor scales [2^0, 2^0.5] on each level.
        aspect_ratios: `list` of float numbers representing the aspect
            ratio anchors added on each level. The number indicates the
            ratio of width to height. For instance,
            aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale
            level.
        anchor_size: `float` number representing the scale of size of the
            base anchor to the feature stride 2^level.
        use_category: if `False`, treat all object in all classes in one
            foreground category.
        outer_box_scale: `float` number in a range of [1.0, inf)
            representing the scale from object box to outer box. The mask
            branch predicts instance mask enclosed in outer box.
        box_jitter_scale: `float` number representing the noise magnitude to
            jitter the training groundtruth boxes for mask branch.
        num_sampled_masks: `int` number of sampled masks for training.
        mask_crop_size: `list` for [height, width] of output training masks.
            NOTE(review): the default is the scalar 32, not a list; confirm
            which form the consumer expects.
        mask_min_level: `int` number indicating the minimum feature level to
            obtain instance features.
        mask_max_level: `int` number indicating the maximum feature level to
            obtain instance features.
        upsample_factor: `int` factor of upsampling the fine mask
            predictions.
        match_threshold: `float` number between 0 and 1 representing the
            lower-bound threshold to assign positive labels for anchors. An
            anchor with a score over the threshold is labeled positive.
        unmatched_threshold: `float` number between 0 and 1 representing the
            upper-bound threshold to assign negative labels for anchors. An
            anchor with a score below the threshold is labeled negative.
        aug_rand_hflip: `bool`, if True, augment training with random
            horizontal flip.
        aug_scale_min: `float`, the minimum scale applied to `output_size`
            for data augmentation during training.
        aug_scale_max: `float`, the maximum scale applied to `output_size`
            for data augmentation during training.
        skip_crowd_during_training: `bool`, if True, skip annotations
            labeled with `is_crowd` equals to 1.
        max_num_instances: `int` number of maximum number of instances in an
            image. The groundtruth data will be padded to
            `max_num_instances`.
        use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
        mask_train_class: a string of experiment mode: `all`, `voc` or
            `nonvoc`.
        mode: a ModeKeys. Specifies if this is training, evaluation,
            prediction or prediction with groundtruths in the outputs.

    Raises:
        ValueError: if `mode` is none of `ModeKeys.TRAIN`, `ModeKeys.EVAL`,
            `ModeKeys.PREDICT` or `ModeKeys.PREDICT_WITH_GT` (this includes
            the default `None`).
    """
    # Run mode.
    self._mode = mode
    self._is_training = (mode == ModeKeys.TRAIN)
    self._mask_train_class = mask_train_class

    # Groundtruth handling; masks are always decoded for this parser.
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=True)

    # Output geometry and anchor configuration.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Output dtype control.
    self._use_bfloat16 = use_bfloat16

    # ShapeMask-specific settings; `use_category` controls which category
    # to use.
    self._use_category = use_category
    self._num_sampled_masks = num_sampled_masks
    self._mask_crop_size = mask_crop_size
    self._mask_min_level = mask_min_level
    self._mask_max_level = mask_max_level
    self._outer_box_scale = outer_box_scale
    self._box_jitter_scale = box_jitter_scale
    self._up_sample_factor = upsample_factor

    # Bind the parse function last, so every attribute above is already set
    # when an unsupported mode raises.
    # NOTE(review): TRAIN and PREDICT bind the public `parse_train_data` /
    # `parse_predict_data`, while EVAL binds the private `_parse_eval_data`.
    # Confirm these method names against the class body.
    if mode == ModeKeys.TRAIN:
        self._parse_fn = self.parse_train_data
    elif mode == ModeKeys.EVAL:
        self._parse_fn = self._parse_eval_data
    elif mode in (ModeKeys.PREDICT, ModeKeys.PREDICT_WITH_GT):
        self._parse_fn = self.parse_predict_data
    else:
        raise ValueError('mode is not defined.')