def __call__(self, params, input_context=None, batch_size=None):
    """Build the batched input `tf.data.Dataset` described by `params`.

    Files matching `self._file_pattern` are listed (and optionally sharded),
    read as TFRecords, parsed with `self.dataset_parser`, batched, and then
    passed through `self.process_example`.

    Args:
        params: Dict of hyperparameters. Keys read directly here are
            'min_level', 'max_level', 'num_scales', 'aspect_ratios',
            'anchor_scale', 'image_size', 'num_classes', 'heads',
            'regenerate_source_id', 'batch_size', 'drop_remainder' and the
            optional 'tf_random_seed' and 'dataset_type'.
        input_context: Optional object exposing `num_input_pipelines` and
            `input_pipeline_id`; when truthy the file list is sharded with
            those two values.
        batch_size: Optional batch size; falls back to
            `params['batch_size']` when falsy.

    Returns:
        A `tf.data.Dataset` whose elements are whatever
        `self.process_example` returns for each batch.

    Raises:
        NotImplementedError: If `params['dataset_type']` is 'sstable', for
            which no reader is implemented here.
    """
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                           params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask='segmentation' in params['heads'],
        regenerate_source_id=params['regenerate_source_id'])

    batch_size = batch_size or params['batch_size']
    seed = params.get('tf_random_seed', None)
    dataset = tf.data.Dataset.list_files(
        self._file_pattern, shuffle=self._is_training, seed=seed)
    if input_context:
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        if params.get('dataset_type', None) == 'sstable':
            # Previously this branch was a bare `pass`, so the function
            # failed with an UnboundLocalError on `return dataset`.
            raise NotImplementedError(
                "dataset_type 'sstable' is not supported by this input "
                'reader; use TFRecord files instead.')
        return tf.data.TFRecordDataset(filename).prefetch(1)

    # A seed of 0 is still an explicit seed, so test against None rather
    # than relying on truthiness when asking for deterministic interleaving.
    dataset = dataset.interleave(
        _prefetch_dataset,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=seed is not None)
    dataset = dataset.with_options(self.dataset_options)
    if self._is_training:
        dataset = dataset.shuffle(64, seed=seed)

    # Parse the fetched records to input tensors for model function.
    # pylint: disable=g-long-lambda
    if params.get('dataset_type', None) == 'sstable':
        map_fn = lambda key, value: self.dataset_parser(
            value, example_decoder, anchor_labeler, params)
    else:
        map_fn = lambda value: self.dataset_parser(
            value, example_decoder, anchor_labeler, params)
    # pylint: enable=g-long-lambda
    dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size,
                            drop_remainder=params['drop_remainder'])
    dataset = dataset.map(
        lambda *args: self.process_example(params, batch_size, *args))
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    if self._is_training:
        dataset = dataset.repeat()
    if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loop at the
        # first batch. This reduces variance in performance and is useful in
        # testing.
        dataset = dataset.take(1).cache().repeat()
    return dataset
def __call__(self, params):
    """Return the batched `tf.data.Dataset` for the model input function.

    Lists files matching `self._file_pattern`, reads them as TFRecords,
    parses each record with `self.dataset_parser`, batches with
    `params['batch_size']` (dropping the remainder) and post-processes each
    batch with `self.process_example`.

    Args:
        params: Dict of hyperparameters; see the keys indexed below.

    Returns:
        The assembled `tf.data.Dataset`.
    """
    training = self._is_training
    autotune = tf.data.experimental.AUTOTUNE

    anchor_grid = anchors.Anchors(
        params['min_level'],
        params['max_level'],
        params['num_scales'],
        params['aspect_ratios'],
        params['anchor_scale'],
        params['image_size'],
    )
    labeler = anchors.AnchorLabeler(anchor_grid, params['num_classes'])
    decoder = tf_example_decoder.TfExampleDecoder(
        include_mask='segmentation' in params['heads'],
        regenerate_source_id=params['regenerate_source_id'],
    )
    batch_size = params['batch_size']

    def _read_records(filename):
        # One TFRecord reader per file, with a one-element prefetch.
        return tf.data.TFRecordDataset(filename).prefetch(1)

    def _parse_record(value):
        return self.dataset_parser(value, decoder, labeler, params)

    def _finalize_batch(*args):
        return self.process_example(params, batch_size, *args)

    files = tf.data.Dataset.list_files(self._file_pattern, shuffle=training)
    if training:
        files = files.repeat()

    records = files.interleave(_read_records, num_parallel_calls=autotune)

    # Element order only has to be reproducible outside of training.
    pipeline_options = tf.data.Options()
    pipeline_options.experimental_deterministic = not training
    records = records.with_options(pipeline_options)
    if training:
        records = records.shuffle(64)

    # Records -> per-example tensors -> batches -> (features, labels).
    examples = records.map(_parse_record, num_parallel_calls=autotune)
    examples = examples.prefetch(batch_size)
    batches = examples.batch(batch_size, drop_remainder=True)
    batches = batches.map(_finalize_batch)
    batches = batches.prefetch(autotune)

    if self._use_fake_data:
        # Replay the first batch forever: lower performance variance, handy
        # for testing.
        batches = batches.take(1).cache().repeat()
    return batches
def test_parser(self):
    """Check that `dataset_parser` yields an 11-element result."""
    tf.random.set_seed(111111)
    config = hparams_config.get_detection_config('efficientdet-d0')
    params = config.as_dict()

    anchor_grid = anchors.Anchors(
        params['min_level'],
        params['max_level'],
        params['num_scales'],
        params['aspect_ratios'],
        params['anchor_scale'],
        params['image_size'],
    )
    labeler = anchors.AnchorLabeler(anchor_grid, params['num_classes'])
    decoder = tf_example_decoder.TfExampleDecoder(
        regenerate_source_id=params['regenerate_source_id'])

    # Pull the first serialized record out of a freshly written fake file.
    record_path = self._make_fake_tfrecord()
    records = tf.data.TFRecordDataset([record_path])
    serialized = next(iter(records))

    reader = dataloader.InputReader(record_path, True)
    parsed = reader.dataset_parser(serialized, decoder, labeler, params)
    self.assertEqual(len(parsed), 11)
def _create_example_decoder(self):
    """Return a `TfExampleDecoder` configured with `self._use_instance_mask`."""
    decoder = tf_example_decoder.TfExampleDecoder(
        use_instance_mask=self._use_instance_mask)
    return decoder
def __call__(self, params):
    """Build the batched detection input `tf.data.Dataset`.

    Args:
        params: Dict of hyperparameters. Keys read here include 'min_level',
            'max_level', 'num_scales', 'aspect_ratios', 'anchor_scale',
            'image_size', 'num_classes', 'regenerate_source_id',
            'skip_crowd_during_training', 'input_rand_hflip',
            'train_scale_min', 'train_scale_max', 'batch_size',
            'data_format' and the optional 'autoaugment_policy' (with
            'use_augmix' / 'augmix_params') and 'target_size'.

    Returns:
        A `tf.data.Dataset` yielding `(images, labels)` batches, where
        `labels` is the dict assembled in `_process_example`.
    """
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                           params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder(
        regenerate_source_id=params['regenerate_source_id'])
    max_instances = self._max_instances_per_image

    @tf.autograph.experimental.do_not_convert
    def _dataset_parser(value):
        """Parse one record into a model input image and learning targets.

        Args:
            value: The serialized record passed to `example_decoder.decode`.

        Returns:
            A 10-tuple:
            image: Image tensor after normalization, optional flip, and
                resize/crop by the input processor.
            cls_targets: Class targets from `anchor_labeler.label_anchors`;
                indexed by level in `_process_example`.
            box_targets: Box targets from `anchor_labeler.label_anchors`;
                indexed by level in `_process_example`.
            num_positives: Third value returned by `label_anchors`.
            source_id: Numeric source image id; an empty id becomes -1.
            image_scale: `input_processor.image_scale_to_original`.
            boxes: Groundtruth boxes multiplied by `image_scale` and padded
                with -1 to [self._max_instances_per_image, 4].
            is_crowds: Crowd flags cast to float32 and padded with 0 to
                [self._max_instances_per_image, 1].
            areas: Groundtruth areas padded with -1 to
                [self._max_instances_per_image, 1].
            classes: Groundtruth classes padded with -1 to
                [self._max_instances_per_image, 1].
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            source_id = data['source_id']
            image = data['image']
            boxes = data['groundtruth_boxes']
            classes = data['groundtruth_classes']
            # Reshaped once; the original repeated this statement a second
            # time with no effect on the result.
            classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
            areas = data['groundtruth_area']
            is_crowds = data['groundtruth_is_crowd']

            if params['skip_crowd_during_training'] and self._is_training:
                # Only boxes and classes are filtered; is_crowds and areas
                # keep all annotations.
                indices = tf.where(
                    tf.logical_not(data['groundtruth_is_crowd']))
                classes = tf.gather_nd(classes, indices)
                boxes = tf.gather_nd(boxes, indices)

            # NOTE: The autoaugment method works best when used alongside the
            # standard horizontal flipping of images along with size jittering
            # and normalization.
            if params.get('autoaugment_policy', None) and self._is_training:
                from aug import autoaugment  # pylint: disable=g-import-not-at-top
                image, boxes = autoaugment.distort_image_with_autoaugment(
                    image, boxes, params['autoaugment_policy'],
                    params['use_augmix'], *params['augmix_params'])

            input_processor = DetectionInputProcessor(
                image, params['image_size'], boxes, classes)
            input_processor.normalize_image()
            if self._is_training and params['input_rand_hflip']:
                input_processor.random_horizontal_flip()
            if self._is_training:
                input_processor.set_training_random_scale_factors(
                    params['train_scale_min'], params['train_scale_max'],
                    params.get('target_size', None))
            else:
                input_processor.set_scale_factors_to_output_size()
            image = input_processor.resize_and_crop_image()
            boxes, classes = input_processor.resize_and_crop_boxes()

            # Assign anchors.
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(boxes, classes)

            # An empty source id is mapped to '-1' before numeric conversion.
            source_id = tf.where(
                tf.equal(source_id, tf.constant('')), '-1', source_id)
            source_id = tf.strings.to_number(source_id)

            # Pad groundtruth data for evaluation.
            image_scale = input_processor.image_scale_to_original
            boxes *= image_scale
            is_crowds = tf.cast(is_crowds, dtype=tf.float32)
            boxes = pad_to_fixed_size(boxes, -1, [max_instances, 4])
            is_crowds = pad_to_fixed_size(is_crowds, 0, [max_instances, 1])
            areas = pad_to_fixed_size(areas, -1, [max_instances, 1])
            classes = pad_to_fixed_size(classes, -1, [max_instances, 1])
            return (image, cls_targets, box_targets, num_positives,
                    source_id, image_scale, boxes, is_crowds, areas, classes)

    batch_size = params['batch_size']
    dataset = tf.data.Dataset.list_files(self._file_pattern,
                                         shuffle=self._is_training)
    if self._is_training:
        dataset = dataset.repeat()

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(filename).prefetch(1)
        return dataset

    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            _prefetch_dataset, cycle_length=32, sloppy=self._is_training))
    if self._is_training:
        dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    @tf.autograph.experimental.do_not_convert
    def _process_example(images, cls_targets, box_targets, num_positives,
                         source_ids, image_scales, boxes, is_crowds, areas,
                         classes):
        """Processes one batch of data into `(images, labels)`."""
        labels = {}
        # Count num_positives in a batch: the batch mean is tiled so each
        # example carries the same value, shape [batch_size, 1].
        num_positives_batch = tf.reduce_mean(num_positives)
        labels['mean_num_positives'] = tf.reshape(
            tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
            [batch_size, 1])

        channels_first = params['data_format'] == 'channels_first'
        if channels_first:
            images = tf.transpose(images, [0, 3, 1, 2])

        for level in range(params['min_level'], params['max_level'] + 1):
            labels['cls_targets_%d' % level] = cls_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]
            if channels_first:
                labels['cls_targets_%d' % level] = tf.transpose(
                    labels['cls_targets_%d' % level], [0, 3, 1, 2])
                labels['box_targets_%d' % level] = tf.transpose(
                    labels['box_targets_%d' % level], [0, 3, 1, 2])

        # Concatenate groundtruth annotations to a tensor.
        groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                     axis=2)
        labels['source_ids'] = source_ids
        labels['groundtruth_data'] = groundtruth_data
        labels['image_scales'] = image_scales
        return images, labels

    dataset = dataset.map(_process_example)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loop at the
        # first batch. This reduces variance in performance and is useful in
        # testing.
        dataset = dataset.take(1).cache().repeat()
    return dataset
def __call__(self, params):
    """Return one `(images, labels)` batch of tensors from the input pipeline.

    The pipeline lists and shuffles files matching `self._file_pattern`,
    reads them as TFRecords, parses each record, batches by
    `self._batch_size`, and pulls the next element from a one-shot iterator.

    Args:
        params: Dict of hyperparameters; see the keys indexed below.

    Returns:
        A tuple `(images, labels)` where `labels` is a dict of batch
        statistics, per-level targets/weights, source ids and image scales.
    """
    anchor_grid = anchors.Anchors(
        params['min_level'],
        params['max_level'],
        params['num_scales'],
        params['aspect_ratios'],
        params['anchor_scale'],
        params['image_size'],
    )
    labeler = anchors.AnchorLabeler(anchor_grid, params['num_classes'])
    decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
        """Turn one serialized record into a fixed-size image and targets."""
        with tf.name_scope('parser'):
            data = decoder.decode(value)
            source_id = data['source_id']
            image = data['image']
            boxes = data['groundtruth_boxes']
            classes = tf.reshape(
                tf.cast(data['groundtruth_classes'], dtype=tf.float32),
                [-1, 1])

            # Crowd annotations are dropped when requested so the model
            # ignores them.
            if params['skip_crowd']:
                keep = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
                classes = tf.gather_nd(classes, keep)
                boxes = tf.gather_nd(boxes, keep)

            # Image normalization is identical to Cloud TPU ResNet-50.
            image = _normalize_image(
                tf.image.convert_image_dtype(image, dtype=tf.float32))

            if params['input_rand_hflip']:
                image, boxes = preprocessor.random_horizontal_flip(
                    image, boxes=boxes)

            target_size = params['image_size']
            shape_before_resize = tf.shape(image)
            image, _ = preprocessor.resize_to_range(
                image, min_dimension=target_size, max_dimension=target_size)
            # Ratio of the pre-resize height to the post-resize height.
            height_before = tf.to_float(shape_before_resize[0])
            height_after = tf.to_float(tf.shape(image)[0])
            image_scale = height_before / height_after

            image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
                image, boxes, keypoints=None)
            image = tf.image.pad_to_bounding_box(
                image, 0, 0, target_size, target_size)

            (cls_targets, cls_weights, box_targets, box_weights,
             num_positives, num_negatives,
             num_ignored) = labeler.label_anchors(boxes, classes)

            source_id = tf.string_to_number(source_id, out_type=tf.float32)
            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            return (image, cls_targets, cls_weights, box_targets,
                    box_weights, num_positives, num_negatives, num_ignored,
                    source_id, image_scale)

    # The batch size comes from the reader itself, not from `params`.
    batch_size = self._batch_size

    def _open_tfrecord(filename):
        return tf.data.TFRecordDataset(filename, buffer_size=8 * 1000 * 1000)

    pipeline = tf.data.Dataset.list_files(self._file_pattern)
    pipeline = pipeline.shuffle(buffer_size=1024)
    if self._is_training:
        pipeline = pipeline.repeat()
    pipeline = pipeline.apply(
        tf.contrib.data.parallel_interleave(
            _open_tfrecord, cycle_length=1, sloppy=True))
    pipeline = pipeline.shuffle(buffer_size=3072)
    pipeline = pipeline.map(_dataset_parser, num_parallel_calls=12)
    pipeline = pipeline.prefetch(32)
    pipeline = pipeline.apply(
        tf.contrib.data.batch_and_drop_remainder(batch_size))
    pipeline = pipeline.prefetch(2)

    batch = pipeline.make_one_shot_iterator().get_next()
    (images, cls_targets, cls_weights, box_targets, box_weights,
     num_positives, num_negatives, num_ignored, source_ids,
     image_scales) = batch

    def _tiled_batch_mean(per_example):
        # Mean over the batch, repeated per example as [batch_size, 1].
        mean = tf.reduce_mean(per_example)
        tiled = tf.tile(tf.expand_dims(mean, 0), [batch_size])
        return tf.reshape(tiled, [batch_size, 1])

    labels = {}
    labels['mean_num_positives'] = _tiled_batch_mean(num_positives)
    labels['mean_num_negatives'] = _tiled_batch_mean(num_negatives)
    labels['mean_num_ignored'] = _tiled_batch_mean(num_ignored)

    for level in range(params['min_level'], params['max_level'] + 1):
        labels['cls_targets_%d' % level] = cls_targets[level]
        labels['cls_weights_%d' % level] = cls_weights[level]
        labels['box_targets_%d' % level] = box_targets[level]
        labels['box_weights_%d' % level] = box_weights[level]
    labels['source_ids'] = source_ids
    labels['image_scales'] = image_scales
    return images, labels
def __call__(self, params):
    """Build the SSD input `tf.data.Dataset`.

    In training the dataset yields `(image, labels)` batches of augmented
    images and encoded labels; otherwise it yields dict batches with resized
    images and trimmed/padded groundtruth.

    Args:
        params: Dict with 'batch_size', 'use_bfloat16',
            'visualize_dataloader' (training only) and 'context'
            (training only).

    Returns:
        The assembled `tf.data.Dataset`.
    """
    decoder = tf_example_decoder.TfExampleDecoder()

    def _parse_example(data):
        """Convert one decoded example into model inputs."""
        with tf.name_scope('augmentation'):
            source_id = data['source_id']
            image = tf.image.convert_image_dtype(
                data['image'], dtype=tf.float32)
            raw_shape = tf.shape(image)
            boxes = data['groundtruth_boxes']
            classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

            # Only 80 of the 90 COCO classes are used.
            class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
            classes = tf.cast(tf.gather(class_map, classes),
                              dtype=tf.float32)

            if self._is_training:
                image, boxes, classes = ssd_crop(image, boxes, classes)
                # random_horizontal_flip() is hard coded to flip with 50%
                # chance.
                mlperf_log.ssd_print(
                    key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
                image, boxes = preprocessor.random_horizontal_flip(
                    image=image, boxes=boxes)

                # TODO(shibow): Investigate the parameters for color jitter.
                image = color_jitter(
                    image,
                    brightness=0.125,
                    contrast=0.5,
                    saturation=0.5,
                    hue=0.05)
                image = normalize_image(image)

                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)

                encoded_classes, encoded_boxes, num_matched_boxes = (
                    encode_labels(boxes, classes))

                # TODO(taylorrobie): Check that this cast is valid.
                encoded_classes = tf.cast(encoded_classes, tf.int32)

                labels = {
                    ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                    ssd_constants.BOXES: encoded_boxes,
                    ssd_constants.CLASSES: encoded_classes,
                }
                # This is for dataloader visualization; actual model doesn't
                # use this.
                if params['visualize_dataloader']:
                    box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                        scale_factors=ssd_constants.BOX_CODER_SCALES)
                    default_boxes = box_list.BoxList(
                        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                    decoded = box_coder.decode(
                        rel_codes=tf.squeeze(encoded_boxes),
                        anchors=default_boxes)
                    decoded_boxes = tf.expand_dims(decoded.get(), axis=0)
                    labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

                return image, labels

            # Non-training path: plain resize, no augmentation.
            mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE,
                                 value=ssd_constants.IMAGE_SIZE)
            image = tf.image.resize_images(
                image[tf.newaxis, :, :, :],
                size=(ssd_constants.IMAGE_SIZE,
                      ssd_constants.IMAGE_SIZE))[0, :, :, :]
            image = normalize_image(image)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            def trim_and_pad(inp_tensor, dim_1):
                """Limit the number of boxes, and pad if necessary."""
                limit = ssd_constants.MAX_NUM_EVAL_BOXES
                inp_tensor = inp_tensor[:limit]
                num_pad = limit - tf.shape(inp_tensor)[0]
                inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                return tf.reshape(inp_tensor, [limit, dim_1])

            boxes = trim_and_pad(boxes, 4)
            classes = trim_and_pad(classes, 1)
            return {
                ssd_constants.IMAGE: image,
                ssd_constants.BOXES: boxes,
                ssd_constants.CLASSES: classes,
                ssd_constants.SOURCE_ID:
                    tf.string_to_number(source_id, tf.int32),
                ssd_constants.RAW_SHAPE: raw_shape,
            }

    batch_size = params['batch_size']
    training = self._is_training
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=batch_size)

    if training:
        # Each host reads its own shard of the file list.
        dataset = dataset.shard(
            params['context'].num_hosts,
            params['context'].current_input_fn_deployment()[1])
        dataset = dataset.shuffle(
            tf.to_int64(256 / params['context'].num_hosts))

    # Prefetch data from files.
    def _read_records(filename):
        return tf.data.TFRecordDataset(filename).prefetch(1)

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            _read_records, cycle_length=32, sloppy=training))

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(decoder.decode, num_parallel_calls=64)

    if training:
        # Tag each example with "has at least one groundtruth box" and keep
        # only the tagged ones.
        def _with_has_boxes(data):
            has_boxes = tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)
            return data, has_boxes

        dataset = dataset.map(_with_has_boxes, num_parallel_calls=64)
        dataset = dataset.filter(lambda data, pred: pred)
        dataset = dataset.prefetch(batch_size * 64)
        dataset = dataset.cache().apply(
            tf.contrib.data.shuffle_and_repeat(64))
        dataset = dataset.prefetch(batch_size * 64)
        dataset = dataset.apply(
            tf.contrib.data.map_and_batch(
                lambda data, _: _parse_example(data),
                batch_size=batch_size,
                drop_remainder=True,
                num_parallel_calls=128))
    else:
        dataset = dataset.prefetch(batch_size * 64)
        dataset = dataset.apply(
            tf.contrib.data.map_and_batch(
                _parse_example,
                batch_size=batch_size,
                drop_remainder=True,
                num_parallel_calls=128))

    # Manually apply the double transpose trick for training data.
    def _transpose_dataset(image, labels):
        image = tf.transpose(image, [1, 2, 3, 0])
        for key in (ssd_constants.BOXES, ssd_constants.CLASSES):
            labels[key] = tf.transpose(labels[key], [1, 2, 0])
        return image, labels

    if self._transpose_input and training:
        dataset = dataset.map(_transpose_dataset, num_parallel_calls=128)

    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
    return dataset
def __call__(self, params):
    """Build the batched instance-segmentation input `tf.data.Dataset`.

    Args:
        params: Dict of hyperparameters. Keys read here include 'image_size',
            'min_level', 'max_level', 'num_scales', 'aspect_ratios',
            'anchor_scale', 'num_classes', the four 'rpn_*' anchor-labeler
            settings, 'use_category', 'skip_crowd_during_training',
            'input_rand_hflip', 'train_scale_min', 'train_scale_max',
            'gt_mask_size', 'use_bfloat16', 'transpose_input' and the
            optional 'batch_size' (defaults to 1).

    Returns:
        A `tf.data.Dataset`. In PREDICT mode each element is a features
        dict; otherwise each element is an `(images, labels)` tuple.
    """
    image_size = (params['image_size'], params['image_size'])
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], image_size)
    anchor_labeler = anchors.AnchorLabeler(
        input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])
    example_decoder = tf_example_decoder.TfExampleDecoder(
        use_instance_mask=True)
    is_training = self._mode == tf.estimator.ModeKeys.TRAIN
    max_instances = self._max_num_instances

    def _dataset_parser(value):
        """Parse one record into a model input image and learning targets.

        Args:
            value: The serialized record passed to `example_decoder.decode`.

        Returns:
            A 10-tuple:
            image: Image tensor after normalization, optional flip, and
                resize/crop by the input processor (cast to bfloat16 when
                params['use_bfloat16'] is set).
            score_targets: Score targets from
                `anchor_labeler.label_anchors`; indexed by level in
                `_process_example`.
            box_targets: Box targets from `anchor_labeler.label_anchors`;
                indexed by level in `_process_example`.
            source_id: Numeric source image id; an empty id becomes -1.
            image_info: Stacked [scaled height, scaled width, image scale,
                original height, original width].
            boxes: Groundtruth boxes multiplied by the image scale and
                padded with -1 to [self._max_num_instances, 4].
            is_crowds: Crowd flags cast to float32 and padded with 0 to
                [self._max_num_instances, 1].
            areas: Groundtruth areas padded with -1 to
                [self._max_num_instances, 1].
            classes: Groundtruth classes padded with -1 to
                [self._max_num_instances, 1].
            cropped_gt_masks: Groundtruth masks from `crop_gt_masks`,
                padded with -1 and reshaped to [self._max_num_instances,
                gt_mask_size + 4, gt_mask_size + 4].
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            source_id = data['source_id']
            image = data['image']
            instance_masks = data['groundtruth_instance_masks']
            boxes = data['groundtruth_boxes']
            classes = data['groundtruth_classes']
            # Reshaped once; the original repeated this statement a second
            # time with no effect on the result.
            classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
            areas = data['groundtruth_area']
            is_crowds = data['groundtruth_is_crowd']

            if not params['use_category']:
                # Collapse all positive class ids to 1.0.
                classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

            if params['skip_crowd_during_training'] and is_training:
                indices = tf.where(
                    tf.logical_not(data['groundtruth_is_crowd']))
                classes = tf.gather_nd(classes, indices)
                boxes = tf.gather_nd(boxes, indices)
                instance_masks = tf.gather_nd(instance_masks, indices)

            input_processor = InstanceSegmentationInputProcessor(
                image, image_size, boxes, classes, instance_masks)
            input_processor.normalize_image()
            if is_training and params['input_rand_hflip']:
                input_processor.random_horizontal_flip()
            if is_training:
                input_processor.set_training_random_scale_factors(
                    params['train_scale_min'], params['train_scale_max'])
            else:
                input_processor.set_scale_factors_to_output_size()
            image = input_processor.resize_and_crop_image()
            boxes, classes = input_processor.resize_and_crop_boxes()
            instance_masks = input_processor.resize_and_crop_masks()
            cropped_gt_masks = input_processor.crop_gt_masks(
                instance_masks, boxes, params['gt_mask_size'], image_size)

            # Assign anchors.
            score_targets, box_targets = anchor_labeler.label_anchors(
                boxes, classes)

            # An empty source id is mapped to '-1' before numeric conversion.
            source_id = tf.where(
                tf.equal(source_id, tf.constant('')), '-1', source_id)
            source_id = tf.string_to_number(source_id)

            image_scale = input_processor.image_scale_to_original
            scaled_height = input_processor.get_height_length()
            scaled_width = input_processor.get_width_length()
            # NOTE(review): get_original_height / get_original_width are
            # read without call parentheses; presumably properties - confirm.
            image_info = tf.stack([
                tf.to_float(scaled_height),
                tf.to_float(scaled_width),
                image_scale,
                tf.to_float(input_processor.get_original_height),
                tf.to_float(input_processor.get_original_width),
            ])

            # Pad groundtruth data for evaluation.
            boxes *= image_scale
            is_crowds = tf.cast(is_crowds, dtype=tf.float32)
            boxes = pad_to_fixed_size(boxes, -1, [max_instances, 4])
            is_crowds = pad_to_fixed_size(is_crowds, 0, [max_instances, 1])
            areas = pad_to_fixed_size(areas, -1, [max_instances, 1])
            classes = pad_to_fixed_size(classes, -1, [max_instances, 1])

            # Pads cropped_gt_masks: flatten each mask, pad, then restore
            # the square (gt_mask_size + 4) layout.
            mask_side = params['gt_mask_size'] + 4
            cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                          [max_instances, -1])
            cropped_gt_masks = pad_to_fixed_size(
                cropped_gt_masks, -1, [max_instances, mask_side**2])
            cropped_gt_masks = tf.reshape(
                cropped_gt_masks, [max_instances, mask_side, mask_side])

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)
            return (image, score_targets, box_targets, source_id,
                    image_info, boxes, is_crowds, areas, classes,
                    cropped_gt_masks)

    batch_size = params['batch_size'] if 'batch_size' in params else 1
    dataset = tf.data.Dataset.list_files(self._file_pattern,
                                         shuffle=is_training)
    if is_training:
        dataset = dataset.repeat()

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(filename).prefetch(1)
        return dataset

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            _prefetch_dataset, cycle_length=32, sloppy=is_training))
    if is_training:
        dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    def _process_example(images, score_targets, box_targets, source_ids,
                         image_info, boxes, is_crowds, areas, classes,
                         cropped_gt_masks):
        """Processes one batch of data."""
        # Transposes images for TPU performance.
        # Given the batch size, the batch dimension (N) goes to either the
        # minor ((H, W, C, N) when N > C) or the second-minor
        # ((H, W, N, C) when N < C) dimension. Here, we assume N is 4 or 8
        # and C is 3, so we use (H, W, C, N).
        if params['transpose_input'] and is_training:
            images = tf.transpose(images, [1, 2, 3, 0])

        labels = {}
        for level in range(params['min_level'], params['max_level'] + 1):
            labels['score_targets_%d' % level] = score_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]

        # Concatenate groundtruth annotations to a tensor.
        groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                     axis=2)
        labels['source_ids'] = source_ids
        labels['groundtruth_data'] = groundtruth_data
        labels['image_info'] = image_info
        labels['cropped_gt_masks'] = cropped_gt_masks

        if self._mode == tf.estimator.ModeKeys.PREDICT:
            features = dict(
                images=images,
                image_info=image_info,
                groundtruth_data=groundtruth_data,
                source_ids=source_ids)
            return features
        else:
            return images, labels

    dataset = dataset.map(_process_example)
    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

    if self._num_examples > 0:
        dataset = dataset.take(self._num_examples)
    if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loop at the
        # first batch. This reduces variance in performance and is useful in
        # testing.
        dataset = dataset.take(1).cache().repeat()
    return dataset
def __call__(self, params):
    """Build the batched input `tf.data.Dataset` with dict features.

    Only PREDICT and TRAIN modes are handled by the record parser; any other
    `self._mode` raises a ValueError while the dataset is being built.

    Args:
        params: Dict of hyperparameters. Keys read here include 'image_size',
            'min_level', 'max_level', 'num_scales', 'aspect_ratios',
            'anchor_scale', 'num_classes', the four 'rpn_*' anchor-labeler
            settings, 'use_bfloat16', 'transpose_input', the optional
            'batch_size' (defaults to 1) and, in TRAIN mode, 'use_category',
            'skip_crowd_during_training', 'input_rand_hflip',
            'train_scale_min', 'train_scale_max' and 'gt_mask_size'.

    Returns:
        A `tf.data.Dataset` of features dicts (PREDICT) or
        `(features, labels)` tuples (TRAIN).

    Raises:
        ValueError: If `self._mode` is neither PREDICT nor TRAIN.
    """
    image_size = (params['image_size'], params['image_size'])
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], image_size)
    anchor_labeler = anchors.AnchorLabeler(
        input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])
    example_decoder = tf_example_decoder.TfExampleDecoder(
        use_instance_mask=self._use_instance_mask)
    is_training = self._mode == tf.estimator.ModeKeys.TRAIN
    max_instances = self._max_num_instances

    def _dataset_parser(value):
        """Parse one record into model features and (for training) labels.

        Args:
            value: The serialized record passed to `example_decoder.decode`.

        Returns:
            In PREDICT mode, a features dict; in TRAIN mode, a
            `(features, labels)` tuple.

            features:
                images: Image tensor after normalization and resize/crop by
                    the input processor (bfloat16 when
                    params['use_bfloat16'] is set).
                image_info: Value of `input_processor.get_image_info()`.
                source_ids: Numeric source image id; an empty id becomes -1.
            labels (TRAIN only):
                score_targets_<level>, box_targets_<level>: Per-level
                    targets from `anchor_labeler.label_anchors` for each
                    level in [min_level, max_level].
                gt_boxes: Groundtruth boxes padded with -1 to
                    [self._max_num_instances, 4].
                gt_classes: Groundtruth classes padded with -1 to
                    [self._max_num_instances, 1].
                cropped_gt_masks: Present only when
                    `self._use_instance_mask`; groundtruth masks from
                    `crop_gt_masks`, padded with -1 and reshaped to
                    [self._max_num_instances, gt_mask_size + 4,
                    gt_mask_size + 4].

        Raises:
            ValueError: If `self._mode` is neither PREDICT nor TRAIN.
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            image = data['image']
            source_id = data['source_id']
            # An empty source id is mapped to '-1' before numeric conversion.
            source_id = tf.where(
                tf.equal(source_id, tf.constant('')), '-1', source_id)
            source_id = tf.string_to_number(source_id)

            if self._mode == tf.estimator.ModeKeys.PREDICT:
                input_processor = InstanceSegmentationInputProcessor(
                    image, image_size)
                input_processor.normalize_image()
                input_processor.set_scale_factors_to_output_size()
                image = input_processor.resize_and_crop_image()
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)

                image_info = input_processor.get_image_info()
                return {
                    'images': image,
                    'image_info': image_info,
                    'source_ids': source_id
                }
            elif self._mode == tf.estimator.ModeKeys.TRAIN:
                instance_masks = None
                if self._use_instance_mask:
                    instance_masks = data['groundtruth_instance_masks']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(
                    tf.cast(classes, dtype=tf.float32), [-1, 1])
                if not params['use_category']:
                    # Collapse all positive class ids to 1.0.
                    classes = tf.cast(
                        tf.greater(classes, 0), dtype=tf.float32)

                # Already inside the TRAIN branch, so no extra mode check.
                if params['skip_crowd_during_training']:
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)
                    if self._use_instance_mask:
                        instance_masks = tf.gather_nd(
                            instance_masks, indices)

                input_processor = InstanceSegmentationInputProcessor(
                    image, image_size, boxes, classes, instance_masks)
                input_processor.normalize_image()
                if params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()

                input_processor.set_training_random_scale_factors(
                    params['train_scale_min'], params['train_scale_max'])
                image = input_processor.resize_and_crop_image()

                boxes, classes = input_processor.resize_and_crop_boxes()
                if self._use_instance_mask:
                    instance_masks = input_processor.resize_and_crop_masks()
                    cropped_gt_masks = input_processor.crop_gt_masks(
                        instance_masks, boxes, params['gt_mask_size'],
                        image_size)

                # Assign anchors.
                score_targets, box_targets = anchor_labeler.label_anchors(
                    boxes, classes)

                # Pad groundtruth data.
                image_info = input_processor.get_image_info()
                # NOTE(review): index 2 is presumably the image scale -
                # confirm against get_image_info().
                boxes *= image_info[2]
                boxes = pad_to_fixed_size(boxes, -1, [max_instances, 4])
                classes = pad_to_fixed_size(classes, -1, [max_instances, 1])

                # Pads cropped_gt_masks: flatten each mask, pad, then
                # restore the square (gt_mask_size + 4) layout.
                if self._use_instance_mask:
                    mask_side = params['gt_mask_size'] + 4
                    cropped_gt_masks = tf.reshape(
                        cropped_gt_masks, [max_instances, -1])
                    cropped_gt_masks = pad_to_fixed_size(
                        cropped_gt_masks, -1,
                        [max_instances, mask_side**2])
                    cropped_gt_masks = tf.reshape(
                        cropped_gt_masks,
                        [max_instances, mask_side, mask_side])

                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)

                features = {}
                features['images'] = image
                features['image_info'] = image_info
                features['source_ids'] = source_id
                labels = {}
                for level in range(params['min_level'],
                                   params['max_level'] + 1):
                    labels['score_targets_%d' % level] = score_targets[level]
                    labels['box_targets_%d' % level] = box_targets[level]
                labels['gt_boxes'] = boxes
                labels['gt_classes'] = classes
                if self._use_instance_mask:
                    labels['cropped_gt_masks'] = cropped_gt_masks
                return (features, labels)
            else:
                # Previously fell through and returned None, which surfaces
                # as an opaque tf.data structure error.
                raise ValueError(
                    'Unsupported mode for input parsing: %s' % self._mode)

    batch_size = params['batch_size'] if 'batch_size' in params else 1
    dataset = tf.data.Dataset.list_files(self._file_pattern,
                                         shuffle=is_training)
    if is_training:
        dataset = dataset.repeat()

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(filename).prefetch(1)
        return dataset

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            _prefetch_dataset, cycle_length=32, sloppy=is_training))
    if is_training:
        dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(
            _dataset_parser,
            batch_size=batch_size,
            num_parallel_batches=64,
            drop_remainder=True))

    # Transposes images for TPU performance.
    # Given the batch size, the batch dimension (N) goes to either the minor
    # ((H, W, C, N) when N > C) or the second-minor ((H, W, N, C) when N < C)
    # dimension. Here, we assume N is 4 or 8 and C is 3, so we use
    # (H, W, C, N).
    if params['transpose_input'] and is_training:

        def _transpose_images(features, labels):
            features['images'] = tf.transpose(features['images'],
                                              [1, 2, 3, 0])
            return features, labels

        dataset = dataset.map(_transpose_images, num_parallel_calls=64)

    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

    if self._num_examples > 0:
        dataset = dataset.take(self._num_examples)
    if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loop at the
        # first batch. This reduces variance in performance and is useful in
        # testing.
        dataset = dataset.take(1).cache().repeat()
    return dataset
def __call__(self, params):
    """Assemble the SSD input pipeline and return it as a `tf.data.Dataset`.

    In training the batches are `(image, labels)` pairs; otherwise each batch
    is a single dict that carries the image under `ssd_constants.IMAGE`.

    Args:
        params: Run configuration mapping. Keys read here: 'batch_size',
            'use_bfloat16', 'visualize_dataloader', 'eval_samples',
            'conv0_space_to_depth', and for sharding either 'context' or
            'dataset_num_shards' / 'dataset_index'.

    Returns:
        The batched and prefetched dataset.
    """
    decoder = tf_example_decoder.TfExampleDecoder()

    def _pads_eval_set():
        # True when not training and self._count exceeds
        # params['eval_samples'].
        return (not self._is_training
                and self._count > params['eval_samples'])

    def _parse_example(data):
        """Turn one decoded example into model inputs."""
        with tf.name_scope('augmentation'):
            source_id = data['source_id']
            pixels = data['image']  # dtype uint8
            raw_shape = tf.shape(pixels)
            gt_boxes = data['groundtruth_boxes']
            gt_classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

            # Only 80 of the 90 COCO classes are used.
            class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
            gt_classes = tf.gather(class_map, gt_classes)
            gt_classes = tf.cast(gt_classes, dtype=tf.float32)

            if self._is_training:
                pixels, gt_boxes, gt_classes = ssd_crop(
                    pixels, gt_boxes, gt_classes)
                # ssd_crop resizes and returns a float32 image whose range
                # is still 0--255, so scale to [0, 1] only now; doing it
                # before the crop would incur an extra dtype cast (and
                # memory copy).
                pixels /= 255.0

                # random_horizontal_flip() is hard coded to flip with 50%
                # chance.
                pixels, gt_boxes = preprocessor.random_horizontal_flip(
                    image=pixels, boxes=gt_boxes)

                # TODO(shibow): Investigate the parameters for color jitter.
                pixels = color_jitter(pixels,
                                      brightness=0.125,
                                      contrast=0.5,
                                      saturation=0.5,
                                      hue=0.05)

                if params['use_bfloat16']:
                    pixels = tf.cast(pixels, dtype=tf.bfloat16)

                (encoded_classes, encoded_boxes,
                 num_matched_boxes) = encode_labels(gt_boxes, gt_classes)

                # TODO(taylorrobie): Check that this cast is valid.
                encoded_classes = tf.cast(encoded_classes, tf.int32)

                train_labels = {
                    ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                    ssd_constants.BOXES: encoded_boxes,
                    ssd_constants.CLASSES: tf.squeeze(encoded_classes,
                                                      axis=1),
                }

                # This is for dataloader visualization; actual model
                # doesn't use this.
                if params['visualize_dataloader']:
                    coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                        scale_factors=ssd_constants.BOX_CODER_SCALES)
                    default_anchors = box_list.BoxList(
                        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                    decoded = coder.decode(
                        rel_codes=tf.squeeze(encoded_boxes),
                        anchors=default_anchors).get()
                    decoded = tf.expand_dims(decoded, axis=0)
                    train_labels['decoded_boxes'] = tf.squeeze(decoded)

                return pixels, train_labels

            # ----- evaluation / prediction path -----
            pixels = tf.image.resize_images(
                pixels,
                size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
            # resize_images returns a float32 image without changing its
            # range. Divide by 255 to convert it to [0, 1].
            pixels /= 255.

            if params['use_bfloat16']:
                pixels = tf.cast(pixels, dtype=tf.bfloat16)

            def _clip_or_pad(tensor, width):
                """Limit the number of boxes, and pad if necessary."""
                limit = ssd_constants.MAX_NUM_EVAL_BOXES
                tensor = tensor[:limit]
                missing = limit - tf.shape(tensor)[0]
                tensor = tf.pad(tensor, [[0, missing], [0, 0]])
                return tf.reshape(tensor, [limit, width])

            gt_boxes = _clip_or_pad(gt_boxes, 4)
            gt_classes = _clip_or_pad(gt_classes, 1)

            eval_sample = {
                ssd_constants.IMAGE: pixels,
                ssd_constants.BOXES: gt_boxes,
                ssd_constants.CLASSES: gt_classes,
                ssd_constants.SOURCE_ID: tf.string_to_number(
                    source_id, tf.int32),
                ssd_constants.RAW_SHAPE: raw_shape,
            }

            if _pads_eval_set():
                eval_sample[ssd_constants.IS_PADDED] = data[
                    ssd_constants.IS_PADDED]
            return eval_sample

    batch_size = params['batch_size']
    ds = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)

    # Shard the file list (training or distributed eval), and shuffle the
    # shard's files when training.
    if self._is_training or self._distributed_eval:
        if 'context' in params:
            shard_count = params['context'].num_hosts
            ds = ds.shard(
                shard_count,
                params['context'].current_input_fn_deployment()[1])
        else:
            shard_count = params['dataset_num_shards']
            ds = ds.shard(shard_count, params['dataset_index'])
        if self._is_training:
            ds = ds.shuffle(tf.to_int64(256 / shard_count))

    # Prefetch data from files.
    def _read_records(filename):
        return tf.data.TFRecordDataset(filename).prefetch(1)

    interleave = tf.data.experimental.parallel_interleave(
        _read_records, cycle_length=32, sloppy=self._is_training)
    ds = ds.apply(interleave)

    # Parse the fetched records to input tensors for model function.
    ds = ds.map(decoder.decode, num_parallel_calls=64)

    def _flag_padded(example):
        example[ssd_constants.IS_PADDED] = tf.constant(True, dtype=tf.bool)
        return example

    def _flag_not_padded(example):
        example[ssd_constants.IS_PADDED] = tf.constant(False,
                                                       dtype=tf.bool)
        return example

    # Pad dataset to the desired size and mark if the data is padded.
    # During eval/predict, if local_batch_size * num_shards > 5000, the
    # original dataset size won't fit the computation on that number of
    # shards. In that case a second copy of the data, marked `is_padded`,
    # is appended to the original data (marked not padded) and the result
    # is truncated to the per-shard count.
    if _pads_eval_set():
        padding = ds.map(_flag_padded)
        ds = ds.map(_flag_not_padded)
        ds = ds.concatenate(padding).take(
            self._count // params['dataset_num_shards'])

    if self._is_training:
        def _tag_has_boxes(example):
            has_boxes = tf.greater(
                tf.shape(example['groundtruth_boxes'])[0], 0)
            return example, has_boxes

        def _keep_tagged(example, has_boxes):
            return has_boxes

        def _parse_tagged(example, _):
            return _parse_example(example)

        ds = ds.map(_tag_has_boxes, num_parallel_calls=64)
        ds = ds.filter(_keep_tagged)
        # Prefetching and caching increases the memory usage, so disable
        # when using fake data.
        if not self._use_fake_data:
            ds = ds.cache().shuffle(64).repeat()
        ds = ds.map(_parse_tagged, num_parallel_calls=64)
    else:
        ds = ds.prefetch(batch_size * 64)
        ds = ds.map(_parse_example, num_parallel_calls=64)
    ds = ds.batch(batch_size=batch_size, drop_remainder=True)

    if params['conv0_space_to_depth']:
        def _space_to_depth_training_fn(images, labels):
            images = fused_transpose_and_space_to_depth(
                images,
                block_size=ssd_constants.SPACE_TO_DEPTH_BLOCK_SIZE,
                transpose_input=self._transpose_input)
            if self._transpose_input and batch_size > 8:
                labels[ssd_constants.BOXES] = tf.transpose(
                    labels[ssd_constants.BOXES], [1, 2, 0])
            return images, labels

        def _space_to_depth_eval_fn(labels):
            images = labels[ssd_constants.IMAGE]
            labels[ssd_constants.IMAGE] = (
                fused_transpose_and_space_to_depth(
                    images,
                    block_size=ssd_constants.SPACE_TO_DEPTH_BLOCK_SIZE,
                    transpose_input=False))
            return labels

        chosen_fn = (_space_to_depth_training_fn
                     if self._is_training else _space_to_depth_eval_fn)
        ds = ds.map(chosen_fn, num_parallel_calls=64)
    elif self._transpose_input and self._is_training:
        # Manually apply the double transpose trick for training data.
        def _transpose_dataset(image, labels):
            if batch_size > 8:
                image = tf.transpose(image, [1, 2, 3, 0])
                labels[ssd_constants.BOXES] = tf.transpose(
                    labels[ssd_constants.BOXES], [1, 2, 0])
            else:
                image = tf.transpose(image, [1, 2, 0, 3])
            return image, labels

        ds = ds.map(_transpose_dataset, num_parallel_calls=64)

    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    threading_options = tf.data.Options()
    threading_options.experimental_threading.max_intra_op_parallelism = 1
    threading_options.experimental_threading.private_threadpool_size = 48
    ds = ds.with_options(threading_options)

    if self._use_fake_data:
        # Loop forever over the first batch only.
        ds = ds.take(1).cache().repeat()

    return ds
def __call__(self, params):
    """Build the input pipeline and return one batch as `(images, labels)`.

    Unlike a dataset-returning input function, this pulls tensors from a
    one-shot iterator over the batched dataset.

    Args:
        params: Run configuration mapping. Keys read here: 'min_level',
            'max_level', 'num_scales', 'aspect_ratios', 'anchor_scale',
            'image_size', 'num_classes', 'skip_crowd', 'input_rand_hflip',
            'use_bfloat16' and 'batch_size'.

    Returns:
        A tuple `(images, labels)` where `labels` is a dict holding
        'mean_num_positives', per-level 'cls_targets_<level>' and
        'box_targets_<level>', 'source_ids' and 'image_scales'.
    """
    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])
    decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
        """Parse data to a fixed dimension input image and learning targets."""
        with tf.name_scope('parser'):
            record = decoder.decode(value)
            # NOTE: an xView-specific step that would strip the last four
            # characters of source_id (e.g. '122.tif' -> '122') is disabled
            # in this version; source_id is converted as-is further down.
            source_id = record['source_id']
            image = record['image']
            gt_boxes = record['groundtruth_boxes']
            gt_classes = tf.reshape(
                tf.cast(record['groundtruth_classes'], dtype=tf.float32),
                [-1, 1])

            # Handle crowd annotations. As crowd annotations are not large
            # instances, the model ignores them in training.
            if params['skip_crowd']:
                keep = tf.where(
                    tf.logical_not(record['groundtruth_is_crowd']))
                gt_classes = tf.gather_nd(gt_classes, keep)
                gt_boxes = tf.gather_nd(gt_boxes, keep)

            # The image normalization is identical to Cloud TPU ResNet-50.
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            image = _normalize_image(image)

            if params['input_rand_hflip']:
                image, gt_boxes = preprocessor.random_horizontal_flip(
                    image, boxes=gt_boxes)

            shape_before_resize = tf.shape(image)
            image, _ = preprocessor.resize_to_range(
                image,
                min_dimension=params['image_size'],
                max_dimension=params['image_size'])
            # Ratio of the pre-resize height to the post-resize height.
            height_before = tf.to_float(shape_before_resize[0])
            height_after = tf.to_float(tf.shape(image)[0])
            image_scale = height_before / height_after

            image, gt_boxes = preprocessor.scale_boxes_to_pixel_coordinates(
                image, gt_boxes, keypoints=None)
            image = tf.image.pad_to_bounding_box(
                image, 0, 0, params['image_size'], params['image_size'])

            cls_targets, box_targets, num_positives = (
                labeler.label_anchors(gt_boxes, gt_classes))

            source_id = tf.string_to_number(source_id, out_type=tf.float32)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            return (image, cls_targets, box_targets, num_positives,
                    source_id, image_scale)

    batch_size = params['batch_size']

    # File names are listed in order and then shuffled explicitly.
    ds = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    ds = ds.shuffle(buffer_size=1024)
    if self._is_training:
        ds = ds.repeat()

    def prefetch_dataset(filename):
        return tf.data.TFRecordDataset(filename).prefetch(1)

    interleave = tf.contrib.data.parallel_interleave(
        prefetch_dataset, cycle_length=32, sloppy=True)
    ds = ds.apply(interleave)
    ds = ds.shuffle(20)

    ds = ds.map(_dataset_parser, num_parallel_calls=64)
    ds = ds.prefetch(batch_size)
    ds = ds.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
    ds = ds.prefetch(1)

    batch = ds.make_one_shot_iterator().get_next()
    (images, cls_targets, box_targets, num_positives, source_ids,
     image_scales) = batch

    # Count num_positives in a batch: the batch mean is tiled so that every
    # example in the batch carries the same value.
    mean_positives = tf.reduce_mean(num_positives)
    tiled_mean = tf.tile(tf.expand_dims(mean_positives, 0), [batch_size])
    labels = {'mean_num_positives': tf.reshape(tiled_mean, [batch_size, 1])}

    for level in range(params['min_level'], params['max_level'] + 1):
        labels['cls_targets_%d' % level] = cls_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]

    labels['source_ids'] = source_ids
    labels['image_scales'] = image_scales
    return images, labels
def _create_dataset_parser_fn(self, params):
    """Return a function that parses one serialized example.

    Args:
        params: Run configuration mapping. The returned parser reads
            'image_size', 'max_level', 'min_level', 'use_bfloat16',
            'use_category', 'skip_crowd_during_training',
            'input_rand_hflip', 'gt_mask_size', 'num_scales',
            'aspect_ratios', 'anchor_scale', 'num_classes' and the
            'rpn_*' anchor-labeling settings.

    Returns:
        The `_dataset_parser` closure described below.
    """
    decoder = tf_example_decoder.TfExampleDecoder(
        use_instance_mask=self._use_instance_mask)

    def _dataset_parser(value):
        """Parse data to a fixed dimension input image and learning targets.

        Args:
            value: The example handed to `decoder.decode`.

        Returns:
            In PREDICT mode, a dict with:
                images: the normalized, resized-and-padded image.
                image_info: image information returned by
                    `preprocess_ops.resize_and_pad`.
                source_ids: source image id as a number; an empty id
                    becomes -1.
            In TRAIN mode, a tuple `(features, labels)` where `features`
            has the same three keys and `labels` holds:
                score_targets_<level>, box_targets_<level>: anchor
                    targets for each level in [min_level, max_level].
                gt_boxes: groundtruth boxes padded with -1 to
                    [self._max_num_instances, 4].
                gt_classes: groundtruth classes padded with -1 to
                    [self._max_num_instances, 1].
                cropped_gt_masks: (only when instance masks are used)
                    groundtruth masks cropped by the bounding box and
                    resized to a fixed size determined by
                    params['gt_mask_size'].
            In any other mode, None.
        """
        with tf.name_scope('parser'):
            record = decoder.decode(value)

            image = tf.image.convert_image_dtype(record['image'],
                                                 dtype=tf.float32)

            # An empty source id is replaced by '-1' before the numeric
            # conversion.
            raw_id = record['source_id']
            raw_id = tf.where(tf.equal(raw_id, tf.constant('')), '-1',
                              raw_id)
            source_id = tf.string_to_number(raw_id)

            mode = self._mode
            if mode == tf.estimator.ModeKeys.PREDICT:
                image = preprocess_ops.normalize_image(image)
                image, image_info, _, _ = preprocess_ops.resize_and_pad(
                    image, params['image_size'], 2**params['max_level'])
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                return {
                    'images': image,
                    'image_info': image_info,
                    'source_ids': source_id,
                }

            if mode != tf.estimator.ModeKeys.TRAIN:
                # Only PREDICT and TRAIN produce outputs.
                return None

            with_masks = self._use_instance_mask
            masks = record['groundtruth_instance_masks'] if with_masks else None
            gt_boxes = record['groundtruth_boxes']
            gt_classes = tf.reshape(
                tf.cast(record['groundtruth_classes'], dtype=tf.float32),
                [-1, 1])
            if not params['use_category']:
                # Collapse all categories into a single foreground class.
                gt_classes = tf.cast(tf.greater(gt_classes, 0),
                                     dtype=tf.float32)

            # This branch is TRAIN-only, so the flag alone decides.
            if params['skip_crowd_during_training']:
                keep = tf.where(
                    tf.logical_not(record['groundtruth_is_crowd']))
                gt_classes = tf.gather_nd(gt_classes, keep)
                gt_boxes = tf.gather_nd(gt_boxes, keep)
                if with_masks:
                    masks = tf.gather_nd(masks, keep)

            image = preprocess_ops.normalize_image(image)

            # Random flipping.
            if params['input_rand_hflip']:
                flipped = preprocess_ops.random_horizontal_flip(
                    image, boxes=gt_boxes, masks=masks)
                if with_masks:
                    image, gt_boxes, masks = flipped
                else:
                    image, gt_boxes = flipped

            # Scaling and padding.
            image, image_info, gt_boxes, masks = (
                preprocess_ops.resize_and_pad(image,
                                              params['image_size'],
                                              2**params['max_level'],
                                              boxes=gt_boxes,
                                              masks=masks))
            padded_height, padded_width, _ = image.get_shape().as_list()
            padded_size = (padded_height, padded_width)

            mask_size = params['gt_mask_size']
            if with_masks:
                cropped_gt_masks = preprocess_ops.crop_gt_masks(
                    masks, gt_boxes, mask_size, padded_size)

            # Anchors are built here because they depend on the padded
            # image size.
            labeler = anchors.AnchorLabeler(
                anchors.Anchors(params['min_level'], params['max_level'],
                                params['num_scales'],
                                params['aspect_ratios'],
                                params['anchor_scale'], padded_size),
                params['num_classes'],
                params['rpn_positive_overlap'],
                params['rpn_negative_overlap'],
                params['rpn_batch_size_per_im'],
                params['rpn_fg_fraction'])

            # Assign anchors.
            score_targets, box_targets = labeler.label_anchors(
                gt_boxes, gt_classes)

            # Pad groundtruth data.
            max_instances = self._max_num_instances
            gt_boxes *= image_info[2]
            gt_boxes = preprocess_ops.pad_to_fixed_size(
                gt_boxes, -1, [max_instances, 4])
            gt_classes = preprocess_ops.pad_to_fixed_size(
                gt_classes, -1, [max_instances, 1])

            # Pads cropped_gt_masks: flatten each mask, pad, then restore
            # the (gt_mask_size + 4) square layout.
            if with_masks:
                side = mask_size + 4
                cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                              [max_instances, -1])
                cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
                    cropped_gt_masks, -1, [max_instances, side**2])
                cropped_gt_masks = tf.reshape(
                    cropped_gt_masks, [max_instances, side, side])

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            features = {
                'images': image,
                'image_info': image_info,
                'source_ids': source_id,
            }

            labels = {}
            for level in range(params['min_level'],
                               params['max_level'] + 1):
                labels['score_targets_%d' % level] = score_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
            labels['gt_boxes'] = gt_boxes
            labels['gt_classes'] = gt_classes
            if with_masks:
                labels['cropped_gt_masks'] = cropped_gt_masks
            return (features, labels)

    return _dataset_parser
def __call__(self, params):
    """Build the detection input pipeline and return a batched dataset.

    Each element of the returned dataset is an `(images, labels)` pair
    produced by `_process_example`.

    Args:
        params: Run configuration mapping. Keys read here: 'min_level',
            'max_level', 'num_scales', 'aspect_ratios', 'anchor_scale',
            'image_size', 'num_classes', 'input_rand_hflip',
            'train_scale_min', 'train_scale_max' and 'batch_size'.

    Returns:
        A `tf.data.Dataset` of `(images, labels)` batches.
    """
    input_anchors = anchors.Anchors(params['min_level'],
                                    params['max_level'],
                                    params['num_scales'],
                                    params['aspect_ratios'],
                                    params['anchor_scale'],
                                    params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                           params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
        """Parse data to a fixed dimension input image and learning targets.

        Args:
            value: One record read from the TFRecord files; it is passed to
                `example_decoder.decode`.

        Returns:
            A tuple, in this order:
            image: Image tensor after normalization and resize/crop.
            cls_targets: per-level class targets from the anchor labeler.
            box_targets: per-level box targets from the anchor labeler.
            num_positives: Number of positive anchors in the image.
            image_scale: Scale of the processed image to the original
                image.
            boxes: Groundtruth boxes, padded with -1 to
                [self._max_num_instances, 4].
            areas: Groundtruth areas, padded with -1 to
                [self._max_num_instances, 1].
            classes: Groundtruth classes, padded with -1 to
                [self._max_num_instances, 1].
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            # Shape notes below come from the original author's comments
            # and are not verified here.
            image = data['image']  # presumably [height, width, 3]
            boxes = data['groundtruth_boxes']  # presumably [M, 4] in [0, 1]
            classes = data['groundtruth_classes']  # presumably [M]
            # Cast once to float32 and reshape to [M, 1]. (The original
            # repeated this exact statement a second time.)
            classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                 [-1, 1])
            areas = data['groundtruth_area']

            # NOTE: The autoaugment method works best when used alongside
            # the standard horizontal flipping of images along with size
            # jittering and normalization. The autoaugment call itself is
            # disabled in this variant.

            input_processor = DetectionInputProcessor(
                image, params['image_size'], boxes, classes)
            # Author's question: normalize_image yields [-1.0, 1.0]; could
            # this be replaced by YOLO-style normalization (dividing by
            # 255)?
            input_processor.normalize_image()
            # NOTE(review): the collapsed source had a bare '#' right
            # before this `if`; it is treated as an empty comment, i.e. the
            # random flip is active during training -- please confirm.
            if self._is_training and params['input_rand_hflip']:
                input_processor.random_horizontal_flip()
            if self._is_training:
                input_processor.set_training_random_scale_factors(
                    params['train_scale_min'], params['train_scale_max'])
            else:
                input_processor.set_scale_factors_to_output_size()
            image = input_processor.resize_and_crop_image()
            boxes, classes = input_processor.resize_and_crop_boxes()

            # Assign anchors.
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(boxes, classes)

            # Pad groundtruth data for evaluation. Boxes are first scaled
            # by the processed-to-original image scale.
            image_scale = input_processor.image_scale_to_original
            boxes *= image_scale
            boxes = pad_to_fixed_size(boxes, -1,
                                      [self._max_num_instances, 4])
            areas = pad_to_fixed_size(areas, -1,
                                      [self._max_num_instances, 1])
            classes = pad_to_fixed_size(classes, -1,
                                        [self._max_num_instances, 1])
            # NOTE: is_crowd handling and the optional bfloat16 cast of the
            # image are disabled in this variant.
            return (image, cls_targets, box_targets, num_positives,
                    image_scale, boxes, areas, classes)

    batch_size = params['batch_size']
    dataset = tf.data.Dataset.list_files(self._file_pattern,
                                         shuffle=self._is_training)
    if self._is_training:
        dataset = dataset.repeat()

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(filename).prefetch(1)
        return dataset

    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            _prefetch_dataset, cycle_length=32, sloppy=self._is_training))
    if self._is_training:
        dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    def _process_example(images, cls_targets, box_targets, num_positives,
                         image_scales, boxes, areas, classes):
        """Processes one batch of data into `(images, labels)`."""
        labels = {}
        # Count num_positives in a batch: the batch mean is tiled so every
        # example carries the same value.
        num_positives_batch = tf.reduce_mean(num_positives)
        labels['mean_num_positives'] = tf.reshape(
            tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
            [batch_size, 1])

        for level in range(params['min_level'], params['max_level'] + 1):
            labels['cls_targets_%d' % level] = cls_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]

        # Concatenate groundtruth annotations to a tensor along the last
        # axis: boxes, then areas, then classes.
        groundtruth_data = tf.concat([boxes, areas, classes], axis=2)
        # NOTE: 'source_ids' is not emitted in this variant.
        labels['groundtruth_data'] = groundtruth_data
        labels['image_scales'] = image_scales
        return images, labels

    dataset = dataset.map(_process_example)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    # NOTE: the fake-data mode (looping forever over the first batch) is
    # disabled in this variant.
    return dataset
def __call__(self, params):
    """Assemble the SSD input pipeline and return it as a `tf.data.Dataset`.

    In training the batches are `(image, labels)` pairs; otherwise each batch
    is a single dict that carries the image under `constants.IMAGE`.

    Args:
        params: Run configuration mapping. Keys read here: 'batch_size',
            'dtype', 'visualize_dataloader', 'eval_samples', and, when
            training, 'num_shards' and 'shard_index'.

    Returns:
        The batched and prefetched dataset.
    """
    example_decoder = tf_example_decoder.TfExampleDecoder()

    def _mem_total_kib():
        """Return the MemTotal figure from /proc/meminfo (kB per proc(5))."""
        # `with` guarantees the file handle is closed; the original left it
        # open.
        with open('/proc/meminfo') as meminfo_file:
            for line in meminfo_file:
                fields = line.split()
                if fields and fields[0].rstrip(':') == 'MemTotal':
                    return int(fields[1])
        raise KeyError('MemTotal')

    def _parse_example(data):
        """Turn one decoded example into model inputs."""
        with tf.name_scope('augmentation'):
            source_id = data['source_id']
            image = data['image']  # dtype uint8
            raw_shape = tf.shape(image)
            boxes = data['groundtruth_boxes']
            classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

            # Only 80 of the 90 COCO classes are used.
            class_map = tf.convert_to_tensor(constants.CLASS_MAP)
            classes = tf.gather(class_map, classes)
            classes = tf.cast(classes, dtype=tf.float32)

            if self._is_training:
                image, boxes, classes = ssd_crop(image, boxes, classes)
                # ssd_crop resizes and returns a float32 image whose range
                # is still 0--255, so scale to [0, 1] only now; doing it
                # before the crop would incur an extra dtype cast (and
                # memory copy).
                image /= 255.0

                # random_horizontal_flip() is hard coded to flip with 50%
                # chance.
                image, boxes = preprocessor.random_horizontal_flip(
                    image=image, boxes=boxes)

                # TODO(shibow): Investigate the parameters for color jitter.
                image = color_jitter(image,
                                     brightness=0.125,
                                     contrast=0.5,
                                     saturation=0.5,
                                     hue=0.05)

                if params['dtype'] == 'bf16':
                    image = tf.cast(image, dtype=tf.bfloat16)

                (encoded_classes, encoded_boxes,
                 num_matched_boxes) = encode_labels(boxes, classes)

                # We transpose in dataloader instead of in the topology to
                # save time.
                encoded_classes, encoded_boxes = transpose_labels(
                    encoded_classes, encoded_boxes)

                encoded_classes = tf.cast(encoded_classes, tf.int32)

                labels = {
                    constants.NUM_MATCHED_BOXES: num_matched_boxes,
                    constants.BOXES: encoded_boxes,
                    constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
                }

                # This is for dataloader visualization; actual model
                # doesn't use this.
                if params['visualize_dataloader']:
                    box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                        scale_factors=constants.BOX_CODER_SCALES)
                    decoded_boxes = tf.expand_dims(
                        box_coder.decode(
                            rel_codes=tf.squeeze(encoded_boxes),
                            anchors=box_list.BoxList(
                                tf.convert_to_tensor(
                                    DefaultBoxes()('ltrb')))).get(),
                        axis=0)
                    labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

                return image, labels

            else:
                image = tf.image.resize_images(
                    image,
                    size=(constants.IMAGE_SIZE, constants.IMAGE_SIZE))
                # resize_images returns a float32 image without changing
                # its range. Divide by 255 to convert it to [0, 1].
                image /= 255.

                if params['dtype'] == 'bf16':
                    image = tf.cast(image, dtype=tf.bfloat16)

                def trim_and_pad(inp_tensor, dim_1):
                    """Limit the number of boxes, and pad if necessary."""
                    inp_tensor = inp_tensor[:constants.MAX_NUM_EVAL_BOXES]
                    num_pad = (constants.MAX_NUM_EVAL_BOXES -
                               tf.shape(inp_tensor)[0])
                    inp_tensor = tf.pad(inp_tensor,
                                        [[0, num_pad], [0, 0]])
                    return tf.reshape(
                        inp_tensor, [constants.MAX_NUM_EVAL_BOXES, dim_1])

                boxes = trim_and_pad(boxes, 4)
                classes = trim_and_pad(classes, 1)

                sample = {
                    constants.IMAGE: image,
                    constants.BOXES: boxes,
                    constants.CLASSES: classes,
                    constants.SOURCE_ID: tf.string_to_number(
                        source_id, tf.int32),
                    constants.RAW_SHAPE: raw_shape,
                }

                if (not self._is_training
                        and self._count > params['eval_samples']):
                    sample[constants.IS_PADDED] = data[constants.IS_PADDED]
                return sample

    batch_size = params['batch_size']
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)

    # Glob once and reuse the count for both the log line and the cache
    # size estimate below.
    num_files = len(glob.glob(self._file_pattern))
    tf.logging.info("Dataset file pattern '%s': found %d files.",
                    self._file_pattern, num_files)

    if self._is_training:
        dataset_num_shards = params['num_shards']
        dataset_shard_index = params['shard_index']
        dataset = dataset.shard(dataset_num_shards, dataset_shard_index)
        dataset = dataset.shuffle(
            tf.cast(256 / dataset_num_shards, tf.int64))

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(filename).prefetch(1)
        return dataset

    # Element order is only required to be deterministic outside training.
    options = tf.data.Options()
    options.experimental_deterministic = not self._is_training
    dataset = dataset.interleave(
        map_func=_prefetch_dataset,
        cycle_length=32,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE).with_options(
            options)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(example_decoder.decode, num_parallel_calls=64)

    def _mark_is_padded(data):
        sample = data
        sample[constants.IS_PADDED] = tf.constant(True, dtype=tf.bool)
        return sample

    def _mark_is_not_padded(data):
        sample = data
        sample[constants.IS_PADDED] = tf.constant(False, dtype=tf.bool)
        return sample

    # Pad dataset to the desired size and mark if the data is padded.
    # During eval/predict, if local_batch_size * num_shards > 5000, the
    # original dataset size won't fit the computation on that number of
    # shards. In that case a second copy of the data, marked `is_padded`,
    # is appended to the original data (marked not padded) and the result
    # is truncated to self._count.
    if not self._is_training and self._count > params['eval_samples']:
        padded_dataset = dataset.map(_mark_is_padded)
        dataset = dataset.map(_mark_is_not_padded)
        dataset = dataset.concatenate(padded_dataset).take(self._count)

    if self._is_training:
        dataset = dataset.map(
            # pylint: disable=g-long-lambda
            lambda data: (data,
                          tf.greater(
                              tf.shape(data['groundtruth_boxes'])[0], 0)),
            num_parallel_calls=64)
        dataset = dataset.filter(lambda data, pred: pred)
        # Prefetching and caching increases the memory usage, so disable
        # when using fake data. The memory probe is likewise only done when
        # its result is actually used.
        if not self._use_fake_data:
            mem_kib = _mem_total_kib()
            # Rough approximation: 1 GiB per tf-record file.
            caching_mem_kib = num_files * 1000000
            if caching_mem_kib > mem_kib:
                dataset = dataset.shuffle(64).repeat()
                tf.logging.info("Dataset cache OFF because MemTotal = %d KiB! It may decrease performance.", mem_kib)
            elif dataset_num_shards < 8:
                dataset = dataset.shuffle(64).repeat()
                tf.logging.info("Dataset cache OFF because it is not 8 node training! It may decrease performance.")
            else:
                dataset = dataset.cache().shuffle(64).repeat()

        dataset = dataset.map(lambda data, _: _parse_example(data),
                              num_parallel_calls=64)
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
    else:
        dataset = dataset.prefetch(batch_size * 64)
        dataset = dataset.map(_parse_example, num_parallel_calls=64)
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    options = tf.data.Options()
    options.experimental_threading.max_intra_op_parallelism = 1
    options.experimental_threading.private_threadpool_size = 48
    dataset = dataset.with_options(options)

    if self._use_fake_data:
        # Loop forever over the first batch only.
        dataset = dataset.take(1).cache().repeat()

    return dataset
def __call__(self, params):
  """Builds the batched detection input pipeline.

  Args:
    params: Mapping of hyper-parameters. Keys read here: 'min_level',
      'max_level', 'num_scales', 'aspect_ratios', 'anchor_scale',
      'image_size', 'num_classes', 'skip_crowd_during_training',
      'input_rand_hflip', 'train_scale_min', 'train_scale_max',
      'use_bfloat16' and 'batch_size'.

  Returns:
    A `tf.data.Dataset` whose elements are `(images, labels)`, where
    `labels` is a dict with keys 'mean_num_positives',
    'cls_targets_<level>', 'box_targets_<level>', 'source_ids',
    'groundtruth_data' and 'image_scales'.
  """
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations; it is handed to `example_decoder.decode`.

    Returns:
      image: Image tensor that is preprocessed to have normalized value and
        fixed dimension [image_size, image_size, 3]
      cls_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors]. The height_l and width_l
        represent the dimension of class logits at l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
      num_positives: Number of positive anchors in the image.
      source_id: Source image id. Default value -1 if the source id is empty
        in the groundtruth annotation.
      image_scale: Scale of the processed image to the original image.
      boxes: Groundtruth bounding box annotations. The box is represented in
        [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
        dimension [self._max_num_instances, 4].
      is_crowds: Groundtruth annotations to indicate if an annotation
        represents a group of instances by value {0, 1}. The tensor is
        padded with 0 to the fixed dimension [self._max_num_instances, 1].
      areas: Groundtruth areas annotations. The tensor is padded with -1
        to the fixed dimension [self._max_num_instances, 1].
      classes: Groundtruth classes annotations. The tensor is padded with -1
        to the fixed dimension [self._max_num_instances, 1].
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      # One reshape is enough: the original repeated this statement a second
      # time a few lines later, which only added a redundant graph op.
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']

      if params['skip_crowd_during_training'] and self._is_training:
        # NOTE(review): only `classes` and `boxes` are filtered here;
        # `areas` and `is_crowds` keep the crowd rows. Confirm this is
        # intended before relying on row alignment in 'groundtruth_data'.
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)

      input_processor = DetectionInputProcessor(
          image, params['image_size'], boxes, classes)
      input_processor.normalize_image()
      if self._is_training and params['input_rand_hflip']:
        input_processor.random_horizontal_flip()
      if self._is_training:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_output_size()
      image = input_processor.resize_and_crop_image()
      boxes, classes = input_processor.resize_and_crop_boxes()

      # Assign anchors.
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(boxes, classes)

      # An empty source id cannot be parsed as a number; map it to -1.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      # Pad groundtruth data for evaluation.
      image_scale = input_processor.image_scale_to_original
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, cls_targets, box_targets, num_positives, source_id,
              image_scale, boxes, is_crowds, areas, classes)

  batch_size = params['batch_size']

  # `tf.random.set_random_seed` returns None, so the original
  # `seed=tf.random.set_random_seed(...)` passed seed=None to `list_files`.
  # Compute the time-based seed once, keep installing it as the graph-level
  # seed (the original side effect), and pass it explicitly.
  shuffle_seed = int(time.time() * 1e9)
  tf.random.set_random_seed(shuffle_seed)
  dataset = tf.data.Dataset.list_files(self._file_pattern,
                                       shuffle=self._is_training,
                                       seed=shuffle_seed)
  if self._is_training:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(_prefetch_dataset,
                                          cycle_length=32,
                                          sloppy=self._is_training))
  if self._is_training:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, cls_targets, box_targets, num_positives,
                       source_ids, image_scales, boxes, is_crowds, areas,
                       classes):
    """Processes one batch of data into `(images, labels)`."""
    labels = {}
    # Count num_positives in a batch: the batch mean is tiled so that every
    # example carries the same value, shaped [batch_size, 1].
    num_positives_batch = tf.reduce_mean(num_positives)
    labels['mean_num_positives'] = tf.reshape(
        tf.tile(tf.expand_dims(num_positives_batch, 0), [
            batch_size,
        ]), [batch_size, 1])

    for level in range(params['min_level'], params['max_level'] + 1):
      labels['cls_targets_%d' % level] = cls_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_scales'] = image_scales
    return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset
def __call__(self, params):
  """Builds the batched instance-segmentation input pipeline.

  Args:
    params: Mapping of hyper-parameters. Keys read here include
      'dynamic_input_shapes', 'dynamic_image_size', 'image_size',
      'min_level', 'max_level', 'num_scales', 'aspect_ratios',
      'anchor_scale', 'num_classes', the 'rpn_*' labeler settings,
      'use_category', 'skip_crowd_during_training', 'input_rand_hflip',
      'train_scale_min', 'train_scale_max', 'short_side_image_size',
      'long_side_max_image_size', 'gt_mask_size', 'use_bfloat16',
      'transpose_input', 'global_batch_size' and, optionally, 'batch_size'
      (1 is used when it is absent).

  Returns:
    A `tf.data.Dataset`. In PREDICT mode each element is a `features` dict;
    with dynamic input shapes it is `(tf.shape(images), images, labels)`;
    otherwise it is `(images, labels)`.
  """
  image_size = params['dynamic_image_size'] if params[
      'dynamic_input_shapes'] else (params['image_size'],
                                    params['image_size'])
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'], image_size)
  anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                         params['num_classes'],
                                         params['rpn_positive_overlap'],
                                         params['rpn_negative_overlap'],
                                         params['rpn_batch_size_per_im'],
                                         params['rpn_fg_fraction'])

  if params['dynamic_input_shapes']:
    # A second labeler built on the reversed image size, selected in the
    # parser when the scaled height is not the short side.
    height_long_side_image_size = image_size[::-1]
    height_long_side_input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'],
        height_long_side_image_size)
    height_long_side_anchor_labeler = anchors.AnchorLabeler(
        height_long_side_input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

  example_decoder = tf_example_decoder.TfExampleDecoder(
      use_instance_mask=True)

  def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations; it is handed to `example_decoder.decode`.

    Returns:
      A 10-tuple:
      image: Preprocessed (normalized, resized and cropped) image tensor,
        cast to bfloat16 when params['use_bfloat16'] is set.
      score_targets: Per-level objectness targets from the anchor labeler,
        indexed by level in [min_level, ..., max_level].
      box_targets: Per-level box regression targets from the anchor labeler,
        indexed by level in [min_level, ..., max_level].
      source_id: Source image id as a number. Default value -1 if the source
        id is empty in the groundtruth annotation.
      image_info: Stacked vector [scaled_height, scaled_width, image_scale,
        original_height, original_width].
      boxes: Groundtruth bounding box annotations, multiplied by
        image_scale and padded with -1 to [self._max_num_instances, 4].
      is_crowds: Groundtruth crowd flags as float32, padded with 0 to
        [self._max_num_instances, 1].
      areas: Groundtruth areas annotations, padded with -1 to
        [self._max_num_instances, 1].
      classes: Groundtruth classes annotations, padded with -1 to
        [self._max_num_instances, 1].
      cropped_gt_masks: Groundtruth masks cropped by the boxes, padded with
        -1 and reshaped to [self._max_num_instances,
        params['gt_mask_size'] + 4, params['gt_mask_size'] + 4].
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      instance_masks = data['groundtruth_instance_masks']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      # One reshape is enough: the original repeated this statement a second
      # time a few lines later, which only added a redundant graph op.
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']
      if not params['use_category']:
        # Class-agnostic mode: collapse every positive class id to 1.0.
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

      if (params['skip_crowd_during_training'] and
          self._mode == tf.estimator.ModeKeys.TRAIN):
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)
        instance_masks = tf.gather_nd(instance_masks, indices)

      input_processor = InstanceSegmentationInputProcessor(
          image, image_size, params['short_side_image_size'],
          params['long_side_max_image_size'], boxes, classes,
          instance_masks)
      input_processor.normalize_image()
      if (self._mode == tf.estimator.ModeKeys.TRAIN and
          params['input_rand_hflip']):
        input_processor.random_horizontal_flip()
      if self._mode == tf.estimator.ModeKeys.TRAIN:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_mlperf_reference_size()
      image = input_processor.resize_and_crop_image()

      boxes, classes = input_processor.resize_and_crop_boxes()
      instance_masks = input_processor.resize_and_crop_masks()
      cropped_gt_masks = input_processor.crop_gt_masks(
          instance_masks, boxes, params['gt_mask_size'], image_size)

      # Assign anchors.
      if params['dynamic_input_shapes']:
        is_height_short_side = tf.less(
            input_processor._scaled_height,  # pylint: disable=protected-access
            input_processor._scaled_width)  # pylint: disable=protected-access
        score_targets, box_targets = tf.cond(
            is_height_short_side,
            lambda: anchor_labeler.label_anchors(boxes, classes),
            lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long
      else:
        score_targets, box_targets = anchor_labeler.label_anchors(
            boxes, classes)

      # An empty source id cannot be parsed as a number; map it to -1.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      image_scale = input_processor.image_scale_to_original
      scaled_height = input_processor.get_height_length()
      scaled_width = input_processor.get_width_length()
      image_info = tf.stack([
          tf.to_float(scaled_height),
          tf.to_float(scaled_width),
          image_scale,
          tf.to_float(input_processor.get_original_height),
          tf.to_float(input_processor.get_original_width),
      ])
      # Pad groundtruth data for evaluation.
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
      # Pads cropped_gt_masks.
      cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                    [self._max_num_instances, -1])
      cropped_gt_masks = pad_to_fixed_size(
          cropped_gt_masks, -1,
          [self._max_num_instances, (params['gt_mask_size'] + 4)**2])
      cropped_gt_masks = tf.reshape(cropped_gt_masks, [
          self._max_num_instances, params['gt_mask_size'] + 4,
          params['gt_mask_size'] + 4
      ])
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, score_targets, box_targets, source_id, image_info,
              boxes, is_crowds, areas, classes, cropped_gt_masks)

  # Fall back to a batch size of 1 when the caller did not provide one.
  batch_size = params['batch_size'] if 'batch_size' in params else 1
  dataset = tf.data.Dataset.list_files(
      self._file_pattern, shuffle=(self._mode == tf.estimator.ModeKeys.TRAIN))
  if self._mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _prefetch_dataset,
          cycle_length=32,
          sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))
  if self._mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

  if params['dynamic_input_shapes']:
    # Group examples by image height so that every batch has one shape.

    def key_func(image, *args):
      del args
      return tf.cast(tf.shape(image)[0], dtype=tf.int64)

    def reduce_func(unused_key, dataset):
      return dataset.batch(batch_size, drop_remainder=True)

    dataset = dataset.apply(
        tf.contrib.data.group_by_window(
            key_func=key_func,
            reduce_func=reduce_func,
            window_size=params['global_batch_size']))
  else:
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, score_targets, box_targets, source_ids,
                       image_info, boxes, is_crowds, areas, classes,
                       cropped_gt_masks):
    """Processes one batch of data."""
    # Transposes images from (N, H, W, C)->(H, W, N, C). As batch size is
    # less than 8, the batch goes to the second minor dimension.
    if (params['transpose_input'] and
        self._mode == tf.estimator.ModeKeys.TRAIN):
      images = tf.transpose(images, [1, 2, 0, 3])

    labels = {}
    for level in range(params['min_level'], params['max_level'] + 1):
      labels['score_targets_%d' % level] = score_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_info'] = image_info
    labels['cropped_gt_masks'] = cropped_gt_masks
    if self._mode == tf.estimator.ModeKeys.PREDICT:
      features = dict(images=images,
                      image_info=image_info,
                      groundtruth_data=groundtruth_data,
                      source_ids=source_ids)
      return features
    elif params['dynamic_input_shapes']:
      # For dynamic input shapes, we have 2 TPU programs. A tf.cond op is run
      # on the host side to decide which TPU program to launch. As we have
      # data prefetch in device side, the data for evaluating the shape needs
      # to be sent back from device to host. Thus we return the `images`
      # shape here explicitly to avoid copying the entire `images` back.
      return tf.shape(images), images, labels
    else:
      return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset
def __call__(self, params):
  """Builds the input pipeline and returns one batch of tensors.

  Unlike a dataset-returning reader, this pulls a batch from a one-shot
  iterator and returns the resulting tensors directly.

  Args:
    params: Mapping of hyper-parameters. Keys read here: 'min_level',
      'max_level', 'num_scales', 'aspect_ratios', 'anchor_scale',
      'image_size', 'num_classes', 'input_rand_hflip' and 'batch_size'.

  Returns:
    A tuple `(images, labels)` of tensors, where `labels` is a dict with
    keys 'mean_num_positives', 'cls_targets_<level>', 'box_targets_<level>',
    'source_ids' and 'image_scales'.
  """
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def get_dataset_for_mode(data_dir, is_training):
    """Return the location of input samples for a given mode."""
    if is_training:
      return '%s/coco_train2017_nocrowd-*' % data_dir
    return '%s/coco_val2017-*' % data_dir

  def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    Args:
      value: A serialized example handed to `example_decoder.decode`.

    Returns:
      A tuple `(image, cls_targets, box_targets, num_positives, source_id,
      image_scale)`.
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      # the image normalization is identical to Cloud TPU ResNet-50
      image = tf.image.convert_image_dtype(image, dtype=tf.float32)
      image = _normalize_image(image)

      # NOTE(review): the flip is not gated on self._is_training here, so it
      # also applies at eval time whenever 'input_rand_hflip' is set.
      if params['input_rand_hflip']:
        image, boxes = preprocessor.random_horizontal_flip(image,
                                                           boxes=boxes)
      image_original_shape = tf.shape(image)
      image, _ = preprocessor.resize_to_range(
          image,
          min_dimension=params['image_size'],
          max_dimension=params['image_size'])
      # Ratio of original height to resized height.
      image_scale = tf.to_float(image_original_shape[0]) / tf.to_float(
          tf.shape(image)[0])
      image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
          image, boxes, keypoints=None)

      image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'],
                                           params['image_size'])
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(boxes, classes)

      # `tf.string_to_number` fails on an empty string. Map a missing source
      # id to -1 first, as the other input readers in this file do.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id, out_type=tf.float32)
      row = (image, cls_targets, box_targets, num_positives, source_id,
             image_scale)
      return row

  batch_size = params['batch_size']

  data_file_pattern = get_dataset_for_mode(self._data_dir, self._is_training)
  dataset = tf.data.Dataset.list_files(data_file_pattern)

  dataset = dataset.shuffle(buffer_size=1024)
  if self._is_training:
    dataset = dataset.repeat()

  def prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(prefetch_dataset,
                                          cycle_length=32,
                                          sloppy=True))
  dataset = dataset.shuffle(20)

  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
  dataset = dataset.prefetch(batch_size)
  dataset = dataset.apply(
      tf.contrib.data.batch_and_drop_remainder(batch_size))
  dataset = dataset.prefetch(1)

  (images, cls_targets, box_targets, num_positives, source_ids,
   image_scales) = dataset.make_one_shot_iterator().get_next()
  labels = {}
  # count num_positives in a batch: the batch mean is tiled so that every
  # example carries the same value, shaped [batch_size, 1].
  num_positives_batch = tf.reduce_mean(num_positives)
  labels['mean_num_positives'] = tf.reshape(
      tf.tile(tf.expand_dims(num_positives_batch, 0), [
          batch_size,
      ]), [batch_size, 1])

  for level in range(params['min_level'], params['max_level'] + 1):
    labels['cls_targets_%d' % level] = cls_targets[level]
    labels['box_targets_%d' % level] = box_targets[level]
  labels['source_ids'] = source_ids
  labels['image_scales'] = image_scales
  return images, labels
def __call__(self, params, num_examples=0):
  """Builds the sharded, grouped instance-segmentation input pipeline.

  Args:
    params: Mapping of hyper-parameters. Keys read here include
      'image_size', 'min_level', 'max_level', 'num_scales', 'aspect_ratios',
      'anchor_scale', 'num_classes', the 'rpn_*' labeler settings,
      'short_side_image_size', 'long_side_max_image_size', 'use_bfloat16',
      'use_category', 'skip_crowd_during_training', 'input_rand_hflip',
      'gt_mask_size', 'shuffle_buffer_size', 'eval_samples',
      'replicas_per_worker', 'batch_size' and, when sharding,
      'dataset_num_shards', 'dataset_shard_id' and
      'hosts_per_dataset_shard'.
    num_examples: In TRAIN mode, a positive value caps the number of batches
      taken from the dataset. In PREDICT mode, a value above
      params['eval_samples'] turns on padding of the eval set. With
      distributed eval it also sizes the cached eval dataset.

  Returns:
    A `tf.data.Dataset` of batches after `self._transform_images` has been
    applied. The exact element structure depends on that method, which is
    not visible here.
  """
  image_size = params['image_size']
  input_anchors = anchors.Anchors(
      params['min_level'], params['max_level'], params['num_scales'],
      params['aspect_ratios'], params['anchor_scale'], image_size)
  anchor_labeler = anchors.AnchorLabeler(
      input_anchors, params['num_classes'], params['rpn_positive_overlap'],
      params['rpn_negative_overlap'], params['rpn_batch_size_per_im'],
      params['rpn_fg_fraction'])

  # A second labeler built on the reversed image size; the parser picks
  # between the two labelers per example with a tf.cond.
  height_long_side_image_size = image_size[::-1]
  height_long_side_input_anchors = anchors.Anchors(
      params['min_level'], params['max_level'], params['num_scales'],
      params['aspect_ratios'], params['anchor_scale'],
      height_long_side_image_size)
  height_long_side_anchor_labeler = anchors.AnchorLabeler(
      height_long_side_input_anchors, params['num_classes'],
      params['rpn_positive_overlap'], params['rpn_negative_overlap'],
      params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

  example_decoder = tf_example_decoder.TfExampleDecoder(
      use_instance_mask=True)

  def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    Args:
      value: A dictionary contains an image and groundtruth annotations.

    Returns:
      features: A dictionary that contains the image and auxiliary
        information. The following describes {key: value} pairs in the
        dictionary.
        image: An image tensor that is preprocessed to have normalized value
          and fixed dimension [image_size, image_size, 3]
        image_info: Image information that includes the original height and
          width, the scale of the processed image to the original image, and
          the scaled height and width.
        source_ids: Source image id. Default value -1 if the source id is
          empty in the groundtruth annotation.
      labels: (only for training) A dictionary that contains groundtruth
        labels. The following describes {key: value} pairs in the dictionary.
        score_targets_dict: An ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of objectiveness score at l-th level.
        box_targets_dict: An ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
           in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
           fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        cropped_gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by params['gt_mask_size']
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)

      image = data['image']
      source_id = data['source_id']
      # An empty source id cannot be parsed as a number; map it to -1.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      if self._mode == tf.estimator.ModeKeys.PREDICT:
        # Prediction only needs the image and its metadata: no boxes,
        # classes or masks are passed to the processor.
        input_processor = InstanceSegmentationInputProcessor(
            image, image_size, params['short_side_image_size'],
            params['long_side_max_image_size'])
        input_processor.normalize_image()
        input_processor.set_scale_factors_to_mlperf_reference_size()
        image = input_processor.resize_and_crop_image()
        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)

        image_info = input_processor.get_image_info()
        return {'images': image, 'image_info': image_info,
                'source_ids': source_id}

      # The following part is for training.
      instance_masks = data['groundtruth_instance_masks']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      if not params['use_category']:
        # Class-agnostic mode: collapse every positive class id to 1.0.
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

      if (params['skip_crowd_during_training'] and
          self._mode == tf.estimator.ModeKeys.TRAIN):
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)
        instance_masks = tf.gather_nd(instance_masks, indices)

      input_processor = InstanceSegmentationInputProcessor(
          image, image_size, params['short_side_image_size'],
          params['long_side_max_image_size'], boxes, classes,
          instance_masks)
      input_processor.normalize_image()
      if params['input_rand_hflip']:
        input_processor.random_horizontal_flip()

      input_processor.set_scale_factors_to_mlperf_reference_size()
      image = input_processor.resize_and_crop_image()

      boxes, classes = input_processor.resize_and_crop_boxes()
      cropped_gt_masks = input_processor.crop_gt_masks(
          params['gt_mask_size'])

      image_info = input_processor.get_image_info()
      # Assign anchors.
      # NOTE(review): image_info[3] / image_info[4] are presumably two image
      # dimensions (height-like and width-like); confirm against
      # `get_image_info`.
      is_height_short_side = tf.less(image_info[3], image_info[4])
      score_targets, box_targets = tf.cond(
          is_height_short_side,
          lambda: anchor_labeler.label_anchors(boxes, classes),
          lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long

      # Pad groundtruth data.
      # NOTE(review): image_info[2] is presumably the image scale; confirm
      # against `get_image_info`.
      boxes *= image_info[2]
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])
      # Pads cropped_gt_masks: flatten each mask, pad the instance axis with
      # -1, then restore the square mask shape.
      cropped_gt_masks = tf.reshape(
          cropped_gt_masks, [-1, (params['gt_mask_size'] + 4) ** 2])
      cropped_gt_masks = pad_to_fixed_size(
          cropped_gt_masks, -1,
          [self._max_num_instances, (params['gt_mask_size'] + 4) ** 2])
      cropped_gt_masks = tf.reshape(
          cropped_gt_masks,
          [self._max_num_instances, params['gt_mask_size'] + 4,
           params['gt_mask_size'] + 4])

      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      features = {}
      features['images'] = image
      features['image_info'] = image_info
      features['source_ids'] = source_id
      labels = {}
      for level in range(params['min_level'], params['max_level'] + 1):
        labels['score_targets_%d' % level] = score_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]
      labels['gt_boxes'] = boxes
      labels['gt_classes'] = classes
      labels['cropped_gt_masks'] = cropped_gt_masks
      return features, labels

  # Fall back to a batch size of 1 when the caller did not provide one.
  batch_size = params['batch_size'] if 'batch_size' in params else 1
  dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
  if self._mode == tf.estimator.ModeKeys.TRAIN:
    # shard and shuffle the image files so each shard has distinctive and
    # random set of images.
    # To improve model convergence under large number of hosts, multiple hosts
    # may share a same dataset shard. This allows a host to get more training
    # images.
    if 'dataset_num_shards' in params:
      train_actual_num_shards = int(params['dataset_num_shards'] //
                                    params['hosts_per_dataset_shard'])
      dataset = dataset.shard(
          train_actual_num_shards,
          int(params['dataset_shard_id'] //
              params['hosts_per_dataset_shard']))
      # The file-level shuffle buffer shrinks as the shard count grows.
      dataset = dataset.shuffle(tf.to_int64(256 // train_actual_num_shards))

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.data.experimental.parallel_interleave(
          _prefetch_dataset,
          cycle_length=32,
          sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))

  if self._mode == tf.estimator.ModeKeys.TRAIN:
    # Cache the raw images and shuffle them with a reasonably large buffer.
    dataset = dataset.cache().shuffle(params['shuffle_buffer_size']).repeat()

  if self._distributed_eval:
    # Record-level sharding for distributed eval.
    dataset = dataset.shard(params['dataset_num_shards'],
                            params['dataset_shard_id'])

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

  def horizontal_image(*args):
    # True when image_info[3] < image_info[4] for the element's features.
    image_info = args[0]['image_info']
    return tf.less(image_info[3], image_info[4])

  def vertical_image(*args):
    return tf.logical_not(horizontal_image(*args))

  # Pad dataset to the desired size and mark if the dataset is padding.
  # During PREDICT, if batch_size_per_shard * num_shards > 5000, the
  # original dataset size won't be evenly divisible by the number of shards.
  # Note that 5000 is the number of eval samples in COCO dataset. In this
  # case, the eval dataset will take (batch_per_shard * num_shards - 5000)
  # samples from the original dataset and mark those extra samples as
  # `is_padding` and the original data as `is_not_padding`. This ensures
  # correctness of evaluation on only 5000 samples.
  # Appends the dataset padding to the original dataset (only in PREDICT).
  if (self._mode == tf.estimator.ModeKeys.PREDICT and
      num_examples > params['eval_samples']):
    def _mark_is_padding(features):
      features[mask_rcnn_params.IS_PADDING] = tf.constant(
          True, dtype=tf.bool, shape=[1])
      return features

    def _mark_is_not_padding(features):
      features[mask_rcnn_params.IS_PADDING] = tf.constant(
          False, dtype=tf.bool, shape=[1])
      return features

    dataset_padding = dataset
    # pad equal number of horizontal and vertical images and interleave them.
    pad_size = int(math.ceil(num_examples - params['eval_samples']))
    dataset_padding_hor = dataset_padding.filter(horizontal_image).map(
        _mark_is_padding).take(pad_size)
    dataset_padding_ver = dataset_padding.filter(vertical_image).map(
        _mark_is_padding).take(pad_size)
    # The choice dataset is 0, 1, 0, 1, ... so the two paddings alternate.
    interleaved_dataset_padding = tf.data.experimental.choose_from_datasets(
        [dataset_padding_hor, dataset_padding_ver],
        tf.data.Dataset.range(2).repeat(pad_size))
    if self._distributed_eval:
      dataset = dataset.map(_mark_is_not_padding).take(
          int(
              math.ceil(params['eval_samples'] /
                        params['dataset_num_shards'])))
    else:
      dataset = dataset.map(_mark_is_not_padding).take(params['eval_samples'])
    dataset = dataset.concatenate(interleaved_dataset_padding)

  # Group by orientation so that each batch holds images of one orientation.
  def key_func(*args):
    return tf.cast(horizontal_image(*args), dtype=tf.int64)

  def reduce_func(unused_key, dataset):
    return dataset.batch(batch_size, drop_remainder=True)

  # NOTE(review): window_size indexes params['batch_size'] directly, while
  # `batch_size` above falls back to 1 when the key is absent; confirm
  # 'batch_size' is always present.
  dataset = dataset.apply(
      tf.data.experimental.group_by_window(
          key_func=key_func,
          reduce_func=reduce_func,
          window_size=(params['batch_size'] * params['replicas_per_worker'])))

  dataset = dataset.map(
      functools.partial(self._transform_images, params),
      num_parallel_calls=16)

  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  # At this point elements are batches, so this caps the number of batches.
  if (self._mode == tf.estimator.ModeKeys.TRAIN and
      num_examples > 0):
    dataset = dataset.take(num_examples)
  # Make eval dataset repeat to get rid of eval dataset init per epoch.
  if self._distributed_eval:
    dataset = dataset.take(
        int(num_examples / params['dataset_num_shards'] /
            params['batch_size'])).cache().repeat()
  if self._use_fake_data:
    # Turn this dataset into a semi-fake dataset which always loop at the
    # first batch. This reduces variance in performance and is useful in
    # testing.
    dataset = dataset.take(1).cache().repeat()

  options = tf.data.Options()
  options.experimental_threading.max_intra_op_parallelism = 1
  dataset = dataset.with_options(options)

  return dataset