def __call__(self, params):
    """Build the input pipeline and return one batch of tensors.

    Args:
        params: dict that must provide `"batch_size"`.

    Returns:
        A `(features, labels)` tuple pulled from a one-shot iterator. The
        static shapes are pinned to `[batch_size, 224, 224, 3]` and
        `[batch_size]`, and `labels` is cast to `tf.int32`.
    """
    batch_size = params["batch_size"]
    split_name = "train" if self._is_training else "validation"

    def _open_record_file(path):
        # 32 * 1000 * 1000 bytes of read buffer for every opened file.
        return tf.data.TFRecordDataset(path, buffer_size=32 * 1000 * 1000)

    # presumably `get_split` yields a dataset of file names -- TODO confirm.
    files = get_split(split_name, dataset_dir=self._data_dir)
    if self._is_training:
        files = files.shuffle(buffer_size=1024)
    files = files.repeat()

    read_in_parallel = contrib_data.parallel_interleave(
        _open_record_file, sloppy=True, cycle_length=64)
    batches = (
        files.apply(read_in_parallel)
        .prefetch(batch_size * 4)
        .map(self._parse_record, num_parallel_calls=32)
        .batch(batch_size, drop_remainder=True)
        .prefetch(4)
    )

    features, labels = batches.make_one_shot_iterator().get_next()
    labels = tf.cast(labels, tf.int32)
    # `drop_remainder=True` makes the batch dimension static, so pin it.
    features.set_shape([batch_size, 224, 224, 3])
    labels.set_shape([batch_size])
    return features, labels
def parallel_dataset(dataset_fn, num_shards, seed):
    """
    Builds the dataset to pull in parallel by using parallel interleaved
    datasets.

    Parameters
    ----------
    dataset_fn: a function which creates a dataset with the given seed.
    num_shards: the number of shards for each dataset. ``None`` or ``1``
        disables sharding and simply returns ``dataset_fn(seed)``.
    seed: the seed to use. Shard ``i`` is built with ``seed + i``.

    Returns
    -------
    dataset: a dataset pulling in parallel from the given number of shards.
    """
    if num_shards is None or num_shards == 1:
        return dataset_fn(seed)

    # Imported only on the sharded path so that the single-shard fast path
    # above keeps working on installations without `tensorflow.contrib`.
    from tensorflow.contrib.data import parallel_interleave

    print('parallel dataset seems ok...')
    print(num_shards)

    # Endless, shuffled stream of shard offsets 0..num_shards-1.
    seed_offsets = (
        tf.data.Dataset.range(num_shards).repeat().shuffle(num_shards))

    return seed_offsets.apply(
        parallel_interleave(
            lambda offset: dataset_fn(seed + offset),
            cycle_length=num_shards))
def _get_dataset_from_path(self):
    """Return a shuffled, endlessly repeating dataset of raw TFRecord entries.

    Files matching `self._train_data_path` are listed, shuffled and repeated
    (buffer of 1000), then read 20 at a time with non-deterministic (sloppy)
    interleaving.
    """
    shuffle_forever = contrib_data.shuffle_and_repeat(buffer_size=1000)
    read_records = contrib_data.parallel_interleave(
        tf.data.TFRecordDataset, cycle_length=20, sloppy=True)

    file_names = tf.data.Dataset.list_files(self._train_data_path)
    return file_names.apply(shuffle_forever).apply(read_records)
def input_fn(params):
    """The actual input function.

    Reads `params["batch_size"]`; everything else (`input_file`,
    `is_training`, `drop_remainder`, `name_to_features`) comes from the
    enclosing scope.
    """
    batch_size = params["batch_size"]

    def _decode(record):
        return _decode_record(record, name_to_features)

    open_file = functools.partial(
        tf.data.TFRecordDataset, compression_type=FLAGS.compression_type)

    # Files are listed in a fixed order; only the interleave is sloppy, and
    # only while training.
    records = tf.data.Dataset.list_files(input_file, shuffle=False).apply(
        contrib_data.parallel_interleave(
            open_file, cycle_length=32, sloppy=is_training))

    # Repetition and record-level shuffling are training-only.
    if is_training:
        records = records.repeat().shuffle(buffer_size=100)

    return records.apply(
        contrib_data.map_and_batch(
            _decode,
            batch_size=batch_size,
            drop_remainder=drop_remainder))
def make_source_dataset(self, index, num_hosts):
    """See base class."""
    if not self.data_dir:
        tf.logging.info('Undefined data_dir implies null input')
        return tf.data.Dataset.range(1).repeat().map(self._get_null_input)

    split_glob = 'train-*' if self.is_training else 'validation-*'
    file_pattern = os.path.join(self.data_dir, split_glob)

    # The file list is NOT shuffled: for multi-host training each host must
    # always process the same subset of files, which also lets a host cache
    # its (smaller) share of the data in memory.
    files = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    files = files.shard(num_hosts, index)

    if self.is_training and not self.cache:
        files = files.repeat()

    def _read_file(path):
        # 8 MiB read buffer per file.
        return tf.data.TFRecordDataset(path, buffer_size=8 * 1024 * 1024)

    # Read the data from disk in parallel.
    records = files.apply(
        contrib_data.parallel_interleave(
            _read_file, cycle_length=64, sloppy=True))

    if not self.cache:
        return records.shuffle(1024)
    # Cached data is repeated after the cache so each epoch replays it.
    return records.cache().apply(contrib_data.shuffle_and_repeat(1024 * 16))
def input_fn():
    """Supplies input to our model.

    This function supplies input to our model, where this input is a
    function of the mode. For example, we supply different data if we're
    performing training versus evaluation.

    Returns:
      A tuple consisting of 1) a dictionary of tensors whose keys are
      the feature names, and 2) a tensor of target labels if the mode
      is not INFER (and None, otherwise).
    """
    import re  # Local import: only needed for the version check below.

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    num_epochs = None if is_training else 1

    # Compare the TensorFlow version numerically. A plain string comparison
    # orders '1.9.0' after '1.12.0' and would pick the wrong code path for
    # every 1.2 - 1.9 release.
    version_match = re.match(r'(\d+)\.(\d+)', tf.__version__)
    if version_match:
        major_minor = tuple(int(part) for part in version_match.groups())
        use_legacy_batching = major_minor < (1, 12)
    else:
        # Unparseable version string: keep the historical behaviour.
        use_legacy_batching = tf.__version__ < '1.12.0'

    with tf.name_scope('read_batch'):
        file_names = input_files
        files = tf.data.Dataset.list_files(file_names)
        if shuffle:
            files = files.shuffle(buffer_size=len(file_names))
        dataset = (files.apply(
            contrib_data.parallel_interleave(
                tf.data.TFRecordDataset,
                cycle_length=10)).repeat(num_epochs))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=100)

        parse_fn = _make_parsing_fn(mode, label_name, include_age,
                                    categorical_context_features,
                                    sequence_features, time_crossed_features)
        feature_engineering_fn = _make_feature_engineering_fn(
            dedup, time_windows, include_age, sequence_features,
            time_crossed_features)

        if use_legacy_batching:
            # Older releases: parse single examples, then batch with the
            # queue-based `tf.train.batch`.
            dataset = dataset.map(parse_fn, num_parallel_calls=8)
            feature_map = (dataset.prefetch(buffer_size=batch_size).
                           make_one_shot_iterator().get_next())
            # Batch with padding.
            feature_map = tf.train.batch(
                feature_map,
                batch_size,
                num_threads=8,
                capacity=2,
                enqueue_many=False,
                dynamic_pad=True)
            feature_map = feature_engineering_fn(feature_map)
        else:
            feature_map = (
                dataset.batch(batch_size)
                # Parallelize the input processing and put it behind a
                # queue to increase performance by removing it from the
                # critical path of per-step-computation.
                .map(parse_fn, num_parallel_calls=8)
                .map(feature_engineering_fn, num_parallel_calls=8)
                .prefetch(buffer_size=1)
                .make_one_shot_iterator()
                .get_next())

        label = None
        if mode != tf.estimator.ModeKeys.PREDICT:
            label = feature_map.pop(CONTEXT_KEY_PREFIX + label_name)
        return feature_map, label
def make_source_dataset(self, index=0, num_hosts=1):
    """See base class."""
    if not self.data_dir:
        tf.logging.info("Undefined data_dir implies null input")
        return tf.data.Dataset.range(1).repeat().map(self._get_null_input)

    filenames, _ = get_filenames_func()(self.dataset_split)
    files = tf.data.Dataset.from_tensor_slices(filenames)

    # Without caching, training shuffles the file list and loops forever.
    if self.is_training and not self.cache:
        if filenames is not None:
            files = files.shuffle(len(filenames))
        files = files.repeat()

    def _read_file(path):
        # 8 * 1024 * 1024 bytes of read buffer per file.
        return tf.data.TFRecordDataset(path, buffer_size=8 * 1024 * 1024)

    # Read the data from disk in parallel: 64 files with sloppy ordering
    # when training, one file at a time in deterministic order otherwise.
    if self.is_training:
        reader_count, sloppy = 64, True
    else:
        reader_count, sloppy = 1, False
    records = files.apply(
        contrib_data.parallel_interleave(
            _read_file, cycle_length=reader_count, sloppy=sloppy))

    shuffle_size = 1024
    if self.cache:
        return records.cache().apply(
            contrib_data.shuffle_and_repeat(shuffle_size))
    if self.is_training:
        return records.shuffle(shuffle_size)
    return records
def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    def _int64_feature(length):
        return tf.FixedLenFeature([length], tf.int64)

    name_to_features = {
        "input_ids": _int64_feature(max_seq_length),
        "input_mask": _int64_feature(max_seq_length),
        "segment_ids": _int64_feature(max_seq_length),
        "masked_lm_positions": _int64_feature(max_predictions_per_seq),
        "masked_lm_ids": _int64_feature(max_predictions_per_seq),
        "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels": _int64_feature(1),
    }

    def _decode(record):
        return _decode_record(record, name_to_features)

    if not is_training:
        # Eval: no shuffling, plain sequential read. Evaluation runs for a
        # fixed number of steps, so repeat to avoid out-of-range errors.
        d = tf.data.TFRecordDataset(input_files).repeat()
    else:
        # Training: lots of parallel reading and shuffling.
        file_names = tf.data.Dataset.from_tensor_slices(
            tf.constant(input_files))
        file_names = file_names.repeat().shuffle(buffer_size=len(input_files))

        # `cycle_length` is the number of files read in parallel.
        cycle_length = min(num_cpu_threads, len(input_files))

        # `sloppy` interleaving is not exact, which adds even more
        # randomness to the training pipeline.
        d = file_names.apply(
            contrib_data.parallel_interleave(
                tf.data.TFRecordDataset,
                sloppy=is_training,
                cycle_length=cycle_length))
        d = d.shuffle(buffer_size=100)

    # The remainder is always dropped here (train and eval alike), so every
    # batch has a fixed size as required on TPU.
    return d.apply(
        tf.data.experimental.map_and_batch(
            _decode,
            batch_size=batch_size,
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True))
def input_fn(self, params):
    """Input function which provides a single batch for train or eval.

    Args:
      params: `dict` of parameters passed from the `TPUEstimator`.
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

    Returns:
      A (images, labels) tuple of `Tensor`s for a batch of samples.
    """
    batch_size = params['batch_size']

    if FLAGS.use_data != 'real':
        # Synthetic input: uniform noise images and random integer labels.
        images = tf.random_uniform(
            [batch_size, FLAGS.height, FLAGS.width, 3], minval=-1, maxval=1)
        labels = tf.random_uniform(
            [batch_size], minval=0, maxval=999, dtype=tf.int32)
    else:
        split_glob = 'train-*' if self.is_training else 'validation-*'
        files = tf.data.Dataset.list_files(
            os.path.join(self.data_dir, split_glob),
            shuffle=self.is_training)
        if self.is_training:
            files = files.repeat()

        def _open_file(path):
            return tf.data.TFRecordDataset(
                path, buffer_size=FLAGS.prefetch_dataset_buffer_size)

        records = files.apply(
            contrib_data.parallel_interleave(
                _open_file,
                cycle_length=FLAGS.num_files_infeed,
                sloppy=True))

        if FLAGS.followup_shuffle_buffer_size > 0:
            records = records.shuffle(
                buffer_size=FLAGS.followup_shuffle_buffer_size)

        batches = (
            records.map(self.dataset_parser,
                        num_parallel_calls=FLAGS.num_parallel_calls)
            .prefetch(batch_size)
            .batch(batch_size, drop_remainder=True)
            .prefetch(2)  # Prefetch overlaps in-feed with training
        )
        images, labels = batches.make_one_shot_iterator().get_next()

    images = tensor_transform_fn(images, params['pipeline_transpose_dims'])
    return images, labels
def input_clouds_fn(filelist,
                    batch_size=32,
                    copy_size=4,
                    prefetch=1,
                    read_threads=4,
                    distribute=(1, 0)):
    """Build a shuffled, repeating, batched dataset of float32 patches.

    Args:
        filelist: file pattern handed to `tf.data.Dataset.list_files`.
        batch_size: number of patches per batch; must be an exact multiple
            of `copy_size`.
        copy_size: divisor `batch_size` is validated against; it is not
            used for anything else in this function.
        prefetch: how many batches are prepared asynchronously ahead of
            the consumer.
        read_threads: number of files read in parallel (`cycle_length`).
        distribute: `(num_shards, index)` pair forwarded to
            `Dataset.shard`.

    Returns:
        A `tf.data.Dataset` of batched patches.

    Raises:
        ValueError: if `batch_size` is not divisible by `copy_size`
            (including `copy_size == 0` or non-numeric arguments).
    """
    # Check the batch/copy ratio up front. The previous try/except around
    # the modulo only fired for a zero divisor, so a non-divisible batch
    # size slipped through despite the error message promising otherwise.
    try:
        remainder = batch_size % copy_size
    except (ZeroDivisionError, TypeError) as err:
        raise ValueError(
            "\n Division of batch size and copy size is not Integer \n"
        ) from err
    if remainder != 0:
        raise ValueError(
            "\n Division of batch size and copy size is not Integer \n")
    print("\n Number of actual original images == {} ".format(
        int(batch_size)))

    def parser(ser):
        """Decode one serialized example into a float32 patch.

        Caution: patches are stored in the TFRecord as float64 raw bytes.
        """
        features = {
            "shape": tf.FixedLenFeature([3], tf.int64),
            "patch": tf.FixedLenFeature([], tf.string),
            "filename": tf.FixedLenFeature([], tf.string),
            "coordinate": tf.FixedLenFeature([2], tf.int64),
        }
        decoded = tf.parse_single_example(ser, features)
        patch = tf.reshape(
            tf.decode_raw(decoded["patch"], tf.float64), decoded["shape"])
        print("shape check in pipeline {}".format(patch.shape), flush=True)
        # Downcast once the raw float64 buffer has been reshaped.
        patch = tf.cast(patch, tf.float32)
        return patch

    dataset = (tf.data.Dataset.list_files(
        filelist, shuffle=True).shard(*distribute).apply(
            parallel_interleave(
                lambda f: tf.data.TFRecordDataset(f).map(parser),
                cycle_length=read_threads,
                sloppy=True,
            )))
    dataset = dataset.shuffle(1000).repeat().batch(
        int(batch_size)).prefetch(prefetch)
    return dataset
def read_dataset(file_read_func, input_files, config, filename_shard_fn=None):
    """Reads a dataset, and handles repetition and shuffling.

    Args:
      file_read_func: Function to use in tf_data.parallel_interleave, to
        read every individual file into a tf.data.Dataset.
      input_files: File path pattern(s) passed to `tf.gfile.Glob`.
      config: A input_reader_builder.InputReader object.
      filename_shard_fn: optional, a function used to shard filenames
        across replicas. It takes a TF dataset of filenames and is expected
        to return its sharded version. Useful when the dataset is loaded on
        one of possibly many replicas and the files should be split evenly
        between them.

    Returns:
      A tf.data.Dataset of (undecoded) tf-records based on config.

    Raises:
      RuntimeError: If no files are found at the supplied path(s).
    """
    filenames = tf.gfile.Glob(input_files)
    if not filenames:
        raise RuntimeError(
            'Did not find any input files matching the glob pattern {}'
            .format(input_files))

    # Never use more readers than there are files to read.
    file_count = len(filenames)
    requested_readers = config.num_readers
    num_readers = requested_readers
    if requested_readers > file_count:
        num_readers = file_count
        tf.logging.warning(
            'num_readers has been reduced to %d to match input file shards.'
            % num_readers)

    names = tf.data.Dataset.from_tensor_slices(filenames)
    if config.shuffle:
        names = names.shuffle(config.filenames_shuffle_buffer_size)
    elif num_readers > 1:
        tf.logging.warning(
            '`shuffle` is false, but the input data stream is '
            'still slightly shuffled since `num_readers` > 1.')

    if filename_shard_fn:
        names = filename_shard_fn(names)
    # A falsy `num_epochs` means "repeat forever".
    names = names.repeat(config.num_epochs or None)

    interleave = tf_data.parallel_interleave(
        file_read_func,
        cycle_length=num_readers,
        block_length=config.read_block_length,
        sloppy=config.shuffle)
    records = names.apply(interleave)
    if not config.shuffle:
        return records
    return records.shuffle(config.shuffle_buffer_size)
def input_fn(params, sequence_schema, context_schema, part_files):
    """Build a batched dataset of parsed examples from GZIP TFRecord parts.

    Args:
        params: dict providing `'cycle_length'`, `'buffer_size'`,
            `'epochs'` and `'batch_size'`.
        sequence_schema: forwarded to `parse_example` (second argument).
        context_schema: forwarded to `parse_example` (first argument).
        part_files: sequence of record file paths.

    Returns:
        The batched dataset.
    """
    def _open_gzip_part(path):
        return TFRecordDataset(path, compression_type='GZIP')

    parse_one = partial(parse_example, context_schema, sequence_schema)

    parts = Dataset.from_tensor_slices(part_files).shuffle(len(part_files))
    records = parts.apply(
        parallel_interleave(
            _open_gzip_part,
            cycle_length=params['cycle_length'],
            sloppy=True))
    examples = records.map(parse_one, num_parallel_calls=cpu_count())
    examples = examples.apply(
        shuffle_and_repeat(params['buffer_size'], count=params['epochs']))
    return examples.batch(params['batch_size'])
def __call__(self, params):
    """Input function which provides a single batch for train or eval."""
    if self.data_dir is None:
        tf.logging.info('Using fake input.')
        return self._input_fn_null(params)

    # Batch size for the current shard. The number of shards is computed
    # according to the input pipeline deployment. See
    # `tf.contrib.tpu.RunConfig` for details.
    batch_size = params["batch_size"]

    split_glob = "train-*" if self.is_training else "validation-*"
    files = tf.data.Dataset.list_files(
        os.path.join(self.data_dir, split_glob), shuffle=False)

    # Training shuffles the file names (buffer sized for 1024 files) and
    # then loops over them forever.
    if self.is_training:
        files = files.shuffle(buffer_size=1024).repeat()

    def _open_file(path):
        return tf.data.TFRecordDataset(
            path, buffer_size=FLAGS.prefetch_dataset_buffer_size)

    records = files.apply(
        contrib_data.parallel_interleave(
            _open_file,
            cycle_length=FLAGS.num_files_infeed,
            sloppy=True))

    return (
        records.shuffle(FLAGS.shuffle_buffer_size)
        .map(self.dataset_parser, num_parallel_calls=128)
        .prefetch(batch_size)
        .batch(batch_size, drop_remainder=True)
        .prefetch(2)  # Prefetch overlaps in-feed with training
    )
def input_fn(params):
    """Build the random-walk sample pipeline and prefetch it to '/gpu:0'.

    Reads `num_edges`, `window_size`, `batch_size`, `seed` and
    `num_negative` from `params`; the graph data (`adjacency_list`,
    `packed_labels`, `num_vertex`, `dataset_shards`) comes from the
    enclosing scope.
    """
    def _dataset_fn(s):
        # One random-walk source seeded with `s`, prefetching two batches.
        walk_length = int(params.num_edges / params.window_size)
        neighbours = _constant_hidden_value(
            adjacency_list.neighbours, 'create_neighbours')
        lengths = _constant_hidden_value(
            adjacency_list.lengths, 'create_lengths')
        offsets = _constant_hidden_value(
            adjacency_list.offsets, 'create_offsets')
        walks = RandomWalkDataset(
            walk_length, neighbours, lengths, offsets, seed=s)
        return walks.prefetch(params.batch_size * 2)

    if dataset_shards is not None:
        from tensorflow.contrib.data import parallel_interleave

        # Each shard index is passed to `_dataset_fn` as its seed.
        dataset = tf.data.Dataset.range(dataset_shards).apply(
            parallel_interleave(
                _dataset_fn, cycle_length=dataset_shards, sloppy=True))
    else:
        dataset = _dataset_fn(params.seed)

    window = adapters.adapt_random_walk_window(params.window_size)
    add_negative_samples = (
        relational_erm.sampling.negative_sampling.add_negative_sample(
            num_vertex, params.num_negative, seed=params.seed))

    stages = [
        window,
        add_negative_samples,
        adapters.relabel_subgraph(),
        adapters.append_packed_vertex_labels(
            packed_labels.labels,
            packed_labels.lengths,
            packed_labels.offsets),
        adapters.add_sample_size_info(),
        adapters.format_features_labels(),
    ]
    processing_fn = adapters.compose(*stages)

    samples = dataset.map(processing_fn, num_parallel_calls=12)
    samples = samples.prefetch(params.batch_size * 2)
    batches = samples.apply(adapters.padded_batch_samples(params.batch_size))
    return batches.apply(tf.contrib.data.prefetch_to_device('/gpu:0'))
def load_tfrecord(serialized_data,
                  shape,
                  batch_size=1,
                  read_threads=4,
                  shuffle_buffer_size=1000,
                  prefetch=1,
                  distribute=(1, 0)):
    """Load randomly cropped patches from TFRecord files.

    Args:
        serialized_data: file pattern handed to `Dataset.list_files`.
        shape: crop size passed to `tf.random_crop`.
        batch_size: examples per batch; incomplete batches are dropped.
        read_threads: number of files read in parallel.
        shuffle_buffer_size: accepted but not used by this function.
        prefetch: number of batches to prefetch.
        distribute: `(num_shards, index)` forwarded to `Dataset.shard`.

    Returns:
        A dataset of batched `(filename, coordinate, patch)` tuples.
    """
    def parser(record):
        schema = {
            "shape": tf.FixedLenFeature([3], tf.int64),
            "patch": tf.FixedLenFeature([], tf.string),
            "filename": tf.FixedLenFeature([], tf.string),
            "coordinate": tf.FixedLenFeature([2], tf.int64),
        }
        decoded = tf.parse_single_example(record, schema)
        # The patch bytes are decoded as float64 and stay float64.
        raw_patch = tf.decode_raw(decoded["patch"], tf.float64)
        full_patch = tf.reshape(raw_patch, decoded["shape"])
        # Randomly crop a mini-patch of the requested shape.
        patch = tf.random_crop(full_patch, shape)
        print(decoded["filename"], decoded["coordinate"], patch)
        return decoded["filename"], decoded["coordinate"], patch

    def _parsed_file(path):
        return tf.data.TFRecordDataset(path).map(parser)

    # Shuffled file list -> this worker's shard -> sloppy parallel read.
    files = tf.data.Dataset.list_files(serialized_data, shuffle=True)
    dataset = files.shard(*distribute).apply(
        parallel_interleave(
            _parsed_file,
            cycle_length=read_threads,
            sloppy=True,
        ))
    print(dataset)
    batched = dataset.apply(batch_and_drop_remainder(batch_size))
    return batched.prefetch(prefetch)
def input_fn(self, params):
    """Input function which provides a single batch for train or eval.

    Args:
      params: `dict` of parameters passed from the `TPUEstimator`.
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

    Returns:
      A `tf.data.Dataset` object.
    """
    batch_size = params['batch_size']

    if FLAGS.use_data != 'real':
        # Synthetic input: one random image/label pair repeated forever.
        random_image = tf.random.uniform(
            [FLAGS.height, FLAGS.width, 3], minval=-1, maxval=1)
        random_label = tf.random.uniform(
            [], minval=0, maxval=999, dtype=tf.int32)
        examples = tf.data.Dataset.range(1).repeat().map(
            lambda data: (random_image, random_label))
    else:
        assert self.data_dir, 'data_dir is required'
        split_glob = 'train-*' if self.is_training else 'validation-*'
        files = tf.data.Dataset.list_files(
            os.path.join(self.data_dir, split_glob),
            shuffle=self.is_training)
        if self.is_training:
            files = files.repeat()

        def _open_file(path):
            return tf.data.TFRecordDataset(
                path, buffer_size=FLAGS.prefetch_dataset_buffer_size)

        records = files.apply(
            contrib_data.parallel_interleave(
                _open_file,
                cycle_length=FLAGS.num_files_infeed,
                sloppy=True))
        if FLAGS.followup_shuffle_buffer_size > 0:
            records = records.shuffle(
                buffer_size=FLAGS.followup_shuffle_buffer_size)
        examples = records.map(
            self.dataset_parser,
            num_parallel_calls=FLAGS.num_parallel_calls)

    batches = (
        examples.prefetch(batch_size)
        .batch(batch_size, drop_remainder=True)
        .prefetch(2)  # Prefetch overlaps in-feed with training
    )

    def _transpose_batch(images, labels):
        return tensor_transform_fn(images, params['output_perm']), labels

    return batches.map(
        _transpose_batch, num_parallel_calls=FLAGS.num_parallel_calls)
def get_pretraining_dataset(opts,
                            data_type,
                            is_training=True,
                            num_cpu_threads=4,
                            use_static_mask=False):
    """Build the batched pretraining dataset described by `opts`.

    Args:
        opts: dict providing `'train_file'` / `'test_file'` (comma-separated
            glob patterns), `'micro_batch_size'`, `'seq_length'`,
            `'max_predictions_per_seq'`, and, for training,
            `'distributed_worker_count'`, `'distributed_worker_index'`
            and `'seed'`.
        data_type: forwarded to `_decode_record`.
        is_training: selects the training or the test pattern and pipeline.
        num_cpu_threads: upper bound on parallel file readers and the number
            of parallel batches.
        use_static_mask: selects the feature schema in which masked tokens
            were re-arranged to the first `max_predictions_per_seq`
            positions.

    Returns:
        A repeating dataset of decoded batches with the remainder dropped.
    """
    patterns = opts['train_file'] if is_training else opts['test_file']
    micro_batch_size = opts['micro_batch_size']
    max_seq_length = opts['seq_length']
    max_predictions_per_seq = opts['max_predictions_per_seq']

    input_files = []
    for pattern in patterns.split(","):
        input_files.extend(tf.gfile.Glob(pattern))

    tf.logging.info("*** Input Files ***")
    for path in input_files:
        tf.logging.info(" %s" % path)

    def _ints(length):
        return tf.FixedLenFeature([length], tf.int64)

    def _floats(length):
        return tf.FixedLenFeature([length], tf.float32)

    if not use_static_mask:
        # Default, the tokens have not been re-arranged.
        name_to_features = {
            "input_ids": _ints(max_seq_length),
            "input_mask": _ints(max_seq_length),
            "segment_ids": _ints(max_seq_length),
            "masked_lm_positions": _ints(max_predictions_per_seq),
            "masked_lm_ids": _ints(max_predictions_per_seq),
            "masked_lm_weights": _floats(max_predictions_per_seq),
            "next_sentence_labels": _ints(1),
        }
    else:
        # The masked tokens have been re-arranged to always be at the first
        # 'max_predictions_per_seq' positions.
        name_to_features = {
            "input_ids": _ints(max_seq_length),
            "input_position": _ints(max_seq_length),
            "segment_ids": _ints(max_seq_length),
            "mask_padding_index": _ints(1),
            "seq_padding_index": _ints(1),
            "masked_labels": _ints(max_predictions_per_seq),
            "masked_lm_weights": _floats(max_predictions_per_seq),
            "next_sentence_labels": _ints(1),
        }

    if not is_training:
        # Eval: no shuffling, sequential read, repeated indefinitely.
        d = tf.data.TFRecordDataset(input_files).repeat()
    else:
        # Training: parallel reading and shuffling.
        d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
        d = d.repeat()

        # `cycle_length` is the number of files read in parallel.
        cycle_length = min(num_cpu_threads, len(input_files))

        # `sloppy` interleaving is not exact, which adds even more
        # randomness to the training pipeline.
        d = d.apply(
            parallel_interleave(
                tf.data.TFRecordDataset,
                sloppy=is_training,
                cycle_length=cycle_length))

        # `buffer_size` should be big enough to shuffle the data well.
        if opts['distributed_worker_count'] > 1:
            # Records (not files) are sharded between workers, and the
            # shuffle is seeded.
            d = d.shard(
                num_shards=opts['distributed_worker_count'],
                index=opts['distributed_worker_index'])
            d = d.shuffle(buffer_size=1000, seed=opts['seed'])
        else:
            d = d.shuffle(buffer_size=1000)

    def _decode(record):
        return _decode_record(record, name_to_features, data_type)

    return d.apply(
        map_and_batch(
            _decode,
            batch_size=micro_batch_size,
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True))
def input_fn(params):
    """The actual input function."""
    batch_size = params['batch_size']

    def _ints(length):
        return tf.FixedLenFeature([length], tf.int64)

    name_to_features = {
        'input_ids': _ints(max_seq_length),
        'input_mask': _ints(max_seq_length),
        'segment_ids': _ints(max_seq_length),
        # Note: this feature keeps the name `next_sentence_labels` to stay
        # compatible with the originally created data. In the ALBERT case
        # it represents sentence_order_labels.
        'next_sentence_labels': _ints(1),
    }

    if FLAGS.masked_lm_budget:
        name_to_features['token_boundary'] = _ints(max_seq_length)
    else:
        name_to_features['masked_lm_positions'] = _ints(
            max_predictions_per_seq)
        name_to_features['masked_lm_ids'] = _ints(max_predictions_per_seq)
        name_to_features['masked_lm_weights'] = tf.FixedLenFeature(
            [max_predictions_per_seq], tf.float32)

    def _decode(record):
        return _decode_record(record, name_to_features)

    if not is_training:
        # Eval: no shuffling, sequential read. Evaluation runs for a fixed
        # number of steps, so repeat to avoid out-of-range exceptions.
        d = tf.data.TFRecordDataset(input_files).repeat()
    else:
        # Training: lots of parallel reading and shuffling.
        file_names = tf.data.Dataset.from_tensor_slices(
            tf.constant(input_files))
        file_names = file_names.repeat().shuffle(buffer_size=len(input_files))

        # `cycle_length` is the number of files read in parallel.
        cycle_length = min(num_cpu_threads, len(input_files))

        # `sloppy` interleaving is not exact, which adds even more
        # randomness to the training pipeline.
        d = file_names.apply(
            contrib_data.parallel_interleave(
                tf.data.TFRecordDataset,
                sloppy=is_training,
                cycle_length=cycle_length,
            ))
        d = d.shuffle(buffer_size=100)

    # The remainder is always dropped here (train and eval alike), so every
    # batch has a fixed size as required on TPU.
    d = d.apply(
        tf.data.experimental.map_and_batch_with_legacy_function(
            _decode,
            batch_size=batch_size,
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True,
        ))
    tf.logging.info(d)
    return d
def get_batch(datasets,
              preprocess_name,
              is_training,
              batch_size,
              num_gpu=1,
              seed=None):
    """Create a one-shot iterator over batched `(image, one-hot label)` pairs.

    Args:
        datasets: object exposing `num_class`, `source` (record files),
            `feature` (parsing spec), `decoder` and `description['name']`.
        preprocess_name: name handed to `get_preprocess_fn`.
        is_training: enables file/record shuffling and training
            preprocessing.
        batch_size: examples per batch.
        num_gpu: accepted but not used by this function.
        seed: seed for both shuffle stages.

    Returns:
        The one-shot iterator of the repeating, batched dataset.
    """
    with tf.device('/cpu:0'):
        num_class = datasets.num_class
        file_name = datasets.source
        feature = datasets.feature
        decoder = datasets.decoder
        name = datasets.description['name']
        image_preprocessing_fn = get_preprocess_fn(preprocess_name)

        def map_func(record):
            parsed = tf.parse_single_example(record, feature)
            image = decoder(parsed['image/encoded'])
            # Perform additional preprocessing on the parsed data.
            image = image_preprocessing_fn(
                image, datasets, is_training=is_training)
            label = tf.one_hot(parsed['image/class/label'], num_class)
            return image, label

        files = tf.data.Dataset.from_tensor_slices(file_name)
        if is_training:
            # Shuffle the input files.
            files = files.shuffle(
                len(file_name), seed=seed, reshuffle_each_iteration=True)

        # Convert to individual records. At most 10 files are read and
        # deserialized in parallel: low enough to avoid contention on small
        # systems, high enough to benefit from parallelism. Consider
        # raising it on machines with many CPU cores.
        cycle_length = min(10, len(file_name))
        records = files.apply(
            data.parallel_interleave(
                tf.data.TFRecordDataset, cycle_length=cycle_length))

        # Prefetch a batch at a time to smooth out the time taken to load
        # input files as we go through shuffling and processing.
        records = records.prefetch(buffer_size=batch_size)

        if is_training:
            records = records.apply(
                data.shuffle_and_repeat(buffer_size=10000, seed=seed))
        else:
            records = records.repeat()

        # Parse the raw records into images and labels. Testing has shown
        # that num_parallel_batches > 1 brings no throughput gain, since
        # batch_size is almost always much greater than the number of CPU
        # cores.
        batches = records.apply(
            data.map_and_batch(
                map_func=map_func,
                batch_size=batch_size,
                num_parallel_batches=1))

        # Operations between the final prefetch and the iterator's get_next
        # call run synchronously, so prefetch once more to keep all of the
        # work above off the critical training path.
        batches = batches.prefetch(buffer_size=32)
        iterator = batches.make_one_shot_iterator()
    return iterator
def __call__(self, params):
    """Build the detection input dataset for the given parameters.

    Files matching `self._file_pattern` are read in parallel, every record
    is parsed by `_dataset_parser`, the results are batched with
    `drop_remainder=True`, and `_process_example` packs each batch into the
    structure returned to the caller.

    Args:
        params: dict read for `min_level`, `max_level`, `num_scales`,
            `aspect_ratios`, `anchor_scale`, `image_size`, `num_classes`,
            `skip_crowd_during_training`, `input_rand_hflip`,
            `train_scale_min`, `train_scale_max`, `use_bfloat16` and
            `batch_size`, plus the optional
            `dataset_private_threadpool_size` and
            `dataset_max_intra_op_parallelism`.

    Returns:
        A `tf.data.Dataset`. When training its elements are
        `(images, labels)`; otherwise they are dicts with the keys
        `'inputs'`, `'image_info'` and `'labels'`.
    """
    input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                    params['num_scales'],
                                    params['aspect_ratios'],
                                    params['anchor_scale'],
                                    params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                           params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
        """Parse data to a fixed dimension input image and learning targets.

        Args:
          value: A dictionary contains an image and groundtruth annotations.

        Returns:
          image: Image tensor that is preprocessed to have normalized value
            and fixed dimension [image_size, image_size, 3]
          cls_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor
            with shape [height_l, width_l, num_anchors]. The height_l and
            width_l represent the dimension of class logits at l-th level.
          box_targets_dict: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor
            with shape [height_l, width_l, num_anchors * 4]. The height_l
            and width_l represent the dimension of bounding box regression
            output at l-th level.
          num_positives: Number of positive anchors in the image.
          source_id: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_scale: Scale of the processed image to the original image.
          image_info: image information that includes the original height
            and width, the scale of the processed image to the original
            image, and the scaled height and width.
          boxes: Groundtruth bounding box annotations. The box is
            represented in [y1, x1, y2, x2] format. The tensor is padded
            with -1 to the fixed dimension [self._max_num_instances, 4].
          is_crowds: Groundtruth annotations to indicate if an annotation
            represents a group of instances by value {0, 1}. The tensor is
            padded with 0 to the fixed dimension
            [self._max_num_instances, 1].
          areas: Groundtruth areas annotations. The tensor is padded with
            -1 to the fixed dimension [self._max_num_instances, 1].
          classes: Groundtruth classes annotations. The tensor is padded
            with -1 to the fixed dimension [self._max_num_instances, 1].
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            # Fall back to an all-False crowd mask, shaped like the classes,
            # when the example carries no crowd annotations.
            data['groundtruth_is_crowd'] = tf.cond(
                tf.greater(tf.size(data['groundtruth_is_crowd']), 0),
                lambda: data['groundtruth_is_crowd'],
                lambda: tf.zeros_like(data['groundtruth_classes'],
                                      dtype=tf.bool))
            source_id = data['source_id']
            image = data['image']
            boxes = data['groundtruth_boxes']
            classes = data['groundtruth_classes']
            classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
            areas = data['groundtruth_area']
            is_crowds = data['groundtruth_is_crowd']
            # NOTE(review): this repeats the cast/reshape applied to
            # `classes` three statements above; it looks redundant.
            classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
            input_height = tf.shape(image)[0]
            input_width = tf.shape(image)[1]

            # Drop crowd annotations while training, if requested.
            # NOTE(review): only `classes` and `boxes` are filtered here;
            # `areas` and `is_crowds` keep every entry.
            if params['skip_crowd_during_training'] and self._is_training:
                indices = tf.where(
                    tf.logical_not(data['groundtruth_is_crowd']))
                classes = tf.gather_nd(classes, indices)
                boxes = tf.gather_nd(boxes, indices)

            input_processor = DetectionInputProcessor(
                image, params['image_size'], boxes, classes)
            input_processor.normalize_image()
            if self._is_training and params['input_rand_hflip']:
                input_processor.random_horizontal_flip()
            # Random scaling for training, fixed output-size scaling
            # otherwise.
            if self._is_training:
                input_processor.set_training_random_scale_factors(
                    params['train_scale_min'], params['train_scale_max'])
            else:
                input_processor.set_scale_factors_to_output_size()
            image = input_processor.resize_and_crop_image()
            boxes, classes = input_processor.resize_and_crop_boxes()

            # Assign anchors.
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(boxes, classes)

            # An empty source id is replaced by '-1' before the string is
            # converted to a number.
            source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                                 source_id)
            source_id = tf.string_to_number(source_id)

            # Pad groundtruth data for evaluation.
            image_scale = input_processor.image_scale_to_original
            scaled_height = tf.to_float(
                input_height) * input_processor.image_scale
            scaled_width = tf.to_float(
                input_width) * input_processor.image_scale
            # Layout: [scaled_height, scaled_width, image_scale,
            #          input_height, input_width].
            image_info = tf.stack([
                tf.cast(scaled_height, dtype=tf.float32),
                tf.cast(scaled_width, dtype=tf.float32),
                image_scale,
                tf.cast(input_height, dtype=tf.float32),
                tf.cast(input_width, dtype=tf.float32),
            ])
            boxes *= image_scale
            is_crowds = tf.cast(is_crowds, dtype=tf.float32)
            boxes = pad_to_fixed_size(boxes, -1,
                                      [self._max_num_instances, 4])
            is_crowds = pad_to_fixed_size(is_crowds, 0,
                                          [self._max_num_instances, 1])
            areas = pad_to_fixed_size(areas, -1,
                                      [self._max_num_instances, 1])
            classes = pad_to_fixed_size(classes, -1,
                                        [self._max_num_instances, 1])
            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)
            return (image, cls_targets, box_targets, num_positives,
                    source_id, image_scale, image_info, boxes, is_crowds,
                    areas, classes)

    batch_size = params['batch_size']

    # NOTE(review): `seed=` receives the *return value* of
    # `tf.random.set_random_seed(...)`, so the call also sets the
    # graph-level seed from the current time as a side effect. Presumably
    # that return value is None -- confirm this is intended.
    dataset = tf.data.Dataset.list_files(self._file_pattern,
                                         shuffle=self._is_training,
                                         seed=tf.random.set_random_seed(
                                             int(time.time() * 1e9)))
    if self._is_training:
        dataset = dataset.repeat()

    # Prefetch data from files.
    def _prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(
            filename, buffer_size=BUFFER_SIZE).prefetch(1)
        return dataset

    # 32 files are read concurrently; ordering is relaxed only for training.
    dataset = dataset.apply(
        contrib_data.parallel_interleave(_prefetch_dataset,
                                         cycle_length=32,
                                         sloppy=self._is_training))

    # Optional threading knobs, applied only when present and truthy.
    if params.get('dataset_private_threadpool_size', None):
        options = tf.data.Options()
        options.experimental_threading.private_threadpool_size = params[
            'dataset_private_threadpool_size']
        dataset = dataset.with_options(options)

    if params.get('dataset_max_intra_op_parallelism', None):
        options = tf.data.Options()
        options.experimental_threading.max_intra_op_parallelism = params[
            'dataset_max_intra_op_parallelism']
        dataset = dataset.with_options(options)

    if self._is_training:
        dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    def _process_example(images, cls_targets, box_targets, num_positives,
                         source_ids, image_scales, image_info, boxes,
                         is_crowds, areas, classes):
        """Processes one batch of data."""
        labels = {}
        # Count num_positives in a batch.
        num_positives_batch = tf.reduce_mean(num_positives)
        # The batch mean is tiled so that every example carries the same
        # value, giving a [batch_size, 1] tensor.
        labels['mean_num_positives'] = tf.reshape(
            tf.tile(tf.expand_dims(num_positives_batch, 0), [
                batch_size,
            ]), [batch_size, 1])

        for level in range(params['min_level'], params['max_level'] + 1):
            labels['cls_targets_%d' % level] = cls_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]
        # Concatenate groundtruth annotations to a tensor.
        groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                     axis=2)
        labels['source_ids'] = source_ids
        labels['groundtruth_data'] = groundtruth_data
        labels['image_scales'] = image_scales
        labels['image_info'] = image_info
        # Outside training a single dict is returned instead of a tuple.
        if not self._is_training:
            return {
                'inputs': images,
                'image_info': image_info,
                'labels': labels
            }
        return images, labels

    dataset = dataset.map(_process_example)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    # `take` comes after batching, so it limits the number of batches.
    if self._num_examples > 0:
        dataset = dataset.take(self._num_examples)
    return dataset
def __call__(self, params):
    """Build the batched `tf.data` input pipeline for segmentation.

    Files matching `self._file_pattern` are listed, read as TFRecords from
    up to 32 files at a time, parsed into `(image, label)` pairs and
    batched. When `self._is_training` is set, the file list is shuffled
    and repeated, the interleave is sloppy (record order is not
    deterministic), and records pass through a 64-element shuffle buffer.

    Args:
      params: Mapping of run options. Keys read here: 'batch_size',
        'image_size' and 'use_bfloat16' always; 'input_rand_hflip',
        'train_scale_min' and 'train_scale_max' only when training;
        'ignore_label' only when not training.

    Returns:
      A dataset of `(image, label)` batches holding `params['batch_size']`
      examples each; a trailing incomplete batch is dropped.
    """
    example_decoder = tf_example_decoder.TfExampleSegmentationDecoder()

    def _dataset_parser(value):
        """Convert one raw record into a preprocessed `(image, label)` pair.

        Args:
          value: One element read from the TFRecord files; it is handed
            to the example decoder unchanged.

        Returns:
          A tuple `(image, label)`:
            image: The decoded image after `normalize_image` and
              `resize_and_crop_image`, cast to bfloat16 when
              `params['use_bfloat16']` is set.
            label: The result of `resize_and_crop_label` applied to the
              int32-cast 'labels_class' field.
        """
        with tf.name_scope('parser'):
            decoded = example_decoder.decode(value)
            image = decoded['image']
            label = tf.to_int32(decoded['labels_class'])
            processor = SegmentationInputProcessor(
                image, params['image_size'], label)

            # The image normalization is identical to Cloud TPU ResNet.
            processor.normalize_image()
            if self._is_training:
                # Augmentation (flip, random rescale) is training-only.
                if params['input_rand_hflip']:
                    processor.random_horizontal_flip()
                processor.set_training_random_scale_factors(
                    params['train_scale_min'], params['train_scale_max'])

            image = processor.resize_and_crop_image()

            # Padding is background (class=0) during training and the
            # configured ignore label otherwise.
            pad_label = 0 if self._is_training else params['ignore_label']
            label = processor.resize_and_crop_label(pad_label)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)
            return image, label

    def _prefetch_dataset(filename):
        """Open one TFRecord file with a one-element prefetch buffer."""
        return tf.data.TFRecordDataset(filename).prefetch(1)

    batch_size = params['batch_size']

    filenames = tf.data.Dataset.list_files(
        self._file_pattern, shuffle=self._is_training)
    if self._is_training:
        filenames = filenames.repeat()

    records = filenames.apply(
        contrib_data.parallel_interleave(
            _prefetch_dataset, cycle_length=32, sloppy=self._is_training))
    if self._is_training:
        records = records.shuffle(64)

    # Parse, then buffer one batch worth of examples ahead of batching.
    return (
        records.map(_dataset_parser, num_parallel_calls=64)
        .prefetch(batch_size)
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )