def _create_dataset_iterator(data_sources, parse_fn, reader=None):
    """Build an initializable iterator over parsed, batched TFRecord data.

    The whole pipeline is placed on ``/cpu:0`` and is configured from
    ``FLAGS`` (batch size, caching, shuffle buffer, number of epochs,
    parallel batches, prefetch buffer and private thread-pool size).

    Args:
        data_sources: File names; sliced into a dataset and read as TFRecords.
        parse_fn: Function applied to each record by ``map_and_batch``.
        reader: Not read by this function.

    Returns:
        An initializable iterator. Its initializer is also registered in the
        ``tf.GraphKeys.TABLE_INITIALIZERS`` collection.
    """
    with tf.device("/cpu:0"):
        data_ns = _experimental_data_namespace()

        # Files are read one at a time, buffering eight batches of records.
        file_ds = tf.data.Dataset.from_tensor_slices(data_sources)
        records = file_ds.apply(
            data_ns.parallel_interleave(
                tf.data.TFRecordDataset,
                cycle_length=1,
                buffer_output_elements=FLAGS.batch_size * 8,
                prefetch_input_elements=FLAGS.batch_size * 8))
        if FLAGS.datasets_use_caching:
            records = records.cache()

        shuffled = records.apply(
            data_ns.shuffle_and_repeat(
                buffer_size=FLAGS.shuffle_buffer_size,
                count=FLAGS.num_epochs))
        batches = shuffled.apply(
            data_ns.map_and_batch(
                map_func=parse_fn,
                batch_size=FLAGS.batch_size,
                num_parallel_batches=FLAGS.num_parallel_batches))
        batches = batches.prefetch(buffer_size=FLAGS.prefetch_buffer_size)

        # Run the input pipeline on its own thread pool.
        pool = threadpool.PrivateThreadPool(
            FLAGS.num_preprocessing_threads,
            display_name='input_pipeline_thread_pool')
        batches = threadpool.override_threadpool(batches, pool)

        ds_iterator = batches.make_initializable_iterator()
        # Registering the initializer with the table initializers means it is
        # run wherever that collection is run.
        tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                             ds_iterator.initializer)
        return ds_iterator
def train_data_generator(batch_size):
    """Build the training input dataset from sharded TFRecord files.

    Files matching ``train-*-of-*`` under ``FLAGS.train_data_dir`` are
    shuffled in Python, read through a 10-way parallel interleave, and zipped
    with a repeating ``range(batch_size)`` counter and a repeating constant
    ``'train'`` flag. The zipped elements are then prefetched, shuffled,
    repeated indefinitely, and passed through ``preprocess_fn`` while being
    batched.

    Args:
        batch_size: Number of elements per batch. Also the period of the
            counter component and the basis of the first prefetch buffer.

    Returns:
        The batched dataset (not an iterator), overridden to run on a private
        10-thread pool.

    Raises:
        ValueError: If no file matches the glob pattern.
    """
    # These imports are function-local in the original code; they stay local
    # because the file-level import block is not visible from here.
    import random
    from tensorflow.contrib.data.python.ops import threadpool

    with tf.name_scope('train_batch_processing'):
        glob_pattern = os.path.join(FLAGS.train_data_dir, 'train-*-of-*')
        file_names = gfile.Glob(glob_pattern)
        if not file_names:
            # Fail here with a clear message instead of later, when the
            # dataset built by list_files is first evaluated.
            raise ValueError(
                'Found no files in --train_data_dir matching: {}'.format(
                    glob_pattern))
        random.shuffle(file_names)

        ds = tf.data.TFRecordDataset.list_files(file_names)
        ds = ds.apply(
            interleave_ops.parallel_interleave(
                tf.data.TFRecordDataset, cycle_length=10))

        # Side channels zipped with every record: a cycling position counter
        # and a constant 'train' tag.
        counter = tf.data.Dataset.range(batch_size).repeat()
        flags = tf.data.Dataset.from_tensors(tf.constant('train')).repeat()
        ds = tf.data.Dataset.zip((ds, counter, flags))

        ds = ds.prefetch(buffer_size=batch_size * 4)
        ds = ds.shuffle(buffer_size=1000)
        ds = ds.repeat()
        ds = ds.apply(
            batching.map_and_batch(
                map_func=preprocess_fn,
                batch_size=batch_size,
                num_parallel_batches=10))
        ds = ds.prefetch(buffer_size=10)

        pool = threadpool.PrivateThreadPool(
            10, display_name='input_pipeline_thread_pool')
        ds = threadpool.override_threadpool(ds, pool)
        return ds
def create_dataset(self,
                   batch_size,
                   num_splits,
                   batch_size_per_split,
                   dataset,
                   subset,
                   train,
                   datasets_repeat_cached_sample=False,
                   num_threads=None,
                   datasets_use_caching=False,
                   datasets_parallel_interleave_cycle_length=None,
                   datasets_sloppy_parallel_interleave=False,
                   datasets_parallel_interleave_prefetch=None):
    """Create the TFRecord-backed input dataset.

    Side effects: resets ``self.counter`` to 0, stores ``dataset.options`` on
    ``self.options`` and, when ``self.options.data_mode == 'poison'``, loads
    ``self.poison_pattern`` and ``self.poison_mask`` from the configured
    pattern file.

    The file pattern is taken from ``self.options.data_subset`` and shuffling
    from ``self.options.shuffle``; the ``subset`` and ``train`` arguments are
    not read by this method.

    Args:
        batch_size: Period of the counter zipped with each record, and the
            size of the first prefetch buffer.
        num_splits: Number of parallel batches and final prefetch size.
        batch_size_per_split: Batch size passed to ``map_and_batch``.
        dataset: Object providing ``options``, ``tf_record_pattern`` and
            ``read_poison_pattern``.
        subset: Not read here.
        train: Not read here.
        datasets_repeat_cached_sample: If true, repeat one cached record.
        num_threads: If truthy, size of a private thread pool.
        datasets_use_caching: If true, cache the zipped records.
        datasets_parallel_interleave_cycle_length: Interleave cycle length;
            10 is used when this is falsy.
        datasets_sloppy_parallel_interleave: ``sloppy`` flag for interleave.
        datasets_parallel_interleave_prefetch: ``prefetch_input_elements``
            for interleave.

    Returns:
        The batched ``tf.data`` dataset.

    Raises:
        ValueError: If no file matches the pattern.
    """
    assert self.supports_dataset()
    self.counter = 0
    self.options = dataset.options
    options = self.options
    if options.data_mode == 'poison':
        pattern_and_mask = dataset.read_poison_pattern(
            options.poison_pattern_file)
        self.poison_pattern, self.poison_mask = pattern_and_mask

    glob_pattern = dataset.tf_record_pattern(self.options.data_subset)
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --data_dir matching: {}'.format(glob_pattern))

    cycle_length = datasets_parallel_interleave_cycle_length or 10
    ds = tf.data.TFRecordDataset.list_files(file_names).apply(
        tf.data.experimental.parallel_interleave(
            tf.data.TFRecordDataset,
            cycle_length=cycle_length,
            sloppy=datasets_sloppy_parallel_interleave,
            prefetch_input_elements=datasets_parallel_interleave_prefetch))
    if datasets_repeat_cached_sample:
        # Repeat a single sample indefinitely to emulate memory-speed IO.
        ds = ds.take(1).cache().repeat()

    # Pair every record with its cycling position in [0, batch_size).
    counter = tf.data.Dataset.range(batch_size).repeat()
    ds = tf.data.Dataset.zip((ds, counter)).prefetch(buffer_size=batch_size)
    if datasets_use_caching:
        ds = ds.cache()

    if self.options.shuffle:
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=self.parse_and_preprocess,
            batch_size=batch_size_per_split,
            num_parallel_batches=num_splits))
    ds = ds.prefetch(buffer_size=num_splits)

    if not num_threads:
        return ds
    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(ds, pool)
def testOverrideThreadPool(self):
    """Check that a private pool of N threads yields at most N thread ids."""

    def get_thread_id(_):
        # When called from an "alien" thread (here a `PrivateThreadPool`
        # thread), Python creates a dummy thread object for it. That object
        # lacks the TensorFlow display name but carries a unique identifier
        # that maps one-to-one to the underlying OS thread.
        return np.array(threading.current_thread().ident).astype(np.int64)

    for num_threads in [1, 2, 4, 8, 16]:
        id_dataset = Dataset.range(1000).map(
            lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
            num_parallel_calls=32)
        id_dataset = id_dataset.apply(unique.unique())

        pool = threadpool.PrivateThreadPool(
            num_threads,
            display_name='private_thread_pool_%d' % num_threads)
        id_dataset = threadpool.override_threadpool(id_dataset, pool)

        thread_ids = []
        for thread_id in datasets.Iterator(id_dataset):
            thread_ids.append(thread_id)

        observed = len(thread_ids)
        self.assertEqual(observed, len(set(thread_ids)))
        self.assertGreater(observed, 0)
        # NOTE(mrry): Thread pool scheduling is outside our control, so there
        # is no guarantee that every thread in the pool performs work; only
        # the upper bound can be asserted.
        self.assertLessEqual(observed, num_threads)
def testOverrideThreadPool(self):
    """Verify the number of distinct worker threads never exceeds the pool."""

    def get_thread_id(_):
        # A thread not started by Python (such as a `PrivateThreadPool`
        # thread) is represented by a dummy thread object. It does not carry
        # the TensorFlow-given display name, but its identifier is unique
        # and corresponds one-to-one with the OS thread.
        return np.array(threading.current_thread().ident).astype(np.int64)

    pool_sizes = [1, 2, 4, 8, 16]
    for num_threads in pool_sizes:
        dataset = Dataset.range(1000)
        dataset = dataset.map(
            lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
            num_parallel_calls=32)
        dataset = dataset.apply(unique.unique())
        dataset = threadpool.override_threadpool(
            dataset,
            threadpool.PrivateThreadPool(
                num_threads,
                display_name='private_thread_pool_%d' % num_threads))

        thread_ids = []
        for next_element in datasets.Iterator(dataset):
            thread_ids.append(next_element)

        distinct_ids = set(thread_ids)
        self.assertEqual(len(thread_ids), len(distinct_ids))
        self.assertGreater(len(thread_ids), 0)
        # NOTE(mrry): Scheduling inside the pool is not under our control, so
        # we cannot require that all threads were used, only that no more
        # than `num_threads` were.
        self.assertLessEqual(len(thread_ids), num_threads)
def parallel_read_data(data_dir,
                       batch_size,
                       batch_size_per_split,
                       num_splits,
                       cycle_length,
                       num_threads,
                       cache_data=False,
                       train=False):
    """Create an iterator over preprocessed batches read from TFRecords.

    Files matching ``train*_*.tfrecords`` under ``data_dir`` are always used;
    ``train`` only selects between shuffle-and-repeat and plain repeat.

    Args:
        data_dir: Directory containing the ``.tfrecords`` files.
        batch_size: Period of the counter zipped with each record, and the
            size of the first prefetch buffer.
        batch_size_per_split: Batch size passed to ``map_and_batch``.
        num_splits: Number of parallel batches and final prefetch size.
        cycle_length: Cycle length of the parallel interleave.
        num_threads: If truthy, size of a private thread pool. In that case
            an initializable iterator is returned and its initializer is
            added to ``tf.GraphKeys.TABLE_INITIALIZERS``; otherwise a
            one-shot iterator is returned.
        cache_data: If true, repeat one cached record indefinitely.
        train: If true, shuffle (buffer of 10000) while repeating.

    Returns:
        A dataset iterator yielding the output of ``parse_and_preprocess``
        in batches.

    Raises:
        ValueError: If no file matches the glob pattern.
    """
    glob_pattern = os.path.join(data_dir, 'train*_*.tfrecords')
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        # Fail here with a clear message instead of later, when the dataset
        # built by list_files is first evaluated.
        raise ValueError(
            'Found no files in data_dir matching: {}'.format(glob_pattern))

    ds = tf.data.TFRecordDataset.list_files(file_names)
    ds = ds.apply(
        tf.data.experimental.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=cycle_length))
    if cache_data:
        ds = ds.take(1).cache().repeat()

    # Pair every record with its cycling position in [0, batch_size).
    counter = tf.data.Dataset.range(batch_size).repeat()
    ds = tf.data.Dataset.zip((ds, counter))
    ds = ds.prefetch(buffer_size=batch_size)

    if train:
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=parse_and_preprocess,
            batch_size=batch_size_per_split,
            num_parallel_batches=num_splits))
    ds = ds.prefetch(buffer_size=num_splits)

    if num_threads:
        ds = threadpool.override_threadpool(
            ds,
            threadpool.PrivateThreadPool(
                num_threads, display_name='input_pipeline_thread_pool'))
        ds_iterator = ds.make_initializable_iterator()
        tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                             ds_iterator.initializer)
    else:
        ds_iterator = ds.make_one_shot_iterator()
    return ds_iterator
def create_dataset(self,
                   batch_size,
                   num_splits,
                   batch_size_per_split,
                   dataset,
                   subset,
                   train,
                   datasets_repeat_cached_sample,
                   num_threads=None,
                   datasets_use_caching=False,
                   datasets_parallel_interleave_cycle_length=None,
                   datasets_sloppy_parallel_interleave=False):
    """Creates a dataset for the benchmark using map() and padded_batch().

    Args:
        batch_size: Period of the counter zipped with each record, and the
            size of the first prefetch buffer.
        num_splits: Final prefetch size; also scales the map parallelism.
        batch_size_per_split: Batch size passed to ``padded_batch``.
        dataset: Object providing ``tf_record_pattern(subset)``.
        subset: Subset name used to build the file pattern.
        train: If true, shuffle (buffer of 10000) while repeating.
        datasets_repeat_cached_sample: If true, repeat one cached record.
        num_threads: If truthy, size of a private thread pool.
        datasets_use_caching: If true, cache the zipped records.
        datasets_parallel_interleave_cycle_length: Interleave cycle length;
            10 is used when this is falsy.
        datasets_sloppy_parallel_interleave: ``sloppy`` flag for interleave.

    Returns:
        The padded, batched ``tf.data`` dataset.

    Raises:
        ValueError: If no file matches the pattern.
    """
    # TODO(laigd): currently the only difference between this and the one in
    # BaseImagePreprocessor is, this uses map() and padded_batch() while the
    # latter uses batching.map_and_batch(). Try to merge them.
    assert self.supports_datasets()

    glob_pattern = dataset.tf_record_pattern(subset)
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --data_dir matching: {}'.format(glob_pattern))

    cycle_length = datasets_parallel_interleave_cycle_length or 10
    ds = tf.data.TFRecordDataset.list_files(file_names).apply(
        interleave_ops.parallel_interleave(
            tf.data.TFRecordDataset,
            cycle_length=cycle_length,
            sloppy=datasets_sloppy_parallel_interleave))
    if datasets_repeat_cached_sample:
        # Repeat a single sample indefinitely to emulate memory-speed IO.
        ds = ds.take(1).cache().repeat()

    counter = tf.data.Dataset.range(batch_size).repeat()
    ds = tf.data.Dataset.zip((ds, counter)).prefetch(buffer_size=batch_size)
    if datasets_use_caching:
        ds = ds.cache()

    if train:
        ds = ds.apply(tf.contrib.data.shuffle_and_repeat(buffer_size=10000))
    else:
        ds = ds.repeat()

    ds = ds.map(
        map_func=self.parse_and_preprocess,
        num_parallel_calls=batch_size_per_split * num_splits)
    # Pad each component to its declared shape with the leading (batch)
    # dimension stripped.
    per_example_shapes = tuple(
        tf.TensorShape(shape[1:]) for shape in self.output_shapes)
    ds = ds.padded_batch(
        batch_size=batch_size_per_split,
        padded_shapes=per_example_shapes,
        drop_remainder=True)
    ds = ds.prefetch(buffer_size=num_splits)

    if not num_threads:
        return ds
    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(ds, pool)
def create_dataset(self,
                   batch_size,
                   num_splits,
                   batch_size_per_split,
                   dataset,
                   subset,
                   train,
                   datasets_repeat_cached_sample=False,
                   num_threads=None,
                   datasets_use_caching=False,
                   datasets_parallel_interleave_cycle_length=None,
                   datasets_sloppy_parallel_interleave=False,
                   datasets_parallel_interleave_prefetch=None):
    """Creates a dataset for the benchmark from in-memory ``dataset.data``.

    Side effects: copies ``options``, ``meanpose`` and ``scale_size`` from
    ``dataset`` onto ``self`` and, when ``data_mode == 'poison'``, loads
    ``self.poison_pattern`` and ``self.poison_mask``.

    Shuffling is controlled by ``self.options.shuffle``. The ``subset``,
    ``train``, ``num_threads`` and ``datasets_*_interleave_*`` arguments are
    not read by this method.

    Args:
        batch_size: Size of the first prefetch buffer.
        num_splits: Number of parallel batches and final prefetch size.
        batch_size_per_split: Batch size passed to ``map_and_batch``.
        dataset: Object providing ``options``, ``meanpose``, ``scale_size``,
            ``data``, ``read_poison_pattern`` and ``num_examples_per_epoch``.
        datasets_repeat_cached_sample: If true, repeat one cached element.
        datasets_use_caching: If true, cache the elements.

    Returns:
        The batched dataset, running on a single-thread private pool.
    """
    assert self.supports_datasets()
    self.options = dataset.options
    self.meanpose = dataset.meanpose
    self.scale_size = dataset.scale_size
    if self.options.data_mode == 'poison':
        pattern_and_mask = dataset.read_poison_pattern(
            self.options.poison_pattern_file)
        self.poison_pattern, self.poison_mask = pattern_and_mask

    ds = tf.data.TFRecordDataset.from_tensor_slices(dataset.data)
    if datasets_repeat_cached_sample:
        # Repeat a single sample indefinitely to emulate memory-speed IO.
        ds = ds.take(1).cache().repeat()
    ds = ds.prefetch(buffer_size=batch_size)
    if datasets_use_caching:
        ds = ds.cache()

    if self.options.shuffle:
        shuffle_buffer = min(100000, dataset.num_examples_per_epoch())
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(
                buffer_size=shuffle_buffer))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=self.preprocess,
            batch_size=batch_size_per_split,
            num_parallel_batches=num_splits,
            drop_remainder=True))
    ds = ds.prefetch(buffer_size=num_splits)

    # NOTE(review): the caller-supplied `num_threads` is discarded and the
    # pipeline always runs on a one-thread private pool. This matches the
    # existing behavior; confirm whether the override is intentional.
    single_thread_pool = threadpool.PrivateThreadPool(
        1, display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(ds, single_thread_pool)
def create_iterator(params,
                    batch_size,
                    num_splits,
                    batch_size_per_split,
                    preprocess_fn,
                    dataset,
                    subset,
                    train,
                    cache_data,
                    num_threads=None):
    """Creates a dataset iterator for the benchmark.

    Args:
        params: Object providing ``num_shards`` and ``shard_idx``; only read
            when ``train`` is true.
        batch_size: Period of the counter zipped with each record, and the
            size of the first prefetch buffer.
        num_splits: Number of parallel batches and final prefetch size.
        batch_size_per_split: Batch size passed to ``map_and_batch``.
        preprocess_fn: Function mapped over the zipped elements.
        dataset: Object providing ``tf_record_pattern(subset)``.
        subset: Subset name used to build the file pattern.
        train: If true, shard the records and shuffle (buffer of 10000).
        cache_data: If true, repeat one cached record indefinitely.
        num_threads: If truthy, size of a private thread pool. In that case
            an initializable iterator is returned and its initializer is
            added to ``tf.GraphKeys.TABLE_INITIALIZERS``; otherwise a
            one-shot iterator is returned.

    Returns:
        The dataset iterator.

    Raises:
        ValueError: If no file matches the pattern.
    """
    glob_pattern = dataset.tf_record_pattern(subset)
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --data_dir matching: {}'.format(glob_pattern))

    ds = tf.data.TFRecordDataset.list_files(file_names).apply(
        interleave_ops.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=10))
    if cache_data:
        ds = ds.take(1).cache().repeat()

    # (hsj) Training reads only this worker's shard of the record stream.
    if train:
        ds = ds.shard(params.num_shards, params.shard_idx)

    counter = tf.data.Dataset.range(batch_size).repeat()
    ds = tf.data.Dataset.zip((ds, counter)).prefetch(buffer_size=batch_size)
    if train:
        ds = ds.shuffle(buffer_size=10000)
    ds = ds.repeat()

    ds = ds.apply(
        batching.map_and_batch(
            map_func=preprocess_fn,
            batch_size=batch_size_per_split,
            num_parallel_batches=num_splits))
    ds = ds.prefetch(buffer_size=num_splits)

    if not num_threads:
        return ds.make_one_shot_iterator()

    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    ds = threadpool.override_threadpool(ds, pool)
    ds_iterator = ds.make_initializable_iterator()
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                         ds_iterator.initializer)
    return ds_iterator
def _create_dataset_iterator(data_sources,
                             batch_size,
                             parse_fn,
                             reader=None,
                             is_training=True):
    """Build an initializable iterator over parsed, batched TFRecord data.

    The pipeline is placed on ``/cpu:0``. Caching, shuffle buffer, number of
    epochs, parallel batches, prefetch buffer and thread-pool size come from
    ``FLAGS``.

    Args:
        data_sources: File names; sliced into a dataset and read as TFRecords.
        batch_size: Batch size; the interleave buffers are eight times this.
        parse_fn: Function applied to each record by ``map_and_batch``.
        reader: Not read by this function.
        is_training: If true, shuffle and repeat for ``FLAGS.num_epochs``.

    Returns:
        An initializable iterator. Its initializer is also registered in the
        ``tf.GraphKeys.TABLE_INITIALIZERS`` collection.

    Raises:
        RuntimeError: If the TensorFlow version is 2.0 or newer.
    """
    with tf.device("/cpu:0"):
        experimental_data_namespace = _experimental_data_namespace()
        files = tf.data.Dataset.from_tensor_slices(data_sources)
        dataset = files.apply(
            experimental_data_namespace.parallel_interleave(
                tf.data.TFRecordDataset,
                cycle_length=1,
                buffer_output_elements=batch_size * 8,
                prefetch_input_elements=batch_size * 8))
        if FLAGS.datasets_use_caching:
            dataset = dataset.cache()
        if is_training:
            dataset = dataset.apply(
                experimental_data_namespace.shuffle_and_repeat(
                    buffer_size=FLAGS.shuffle_buffer_size,
                    count=FLAGS.num_epochs))
        dataset = dataset.apply(
            experimental_data_namespace.map_and_batch(
                map_func=parse_fn,
                batch_size=batch_size,
                num_parallel_batches=FLAGS.num_parallel_batches))
        dataset = dataset.prefetch(buffer_size=FLAGS.prefetch_buffer_size)
        dataset = threadpool.override_threadpool(
            dataset,
            threadpool.PrivateThreadPool(
                FLAGS.num_preprocessing_threads,
                display_name="input_pipeline_thread_pool"))

        tf_version = Version(__version__)
        if tf_version >= Version("2.0"):
            raise RuntimeError(
                "Version {} not supported.".format(tf_version))
        if tf_version < Version("1.14.0"):
            # Every release below 1.14 uses the Dataset method. Previously
            # versions below 1.12 fell through to the tf.compat.v1 call,
            # which is not available on those releases.
            ds_iterator = dataset.make_initializable_iterator()
        else:
            ds_iterator = tf.compat.v1.data.make_initializable_iterator(
                dataset)

        tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                             ds_iterator.initializer)
        return ds_iterator
def create_iterator(batch_size,
                    num_threads,
                    parallel_interleave_cycle_length=0,
                    input_file_spec=None,
                    input_filenames=None,
                    dataset_buffer_size=None,
                    prefetch_records=None):
    """Create an iterator over batches produced by ``process_record``.

    Args:
        batch_size: Batch size passed to ``map_and_batch``.
        num_threads: If truthy, size of a private thread pool. In that case
            an initializable iterator is returned and its initializer is
            added to ``tf.GraphKeys.TABLE_INITIALIZERS``; otherwise a
            one-shot iterator is returned.
        parallel_interleave_cycle_length: If truthy, read files through a
            parallel interleave with this cycle length.
        input_file_spec: File pattern; used only if ``input_filenames`` is
            falsy.
        input_filenames: Explicit file names; takes precedence.
        dataset_buffer_size: ``buffer_size`` for each ``TFRecordDataset``
            created by the interleave.
        prefetch_records: Buffer size of the record-level prefetch.

    Returns:
        The dataset iterator.

    Raises:
        ValueError: If neither ``input_filenames`` nor ``input_file_spec``
            is given.
    """
    if input_filenames:
        file_ds = tf.data.Dataset.from_tensor_slices(
            tf.convert_to_tensor(input_filenames))
    elif input_file_spec:
        file_ds = tf.data.TFRecordDataset.list_files(input_file_spec)
    else:
        raise ValueError('You must specify input_file_spec or input_filenames')

    if parallel_interleave_cycle_length:

        def _open_records(filename):
            return tf.data.TFRecordDataset(
                filename, buffer_size=dataset_buffer_size)

        ds = file_ds.apply(
            interleave_ops.parallel_interleave(
                _open_records,
                cycle_length=parallel_interleave_cycle_length))
    else:
        # TFRecordDataset is handed the dataset of file names directly.
        ds = file_ds.apply(tf.data.TFRecordDataset)

    ds = ds.prefetch(buffer_size=prefetch_records).repeat()

    num_splits = 1
    ds = ds.apply(
        batching.map_and_batch(
            map_func=process_record,
            batch_size=batch_size,
            num_parallel_batches=num_splits))
    ds = ds.prefetch(buffer_size=num_splits)

    if not num_threads:
        return ds.make_one_shot_iterator()

    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    ds = threadpool.override_threadpool(ds, pool)
    ds_iterator = ds.make_initializable_iterator()
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                         ds_iterator.initializer)
    return ds_iterator
def create_dataset(batch_size,
                   num_splits,
                   batch_size_per_split,
                   preprocess_fn,
                   dataset,
                   subset,
                   train,
                   cache_data,
                   num_threads=None):
    """Creates a dataset for the benchmark.

    Args:
        batch_size: Period of the counter zipped with each record, and the
            size of the first prefetch buffer.
        num_splits: Number of parallel batches and final prefetch size.
        batch_size_per_split: Batch size passed to ``map_and_batch``.
        preprocess_fn: Function mapped over the zipped elements.
        dataset: Object providing ``tf_record_pattern(subset)``.
        subset: Subset name used to build the file pattern.
        train: If true, shuffle with a buffer of 10000.
        cache_data: If true, repeat one cached record indefinitely.
        num_threads: If truthy, size of a private thread pool.

    Returns:
        The batched ``tf.data`` dataset.

    Raises:
        ValueError: If no file matches the pattern.
    """
    glob_pattern = dataset.tf_record_pattern(subset)
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --data_dir matching: {}'.format(glob_pattern))

    ds = tf.data.TFRecordDataset.list_files(file_names).apply(
        interleave_ops.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=10))
    if cache_data:
        ds = ds.take(1).cache().repeat()

    # Pair every record with its cycling position in [0, batch_size).
    counter = tf.data.Dataset.range(batch_size).repeat()
    ds = tf.data.Dataset.zip((ds, counter)).prefetch(buffer_size=batch_size)
    if train:
        ds = ds.shuffle(buffer_size=10000)
    ds = ds.repeat()

    ds = ds.apply(
        batching.map_and_batch(
            map_func=preprocess_fn,
            batch_size=batch_size_per_split,
            num_parallel_batches=num_splits))
    ds = ds.prefetch(buffer_size=num_splits)

    if not num_threads:
        return ds
    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(ds, pool)
def testNumThreads(self):
    """Check that a private pool of N threads yields at most N thread ids."""

    def get_thread_id(_):
        # When called from an "alien" thread (here a `PrivateThreadPool`
        # thread), Python creates a dummy thread object for it. That object
        # lacks the TensorFlow display name but carries a unique identifier
        # that maps one-to-one to the underlying OS thread.
        return np.array(threading.current_thread().ident).astype(np.int64)

    for num_threads in [1, 2, 4, 8, 16]:
        dataset = dataset_ops.Dataset.range(1000).map(
            lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
            num_parallel_calls=32)
        dataset = dataset.apply(unique.unique())

        pool = threadpool.PrivateThreadPool(
            num_threads,
            display_name="private_thread_pool_%d" % num_threads)
        dataset = threadpool.override_threadpool(dataset, pool)

        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()

        with self.test_session() as sess:
            sess.run(iterator.initializer)
            thread_ids = []
            # Drain the dataset; OutOfRangeError marks its end.
            try:
                while True:
                    thread_ids.append(sess.run(next_element))
            except errors.OutOfRangeError:
                pass

            observed = len(thread_ids)
            self.assertEqual(observed, len(set(thread_ids)))
            self.assertGreater(observed, 0)
            # NOTE(mrry): Thread pool scheduling is outside our control, so
            # there is no guarantee that every thread in the pool performs
            # work; only the upper bound can be asserted.
            self.assertLessEqual(observed, num_threads)
def testNumThreads(self, num_threads, max_intra_op_parallelism):
    """Check that at most ``num_threads`` distinct threads run the map.

    Args:
        num_threads: Size of the private thread pool under test.
        max_intra_op_parallelism: Forwarded to ``PrivateThreadPool``.
    """

    def get_thread_id(_):
        # A thread not started by Python (such as a `PrivateThreadPool`
        # thread) is represented by a dummy thread object. It does not carry
        # the TensorFlow-given display name, but its identifier is unique
        # and corresponds one-to-one with the OS thread.
        return np.array(threading.current_thread().ident).astype(np.int64)

    dataset = dataset_ops.Dataset.range(1000).map(
        lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
        num_parallel_calls=32)
    dataset = dataset.apply(unique.unique())

    pool = threadpool.PrivateThreadPool(
        num_threads,
        max_intra_op_parallelism=max_intra_op_parallelism,
        display_name="private_thread_pool_%d" % num_threads)
    dataset = threadpool.override_threadpool(dataset, pool)

    iterator = dataset.make_initializable_iterator()
    next_element = iterator.get_next()

    with self.cached_session() as sess:
        sess.run(iterator.initializer)
        thread_ids = []
        # Drain the dataset; OutOfRangeError marks its end.
        try:
            while True:
                thread_ids.append(sess.run(next_element))
        except errors.OutOfRangeError:
            pass

        observed = len(thread_ids)
        self.assertEqual(observed, len(set(thread_ids)))
        self.assertGreater(observed, 0)
        # NOTE(mrry): Scheduling inside the pool is not under our control, so
        # we cannot require that all threads were used, only that no more
        # than `num_threads` were.
        self.assertLessEqual(observed, num_threads)
def _create_mock_seq2seq_iterator():
    """Build an initializable iterator over synthetic five-component data.

    The source tensors have ``FLAGS.batch_size * 10 * 8`` rows: three int64
    matrices of ones with ``FLAGS.mock_seq_length`` columns, followed by two
    int32 vectors drawn uniformly from ``[1, FLAGS.mock_seq_length + 1)``.
    The rows are repeated indefinitely, batched by ``FLAGS.batch_size``,
    prefetched, and run on a private thread pool.

    Returns:
        An initializable iterator. Its initializer is also registered in the
        ``tf.GraphKeys.TABLE_INITIALIZERS`` collection.
    """
    with tf.device('/cpu:0'):

        def _ones_matrix():
            return tf.ones(
                shape=[FLAGS.batch_size * 10 * 8, FLAGS.mock_seq_length],
                dtype=tf.int64)

        def _random_vector():
            return tf.random.uniform(
                shape=[FLAGS.batch_size * 10 * 8],
                minval=1,
                maxval=FLAGS.mock_seq_length + 1,
                dtype=tf.int32,
                seed=None,
                name=None)

        # Tuple elements are evaluated left to right, so the tensors are
        # created in the same order as the components they feed.
        components = (
            _ones_matrix(),
            _ones_matrix(),
            _ones_matrix(),
            _random_vector(),
            _random_vector(),
        )
        dataset = tf.data.Dataset.from_tensor_slices(components).repeat()
        dataset = dataset.batch(FLAGS.batch_size)
        dataset = dataset.prefetch(buffer_size=FLAGS.prefetch_buffer_size)

        pool = threadpool.PrivateThreadPool(
            FLAGS.num_preprocessing_threads,
            display_name='input_pipeline_thread_pool')
        dataset = threadpool.override_threadpool(dataset, pool)

        mock_iterate_op = dataset.make_initializable_iterator()
        tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                             mock_iterate_op.initializer)
        return mock_iterate_op
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           num_epochs=1,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           num_parallel_batches=1):
    """Turn a dataset of raw records into a batched dataset of parsed records.

    Args:
        dataset: A Dataset representing raw records.
        is_training: Whether the input is for training; enables shuffling and
            is forwarded to ``parse_record_fn``.
        batch_size: The number of samples per batch.
        shuffle_buffer: The buffer size to use when shuffling records. A
            larger value results in better randomness, but smaller values
            reduce startup time and use less memory.
        parse_record_fn: Called as ``parse_record_fn(value, is_training,
            dtype)`` for each raw record.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type forwarded to ``parse_record_fn``.
        datasets_num_private_threads: If truthy, number of threads of a
            private thread pool used for the dataset computation.
        num_parallel_batches: Number of parallel batches for ``tf.data``.

    Returns:
        The batched, prefetched Dataset, ready for iteration.
    """
    # Prefetch one batch of raw records to smooth out file-loading latency
    # ahead of shuffling and parsing.
    records = dataset.prefetch(buffer_size=batch_size)

    # Shuffle before repeating so that epoch boundaries are respected.
    if is_training:
        records = records.shuffle(buffer_size=shuffle_buffer)
    records = records.repeat(num_epochs)

    # Parse and batch in a single fused transformation.
    batches = records.apply(
        tf.contrib.data.map_and_batch(
            lambda value: parse_record_fn(value, is_training, dtype),
            batch_size=batch_size,
            num_parallel_batches=num_parallel_batches,
            drop_remainder=False))

    # Anything between the final prefetch and the iterator's get_next call
    # runs synchronously at run time, so prefetch once more to keep the work
    # above off the critical training path. AUTOTUNE lets
    # DistributionStrategies choose how many batches to fetch based on the
    # number of devices present.
    batches = batches.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    if not datasets_num_private_threads:
        return batches

    # Run tf.data operations on a dedicated, fixed-size thread pool.
    tf.logging.info('datasets_num_private_threads: %s',
                    datasets_num_private_threads)
    pool = threadpool.PrivateThreadPool(
        datasets_num_private_threads,
        display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(batches, pool)
def __init__(self, sess, batch_size, shuffle, is_training, config,
             dataset_path, input_size):
    """Build the TFRecord input pipeline and per-GPU image/label tensors.

    Stores the constructor arguments on ``self``, builds a dataset from the
    files matching ``<dataset_name>_<train_name|validation_name>*tfrecord``
    under ``config.dataset_dir``, and exposes ``self.images`` and
    ``self.labels`` as lists with one reshaped tensor per GPU.

    Args:
        sess: Session object; only stored on ``self.sess`` here.
        batch_size: Global batch size; split evenly across
            ``config.num_gpus`` with integer division.
        shuffle: If true, shuffle with ``config.buffer_size``.
        is_training: Selects the train or validation file name key and the
            matching map function.
        config: Configuration object; the attributes read are visible in
            the body below.
        dataset_path: Only stored on ``self.dataset_path``; the file pattern
            is built from ``config`` instead.
        input_size: Only stored on ``self.input_size``.
    """
    # String-handle placeholder that selects which iterator feeds `iterator`
    # below.
    self.ds_handle_ph = tf.placeholder(tf.string, shape=[])
    self.sess = sess
    self.config = config
    self.input_size = input_size
    self.dataset_path = dataset_path
    self.training_handle = None
    self.validation_handle = None
    if is_training:
        conf_key = "train_name"
        dataset_map = self.train_dataset_map
    else:
        conf_key = "validation_name"
        dataset_map = self.test_dataset_map
    # files = tf.data.Dataset.list_files(dataset_path)
    files = tf.data.Dataset.list_files(
        os.path.join(config.dataset_dir, "%s_%s*tfrecord" % (config.dataset_name, getattr(config, conf_key))))
    # if hasattr(tf.contrib.data, "parallel_interleave"):
    #     ds = files.apply(tf.contrib.data.parallel_interleave(
    #         tf.data.TFRecordDataset, cycle_length=config.num_parallel_readers))
    # else:
    ds = files.interleave(tf.data.TFRecordDataset, cycle_length=config.num_parallel_readers)
    if config.cache_data:
        # Repeat one cached record indefinitely.
        ds = ds.take(1).cache().repeat()
    # Pair every record with its cycling position in [0, batch_size).
    counter = tf.data.Dataset.range(batch_size)
    counter = counter.repeat()
    ds = tf.data.Dataset.zip((ds, counter))
    ds = ds.prefetch(buffer_size=batch_size)
    # ds = ds.repeat()
    if shuffle:
        ds = ds.shuffle(buffer_size=config.buffer_size)
    # NOTE(review): the condition is hard-coded, so the `else` branch at the
    # bottom is unreachable; the original condition is kept in the comment.
    if True:  # config.num_gpus > 1:
        batch_size_per_split = batch_size // config.num_gpus
        images = []
        labels = []
        ds = ds.apply(
            batching.map_and_batch(
                map_func=dataset_map, batch_size=batch_size_per_split, num_parallel_batches=config.num_gpus))
        ds = ds.prefetch(buffer_size=config.num_gpus)
        # ds = ds.map(dataset_map, num_parallel_calls=batch_size)
        # ds = ds.batch(batch_size)
        # ds = ds.prefetch(buffer_size=batch_size)
        # Handle-driven iterator: it is built from the dataset's types and
        # shapes, and is the one `get_next()` is called on below.
        iterator = tf.data.Iterator.from_string_handle(
            self.ds_handle_ph, ds.output_types, ds.output_shapes)
        if config.datasets_num_private_threads:
            ds = threadpool.override_threadpool(
                ds, threadpool.PrivateThreadPool(
                    config.datasets_num_private_threads, display_name='input_pipeline_thread_pool'))
            self.training_iterator = ds.make_initializable_iterator()
            # The initializer is registered with the table initializers
            # collection.
            tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, self.training_iterator.initializer)
        else:
            self.training_iterator = ds.make_one_shot_iterator()
        # self.training_iterator = ds.make_one_shot_iterator()
        # One get_next() per GPU; each result is reshaped to the per-split
        # batch size.
        for d in range(config.num_gpus):
            image, label = iterator.get_next()
            # Uses dimension 1 for both spatial extents of the reshape, i.e.
            # presumably square images -- TODO confirm.
            size = image.get_shape()[1]
            depth = image.get_shape()[3]
            image = tf.reshape(
                image, shape=[batch_size_per_split, size, size, depth])
            label = tf.reshape(label, [batch_size_per_split, config.num_class])
            labels.append(label)
            images.append(image)
        # labels[d], images[d] = iterator.get_next()
        # for split_index in range(config.num_gpus):
        #     images[split_index] = tf.reshape(
        #         images[split_index],
        #         shape=[batch_size_per_split, config.input_size, config.input_size,
        #                config.num_channel])
        #     labels[split_index] = tf.reshape(labels[split_index],
        #                                      [batch_size_per_split])
        self.images = images
        self.labels = labels
    else:
        # Single-iterator path (currently unreachable, see note above).
        if hasattr(tf.contrib.data, "map_and_batch"):
            ds = ds.apply(tf.contrib.data.map_and_batch(map_func=dataset_map, batch_size=batch_size))
        else:
            ds = ds.map(map_func=dataset_map, num_parallel_calls=config.num_parallel_calls)
            ds = ds.batch(batch_size)
        ds = ds.prefetch(buffer_size=batch_size)
        self.iterator = ds.make_initializable_iterator()
        self.next_batch = self.iterator.get_next()
def get_input_fn(filenames,
                 batch_size=1,
                 num_threads=2,
                 perform_shuffle=False,
                 perform_augmentation=False,
                 per_image_standardization=False,
                 enable_cutout=False,
                 repeat_count=1):
    """Input pipeline for ImageNet tfrecord.

    Args:
        filenames: File pattern(s) passed to ``list_files``.
        batch_size: Batch size; also the size of the record prefetch buffer.
        num_threads: If truthy, size of a private thread pool.
        perform_shuffle: If true, shuffle records with a buffer of 10000.
        perform_augmentation: If true, apply ``augment``; otherwise central
            crop and resize.
        per_image_standardization: If true, standardize each image; otherwise
            apply ``(x - 0.5) * 2``.
        enable_cutout: If true, apply ``cutout`` during augmentation.
        repeat_count: Number of times the records are repeated.

    Returns:
        A dataset of ``(image, label)`` batches, with images in CHW layout.
    """

    def parse(example_proto):
        feature_spec = {
            'image/class/label':
                tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
            'image/encoded':
                tf.FixedLenFeature((), tf.string, default_value=''),
            'image/height':
                tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/width':
                tf.FixedLenFeature((), tf.int64, default_value=0),
        }
        parsed = tf.parse_single_example(example_proto, feature_spec)
        encoded_image = parsed['image/encoded']
        height = tf.cast(parsed['image/height'], tf.int32)
        width = tf.cast(parsed['image/width'], tf.int32)
        label = tf.reshape(tf.cast(parsed['image/class/label'], tf.int32), [])
        image = tf.image.decode_image(encoded_image, channels=3)
        return tf.reshape(image, [height, width, 3]), label

    def resize(image):
        # resize_bilinear works on 4-D input: add a batch dimension, resize
        # to the target dimensions (the output is float), then drop it again.
        batched = tf.expand_dims(image, 0)
        batched = tf.image.resize_bilinear(
            batched, [TARGET_HEIGHT, TARGET_WIDTH])
        return tf.squeeze(batched, [0])

    def distort_image(image):
        # The only bounding box is the whole image.
        whole_image_box = tf.constant(
            [0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
        crop_begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
            tf.shape(image),
            bounding_boxes=whole_image_box,
            min_object_covered=min_object_covered,
            aspect_ratio_range=aspect_ratio_range,
            area_range=area_range,
            max_attempts=max_attempts,
            use_image_if_no_bounding_boxes=True)
        # Crop the image to the sampled bounding box.
        cropped = tf.slice(image, crop_begin, crop_size)
        cropped.set_shape([None, None, 3])
        return cropped

    def augment(image):
        # Random crop, resize, then flip and colour jitter.
        image = resize(distort_image(image))
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_brightness(image, max_delta=32. / 255.)
        image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
        image = tf.clip_by_value(image, 0.0, 1.0)
        if enable_cutout:
            image = cutout(image, p=0.5, s_l=0.02, s_h=0.4, r_1=0.3,
                           r_2=3.3, v_l=0, v_h=1.0)
        return image

    def preprocess_fn(example_proto):
        # Decode the example, then convert the image to float32.
        image, label = parse(example_proto)
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)

        if perform_augmentation:
            # Data augmentation, including the resize.
            image = augment(image)
        else:
            # Central crop like slim, followed by the resize.
            image = resize(
                tf.image.central_crop(
                    image, central_fraction=central_fraction))

        if per_image_standardization:
            # Subtract the mean and divide by the variance of the pixels.
            image = tf.image.per_image_standardization(image)
        else:
            # (x - 0.5) * 2, which maps [0, 1] onto [-1.0, 1.0].
            image = tf.multiply(tf.subtract(image, 0.5), 2.0)

        # HWC -> CHW.
        image = tf.transpose(image, [2, 0, 1])
        return image, tf.cast(label, tf.int32)

    ds = tf.data.TFRecordDataset.list_files(filenames).apply(
        interleave_ops.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=10))
    ds = ds.prefetch(buffer_size=batch_size)
    if perform_shuffle:
        ds = ds.shuffle(buffer_size=10000)
    ds = ds.repeat(repeat_count)
    ds = ds.apply(
        batching.map_and_batch(
            map_func=preprocess_fn,
            batch_size=batch_size,
            num_parallel_batches=2))
    ds = ds.prefetch(buffer_size=1)

    if not num_threads:
        return ds
    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(ds, pool)
def create_dataset(self,
                   batch_size,
                   num_splits,
                   batch_size_per_split,
                   dataset,
                   subset,
                   train,
                   datasets_repeat_cached_sample=False,
                   num_threads=None,
                   datasets_use_caching=False,
                   datasets_parallel_interleave_cycle_length=None,
                   datasets_sloppy_parallel_interleave=False,
                   datasets_parallel_interleave_prefetch=None):
    """Creates a dataset for the benchmark from in-memory ``dataset.data``.

    Side effects: stores ``dataset.options`` on ``self.options`` and, when
    the string ``'poison'`` occurs in ``self.options.data_mode``, loads
    ``self.poison_pattern`` and ``self.poison_mask``.

    Shuffling is controlled by ``self.options.shuffle``. The ``subset``,
    ``train`` and ``datasets_*_interleave_*`` arguments are not read by this
    method.

    Args:
        batch_size: Size of the first prefetch buffer.
        num_splits: Number of parallel batches and final prefetch size.
        batch_size_per_split: Batch size passed to ``map_and_batch``.
        dataset: Object providing ``options``, ``data``,
            ``read_poison_pattern`` and ``num_examples_per_epoch``.
        datasets_repeat_cached_sample: If true, repeat one cached element.
        num_threads: If truthy, size of a private thread pool.
        datasets_use_caching: If true, cache the elements.

    Returns:
        The batched ``tf.data`` dataset.
    """
    assert self.supports_datasets()
    self.options = dataset.options
    if 'poison' in self.options.data_mode:
        pattern_and_mask = dataset.read_poison_pattern(
            self.options.poison_pattern_file)
        self.poison_pattern, self.poison_mask = pattern_and_mask

    ds = tf.data.TFRecordDataset.from_tensor_slices(dataset.data)
    if datasets_repeat_cached_sample:
        # Repeat a single sample indefinitely to emulate memory-speed IO.
        ds = ds.take(1).cache().repeat()
    ds = ds.prefetch(buffer_size=batch_size)
    if datasets_use_caching:
        ds = ds.cache()

    if self.options.shuffle:
        shuffle_buffer = min(100000, dataset.num_examples_per_epoch())
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(
                buffer_size=shuffle_buffer))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=self.preprocess,
            batch_size=batch_size_per_split,
            num_parallel_batches=num_splits,
            drop_remainder=True))
    ds = ds.prefetch(buffer_size=num_splits)

    if not num_threads:
        return ds
    pool = threadpool.PrivateThreadPool(
        num_threads, display_name='input_pipeline_thread_pool')
    return threadpool.override_threadpool(ds, pool)
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           vocab_table,
                           num_repeat=1,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           num_parallel_batches=1,
                           bucket_width=1):
    """Turn a dataset of raw records into padded, batched examples.

    Args:
      dataset: A Dataset representing raw records.
      is_training: A boolean denoting whether the input is for training.
        Training input is length-filtered and batched per length bucket;
        otherwise the examples are padded-batched in order.
      batch_size: The number of samples per batch. During training a value
        of 1024 or more is handled as a token-based batch size (see the
        bucketing code below).
      shuffle_buffer: Currently unused. Shuffling is disabled in this
        function because the data is shuffled globally with ``shuf`` in a
        subprocess before each epoch.
      parse_record_fn: Called as
        ``parse_record_fn(raw_record, vocab_table, is_training, dtype)``.
        Its result must have two components, referred to here as
        ``tokens`` and ``noise_tokens``.
      vocab_table: Vocabulary forwarded to ``parse_record_fn``.
      num_repeat: The number of times to repeat the dataset.
      dtype: Data type forwarded to ``parse_record_fn``.
      datasets_num_private_threads: Number of threads for a private
        threadpool created for all datasets computation.
      num_parallel_batches: Multiplied by ``batch_size`` to obtain the
        number of parallel calls used for parsing.
      bucket_width: Width of the length buckets used for training; an
        example with ``n`` tokens is assigned to bucket
        ``n // bucket_width``.

    Returns:
      The processed dataset, ready for iteration.
    """

    def _parse(raw_record):
        return parse_record_fn(raw_record, vocab_table, is_training, dtype)

    def _has_valid_length(tokens, noise_tokens):
        # Keep only non-empty examples of at most 175 tokens.
        return tf.logical_and(tf.size(tokens) <= 175, tf.size(tokens) > 0)

    # Prefetch a batch at a time to smooth out the time taken to load input
    # files for processing.
    records = dataset.prefetch(buffer_size=batch_size)

    # No dataset.shuffle() here: the data is globally shuffled with ``shuf``
    # in a subprocess before each epoch, so ``shuffle_buffer`` is not used.

    # Repeat the records for the requested number of passes.
    records = records.repeat(num_repeat)

    # Parse the raw records.
    examples = records.map(
        _parse, num_parallel_calls=batch_size * num_parallel_batches)
    if is_training:
        examples = examples.filter(_has_valid_length)

    padded_shapes = get_padded_shapes(examples)

    if not is_training:
        batches = examples.padded_batch(
            batch_size, padded_shapes=padded_shapes, drop_remainder=False)
    else:
        def _bucket_key(token_ids, noise_token_ids):
            lowest_bucket = tf.constant(0, dtype=tf.int32)
            return tf.to_int64(
                tf.maximum(lowest_bucket,
                           tf.size(token_ids) // bucket_width))

        def _pad_and_batch(unused_key, windowed_data):
            return windowed_data.padded_batch(
                batch_size, padded_shapes=padded_shapes)

        def _window_size(key):
            # Shift by one: for bucket_width == 1, key 0 is unassigned.
            return tf.to_int64(batch_size // ((key + 1) * bucket_width))

        if batch_size >= 1024:
            # A batch size this large is very likely token based, so the
            # number of examples per window shrinks as the bucket's length
            # grows.
            window_kwargs = {'window_size_func': _window_size}
        else:
            window_kwargs = {'window_size': batch_size}

        group_by_length = tf.data.experimental.group_by_window(
            key_func=_bucket_key,
            reduce_func=_pad_and_batch,
            **window_kwargs)
        batches = examples.apply(group_by_length)

    # Anything between the final prefetch and the iterator's get_next call
    # runs synchronously at run time, so prefetch once more to move all of
    # the work above into the background. tf.contrib.data.AUTOTUNE lets
    # DistributionStrategies adjust how many batches to fetch based on how
    # many devices are present.
    batches = batches.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    # Optionally run tf.data operations on a thread pool of a fixed size.
    if datasets_num_private_threads:
        tf.logging.info('datasets_num_private_threads: %s',
                        datasets_num_private_threads)
        private_pool = threadpool.PrivateThreadPool(
            datasets_num_private_threads,
            display_name='input_pipeline_thread_pool')
        batches = threadpool.override_threadpool(batches, private_pool)

    return batches
def process_dataset(dataset,
                    data_set,
                    is_training,
                    batch_size,
                    shuffle_buffer,
                    parse_fn,
                    anchors_path,
                    num_epochs=1,
                    dtype=tf.float32,
                    max_num_boxes_per_image=20,
                    image_size=416,
                    datasets_num_private_threads=None,
                    augmentation=None,
                    num_parallel_batches=1):
    """Turn a dataset of raw records into parsed, batched examples.

    Args:
      dataset: A Dataset representing raw records.
      data_set: A dataset obj, forwarded to ``parse_fn`` as ``dataset``.
      is_training: A boolean denoting whether the input is for training;
        records are shuffled only when it is true.
      batch_size: The number of samples per batch.
      shuffle_buffer: The buffer size to use when shuffling records. A
        larger value results in better randomness, but smaller values
        reduce startup time and use less memory.
      parse_fn: Python function run through ``tf.py_func``. It receives a
        raw record plus the keyword arguments ``dataset``,
        ``augmentation``, ``dtype`` (a numpy dtype), ``anchors_path``,
        ``max_num_boxes_per_image`` and ``image_size``, and must return two
        values convertible to ``dtype``.
      anchors_path: Path to the anchors file, forwarded to ``parse_fn``.
      num_epochs: The number of epochs to repeat the dataset.
      dtype: Data type declared for both ``parse_fn`` outputs.
      max_num_boxes_per_image: Max num boxes per scale, forwarded to
        ``parse_fn``.
      image_size: Input image size for yolo, forwarded to ``parse_fn``.
      datasets_num_private_threads: Number of threads for a private
        threadpool created for all datasets computation.
      augmentation: Optional. An imgaug (https://github.com/aleju/imgaug)
        augmentation, forwarded to ``parse_fn``. For example, passing
        imgaug.augmenters.Fliplr(0.5) flips images right/left 50% of the
        time.
      num_parallel_batches: Number of parallel batches for tf.data.

    Returns:
      The processed dataset, ready for iteration.
    """
    # parse_fn runs in Python, so it is handed the numpy counterpart of the
    # requested dtype: float32 for the default dtype, float16 otherwise.
    numpy_dtype = np.float32 if dtype == DEFAULT_DTYPE else np.float16
    bound_parse_fn = functools.partial(
        parse_fn,
        dataset=data_set,
        augmentation=augmentation,
        dtype=numpy_dtype,
        anchors_path=anchors_path,
        max_num_boxes_per_image=max_num_boxes_per_image,
        image_size=image_size)

    def _parse_record(img_id):
        return tf.py_func(func=bound_parse_fn,
                          inp=[img_id],
                          Tout=[dtype, dtype])

    # Prefetch ahead of shuffling/processing to smooth out the time taken
    # to load input files.
    records = dataset.prefetch(buffer_size=batch_size * 10)

    if is_training:
        # Shuffle before repeating so epoch boundaries are respected.
        records = records.shuffle(buffer_size=shuffle_buffer)

    # Repeat the records for the number of epochs to train.
    records = records.repeat(num_epochs)

    parse_and_batch = tf.contrib.data.map_and_batch(
        _parse_record,
        batch_size=batch_size,
        num_parallel_batches=num_parallel_batches,
        drop_remainder=False)
    batches = records.apply(parse_and_batch)

    # Anything between the final prefetch and the iterator's get_next call
    # runs synchronously at run time, so prefetch once more to move all of
    # the work above into the background. tf.contrib.data.AUTOTUNE lets
    # DistributionStrategies adjust how many batches to fetch based on how
    # many devices are present.
    batches = batches.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    # Optionally run tf.data operations on a thread pool of a fixed size.
    if datasets_num_private_threads:
        tf.logging.info('datasets_num_private_threads: %s',
                        datasets_num_private_threads)
        private_pool = threadpool.PrivateThreadPool(
            datasets_num_private_threads,
            display_name='input_pipeline_thread_pool')
        batches = threadpool.override_threadpool(batches, private_pool)

    return batches