def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    file_ds = tf.data.Dataset.from_tensor_slices(
        get_filenames(is_training, data_dir))

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Expand each file name into the individual records stored in it.
    record_ds = file_ds.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        record_ds, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
        num_epochs)
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             num_parallel_calls=1, multi_gpu=False):
    """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_parallel_calls: The number of records that are processed in
            parallel. This can be optimized per data set but for generally
            homogeneous data sets, should be approximately the number of
            available CPU cores.
        multi_gpu: Whether this is run multi-GPU. Note that this is only
            required currently to handle the batch leftovers, and can be
            removed when that is handled directly by Estimator.

    Returns:
        A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

    # Use a real conditional expression: the `cond and a or b` idiom silently
    # falls through to `b` whenever `a` is falsy.
    num_images = (
        _NUM_IMAGES['train'] if is_training else _NUM_IMAGES['validation'])

    return resnet_run_loop.process_record_dataset(
        dataset, is_training, batch_size, _NUM_IMAGES['train'], parse_record,
        num_epochs, num_parallel_calls, examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None,
             dtype=tf.float32):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_gpus: The number of gpus used for training.
        dtype: Data type to use for images/features.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    record_ds = tf.data.FixedLengthRecordDataset(
        get_filenames(is_training, data_dir), _RECORD_BYTES)

    # The per-epoch example count is only supplied when training.
    if is_training:
        epoch_examples = _NUM_IMAGES['train']
    else:
        epoch_examples = None

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_NUM_IMAGES['train'],
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=epoch_examples,
        dtype=dtype,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             num_parallel_batches=1):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    # Records are read as fixed-size byte strings of `_RECORD_BYTES` each.
    record_ds = tf.data.FixedLengthRecordDataset(
        get_filenames(is_training, data_dir), _RECORD_BYTES)

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_NUM_IMAGES['train'],
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             num_parallel_calls=1, multi_gpu=False):
    """Input function which provides batches for train or eval.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_parallel_calls: The number of records that are processed in
            parallel. This can be optimized per data set but for generally
            homogeneous data sets, should be approximately the number of
            available CPU cores.
        multi_gpu: Whether this is run multi-GPU. Note that this is only
            required currently to handle the batch leftovers, and can be
            removed when that is handled directly by Estimator.

    Returns:
        A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Use a real conditional expression: the `cond and a or b` idiom silently
    # falls through to `b` whenever `a` is falsy.
    num_images = (
        _NUM_IMAGES['train'] if is_training else _NUM_IMAGES['validation'])

    # Convert to individual records.
    dataset = dataset.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        dataset, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
        num_epochs, num_parallel_calls, examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    input_files = get_filenames(is_training, data_dir)
    file_ds = tf.data.Dataset.from_tensor_slices(input_files)

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Expand each file name into the individual records stored in it.
    record_ds = file_ds.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        record_ds,
        is_training,
        batch_size,
        _SHUFFLE_BUFFER,
        parse_record,
        num_epochs,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             num_parallel_calls=1, multi_gpu=False):
    """Input function which provides batches for train or eval.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_parallel_calls: The number of records that are processed in
            parallel. This can be optimized per data set but for generally
            homogeneous data sets, should be approximately the number of
            available CPU cores.
        multi_gpu: Whether this is run multi-GPU. Note that this is only
            required currently to handle the batch leftovers, and can be
            removed when that is handled directly by Estimator.

    Returns:
        A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Select the split size with a conditional expression; the
    # `cond and a or b` form yields `b` whenever `a` happens to be falsy.
    if is_training:
        num_images = _NUM_IMAGES['train']
    else:
        num_images = _NUM_IMAGES['validation']

    # Convert to individual records.
    dataset = dataset.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        dataset, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
        num_epochs, num_parallel_calls, examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    file_ds = tf.data.Dataset.from_tensor_slices(
        get_filenames(is_training, data_dir))

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Turn file names into individual records. With cycle_length=10, ten files
    # are read and deserialized in parallel: low enough to avoid heavy
    # contention on small systems, high enough to benefit from parallelism.
    # Consider raising it on machines with many CPU cores.
    read_files_in_parallel = tf.contrib.data.parallel_interleave(
        tf.data.TFRecordDataset, cycle_length=10)
    record_ds = file_ds.apply(read_files_in_parallel)

    return resnet_run_loop.process_record_dataset(
        record_ds, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
        num_epochs)
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             num_parallel_batches=1, parse_record_fn=parse_record):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.
        parse_record_fn: Function to use for parsing the records.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    # Records are read as fixed-size byte strings of `_RECORD_BYTES` each.
    record_ds = tf.data.FixedLengthRecordDataset(
        get_filenames(is_training, data_dir), _RECORD_BYTES)

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=NUM_IMAGES['train'],
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    input_files = get_filenames(is_training, data_dir)
    file_ds = tf.data.Dataset.from_tensor_slices(input_files)

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Turn file names into individual records. With cycle_length=10, ten files
    # are read and deserialized in parallel: low enough to avoid heavy
    # contention on small systems, high enough to benefit from parallelism.
    # Consider raising it on machines with many CPU cores.
    interleave_files = tf.contrib.data.parallel_interleave(
        tf.data.TFRecordDataset, cycle_length=10)
    record_ds = file_ds.apply(interleave_files)

    return resnet_run_loop.process_record_dataset(
        record_ds,
        is_training,
        batch_size,
        _SHUFFLE_BUFFER,
        parse_record,
        num_epochs,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None):
    """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_gpus: The number of gpus used for training.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    record_ds = tf.data.FixedLengthRecordDataset(
        get_filenames(is_training, data_dir), _RECORD_BYTES)

    # The per-epoch example count is only supplied when training.
    if is_training:
        epoch_examples = _NUM_IMAGES['train']
    else:
        epoch_examples = None

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_NUM_IMAGES['train'],
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=epoch_examples,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             num_parallel_batches=1, parse_record_fn=parse_record,
             input_context=None):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.
        parse_record_fn: Function to use for parsing the records.
        input_context: A `tf.distribute.InputContext` object passed in by
            `tf.distribute.Strategy`. When given, the file list is sharded
            across input pipelines.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    file_ds = tf.data.Dataset.from_tensor_slices(
        get_filenames(is_training, data_dir))

    if input_context:
        # Give this input pipeline its own slice of the file list.
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d'
            % (input_context.input_pipeline_id,
               input_context.num_input_pipelines))
        file_ds = file_ds.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Turn file names into individual records. With cycle_length=10, ten files
    # are read and deserialized in parallel: low enough to avoid heavy
    # contention on small systems, high enough to benefit from parallelism.
    # Consider raising it on machines with many CPU cores.
    interleave_files = tf.data.experimental.parallel_interleave(
        tf.data.TFRecordDataset, cycle_length=10)
    record_ds = file_ds.apply(interleave_files)

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             num_parallel_calls=1, multi_gpu=False):
    """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_parallel_calls: The number of records that are processed in
            parallel. This can be optimized per data set but for generally
            homogeneous data sets, should be approximately the number of
            available CPU cores.
        multi_gpu: Whether this is run multi-GPU. Note that this is only
            required currently to handle the batch leftovers, and can be
            removed when that is handled directly by Estimator.

    Returns:
        A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

    # Pick the split size explicitly; `cond and a or b` would return the
    # validation count if the training count were ever falsy.
    split = 'train' if is_training else 'validation'
    num_images = _NUM_IMAGES[split]

    return resnet_run_loop.process_record_dataset(
        dataset, is_training, batch_size, _NUM_IMAGES['train'], parse_record,
        num_epochs, num_parallel_calls, examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None,
             indx=None):
    """Build the batched input dataset for training or evaluation.

    Progress messages are written to stdout with `print` at several points.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_gpus: The number of gpus used for training.
        indx: Worker index. Currently unused: the only reference was a
            disabled `dataset.shard(num_gpus, indx)` call.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    input_files = get_filenames(is_training, data_dir)
    file_ds = tf.data.Dataset.from_tensor_slices(input_files)
    print("dataset obj", file_ds)
    print("batch_size", batch_size)
    print("I am in input_fn")

    if is_training:
        # Randomize the order in which the input files are visited.
        # (Sharding by worker index is intentionally disabled here.)
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)
        print("I am in input_fn, shuffling dataset")

    # Turn file names into individual records. With cycle_length=10, ten files
    # are read and deserialized in parallel: low enough to avoid heavy
    # contention on small systems, high enough to benefit from parallelism.
    # Consider raising it on machines with many CPU cores.
    interleave_files = tf.contrib.data.parallel_interleave(
        tf.data.TFRecordDataset, cycle_length=10, sloppy=True)
    record_ds = file_ds.apply(interleave_files)

    # The per-epoch example count is only supplied when training.
    if is_training:
        epoch_examples = _NUM_IMAGES['train']
    else:
        epoch_examples = None

    print("I am in input_fn,before calling process_record_dataset")
    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=epoch_examples,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             parse_record_fn=_parse_record, input_context=None,
             drop_remainder=False):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        parse_record_fn: Function to use for parsing the records.
        input_context: A `tf.distribute.InputContext` object passed in by
            `tf.distribute.Strategy`. Currently unused: the sharding step
            that consumed it is disabled in this function.
        drop_remainder: A boolean indicates whether to drop the remainder of
            the batches. If True, the batch dimension will be static.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    # All files are read through a single TFRecordDataset. The earlier
    # FixedLengthRecordDataset reader, per-pipeline sharding and file-level
    # shuffling are intentionally not applied here.
    record_ds = tf.data.TFRecordDataset(get_filenames(is_training, data_dir))

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=NUM_IMAGES['train'],
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        drop_remainder=drop_remainder,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             num_parallel_calls=1, multi_gpu=False):
    """Input function which provides batches for train or eval.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_parallel_calls: The number of records that are processed in
            parallel. This can be optimized per data set but for generally
            homogeneous data sets, should be approximately the number of
            available CPU cores.
        multi_gpu: Whether this is run multi-GPU. Note that this is only
            required currently to handle the batch leftovers, and can be
            removed when that is handled directly by Estimator.

    Returns:
        A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # `num_images` was previously passed below without ever being assigned,
    # which raised NameError on every call. It is the size of the split being
    # read (assumes the module-level `_NUM_IMAGES` mapping with 'train' and
    # 'validation' keys).
    num_images = (
        _NUM_IMAGES['train'] if is_training else _NUM_IMAGES['validation'])

    # Convert to individual records.
    # cycle_length = 10 means 10 files will be read and deserialized in
    # parallel. This number is low enough to not cause too much contention on
    # small systems but high enough to provide the benefits of
    # parallelization. You may want to increase this number if you have a
    # large number of CPU cores.
    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=10))

    return resnet_run_loop.process_record_dataset(
        dataset, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
        num_epochs, num_parallel_calls, examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
def input_fn(mode, is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             num_parallel_batches=1, parse_record_fn=parse_record):
    """Build the batched input dataset for the split selected by `mode`.

    Args:
        mode: One of 'train', 'val', or 'test'. Any value other than 'train'
            or 'val' selects the test record file.
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data. Not used by this
            function; the record path is chosen from module-level constants.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.
        parse_record_fn: Function to use for parsing the records.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    # The record file is picked per split rather than discovered through
    # `get_filenames` / FixedLengthRecordDataset.
    record_path = (
        TRAIN_JR_RECORD_PATH if mode == 'train'
        else VALID_JR_RECORD_PATH if mode == 'val'
        else TEST_JR_RECORD_PATH
    )
    record_ds = tf.data.TFRecordDataset(record_path)

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=NUM_IMAGES['train'],
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             num_parallel_batches=1, parse_record_fn=parse_record,
             input_context=None):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.
        parse_record_fn: Function to use for parsing the records.
        input_context: A `tf.distribute.InputContext` object passed in by
            `tf.distribute.Strategy`. When given, the record dataset is
            sharded across input pipelines.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    # Records are read as fixed-size byte strings of `_RECORD_BYTES` each.
    record_ds = tf.data.FixedLengthRecordDataset(
        get_filenames(is_training, data_dir), _RECORD_BYTES)

    if input_context:
        # Give this input pipeline its own slice of the records.
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d'
            % (input_context.input_pipeline_id,
               input_context.num_input_pipelines))
        record_ds = record_ds.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=NUM_IMAGES['train'],
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None,
             dtype=tf.float32):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        num_gpus: The number of gpus used for training.
        dtype: Data type to use for images/features.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    # Emit the MLPerf INPUT_ORDER log entry before the pipeline is built.
    mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)

    file_ds = tf.data.Dataset.from_tensor_slices(
        get_filenames(is_training, data_dir))

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Expand each file name into the individual records stored in it.
    record_ds = file_ds.flat_map(tf.data.TFRecordDataset)

    # The per-epoch example count is only supplied when training.
    if is_training:
        epoch_examples = _NUM_IMAGES['train']
    else:
        epoch_examples = None

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=epoch_examples,
        dtype=dtype,
    )
def input_fn(
        is_training,
        data_dir,
        batch_size,
        # Step-based windowing: when training, read `num_steps * batch_size`
        # records beginning at `start_index` instead of whole epochs only.
        start_index=0,
        num_steps=500,
        num_epochs=1,
        dtype=tf.float32,
        datasets_num_private_threads=None,
        num_parallel_batches=1):
    """Build the batched input dataset for training or evaluation.

    When training, only a window of `num_steps * batch_size` records is
    used. The window starts at record `start_index` and, if it runs past
    `_NUM_IMAGES['train']`, continues from the beginning of the data.
    When not training, the full record dataset is used.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        start_index: Index of the first record of the training window.
        num_steps: Number of batches the training window should cover.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    all_records = tf.data.FixedLengthRecordDataset(
        get_filenames(is_training, data_dir), _RECORD_BYTES)

    if not is_training:
        window = all_records
    else:
        train_size = _NUM_IMAGES['train']
        wanted_records = num_steps * batch_size

        if start_index + wanted_records <= train_size:
            # The whole window fits before the end of the data.
            window = all_records.skip(start_index).take(wanted_records)
        else:
            # Take everything from start_index to the end of the data ...
            window = all_records.skip(start_index)
            remaining_records = wanted_records - (train_size - start_index)
            full_passes, tail_records = divmod(remaining_records, train_size)

            # ... then as many complete passes as are still needed ...
            if full_passes > 0:
                window = window.concatenate(all_records.repeat(full_passes))

            # ... and finally the leading records that complete the window.
            window = window.concatenate(all_records.take(tail_records))

    return resnet_run_loop.process_record_dataset(
        dataset=window,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_NUM_IMAGES['train'],
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             parse_record_fn=parse_record, input_context=None,
             drop_remainder=False, tf_data_experimental_slack=False):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        parse_record_fn: Function to use for parsing the records.
        input_context: A `tf.distribute.InputContext` object passed in by
            `tf.distribute.Strategy`. When given, the file list is sharded
            across input pipelines.
        drop_remainder: A boolean indicates whether to drop the remainder of
            the batches. If True, the batch dimension will be static.
        tf_data_experimental_slack: Whether to enable tf.data's
            `experimental_slack` option.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    file_ds = tf.data.Dataset.from_tensor_slices(
        get_filenames(is_training, data_dir))

    if input_context:
        # Give this input pipeline its own slice of the file list.
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d'
            % (input_context.input_pipeline_id,
               input_context.num_input_pipelines))
        file_ds = file_ds.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Turn file names into individual records. cycle_length=10 means that up
    # to 10 files are read and deserialized in parallel; consider raising it
    # on machines with many CPU cores.
    record_ds = file_ds.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        drop_remainder=drop_remainder,
        tf_data_experimental_slack=tf_data_experimental_slack,
    )
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             dtype=tf.float32, datasets_num_private_threads=None,
             num_parallel_batches=1, parse_record_fn=parse_record,
             input_context=None, drop_remainder=False):
    """Build the batched input dataset for training or evaluation.

    Args:
        is_training: A boolean denoting whether the input is for training.
            When true, the list of input files is shuffled before reading.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.
        dtype: Data type to use for images/features.
        datasets_num_private_threads: Number of private threads for tf.data.
        num_parallel_batches: Number of parallel batches for tf.data.
        parse_record_fn: Function to use for parsing the records.
        input_context: A `tf.distribute.InputContext` object passed in by
            `tf.distribute.Strategy`. When given, the file list is sharded
            across input pipelines.
        drop_remainder: A boolean indicates whether to drop the remainder of
            the batches. If True, the batch dimension will be static.

    Returns:
        The dataset produced by `resnet_run_loop.process_record_dataset`.
    """
    input_files = get_filenames(is_training, data_dir)
    file_ds = tf.data.Dataset.from_tensor_slices(input_files)

    if input_context:
        # Give this input pipeline its own slice of the file list.
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d'
            % (input_context.input_pipeline_id,
               input_context.num_input_pipelines))
        file_ds = file_ds.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    if is_training:
        # Randomize the order in which the input files are visited.
        file_ds = file_ds.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Turn file names into individual records. cycle_length=10 means that up
    # to 10 files are read and deserialized in parallel; consider raising it
    # on machines with many CPU cores.
    record_ds = file_ds.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )

    return resnet_run_loop.process_record_dataset(
        dataset=record_ds,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches,
        drop_remainder=drop_remainder,
    )