def apply(self, dataset: tf.data.Dataset, mode: str = None):  # pylint: disable=unused-argument
    if mode is not None and self.modes is not None and mode not in self.modes:
        LOGGER.info(f"Not applying {self} (mode={mode})")
        return dataset
    if self.filename:
        return dataset.cache(self.filename)
    else:
        return dataset.cache()
def transform(self, ds: tf.data.Dataset) -> tf.data.Dataset:
    if self.cache_dir is None:
        return ds
    elif self.cache_dir == "":
        log.info("Using memory cache for %s", ds)
        ds = ds.cache()
    else:
        cache_path = os.path.join(self.cache_dir, "cache")
        log.info("Using cache path:[%s] for %s", self.cache_dir, ds)
        tf.io.gfile.makedirs(os.path.dirname(cache_path))
        ds = ds.cache(cache_path)
    return ds
def process(self, dataset: tf.data.Dataset, batch_size: int):
    dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)

    if self.cache:
        dataset = dataset.cache()

    if self.shuffle:
        dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True)

    # Pad and batch the dataset.
    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes=(
            tf.TensorShape([]),
            tf.TensorShape(self.speech_featurizer.shape),
            tf.TensorShape([]),
            tf.TensorShape([None]),
            tf.TensorShape([]),
            tf.TensorShape([None]),
            tf.TensorShape([]),
        ),
        padding_values=("", 0., 0, self.text_featurizer.blank, 0,
                        self.text_featurizer.blank, 0),
        drop_remainder=self.drop_remainder)

    # Prefetch to overlap preprocessing with training.
    dataset = dataset.prefetch(AUTOTUNE)
    self.total_steps = get_num_batches(self.total_steps, batch_size)
    return dataset
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
        dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
        dataset = dataset.repeat()

    # Read the data from disk in parallel.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self._shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel.
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use the local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    # Apply Mixup.
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps in-feed with training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def prepare_ds(dataset: tf.data.Dataset, config: HyperparameterDict) -> tf.data.Dataset:
    # Cast to float.
    dataset = dataset.map(lambda x: tf.cast(x, tf.float32),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.map(lambda x: config['rescaling'](x),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.map(config['resizing'],
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if config['cache_data']:
        # As the dataset fits in memory, cache before shuffling for better performance.
        dataset = dataset.cache()

    # For true randomness, set the shuffle buffer to the full dataset size.
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(config['batch_size'])
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
def prepare_for_testing(data_set: tf.data.Dataset, batch_size, cache_path=''):
    if cache_path != '':
        cache_filename = 'dataset_test.tfcache'
        data_set = data_set.cache(''.join([cache_path, '/', cache_filename]))
    data_set = data_set.repeat()
    data_set = data_set.batch(batch_size=batch_size)
    return data_set
def create_dataset(self, dataset: tf.data.Dataset, input_columns, output_columns,
                   batch_size: int, use_cache: bool):
    dataset = self.add_feature_columns_to_dataset(dataset, input_columns, output_columns)
    if use_cache:
        dataset = dataset.cache("cache").repeat()
    dataset = dataset.shuffle(1000, reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
def _prepare_test_dataset(dataset: tf.data.Dataset, batch_size, cache_path=''):
    if cache_path != '':
        cache_filename = 'dataset_test.tfcache'
        dataset = dataset.cache(
            os.path.join(opt.data_path, cache_path, cache_filename))
        # dataset = dataset.cache(''.join([cache_path, '/', cache_filename]))
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size=batch_size)
    return dataset
def prepare_for_training(data_set: tf.data.Dataset, batch_size, cache_path=None,
                         shuffle_buffer_size=1000):
    if cache_path:
        cache_filename = 'dataset_train.tfcache'
        data_set = data_set.cache(''.join([cache_path, '/', cache_filename]))

    data_set = data_set.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat forever.
    data_set = data_set.repeat()
    data_set = data_set.batch(batch_size=batch_size)

    # `prefetch` lets the dataset fetch batches in the background
    # while the model is training.
    data_set = data_set.prefetch(buffer_size=AUTOTUNE)
    return data_set
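# A minimal usage sketch for `prepare_for_training` above, assuming `AUTOTUNE` is
# defined in the same module as the helper. The glob pattern, `decode_image`,
# `BATCH_SIZE`, and the `/tmp/tf_cache` directory are illustrative placeholders,
# not part of the original code.
import tensorflow as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 32  # illustrative batch size


def decode_image(path):
    # Hypothetical decoder: read a JPEG and resize it to a fixed shape so it batches cleanly.
    image = tf.io.decode_jpeg(tf.io.read_file(path), channels=3)
    return tf.image.resize(image, [224, 224])


file_paths = tf.data.Dataset.list_files('data/train/*.jpg')  # illustrative glob
images = file_paths.map(decode_image, num_parallel_calls=AUTOTUNE)

# The cache directory must exist before the .tfcache file is written into it.
tf.io.gfile.makedirs('/tmp/tf_cache')
train_ds = prepare_for_training(images, BATCH_SIZE, cache_path='/tmp/tf_cache')

# The returned dataset repeats forever, so bound iteration explicitly.
for batch in train_ds.take(2):
    print(batch.shape)  # (32, 224, 224, 3)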
def cache(self, data: tf.data.Dataset) -> tf.data.Dataset:
    """Cache the dataset.

    Parameters
    ----------
    data
        tensorflow dataset to cache

    Returns
    -------
    data_cached
        cached data
    """
    self._cache_file = _get_cache_fname(
        self.cache_dir,
        "-".join([self.__class__.__name__, self._subtype, str(self.mode)]))
    data = data.cache(self._cache_file)
    return data
def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
    """Create a dataset that generates preloaded elements.

    Args:
        ds_input: Any `tf.data.Dataset` that generates examples as a dictionary
            of tensors. Should not be repeating infinitely.

    Returns:
        A dataset that generates the same examples.

        This is similar to prefetching, except that examples are yielded through
        a generator and loaded when this method is called rather than during
        pipeline iteration.
    """
    ds = ds_input.cache()

    # Preload examples from the input dataset and populate the cache.
    self.examples = list(iter(ds))

    return ds
def _optimize_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Return a dataset with caching and prefetching enabled."""
    return dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
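# A quick usage sketch for `_optimize_dataset` above; the toy tensors and the
# squaring map are illustrative only. Because `cache()` sits before `prefetch()`,
# the upstream map cost is paid only on the first pass.
import tensorflow as tf

raw = tf.data.Dataset.from_tensor_slices(tf.range(10))
expensive = raw.map(lambda x: x * x)  # stand-in for costly preprocessing

ds = _optimize_dataset(expensive)
for epoch in range(2):
    # The second epoch is served from the in-memory cache instead of re-running the map.
    print([int(v) for v in ds])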
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    if (self.config.builder != 'tfds' and self.input_context
            and self.input_context.num_input_pipelines > 1):
        dataset = dataset.shard(self.input_context.num_input_pipelines,
                                self.input_context.input_pipeline_id)
        logging.info(
            'Sharding the dataset: input_pipeline_id=%d '
            'num_input_pipelines=%d', self.input_context.input_pipeline_id,
            self.input_context.num_input_pipelines)

    if self.is_training and self.config.builder == 'records':
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=self.config.file_shuffle_buffer_size)

    if self.is_training and not self.config.cache:
        dataset = dataset.repeat()

    if self.config.builder == 'records':
        # Read the data from disk in parallel.
        dataset = dataset.interleave(
            tf.data.TFRecordDataset,
            cycle_length=10,
            block_length=1,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self.config.cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self.config.shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel.
    if self.config.builder == 'records':
        preprocess = self.parse_record
    else:
        preprocess = self.preprocess
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self.input_context and self.config.num_devices > 1:
        if not self.config.use_per_replica_batch_size:
            raise ValueError(
                'The builder does not support a global batch size with more than '
                'one replica. Got {} replicas. Please set a '
                '`per_replica_batch_size` and enable '
                '`use_per_replica_batch_size=True`.'.format(
                    self.config.num_devices))
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use the local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    # Prefetch overlaps in-feed with training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    if self.config.tf_data_service:
        if not hasattr(tf.data.experimental, 'service'):
            raise ValueError(
                'The tf_data_service flag requires Tensorflow version '
                '>= 2.3.0, but the version is {}'.format(tf.__version__))
        dataset = dataset.apply(
            tf.data.experimental.service.distribute(
                processing_mode='parallel_epochs',
                service=self.config.tf_data_service,
                job_name='resnet_train'))
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset
def pipeline(
        self,
        dataset: tf.data.Dataset,
        input_context: tf.distribute.InputContext = None) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.
        input_context: An optional context provided by `tf.distribute` for
            cross-replica training. This isn't necessary if using Keras
            compile/fit.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    if input_context and input_context.num_input_pipelines > 1:
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    if self.is_training and not self.config.cache:
        dataset = dataset.repeat()

    if self.config.builder == 'records':
        # Read the data from disk in parallel.
        buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file.
        dataset = dataset.interleave(
            lambda name: tf.data.TFRecordDataset(name, buffer_size=buffer_size),
            cycle_length=16,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.prefetch(self.global_batch_size)

    if self.config.cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self.config.shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel.
    if self.config.builder == 'records':
        preprocess = self.parse_record
    else:
        preprocess = self.preprocess
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(self.batch_size, drop_remainder=self.is_training)

    # Note: we could do image normalization here, but we defer it to the model,
    # which can perform it much faster on a GPU/TPU.
    # TODO(dankondratyuk): if we fix prefetching, we can do it here.

    if self.is_training and self.config.deterministic_train is not None:
        options = tf.data.Options()
        options.experimental_deterministic = self.config.deterministic_train
        options.experimental_slack = self.config.use_slack
        options.experimental_optimization.parallel_batch = True
        options.experimental_optimization.map_fusion = True
        options.experimental_optimization.map_vectorization.enabled = True
        options.experimental_optimization.map_parallelization = True
        dataset = dataset.with_options(options)

    # Prefetch overlaps in-feed with training.
    # Note: autotune here is not recommended, as this can lead to memory leaks.
    # Instead, use a constant prefetch size like the number of devices.
    dataset = dataset.prefetch(self.config.num_devices)

    return dataset
def _optimize_dataset(ds: tf.data.Dataset):
    return ds.cache().prefetch(buffer_size=PREFETCH_BUFFER_SIZE)
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    # This can help resolve OOM issues when using only 1 GPU for training.
    options = tf.data.Options()
    options.experimental_optimization.map_parallelization = (
        not self.disable_map_parallelization)
    dataset = dataset.with_options(options)

    if self._num_gpus > 1:
        # For multi-host training, we want each host to always process the same
        # subset of files. Each host only sees a subset of the entire dataset,
        # allowing us to cache larger datasets in memory.
        dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
        dataset = dataset.repeat()

    # Read the data from disk in parallel.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self._shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel.
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use the local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    # Apply Mixup/CutMix only during training, if requested in the data pipeline;
    # otherwise they will be applied in the model module on device.
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    cutmix_alpha = self.cutmix_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(mixing, self.local_batch_size, mixup_alpha,
                          cutmix_alpha, self.defer_img_mixing),
        num_parallel_calls=64)

    # Assign static batch size dimension
    # dataset = dataset.map(
    #     functools.partial(self.set_shapes, batch_size),
    #     num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Prefetch overlaps in-feed with training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def pipeline(
        self,
        dataset: tf.data.Dataset,
        input_context: tf.distribute.InputContext = None) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.
        input_context: An optional context provided by `tf.distribute` for
            cross-replica training. If set with more than one replica, this
            function assumes `use_per_replica_batch_size=True`.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    if input_context and input_context.num_input_pipelines > 1:
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    if self.is_training and not self.config.cache:
        dataset = dataset.repeat()

    if self.config.builder == 'records':
        # Read the data from disk in parallel.
        buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file.
        dataset = dataset.interleave(
            lambda name: tf.data.TFRecordDataset(name, buffer_size=buffer_size),
            cycle_length=16,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self.config.cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self.config.shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel.
    if self.config.builder == 'records':
        preprocess = self.parse_record
    else:
        preprocess = self.preprocess
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if input_context and self.config.num_devices > 1:
        if not self.config.use_per_replica_batch_size:
            raise ValueError(
                'The builder does not support a global batch size with more than '
                'one replica. Got {} replicas. Please set a '
                '`per_replica_batch_size` and enable '
                '`use_per_replica_batch_size=True`.'.format(
                    self.config.num_devices))
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use the local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    if self.is_training:
        options = tf.data.Options()
        options.experimental_deterministic = self.config.deterministic_train
        options.experimental_slack = self.config.use_slack
        options.experimental_optimization.parallel_batch = True
        options.experimental_optimization.map_fusion = True
        options.experimental_optimization.map_vectorization.enabled = True
        options.experimental_optimization.map_parallelization = True
        dataset = dataset.with_options(options)

    # Prefetch overlaps in-feed with training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def transform(dataset: tf.data.Dataset):
    # `filename` is expected to be defined in the enclosing scope.
    return dataset.cache(filename)
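# The snippets above use both flavours of `tf.data.Dataset.cache`: called with no
# argument, elements are held in memory after the first full pass; called with a
# filename prefix, they are written to disk and can be reused across runs. A small
# illustrative sketch of the two modes (the path below is a placeholder):
import tensorflow as tf

ds = tf.data.Dataset.range(5).map(lambda x: x * 2)

# In-memory cache: populated during the first complete iteration, reused afterwards.
mem_cached = ds.cache()

# File-backed cache: writes cache files under the given prefix; the directory must exist.
tf.io.gfile.makedirs('/tmp/toy_cache')  # placeholder location
file_cached = ds.cache('/tmp/toy_cache/elements')

for cached in (mem_cached, file_cached):
    print([int(v) for v in cached])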