def input_fn(self,  # noqa: C901
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not
      using TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = 4 if is_training else 1

  max_length = self.max_length(hparams)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if is_training:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and eval,
    # this input_fn method gets called multiple times and it would give you
    # the exact same samples from the last call (because the Graph seed is
    # set). So this skip gives you some shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  else:
    if _are_shapes_fully_defined(dataset.output_shapes):
      batch_size_means_tokens = False
    else:
      tf.logging.warning(
          "Shapes are not fully defined. Assuming batch_size means tokens. "
          "Override batch_size_means_tokens() in your problem subclass if "
          "this is undesired behavior.")
      batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              batch_size, padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length,
          batching_scheme["boundaries"], batching_scheme["batch_sizes"])

      if not is_training:

        def _pad_batch(features):
          if not config or config.data_parallelism.n <= 1:
            return features
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          return pad_batch(features, config.data_parallelism.n)

        dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features,
                        (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits
    # prediction if it doesn't see a QueueRunner. DummyQueueRunner implements
    # the minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]
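
# Usage sketch for the version above (illustrative, not part of the original
# code): this variant returns a (features, targets) tuple, which is exactly
# what tf.estimator.Estimator expects from an input_fn, so wiring it up only
# takes a closure. `problem_instance`, `hparams`, and `mode` are hypothetical
# placeholders for a concrete Problem subclass and its configuration.
def make_estimator_input_fn(problem_instance, hparams, mode):
  """Returns an input_fn closure suitable for tf.estimator.Estimator."""
  def estimator_input_fn(params, config):
    # On TPU the method reads params["batch_size"]; on GPU it reads
    # config.data_parallelism. Both are forwarded untouched.
    return problem_instance.input_fn(
        mode, hparams, params=params, config=config)
  return estimator_input_fn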
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not
      using TPU
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  num_threads = 4 if is_training else 1

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          hparams.max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        hparams.max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or hparams.data_dir

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams
  })

  dataset = self.dataset(**dataset_kwargs)
  dataset = dataset.map(
      data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
  if is_training:
    dataset = dataset.repeat(None)

  # Batching
  if _are_shapes_fully_defined(dataset.output_shapes):
    # Static shape features (e.g. images)
    if config and config.use_tpu:
      tpu_batch_size = params["batch_size"]
      dataset = dataset.apply(
          tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
    else:
      num_shards = (config and config.data_parallelism.n) or 1
      dataset = dataset.batch(hparams.batch_size * num_shards)
  else:
    # Variable length features
    if config and config.use_tpu:
      # On TPU, pad to hparams.max_length
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = _fill_shape_nones(
          dataset.output_shapes, none_filler=hparams.max_length)
      dataset = dataset.apply(
          tf.contrib.data.padded_batch_and_drop_remainder(
              params["batch_size"], padded_shapes))
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=(config and config.data_parallelism.n) or 1,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = data_reader.bucket_by_sequence_length(
          dataset, data_reader.example_length,
          batching_scheme["boundaries"], batching_scheme["batch_sizes"])

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(1)
  features = dataset.make_one_shot_iterator().get_next()
  if not config or not config.use_tpu:
    _summarize_features(features,
                        (config and config.data_parallelism.n) or 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    features["infer_targets"] = features["targets"]
    features["targets"] = None
    # This is because of a bug in the Estimator that short-circuits
    # prediction if it doesn't see a QueueRunner. DummyQueueRunner implements
    # the minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return features, features["targets"]
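
# Hedged sketch of the shape check the version above branches on: batching by
# examples is only safe when every feature shape is static (e.g. images),
# otherwise the pipeline falls back to length-based batching. This is an
# illustrative reimplementation, not necessarily the library's
# _are_shapes_fully_defined; it assumes output_shapes is a
# dict<feature name, tf.TensorShape>, as these dict-structured datasets yield.
def _shapes_fully_defined_sketch(output_shapes):
  return all(shape.is_fully_defined() for shape in output_shapes.values())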
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             force_repeat=False,
             prevent_repeat=False,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not
      using TPU
    force_repeat: bool, whether to repeat the data even if not training
    prevent_repeat: bool, if True, do not repeat the data, even in training
      mode. Overrides force_repeat.
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    A tf.data.Dataset yielding (features_dict<str name, Tensor feature>,
    Tensor targets) tuples, or feature dicts in PREDICT mode.
  """
  partition_id, num_partitions = self._dataset_partition(mode, config)

  is_training = mode == tf.estimator.ModeKeys.TRAIN
  if config and config.use_tpu:
    num_threads = 64
  else:
    num_threads = cpu_count() if is_training else 1

  if config and hasattr(config,
                        "data_parallelism") and config.data_parallelism:
    num_shards = config.data_parallelism.n
  else:
    num_shards = 1

  max_length = self.max_length(hparams)
  mlperf_log.transformer_print(
      key=mlperf_log.INPUT_MAX_LENGTH, value=max_length)

  def tpu_valid_size(example):
    return data_reader.example_valid_size(example, hparams.min_length,
                                          max_length)

  def gpu_valid_size(example):
    drop_long_sequences = is_training or hparams.eval_drop_long_sequences
    return data_reader.example_valid_size(
        example, hparams.min_length,
        max_length if drop_long_sequences else 10**9)

  def define_shapes(example):
    batch_size = config and config.use_tpu and params["batch_size"]
    return standardize_shapes(example, batch_size=batch_size)

  # Read and preprocess
  data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams,
      "partition_id": partition_id,
      "num_partitions": num_partitions,
  })

  dataset = self.dataset(**dataset_kwargs)
  if (force_repeat or is_training) and not prevent_repeat:
    # Repeat and skip a random number of records
    dataset = dataset.repeat()

  if is_training and self.skip_random_fraction_when_training:
    data_files = tf.contrib.slim.parallel_reader.get_data_files(
        self.filepattern(data_dir, mode))
    # In continuous_train_and_eval when switching between train and eval,
    # this input_fn method gets called multiple times and it would give you
    # the exact same samples from the last call (because the Graph seed is
    # set). So this skip gives you some shuffling.
    dataset = skip_random_fraction(dataset, data_files[0])

  dataset = dataset.map(
      data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)

  if self.batch_size_means_tokens:
    batch_size_means_tokens = True
  else:
    if _are_shapes_fully_defined(dataset.output_shapes):
      batch_size_means_tokens = False
    else:
      tf.logging.warning(
          "Shapes are not fully defined. Assuming batch_size means tokens.")
      batch_size_means_tokens = True

  # Batching
  if not batch_size_means_tokens:
    # Batch size means examples per datashard.
    if config and config.use_tpu:
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      dataset = dataset.batch(batch_size, drop_remainder=True)
    else:
      batch_size = hparams.batch_size * num_shards
      dataset = dataset.batch(batch_size)
  else:
    # batch_size means tokens per datashard
    if config and config.use_tpu:
      dataset = dataset.filter(tpu_valid_size)
      padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
      # on TPU, we use params["batch_size"], which specifies the number of
      # examples across all datashards
      batch_size = params["batch_size"]
      if hparams.pad_batch:
        tf.logging.warn(
            "Padding the batch to ensure that remainder eval batches are "
            "processed. This may lead to incorrect metrics for "
            "non-zero-padded features, e.g. images. Use a smaller batch "
            "size that has no remainder in that case.")
        dataset = dataset.padded_batch(
            batch_size, padded_shapes, drop_remainder=False)
        dataset = dataset.map(
            functools.partial(pad_batch, batch_multiple=batch_size),
            num_parallel_calls=num_threads)
      else:
        dataset = dataset.padded_batch(
            batch_size, padded_shapes, drop_remainder=True)
    else:
      # On GPU, bucket by length
      dataset = dataset.filter(gpu_valid_size)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=num_shards,
          length_multiplier=self.get_hparams().batch_size_multiplier)
      if hparams.use_fixed_batch_size:
        # Here batch_size really means examples per datashard.
        batching_scheme["batch_sizes"] = [hparams.batch_size]
        batching_scheme["boundaries"] = []
      dataset = dataset.apply(
          tf.contrib.data.bucket_by_sequence_length(
              data_reader.example_length, batching_scheme["boundaries"],
              batching_scheme["batch_sizes"]))

      if not is_training:
        batch_multiple = num_shards
        if hparams.use_fixed_batch_size:
          # Make sure the last batch has the same fixed size as the rest.
          batch_multiple *= hparams.batch_size
        if batch_multiple > 1:
          tf.logging.warn(
              "Padding the batch to ensure that remainder eval batches have "
              "a batch size divisible by the number of data shards. This may "
              "lead to incorrect metrics for non-zero-padded features, e.g. "
              "images. Use a single datashard (i.e. 1 GPU) in that case.")
          dataset = dataset.map(
              functools.partial(pad_batch, batch_multiple=batch_multiple),
              num_parallel_calls=num_threads)

  dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

  # Add shuffling for training batches. This is necessary along with record
  # level shuffling in the dataset generation. Record shuffling will shuffle
  # the examples. However, in some cases, it's possible that the shuffle
  # buffer size for record shuffling is smaller than the batch size. In such
  # cases, adding batch shuffling ensures that the data is in random order
  # during training.
  if (is_training and hasattr(hparams, "batch_shuffle_size") and
      hparams.batch_shuffle_size):
    dataset = dataset.shuffle(hparams.batch_shuffle_size)

  def prepare_for_output(example):
    if not config or not config.use_tpu:
      _summarize_features(example, num_shards)
    if mode == tf.estimator.ModeKeys.PREDICT:
      example["infer_targets"] = example.pop("targets")
      return example
    else:
      return example, example["targets"]

  dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # This is because of a bug in the Estimator that short-circuits
    # prediction if it doesn't see a QueueRunner. DummyQueueRunner implements
    # the minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return dataset
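
# Hedged sketch of the batch padding the version above applies to remainder
# eval batches: zero-pad the leading (batch) dimension of every feature until
# the batch size is a multiple of `batch_multiple`. Illustrative only; the
# real pad_batch is the library helper referenced in the code and may differ
# in detail.
def pad_batch_sketch(features, batch_multiple):
  first_feature = list(features.values())[0]
  batch_size = tf.shape(first_feature)[0]
  # Number of examples to append so that batch_size % batch_multiple == 0.
  remainder = batch_size % batch_multiple
  pad_amount = (batch_multiple - remainder) % batch_multiple
  padded = {}
  for name, feature in features.items():
    # Pad only the batch dimension; leave all other dimensions untouched.
    paddings = [[0, pad_amount]] + [[0, 0]] * (len(feature.shape) - 1)
    padded[name] = tf.pad(feature, paddings)
  return padded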
def input_fn(self,
             mode,
             hparams,
             data_dir=None,
             params=None,
             force_repeat=False,
             prevent_repeat=False,
             dataset_kwargs=None):
  """Builds input pipeline for problem.

  Args:
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    force_repeat: bool, whether to repeat the data even if not training
    prevent_repeat: bool, if True, do not repeat the data, even in training
      mode. Overrides force_repeat.
    dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
      method when called

  Returns:
    A tf.data.Dataset of feature dicts.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  num_threads = problem.cpu_count() if is_training else 1

  # def gpu_valid_size(example):
  #   return data_utils.example_valid_size(example, hparams.min_length,
  #                                        hparams.max_length)

  # Read and preprocess
  data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

  dataset_kwargs = dataset_kwargs or {}
  dataset_kwargs.update({
      "mode": mode,
      "data_dir": data_dir,
      "num_threads": num_threads,
      "hparams": hparams
  })

  dataset = self.dataset(**dataset_kwargs)
  if (force_repeat or is_training) and not prevent_repeat:
    # Repeat the data indefinitely.
    dataset = dataset.repeat()

  dataset = dataset.map(
      data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)
  # dataset = dataset.filter(gpu_valid_size)
  dataset = dataset.apply(
      tf.contrib.data.bucket_by_sequence_length(
          data_utils.example_length, [], [hparams.batch_size]))

  def prepare_for_output(example):
    problem._summarize_features(example, 1)
    return example

  dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
  dataset = dataset.prefetch(2)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # This is because of a bug in the Estimator that short-circuits
    # prediction if it doesn't see a QueueRunner. DummyQueueRunner implements
    # the minimal expected interface but does nothing.
    tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                         data_reader.DummyQueueRunner())

  return dataset
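
# Hedged note on the trimmed fork above: with empty `boundaries`,
# bucket_by_sequence_length degenerates to a single bucket, i.e. plain padded
# batching at hparams.batch_size examples per batch. A roughly equivalent
# formulation is sketched below (illustrative; assumes dict-of-Tensor
# elements whose variable-length dims are None in output_shapes, so
# padded_batch pads each feature to the longest example in the batch).
def single_bucket_batch(dataset, batch_size):
  return dataset.padded_batch(batch_size, dataset.output_shapes)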