def testDatasetPacking(self):
  dataset = tf.data.Dataset.from_generator(
      example_generator,
      output_types={"inputs": tf.int64, "targets": tf.int64},
      output_shapes={"inputs": tf.TensorShape((None,)),
                     "targets": tf.TensorShape((None,))})
  dataset = generator_utils.pack_dataset(
      dataset, length=5, keys=("inputs", "targets"), use_custom_ops=False)

  with tf.Session().as_default() as sess:
    batch = dataset.make_one_shot_iterator().get_next()
    for reference in reference_packing():
      example = sess.run(batch)
      self.assertAllEqual(set(example.keys()), set(reference.keys()))
      for k in reference:
        self.assertAllEqual(example[k], reference[k])
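
# For context, a minimal standalone sketch of the same pack_dataset call run
# outside the test harness. The helper names _toy_generator and
# sketch_pack_dataset and the toy values below are illustrative assumptions,
# not the real example_generator/reference_packing fixtures the test above
# relies on.
def _toy_generator():
  yield {"inputs": [1, 2], "targets": [3]}
  yield {"inputs": [4], "targets": [5, 6]}


def sketch_pack_dataset():
  toy_dataset = tf.data.Dataset.from_generator(
      _toy_generator,
      output_types={"inputs": tf.int64, "targets": tf.int64},
      output_shapes={"inputs": tf.TensorShape((None,)),
                     "targets": tf.TensorShape((None,))})
  packed = generator_utils.pack_dataset(
      toy_dataset, length=5, keys=("inputs", "targets"), use_custom_ops=False)
  with tf.Session() as sess:
    example = sess.run(packed.make_one_shot_iterator().get_next())
    # Packed examples carry "*_segmentation" and "*_position" features in
    # addition to the original "inputs"/"targets" keys.
    print(sorted(example.keys()))
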
def dataset(self,
            mode,
            data_dir=None,
            num_threads=None,
            output_buffer_size=None,
            shuffle_files=None,
            hparams=None,
            preprocess=True,
            dataset_split=None,
            shard=None,
            partition_id=0,
            num_partitions=1,
            shuffle_buffer_size=1024,
            max_records=-1):
  """Build a Dataset for this problem.

  Args:
    mode: tf.estimator.ModeKeys; determines which files to read from.
    data_dir: directory that contains data files.
    num_threads: int, number of threads to use for decode and preprocess
      Dataset.map calls.
    output_buffer_size: int, how many elements to prefetch at end of pipeline.
    shuffle_files: whether to shuffle input files. Default behavior (i.e. when
      shuffle_files=None) is to shuffle if mode == TRAIN.
    hparams: HParams; hparams to be passed to Problem.preprocess_example and
      Problem.hparams. If None, will use a default set that is a no-op.
    preprocess: bool, whether to map the Dataset through
      Problem.preprocess_example.
    dataset_split: DatasetSplit, which split to read data from
      (TRAIN:"-train", EVAL:"-dev", "test":"-test"). Defaults to mode.
    shard: int, if provided, will only read data from the specified shard.
    partition_id: integer, which partition of the dataset to read from.
    num_partitions: how many partitions there are in the dataset.
    shuffle_buffer_size: if shuffle_files is True, this is the buffer size
      used to shuffle records.
    max_records: int, number of records to truncate to.

  Returns:
    Dataset containing dict<feature name, Tensor>.

  Raises:
    ValueError: if num_partitions is greater than the number of data files.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  shuffle_files = shuffle_files or (shuffle_files is None and is_training)

  dataset_split = dataset_split or mode
  assert data_dir

  if hparams is None:
    hparams = default_model_hparams()

  if not hasattr(hparams, "data_dir"):
    hparams.add_hparam("data_dir", data_dir)
  if not hparams.data_dir:
    hparams.data_dir = data_dir
  # Construct the Problem's hparams so that items within it are accessible.
  _ = self.get_hparams(hparams)

  data_filepattern = self.filepattern(data_dir, dataset_split, shard=shard)
  tf.logging.info("Reading data files from %s", data_filepattern)
  data_files = sorted(
      slim.parallel_reader.get_data_files(data_filepattern))

  # Functions used in dataset transforms below. `filenames` can be either a
  # `tf.string` tensor or `tf.data.Dataset` containing one or more filenames.
  def _load_records_and_preprocess(filenames):
    """Reads files from a string tensor or a dataset of filenames."""
    # Load records from file(s) with an 8MiB read buffer.
    dataset = tf.data.TFRecordDataset(filenames, buffer_size=8 * 1024 * 1024)
    # Decode.
    dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads)
    # Preprocess if requested.
    # Note that preprocessing should happen per-file as order may matter.
    if preprocess:
      dataset = self.preprocess(
          dataset, mode, hparams, interleave=shuffle_files)
    return dataset

  if len(data_files) < num_partitions:
    raise ValueError(
        "number of data files (%d) must be at least the number of hosts (%d)"
        % (len(data_files), num_partitions))
  data_files = [
      f for (i, f) in enumerate(data_files)
      if i % num_partitions == partition_id
  ]
  tf.logging.info(
      "partition: %d num_data_files: %d" % (partition_id, len(data_files)))
  if shuffle_files:
    mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
    random.shuffle(data_files)

  dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
  # Create dataset from files by parsing, preprocessing and interleaving.
  if shuffle_files:
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            _load_records_and_preprocess, sloppy=True, cycle_length=8))
  else:
    dataset = _load_records_and_preprocess(dataset)

  dataset = dataset.map(
      self.maybe_reverse_and_copy, num_parallel_calls=num_threads)
  dataset = dataset.take(max_records)

  # Shuffle records only for training examples.
  if shuffle_files and is_training:
    dataset = dataset.shuffle(shuffle_buffer_size)
  if hparams.get("pack_dataset", False):
    dataset = generator_utils.pack_dataset(
        dataset, hparams.max_length, keys=["inputs", "targets"],
        use_custom_ops=hparams.get("use_custom_ops", False))
  if output_buffer_size:
    dataset = dataset.prefetch(output_buffer_size)

  return dataset
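
# A hedged usage sketch, not part of the library code above: the problem
# name, the data_dir path, and the buffer sizes below are placeholder
# assumptions for whatever problem you have already generated data for.
from tensor2tensor.utils import registry

problem = registry.problem("translate_ende_wmt32k")  # assumed example problem
train_dataset = problem.dataset(
    mode=tf.estimator.ModeKeys.TRAIN,
    data_dir="/tmp/t2t_data",  # placeholder path
    shuffle_files=True,
    output_buffer_size=256)

features = train_dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  example = sess.run(features)
  # Each element is a dict of feature name -> Tensor (e.g. "inputs", "targets").
  print({k: v.shape for k, v in example.items()})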