Example 1
  def testDatasetPacking(self):
    dataset = tf.data.Dataset.from_generator(
        example_generator,
        output_types={"inputs": tf.int64, "targets": tf.int64},
        output_shapes={"inputs": tf.TensorShape((None,)),
                       "targets": tf.TensorShape((None,))}
    )
    dataset = generator_utils.pack_dataset(
        dataset, length=5, keys=("inputs", "targets"), use_custom_ops=False)

    with tf.Session().as_default() as sess:
      batch = dataset.make_one_shot_iterator().get_next()
      for reference in reference_packing():
        example = sess.run(batch)
        self.assertAllEqual(set(example.keys()), set(reference.keys()))
        for k in reference:
          self.assertAllEqual(example[k], reference[k])
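The test above references an example_generator callable and a reference_packing() helper defined elsewhere in the test module. The sketch below is a hypothetical illustration of their shape, not the real fixtures: pack_dataset packs several short examples into rows of the requested length and, under its usual semantics, emits companion "*_segmentation" and "*_position" features for each packed key.

def example_generator():
  # Two short examples that fit together in one packed row of length 5.
  yield {"inputs": [1, 2], "targets": [3]}
  yield {"inputs": [4], "targets": [5, 6]}


def reference_packing():
  # Assumed expected output: each packed key is zero-padded to length 5,
  # "*_segmentation" marks which original example a token came from, and
  # "*_position" gives a token's index within its original example.
  return [{
      "inputs": [1, 2, 4, 0, 0],
      "inputs_segmentation": [1, 1, 2, 0, 0],
      "inputs_position": [0, 1, 0, 0, 0],
      "targets": [3, 5, 6, 0, 0],
      "targets_segmentation": [1, 2, 2, 0, 0],
      "targets_position": [0, 0, 1, 0, 0],
  }]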
Example 2
  def dataset(self,
              mode,
              data_dir=None,
              num_threads=None,
              output_buffer_size=None,
              shuffle_files=None,
              hparams=None,
              preprocess=True,
              dataset_split=None,
              shard=None,
              partition_id=0,
              num_partitions=1,
              shuffle_buffer_size=1024,
              max_records=-1):
    """Build a Dataset for this problem.

    Args:
      mode: tf.estimator.ModeKeys; determines which files to read from.
      data_dir: directory that contains data files.
      num_threads: int, number of threads to use for decode and preprocess
        Dataset.map calls.
      output_buffer_size: int, how many elements to prefetch at the end of
        the pipeline.
      shuffle_files: whether to shuffle input files. Default behavior (i.e. when
        shuffle_files=None) is to shuffle if mode == TRAIN.
      hparams: HParams; hparams to be passed to
        Problem.preprocess_example and Problem.hparams. If None, will use a
        default set that is a no-op.
      preprocess: bool, whether to map the Dataset through
        Problem.preprocess_example.
      dataset_split: DatasetSplit, which split to read data
        from (TRAIN:"-train", EVAL:"-dev", TEST:"-test"). Defaults to mode.
      shard: int, if provided, will only read data from the specified shard.
      partition_id: int, which partition of the dataset to read from.
      num_partitions: int, how many partitions the dataset is split into.
      shuffle_buffer_size: if shuffle_files is True, this is the buffer size
        used to shuffle records.
      max_records: int, number of records to truncate to; -1 (the default)
        keeps all records.

    Returns:
      Dataset containing dict<feature name, Tensor>.

    Raises:
      ValueError: if num_partitions is greater than the number of data files.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    shuffle_files = shuffle_files or (shuffle_files is None and is_training)

    dataset_split = dataset_split or mode
    assert data_dir

    if hparams is None:
      hparams = default_model_hparams()

    if not hasattr(hparams, "data_dir"):
      hparams.add_hparam("data_dir", data_dir)
    if not hparams.data_dir:
      hparams.data_dir = data_dir
    # Construct the Problem's hparams so that items within it are accessible
    _ = self.get_hparams(hparams)

    data_filepattern = self.filepattern(data_dir, dataset_split, shard=shard)
    tf.logging.info("Reading data files from %s", data_filepattern)
    data_files = sorted(slim.parallel_reader.get_data_files(
        data_filepattern))

    # Functions used in dataset transforms below. `filenames` can be either a
    # `tf.string` tensor or `tf.data.Dataset` containing one or more filenames.
    def _load_records_and_preprocess(filenames):
      """Reads files from a string tensor or a dataset of filenames."""
      # Load records from file(s) with an 8MiB read buffer.
      dataset = tf.data.TFRecordDataset(filenames, buffer_size=8 * 1024 * 1024)
      # Decode.
      dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads)
      # Preprocess if requested.
      # Note that preprocessing should happen per-file as order may matter.
      if preprocess:
        dataset = self.preprocess(dataset, mode, hparams,
                                  interleave=shuffle_files)
      return dataset

    if len(data_files) < num_partitions:
      raise ValueError(
          "number of data files (%d) must be at least the number of hosts (%d)"
          % (len(data_files), num_partitions))
    data_files = [f for (i, f) in enumerate(data_files)
                  if i % num_partitions == partition_id]
    tf.logging.info(
        "partition: %d num_data_files: %d" % (partition_id, len(data_files)))
    if shuffle_files:
      mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
      random.shuffle(data_files)

    dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
    # Create dataset from files by parsing, preprocessing and interleaving.
    if shuffle_files:
      dataset = dataset.apply(
          tf.data.experimental.parallel_interleave(
              _load_records_and_preprocess, sloppy=True, cycle_length=8))
    else:
      dataset = _load_records_and_preprocess(dataset)

    dataset = dataset.map(
        self.maybe_reverse_and_copy, num_parallel_calls=num_threads)
    dataset = dataset.take(max_records)

    # Shuffle records only for training examples.
    if shuffle_files and is_training:
      dataset = dataset.shuffle(shuffle_buffer_size)
    if hparams.get("pack_dataset", False):
      dataset = generator_utils.pack_dataset(
          dataset, hparams.max_length, keys=["inputs", "targets"],
          use_custom_ops=hparams.get("use_custom_ops", False))
    if output_buffer_size:
      dataset = dataset.prefetch(output_buffer_size)

    return dataset
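A typical call site for this method looks roughly like the sketch below; the problem name and data directory are placeholder assumptions, not values taken from the source.

import tensorflow as tf
from tensor2tensor.utils import registry

# Hypothetical usage; "translate_ende_wmt32k" and "/tmp/t2t_data" are placeholders.
problem = registry.problem("translate_ende_wmt32k")
dataset = problem.dataset(
    mode=tf.estimator.ModeKeys.TRAIN,
    data_dir="/tmp/t2t_data",
    shuffle_files=True,      # shuffle input files and training records
    output_buffer_size=32)   # prefetch at the end of the pipeline
features = dataset.make_one_shot_iterator().get_next()
# features is a dict of Tensors, e.g. {"inputs": ..., "targets": ...}.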