def get_input_dataset(data_filepattern: str,
                      input_config: input_config_pb2.InputConfig,
                      vocab_file_dir: str, batch_size: int) -> tf.data.Dataset:
    """An input_fn to create input datasets.

  Args:
    data_filepattern: The file pattern of the input data.
    input_config: The input config input_config_pb2.InputConfig proto.
    vocab_file_dir: The path to the directory storing the vocabulary files.
    batch_size: Batch size of to-be generated dataset.

  Returns:
    A Dataset where each element is a batch of feature dicts.
  """
    features_and_vocabs_by_name = get_features_and_vocabs_by_name(
        input_config, vocab_file_dir)
    if not input_config.HasField('label_feature'):
        raise ValueError('Field label_feature is required.')
    input_files = utils.GetShardFilenames(data_filepattern)
    d = tf.data.TFRecordDataset(input_files)
    # Lightly shuffle records (buffer sized to the shard count), repeat
    # indefinitely, then shuffle again with a larger buffer so that
    # successive epochs are not identical.
    d = d.shuffle(len(input_files))
    d = d.repeat()
    d = d.shuffle(buffer_size=10000)
    # Decode each serialized example into a feature dict, in parallel.
    d = d.map(
        functools.partial(
            decode_example,
            features_and_vocabs_by_name=features_and_vocabs_by_name,
            label_feature_name=input_config.label_feature.feature_name),
        num_parallel_calls=8)
    # Drop the last partial batch so every batch has a static shape, and
    # prefetch one batch to overlap input processing with training.
    d = d.batch(batch_size, drop_remainder=True)
    d = d.prefetch(1)
    return d
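
A minimal usage sketch of get_input_dataset, assuming TF2 eager execution; the file paths, the pbtxt contents, and the batch size below are placeholder assumptions, not values from this repository:

import tensorflow as tf
from google.protobuf import text_format

# Hypothetical paths; a real run would point at actual shards and configs.
with tf.io.gfile.GFile('/tmp/input_config.pbtxt') as f:
  input_config = text_format.Parse(f.read(), input_config_pb2.InputConfig())

dataset = get_input_dataset(
    data_filepattern='/tmp/train-*.tfrecord',
    input_config=input_config,
    vocab_file_dir='/tmp/vocabs',
    batch_size=32)

# Each element is a dict of batched feature tensors.
for features in dataset.take(1):
  print({name: t.shape for name, t in features.items()})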
Example #2
  def input_fn():
    """An input_fn satisfying the TF estimator spec.

    Returns:
      A Dataset where each element is a batch of `features` dicts, passed to
      the Estimator model_fn.
    """
    input_files = utils.GetShardFilenames(data_filepattern)
    d = tf.data.TFRecordDataset(input_files)
    d = d.shuffle(len(input_files))  # Assign the result; shuffle() is not in-place.
    d = d.repeat()
    d = d.shuffle(buffer_size=100)
    d = d.map(decode_example)
    d = d.batch(FLAGS.batch_size, drop_remainder=True)
    d = d.prefetch(1)
    return d
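
A hedged sketch of how such an input_fn is typically wired into the Estimator API; `my_model_fn` and the model_dir are assumed names, not part of this example:

import tensorflow as tf

# Sketch only: `my_model_fn` is assumed to exist elsewhere and to return a
# tf.estimator.EstimatorSpec for each mode; '/tmp/model' is a placeholder.
estimator = tf.estimator.Estimator(model_fn=my_model_fn, model_dir='/tmp/model')

# The Estimator calls input_fn to build the pipeline; each element the
# dataset yields becomes the `features` argument of my_model_fn.
estimator.train(input_fn=input_fn, max_steps=1000)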
def read_dataset(data_filepattern):
  """Reads the raw TFRecord dataset from the sharded input files."""
  input_files = utils.GetShardFilenames(data_filepattern)
  return tf.data.TFRecordDataset(input_files)
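
Since read_dataset returns raw, undecoded records, a quick way to inspect them is to parse one record as a tf.train.Example; a sketch assuming TF2 eager execution and a hypothetical file pattern:

import tensorflow as tf

raw_dataset = read_dataset('/tmp/train-*.tfrecord')  # placeholder pattern
for raw_record in raw_dataset.take(1):
  # Records produced by this pipeline are serialized tf.train.Example protos.
  example = tf.train.Example()
  example.ParseFromString(raw_record.numpy())
  print(example)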