import functools

import tensorflow as tf

# `utils` and `input_config_pb2` are assumed to be project-local modules
# providing GetShardFilenames() and the InputConfig proto used below.


def get_input_dataset(data_filepattern: str,
                      input_config: input_config_pb2.InputConfig,
                      vocab_file_dir: str,
                      batch_size: int) -> tf.data.Dataset:
  """An input_fn to create input datasets.

  Args:
    data_filepattern: The file pattern of the input data.
    input_config: The input_config_pb2.InputConfig proto.
    vocab_file_dir: The path to the directory storing the vocabulary files.
    batch_size: Batch size of the generated dataset.

  Returns:
    A Dataset where each element is a batch of feature dicts.
  """
  features_and_vocabs_by_name = get_features_and_vocabs_by_name(
      input_config, vocab_file_dir)
  if not input_config.HasField('label_feature'):
    raise ValueError('Field label_feature is required.')
  input_files = utils.GetShardFilenames(data_filepattern)
  d = tf.data.TFRecordDataset(input_files)
  # Light shuffle with a buffer equal to the shard count, then repeat and
  # apply a larger record-level shuffle.
  d = d.shuffle(len(input_files))
  d = d.repeat()
  d = d.shuffle(buffer_size=10000)
  d = d.map(
      functools.partial(
          decode_example,
          features_and_vocabs_by_name=features_and_vocabs_by_name,
          label_feature_name=input_config.label_feature.feature_name),
      num_parallel_calls=8)
  d = d.batch(batch_size, drop_remainder=True)
  d = d.prefetch(1)
  return d
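# Hedged usage sketch for get_input_dataset(); illustrative only. The file
# pattern and vocab directory are hypothetical placeholders, and the proto
# layout is inferred from the label_feature.feature_name access above.
def _example_usage_of_get_input_dataset():
  """Builds a training dataset and inspects one batch (TF 2.x eager style)."""
  input_config = input_config_pb2.InputConfig()
  input_config.label_feature.feature_name = 'label'  # assumed proto layout
  train_ds = get_input_dataset(
      data_filepattern='/tmp/train/examples.tfrecord-*',  # hypothetical path
      input_config=input_config,
      vocab_file_dir='/tmp/vocabs',  # hypothetical path
      batch_size=16)
  # Each element is a dict of batched feature tensors.
  for features in train_ds.take(1):
    print({name: t.shape for name, t in features.items()})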
def input_fn():
  """An input_fn satisfying the TF estimator spec.

  Returns:
    A Dataset where each element is a batch of `features` dicts, passed to
    the Estimator model_fn.
  """
  # `data_filepattern`, `decode_example`, and `FLAGS` are assumed to be
  # available from the enclosing scope or module level.
  input_files = utils.GetShardFilenames(data_filepattern)
  d = tf.data.TFRecordDataset(input_files)
  # Dataset transformations return a new dataset rather than mutating in
  # place, so the result of shuffle() must be reassigned to take effect.
  d = d.shuffle(len(input_files))
  d = d.repeat()
  d = d.shuffle(buffer_size=100)
  d = d.map(decode_example)
  d = d.batch(FLAGS.batch_size, drop_remainder=True)
  d = d.prefetch(1)
  return d
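# Hedged sketch: wiring input_fn into tf.estimator. The model_fn below is a
# trivial placeholder (not this module's real model_fn) that satisfies the
# Estimator contract just enough for train() to run; model_dir and the step
# count are hypothetical.
def _example_estimator_wiring():
  """Illustrative only; assumes input_fn and its captured state are in scope."""

  def _placeholder_model_fn(features, labels, mode):
    # Constant loss plus a global-step increment: the minimum an
    # EstimatorSpec needs for training to proceed end to end.
    del features, labels  # Unused by this placeholder.
    loss = tf.constant(0.0)
    step = tf.compat.v1.train.get_or_create_global_step()
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=step.assign_add(1))

  estimator = tf.estimator.Estimator(
      model_fn=_placeholder_model_fn, model_dir='/tmp/model')  # hypothetical
  estimator.train(input_fn=input_fn, max_steps=100)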
def read_dataset(data_filepattern):
  """Reads the sharded input files as a raw, undecoded TFRecordDataset."""
  input_files = utils.GetShardFilenames(data_filepattern)
  return tf.data.TFRecordDataset(input_files)
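# Hedged sketch: peeking at one raw record from read_dataset() before any
# decoding. The file pattern is a hypothetical placeholder, and TF 2.x eager
# execution is assumed for .numpy().
def _example_peek_raw_record():
  raw_ds = read_dataset('/tmp/train/examples.tfrecord-*')  # hypothetical path
  for raw_record in raw_ds.take(1):
    # Records are assumed to be serialized tf.train.Example protos,
    # consistent with the decode_example mapping used above.
    example = tf.train.Example.FromString(raw_record.numpy())
    print(example)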