def LoadDataset(self, file_pattern):
  """Builds a dataset of file names matching `file_pattern`.

  Args:
    file_pattern: A sharded file pattern string; expanded to a glob via
      `py_utils.ShardedFilePatternToGlob`.

  Returns:
    A `tf.data.Dataset` of `py_utils.NestedMap`s with fields:
      data: a scalar string tensor holding one matched file name.
      source_id: constant 0 (single-source input).
  """
  glob_pattern = py_utils.ShardedFilePatternToGlob(file_pattern)
  # shuffle=False keeps the file order deterministic.
  file_ds = tf.data.Dataset.list_files(glob_pattern, shuffle=False)

  def _ToExample(path):
    # Wrap each file name; source_id is always 0 here.
    return py_utils.NestedMap(data=path, source_id=tf.constant(0))

  return file_ds.map(_ToExample, deterministic=True)
def LoadDataset(self, file_pattern):
  """Builds a dataset of file names matching `file_pattern`.

  The pattern is expanded eagerly with `tf.io.gfile.glob` and the matches
  are sorted, so iteration order is deterministic.

  Args:
    file_pattern: A sharded file pattern string; expanded to a glob via
      `py_utils.ShardedFilePatternToGlob`.

  Returns:
    A `tf.data.Dataset` of `py_utils.NestedMap`s with fields:
      data: a scalar string tensor holding one matched file name.
      source_id: constant 0 (single-source input).
  """
  glob_pattern = py_utils.ShardedFilePatternToGlob(file_pattern)
  matched_files = tf.io.gfile.glob(glob_pattern)
  matched_files.sort()
  file_ds = tf.data.Dataset.from_tensor_slices(matched_files)

  def _ToExample(path):
    # Wrap each file name; source_id is always 0 here.
    return py_utils.NestedMap(data=path, source_id=tf.constant(0))

  return file_ds.map(_ToExample, deterministic=True)
def GetDataset(self):
  """Builds the text-line input dataset for this job.

  Expands `p.file_pattern` to a glob, reads the matched files line by
  line, and — unless the cluster requires sequential input order —
  shuffles both the file order and the lines, then repeats indefinitely.

  Returns:
    A `tf.data.Dataset` of scalar string tensors, one per input line.

  Raises:
    ValueError: If `p.file_pattern` is empty.
  """
  p = self.params
  if not p.file_pattern:
    raise ValueError('A file pattern must be provided.')
  sequential = self.cluster.require_sequential_input_order
  glob_pattern = py_utils.ShardedFilePatternToGlob(p.file_pattern)
  # File order is shuffled only when sequential order is not required.
  file_ds = tf.data.Dataset.list_files(glob_pattern, shuffle=not sequential)
  # Parallel reads would interleave lines nondeterministically across
  # files, so unit tests read with a single reader.
  if self.cluster.in_unit_test:
    parallel_reads = 1
  else:
    parallel_reads = tf.data.experimental.AUTOTUNE
  line_ds = tf.data.TextLineDataset(file_ds, num_parallel_reads=parallel_reads)
  if sequential:
    # Single deterministic pass over the data.
    return line_ds
  line_ds = line_ds.shuffle(
      p.shuffle_buffer_size, reshuffle_each_iteration=True)
  return line_ds.repeat()