Example 1
  def LoadDataset(self, file_pattern):
    # Convert the sharded file pattern to a glob and list the matching files
    # in a fixed (non-shuffled) order.
    file_pattern_glob = py_utils.ShardedFilePatternToGlob(file_pattern)
    dataset = tf.data.Dataset.list_files(file_pattern_glob, shuffle=False)

    def MakeExample(data):
      # Wrap each matched filename in a NestedMap with a constant source id.
      return py_utils.NestedMap(data=data, source_id=tf.constant(0))

    return dataset.map(MakeExample, deterministic=True)
Example 2
  def LoadDataset(self, file_pattern):
    # Convert the sharded file pattern to a glob and sort the matching
    # filenames so the element order is fully deterministic.
    file_pattern_glob = py_utils.ShardedFilePatternToGlob(file_pattern)
    filenames = sorted(tf.io.gfile.glob(file_pattern_glob))
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    def MakeExample(data):
      # Wrap each filename in a NestedMap with a constant source id.
      return py_utils.NestedMap(data=data, source_id=tf.constant(0))

    return dataset.map(MakeExample, deterministic=True)
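
The filename-to-example pattern shared by Examples 1 and 2 can be tried outside Lingvo with a plain dict standing in for py_utils.NestedMap. The following is a minimal self-contained sketch; the helper name load_filename_dataset and the /tmp paths are illustrative assumptions, not part of the examples above.

import tensorflow as tf

# Write two tiny placeholder files so the sketch runs on its own; the paths
# are illustrative only.
for i in range(2):
  with tf.io.gfile.GFile(f'/tmp/sketch-{i}.txt', 'w') as f:
    f.write('placeholder\n')

def load_filename_dataset(file_pattern_glob):
  # Sort the matched filenames for a deterministic element order, then wrap
  # each one in a plain dict (a stand-in for py_utils.NestedMap here).
  filenames = sorted(tf.io.gfile.glob(file_pattern_glob))
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  return dataset.map(lambda f: {'data': f, 'source_id': tf.constant(0)})

for example in load_filename_dataset('/tmp/sketch-*.txt'):
  print(example['data'].numpy(), example['source_id'].numpy())
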
Example 3
  def GetDataset(self):
    p = self.params
    if not p.file_pattern:
      raise ValueError('A file pattern must be provided.')
    file_pattern_glob = py_utils.ShardedFilePatternToGlob(p.file_pattern)
    # List the matching files, shuffling them unless sequential input order is
    # required.
    dataset = tf.data.Dataset.list_files(
        file_pattern_glob,
        shuffle=not self.cluster.require_sequential_input_order)
    # Read the files as text lines; use a single reader in unit tests so the
    # output is reproducible.
    dataset = tf.data.TextLineDataset(
        dataset,
        num_parallel_reads=(1 if self.cluster.in_unit_test else
                            tf.data.experimental.AUTOTUNE))

    if not self.cluster.require_sequential_input_order:
      # For non-sequential (training) input, shuffle the lines and repeat the
      # dataset indefinitely.
      dataset = dataset.shuffle(p.shuffle_buffer_size,
                                reshuffle_each_iteration=True)
      dataset = dataset.repeat()
    return dataset
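
A dataset built the way Example 3 builds it is typically batched and iterated by the surrounding input pipeline. Below is a minimal sketch of that consumption without the Lingvo params/cluster plumbing; the file paths, shuffle buffer size, and batch size are illustrative assumptions.

import tensorflow as tf

# Write two tiny text files so the sketch runs on its own; paths are
# illustrative only.
for i in range(2):
  with tf.io.gfile.GFile(f'/tmp/lines-{i}.txt', 'w') as f:
    f.write('a\nb\nc\n')

# Mirror Example 3: list the files in shuffled order, read them as text
# lines, then shuffle and repeat as a training input would.
filenames = tf.data.Dataset.list_files('/tmp/lines-*.txt', shuffle=True)
lines = tf.data.TextLineDataset(
    filenames, num_parallel_reads=tf.data.experimental.AUTOTUNE)
lines = lines.shuffle(buffer_size=16, reshuffle_each_iteration=True).repeat()

# Typical consumption: batch the lines and take a few batches.
for batch in lines.batch(2).take(3):
  print(batch.numpy())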