Example #1
import numpy as np

import ray
from ray.data.dataset_pipeline import DatasetPipeline


def create_dataset_pipeline(files, epochs, num_windows):
    if num_windows > 1:
        # Split the file list into `num_windows` groups; each group becomes
        # one window of the pipeline.
        file_splits = np.array_split(files, num_windows)

        class Windower:
            def __init__(self):
                self.i = 0
                self.iterations = epochs * num_windows

            def __iter__(self):
                return self

            def __next__(self):
                if self.i >= self.iterations:
                    raise StopIteration()
                # Cycle through the file splits, one window per step.
                split = file_splits[self.i % num_windows]
                self.i += 1
                # Return a zero-arg factory that lazily reads this window.
                return lambda: ray.data.read_parquet(
                    list(split), _spread_resource_prefix="node:"
                )

        pipe = DatasetPipeline.from_iterable(Windower())
        pipe = pipe.random_shuffle_each_window(_spread_resource_prefix="node:")
    else:
        # Single window: read everything once and repeat it for each epoch.
        ds = ray.data.read_parquet(files, _spread_resource_prefix="node:")
        pipe = ds.repeat(epochs)
        pipe = pipe.random_shuffle_each_window(_spread_resource_prefix="node:")
    return pipe
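
A minimal consumption sketch for the pipeline returned above (not from the original project; the file list, epoch count, window count and batch size are placeholders): each window surfaces as one Dataset via iter_datasets().

# Hypothetical usage sketch; `parquet_files` is a placeholder file list.
pipe = create_dataset_pipeline(parquet_files, epochs=2, num_windows=4)
for window_ds in pipe.iter_datasets():
    for batch in window_ds.iter_batches(batch_size=1024, batch_format="pandas"):
        pass  # feed `batch` to the training step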
Example #2
import numpy as np

import ray
from ray.data.dataset_pipeline import DatasetPipeline


def create_dataset(files, num_workers=4, epochs=50, num_windows=1):
    if num_windows > 1:
        num_rows = ray.data.read_parquet(
            files
        ).count()  # This should only read Parquet metadata.
        file_splits = np.array_split(files, num_windows)

        class Windower:
            def __init__(self):
                self.i = 0
                self.iterations = epochs * num_windows

            def __iter__(self):
                return self

            def __next__(self):
                if self.i >= self.iterations:
                    raise StopIteration()
                # Cycle through the file splits, one window per step.
                split = file_splits[self.i % num_windows]
                self.i += 1
                return lambda: ray.data.read_parquet(list(split))

        pipe = DatasetPipeline.from_iterable(Windower())
        # Row offsets that divide each window evenly across the workers.
        split_indices = [
            i * num_rows // num_windows // num_workers for i in range(1, num_workers)
        ]
        pipe = pipe.random_shuffle_each_window()
        pipe_shards = pipe.split_at_indices(split_indices)
    else:
        ds = ray.data.read_parquet(files)
        pipe = ds.repeat(epochs)
        pipe = pipe.random_shuffle_each_window()
        pipe_shards = pipe.split(num_workers, equal=True)
    return pipe_shards
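
A hedged usage sketch for the shards returned above (the file list and the worker hand-off are assumptions, not part of the original code): each shard is a DatasetPipeline meant for exactly one training worker.

# Hypothetical usage sketch; `parquet_files` is a placeholder file list.
shards = create_dataset(parquet_files, num_workers=4, epochs=50, num_windows=2)
for rank, shard in enumerate(shards):
    # In the real setup each shard would be handed to training worker `rank`,
    # which iterates it with shard.iter_batches(...) or similar.
    print(rank, shard)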
Example #3
    def __init__(
        self,
        dataset_shard: DatasetPipeline,
        features: Dict[str, Dict],
        training_set_metadata: Dict[str, Any],
    ):
        self.dataset_shard = dataset_shard
        self.features = features
        self.training_set_metadata = training_set_metadata
        # One Dataset per pipeline window; advanced with next() during training.
        self.dataset_iter = dataset_shard.iter_datasets()
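
A sketch of how such a wrapper is typically consumed (assumed, not taken from the original class): the stored iterator yields one Dataset per window, which is then batched.

# Hypothetical sketch, assuming `reader` is an instance of the class above.
epoch_ds = next(reader.dataset_iter)  # Dataset for the next window
for batch in epoch_ds.iter_batches(batch_format="pandas"):
    pass  # convert `batch` using reader.features / reader.training_set_metadata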
Example #4
    def _create_async_parallel_reader(self, pipeline: DatasetPipeline,
                                      num_threads: int):
        # Bounded queue so producers can't run too far ahead of the consumer.
        q = queue.Queue(maxsize=100)

        batch_size = self.batch_size

        to_tensors = self._to_tensors_fn()
        # One pipeline shard per reader thread.
        splits = pipeline.split(n=num_threads)

        def producer(i):
            for batch in (splits[i].map_batches(
                    to_tensors, batch_format="pandas").iter_batches(
                        prefetch_blocks=0,
                        batch_size=batch_size,
                        batch_format="pandas")):
                res = self._prepare_batch(batch)
                q.put(res)
            # Sentinel signalling that this producer has finished.
            q.put(None)

        def async_parallel_read():
            threads = [
                threading.Thread(target=producer, args=(i, ))
                for i in range(num_threads)
            ]
            for t in threads:
                t.start()

            active_threads = num_threads
            while True:
                batch = q.get(block=True)
                if batch is None:
                    # One producer finished; stop once all have, and never
                    # yield the sentinel itself.
                    active_threads -= 1
                    if active_threads == 0:
                        break
                    continue
                yield batch

            for t in threads:
                t.join()

        return async_parallel_read()
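
A hedged usage sketch (the surrounding trainer object and its pipeline are assumptions): the returned generator yields prepared batches in the order the reader threads produce them.

# Hypothetical usage sketch from inside the same class.
reader = self._create_async_parallel_reader(pipeline, num_threads=4)
for prepared_batch in reader:
    pass  # run one training step on `prepared_batch`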
Example #5
import ray
from ray.data.dataset_pipeline import DatasetPipeline


def test_from_iterable(ray_start_regular_shared):
    # Build a two-window pipeline from dataset factories and flatten it.
    pipe = DatasetPipeline.from_iterable(
        [lambda: ray.data.range(3), lambda: ray.data.range(2)])
    assert pipe.take() == [0, 1, 2, 0, 1]
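
A small companion sketch, assuming the same test environment: the same kind of pipeline can also be consumed window by window rather than flattened.

# Hypothetical companion check: consume a fresh pipeline one window at a time.
pipe = DatasetPipeline.from_iterable(
    [lambda: ray.data.range(3), lambda: ray.data.range(2)])
assert [ds.take() for ds in pipe.iter_datasets()] == [[0, 1, 2], [0, 1]]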