import numpy as np
import ray
from ray.data.dataset_pipeline import DatasetPipeline


def create_dataset_pipeline(files, epochs, num_windows):
    if num_windows > 1:
        # Split the file list into one group per window; each window becomes
        # its own Dataset in the pipeline.
        file_splits = np.array_split(files, num_windows)

        class Windower:
            def __init__(self):
                self.i = 0
                self.iterations = epochs * num_windows

            def __iter__(self):
                return self

            def __next__(self):
                if self.i >= self.iterations:
                    raise StopIteration()
                split = file_splits[self.i % num_windows]
                self.i += 1
                return lambda: ray.data.read_parquet(
                    list(split), _spread_resource_prefix="node:"
                )

        pipe = DatasetPipeline.from_iterable(Windower())
        pipe = pipe.random_shuffle_each_window(_spread_resource_prefix="node:")
    else:
        # Small enough to load in one shot: read everything, then repeat and
        # shuffle once per epoch.
        ds = ray.data.read_parquet(files, _spread_resource_prefix="node:")
        pipe = ds.repeat(epochs)
        pipe = pipe.random_shuffle_each_window(_spread_resource_prefix="node:")
    return pipe
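For reference, a pipeline built this way is consumed one window at a time, with each window surfacing as an ordinary Dataset. The loop below is a hypothetical consumer sketch; the `files` list, window count, and training step are placeholders, not part of the original example:

# Hypothetical consumer: iter_datasets() yields one shuffled window per step.
pipe = create_dataset_pipeline(files, epochs=10, num_windows=4)
for window in pipe.iter_datasets():
    for batch in window.iter_batches(batch_size=4096, batch_format="pandas"):
        pass  # placeholder: feed the batch to a training step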
def create_dataset(files, num_workers=4, epochs=50, num_windows=1):
    if num_windows > 1:
        # This should only read Parquet metadata.
        num_rows = ray.data.read_parquet(files).count()
        file_splits = np.array_split(files, num_windows)

        class Windower:
            def __init__(self):
                self.i = 0
                self.iterations = epochs * num_windows

            def __iter__(self):
                return self

            def __next__(self):
                if self.i >= self.iterations:
                    raise StopIteration()
                split = file_splits[self.i % num_windows]
                self.i += 1
                return lambda: ray.data.read_parquet(list(split))

        pipe = DatasetPipeline.from_iterable(Windower())
        # Row offsets that cut each shuffled window into num_workers shards.
        split_indices = [
            i * num_rows // num_windows // num_workers
            for i in range(1, num_workers)
        ]
        pipe = pipe.random_shuffle_each_window()
        pipe_shards = pipe.split_at_indices(split_indices)
    else:
        ds = ray.data.read_parquet(files)
        pipe = ds.repeat(epochs)
        pipe = pipe.random_shuffle_each_window()
        pipe_shards = pipe.split(num_workers, equal=True)
    return pipe_shards
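Each shard returned by create_dataset is meant to be handed to exactly one training worker. A minimal sketch of that hand-off, assuming a hypothetical TrainingWorker actor (its name, batch size, and worker count are illustrative, not from the original code):

import ray
from ray.data.dataset_pipeline import DatasetPipeline

@ray.remote
class TrainingWorker:
    # Hypothetical actor: consumes one pipeline shard batch by batch.
    def train(self, shard: DatasetPipeline) -> int:
        num_batches = 0
        for batch in shard.iter_batches(batch_size=4096, batch_format="pandas"):
            num_batches += 1  # placeholder: run one training step on `batch`
        return num_batches

workers = [TrainingWorker.remote() for _ in range(4)]
shards = create_dataset(files, num_workers=4, epochs=50, num_windows=2)
ray.get([w.train.remote(s) for w, s in zip(workers, shards)])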
def __init__(
    self,
    dataset_shard: DatasetPipeline,
    features: Dict[str, Dict],
    training_set_metadata: Dict[str, Any],
):
    self.dataset_shard = dataset_shard
    self.features = features
    self.training_set_metadata = training_set_metadata
    self.dataset_iter = dataset_shard.iter_datasets()
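The shard holds on to `iter_datasets()` so that each training epoch can pull the next window out of the pipeline on demand. A hypothetical helper method (not part of the original class; the method name and batch size are assumptions) might look like:

def iter_epoch_batches(self, batch_size: int = 128):
    # Hypothetical helper: pull the next epoch's Dataset from the pipeline
    # and stream it as pandas batches.
    epoch_dataset = next(self.dataset_iter)
    yield from epoch_dataset.iter_batches(
        batch_size=batch_size, batch_format="pandas"
    )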
def _create_async_parallel_reader(self, pipeline: DatasetPipeline, num_threads: int):
    q = queue.Queue(maxsize=100)

    batch_size = self.batch_size
    to_tensors = self._to_tensors_fn()

    splits = pipeline.split(n=num_threads)

    def producer(i):
        # Each producer thread converts its pipeline split to tensors and
        # pushes prepared batches onto the shared queue.
        for batch in (
            splits[i]
            .map_batches(to_tensors, batch_format="pandas")
            .iter_batches(
                prefetch_blocks=0, batch_size=batch_size, batch_format="pandas"
            )
        ):
            res = self._prepare_batch(batch)
            q.put(res)
        q.put(None)  # sentinel: this producer is done

    def async_parallel_read():
        threads = [
            threading.Thread(target=producer, args=(i,))
            for i in range(num_threads)
        ]
        for t in threads:
            t.start()

        active_threads = num_threads
        while True:
            batch = q.get(block=True)
            if batch is None:
                active_threads -= 1
                if active_threads == 0:
                    break
                continue  # skip the sentinel; other producers are still running
            yield batch

        for t in threads:
            t.join()

    return async_parallel_read()
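The reader above overlaps Ray Data iteration with training by running one producer thread per pipeline split, all feeding a bounded queue that the training loop drains. As a minimal, Ray-free sketch of the same pattern (the plain-Python sources here are purely illustrative):

import queue
import threading

def parallel_prefetch(sources, maxsize=100):
    # Illustrative only: interleave items from several iterables using one
    # producer thread per source and a bounded queue, mirroring the reader above.
    q = queue.Queue(maxsize=maxsize)

    def producer(source):
        for item in source:
            q.put(item)
        q.put(None)  # sentinel: this producer is finished

    threads = [threading.Thread(target=producer, args=(s,)) for s in sources]
    for t in threads:
        t.start()

    active = len(threads)
    while active > 0:
        item = q.get(block=True)
        if item is None:
            active -= 1
            continue
        yield item

    for t in threads:
        t.join()

# Example: items from the three ranges arrive interleaved, not sequentially.
print(list(parallel_prefetch([range(3), range(3), range(3)])))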
def test_from_iterable(ray_start_regular_shared):
    pipe = DatasetPipeline.from_iterable(
        [lambda: ray.data.range(3), lambda: ray.data.range(2)])
    assert pipe.take() == [0, 1, 2, 0, 1]