def prepare_read(self, parallelism: int, n: int,
                 use_arrow: bool) -> List[ReadTask]:
    read_tasks: List[ReadTask] = []
    block_size = max(1, n // parallelism)

    # Example of a read task. In a real datasource, this would pull data
    # from an external system instead of generating dummy data.
    def make_block(start: int, count: int) -> Block:
        if use_arrow:
            return pyarrow.Table.from_arrays(
                [np.arange(start, start + count)], names=["value"])
        else:
            return list(builtins.range(start, start + count))

    i = 0
    while i < n:
        count = min(block_size, n - i)
        if use_arrow:
            import pyarrow
            schema = pyarrow.Table.from_pydict({"value": [0]}).schema
        else:
            schema = int
        read_tasks.append(
            ReadTask(
                lambda i=i, count=count: make_block(i, count),
                BlockMetadata(
                    num_rows=count,
                    size_bytes=8 * count,
                    schema=schema,
                    input_files=None)))
        i += block_size

    return read_tasks
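# Hypothetical usage sketch for the prepare_read above (not part of the
# original code): it assumes the method is defined on a Datasource subclass,
# here given the illustrative name ExampleRangeDatasource. Each ReadTask
# returned is a zero-argument callable paired with precomputed BlockMetadata;
# calling it produces the actual block.
source = ExampleRangeDatasource()
tasks = source.prepare_read(parallelism=4, n=10, use_arrow=False)
first_block = tasks[0]()              # -> [0, 1]  (block_size = 10 // 4 = 2)
first_meta = tasks[0].get_metadata()  # num_rows=2, size_bytes=16, schema=int
# The same datasource can be read through the public entry point defined
# below, which forwards n and use_arrow to prepare_read via **read_args:
dataset = read_datasource(source, parallelism=4, n=10, use_arrow=False)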
def wrapped_fn(block: Block, meta: BlockMetadata):
    new_block = fn(block)
    accessor = BlockAccessor.for_block(new_block)
    new_meta = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=meta.input_files)
    return new_block, new_meta
def process_block(
        self, block: Block[T],
        meta: BlockMetadata) -> (Block[U], BlockMetadata):
    new_block = fn(block)
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=meta.input_files)
    return new_block, new_metadata
def truncate(block: Block, meta: BlockMetadata,
             count: int) -> (Block, BlockMetadata):
    block = BlockAccessor.for_block(block)
    logger.debug("Truncating last block to size: {}".format(count))
    new_block = block.slice(0, count, copy=True)
    accessor = BlockAccessor.for_block(new_block)
    new_meta = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=meta.schema,
        input_files=meta.input_files)
    return new_block, new_meta
def shuffle_reduce(*mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    builder = DelegatingArrowBlockBuilder()
    assert len(mapper_outputs) == input_num_blocks
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None)
    return new_block, new_metadata
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    @ray.remote
    def remote_read(task: ReadTask) -> Block:
        return task()

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:

        @ray.remote
        def get_schema(block: Block) -> Any:
            return BlockAccessor.for_block(block).schema()

        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
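# Sketch of the deferral idiom used in read_datasource above (and again in
# read_parquet below), shown with plain strings instead of Ray tasks. Each
# entry in `calls` is a zero-argument lambda, so no read is launched while the
# list is built; LazyBlockList only invokes an entry when that block is
# actually needed. The `task=task` default argument pins the current loop
# value, which a bare closure over the loop variable would not do.
calls = []
for task in ["task-0", "task-1", "task-2"]:
    calls.append(lambda task=task: "block for " + task)

assert calls[1]() == "block for task-1"  # each lambda kept its own task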
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    if filesystem is None:
        filesystem, paths = _parse_paths(paths)
    pq_ds = pq.ParquetDataset(
        paths, **arrow_parquet_args, filesystem=filesystem)
    pieces = pq_ds.pieces

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]

    @ray.remote
    def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [piece.to_table() for piece in pieces]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return table

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = []
        for p in pieces:
            try:
                piece_metadata.append(p.metadata)
            except AttributeError:
                break
        input_files = [p.path for p in pieces]
        if len(piece_metadata) == len(pieces):
            # Piece metadata was available, construct a normal BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=input_files)
        else:
            # Piece metadata was not available, construct an empty
            # BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=None,
                input_files=input_files)
        metadata.append(block_metadata)

    return Dataset(LazyBlockList(calls, metadata))