def from_items(items: List[Any], parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))

def from_items(items: List[Any], parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ds.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = SimpleBlock.builder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockMetadata(
                num_rows=block.num_rows(),
                size_bytes=block.size_bytes(),
                schema=type(items[0]),
                input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))

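# A minimal usage sketch for from_items (illustrative, not part of the module).
# It assumes Ray has been started with ray.init() and that this function is
# exposed through the public API (e.g. ray.data.from_items, or the older
# ray.experimental.data layout); `ds.count()` is assumed to be available on the
# returned Dataset.
#
#   >>> import ray
#   >>> ray.init()
#   >>> ds = ray.data.from_items([{"value": i} for i in range(1000)])
#   >>> ds.count()
#   1000
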
def read_datasource(datasource: Datasource[T],
                    parallelism: int = 200,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    @ray.remote
    def remote_read(task: ReadTask) -> Block:
        return task()

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    return Dataset(LazyBlockList(calls, metadata))

def read_csv(paths: Union[str, List[str]],
             filesystem: Optional["pyarrow.fs.FileSystem"] = None,
             parallelism: int = 200,
             **arrow_csv_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from csv files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_csv("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_csv(["/path/to/file1", "/path/to/file2"])

        # Read multiple directories.
        >>> ds.read_csv(["s3://bucket/path1", "s3://bucket/path2"])

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation to read from.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_csv_args: Other csv read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow as pa
    from pyarrow import csv
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)

    @ray.remote(num_returns=2)
    def csv_read(read_paths: List[str]):
        logger.debug(f"Reading {len(read_paths)} files.")
        tables = []
        for read_path in read_paths:
            with filesystem.open_input_file(read_path) as f:
                tables.append(
                    csv.read_csv(
                        f,
                        read_options=csv.ReadOptions(use_threads=False),
                        **arrow_csv_args))
        block = ArrowBlock(pa.concat_tables(tables))
        return block, block.get_metadata(input_files=read_paths)

    res = [
        csv_read.remote(read_paths)
        for read_paths in np.array_split(paths, parallelism)
        if len(read_paths) > 0
    ]

    blocks, metadata = zip(*res)
    return Dataset(BlockList(blocks, ray.get(list(metadata))))

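# A hedged usage sketch for read_csv (illustrative only). It assumes pyarrow is
# installed, that a local file "example.csv" exists, and that the returned
# Dataset exposes schema() and take(); none of this is asserted by the module
# itself.
#
#   >>> import ray
#   >>> ray.init()
#   >>> ds = ray.data.read_csv("example.csv", parallelism=4)
#   >>> ds.schema()   # inferred Arrow schema of the CSV columns
#   >>> ds.take(5)    # first five records as Arrow rows
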
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    @ray.remote
    def remote_read(task: ReadTask) -> Block:
        return task()

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:

        @ray.remote
        def get_schema(block: Block) -> Any:
            return BlockAccessor.for_block(block).schema()

        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)

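# Usage sketch for read_datasource (illustrative). It assumes a hypothetical
# Datasource subclass, MyRangeDatasource, whose prepare_read() splits the work
# into `parallelism` ReadTasks, each producing one Block when called; the
# subclass itself is not defined here and its constructor arguments are made up.
#
#   >>> source = MyRangeDatasource()   # hypothetical Datasource implementation
#   >>> ds = ray.data.read_datasource(source, parallelism=8, n=10000)
#   >>> ds.count()
#   10000
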
def from_arrow(tables: List[ObjectRef["pyarrow.Table"]],
               parallelism: int = 200) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Arrow tables.

    Args:
        tables: A list of Ray object references to Arrow tables.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding Arrow records from the tables.
    """

    @ray.remote
    def get_metadata(table: "pyarrow.Table") -> BlockMetadata:
        return BlockAccessor.for_block(table).get_metadata(input_files=None)

    metadata = [get_metadata.remote(t) for t in tables]
    return Dataset(BlockList(tables, ray.get(metadata)))

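# Usage sketch for from_arrow (illustrative). The Arrow tables must already
# live in the Ray object store, hence the ray.put() calls below; `ds.count()`
# on the returned Dataset is assumed to be available.
#
#   >>> import pyarrow as pa
#   >>> t1 = pa.table({"x": [1, 2, 3]})
#   >>> t2 = pa.table({"x": [4, 5, 6]})
#   >>> ds = ray.data.from_arrow([ray.put(t1), ray.put(t2)])
#   >>> ds.count()
#   6
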
def from_pandas(dfs: List[ObjectRef["pandas.DataFrame"]],
                parallelism: int = 200) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Pandas dataframes.

    Args:
        dfs: A list of Ray object references to pandas dataframes.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    import pyarrow as pa

    @ray.remote(num_returns=2)
    def df_to_block(df: "pandas.DataFrame"):
        block = ArrowBlock(pa.table(df))
        return block, block.get_metadata(input_files=None)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = zip(*res)
    return Dataset(BlockList(blocks, ray.get(list(metadata))))

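# Usage sketch for from_pandas (illustrative). Each dataframe is first put into
# the object store and becomes one Arrow block of the resulting Dataset.
#
#   >>> import pandas as pd
#   >>> df1 = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
#   >>> df2 = pd.DataFrame({"a": [3, 4], "b": ["z", "w"]})
#   >>> ds = ray.data.from_pandas([ray.put(df1), ray.put(df2)])
#   >>> ds.count()
#   4
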
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    pq_ds = pq.ParquetDataset(paths, **arrow_parquet_args)

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(
                columns=columns, use_threads=False, partitions=partitions)
            for piece in pieces
        ]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))

def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    if filesystem is None:
        filesystem, paths = _parse_paths(paths)
    pq_ds = pq.ParquetDataset(
        paths, **arrow_parquet_args, filesystem=filesystem)
    pieces = pq_ds.pieces

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]

    @ray.remote
    def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [piece.to_table() for piece in pieces]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return table

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = []
        for p in pieces:
            try:
                piece_metadata.append(p.metadata)
            except AttributeError:
                break
        input_files = [p.path for p in pieces]
        if len(piece_metadata) == len(pieces):
            # Piece metadata was available, construct a normal BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=input_files)
        else:
            # Piece metadata was not available, construct an empty
            # BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=None,
                input_files=input_files)
        metadata.append(block_metadata)

    return Dataset(LazyBlockList(calls, metadata))
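
# Usage sketch for read_parquet (illustrative). It assumes a parquet dataset at
# the given S3 path and that the caller has credentials for it; the bucket name
# and schema() call on the returned Dataset are assumptions for the example.
#
#   >>> import ray
#   >>> ray.init()
#   >>> ds = ray.data.read_parquet("s3://bucket/path", parallelism=100)
#   >>> ds.schema()   # Arrow schema reported from the parquet metadata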