def read_parquet(paths: Union[str, List[str]],
                 *,
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 ray_remote_args: Dict[str, Any] = None,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    return read_datasource(
        ParquetDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        columns=columns,
        ray_remote_args=ray_remote_args,
        **arrow_parquet_args)
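# Usage sketch for the signature above. This is illustrative only: the bucket,
# file paths, and column names are placeholders, not real data.
import ray

# Read two columns from a directory of Parquet files in remote storage,
# using the default of 200 read tasks.
ds = ray.data.read_parquet(
    "s3://example-bucket/path",
    columns=["user_id", "score"],
)

# Read a list of local files with fewer read tasks, giving each read task
# two CPUs via the ray.remote kwargs.
ds_local = ray.data.read_parquet(
    ["/path/to/file1", "/path/to/file2"],
    parallelism=4,
    ray_remote_args={"num_cpus": 2},
)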
def read_parquet(
    paths: Union[str, List[str]],
    *,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    parallelism: int = -1,
    ray_remote_args: Dict[str, Any] = None,
    tensor_column_schema: Optional[Dict[str, Tuple[np.dtype, Tuple[int, ...]]]] = None,
    meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
    **arrow_parquet_args,
) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> import ray
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")  # doctest: +SKIP

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])  # doctest: +SKIP

    Args:
        paths: A single file path or directory, or a list of file paths.
            Multiple directories are not supported.
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        tensor_column_schema: A dict of column name --> tensor dtype and shape
            mappings for converting a Parquet column containing serialized
            tensors (ndarrays) as their elements to our tensor column extension
            type. This assumes that the tensors were serialized in the raw
            NumPy array format in C-contiguous order (e.g. via
            `arr.tobytes()`).
        meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    arrow_parquet_args = _resolve_parquet_args(
        tensor_column_schema,
        **arrow_parquet_args,
    )
    return read_datasource(
        ParquetDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        columns=columns,
        ray_remote_args=ray_remote_args,
        meta_provider=meta_provider,
        **arrow_parquet_args,
    )
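# Usage sketch for the public tensor_column_schema parameter above. The bucket,
# the "image"/"label" column names, and the dtype/shape are assumptions about
# the data being read, not part of the API.
import numpy as np
import ray

# Suppose each row's "image" cell holds the raw bytes of a 28x28 uint8 ndarray
# written with arr.tobytes() (C-contiguous NumPy format, as the docstring
# requires). Declaring the dtype and shape lets the reader cast that column to
# the tensor column extension type instead of leaving it as opaque bytes.
ds = ray.data.read_parquet(
    "s3://example-bucket/images",
    columns=["image", "label"],
    tensor_column_schema={"image": (np.uint8, (28, 28))},
)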
def read_parquet(
    paths: Union[str, List[str]],
    *,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    _tensor_column_schema: Optional[Dict[str, Tuple[np.dtype, Tuple[int, ...]]]] = None,
    **arrow_parquet_args,
) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        _tensor_column_schema: A dict of column name --> tensor dtype and shape
            mappings for converting a Parquet column containing serialized
            tensors (ndarrays) as their elements to our tensor column extension
            type. This assumes that the tensors were serialized in the raw
            NumPy array format in C-contiguous order (e.g. via
            `arr.tobytes()`).
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    if _tensor_column_schema is not None:
        existing_block_udf = arrow_parquet_args.pop("_block_udf", None)

        def _block_udf(block: "pyarrow.Table") -> "pyarrow.Table":
            from ray.data.extensions import ArrowTensorArray

            for tensor_col_name, (dtype, shape) in _tensor_column_schema.items():
                # NOTE(Clark): We use NumPy to consolidate these potentially
                # non-contiguous buffers, and to do buffer bookkeeping in
                # general.
                np_col = np.array([
                    np.ndarray(shape, buffer=buf.as_buffer(), dtype=dtype)
                    for buf in block.column(tensor_col_name)
                ])

                block = block.set_column(
                    block._ensure_integer_index(tensor_col_name),
                    tensor_col_name,
                    ArrowTensorArray.from_numpy(np_col),
                )
            if existing_block_udf is not None:
                # Apply UDF after casting the tensor columns.
                block = existing_block_udf(block)
            return block

        arrow_parquet_args["_block_udf"] = _block_udf

    return read_datasource(
        ParquetDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        columns=columns,
        ray_remote_args=ray_remote_args,
        **arrow_parquet_args,
    )
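# Usage sketch for the experimental (underscore-prefixed) _tensor_column_schema
# path above. The directory path, the "embedding" column name, and the
# float32/(128,) dtype and shape are assumptions for illustration only.
import numpy as np
import ray

# Each "embedding" cell is assumed to hold the raw bytes of a length-128
# float32 vector produced via arr.tobytes(). The generated _block_udf
# reassembles those bytes into ndarrays per block and replaces the column
# with an ArrowTensorArray, so downstream operations see real tensors.
ds = ray.data.read_parquet(
    "/path/to/embeddings",
    _tensor_column_schema={"embedding": (np.float32, (128,))},
)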