def prepare_read(self, parallelism: int, paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
                 **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    import pyarrow as pa
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_infos = _expand_paths(paths, filesystem)
    file_sizes = [file_info.size for file_info in file_infos]

    read_file = self._read_file

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        builder = DelegatingArrowBlockBuilder()
        for read_path in read_paths:
            with fs.open_input_stream(read_path) as f:
                data = read_file(f, read_path, **reader_args)
                if isinstance(data, pa.Table) or isinstance(data, np.ndarray):
                    builder.add_block(data)
                else:
                    builder.add(data)
        return builder.build()

    read_tasks = []
    for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism),
            np.array_split(file_sizes, parallelism)):
        if len(read_paths) <= 0:
            continue

        if self._rows_per_file() is None:
            num_rows = None
        else:
            num_rows = len(read_paths) * self._rows_per_file()
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem),
            BlockMetadata(
                num_rows=num_rows,
                size_bytes=sum(file_sizes),
                schema=schema,
                input_files=read_paths))
        read_tasks.append(read_task)

    return read_tasks

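# --- Illustrative sketch (not from the original source) ---
# A minimal subclass for the prepare_read() above: it only implements
# _read_file(), returning each file as a single pyarrow Table, which the read
# task then wraps into an Arrow block via DelegatingArrowBlockBuilder. The
# class name, the example path, and the driver loop are assumptions for
# illustration; FileBasedDatasource is the class this method belongs to.
import pyarrow.csv


class SimpleCSVDatasource(FileBasedDatasource):
    def _read_file(self, f, path: str, **reader_args):
        # Parse the open input stream as CSV and return the whole file
        # as one Arrow table.
        return pyarrow.csv.read_csv(f)


# Hypothetical driver (assumed call pattern):
# tasks = SimpleCSVDatasource().prepare_read(parallelism=4, paths="/tmp/csvs")
# blocks = [task() for task in tasks]  # each ReadTask materializes one block
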
def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
    # NOTE: We override the base class FileBasedDatasource.get_read_tasks()
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    read_tasks = []
    for pieces, metadata in zip(
        np.array_split(self._pq_ds.pieces, parallelism),
        np.array_split(self._metadata, parallelism),
    ):
        if len(pieces) <= 0:
            continue
        serialized_pieces = [_SerializedPiece(p) for p in pieces]
        input_files = [p.path for p in pieces]
        meta = self._meta_provider(
            input_files,
            self._inferred_schema,
            pieces=pieces,
            prefetched_metadata=metadata,
        )
        block_udf, reader_args, columns, schema = (
            self._block_udf,
            self._reader_args,
            self._columns,
            self._schema,
        )
        read_tasks.append(
            ReadTask(
                lambda p=serialized_pieces: _read_pieces(
                    block_udf,
                    reader_args,
                    columns,
                    schema,
                    p,
                ),
                meta,
            )
        )

    return read_tasks

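# --- Illustrative aside (not from the original source) ---
# Every read-task loop in this file binds the loop variable through a lambda
# default argument (e.g. `lambda p=serialized_pieces: ...`). A minimal sketch
# of why: without the default argument, every closure would observe the loop
# variable's final value instead of the value at definition time.
late_bound = [lambda: i for i in range(3)]
early_bound = [lambda i=i: i for i in range(3)]
assert [f() for f in late_bound] == [2, 2, 2]
assert [f() for f in early_bound] == [0, 1, 2]
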
def prepare_read(self, parallelism: int, paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
                 _block_udf: Optional[Callable[[Block], Block]] = None,
                 **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource."""
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths,
        **dataset_kwargs,
        filesystem=filesystem,
        use_legacy_dataset=False)
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema([schema.field(column) for column in columns],
                           schema.metadata)

    def read_pieces(serialized_pieces: List[str]):
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        pieces: List["pyarrow._dataset.ParquetFileFragment"] = [
            cloudpickle.loads(p) for p in serialized_pieces
        ]

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        use_threads = reader_args.pop("use_threads", False)
        tables = []
        for piece in pieces:
            table = piece.to_table(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                **reader_args)
            part = _get_partition_keys(piece.partition_expression)
            if part:
                for col, value in part.items():
                    table = table.set_column(
                        table.schema.get_field_index(col), col,
                        pa.array([value] * len(table)))
            # If the table is empty, drop it.
            if table.num_rows > 0:
                tables.append(table)
        if len(tables) > 1:
            table = pa.concat_tables(tables, promote=True)
        elif len(tables) == 1:
            table = tables[0]
        if _block_udf is not None:
            table = _block_udf(table)
        # If len(tables) == 0, all fragments were empty, and we return the
        # empty table from the last fragment.
        return table

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            logger.info(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True)
            inferred_schema = schema
    else:
        inferred_schema = schema

    read_tasks = []
    for pieces in np.array_split(pq_ds.pieces, parallelism):
        if len(pieces) == 0:
            continue
        metadata = _get_metadata(pieces, inferred_schema)
        pieces = [cloudpickle.dumps(p) for p in pieces]
        read_tasks.append(
            ReadTask(lambda pieces_=pieces: read_pieces(pieces_), metadata))

    return read_tasks

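# --- Illustrative sketch (not from the original source) ---
# How read_pieces() materializes a Hive-style partition key (e.g. year=2020)
# onto an in-memory table: the column already exists in the dataset schema, so
# set_column() replaces it with a constant array. The column names and values
# here are assumptions for illustration.
import pyarrow as pa

table = pa.table({
    "year": pa.array([None, None], type=pa.int32()),  # placeholder partition column
    "value": [1.0, 2.0],
})
idx = table.schema.get_field_index("year")
table = table.set_column(idx, "year",
                         pa.array([2020] * len(table), type=pa.int32()))
assert table.column("year").to_pylist() == [2020, 2020]
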
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_infos = _expand_paths(paths, filesystem)
    file_sizes = [file_info.size for file_info in file_infos]

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            with fs.open_input_stream(read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        if self._rows_per_file() is None:
            num_rows = None
        else:
            num_rows = len(read_paths) * self._rows_per_file()
        meta = BlockMetadata(
            num_rows=num_rows,
            size_bytes=sum(file_sizes),
            schema=schema,
            input_files=read_paths,
            exec_stats=None,
        )  # Exec stats filled in later.
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks

def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: PathPartitionFilter = None,
    # TODO(ekl) deprecate this once read fusion is available.
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_sizes = meta_provider.expand_paths(paths, filesystem)
    if partition_filter is not None:
        paths = partition_filter(paths)

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            compression = open_stream_args.pop("compression", None)
            if compression is None:
                import pyarrow as pa

                try:
                    # If no compression manually given, try to detect
                    # compression codec from path.
                    compression = pa.Codec.detect(read_path).name
                except (ValueError, TypeError):
                    # Arrow's compression inference on the file path
                    # doesn't work for Snappy, so we double-check ourselves.
                    import pathlib

                    suffix = pathlib.Path(read_path).suffix
                    if suffix and suffix[1:] == "snappy":
                        compression = "snappy"
                    else:
                        compression = None
            if compression == "snappy":
                # Pass Snappy compression as a reader arg, so datasource subclasses
                # can manually handle streaming decompression in
                # self._read_stream().
                reader_args["compression"] = compression
                reader_args["filesystem"] = fs
            elif compression is not None:
                # Non-Snappy compression, pass as open_input_stream() arg so Arrow
                # can take care of streaming decompression for us.
                open_stream_args["compression"] = compression
            with self._open_input_source(fs, read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    # fix https://github.com/ray-project/ray/issues/24296
    parallelism = min(parallelism, len(paths))

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        meta = meta_provider(
            read_paths,
            schema,
            rows_per_file=self._rows_per_file(),
            file_sizes=file_sizes,
        )
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks

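# --- Illustrative sketch (not from the original source) ---
# A standalone version of the codec-detection fallback used in read_files()
# above: ask Arrow to infer the codec from the path, and treat a ".snappy"
# suffix specially when Arrow cannot. The helper name and the example return
# values are assumptions.
import pathlib

import pyarrow as pa


def detect_compression(path: str):
    try:
        # Arrow infers e.g. "gzip" from a ".gz" suffix (assumed behavior).
        return pa.Codec.detect(path).name
    except (ValueError, TypeError):
        # Arrow's path-based inference doesn't cover Snappy.
        return "snappy" if pathlib.Path(path).suffix == ".snappy" else None
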
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource."""
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths, **dataset_kwargs, filesystem=filesystem, use_legacy_dataset=False
    )
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema(
            [schema.field(column) for column in columns], schema.metadata
        )

    def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        try:
            _register_parquet_file_fragment_serialization()
            pieces: List[
                "pyarrow._dataset.ParquetFileFragment"
            ] = cloudpickle.loads(serialized_pieces)
        finally:
            _deregister_parquet_file_fragment_serialization()

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        use_threads = reader_args.pop("use_threads", False)
        for piece in pieces:
            part = _get_partition_keys(piece.partition_expression)
            batches = piece.to_batches(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                **reader_args,
            )
            for batch in batches:
                table = pyarrow.Table.from_batches([batch], schema=schema)
                if part:
                    for col, value in part.items():
                        table = table.set_column(
                            table.schema.get_field_index(col),
                            col,
                            pa.array([value] * len(table)),
                        )
                # If the table is empty, drop it.
                if table.num_rows > 0:
                    output_buffer.add_block(table)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            logger.debug(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True,
            )
            inferred_schema = schema
    else:
        inferred_schema = schema

    read_tasks = []
    metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
    try:
        _register_parquet_file_fragment_serialization()
        for pieces, metadata in zip(
            np.array_split(pq_ds.pieces, parallelism),
            np.array_split(metadata, parallelism),
        ):
            if len(pieces) <= 0:
                continue
            serialized_pieces = cloudpickle.dumps(pieces)
            input_files = [p.path for p in pieces]
            meta = meta_provider(
                input_files,
                inferred_schema,
                pieces=pieces,
                prefetched_metadata=metadata,
            )
            read_tasks.append(
                ReadTask(lambda p=serialized_pieces: read_pieces(p), meta)
            )
    finally:
        _deregister_parquet_file_fragment_serialization()

    return read_tasks

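# --- Illustrative sketch (not from the original source) ---
# read_pieces() above turns each Parquet batch back into a Table via
# pyarrow.Table.from_batches() before attaching partition columns. A minimal
# round trip showing that API; the column name and chunk size are assumptions.
import pyarrow as pa

source = pa.table({"x": list(range(10))})
batches = source.to_batches(max_chunksize=4)  # three batches of at most 4 rows
rebuilt = pa.Table.from_batches(batches, schema=source.schema)
assert rebuilt.num_rows == source.num_rows
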
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    # TODO(ekl) deprecate this once read fusion is available.
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_sizes = meta_provider.expand_paths(paths, filesystem)

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            with fs.open_input_stream(read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        meta = meta_provider(
            read_paths,
            schema,
            rows_per_file=self._rows_per_file(),
            file_sizes=file_sizes,
        )
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks

def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
    import numpy as np

    open_stream_args = self._open_stream_args
    reader_args = self._reader_args
    _block_udf = self._block_udf

    paths, file_sizes = self._paths, self._file_sizes
    if self._partition_filter is not None:
        paths = self._partition_filter(paths)

    read_stream = self._delegate._read_stream
    filesystem = _wrap_s3_serialization_workaround(self._filesystem)
    read_options = reader_args.get("read_options")
    if read_options is not None:
        import pyarrow.json as pajson

        if isinstance(read_options, pajson.ReadOptions):
            _register_arrow_json_readoptions_serializer()

    if open_stream_args is None:
        open_stream_args = {}

    open_input_source = self._delegate._open_input_source

    def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf,
            target_max_block_size=ctx.target_max_block_size)
        for read_path in read_paths:
            compression = open_stream_args.pop("compression", None)
            if compression is None:
                import pyarrow as pa

                try:
                    # If no compression manually given, try to detect
                    # compression codec from path.
                    compression = pa.Codec.detect(read_path).name
                except (ValueError, TypeError):
                    # Arrow's compression inference on the file path
                    # doesn't work for Snappy, so we double-check ourselves.
                    import pathlib

                    suffix = pathlib.Path(read_path).suffix
                    if suffix and suffix[1:] == "snappy":
                        compression = "snappy"
                    else:
                        compression = None
            if compression == "snappy":
                # Pass Snappy compression as a reader arg, so datasource subclasses
                # can manually handle streaming decompression in
                # self._delegate._read_stream().
                reader_args["compression"] = compression
                reader_args["filesystem"] = fs
            elif compression is not None:
                # Non-Snappy compression, pass as open_input_stream() arg so Arrow
                # can take care of streaming decompression for us.
                open_stream_args["compression"] = compression
            with open_input_source(fs, read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    # fix https://github.com/ray-project/ray/issues/24296
    parallelism = min(parallelism, len(paths))

    read_tasks = []
    for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism),
            np.array_split(file_sizes, parallelism)):
        if len(read_paths) <= 0:
            continue

        meta = self._meta_provider(
            read_paths,
            self._schema,
            rows_per_file=self._delegate._rows_per_file(),
            file_sizes=file_sizes,
        )
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem),
            meta)
        read_tasks.append(read_task)

    return read_tasks

def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        columns: Optional[List[str]] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    from ray import cloudpickle
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths,
        **dataset_kwargs,
        filesystem=filesystem,
        use_legacy_dataset=False)
    if schema is None:
        schema = pq_ds.schema
    pieces = pq_ds.pieces

    def read_pieces(serialized_pieces: List[str]):
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        pieces: List["pyarrow._dataset.ParquetFileFragment"] = [
            cloudpickle.loads(p) for p in serialized_pieces
        ]

        import pyarrow as pa
        from pyarrow.dataset import _get_partition_keys

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        use_threads = reader_args.pop("use_threads", False)
        tables = []
        for piece in pieces:
            table = piece.to_table(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                **reader_args)
            part = _get_partition_keys(piece.partition_expression)
            if part:
                for col, value in part.items():
                    table = table.set_column(
                        table.schema.get_field_index(col), col,
                        pa.array([value] * len(table)))
            tables.append(table)
        if len(tables) > 1:
            table = pa.concat_tables(tables)
        else:
            table = tables[0]
        return table

    read_tasks = []
    for pieces_ in np.array_split(pieces, parallelism):
        if len(pieces_) == 0:
            continue
        metadata = _get_metadata(pieces_, schema)
        pieces_ = [cloudpickle.dumps(p) for p in pieces_]
        read_tasks.append(
            ReadTask(lambda pieces=pieces_: read_pieces(pieces), metadata))

    return read_tasks