def for_block(block: Block) -> "BlockAccessor[T]":
    """Create a block accessor for the given block."""
    _check_pyarrow_version()
    import pyarrow
    import pandas

    if isinstance(block, pyarrow.Table):
        from ray.data.impl.arrow_block import ArrowBlockAccessor

        return ArrowBlockAccessor(block)
    elif isinstance(block, pandas.DataFrame):
        from ray.data.impl.pandas_block import PandasBlockAccessor

        return PandasBlockAccessor(block)
    elif isinstance(block, bytes):
        from ray.data.impl.arrow_block import ArrowBlockAccessor

        return ArrowBlockAccessor.from_bytes(block)
    elif isinstance(block, list):
        from ray.data.impl.simple_block import SimpleBlockAccessor

        return SimpleBlockAccessor(block)
    else:
        raise TypeError("Not a block type: {} ({})".format(block, type(block)))
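
# Usage sketch (not part of the accessor above): for_block dispatches on the
# concrete block type, so the same accessor interface covers simple (list) blocks
# and Arrow table blocks alike. This assumes BlockAccessor is importable from
# ray.data.block, as in the Ray versions these snippets come from.
import pyarrow as pa
from ray.data.block import BlockAccessor

for blk in ([1, 2, 3], pa.table({"value": [1, 2, 3]})):
    acc = BlockAccessor.for_block(blk)
    # num_rows() and size_bytes() are part of the common accessor interface.
    print(type(acc).__name__, acc.num_rows(), acc.size_bytes())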
def prepare_read(self, parallelism: int, n: int,
                 num_columns: int) -> List[ReadTask]:
    _check_pyarrow_version()
    import pyarrow

    read_tasks: List[ReadTask] = []
    block_size = max(1, n // parallelism)

    def make_block(count: int, num_columns: int) -> Block:
        return pyarrow.Table.from_arrays(
            np.random.randint(
                np.iinfo(np.int64).max,
                size=(num_columns, count),
                dtype=np.int64),
            names=[f"c_{i}" for i in range(num_columns)])

    schema = pyarrow.Table.from_pydict(
        {f"c_{i}": [0] for i in range(num_columns)}).schema

    i = 0
    while i < n:
        count = min(block_size, n - i)
        read_tasks.append(
            ReadTask(
                lambda count=count, num_columns=num_columns: make_block(
                    count, num_columns),
                BlockMetadata(
                    num_rows=count,
                    size_bytes=8 * count * num_columns,
                    schema=schema,
                    input_files=None)))
        i += block_size

    return read_tasks
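
# Illustration (a standalone sketch, not part of the datasource above): the while
# loop splits n rows into roughly `parallelism` read tasks, and each task freezes
# its own `count` via lambda default arguments. The helper below reproduces just
# the chunking arithmetic so it can be checked in isolation.
def split_rows_into_chunks(n: int, parallelism: int) -> list:
    block_size = max(1, n // parallelism)
    chunks = []
    i = 0
    while i < n:
        chunks.append(min(block_size, n - i))
        i += block_size
    return chunks


# 10 rows across 3 tasks -> [3, 3, 3, 1]; a final short task picks up the remainder.
assert split_rows_into_chunks(10, 3) == [3, 3, 3, 1]
assert sum(split_rows_into_chunks(10, 3)) == 10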
def prepare_read(self,
                 parallelism: int,
                 n: int,
                 block_format: str = "list",
                 tensor_shape: Tuple = (1, )) -> List[ReadTask]:
    read_tasks: List[ReadTask] = []
    block_size = max(1, n // parallelism)

    # Example of a read task. In a real datasource, this would pull data
    # from an external system instead of generating dummy data.
    def make_block(start: int, count: int) -> Block:
        if block_format == "arrow":
            return pyarrow.Table.from_arrays(
                [np.arange(start, start + count)], names=["value"])
        elif block_format == "tensor":
            tensor = TensorArray(
                np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                    np.arange(start, start + count),
                    tuple(range(1, 1 + len(tensor_shape)))))
            return pyarrow.Table.from_pydict({"value": tensor})
        else:
            return list(builtins.range(start, start + count))

    i = 0
    while i < n:
        count = min(block_size, n - i)
        if block_format == "arrow":
            _check_pyarrow_version()
            import pyarrow
            schema = pyarrow.Table.from_pydict({"value": [0]}).schema
        elif block_format == "tensor":
            _check_pyarrow_version()
            from ray.data.extensions import TensorArray
            import pyarrow
            tensor = TensorArray(
                np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                    np.arange(0, 10),
                    tuple(range(1, 1 + len(tensor_shape)))))
            schema = pyarrow.Table.from_pydict({"value": tensor}).schema
        elif block_format == "list":
            schema = int
        else:
            raise ValueError("Unsupported block type", block_format)
        meta = BlockMetadata(
            num_rows=count,
            size_bytes=8 * count,
            schema=schema,
            input_files=None,
            exec_stats=None)
        read_tasks.append(
            ReadTask(lambda i=i, count=count: [make_block(i, count)], meta))
        i += block_size

    return read_tasks
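
# A numpy-only check (illustrative, not part of the datasource): the "tensor"
# branch builds one fixed-shape tensor per row by broadcasting an arange of row
# values against np.ones(tensor_shape). For tensor_shape=(2, 2) and rows 0..2,
# the result is a (3, 2, 2) array whose k-th slice is filled with k.
import numpy as np

tensor_shape = (2, 2)
values = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
    np.arange(0, 3), tuple(range(1, 1 + len(tensor_shape))))

assert values.shape == (3, 2, 2)
assert (values[2] == 2).all()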
def for_block(block: Block) -> "BlockAccessor[T]":
    """Create a block accessor for the given block."""
    _check_pyarrow_version()
    import pyarrow

    if isinstance(block, pyarrow.Table):
        from ray.data.impl.arrow_block import ArrowBlockAccessor
        return ArrowBlockAccessor(block)
    elif isinstance(block, list):
        from ray.data.impl.simple_block import SimpleBlockAccessor
        return SimpleBlockAccessor(block)
    elif isinstance(block, np.ndarray):
        from ray.data.impl.tensor_block import TensorBlockAccessor
        return TensorBlockAccessor(block)
    else:
        raise TypeError("Not a block type: {}".format(block))
def prepare_read(self,
                 parallelism: int,
                 paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
                 _block_udf: Optional[Callable[[Block], Block]] = None,
                 **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource."""
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths,
        **dataset_kwargs,
        filesystem=filesystem,
        use_legacy_dataset=False)
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema(
            [schema.field(column) for column in columns], schema.metadata)

    def read_pieces(serialized_pieces: List[str]):
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        pieces: List["pyarrow._dataset.ParquetFileFragment"] = [
            cloudpickle.loads(p) for p in serialized_pieces
        ]

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        use_threads = reader_args.pop("use_threads", False)
        tables = []
        for piece in pieces:
            table = piece.to_table(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                **reader_args)
            part = _get_partition_keys(piece.partition_expression)
            if part:
                for col, value in part.items():
                    table = table.set_column(
                        table.schema.get_field_index(col), col,
                        pa.array([value] * len(table)))
            # If the table is empty, drop it.
            if table.num_rows > 0:
                tables.append(table)
        if len(tables) > 1:
            table = pa.concat_tables(tables, promote=True)
        elif len(tables) == 1:
            table = tables[0]
        if _block_udf is not None:
            table = _block_udf(table)
        # If len(tables) == 0, all fragments were empty, and we return the
        # empty table from the last fragment.
        return table

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            logger.info(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True)
            inferred_schema = schema
    else:
        inferred_schema = schema

    read_tasks = []
    for pieces in np.array_split(pq_ds.pieces, parallelism):
        if len(pieces) == 0:
            continue
        metadata = _get_metadata(pieces, inferred_schema)
        pieces = [cloudpickle.dumps(p) for p in pieces]
        read_tasks.append(
            ReadTask(lambda pieces_=pieces: read_pieces(pieces_), metadata))

    return read_tasks
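
# Side note (illustrative sketch, not from the source): the read tasks above are
# built with `lambda pieces_=pieces: ...` rather than a plain closure. Binding the
# loop variable as a default argument freezes its value per iteration; a bare
# closure would see only the final value of the loop variable.
late_bound = [lambda: i for i in range(3)]
early_bound = [lambda i=i: i for i in range(3)]

assert [f() for f in late_bound] == [2, 2, 2]   # all lambdas share the final i
assert [f() for f in early_bound] == [0, 1, 2]  # default arg captures i per iteration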
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_infos = _expand_paths(paths, filesystem)
    file_sizes = [file_info.size for file_info in file_infos]

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            with fs.open_input_stream(read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        if self._rows_per_file() is None:
            num_rows = None
        else:
            num_rows = len(read_paths) * self._rows_per_file()
        meta = BlockMetadata(
            num_rows=num_rows,
            size_bytes=sum(file_sizes),
            schema=schema,
            input_files=read_paths,
            exec_stats=None,
        )  # Exec stats filled in later.
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks
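
# Quick illustration (a sketch, not part of the datasource above): np.array_split
# divides the expanded file paths and their sizes into at most `parallelism`
# groups, absorbing any remainder into the earlier groups. The file names here
# are hypothetical.
import numpy as np

paths = [f"file_{i}.csv" for i in range(5)]
groups = np.array_split(paths, 3)

# 5 paths across 3 read tasks -> group sizes [2, 2, 1].
assert [len(g) for g in groups] == [2, 2, 1]
assert [str(p) for p in groups[0]] == ["file_0.csv", "file_1.csv"]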
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: PathPartitionFilter = None,
    # TODO(ekl) deprecate this once read fusion is available.
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_sizes = meta_provider.expand_paths(paths, filesystem)
    if partition_filter is not None:
        paths = partition_filter(paths)

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            compression = open_stream_args.pop("compression", None)
            if compression is None:
                import pyarrow as pa

                try:
                    # If no compression manually given, try to detect
                    # compression codec from path.
                    compression = pa.Codec.detect(read_path).name
                except (ValueError, TypeError):
                    # Arrow's compression inference on the file path
                    # doesn't work for Snappy, so we double-check ourselves.
                    import pathlib

                    suffix = pathlib.Path(read_path).suffix
                    if suffix and suffix[1:] == "snappy":
                        compression = "snappy"
                    else:
                        compression = None
            if compression == "snappy":
                # Pass Snappy compression as a reader arg, so datasource subclasses
                # can manually handle streaming decompression in
                # self._read_stream().
                reader_args["compression"] = compression
                reader_args["filesystem"] = fs
            elif compression is not None:
                # Non-Snappy compression, pass as open_input_stream() arg so Arrow
                # can take care of streaming decompression for us.
                open_stream_args["compression"] = compression
            with self._open_input_source(fs, read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    # fix https://github.com/ray-project/ray/issues/24296
    parallelism = min(parallelism, len(paths))

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        meta = meta_provider(
            read_paths,
            schema,
            rows_per_file=self._rows_per_file(),
            file_sizes=file_sizes,
        )
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks
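
# Standalone sketch of the compression-detection fallback above (pyarrow only; the
# file names are hypothetical). pa.Codec.detect() infers a codec from the file
# extension, but, as the comment above notes, that inference does not cover Snappy
# and raises for unknown extensions, so the suffix is checked by hand as a fallback.
import pathlib
import pyarrow as pa


def detect_compression(path: str):
    try:
        return pa.Codec.detect(path).name
    except (ValueError, TypeError):
        suffix = pathlib.Path(path).suffix
        if suffix and suffix[1:] == "snappy":
            return "snappy"
        return None


print(detect_compression("data.csv.gz"))       # e.g. "gzip"
print(detect_compression("data.json.snappy"))  # "snappy", via the manual fallback
print(detect_compression("data.csv"))          # None for an uncompressed file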
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource."""
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths, **dataset_kwargs, filesystem=filesystem, use_legacy_dataset=False
    )
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema(
            [schema.field(column) for column in columns], schema.metadata
        )

    def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        try:
            _register_parquet_file_fragment_serialization()
            pieces: List[
                "pyarrow._dataset.ParquetFileFragment"
            ] = cloudpickle.loads(serialized_pieces)
        finally:
            _deregister_parquet_file_fragment_serialization()

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        use_threads = reader_args.pop("use_threads", False)
        for piece in pieces:
            part = _get_partition_keys(piece.partition_expression)
            batches = piece.to_batches(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                **reader_args,
            )
            for batch in batches:
                table = pa.Table.from_batches([batch], schema=schema)
                if part:
                    for col, value in part.items():
                        table = table.set_column(
                            table.schema.get_field_index(col),
                            col,
                            pa.array([value] * len(table)),
                        )
                # If the table is empty, drop it.
                if table.num_rows > 0:
                    output_buffer.add_block(table)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            logger.debug(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True,
            )
            inferred_schema = schema
    else:
        inferred_schema = schema

    read_tasks = []
    metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
    try:
        _register_parquet_file_fragment_serialization()
        for pieces, metadata in zip(
            np.array_split(pq_ds.pieces, parallelism),
            np.array_split(metadata, parallelism),
        ):
            if len(pieces) <= 0:
                continue
            serialized_pieces = cloudpickle.dumps(pieces)
            input_files = [p.path for p in pieces]
            meta = meta_provider(
                input_files,
                inferred_schema,
                pieces=pieces,
                prefetched_metadata=metadata,
            )
            read_tasks.append(
                ReadTask(lambda p=serialized_pieces: read_pieces(p), meta)
            )
    finally:
        _deregister_parquet_file_fragment_serialization()

    return read_tasks
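
# Illustrative sketch (not from the source) of the dummy-table trick used above:
# running an empty table built from the input schema through the block UDF yields
# the UDF's output schema without reading any data. The UDF below is hypothetical.
import pyarrow as pa

schema = pa.schema([("value", pa.int64())])


def block_udf(table: pa.Table) -> pa.Table:
    # Hypothetical UDF: add a derived column.
    doubled = pa.array(
        [v * 2 for v in table.column("value").to_pylist()], type=pa.int64())
    return table.append_column("doubled", doubled)


inferred_schema = block_udf(schema.empty_table()).schema
assert inferred_schema.names == ["value", "doubled"]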
def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import pyarrow as pa
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_infos = _expand_paths(paths, filesystem)
    file_sizes = [file_info.size for file_info in file_infos]

    read_file = self._read_file

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        builder = DelegatingArrowBlockBuilder()
        for read_path in read_paths:
            with fs.open_input_stream(read_path, **open_stream_args) as f:
                data = read_file(f, read_path, **reader_args)
                if isinstance(data, pa.Table) or isinstance(data, np.ndarray):
                    builder.add_block(data)
                else:
                    builder.add(data)
        block = builder.build()
        if _block_udf is not None:
            block = _block_udf(block)
        return block

    read_tasks = []
    for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism),
            np.array_split(file_sizes, parallelism)):
        if len(read_paths) <= 0:
            continue

        if self._rows_per_file() is None:
            num_rows = None
        else:
            num_rows = len(read_paths) * self._rows_per_file()
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem),
            BlockMetadata(
                num_rows=num_rows,
                size_bytes=sum(file_sizes),
                schema=schema,
                input_files=read_paths))
        read_tasks.append(read_task)

    return read_tasks
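
# Usage sketch (hedged, not from the source): a minimal subclass of the file-based
# datasource pictured above only needs to implement _read_file() (or _read_stream()
# for the streaming variants). This assumes FileBasedDatasource is importable from
# ray.data.datasource, as in the Ray versions these snippets come from; the path
# below is hypothetical.
import ray
from ray.data.datasource import FileBasedDatasource


class TextLinesDatasource(FileBasedDatasource):
    def _read_file(self, f, path: str, **reader_args):
        # `f` is an opened pyarrow input stream; return a simple (list) block
        # of the file's decoded lines.
        return f.read().decode("utf-8").splitlines()


ds = ray.data.read_datasource(TextLinesDatasource(), paths="/tmp/example.txt")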
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    # TODO(ekl) deprecate this once read fusion is available.
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_sizes = meta_provider.expand_paths(paths, filesystem)

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            with fs.open_input_stream(read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        meta = meta_provider(
            read_paths,
            schema,
            rows_per_file=self._rows_per_file(),
            file_sizes=file_sizes,
        )
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks