def csv_write(write_path: str, block: ArrowBlock):
    logger.debug(f"Writing {block.num_rows()} records to {write_path}.")
    # Convert the Arrow block to pandas and append it to the CSV file.
    block.to_pandas().to_csv(write_path, mode="a", header=True, index=False)
def csv_read(read_paths: List[str]):
    logger.debug(f"Reading {len(read_paths)} files.")
    tables = []
    for read_path in read_paths:
        with filesystem.open_input_file(read_path) as f:
            # Read each CSV file into an Arrow table on a single thread.
            tables.append(
                csv.read_csv(
                    f,
                    read_options=csv.ReadOptions(use_threads=False),
                    **arrow_csv_args))
    # Concatenate all per-file tables into one block and report its metadata.
    block = ArrowBlock(pa.concat_tables(tables))
    return block, block.get_metadata(input_files=read_paths)
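# Minimal standalone sketch of the same pyarrow CSV read path, assuming only
# pandas and pyarrow are installed; the file path below and the omission of the
# `arrow_csv_args` closure variable are assumptions, not part of csv_read above.
import pandas as pd
import pyarrow as pa
import pyarrow.csv as csv
import pyarrow.fs as fs

pd.DataFrame({"value": [1, 2, 3]}).to_csv("/tmp/example.csv", index=False)

filesystem = fs.LocalFileSystem()
tables = []
for read_path in ["/tmp/example.csv"]:
    with filesystem.open_input_file(read_path) as f:
        # use_threads=False mirrors the single-threaded read in csv_read above.
        tables.append(
            csv.read_csv(f, read_options=csv.ReadOptions(use_threads=False)))
table = pa.concat_tables(tables)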
def block_to_df(block: ArrowBlock):
    if isinstance(block, (ray.ObjectRef, ClientObjectRef)):
        raise ValueError(
            "Dataset.to_dask() must be used with Dask-on-Ray, please "
            "set the Dask scheduler to ray_dask_get (located in "
            "ray.util.dask).")
    return block.to_pandas()
def make_block(start: int, count: int) -> Block[Union[ArrowRow, int]]:
    if use_arrow:
        # Build a single-column Arrow table holding the integer range.
        return ArrowBlock(
            pyarrow.Table.from_arrays(
                [np.arange(start, start + count)], names=["value"]))
    else:
        # Fall back to a plain Python list block.
        return SimpleBlock(list(builtins.range(start, start + count)))
def read_files(read_paths: List[str]):
    logger.debug(f"Reading {len(read_paths)} files.")
    tables = []
    for read_path in read_paths:
        with filesystem.open_input_file(read_path) as f:
            tables.append(read_file(f, **reader_args))
    # Merge all per-file tables into a single Arrow block.
    return ArrowBlock(pa.concat_tables(tables))
def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
    import pyarrow
    logger.debug("Reading {} parquet pieces".format(len(pieces)))
    tables = [piece.to_table() for piece in pieces]
    # Avoid an unnecessary concatenation when there is only one piece.
    if len(tables) > 1:
        table = pyarrow.concat_tables(tables)
    else:
        table = tables[0]
    return ArrowBlock(table)
def gen_read(pieces: List[pq.ParquetDatasetPiece]):
    import pyarrow
    logger.debug("Reading {} parquet pieces".format(len(pieces)))
    tables = [
        piece.read(columns=columns, use_threads=False, partitions=partitions)
        for piece in pieces
    ]
    if len(tables) > 1:
        table = pyarrow.concat_tables(tables)
    else:
        table = tables[0]
    return ArrowBlock(table)
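# Minimal standalone sketch of reading parquet pieces as dataset fragments,
# assuming only pyarrow is installed; the file path is an assumption, and the
# `columns`/`partitions` closure variables used by gen_read are omitted.
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

pq.write_table(pa.table({"value": [1, 2, 3]}), "/tmp/example.parquet")

pieces = list(
    ds.dataset("/tmp/example.parquet", format="parquet").get_fragments())
tables = [piece.to_table() for piece in pieces]
table = pa.concat_tables(tables) if len(tables) > 1 else tables[0]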
def transform(block: Block[T]) -> Block[U]:
    total_rows = block.num_rows()
    max_batch_size = batch_size
    if max_batch_size is None:
        max_batch_size = total_rows

    builder = DelegatingArrowBlockBuilder()
    for start in range(0, total_rows, max_batch_size):
        # Build a block for each batch.
        end = min(total_rows, start + max_batch_size)
        # Note: if the block is a list, it doesn't support zero-copy.
        view = block.slice(start, end)
        # Present the batch to the UDF in the requested format.
        if batch_format == "pandas":
            view = view.to_pandas()
        elif batch_format == "pyarrow":
            view = view._table
        else:
            raise ValueError(
                f"The given batch format {batch_format} is invalid. "
                f"Supported batch formats: {BatchType}")
        applied = fn(view)
        # Wrap the UDF output back into a block of the appropriate type.
        if isinstance(applied, list):
            applied = ListBlock(applied)
        elif isinstance(applied, pd.core.frame.DataFrame):
            applied = ArrowBlock(pa.Table.from_pandas(applied))
        elif isinstance(applied, pa.Table):
            applied = ArrowBlock(applied)
        else:
            raise ValueError(
                "The map batches UDF returned a value of type "
                f"{type(applied)}, which is not allowed. The return type "
                "must be list, pandas.DataFrame, or pyarrow.Table.")
        builder.add_block(applied)
    return builder.build()
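# Minimal standalone sketch of the same batching pattern applied to a plain
# pyarrow Table with pandas-format batches; the table, batch size, and UDF are
# assumptions for illustration. Note that pyarrow's Table.slice takes
# (offset, length), unlike the (start, end) slice on the Block type above.
import pandas as pd
import pyarrow as pa

table = pa.table({"value": list(range(10))})
batch_size = 4

out_tables = []
for start in range(0, table.num_rows, batch_size):
    length = min(batch_size, table.num_rows - start)
    view = table.slice(start, length).to_pandas()
    applied = view.assign(value=view["value"] * 2)  # stand-in for fn(view)
    out_tables.append(pa.Table.from_pandas(applied))
result = pa.concat_tables(out_tables)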
def df_to_block(df: "pandas.DataFrame"):
    block = ArrowBlock(pa.table(df))
    return block, block.get_metadata(input_files=None)
def block_to_df(block: ArrowBlock):
    return block.to_pandas()
def json_write(write_path: str, block: ArrowBlock):
    logger.debug(f"Writing {block.num_rows()} records to {write_path}.")
    # Convert the Arrow block to pandas and write it out as JSON records.
    block.to_pandas().to_json(write_path, orient="records")
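# Minimal standalone sketch of the pandas-bridged write path used by csv_write
# and json_write above, assuming only pandas and pyarrow; the table contents
# and output paths are assumptions for illustration.
import pyarrow as pa

block_table = pa.table({"value": [1, 2, 3]})
block_table.to_pandas().to_csv(
    "/tmp/out.csv", mode="a", header=True, index=False)
block_table.to_pandas().to_json("/tmp/out.json", orient="records")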