Example #1
 def csv_write(write_path: str, block: ArrowBlock):
     logger.debug(
         f"Writing {block.num_rows()} records to {write_path}.")
     # Note: header=True with mode="a" writes a header on every call, so
     # repeated appends to the same path would duplicate it.
     block.to_pandas().to_csv(write_path,
                              mode="a",
                              header=True,
                              index=False)
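A minimal standalone sketch of the same write path, using plain pyarrow and pandas rather than Ray's ArrowBlock wrapper (the table contents and output path are hypothetical):

 import pyarrow as pa

 table = pa.table({"value": [1, 2, 3]})
 # ArrowBlock.to_pandas() above performs the same Table-to-DataFrame
 # conversion before handing off to pandas' CSV writer.
 table.to_pandas().to_csv("/tmp/out.csv", mode="a", header=True, index=False)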
Example #2
 def csv_read(read_paths: List[str]):
     # `filesystem`, `arrow_csv_args`, `csv` (pyarrow.csv), and `pa`
     # (pyarrow) are captured from the enclosing scope.
     logger.debug(f"Reading {len(read_paths)} files.")
     tables = []
     for read_path in read_paths:
         with filesystem.open_input_file(read_path) as f:
             tables.append(
                 csv.read_csv(
                     f,
                     read_options=csv.ReadOptions(use_threads=False),
                     **arrow_csv_args))
     block = ArrowBlock(pa.concat_tables(tables))
     return block, block.get_metadata(input_files=read_paths)
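The same pattern can be sketched without the Ray internals; pyarrow.fs.LocalFileSystem stands in for the captured `filesystem`, and the paths are hypothetical local CSV files:

 import pyarrow as pa
 from pyarrow import csv, fs

 def read_csvs(paths):
     filesystem = fs.LocalFileSystem()
     tables = []
     for path in paths:
         with filesystem.open_input_file(path) as f:
             tables.append(csv.read_csv(
                 f, read_options=csv.ReadOptions(use_threads=False)))
     return pa.concat_tables(tables)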
Example #3
 def block_to_df(block: ArrowBlock):
     if isinstance(block, (ray.ObjectRef, ClientObjectRef)):
         raise ValueError(
             "Dataset.to_dask() must be used with Dask-on-Ray, please "
             "set the Dask scheduler to ray_dask_get (located in "
             "ray.util.dask).")
     return block.to_pandas()
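The error message describes the required setup: Dask-on-Ray must be active before Dataset.to_dask() results are computed. A minimal configuration sketch, with the scheduler name taken from the message above:

 import dask
 from ray.util.dask import ray_dask_get

 # Route Dask computations through the Ray scheduler so that
 # block_to_df receives materialized blocks instead of ObjectRefs.
 dask.config.set(scheduler=ray_dask_get)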
Example #4
 def make_block(start: int, count: int) -> Block[Union[ArrowRow, int]]:
     # `use_arrow` is captured from the enclosing scope and selects
     # between the Arrow and plain-list block representations.
     if use_arrow:
         return ArrowBlock(
             pyarrow.Table.from_arrays(
                 [np.arange(start, start + count)], names=["value"]))
     else:
         return SimpleBlock(list(builtins.range(start, start + count)))
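For reference, the Arrow branch builds a one-column table straight from a NumPy range; a standalone sketch with hypothetical start/count values:

 import numpy as np
 import pyarrow

 # Equivalent to the `use_arrow` branch above for start=0, count=5.
 table = pyarrow.Table.from_arrays(
     [np.arange(0, 5)], names=["value"])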
Example #5
 def read_files(read_paths: List[str]):
     # `filesystem`, `read_file`, and `reader_args` are captured from
     # the enclosing scope; the pattern matches csv_read above.
     logger.debug(f"Reading {len(read_paths)} files.")
     tables = []
     for read_path in read_paths:
         with filesystem.open_input_file(read_path) as f:
             tables.append(read_file(f, **reader_args))
     return ArrowBlock(pa.concat_tables(tables))
Example #6
 def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
     import pyarrow
     logger.debug("Reading {} parquet pieces".format(len(pieces)))
     tables = [piece.to_table() for piece in pieces]
     # Skip the concatenation when there is only a single piece.
     if len(tables) > 1:
         table = pyarrow.concat_tables(tables)
     else:
         table = tables[0]
     return ArrowBlock(table)
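A standalone sketch of obtaining such fragments with the pyarrow.dataset API named in the type hint (the dataset path is hypothetical and assumed to contain at least one fragment):

 import pyarrow
 import pyarrow.dataset as ds

 dataset = ds.dataset("/tmp/data", format="parquet")
 pieces = list(dataset.get_fragments())
 tables = [piece.to_table() for piece in pieces]
 table = pyarrow.concat_tables(tables) if len(tables) > 1 else tables[0]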
Example #7
 def gen_read(pieces: List[pq.ParquetDatasetPiece]):
     # `columns` and `partitions` are captured from the enclosing scope;
     # `pq` is pyarrow.parquet (ParquetDatasetPiece is part of the
     # legacy ParquetDataset API).
     import pyarrow
     logger.debug("Reading {} parquet pieces".format(len(pieces)))
     tables = [
         piece.read(columns=columns,
                    use_threads=False,
                    partitions=partitions) for piece in pieces
     ]
     if len(tables) > 1:
         table = pyarrow.concat_tables(tables)
     else:
         table = tables[0]
     return ArrowBlock(table)
Example #8
 def transform(block: Block[T]) -> Block[U]:
     total_rows = block.num_rows()
     max_batch_size = batch_size
     if max_batch_size is None:
         max_batch_size = total_rows

     builder = DelegatingArrowBlockBuilder()

     for start in range(0, total_rows, max_batch_size):
         # Build a block for each batch.
         end = min(total_rows, start + max_batch_size)
         # Note: if the block is a list, it doesn't support zero-copy.
         view = block.slice(start, end)
         if batch_format == "pandas":
             view = view.to_pandas()
         elif batch_format == "pyarrow":
             view = view._table
         else:
             raise ValueError(
                 f"The given batch format: {batch_format} "
                 f"is invalid. Supported batch type: {BatchType}")

         applied = fn(view)
         if isinstance(applied, list):
             applied = ListBlock(applied)
         elif isinstance(applied, pd.DataFrame):
             applied = ArrowBlock(pa.Table.from_pandas(applied))
         elif isinstance(applied, pa.Table):
             applied = ArrowBlock(applied)
         else:
             raise ValueError("The map batch UDF returns a type "
                              f"{type(applied)}, which is not allowed. "
                              "The return type must be either list, "
                              "pandas.DataFrame, or pyarrow.Table")
         builder.add_block(applied)

     return builder.build()
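The batching loop can be sketched against a bare pyarrow.Table. Note one difference: pyarrow's Table.slice takes (offset, length) and clamps past the end, while Block.slice above takes (start, end). Inputs are hypothetical:

 import pyarrow as pa

 table = pa.table({"value": list(range(10))})
 batch_size = 4
 batches = []
 for start in range(0, table.num_rows, batch_size):
     # slice() clamps at the table end, so no min() is needed here.
     batches.append(table.slice(start, batch_size).to_pandas())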
Example #9
 def df_to_block(df: "pandas.DataFrame"):
     block = ArrowBlock(pa.table(df))
     return block, block.get_metadata(input_files=None)
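pa.table() accepts a pandas DataFrame directly, so the conversion round-trips; a small sketch with hypothetical data:

 import pandas as pd
 import pyarrow as pa

 df = pd.DataFrame({"value": [1, 2, 3]})
 table = pa.table(df)
 assert table.to_pandas().equals(df)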
Example #10
 def block_to_df(block: ArrowBlock):
     return block.to_pandas()
Example #11
 def json_write(write_path: str, block: ArrowBlock):
     logger.debug(
         f"Writing {block.num_rows()} records to {write_path}.")
     block.to_pandas().to_json(write_path, orient="records")
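orient="records" serializes each row as one JSON object; a hypothetical two-row frame shows the resulting shape:

 import pandas as pd

 df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
 # Writes: [{"a":1,"b":"x"},{"a":2,"b":"y"}]
 df.to_json("/tmp/out.json", orient="records")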