Esempio n. 1
0
 def transform(block: Block) -> Block:
     """Filter the rows of *block* through the predicate ``fn``.

     Rows for which ``fn(row)`` is truthy are copied into a fresh block
     built with the accessor's own builder.
     """
     accessor = BlockAccessor.for_block(block)
     out = accessor.builder()
     for record in accessor.iter_rows():
         if fn(record):
             out.add(record)
     return out.build()
Esempio n. 2
0
 def csv_write(write_path: str, block: Block):
     """Append *block* to *write_path* as CSV with a header row, no index."""
     accessor = BlockAccessor.for_block(block)
     logger.debug(
         f"Writing {accessor.num_rows()} records to {write_path}.")
     frame = accessor.to_pandas()
     frame.to_csv(write_path, mode="a", header=True, index=False)
Esempio n. 3
0
 def format_batch(batch: Block, format: str) -> BatchType:
     """Convert *batch* to the representation named by ``batch_format``.

     NOTE(review): the ``format`` parameter is never read here; the
     closure variable ``batch_format`` drives the conversion instead —
     confirm with the caller that this is intentional.
     """
     if batch_format == "_blocks":
         # Raw block passthrough: no conversion at all.
         return batch
     if batch_format == "pandas":
         return BlockAccessor.for_block(batch).to_pandas()
     if batch_format == "pyarrow":
         return BlockAccessor.for_block(batch).to_arrow_table()
     raise ValueError(
         f"The given batch format: {batch_format} "
         f"is invalid. Supported batch type: {BatchType}")
Esempio n. 4
0
 def shuffle_map(block: Block) -> List[Block]:
     """Cut *block* into ``output_num_blocks`` contiguous slices.

     Every input row lands in exactly one slice (asserted below).
     """
     accessor = BlockAccessor.for_block(block)
     total = accessor.num_rows()
     slice_sz = max(1, math.ceil(total / output_num_blocks))
     slices = [
         accessor.slice(i * slice_sz, (i + 1) * slice_sz, copy=True)
         for i in range(output_num_blocks)
     ]
     sliced_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
     assert sliced_rows == total, (sliced_rows, total)
     # Needed to handle num_returns=1 edge case in Ray API.
     return slices[0] if len(slices) == 1 else slices
Esempio n. 5
0
 def transform(block: Block) -> Block:
     """Flat-map each row of *block* through ``fn`` into one output block.

     ``fn`` yields zero or more output rows per input row; all outputs
     are accumulated into a single built block.
     """
     accessor = BlockAccessor.for_block(block)
     out = DelegatingArrowBlockBuilder()
     for record in accessor.iter_rows():
         for produced in fn(record):
             out.add(produced)
     return out.build()
Esempio n. 6
0
 def block_to_df(block: Block):
     """Return *block* converted to a pandas DataFrame.

     Raises:
         ValueError: if *block* is still an object reference, meaning the
             computation was not routed through Dask-on-Ray.
     """
     # Check the RAW input before wrapping it: in the original code the
     # guard ran after ``block = BlockAccessor.for_block(block)`` rebound
     # the name, so ``block`` could never be an ObjectRef at check time
     # and the guard was unreachable.
     if isinstance(block, (ray.ObjectRef, ClientObjectRef)):
         raise ValueError(
             "Dataset.to_dask() must be used with Dask-on-Ray, please "
             "set the Dask scheduler to ray_dask_get (located in "
             "ray.util.dask).")
     block = BlockAccessor.for_block(block)
     return block.to_pandas()
Esempio n. 7
0
 def truncate(block: Block, meta: BlockMetadata,
              count: int) -> (Block, BlockMetadata):
     """Copy the first *count* rows of *block* and rebuild its metadata.

     Schema and input files are carried over from the old *meta*; row
     count and byte size are recomputed from the truncated block.
     """
     logger.debug("Truncating last block to size: {}".format(count))
     source = BlockAccessor.for_block(block)
     new_block = source.slice(0, count, copy=True)
     new_accessor = BlockAccessor.for_block(new_block)
     new_meta = BlockMetadata(
         num_rows=new_accessor.num_rows(),
         size_bytes=new_accessor.size_bytes(),
         schema=meta.schema,
         input_files=meta.input_files)
     return new_block, new_meta
Esempio n. 8
0
        def transform(block: Block) -> Block:
            """Apply ``fn`` to fixed-size batches of rows from *block*.

            The block is cut into batches of at most ``batch_size`` rows
            (the whole block when ``batch_size`` is None), each batch is
            converted to the representation named by ``batch_format``,
            passed to ``fn``, and the results are concatenated into one
            output block.

            Raises:
                ValueError: for an unknown ``batch_format``, or when
                    ``fn`` returns something other than a list,
                    pandas.DataFrame, or pyarrow.Table.
            """
            block = BlockAccessor.for_block(block)
            total_rows = block.num_rows()
            max_batch_size = batch_size
            if max_batch_size is None:
                max_batch_size = total_rows

            builder = DelegatingArrowBlockBuilder()

            for start in range(0, total_rows, max_batch_size):
                # Build a block for each batch.
                end = min(total_rows, start + max_batch_size)
                view = block.slice(start, end, copy=False)
                if batch_format == "pandas":
                    view = BlockAccessor.for_block(view).to_pandas()
                elif batch_format == "pyarrow":
                    view = BlockAccessor.for_block(view).to_arrow_table()
                else:
                    raise ValueError(
                        f"The given batch format: {batch_format} "
                        f"is invalid. Supported batch type: {BatchType}")

                applied = fn(view)
                # list and pa.Table pass through unchanged; a DataFrame is
                # converted to Arrow. ``pd.DataFrame`` is the public alias
                # of ``pd.core.frame.DataFrame`` used by the original code.
                if isinstance(applied, pd.DataFrame):
                    applied = pa.Table.from_pandas(applied)
                elif not isinstance(applied, (list, pa.Table)):
                    raise ValueError("The map batch UDF returns a type "
                                     f"{type(applied)}, which is not allowed. "
                                     "The return type must be either list, "
                                     "pandas.DataFrame, or pyarrow.Table")
                builder.add_block(applied)

            return builder.build()
Esempio n. 9
0
 def json_write(write_path: str, block: Block):
     """Write *block* to *write_path* as JSON in ``records`` orientation."""
     accessor = BlockAccessor.for_block(block)
     logger.debug(
         f"Writing {accessor.num_rows()} records to {write_path}.")
     accessor.to_pandas().to_json(write_path, orient="records")
Esempio n. 10
0
 def block_to_df(block: Block):
     """Return *block* converted to a pyarrow Table.

     NOTE(review): despite the ``_df`` name this returns an Arrow table,
     not a DataFrame — confirm the name against the caller's expectation.
     """
     accessor = BlockAccessor.for_block(block)
     return accessor.to_arrow_table()
Esempio n. 11
0
 def block_to_df(block: Block):
     """Return *block* converted to a pandas DataFrame."""
     return BlockAccessor.for_block(block).to_pandas()
Esempio n. 12
0
 def write(self, block: Block) -> str:
     """Count *block*'s rows against this writer and return ``"ok"``.

     Raises:
         ValueError: when the writer has been disabled.
     """
     accessor = BlockAccessor.for_block(block)
     if not self.enabled:
         raise ValueError("disabled")
     self.rows_written += accessor.num_rows()
     return "ok"
Esempio n. 13
0
 def agg(block: Block) -> int:
     """Return the sum of all row values in *block*."""
     accessor = BlockAccessor.for_block(block)
     return sum(accessor.iter_rows())
Esempio n. 14
0
 def count(block: Block) -> int:
     """Return the number of rows in *block*."""
     return BlockAccessor.for_block(block).num_rows()