def truncate(block: Block, meta: BlockMetadata, count: int) -> (Block, BlockMetadata):
    """Cut ``block`` down to ``count`` and rebuild its metadata.

    Args:
        block: The block to truncate.
        meta: Metadata of ``block``; its ``schema`` and ``input_files`` are
            carried over to the new metadata unchanged.
        count: End bound passed to ``slice(0, count, copy=True)``
            (presumably a row count -- confirm against ``BlockAccessor``).

    Returns:
        A ``(new_block, new_meta)`` pair, where ``new_meta`` has ``num_rows``
        and ``size_bytes`` recomputed from the sliced block.
    """
    source = BlockAccessor.for_block(block)
    logger.debug("Truncating last block to size: {}".format(count))
    truncated = source.slice(0, count, copy=True)

    # Row count and byte size are measured on the sliced copy, not taken
    # from the incoming metadata.
    truncated_accessor = BlockAccessor.for_block(truncated)
    truncated_meta = BlockMetadata(
        num_rows=truncated_accessor.num_rows(),
        size_bytes=truncated_accessor.size_bytes(),
        schema=meta.schema,
        input_files=meta.input_files,
    )
    return truncated, truncated_meta
def shuffle_map(block: Block) -> List[Block]:
    """Split ``block`` into ``output_num_blocks`` consecutive slices.

    ``output_num_blocks`` is not a parameter; it is read from the enclosing
    scope.

    Args:
        block: The block to split.

    Returns:
        The list of slices, or the single slice itself (not wrapped in a
        list) when exactly one slice was produced.
    """
    accessor = BlockAccessor.for_block(block)
    # Ceil-divide so the slices together span every row; never go below 1.
    rows_per_slice = max(1, math.ceil(accessor.num_rows() / output_num_blocks))
    pieces = [
        accessor.slice(idx * rows_per_slice, (idx + 1) * rows_per_slice, copy=True)
        for idx in range(output_num_blocks)
    ]

    # Sanity check: slicing must neither drop nor duplicate rows.
    sliced_rows = sum(BlockAccessor.for_block(piece).num_rows() for piece in pieces)
    assert sliced_rows == accessor.num_rows(), (sliced_rows, accessor.num_rows())

    # Needed to handle num_returns=1 edge case in Ray API.
    return pieces[0] if len(pieces) == 1 else pieces
def transform(block: Block) -> Block:
    """Apply ``fn`` to ``block`` one batch at a time and merge the results.

    ``batch_size``, ``batch_format`` and ``fn`` are not parameters; they are
    read from the enclosing scope. When ``batch_size`` is ``None`` the whole
    block is processed as a single batch.

    Args:
        block: The input block to split into batches.

    Returns:
        The block built from the per-batch results. An input with zero rows
        produces no batches, so ``fn`` is never called and the builder's
        empty result is returned.

    Raises:
        ValueError: If a batch is processed and ``batch_format`` is neither
            ``"pandas"`` nor ``"pyarrow"``, or if ``fn`` returns something
            other than a list, a ``pandas.DataFrame`` or a ``pyarrow.Table``.
    """
    block = BlockAccessor.for_block(block)
    total_rows = block.num_rows()
    max_batch_size = batch_size
    if max_batch_size is None:
        # Clamp to 1: with an empty block, a step of 0 would make range()
        # raise "ValueError: range() arg 3 must not be zero".
        max_batch_size = max(total_rows, 1)

    builder = DelegatingArrowBlockBuilder()
    for start in range(0, total_rows, max_batch_size):
        # Build a block for each batch.
        end = min(total_rows, start + max_batch_size)
        view = block.slice(start, end, copy=False)
        if batch_format == "pandas":
            view = BlockAccessor.for_block(view).to_pandas()
        elif batch_format == "pyarrow":
            view = BlockAccessor.for_block(view).to_arrow_table()
        else:
            raise ValueError(
                f"The given batch format: {batch_format} "
                f"is invalid. Supported batch type: {BatchType}")

        applied = fn(view)
        if isinstance(applied, pd.core.frame.DataFrame):
            # DataFrames are converted to Arrow before being added; lists
            # and Arrow tables are handed to the builder unchanged.
            applied = pa.Table.from_pandas(applied)
        elif not isinstance(applied, (list, pa.Table)):
            raise ValueError("The map batch UDF returns a type "
                             f"{type(applied)}, which is not allowed. "
                             "The return type must be either list, "
                             "pandas.DataFrame, or pyarrow.Table")
        builder.add_block(applied)

    return builder.build()