Esempio n. 1
0
 def truncate(block: Block, meta: BlockMetadata,
              count: int) -> (Block, BlockMetadata):
     """Return a copied slice ``[0, count)`` of ``block`` with new metadata.

     Args:
         block: The block to cut down.
         meta: Metadata of ``block``; its ``schema`` and ``input_files``
             are carried over to the result unchanged.
         count: End index handed to ``BlockAccessor.slice``.

     Returns:
         A ``(new_block, new_meta)`` pair. The row count and byte size in
         ``new_meta`` are measured on the sliced block, not copied from
         ``meta``.
     """
     source = BlockAccessor.for_block(block)
     logger.debug("Truncating last block to size: {}".format(count))
     # copy=True is requested so the result is a copy rather than a view.
     truncated = source.slice(0, count, copy=True)
     truncated_view = BlockAccessor.for_block(truncated)
     row_count = truncated_view.num_rows()
     byte_size = truncated_view.size_bytes()
     return truncated, BlockMetadata(
         num_rows=row_count,
         size_bytes=byte_size,
         schema=meta.schema,
         input_files=meta.input_files)
Esempio n. 2
0
 def shuffle_map(block: Block) -> List[Block]:
     """Cut ``block`` into ``output_num_blocks`` consecutive copied slices.

     The slice width is ``ceil(num_rows / output_num_blocks)``, but never
     less than one row. ``output_num_blocks`` is read from the enclosing
     scope.

     Returns:
         The list of slices, or the single slice itself (not wrapped in a
         list) when exactly one slice was produced.
     """
     accessor = BlockAccessor.for_block(block)
     rows_per_slice = max(
         1, math.ceil(accessor.num_rows() / output_num_blocks))
     pieces = [
         accessor.slice(
             idx * rows_per_slice, (idx + 1) * rows_per_slice, copy=True)
         for idx in range(output_num_blocks)
     ]
     # Sanity check: the slices together must cover every input row.
     sliced_total = sum(
         BlockAccessor.for_block(piece).num_rows() for piece in pieces)
     assert sliced_total == accessor.num_rows(), (
         sliced_total, accessor.num_rows())
     # Needed to handle num_returns=1 edge case in Ray API.
     return pieces[0] if len(pieces) == 1 else pieces
Esempio n. 3
0
        def transform(block: Block) -> Block:
            """Apply ``fn`` to ``block`` one batch at a time and rebuild it.

            The block is sliced into consecutive batches of at most
            ``batch_size`` rows (a single batch covering the whole block
            when ``batch_size`` is None). Each batch is converted to the
            format named by ``batch_format``, passed to ``fn``, and the
            result is added to a ``DelegatingArrowBlockBuilder``.
            ``batch_size``, ``batch_format`` and ``fn`` are read from the
            enclosing scope.

            Args:
                block: The input block to transform.

            Returns:
                The block built from all of ``fn``'s outputs. An empty
                input block produces whatever the builder yields with no
                blocks added.

            Raises:
                ValueError: If ``batch_format`` is neither ``"pandas"`` nor
                    ``"pyarrow"``, or if ``fn`` returns something other
                    than a list, ``pandas.DataFrame`` or ``pyarrow.Table``.
            """
            accessor = BlockAccessor.for_block(block)
            total_rows = accessor.num_rows()
            max_batch_size = batch_size
            if max_batch_size is None:
                # Clamp to 1 so an empty block does not produce a zero
                # step, which would make range() raise ValueError below.
                max_batch_size = max(total_rows, 1)

            builder = DelegatingArrowBlockBuilder()

            for start in range(0, total_rows, max_batch_size):
                # Build a block for each batch.
                end = min(total_rows, start + max_batch_size)
                # copy=False: the batch is only an intermediate that is
                # converted to the requested format right away.
                view = accessor.slice(start, end, copy=False)
                if batch_format == "pandas":
                    view = BlockAccessor.for_block(view).to_pandas()
                elif batch_format == "pyarrow":
                    view = BlockAccessor.for_block(view).to_arrow_table()
                else:
                    raise ValueError(
                        f"The given batch format: {batch_format} "
                        f"is invalid. Supported batch type: {BatchType}")

                applied = fn(view)
                if isinstance(applied, pd.core.frame.DataFrame):
                    # DataFrames are converted to Arrow before being added.
                    applied = pa.Table.from_pandas(applied)
                elif not isinstance(applied, (list, pa.Table)):
                    # Lists and Arrow tables are passed through unchanged;
                    # anything else is rejected.
                    raise ValueError("The map batch UDF returns a type "
                                     f"{type(applied)}, which is not allowed. "
                                     "The return type must be either list, "
                                     "pandas.DataFrame, or pyarrow.Table")
                builder.add_block(applied)

            return builder.build()