def _map_block_split(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> BlockPartition:
    """Run ``block_fn`` over ``block`` and ``ray.put`` every block it yields.

    Args:
        block: The input block handed to ``block_fn``.
        block_fn: Callable invoked as ``block_fn(block, *args, **fn_kwargs)``;
            it is iterated, and every item it yields is treated as a block.
        input_files: Recorded verbatim in each output block's metadata.
        fn: Optional UDF. When not None it is prepended to ``fn_args``.
        *fn_args: Extra positional arguments forwarded to ``block_fn``.
        **fn_kwargs: Extra keyword arguments forwarded to ``block_fn``.

    Returns:
        A list of ``(object_ref, metadata)`` pairs, one per yielded block.
    """
    partition = []
    timer = BlockExecStats.builder()
    # The UDF, when supplied, travels as the first positional argument.
    call_args = fn_args if fn is None else (fn,) + fn_args
    for out_block in block_fn(block, *call_args, **fn_kwargs):
        out_accessor = BlockAccessor.for_block(out_block)
        out_meta = BlockMetadata(
            num_rows=out_accessor.num_rows(),
            size_bytes=out_accessor.size_bytes(),
            schema=out_accessor.schema(),
            input_files=input_files,
            exec_stats=timer.build(),
        )
        block_owner = DatasetContext.get_current().block_owner
        out_ref = ray.put(out_block, _owner=block_owner)
        partition.append((out_ref, out_meta))
        # Start a fresh stats builder for the next yielded block.
        timer = BlockExecStats.builder()
    return partition
def _map_block_split(block: Block, fn: Any, input_files: List[str]) -> BlockPartition:
    """Apply ``fn`` to ``block`` and ``ray.put`` every block it yields.

    Args:
        block: The input block handed to ``fn``.
        fn: Callable whose result is iterated; each item is treated as a block.
        input_files: Recorded verbatim in each output block's metadata.

    Returns:
        A list of ``(object_ref, metadata)`` pairs, one per yielded block.
    """
    partition = []
    timer = BlockExecStats.builder()
    for out_block in fn(block):
        out_accessor = BlockAccessor.for_block(out_block)
        out_meta = BlockMetadata(
            num_rows=out_accessor.num_rows(),
            size_bytes=out_accessor.size_bytes(),
            schema=out_accessor.schema(),
            input_files=input_files,
            exec_stats=timer.build(),
        )
        block_owner = DatasetContext.get_current().block_owner
        out_ref = ray.put(out_block, _owner=block_owner)
        partition.append((out_ref, out_meta))
        # Start a fresh stats builder for the next yielded block.
        timer = BlockExecStats.builder()
    return partition
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    The items are cut into consecutive chunks of
    ``max(1, len(items) // parallelism)`` elements; each chunk becomes one
    block that is stored with ``ray.put``.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    chunk_len = max(1, len(items) // parallelism)

    block_refs: List[ObjectRef[Block]] = []
    block_metas: List[BlockMetadata] = []
    for start in range(0, len(items), chunk_len):
        timer = BlockExecStats.builder()
        chunk_builder = DelegatingBlockBuilder()
        for element in items[start:start + chunk_len]:
            chunk_builder.add(element)
        chunk = chunk_builder.build()
        block_refs.append(ray.put(chunk))
        chunk_accessor = BlockAccessor.for_block(chunk)
        block_metas.append(
            chunk_accessor.get_metadata(input_files=None, exec_stats=timer.build())
        )

    block_list = BlockList(block_refs, block_metas)
    dataset_stats = DatasetStats(stages={"from_items": block_metas}, parent=None)
    return Dataset(block_list, 0, dataset_stats)
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    """Convert ``ndarray`` to a block via ``BlockAccessor.batch_to_block``.

    Returns:
        A ``(block, metadata)`` pair; the metadata has no input files and
        carries the exec stats collected during the conversion.
    """
    timer = BlockExecStats.builder()
    converted = BlockAccessor.batch_to_block(ndarray)
    accessor = BlockAccessor.for_block(converted)
    return converted, accessor.get_metadata(
        input_files=None, exec_stats=timer.build()
    )
def _shuffle_map(
    block: Block,
    idx: int,
    output_num_blocks: int,
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    """Slice ``block`` into ``output_num_blocks`` pieces for a shuffle.

    Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].

    Args:
        block: The input block to slice.
        idx: Added to ``random_seed`` to derive this call's seed.
        output_num_blocks: Number of slices to produce.
        random_shuffle: Whether to shuffle the rows and the slice order.
        random_seed: Base seed, or None for an unseeded shuffle.
    """
    timer = BlockExecStats.builder()
    accessor = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = None if random_seed is None else random_seed + idx
        accessor = BlockAccessor.for_block(accessor.random_shuffle(seed_i))

    rows_per_slice = max(1, math.ceil(accessor.num_rows() / output_num_blocks))
    slices = [
        accessor.slice(n * rows_per_slice, (n + 1) * rows_per_slice, copy=True)
        for n in range(output_num_blocks)
    ]

    # Randomize the distribution order of the blocks (this matters when
    # some blocks are larger than others).
    if random_shuffle:
        np.random.RandomState(seed_i).shuffle(slices)

    # Sanity check: the slices together must cover every row exactly once.
    sliced_rows = sum(BlockAccessor.for_block(piece).num_rows() for piece in slices)
    assert sliced_rows == accessor.num_rows(), (sliced_rows, accessor.num_rows())

    block_meta = accessor.get_metadata(input_files=None, exec_stats=timer.build())
    return [block_meta] + slices
def _split_single_block(
    block_id: int,
    block: Block,
    meta: BlockMetadata,
    block_row: int,
    split_indices: List[int],
) -> Tuple[int, List[Tuple[ObjectRef[Block], BlockMetadata]]]:
    """Split the provided block at the given indices.

    Args:
        block_id: Returned unchanged as the first element of the result.
        block: The block to slice.
        meta: Metadata of ``block``; its ``schema`` and ``input_files`` are
            copied into the metadata of every split.
        block_row: Used as the end index of the final slice.
        split_indices: Row indices at which to cut the block. This list is
            left unmodified.

    Returns:
        ``(block_id, splits)`` where ``splits`` holds one
        ``(object_ref, metadata)`` pair per entry of ``split_indices`` plus
        one trailing pair for the slice ending at ``block_row``.
    """
    split_result = []
    block_accessor = BlockAccessor.for_block(block)
    prev_index = 0
    # Add one more boundary at the end so we don't need to handle the empty
    # edge case. Build a new list instead of appending to ``split_indices``
    # so the caller's list is not mutated as a side effect.
    boundaries = [*split_indices, block_row]
    for index in boundaries:
        logger.debug(f"slicing block {prev_index}:{index}")
        stats = BlockExecStats.builder()
        split_block = block_accessor.slice(prev_index, index, copy=True)
        accessor = BlockAccessor.for_block(split_block)
        split_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=meta.schema,
            input_files=meta.input_files,
            exec_stats=stats.build(),
        )
        split_result.append((ray.put(split_block), split_meta))
        prev_index = index
    return (block_id, split_result)
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    """Wrap ``ndarray`` in a single-column Arrow table named ``"value"``.

    Returns:
        A ``(table, metadata)`` pair; the metadata has no input files and
        carries the exec stats collected while building the table.
    """
    timer = BlockExecStats.builder()

    # Imported lazily, inside the function, after the stats builder starts.
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    arrow_table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    accessor = BlockAccessor.for_block(arrow_table)
    table_meta = accessor.get_metadata(input_files=None, exec_stats=timer.build())
    return (arrow_table, table_meta)
def _sort_block(block, boundaries, key, descending):
    """Sort and partition ``block``, returning the partitions plus metadata.

    Args:
        block: The block to sort.
        boundaries: Forwarded to ``sort_and_partition``.
        key: Forwarded to ``sort_and_partition``.
        descending: Forwarded to ``sort_and_partition``.

    Returns:
        The list produced by ``sort_and_partition`` with the metadata of the
        *input* block appended as the last element.
    """
    timer = BlockExecStats.builder()
    partitions = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending
    )
    input_accessor = BlockAccessor.for_block(block)
    input_meta = input_accessor.get_metadata(
        input_files=None, exec_stats=timer.build()
    )
    return partitions + [input_meta]
def merge_sorted_blocks(
    blocks: List[Block[T]], key: "SortKeyT", descending: bool
) -> Tuple[Block[T], BlockMetadata]:
    """Concatenate list-based blocks and sort the combined rows.

    Args:
        blocks: Blocks whose rows are flattened into one list.
        key: Passed as ``key`` to ``list.sort``.
        descending: Passed as ``reverse`` to ``list.sort``.

    Returns:
        The sorted list and its metadata.
    """
    timer = BlockExecStats.builder()
    merged = []
    for chunk in blocks:
        merged.extend(chunk)
    merged.sort(key=key, reverse=descending)
    accessor = SimpleBlockAccessor(merged)
    return merged, accessor.get_metadata(None, exec_stats=timer.build())
def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
    """Convert a pandas DataFrame to an Arrow table block via ``pa.table``.

    Returns:
        A ``(table, metadata)`` pair; the metadata has no input files and
        carries the exec stats collected during the conversion.
    """
    timer = BlockExecStats.builder()

    # Imported lazily, inside the function, after the stats builder starts.
    import pyarrow as pa

    arrow_table = pa.table(df)
    accessor = BlockAccessor.for_block(arrow_table)
    table_meta = accessor.get_metadata(input_files=None, exec_stats=timer.build())
    return (arrow_table, table_meta)
def _map_block_nosplit(
    block: Block, fn: Any, input_files: List[str]
) -> Tuple[Block, BlockMetadata]:
    """Apply ``fn`` to ``block`` and merge everything it yields into one block.

    Args:
        block: The input block handed to ``fn``.
        fn: Callable whose result is iterated; each item is added to a
            ``DelegatingBlockBuilder``.
        input_files: Recorded verbatim in the output metadata.

    Returns:
        The combined block and its metadata.
    """
    timer = BlockExecStats.builder()
    combined = DelegatingBlockBuilder()
    for piece in fn(block):
        combined.add_block(piece)
    result = combined.build()
    result_accessor = BlockAccessor.for_block(result)
    result_meta = result_accessor.get_metadata(
        input_files=input_files, exec_stats=timer.build()
    )
    return result, result_meta
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    The requested parallelism is first resolved with
    ``_autodetect_parallelism``; the items are then cut into consecutive
    chunks of ``max(1, len(items) // detected_parallelism)`` elements, each
    of which becomes one block stored with ``ray.put``.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    chunk_len = max(1, len(items) // detected_parallelism)

    block_refs: List[ObjectRef[Block]] = []
    block_metas: List[BlockMetadata] = []
    for start in range(0, len(items), chunk_len):
        timer = BlockExecStats.builder()
        chunk_builder = DelegatingBlockBuilder()
        for element in items[start:start + chunk_len]:
            chunk_builder.add(element)
        chunk = chunk_builder.build()
        block_refs.append(ray.put(chunk))
        chunk_accessor = BlockAccessor.for_block(chunk)
        block_metas.append(
            chunk_accessor.get_metadata(input_files=None, exec_stats=timer.build())
        )

    block_list = BlockList(block_refs, block_metas)
    dataset_stats = DatasetStats(stages={"from_items": block_metas}, parent=None)
    plan = ExecutionPlan(block_list, dataset_stats)
    return Dataset(plan, 0, False)
def merge_sorted_blocks(
    blocks: List[Block[T]], key: "SortKeyT", _descending: bool
) -> Tuple[Block[T], BlockMetadata]:
    """Concatenate the non-empty Arrow blocks and sort the result by ``key``.

    Args:
        blocks: Tables to merge; those with ``num_rows == 0`` are dropped.
        key: Passed as ``sort_keys`` to ``pyarrow.compute.sort_indices``.
        _descending: Not read by this function.

    Returns:
        The sorted table (an empty table when no non-empty block remains)
        and its metadata.
    """
    timer = BlockExecStats.builder()
    non_empty = [tbl for tbl in blocks if tbl.num_rows > 0]
    if non_empty:
        combined = pyarrow.concat_tables(non_empty, promote=True)
        order = pyarrow.compute.sort_indices(combined, sort_keys=key)
        merged = ArrowBlockAccessor.take_table(combined, order)
    else:
        merged = ArrowBlockAccessor._empty_table()
    accessor = ArrowBlockAccessor(merged)
    return merged, accessor.get_metadata(None, exec_stats=timer.build())
def merge_sorted_blocks(
    blocks: List[Block[T]], key: "SortKeyT", _descending: bool
) -> Tuple[Block[T], BlockMetadata]:
    """Concatenate and sort the non-empty Arrow blocks.

    The concat-and-sort implementation is obtained from
    ``get_concat_and_sort_transform`` for the current ``DatasetContext``.

    Args:
        blocks: Tables to merge; those with ``num_rows == 0`` are dropped.
        key: Forwarded to the concat-and-sort transform.
        _descending: Forwarded to the concat-and-sort transform.

    Returns:
        The merged table (an empty table when no non-empty block remains)
        and its metadata.
    """
    timer = BlockExecStats.builder()
    non_empty = [tbl for tbl in blocks if tbl.num_rows > 0]
    if non_empty:
        transform = get_concat_and_sort_transform(DatasetContext.get_current())
        merged = transform(non_empty, key, _descending)
    else:
        merged = ArrowBlockAccessor._empty_table()
    accessor = ArrowBlockAccessor(merged)
    return merged, accessor.get_metadata(None, exec_stats=timer.build())
def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    boundaries: List[T],
    key: SortKeyT,
    descending: bool,
) -> List[Union[BlockMetadata, Block]]:
    """Sort and partition ``block`` for the map side of a sort.

    Args:
        idx: Not read by this function.
        block: The block to sort and partition.
        output_num_blocks: Not read by this function.
        boundaries: Forwarded to ``sort_and_partition``.
        key: Forwarded to ``sort_and_partition``.
        descending: Forwarded to ``sort_and_partition``.

    Returns:
        A list whose first element is the metadata of the *input* block,
        followed by the partitions from ``sort_and_partition``.
    """
    timer = BlockExecStats.builder()
    partitions = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending
    )
    input_accessor = BlockAccessor.for_block(block)
    input_meta = input_accessor.get_metadata(
        input_files=None, exec_stats=timer.build()
    )
    return [input_meta] + partitions
def _partition_and_combine_block(
    block: Block[T], boundaries: List[KeyType], key: KeyFn, aggs: Tuple[AggregateFn]
) -> List[Union[Block, BlockMetadata]]:
    """Partition the block and combine rows with the same key.

    Args:
        block: The block to partition.
        boundaries: Forwarded to ``sort_and_partition`` when ``key`` is set.
        key: Grouping key. None means the whole block is one partition; a
            string is wrapped as ``[(key, "ascending")]`` for sorting.
        aggs: Forwarded to each partition's ``combine``.

    Returns:
        The combined partitions followed by the metadata of the input block.
    """
    timer = BlockExecStats.builder()
    if key is not None:
        sort_key = [(key, "ascending")] if isinstance(key, str) else key
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries, sort_key, descending=False
        )
    else:
        partitions = [block]

    combined = []
    for partition in partitions:
        combined.append(BlockAccessor.for_block(partition).combine(key, aggs))

    input_meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=timer.build()
    )
    return combined + [input_meta]
def _shuffle_reduce(*mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    """Concatenate the mapper output blocks into a single block.

    Args:
        *mapper_outputs: Blocks added, in order, to a
            ``DelegatingBlockBuilder``.

    Returns:
        The combined block and its metadata (no input files).
    """
    timer = BlockExecStats.builder()
    combined = DelegatingBlockBuilder()
    for mapper_block in mapper_outputs:
        combined.add_block(mapper_block)
    out_block = combined.build()
    out_accessor = BlockAccessor.for_block(out_block)
    out_meta = BlockMetadata(
        num_rows=out_accessor.num_rows(),
        size_bytes=out_accessor.size_bytes(),
        schema=out_accessor.schema(),
        input_files=None,
        exec_stats=timer.build(),
    )
    return out_block, out_meta
def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
    """Execute one read task and report its metadata to the stats actor.

    ``context``, ``stats_actor`` and ``stats_uuid`` are not parameters of
    this function; they are looked up in the enclosing scope.

    Args:
        i: Index passed along to ``stats_actor.record_task``.
        task: The read task; calling it produces the result.

    Returns:
        Whatever ``task()`` returned.
    """
    DatasetContext._set_current(context)
    timer = BlockExecStats.builder()

    # Execute the read task.
    result = task()

    if not context.block_splitting_enabled:
        # Derive the metadata from the block that was actually read.
        task_meta = BlockAccessor.for_block(result).get_metadata(
            input_files=task.get_metadata().input_files,
            exec_stats=timer.build(),
        )
    else:
        # Keep the task's own metadata and only attach the exec stats.
        task_meta = task.get_metadata()
        task_meta.exec_stats = timer.build()

    stats_actor.record_task.remote(stats_uuid, i, task_meta)
    return result
def _map_block_nosplit(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> Tuple[Block, BlockMetadata]:
    """Run ``block_fn`` over ``block`` and merge its outputs into one block.

    Args:
        block: The input block handed to ``block_fn``.
        block_fn: Callable invoked as ``block_fn(block, *args, **fn_kwargs)``;
            each item it yields is added to a ``DelegatingBlockBuilder``.
        input_files: Recorded verbatim in the output metadata.
        fn: Optional UDF. When not None it is prepended to ``fn_args``.
        *fn_args: Extra positional arguments forwarded to ``block_fn``.
        **fn_kwargs: Extra keyword arguments forwarded to ``block_fn``.

    Returns:
        The combined block and its metadata.
    """
    timer = BlockExecStats.builder()
    combined = DelegatingBlockBuilder()
    # The UDF, when supplied, travels as the first positional argument.
    call_args = fn_args if fn is None else (fn,) + fn_args
    for piece in block_fn(block, *call_args, **fn_kwargs):
        combined.add_block(piece)
    result = combined.build()
    result_accessor = BlockAccessor.for_block(result)
    result_meta = result_accessor.get_metadata(
        input_files=input_files, exec_stats=timer.build()
    )
    return result, result_meta
def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    block_udf: Optional[Callable[[Block], Iterable[Block]]],
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    """Optionally transform ``block``, then slice it for a shuffle.

    Args:
        idx: Added to ``random_seed`` to derive this call's seed.
        block: The input block.
        output_num_blocks: Number of slices to produce.
        block_udf: Optional transform applied first; when it yields more
            than one block they are concatenated back into a single block.
        random_shuffle: Whether to shuffle the rows and the slice order.
        random_seed: Base seed, or None for an unseeded shuffle.

    Returns:
        A list whose first element is the block metadata, followed by the
        ``output_num_blocks`` slices.
    """
    timer = BlockExecStats.builder()
    if block_udf:
        # TODO(ekl) note that this effectively disables block splitting.
        udf_outputs = list(block_udf(block))
        if len(udf_outputs) <= 1:
            block = udf_outputs[0]
        else:
            concat = BlockAccessor.for_block(udf_outputs[0]).builder()
            for udf_block in udf_outputs:
                concat.add_block(udf_block)
            block = concat.build()
    accessor = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = None if random_seed is None else random_seed + idx
        accessor = BlockAccessor.for_block(accessor.random_shuffle(seed_i))

    rows_per_slice = max(1, math.ceil(accessor.num_rows() / output_num_blocks))
    slices = [
        accessor.slice(n * rows_per_slice, (n + 1) * rows_per_slice, copy=True)
        for n in range(output_num_blocks)
    ]

    # Randomize the distribution order of the blocks (this prevents empty
    # outputs when input blocks are very small).
    if random_shuffle:
        np.random.RandomState(seed_i).shuffle(slices)

    # Sanity check: the slices together must cover every row exactly once.
    sliced_rows = sum(BlockAccessor.for_block(piece).num_rows() for piece in slices)
    assert sliced_rows == accessor.num_rows(), (sliced_rows, accessor.num_rows())

    block_meta = accessor.get_metadata(input_files=None, exec_stats=timer.build())
    return [block_meta] + slices
def reduce(
    random_shuffle: bool, random_seed: Optional[int], *mapper_outputs: List[Block]
) -> (Block, BlockMetadata):
    """Concatenate mapper outputs into one block, optionally shuffling it.

    Args:
        random_shuffle: Whether to shuffle the rows of the combined block.
        random_seed: Seed handed to ``random_shuffle`` (may be None).
        *mapper_outputs: Blocks added, in order, to a
            ``DelegatingBlockBuilder``.

    Returns:
        The resulting block and its metadata (no input files).
    """
    timer = BlockExecStats.builder()
    combined = DelegatingBlockBuilder()
    for mapper_block in mapper_outputs:
        combined.add_block(mapper_block)
    out_block = combined.build()
    out_accessor = BlockAccessor.for_block(out_block)
    if random_shuffle:
        out_block = out_accessor.random_shuffle(random_seed)
        out_accessor = BlockAccessor.for_block(out_block)
    out_meta = BlockMetadata(
        num_rows=out_accessor.num_rows(),
        size_bytes=out_accessor.size_bytes(),
        schema=out_accessor.schema(),
        input_files=None,
        exec_stats=timer.build(),
    )
    return out_block, out_meta
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    """Execute one read task and report its metadata to the stats actor.

    Args:
        i: Index passed along to ``stats_actor.record_task``.
        task: The read task; calling it produces the result.
        context: Installed as the current ``DatasetContext`` before the read.
        stats_uuid: Identifier passed along to ``stats_actor.record_task``.
        stats_actor: Actor handle whose ``record_task`` receives the metadata.

    Returns:
        ``(result, metadata)`` where ``result`` is what ``task()`` returned.
    """
    DatasetContext._set_current(context)
    timer = BlockExecStats.builder()

    # Execute the task.
    result = task()

    task_meta = task.get_metadata()
    if not context.block_splitting_enabled:
        # Derive the metadata from the block that was actually read,
        # keeping only the input files from the task's own metadata.
        task_meta = BlockAccessor.for_block(result).get_metadata(
            input_files=task_meta.input_files, exec_stats=timer.build()
        )
    else:
        task_meta.exec_stats = timer.build()

    stats_actor.record_task.remote(stats_uuid, i, task_meta)
    return result, task_meta
def _merge(
    reduce_fn,
    *all_mapper_outputs: List[List[Block]],
    reduce_args: Optional[List[Any]] = None,
) -> List[Union[BlockMetadata, Block]]:
    """Merge the i-th output of every mapper into one block, for each i.

    Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].

    The single metadata entry summarizes *all* merged blocks: ``num_rows``
    and ``size_bytes`` are summed over the outputs and ``schema`` is taken
    from the last one (None when there are no outputs).

    Args:
        reduce_fn: Called as ``reduce_fn(*reduce_args, *mapper_outputs)`` and
            expected to return a ``(block, metadata)`` pair; only the block
            is used.
        *all_mapper_outputs: One sequence of blocks per mapper. All sequences
            must have the same length.
        reduce_args: Optional leading positional arguments for ``reduce_fn``.

    Raises:
        AssertionError: If the mappers produced differing numbers of blocks
            (or no mapper outputs were given at all).
    """
    assert (
        len({len(mapper_outputs) for mapper_outputs in all_mapper_outputs}) == 1
    ), "Received different number of map inputs"
    stats = BlockExecStats.builder()
    if not reduce_args:
        reduce_args = []

    merged_outputs = []
    # Accumulate the metadata across every merged block rather than reading
    # it off the last one only. This also keeps the function well-defined
    # when each mapper produced zero blocks and the loop never runs.
    num_rows = 0
    size_bytes = 0
    schema = None
    for mapper_outputs in zip(*all_mapper_outputs):
        block, _ = reduce_fn(*reduce_args, *mapper_outputs)
        merged_outputs.append(block)
        accessor = BlockAccessor.for_block(block)
        num_rows += accessor.num_rows()
        size_bytes += accessor.size_bytes()
        schema = accessor.schema()

    meta = BlockMetadata(
        num_rows=num_rows,
        size_bytes=size_bytes,
        schema=schema,
        input_files=None,
        exec_stats=stats.build(),
    )
    return [meta] + merged_outputs
def _merge(
    reduce_fn,
    *all_mapper_outputs: List[List[Block]],
    reduce_args: Optional[List[Any]] = None,
) -> List[Union[BlockMetadata, Block]]:
    """Merge the i-th output of every mapper into one block, for each i.

    This is a generator: it yields each merged block as soon as it is
    produced and, after the last block, yields one ``BlockMetadata`` whose
    ``num_rows`` and ``size_bytes`` are summed over all merged blocks and
    whose ``schema`` comes from the last one (None if there were none).

    Args:
        reduce_fn: Called as
            ``reduce_fn(*reduce_args, *mapper_outputs, partial_reduce=True)``
            and expected to return a ``(block, metadata)`` pair; only the
            block is used.
        *all_mapper_outputs: One sequence of blocks per mapper. All sequences
            must have the same length.
        reduce_args: Optional leading positional arguments for ``reduce_fn``.
    """
    output_counts = {len(mapper_outputs) for mapper_outputs in all_mapper_outputs}
    assert len(output_counts) == 1, "Received different number of map inputs"

    timer = BlockExecStats.builder()
    if not reduce_args:
        reduce_args = []

    total_rows = 0
    total_bytes = 0
    last_schema = None
    for mapper_outputs in zip(*all_mapper_outputs):
        merged_block, _ = reduce_fn(
            *reduce_args, *mapper_outputs, partial_reduce=True
        )
        yield merged_block
        merged_accessor = BlockAccessor.for_block(merged_block)
        total_rows += merged_accessor.num_rows()
        total_bytes += merged_accessor.size_bytes()
        last_schema = merged_accessor.schema()
        # Drop our references to the block before merging the next one.
        del merged_block, merged_accessor

    yield BlockMetadata(
        num_rows=total_rows,
        size_bytes=total_bytes,
        schema=last_schema,
        input_files=None,
        exec_stats=timer.build(),
    )
def _get_metadata(table: "pyarrow.Table") -> BlockMetadata:
    """Return the block metadata of ``table`` with no input files attached."""
    timer = BlockExecStats.builder()
    accessor = BlockAccessor.for_block(table)
    table_meta = accessor.get_metadata(input_files=None, exec_stats=timer.build())
    return table_meta
def _get_metadata(
    table: Union["pyarrow.Table", "pandas.DataFrame"]
) -> BlockMetadata:
    """Return the block metadata of ``table`` with no input files attached."""
    timer = BlockExecStats.builder()
    accessor = BlockAccessor.for_block(table)
    table_meta = accessor.get_metadata(input_files=None, exec_stats=timer.build())
    return table_meta
def do_zip(block1: Block, block2: Block) -> (Block, BlockMetadata):
    """Zip ``block2`` onto ``block1`` using the accessor's ``zip`` method.

    Returns:
        The zipped block and its metadata, with an empty input-file list.
    """
    timer = BlockExecStats.builder()
    zipped = BlockAccessor.for_block(block1).zip(block2)
    zipped_accessor = BlockAccessor.for_block(zipped)
    zipped_meta = zipped_accessor.get_metadata(
        input_files=[], exec_stats=timer.build()
    )
    return zipped, zipped_meta
def aggregate_combined_blocks(
    blocks: List[Block[ArrowRow]],
    key: KeyFn,
    aggs: Tuple[AggregateFn],
    finalize: bool,
) -> Tuple[Block[ArrowRow], BlockMetadata]:
    """Aggregate sorted, partially combined blocks with the same key range.

    This assumes blocks are already sorted by key in ascending order,
    so we can do merge sort to get all the rows with the same key.

    Args:
        blocks: A list of partially combined and sorted blocks.
        key: The column name of key or None for global aggregation.
        aggs: The aggregations to do.
        finalize: Whether to finalize the aggregation. This is used as an
            optimization for cases where we repeatedly combine partially
            aggregated groups.

    Returns:
        A block of [k, v_1, ..., v_n] columns and its metadata where k is
        the groupby key and v_i is the corresponding aggregation result for
        the ith given aggregation.
        If key is None then the k column is omitted.
    """
    stats = BlockExecStats.builder()
    # With a key, rows are grouped by the value of their first column;
    # without one, every row maps to the same constant key (0).
    key_fn = (
        (lambda r: r[r._row.schema.names[0]]) if key is not None else (lambda r: 0)
    )

    # Lazily merge the per-block row iterators, ordered by key.
    iter = heapq.merge(
        *[ArrowBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
    )
    # Look-ahead row: the first row of the next group, or None when a new
    # row must be pulled from the merged iterator.
    next_row = None
    builder = ArrowBlockBuilder()
    while True:
        try:
            if next_row is None:
                # Raises StopIteration once the merged iterator is
                # exhausted, which ends the outer loop below.
                next_row = next(iter)
            next_key = key_fn(next_row)
            next_key_name = (
                next_row._row.schema.names[0] if key is not None else None
            )

            def gen():
                # Yield consecutive rows sharing next_key. The row that
                # breaks the run is left in next_row for the next group;
                # next_row becomes None if the input runs out.
                nonlocal iter
                nonlocal next_row
                while key_fn(next_row) == next_key:
                    yield next_row
                    try:
                        next_row = next(iter)
                    except StopIteration:
                        next_row = None
                        break

            # Merge.
            first = True
            accumulators = [None] * len(aggs)
            resolved_agg_names = [None] * len(aggs)
            for r in gen():
                if first:
                    # The first row of the group seeds the accumulators and
                    # fixes the column name used for each aggregation.
                    count = collections.defaultdict(int)
                    for i in range(len(aggs)):
                        name = aggs[i].name
                        # Check for conflicts with existing aggregation
                        # name.
                        if count[name] > 0:
                            name = ArrowBlockAccessor._munge_conflict(
                                name, count[name]
                            )
                        count[name] += 1
                        resolved_agg_names[i] = name
                        accumulators[i] = r[name]
                    first = False
                else:
                    # Later rows are folded into the running accumulators.
                    for i in range(len(aggs)):
                        accumulators[i] = aggs[i].merge(
                            accumulators[i], r[resolved_agg_names[i]]
                        )
            # Build the row.
            row = {}
            if key is not None:
                row[next_key_name] = next_key

            for agg, agg_name, accumulator in zip(
                aggs, resolved_agg_names, accumulators
            ):
                if finalize:
                    row[agg_name] = agg.finalize(accumulator)
                else:
                    # Keep the raw accumulator so the result can be combined
                    # again later.
                    row[agg_name] = accumulator

            builder.add(row)
        except StopIteration:
            break

    ret = builder.build()
    return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
def aggregate_combined_blocks(
    blocks: List[Block[Tuple[KeyType, AggType]]],
    key: KeyFn,
    aggs: Tuple[AggregateFn],
) -> Tuple[Block[Tuple[KeyType, U]], BlockMetadata]:
    """Aggregate sorted, partially combined blocks with the same key range.

    This assumes blocks are already sorted by key in ascending order,
    so we can do merge sort to get all the rows with the same key.

    Args:
        blocks: A list of partially combined and sorted blocks.
        key: The key function that returns the key from the row
            or None for global aggregation.
        aggs: The aggregations to do.

    Returns:
        A block of (k, v_1, ..., v_n) tuples and its metadata where k is
        the groupby key and v_i is the corresponding aggregation result for
        the ith given aggregation.
        If key is None then the k element of tuple is omitted.
    """
    stats = BlockExecStats.builder()
    # With a key, rows are grouped by their first element; without one,
    # every row maps to the same constant key (0).
    # NOTE(review): this tests the truthiness of `key`, while the output
    # step below tests `key is None` -- confirm `key` is never a falsy
    # non-None value.
    key_fn = (lambda r: r[0]) if key else (lambda r: 0)

    # Lazily merge the per-block row iterators, ordered by key.
    iter = heapq.merge(
        *[SimpleBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
    )
    # Look-ahead row: the first row of the next group, or None when a new
    # row must be pulled from the merged iterator.
    next_row = None
    ret = []
    while True:
        try:
            if next_row is None:
                # Raises StopIteration once the merged iterator is
                # exhausted, which ends the outer loop below.
                next_row = next(iter)
            next_key = key_fn(next_row)

            def gen():
                # Yield consecutive rows sharing next_key. The row that
                # breaks the run is left in next_row for the next group;
                # next_row becomes None if the input runs out.
                nonlocal iter
                nonlocal next_row
                while key_fn(next_row) == next_key:
                    yield next_row
                    try:
                        next_row = next(iter)
                    except StopIteration:
                        next_row = None
                        break

            first = True
            accumulators = [None] * len(aggs)
            for r in gen():
                if first:
                    # The first row of the group seeds the accumulators.
                    # With a key, the values start at index 1 because the
                    # key occupies index 0.
                    for i in range(len(aggs)):
                        accumulators[i] = r[i + 1] if key else r[i]
                    first = False
                else:
                    # Later rows are folded into the running accumulators.
                    for i in range(len(aggs)):
                        accumulators[i] = aggs[i].merge(
                            accumulators[i], r[i + 1] if key else r[i]
                        )
            # Emit one finalized tuple per group, prefixed with the key
            # unless this is a global aggregation.
            if key is None:
                ret.append(
                    tuple(
                        agg.finalize(accumulator)
                        for agg, accumulator in zip(aggs, accumulators)
                    )
                )
            else:
                ret.append(
                    (next_key,)
                    + tuple(
                        agg.finalize(accumulator)
                        for agg, accumulator in zip(aggs, accumulators)
                    )
                )
        except StopIteration:
            break

    return ret, SimpleBlockAccessor(ret).get_metadata(
        None, exec_stats=stats.build()
    )