コード例 #1
0
def sort_impl(blocks: BlockList,
              key: SortKeyT,
              descending: bool = False) -> BlockList:
    blocks = list(blocks.iter_blocks())
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers)
    merge_sorted_blocks = cached_remote_fn(_merge_sorted_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries, key,
                                              descending)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(key, descending,
                                         *map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)
コード例 #2
0
ファイル: shuffle.py プロジェクト: mvindiola1/ray
def simple_shuffle(input_blocks: BlockList,
                   output_num_blocks: int,
                   *,
                   random_shuffle: bool = False,
                   random_seed: Optional[int] = None,
                   map_ray_remote_args: Optional[Dict[str, Any]] = None,
                   reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
                   _spread_resource_prefix: Optional[str] = None) -> BlockList:
    input_blocks = list(input_blocks.iter_blocks())
    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    input_num_blocks = len(input_blocks)
    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        map_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, map_ray_remote_args)
        reduce_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, reduce_ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        map_resource_iter, reduce_resource_iter = itertools.tee(
            itertools.repeat({}), 2)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(**map_ray_remote_args,
                            num_returns=output_num_blocks,
                            resources=next(map_resource_iter)).remote(
                                block, i, output_num_blocks, random_shuffle,
                                random_seed)
        for i, block in enumerate(input_blocks)
    ]
    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce",
                             position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(**reduce_ray_remote_args,
                               num_returns=2,
                               resources=next(reduce_resource_iter)).
        remote(*[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))