Example #1
        def do_agg(blocks, clear_input_blocks: bool, block_udf):
            # TODO: implement clear_input_blocks
            stage_info = {}
            if len(aggs) == 0:
                raise ValueError("Aggregate requires at least one aggregation")
            for agg in aggs:
                agg._validate(self._dataset)
            # Handle empty dataset.
            if blocks.initial_num_blocks() == 0:
                return blocks, stage_info

            num_mappers = blocks.initial_num_blocks()
            num_reducers = num_mappers
            if self._key is None:
                num_reducers = 1
                boundaries = []
            else:
                boundaries = sort.sample_boundaries(
                    blocks.get_blocks(),
                    [(self._key, "ascending")]
                    if isinstance(self._key, str) else self._key,
                    num_reducers,
                )

            partition_and_combine_block = cached_remote_fn(
                _partition_and_combine_block).options(
                    num_returns=num_reducers + 1)
            aggregate_combined_blocks = cached_remote_fn(
                _aggregate_combined_blocks, num_returns=2)

            map_results = np.empty((num_mappers, num_reducers), dtype=object)
            map_meta = []
            for i, block in enumerate(blocks.get_blocks()):
                results = partition_and_combine_block.remote(
                    block, boundaries, self._key, aggs)
                map_results[i, :] = results[:-1]
                map_meta.append(results[-1])
            map_bar = ProgressBar("GroupBy Map", len(map_results))
            map_bar.block_until_complete(map_meta)
            stage_info["map"] = ray.get(map_meta)
            map_bar.close()

            blocks = []
            metadata = []
            for j in range(num_reducers):
                block, meta = aggregate_combined_blocks.remote(
                    num_reducers, self._key, aggs, *map_results[:, j].tolist())
                blocks.append(block)
                metadata.append(meta)
            reduce_bar = ProgressBar("GroupBy Reduce", len(blocks))
            reduce_bar.block_until_complete(blocks)
            reduce_bar.close()

            metadata = ray.get(metadata)
            stage_info["reduce"] = metadata
            return BlockList(blocks, metadata), stage_info
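Every example on this page follows the same pattern: submit Ray tasks, hand the resulting ObjectRefs to a ProgressBar, then either block_until_complete (wait only, fetching results separately with ray.get) or fetch_until_complete (wait and return the fetched values), and finally close the bar. A minimal self-contained sketch of that pattern; the import paths are an assumption, since these internal helpers have moved between Ray versions:

import ray
# Assumed module paths; older Ray releases expose these under ray.data.impl.*.
from ray.data._internal.progress_bar import ProgressBar
from ray.data._internal.remote_fn import cached_remote_fn


def _square(x: int) -> int:
    return x * x


def run(n: int = 100):
    ray.init(ignore_reinit_error=True)
    square = cached_remote_fn(_square)
    refs = [square.remote(i) for i in range(n)]
    bar = ProgressBar("Square", total=len(refs))
    # fetch_until_complete() waits for the refs, updating the bar as they
    # finish, and returns ray.get() of them; block_until_complete() only waits.
    results = bar.fetch_until_complete(refs)
    bar.close()
    return results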
Example #2
def sort_impl(blocks: BlockList,
              key: SortKeyT,
              descending: bool = False) -> Tuple[BlockList, dict]:
    stage_info = {}
    blocks = blocks.get_blocks()
    if len(blocks) == 0:
        return BlockList([], []), stage_info

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers + 1)
    merge_sorted_blocks = cached_remote_fn(_merge_sorted_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    map_meta = []
    for i, block in enumerate(blocks):
        result = sort_block.remote(block, boundaries, key, descending)
        map_results[i, :] = result[:-1]
        map_meta.append(result[-1])

    # Early release memory.
    del blocks

    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete(map_meta)
    map_bar.close()
    stage_info["map"] = ray.get(map_meta)

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(key, descending,
                                         *map_results[:, j].tolist())
        reduce_results.append(ret)

    # Early release memory.
    del map_results

    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    stage_info["merge"] = metadata
    return BlockList(blocks, metadata), stage_info
Example #3
    def __init__(self, pipeline: "DatasetPipeline[T]"):
        self._pipeline: "DatasetPipeline[T]" = pipeline
        self._stages: List[ObjectRef[Dataset[Any]]] = [None] * (
            len(self._pipeline._optimized_stages) + 1)
        # Pin the child actors to the local node so that they fate-share with
        # the PipelineExecutor during a node failure.
        local_node_resource = "node:{}".format(ray.util.get_node_ip_address())
        self._stage_runners = [
            _StageRunner.options(resources={
                local_node_resource: 0.0001
            },
                                 placement_group=None).remote()
            for _ in self._stages
        ]
        self._iter = iter(self._pipeline._base_iterable)
        self._stages[0] = self._stage_runners[0].run.remote(
            next(self._iter), DatasetContext.get_current())

        if self._pipeline._length and self._pipeline._length != float("inf"):
            length = self._pipeline._length
        else:
            length = 1

        if self._pipeline._progress_bars:
            self._bars = [
                ProgressBar("Stage {}".format(i), length, position=i)
                for i in range(len(self._stages))
            ]
        else:
            self._bars = None
Example #4
def test_chaos_actor_retry(ray_start_chaos_cluster, log_pubsub):
    # p = log_pubsub

    # Chaos testing.
    @ray.remote(num_cpus=1, max_restarts=-1, max_task_retries=-1)
    class Actor:
        def __init__(self):
            self.letter_dict = set()

        def add(self, letter):
            self.letter_dict.add(letter)

        def get(self):
            return self.letter_dict

    NUM_CPUS = 32
    TOTAL_TASKS = 300

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS * NUM_CPUS)
    actors = [Actor.remote() for _ in range(NUM_CPUS)]
    results = []
    for a in actors:
        results.extend([a.add.remote(str(i)) for i in range(TOTAL_TASKS)])
    start = time.time()
    pb.fetch_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
Example #5
def test_chaos_task_retry(ray_start_chaos_cluster):
    # Chaos testing.
    @ray.remote(max_retries=-1)
    def task():
        def generate_data(size_in_kb=10):
            return np.zeros(1024 * size_in_kb, dtype=np.uint8)

        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return generate_data(size_in_kb=50)

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    # 50MB of return values.
    TOTAL_TASKS = 300

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    start = time.time()
    pb.block_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
Example #6
def run_task_workload(total_num_cpus, smoke):
    """Run task-based workload that doesn't require object reconstruction.
    """
    @ray.remote(num_cpus=1, max_retries=-1)
    def task():
        def generate_data(size_in_kb=10):
            return np.zeros(1024 * size_in_kb, dtype=np.uint8)

        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return generate_data(size_in_kb=50)

    @ray.remote(num_cpus=1, max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    multiplier = 75
    # For smoke mode, run a smaller number of tasks.
    if smoke:
        multiplier = 1
    TOTAL_TASKS = int(total_num_cpus * 2 * multiplier)

    pb = ProgressBar("Chaos test", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    pb.block_until_complete(results)
    pb.close()

    # Consistency check.
    wait_for_condition(lambda: (ray.cluster_resources().get("CPU", 0) == ray.
                                available_resources().get("CPU", 0)),
                       timeout=60)
Example #7
File: sort.py Project: rlan/ray
def sample_boundaries(blocks: BlockList[T], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    sample_items = np.concatenate(samples)
    sample_items.sort()
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.arange(0, 1, 1 / num_reducers)
    ]
    return ret[1:]
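The quantile step at the end of sample_boundaries is what turns the pooled samples into range boundaries. A small standalone illustration of that step (the sample values are made up; newer NumPy spells the keyword method= where the example above uses the older interpolation=):

import numpy as np

sample_items = np.sort(np.array([4, 1, 7, 2, 9, 5, 8, 3, 6]))
num_reducers = 3
# Quantiles at 0, 1/3 and 2/3; dropping the leading 0-quantile leaves
# num_reducers - 1 boundaries that split the domain into 3 ranges.
boundaries = [
    np.quantile(sample_items, q, method="nearest")
    for q in np.arange(0, 1, 1 / num_reducers)
][1:]
print(boundaries)  # roughly the 1/3 and 2/3 points of the sorted samples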
Example #8
    def _apply(
        self,
        fn: Any,
        remote_args: dict,
        block_list: BlockList,
        clear_input_blocks: bool,
    ) -> BlockList:
        context = DatasetContext.get_current()

        # Handle empty datasets.
        if block_list.initial_num_blocks() == 0:
            return block_list

        blocks = block_list.get_blocks_with_metadata()
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        if context.block_splitting_enabled:
            map_block = cached_remote_fn(_map_block_split).options(**remote_args)
            refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
        else:
            map_block = cached_remote_fn(_map_block_nosplit).options(
                **dict(remote_args, num_returns=2)
            )
            all_refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
            data_refs = [r[0] for r in all_refs]
            refs = [r[1] for r in all_refs]

        # Release input block references.
        if clear_input_blocks:
            del blocks
            block_list.clear()

        # Common wait for non-data refs.
        try:
            results = map_bar.fetch_until_complete(refs)
        except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
            # One or more mapper tasks failed, or we received a SIGINT signal
            # while waiting; either way, we cancel all map tasks.
            for ref in refs:
                ray.cancel(ref)
            # Wait until all tasks have failed or been cancelled.
            for ref in refs:
                try:
                    ray.get(ref)
                except (ray.exceptions.RayTaskError, ray.exceptions.TaskCancelledError):
                    pass
            # Reraise the original task failure exception.
            raise e from None

        new_blocks, new_metadata = [], []
        if context.block_splitting_enabled:
            for result in results:
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block, metadata in zip(data_refs, results):
                new_blocks.append(block)
                new_metadata.append(metadata)
        return BlockList(list(new_blocks), list(new_metadata))
Example #9
def sort_impl(blocks: BlockList,
              key: SortKeyT,
              descending: bool = False) -> BlockList:
    blocks = list(blocks.iter_blocks())
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers)
    merge_sorted_blocks = cached_remote_fn(_merge_sorted_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries, key,
                                              descending)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(key, descending,
                                         *map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)
Example #10
def simple_shuffle(input_blocks: BlockList[T],
                   output_num_blocks: int,
                   *,
                   random_shuffle: bool = False,
                   random_seed: Optional[int] = None) -> BlockList[T]:
    input_num_blocks = len(input_blocks)

    shuffle_map = cached_remote_fn(_shuffle_map).options(
        num_returns=output_num_blocks)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce, num_returns=2)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.remote(block, i, output_num_blocks, random_shuffle,
                           random_seed) for i, block in enumerate(input_blocks)
    ]
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce",
                             position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))
Example #11
def main():
    """The test simulates the workload with many threaded actors.

    The test does 4 things for 1 hour.

    - It first creates as many actors as num_cpus, each with max_concurrency=10.
    - Each actor computes pi and puts the result into the queue.
    - The driver keeps getting results & metadata from the actors.
    - Every X seconds, it kills all actors and restarts them.
    """
    ray.init(address="auto")
    args, unknown = parse_script_args()
    num_cpus = ray.cluster_resources()["CPU"]
    num_nodes = sum(1 for n in ray.nodes() if n["Alive"])
    print(f"Total number of actors: {num_cpus}, nodes: {num_nodes}")
    monitor_actor = monitor_memory_usage()

    start = time.time()
    while time.time() - start < args.test_runtime:
        # Step 1: Create actors and start computation loop.
        print("Create actors.")
        actors = start_actors(num_cpus, num_nodes)

        # Step 2: Get the pi result from actors.
        compute_start = time.time()
        print("Start computation.")
        while time.time() - compute_start < args.kill_interval_s:
            # Get the metadata.
            ray.get([actor.get_metadata.remote() for actor in actors])
            # Get the result.
            pb = ProgressBar("Computing Pi", num_cpus)
            results = [actor.get_pi.remote() for actor in actors]
            pb.fetch_until_complete(results)
            pb.close()

        # Step 3: Kill actors.
        print("Kill all actors.")
        for actor in actors:
            ray.kill(actor)

    # Report the result.
    print("PASSED.")
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")
    # Report the result.
    ray.get(monitor_actor.stop_run.remote())

    result = {"success": 0}
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps(result))
Example #12
def _fetch_metadata_remotely(
        pieces: List[bytes]
) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]:
    remote_fetch_metadata = cached_remote_fn(
        _fetch_metadata_serialization_wrapper)
    metas = []
    parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100)
    meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism)
    for pieces_ in np.array_split(pieces, parallelism):
        if len(pieces_) == 0:
            continue
        metas.append(remote_fetch_metadata.remote(pieces_))
    metas = meta_fetch_bar.fetch_until_complete(metas)
    return list(itertools.chain.from_iterable(metas))
Example #13
    def __init__(self, pipeline: "DatasetPipeline[T]"):
        self._pipeline: "DatasetPipeline[T]" = pipeline
        self._stages: List[ObjectRef[
            Dataset[Any]]] = [None] * (len(self._pipeline._stages) + 1)
        self._iter = iter(self._pipeline._base_iterable)
        self._stages[0] = pipeline_stage.remote(next(self._iter))

        if self._pipeline._progress_bars:
            self._bars = [
                ProgressBar("Stage {}".format(i),
                            self._pipeline._length or 1,
                            position=i) for i in range(len(self._stages))
            ]
        else:
            self._bars = None
Example #14
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList[Any]) -> BlockList[Any]:
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        kwargs = remote_args.copy()
        kwargs["num_returns"] = 2

        map_block = cached_remote_fn(_map_block)
        refs = [
            map_block.options(**kwargs).remote(b, m, fn)
            for b, m in zip(blocks, blocks.get_metadata())
        ]
        new_blocks, new_metadata = zip(*refs)

        map_bar.block_until_complete(list(new_blocks))
        new_metadata = ray.get(list(new_metadata))
        return BlockList(list(new_blocks), list(new_metadata))
Example #15
    def _execute_reduce_stage(
        self,
        output_num_blocks: int,
        schedule: _PushBasedShuffleTaskSchedule,
        reduce_ray_remote_args: Dict[str, Any],
        all_merge_results: List[List[ObjectRef]],
    ):
        shuffle_reduce = cached_remote_fn(self.reduce)
        # Execute the final reduce stage.
        shuffle_reduce_out = []
        for reducer_idx in range(output_num_blocks):
            merge_idx = schedule.get_merge_idx_for_reducer_idx(reducer_idx)
            # Submit one partition of reduce tasks, one for each of the P
            # outputs produced by the corresponding merge task.
            # We also add the merge task arguments so that the reduce task
            # is colocated with its inputs.
            shuffle_reduce_out.append(
                shuffle_reduce.options(
                    **reduce_ray_remote_args,
                    **schedule.get_merge_task_options(merge_idx),
                    num_returns=2,
                ).remote(
                    *self._reduce_args,
                    *[
                        merge_results.pop(0)
                        for merge_results in all_merge_results[merge_idx]
                    ],
                )
            )
        for merge_idx, merge_results in enumerate(all_merge_results):
            assert all(len(merge_result) == 0 for merge_result in merge_results), (
                "Reduce stage did not process outputs from merge tasks at index: "
                f"{merge_idx}"
            )
        assert (
            len(shuffle_reduce_out) == output_num_blocks
        ), f"Expected {output_num_blocks} outputs, produced {len(shuffle_reduce_out)}"

        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        reduce_blocks, reduce_metadata = zip(*shuffle_reduce_out)
        reduce_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata))
        reduce_bar.close()

        return reduce_metadata, reduce_blocks
Example #16
def _fetch_metadata_remotely(
    pieces: List["pyarrow._dataset.ParquetFileFragment"],
) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]:
    from ray import cloudpickle

    remote_fetch_metadata = cached_remote_fn(_fetch_metadata_serialization_wrapper)
    metas = []
    parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100)
    meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism)
    try:
        _register_parquet_file_fragment_serialization()
        for pcs in np.array_split(pieces, parallelism):
            if len(pcs) == 0:
                continue
            metas.append(remote_fetch_metadata.remote(cloudpickle.dumps(pcs)))
    finally:
        _deregister_parquet_file_fragment_serialization()
    metas = meta_fetch_bar.fetch_until_complete(metas)
    return list(itertools.chain.from_iterable(metas))
Example #17
    def _get_blocks_with_metadata(
        self, ) -> Tuple[List[ObjectRef[Block]], List[BlockMetadata]]:
        """Get all underlying block futures and concrete metadata.

        This will block on the completion of the underlying read tasks and will fetch
        all block metadata outputted by those tasks.
        """
        context = DatasetContext.get_current()
        block_refs, meta_refs = [], []
        for block_ref, meta_ref in self._iter_block_partition_refs():
            block_refs.append(block_ref)
            meta_refs.append(meta_ref)
        if context.block_splitting_enabled:
            # If block splitting is enabled, fetch the partitions.
            parts = ray.get(block_refs)
            block_refs, metadata = [], []
            for part in parts:
                for block_ref, meta in part:
                    block_refs.append(block_ref)
                    metadata.append(meta)
            self._cached_metadata = metadata
            return block_refs, metadata
        if all(meta is not None for meta in self._cached_metadata):
            # Short-circuit on cached metadata.
            return block_refs, self._cached_metadata
        if not meta_refs:
            # Short-circuit on empty set of block partitions.
            assert not block_refs, block_refs
            return [], []
        read_progress_bar = ProgressBar("Read progress", total=len(meta_refs))
        # Fetch the metadata in bulk.
        # Handle duplicates (e.g. due to unioning the same dataset).
        unique_meta_refs = set(meta_refs)
        metadata = read_progress_bar.fetch_until_complete(
            list(unique_meta_refs))
        ref_to_data = {
            meta_ref: data
            for meta_ref, data in zip(unique_meta_refs, metadata)
        }
        metadata = [ref_to_data[meta_ref] for meta_ref in meta_refs]
        self._cached_metadata = metadata
        return block_refs, metadata
Example #18
    def __init__(self, pipeline: "DatasetPipeline[T]"):
        self._pipeline: "DatasetPipeline[T]" = pipeline
        self._stages: List[ObjectRef[
            Dataset[Any]]] = [None] * (len(self._pipeline._stages) + 1)
        self._iter = iter(self._pipeline._base_iterable)
        self._stages[0] = pipeline_stage.remote(next(self._iter),
                                                DatasetContext.get_current())

        if self._pipeline._length and self._pipeline._length != float("inf"):
            length = self._pipeline._length
        else:
            length = 1

        if self._pipeline._progress_bars:
            self._bars = [
                ProgressBar("Stage {}".format(i), length, position=i)
                for i in range(len(self._stages))
            ]
        else:
            self._bars = None
Example #19
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList) -> BlockList:
        # Handle empty datasets.
        if blocks.initial_num_blocks() == 0:
            return blocks

        blocks = list(blocks.iter_blocks_with_metadata())
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        map_block = cached_remote_fn(_map_block)
        refs = [
            map_block.options(**remote_args).remote(b, fn, m.input_files)
            for b, m in blocks
        ]

        try:
            results = map_bar.fetch_until_complete(refs)
        except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
            # One or more mapper tasks failed, or we received a SIGINT signal
            # while waiting; either way, we cancel all map tasks.
            for ref in refs:
                ray.cancel(ref)
            # Wait until all tasks have failed or been cancelled.
            for ref in refs:
                try:
                    ray.get(ref)
                except (ray.exceptions.RayTaskError,
                        ray.exceptions.TaskCancelledError):
                    pass
            # Reraise the original task failure exception.
            raise e from None

        new_blocks, new_metadata = [], []
        for result in results:
            for block, metadata in result:
                new_blocks.append(block)
                new_metadata.append(metadata)
        return BlockList(list(new_blocks), list(new_metadata))
Example #20
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingArrowBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]
Example #21
    def __init__(self, pipeline: "DatasetPipeline[T]"):
        self._pipeline: "DatasetPipeline[T]" = pipeline
        self._stages: List[concurrent.futures.Future[Dataset[Any]]] = [
            None
        ] * (len(self._pipeline._optimized_stages) + 1)
        self._iter = iter(self._pipeline._base_iterable)
        self._pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=len(self._stages))
        self._stages[0] = self._pool.submit(lambda n: pipeline_stage(n),
                                            next(self._iter))

        if self._pipeline._length and self._pipeline._length != float("inf"):
            length = self._pipeline._length
        else:
            length = 1

        if self._pipeline._progress_bars:
            self._bars = [
                ProgressBar("Stage {}".format(i), length, position=i)
                for i in range(len(self._stages))
            ]
        else:
            self._bars = None
Example #22
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList[Any]) -> BlockList[Any]:
        # Handle empty datasets.
        if len(blocks) == 0:
            return blocks

        map_bar = ProgressBar("Map Progress", total=len(blocks))

        kwargs = remote_args.copy()
        kwargs["num_returns"] = 2

        map_block = cached_remote_fn(_map_block)
        refs = [
            map_block.options(**kwargs).remote(b, m, fn)
            for b, m in zip(blocks, blocks.get_metadata())
        ]
        new_blocks, new_metadata = zip(*refs)

        new_metadata = list(new_metadata)
        try:
            new_metadata = map_bar.fetch_until_complete(new_metadata)
        except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
            # One or more mapper tasks failed, or we received a SIGINT signal
            # while waiting; either way, we cancel all map tasks.
            for ref in new_metadata:
                ray.cancel(ref)
            # Wait until all tasks have failed or been cancelled.
            for ref in new_metadata:
                try:
                    ray.get(ref)
                except (ray.exceptions.RayTaskError,
                        ray.exceptions.TaskCancelledError):
                    pass
            # Reraise the original task failure exception.
            raise e from None
        return BlockList(list(new_blocks), list(new_metadata))
Example #23
def test_chaos_task_retry(set_kill_interval):
    # Chaos testing.
    @ray.remote(max_retries=-1)
    def task():
        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    # 50MB of return values.
    TOTAL_TASKS = 100

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    start = time.time()
    pb.block_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
Example #24
def simple_shuffle(
        input_blocks: BlockList[T],
        output_num_blocks: int,
        *,
        random_shuffle: bool = False,
        random_seed: Optional[int] = None,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        _spread_resource_prefix: Optional[str] = None) -> BlockList[T]:
    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    input_num_blocks = len(input_blocks)
    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        map_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, map_ray_remote_args)
        reduce_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, reduce_ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        map_resource_iter, reduce_resource_iter = itertools.tee(
            itertools.repeat({}), 2)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(**map_ray_remote_args,
                            num_returns=output_num_blocks,
                            resources=next(map_resource_iter)).remote(
                                block, i, output_num_blocks, random_shuffle,
                                random_seed)
        for i, block in enumerate(input_blocks)
    ]
    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce",
                             position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(**reduce_ray_remote_args,
                               num_returns=2,
                               resources=next(reduce_resource_iter)).
        remote(*[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))
Example #25
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        merge_factor: int = 2,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        logger.info("Using experimental push-based shuffle.")
        # TODO(swang): For jobs whose reduce work is heavier than the map work,
        # we should support fractional merge factors.
        # TODO(swang): For large jobs, we should try to choose the merge factor
        # automatically, e.g., by running one test round of map and merge tasks
        # and comparing their run times.
        # TODO(swang): Add option to automatically reduce write amplification
        # during map-merge stage, by limiting how many partitions can be
        # processed concurrently.
        input_blocks_list = input_blocks.get_blocks()
        # Preemptively clear the blocks list since we will incrementally delete
        # the last remaining references as we submit the dependent map tasks
        # during the map-merge stage.
        if clear_input_blocks:
            input_blocks.clear()

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        # The placement strategy for reduce tasks is overwritten to colocate
        # them with their inputs from the merge stage, so remove any
        # pre-specified scheduling strategy here.
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args.pop("scheduling_strategy", None)

        map_fn = self._map_partition
        merge_fn = self._merge

        def map_partition(*args, **kwargs):
            return map_fn(self.map, *args, **kwargs)

        def merge(*args, **kwargs):
            return merge_fn(self.reduce, *args, **kwargs)

        shuffle_map = cached_remote_fn(map_partition)
        shuffle_merge = cached_remote_fn(merge)

        def submit_map_task(arg):
            mapper_idx, block = arg
            # NOTE(swang): Results are shuffled between map and merge tasks, so
            # there is no advantage to colocating specific map and merge tasks.
            # Therefore, we do not specify a node affinity policy for map tasks
            # in case the caller or Ray has a better scheduling strategy, e.g.,
            # based on data locality.
            map_result = shuffle_map.options(
                **map_ray_remote_args,
                num_returns=1 + schedule.num_merge_tasks_per_round,
            ).remote(
                mapper_idx,
                block,
                output_num_blocks,
                schedule,
                *self._map_args,
            )
            metadata_ref = map_result.pop(0)
            return metadata_ref, map_result

        def submit_merge_task(arg):
            merge_idx, map_results = arg
            num_merge_returns = schedule.get_num_reducers_per_merge_idx(merge_idx)
            merge_result = shuffle_merge.options(
                num_returns=1 + num_merge_returns,
                **schedule.get_merge_task_options(merge_idx),
            ).remote(
                *map_results,
                reduce_args=self._reduce_args,
            )
            metadata_ref = merge_result.pop(0)
            return metadata_ref, merge_result

        # Compute all constants used for task scheduling.
        num_cpus_per_node_map = _get_num_cpus_per_node_map()
        schedule = self._compute_shuffle_schedule(
            num_cpus_per_node_map,
            len(input_blocks_list),
            merge_factor,
            output_num_blocks,
        )

        # ObjectRef results from the last round of tasks. Used to add
        # backpressure during pipelining of map and merge tasks.
        last_map_metadata_results = []
        last_merge_metadata_results = []
        # Final outputs from the map-merge stage.
        # This is a map from merge task index to a nested list of merge results
        # (ObjectRefs). Each merge task index corresponds to a partition of P
        # final reduce tasks.
        all_merge_results = [[] for _ in range(schedule.num_merge_tasks_per_round)]
        shuffle_map_metadata = []
        shuffle_merge_metadata = []
        map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))

        # Execute the map-merge stage. This submits tasks in rounds of M map
        # tasks and N merge tasks each. Task execution between map and merge is
        # pipelined, so that while executing merge for one round of inputs, we
        # also execute the map tasks for the following round.
        input_blocks_list = list(enumerate(input_blocks_list))
        while input_blocks_list:
            # Execute one round of the map stage.
            # Pop from the inputs so that we can clear the memory ASAP.
            round_input_blocks = []
            try:
                for _ in range(schedule.num_map_tasks_per_round):
                    round_input_blocks.append(input_blocks_list.pop(0))
            except IndexError:
                pass
            (
                prev_map_metadata,
                last_map_metadata_results,
                map_results,
            ) = _execute_pipelined_stage(
                submit_map_task,
                last_map_metadata_results,
                round_input_blocks,
                progress_bar=map_bar,
            )
            shuffle_map_metadata += prev_map_metadata

            # Shuffle the map results for the merge tasks.
            merge_args = [
                (merge_idx, [map_result.pop(0) for map_result in map_results])
                for merge_idx in range(schedule.num_merge_tasks_per_round)
            ]
            assert all([not map_result for map_result in map_results])
            # Execute one round of the merge stage.
            (
                prev_merge_metadata,
                last_merge_metadata_results,
                merge_results,
            ) = _execute_pipelined_stage(
                submit_merge_task,
                last_merge_metadata_results,
                merge_args,
            )
            shuffle_merge_metadata += prev_merge_metadata
            for merge_idx, merge_result in enumerate(merge_results):
                all_merge_results[merge_idx].append(merge_result)
            del merge_results

        # Wait for last map and merge tasks to finish.
        prev_map_metadata, _, _ = _execute_pipelined_stage(
            None, last_map_metadata_results, [], progress_bar=map_bar
        )
        shuffle_map_metadata += prev_map_metadata
        map_bar.close()
        prev_merge_metadata, _, _ = _execute_pipelined_stage(
            None, last_merge_metadata_results, []
        )
        shuffle_merge_metadata += prev_merge_metadata

        # Execute and wait for the reduce stage.
        new_metadata, new_blocks = self._execute_reduce_stage(
            output_num_blocks, schedule, reduce_ray_remote_args, all_merge_results
        )

        stats = {
            "map": shuffle_map_metadata,
            "merge": shuffle_merge_metadata,
            "reduce": new_metadata,
        }

        return BlockList(list(new_blocks), list(new_metadata)), stats
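The map-merge loop above submits tasks in rounds and applies backpressure by waiting on the previous round's metadata before starting the next one. A generic sketch of that call pattern; this is not Ray's actual _execute_pipelined_stage helper, only an illustration of how it is invoked above:

import ray


def execute_pipelined_stage(submit_fn, prev_metadata_refs, round_args):
    # Wait for the previous round's metadata before launching this round,
    # which bounds how many rounds of tasks can be in flight at once.
    prev_metadata = ray.get(prev_metadata_refs) if prev_metadata_refs else []
    metadata_refs, results = [], []
    for arg in round_args:
        metadata_ref, result = submit_fn(arg)
        metadata_refs.append(metadata_ref)
        results.append(result)
    return prev_metadata, metadata_refs, results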
Example #26
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        input_blocks_list = input_blocks.get_blocks()
        input_num_blocks = len(input_blocks_list)

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        if "scheduling_strategy" not in reduce_ray_remote_args:
            reduce_ray_remote_args = reduce_ray_remote_args.copy()
            reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"

        shuffle_map = cached_remote_fn(self.map)
        shuffle_reduce = cached_remote_fn(self.reduce)

        map_bar = ProgressBar("Shuffle Map", total=input_num_blocks)

        shuffle_map_out = [
            shuffle_map.options(
                **map_ray_remote_args,
                num_returns=1 + output_num_blocks,
            ).remote(i, block, output_num_blocks, *self._map_args)
            for i, block in enumerate(input_blocks_list)
        ]

        # The first item returned is the BlockMetadata.
        shuffle_map_metadata = []
        for i, refs in enumerate(shuffle_map_out):
            shuffle_map_metadata.append(refs[0])
            shuffle_map_out[i] = refs[1:]

        # Eagerly delete the input block references in order to eagerly release
        # the blocks' memory.
        del input_blocks_list
        if clear_input_blocks:
            input_blocks.clear()
        shuffle_map_metadata = map_bar.fetch_until_complete(
            shuffle_map_metadata)
        map_bar.close()

        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        shuffle_reduce_out = [
            shuffle_reduce.options(
                **reduce_ray_remote_args,
                num_returns=2,
            ).remote(
                *self._reduce_args,
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)],
            ) for j in range(output_num_blocks)
        ]
        # Eagerly delete the map block references in order to eagerly release
        # the blocks' memory.
        del shuffle_map_out
        new_blocks, new_metadata = zip(*shuffle_reduce_out)
        new_metadata = reduce_bar.fetch_until_complete(list(new_metadata))
        reduce_bar.close()

        stats = {
            "map": shuffle_map_metadata,
            "reduce": new_metadata,
        }

        return BlockList(list(new_blocks), list(new_metadata)), stats
Example #27
    def _apply(
        self,
        fn: Any,
        remote_args: dict,
        block_list: BlockList,
        clear_input_blocks: bool,
        name: Optional[str] = None,
    ) -> BlockList:
        """Note: this is not part of the Dataset public API."""
        context = DatasetContext.get_current()

        blocks_in = block_list.get_blocks_with_metadata()

        # Early release block references.
        if clear_input_blocks:
            block_list.clear()

        orig_num_blocks = len(blocks_in)
        results = []
        if name is None:
            name = "map"
        name = name.title()
        map_bar = ProgressBar(name, total=orig_num_blocks)

        class BlockWorker:
            def ready(self):
                return "ok"

            def map_block_split(self, block: Block,
                                input_files: List[str]) -> BlockPartition:
                return _map_block_split(block, fn, input_files)

            @ray.method(num_returns=2)
            def map_block_nosplit(
                    self, block: Block,
                    input_files: List[str]) -> Tuple[Block, BlockMetadata]:
                return _map_block_nosplit(block, fn, input_files)

        if not remote_args:
            remote_args["num_cpus"] = 1

        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        workers = [BlockWorker.remote() for _ in range(self.min_size)]
        tasks = {w.ready.remote(): w for w in workers}
        tasks_in_flight = collections.defaultdict(int)
        metadata_mapping = {}
        block_indices = {}
        ready_workers = set()

        while len(results) < orig_num_blocks:
            ready, _ = ray.wait(list(tasks.keys()),
                                timeout=0.01,
                                num_returns=1,
                                fetch_local=False)
            if not ready:
                if (len(workers) < self.max_size
                        and len(ready_workers) / len(workers) > 0.8):
                    w = BlockWorker.remote()
                    workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers),
                            len(workers) - len(ready_workers)))
                continue

            [obj_id] = ready
            worker = tasks.pop(obj_id)

            # Process task result.
            if worker in ready_workers:
                results.append(obj_id)
                tasks_in_flight[worker] -= 1
                map_bar.update(1)
            else:
                ready_workers.add(worker)
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers),
                        len(workers) - len(ready_workers)))

            # Schedule a new task.
            while (blocks_in and tasks_in_flight[worker] <
                   self.max_tasks_in_flight_per_actor):
                block, meta = blocks_in.pop()
                if context.block_splitting_enabled:
                    ref = worker.map_block_split.remote(
                        block, meta.input_files)
                else:
                    ref, meta_ref = worker.map_block_nosplit.remote(
                        block, meta.input_files)
                    metadata_mapping[ref] = meta_ref
                tasks[ref] = worker
                block_indices[ref] = len(blocks_in)
                tasks_in_flight[worker] += 1

        map_bar.close()
        new_blocks, new_metadata = [], []
        # Put blocks in input order.
        results.sort(key=block_indices.get)
        if context.block_splitting_enabled:
            for result in ray.get(results):
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block in results:
                new_blocks.append(block)
                new_metadata.append(metadata_mapping[block])
            new_metadata = ray.get(new_metadata)
        return BlockList(new_blocks, new_metadata)
Example #28
def run_actor_workload(total_num_cpus, smoke):
    """Run actor-based workload.

    The test checks if actor restart -1 and task_retries -1 works
    as expected. It basically requires many actors to report the
    seqno to the centralized DB actor while there are failures.
    If at least once is guaranteed upon failures, this test
    shouldn't fail.
    """
    @ray.remote(num_cpus=0)
    class DBActor:
        def __init__(self):
            self.letter_dict = set()

        def add(self, letter):
            self.letter_dict.add(letter)

        def get(self):
            return self.letter_dict

    @ray.remote(num_cpus=1, max_restarts=-1, max_task_retries=-1)
    class ReportActor:
        def __init__(self, db_actor):
            self.db_actor = db_actor

        def add(self, letter):
            ray.get(self.db_actor.add.remote(letter))

    NUM_CPUS = int(total_num_cpus)
    multiplier = 2
    # For smoke mode, run a smaller number of tasks.
    if smoke:
        multiplier = 1
    TOTAL_TASKS = int(300 * multiplier)
    current_node_ip = ray.worker.global_worker.node_ip_address
    db_actors = [
        DBActor.options(resources={
            f"node:{current_node_ip}": 0.001
        }).remote() for _ in range(NUM_CPUS)
    ]

    pb = ProgressBar("Chaos test", TOTAL_TASKS * NUM_CPUS)
    actors = []
    for db_actor in db_actors:
        actors.append(ReportActor.remote(db_actor))
    results = []
    highest_reported_num = 0
    for a in actors:
        for _ in range(TOTAL_TASKS):
            results.append(a.add.remote(str(highest_reported_num)))
            highest_reported_num += 1
    pb.fetch_until_complete(results)
    pb.close()
    for actor in actors:
        ray.kill(actor)

    # Consistency check
    wait_for_condition(lambda: (ray.cluster_resources().get("CPU", 0) == ray.
                                available_resources().get("CPU", 0)),
                       timeout=60)
    letter_set = set()
    for db_actor in db_actors:
        letter_set.update(ray.get(db_actor.get.remote()))
    # Make sure the DB actor didn't lose any report.
    # If this assert fails, it means at-least-once actor task semantics
    # were not guaranteed.
    for i in range(highest_reported_num):
        assert str(i) in letter_set, i
Example #29
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList) -> BlockList:
        context = DatasetContext.get_current()

        blocks_in = blocks.get_blocks_with_metadata()
        orig_num_blocks = len(blocks_in)
        results = []
        map_bar = ProgressBar("Map Progress", total=orig_num_blocks)

        class BlockWorker:
            def ready(self):
                return "ok"

            def map_block_split(self, block: Block,
                                input_files: List[str]) -> BlockPartition:
                return _map_block_split(block, fn, input_files)

            @ray.method(num_returns=2)
            def map_block_nosplit(
                    self, block: Block,
                    input_files: List[str]) -> Tuple[Block, BlockMetadata]:
                return _map_block_nosplit(block, fn, input_files)

        if not remote_args:
            remote_args["num_cpus"] = 1

        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        self.workers = [BlockWorker.remote()]
        tasks = {w.ready.remote(): w for w in self.workers}
        metadata_mapping = {}
        ready_workers = set()

        while len(results) < orig_num_blocks:
            ready, _ = ray.wait(list(tasks),
                                timeout=0.01,
                                num_returns=1,
                                fetch_local=False)
            if not ready:
                if len(ready_workers) / len(self.workers) > 0.8:
                    w = BlockWorker.remote()
                    self.workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers),
                            len(self.workers) - len(ready_workers)))
                continue

            [obj_id] = ready
            worker = tasks[obj_id]
            del tasks[obj_id]

            # Process task result.
            if worker in ready_workers:
                results.append(obj_id)
                map_bar.update(1)
            else:
                ready_workers.add(worker)

            # Schedule a new task.
            if blocks_in:
                block, meta = blocks_in.pop()
                if context.block_splitting_enabled:
                    ref = worker.map_block_split.remote(
                        block, meta.input_files)
                else:
                    ref, meta_ref = worker.map_block_nosplit.remote(
                        block, meta.input_files)
                    metadata_mapping[ref] = meta_ref
                tasks[ref] = worker

        map_bar.close()
        new_blocks, new_metadata = [], []
        if context.block_splitting_enabled:
            for result in ray.get(results):
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block in results:
                new_blocks.append(block)
                new_metadata.append(metadata_mapping[block])
        return BlockList(new_blocks, new_metadata)
Example #30
def simple_shuffle(
    input_blocks: BlockList,
    block_udf: Optional[Callable[[Block], Iterable[Block]]],
    output_num_blocks: int,
    *,
    random_shuffle: bool = False,
    random_seed: Optional[int] = None,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    _spread_resource_prefix: Optional[str] = None
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    input_blocks = input_blocks.get_blocks()
    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    if "scheduling_strategy" not in reduce_ray_remote_args:
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"
    input_num_blocks = len(input_blocks)
    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        map_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, map_ray_remote_args
        )
        reduce_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, reduce_ray_remote_args
        )
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        map_resource_iter, reduce_resource_iter = itertools.tee(itertools.repeat({}), 2)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + output_num_blocks,
            resources=next(map_resource_iter)
        ).remote(block, block_udf, i, output_num_blocks, random_shuffle, random_seed)
        for i, block in enumerate(input_blocks)
    ]

    # The first item returned is the BlockMetadata.
    shuffle_map_metadata = []
    for i, refs in enumerate(shuffle_map_out):
        shuffle_map_metadata.append(refs[0])
        shuffle_map_out[i] = refs[1:]

    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    shuffle_map_metadata = map_bar.fetch_until_complete(shuffle_map_metadata)
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce", position=0, total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(
            **reduce_ray_remote_args,
            num_returns=2,
            resources=next(reduce_resource_iter)
        ).remote(*[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    stats = {
        "map": shuffle_map_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats