Exemple #1
0
def run_task_workload(total_num_cpus, smoke):
    """Run task-based workload that doesn't require object reconstruction."""
    @ray.remote(num_cpus=1, max_retries=-1)
    def task():
        def generate_data(size_in_kb=10):
            return np.zeros(1024 * size_in_kb, dtype=np.uint8)

        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return generate_data(size_in_kb=50)

    @ray.remote(num_cpus=1, max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    multiplier = 75
    # For smoke mode, run less number of tasks
    if smoke:
        multiplier = 1
    TOTAL_TASKS = int(total_num_cpus * 2 * multiplier)

    pb = ProgressBar("Chaos test", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    pb.block_until_complete(results)
    pb.close()

    # Consistency check.
    wait_for_condition(
        lambda: (ray.cluster_resources().get("CPU", 0) == ray.
                 available_resources().get("CPU", 0)),
        timeout=60,
    )
def main():
    """The test simulates the workload with many threaded actors.

    Test is doing 4 things for 1 hour.

    - It first creates actors as many as num_cpus with max_concurrency=10
    - Each actor computes pi and put the result to the queue.
    - Driver keeps getting result & metadata from the actor.
    - Every X seconds, it kills all actors and restarts them.
    """
    ray.init(address="auto")
    args, unknown = parse_script_args()
    num_cpus = ray.cluster_resources()["CPU"]
    num_nodes = sum(1 for n in ray.nodes() if n["Alive"])
    print(f"Total number of actors: {num_cpus}, nodes: {num_nodes}")
    monitor_actor = monitor_memory_usage()

    start = time.time()
    while time.time() - start < args.test_runtime:
        # Step 1: Create actors and start computation loop.
        print("Create actors.")
        actors = start_actors(num_cpus, num_nodes)

        # Step 2: Get the pi result from actors.
        compute_start = time.time()
        print("Start computation.")
        while time.time() - compute_start < args.kill_interval_s:
            # Get the metadata.
            ray.get([actor.get_metadata.remote() for actor in actors])
            # Get the result.
            pb = ProgressBar("Computing Pi", num_cpus)
            results = [actor.get_pi.remote() for actor in actors]
            pb.fetch_until_complete(results)
            pb.close()

        # Step 3: Kill actors.
        print("Kill all actors.")
        for actor in actors:
            ray.kill(actor)

    # Report the result.
    print("PASSED.")
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")
    # Report the result.
    ray.get(monitor_actor.stop_run.remote())

    result = {"success": 0}
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps(result))
Exemple #3
0
    def _execute_reduce_stage(
        self,
        output_num_blocks: int,
        schedule: _PushBasedShuffleTaskSchedule,
        reduce_ray_remote_args: Dict[str, Any],
        all_merge_results: List[List[ObjectRef]],
    ):
        shuffle_reduce = cached_remote_fn(self.reduce)
        # Execute the final reduce stage.
        shuffle_reduce_out = []
        for reducer_idx in range(output_num_blocks):
            merge_idx = schedule.get_merge_idx_for_reducer_idx(reducer_idx)
            # Submit one partition of reduce tasks, one for each of the P
            # outputs produced by the corresponding merge task.
            # We also add the merge task arguments so that the reduce task
            # is colocated with its inputs.
            shuffle_reduce_out.append(
                shuffle_reduce.options(
                    **reduce_ray_remote_args,
                    **schedule.get_merge_task_options(merge_idx),
                    num_returns=2,
                ).remote(
                    *self._reduce_args,
                    *[
                        merge_results.pop(0)
                        for merge_results in all_merge_results[merge_idx]
                    ],
                )
            )
        for merge_idx, merge_results in enumerate(all_merge_results):
            assert all(len(merge_result) == 0 for merge_result in merge_results), (
                "Reduce stage did not process outputs from merge tasks at index: "
                f"{merge_idx}"
            )
        assert (
            len(shuffle_reduce_out) == output_num_blocks
        ), f"Expected {output_num_blocks} outputs, produced {len(shuffle_reduce_out)}"

        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        reduce_blocks, reduce_metadata = zip(*shuffle_reduce_out)
        reduce_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata))
        reduce_bar.close()

        return reduce_metadata, reduce_blocks
Exemple #4
0
def test_chaos_actor_retry(set_kill_interval):
    # Chaos testing.
    @ray.remote(num_cpus=0.25, max_restarts=-1, max_task_retries=-1)
    class Actor:
        def __init__(self):
            self.letter_dict = set()

        def add(self, letter):
            self.letter_dict.add(letter)

    NUM_CPUS = 16
    TOTAL_TASKS = 300

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS * NUM_CPUS)
    actors = [Actor.remote() for _ in range(NUM_CPUS)]
    results = []
    for a in actors:
        results.extend([a.add.remote(str(i)) for i in range(TOTAL_TASKS)])
    start = time.time()
    pb.fetch_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
Exemple #5
0
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    samples = sample_bar.fetch_until_complete(sample_results)
    sample_bar.close()
    del sample_results
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]
Exemple #6
0
def test_chaos_task_retry(set_kill_interval):
    # Chaos testing.
    @ray.remote(max_retries=-1)
    def task():
        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    # 50MB of return values.
    TOTAL_TASKS = 100

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    start = time.time()
    pb.block_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()
Exemple #7
0
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        input_blocks_list = input_blocks.get_blocks()
        input_num_blocks = len(input_blocks_list)

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        if "scheduling_strategy" not in reduce_ray_remote_args:
            reduce_ray_remote_args = reduce_ray_remote_args.copy()
            reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"

        shuffle_map = cached_remote_fn(self.map)
        shuffle_reduce = cached_remote_fn(self.reduce)

        map_bar = ProgressBar("Shuffle Map", total=input_num_blocks)

        shuffle_map_out = [
            shuffle_map.options(
                **map_ray_remote_args,
                num_returns=1 + output_num_blocks,
            ).remote(i, block, output_num_blocks, *self._map_args)
            for i, block in enumerate(input_blocks_list)
        ]

        # The first item returned is the BlockMetadata.
        shuffle_map_metadata = []
        for i, refs in enumerate(shuffle_map_out):
            shuffle_map_metadata.append(refs[0])
            shuffle_map_out[i] = refs[1:]

        # Eagerly delete the input block references in order to eagerly release
        # the blocks' memory.
        del input_blocks_list
        if clear_input_blocks:
            input_blocks.clear()
        shuffle_map_metadata = map_bar.fetch_until_complete(
            shuffle_map_metadata)
        map_bar.close()

        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        shuffle_reduce_out = [
            shuffle_reduce.options(
                **reduce_ray_remote_args,
                num_returns=2,
            ).remote(
                *self._reduce_args,
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)],
            ) for j in range(output_num_blocks)
        ]
        # Eagerly delete the map block references in order to eagerly release
        # the blocks' memory.
        del shuffle_map_out
        new_blocks, new_metadata = zip(*shuffle_reduce_out)
        new_metadata = reduce_bar.fetch_until_complete(list(new_metadata))
        reduce_bar.close()

        stats = {
            "map": shuffle_map_metadata,
            "reduce": new_metadata,
        }

        return BlockList(list(new_blocks), list(new_metadata)), stats
Exemple #8
0
    def _apply(
        self,
        fn: Any,
        remote_args: dict,
        block_list: BlockList,
        clear_input_blocks: bool,
        name: Optional[str] = None,
    ) -> BlockList:
        """Note: this is not part of the Dataset public API."""
        context = DatasetContext.get_current()

        blocks_in = block_list.get_blocks_with_metadata()

        # Early release block references.
        if clear_input_blocks:
            block_list.clear()

        orig_num_blocks = len(blocks_in)
        results = []
        if name is None:
            name = "map"
        name = name.title()
        map_bar = ProgressBar(name, total=orig_num_blocks)

        class BlockWorker:
            def ready(self):
                return "ok"

            def map_block_split(self, block: Block,
                                input_files: List[str]) -> BlockPartition:
                return _map_block_split(block, fn, input_files)

            @ray.method(num_returns=2)
            def map_block_nosplit(
                    self, block: Block,
                    input_files: List[str]) -> Tuple[Block, BlockMetadata]:
                return _map_block_nosplit(block, fn, input_files)

        if not remote_args:
            remote_args["num_cpus"] = 1

        remote_args["scheduling_strategy"] = context.scheduling_strategy

        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        workers = [BlockWorker.remote() for _ in range(self.min_size)]
        tasks = {w.ready.remote(): w for w in workers}
        tasks_in_flight = collections.defaultdict(int)
        metadata_mapping = {}
        block_indices = {}
        ready_workers = set()

        while len(results) < orig_num_blocks:
            ready, _ = ray.wait(list(tasks.keys()),
                                timeout=0.01,
                                num_returns=1,
                                fetch_local=False)
            if not ready:
                if (len(workers) < self.max_size
                        and len(ready_workers) / len(workers) > 0.8):
                    w = BlockWorker.remote()
                    workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers),
                            len(workers) - len(ready_workers)))
                continue

            [obj_id] = ready
            worker = tasks.pop(obj_id)

            # Process task result.
            if worker in ready_workers:
                results.append(obj_id)
                tasks_in_flight[worker] -= 1
                map_bar.update(1)
            else:
                ready_workers.add(worker)
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers),
                        len(workers) - len(ready_workers)))

            # Schedule a new task.
            while (blocks_in and tasks_in_flight[worker] <
                   self.max_tasks_in_flight_per_actor):
                block, meta = blocks_in.pop()
                if context.block_splitting_enabled:
                    ref = worker.map_block_split.remote(
                        block, meta.input_files)
                else:
                    ref, meta_ref = worker.map_block_nosplit.remote(
                        block, meta.input_files)
                    metadata_mapping[ref] = meta_ref
                tasks[ref] = worker
                block_indices[ref] = len(blocks_in)
                tasks_in_flight[worker] += 1

        map_bar.close()
        new_blocks, new_metadata = [], []
        # Put blocks in input order.
        results.sort(key=block_indices.get)
        if context.block_splitting_enabled:
            for result in ray.get(results):
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block in results:
                new_blocks.append(block)
                new_metadata.append(metadata_mapping[block])
            new_metadata = ray.get(new_metadata)
        return BlockList(new_blocks, new_metadata)
Exemple #9
0
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        merge_factor: int = 2,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        logger.info("Using experimental push-based shuffle.")
        # TODO(swang): For jobs whose reduce work is heavier than the map work,
        # we should support fractional merge factors.
        # TODO(swang): For large jobs, we should try to choose the merge factor
        # automatically, e.g., by running one test round of map and merge tasks
        # and comparing their run times.
        # TODO(swang): Add option to automatically reduce write amplification
        # during map-merge stage, by limiting how many partitions can be
        # processed concurrently.
        input_blocks_list = input_blocks.get_blocks()
        # Preemptively clear the blocks list since we will incrementally delete
        # the last remaining references as we submit the dependent map tasks
        # during the map-merge stage.
        if clear_input_blocks:
            input_blocks.clear()

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        # The placement strategy for reduce tasks is overwritten to colocate
        # them with their inputs from the merge stage, so remove any
        # pre-specified scheduling strategy here.
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args.pop("scheduling_strategy", None)

        map_fn = self._map_partition
        merge_fn = self._merge

        def map_partition(*args, **kwargs):
            return map_fn(self.map, *args, **kwargs)

        def merge(*args, **kwargs):
            return merge_fn(self.reduce, *args, **kwargs)

        shuffle_map = cached_remote_fn(map_partition)
        shuffle_merge = cached_remote_fn(merge)

        def submit_map_task(arg):
            mapper_idx, block = arg
            # NOTE(swang): Results are shuffled between map and merge tasks, so
            # there is no advantage to colocating specific map and merge tasks.
            # Therefore, we do not specify a node affinity policy for map tasks
            # in case the caller or Ray has a better scheduling strategy, e.g.,
            # based on data locality.
            map_result = shuffle_map.options(
                **map_ray_remote_args,
                num_returns=1 + schedule.num_merge_tasks_per_round,
            ).remote(
                mapper_idx,
                block,
                output_num_blocks,
                schedule,
                *self._map_args,
            )
            metadata_ref = map_result.pop(0)
            return metadata_ref, map_result

        def submit_merge_task(arg):
            merge_idx, map_results = arg
            num_merge_returns = schedule.get_num_reducers_per_merge_idx(merge_idx)
            merge_result = shuffle_merge.options(
                num_returns=1 + num_merge_returns,
                **schedule.get_merge_task_options(merge_idx),
            ).remote(
                *map_results,
                reduce_args=self._reduce_args,
            )
            metadata_ref = merge_result.pop(0)
            return metadata_ref, merge_result

        # Compute all constants used for task scheduling.
        num_cpus_per_node_map = _get_num_cpus_per_node_map()
        schedule = self._compute_shuffle_schedule(
            num_cpus_per_node_map,
            len(input_blocks_list),
            merge_factor,
            output_num_blocks,
        )

        # ObjectRef results from the last round of tasks. Used to add
        # backpressure during pipelining of map and merge tasks.
        last_map_metadata_results = []
        last_merge_metadata_results = []
        # Final outputs from the map-merge stage.
        # This is a map from merge task index to a nested list of merge results
        # (ObjectRefs). Each merge task index corresponds to a partition of P
        # final reduce tasks.
        all_merge_results = [[] for _ in range(schedule.num_merge_tasks_per_round)]
        shuffle_map_metadata = []
        shuffle_merge_metadata = []
        map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))

        # Execute the map-merge stage. This submits tasks in rounds of M map
        # tasks and N merge tasks each. Task execution between map and merge is
        # pipelined, so that while executing merge for one round of inputs, we
        # also execute the map tasks for the following round.
        input_blocks_list = list(enumerate(input_blocks_list))
        while input_blocks_list:
            # Execute one round of the map stage.
            # Pop from the inputs so that we can clear the memory ASAP.
            round_input_blocks = []
            try:
                for _ in range(schedule.num_map_tasks_per_round):
                    round_input_blocks.append(input_blocks_list.pop(0))
            except IndexError:
                pass
            (
                prev_map_metadata,
                last_map_metadata_results,
                map_results,
            ) = _execute_pipelined_stage(
                submit_map_task,
                last_map_metadata_results,
                round_input_blocks,
                progress_bar=map_bar,
            )
            shuffle_map_metadata += prev_map_metadata

            # Shuffle the map results for the merge tasks.
            merge_args = [
                (merge_idx, [map_result.pop(0) for map_result in map_results])
                for merge_idx in range(schedule.num_merge_tasks_per_round)
            ]
            assert all([not map_result for map_result in map_results])
            # Execute one round of the merge stage.
            (
                prev_merge_metadata,
                last_merge_metadata_results,
                merge_results,
            ) = _execute_pipelined_stage(
                submit_merge_task,
                last_merge_metadata_results,
                merge_args,
            )
            shuffle_merge_metadata += prev_merge_metadata
            for merge_idx, merge_result in enumerate(merge_results):
                all_merge_results[merge_idx].append(merge_result)
            del merge_results

        # Wait for last map and merge tasks to finish.
        prev_map_metadata, _, _ = _execute_pipelined_stage(
            None, last_map_metadata_results, [], progress_bar=map_bar
        )
        shuffle_map_metadata += prev_map_metadata
        map_bar.close()
        prev_merge_metadata, _, _ = _execute_pipelined_stage(
            None, last_merge_metadata_results, []
        )
        shuffle_merge_metadata += prev_merge_metadata

        # Execute and wait for the reduce stage.
        new_metadata, new_blocks = self._execute_reduce_stage(
            output_num_blocks, schedule, reduce_ray_remote_args, all_merge_results
        )

        stats = {
            "map": shuffle_map_metadata,
            "merge": shuffle_merge_metadata,
            "reduce": new_metadata,
        }

        return BlockList(list(new_blocks), list(new_metadata)), stats
Exemple #10
0
def fast_repartition(blocks, num_blocks):
    from ray.data.dataset import Dataset

    wrapped_ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)), 0, lazy=False
    )
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(_ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits
        if s.num_blocks() > 0
    ]

    # Early-release memory.
    del splits, blocks, wrapped_ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data._internal.arrow_block import ArrowBlockBuilder
        from ray.data._internal.pandas_block import PandasBlockBuilder
        from ray.data._internal.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None
        )  # No stats for empty block.
        empty_blocks, empty_metadata = zip(
            *[(ray.put(empty_block), empty_meta) for _ in range(num_empties)]
        )
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
Exemple #11
0
    def _apply(
        self,
        block_fn: BlockTransform,
        remote_args: dict,
        block_list: BlockList,
        clear_input_blocks: bool,
        name: Optional[str] = None,
        fn: Optional[UDF] = None,
        fn_args: Optional[Iterable[Any]] = None,
        fn_kwargs: Optional[Dict[str, Any]] = None,
        fn_constructor_args: Optional[Iterable[Any]] = None,
        fn_constructor_kwargs: Optional[Dict[str, Any]] = None,
    ) -> BlockList:
        """Note: this is not part of the Dataset public API."""
        if fn_args is None:
            fn_args = tuple()
        if fn_kwargs is None:
            fn_kwargs = {}
        if fn_constructor_args is None:
            fn_constructor_args = tuple()
        if fn_constructor_kwargs is None:
            fn_constructor_kwargs = {}

        context = DatasetContext.get_current()

        blocks_in = block_list.get_blocks_with_metadata()

        # Early release block references.
        if clear_input_blocks:
            block_list.clear()

        orig_num_blocks = len(blocks_in)
        results = []
        if name is None:
            name = "map"
        name = name.title()
        map_bar = ProgressBar(name, total=orig_num_blocks)

        class BlockWorker:
            def __init__(
                self,
                *fn_constructor_args: Any,
                **fn_constructor_kwargs: Any,
            ):
                if not isinstance(fn, CallableClass):
                    if fn_constructor_args or fn_constructor_kwargs:
                        raise ValueError(
                            "fn_constructor_{kw}args only valid for CallableClass "
                            f"UDFs, but got: {fn}"
                        )
                    self.fn = fn
                else:
                    self.fn = fn(*fn_constructor_args, **fn_constructor_kwargs)

            def ready(self):
                return "ok"

            def map_block_split(
                self,
                block: Block,
                input_files: List[str],
                *fn_args,
                **fn_kwargs,
            ) -> BlockPartition:
                return _map_block_split(
                    block, block_fn, input_files, self.fn, *fn_args, **fn_kwargs
                )

            @ray.method(num_returns=2)
            def map_block_nosplit(
                self,
                block: Block,
                input_files: List[str],
                *fn_args,
                **fn_kwargs,
            ) -> Tuple[Block, BlockMetadata]:
                return _map_block_nosplit(
                    block, block_fn, input_files, self.fn, *fn_args, **fn_kwargs
                )

        if "num_cpus" not in remote_args:
            remote_args["num_cpus"] = 1

        if "scheduling_strategy" not in remote_args:
            ctx = DatasetContext.get_current()
            if ctx.scheduling_strategy == DEFAULT_SCHEDULING_STRATEGY:
                remote_args["scheduling_strategy"] = "SPREAD"
            else:
                remote_args["scheduling_strategy"] = ctx.scheduling_strategy

        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        workers = [
            BlockWorker.remote(*fn_constructor_args, **fn_constructor_kwargs)
            for _ in range(self.min_size)
        ]
        tasks = {w.ready.remote(): w for w in workers}
        tasks_in_flight = collections.defaultdict(int)
        metadata_mapping = {}
        block_indices = {}
        ready_workers = set()

        try:
            while len(results) < orig_num_blocks:
                ready, _ = ray.wait(
                    list(tasks.keys()), timeout=0.01, num_returns=1, fetch_local=False
                )
                if not ready:
                    if (
                        len(workers) < self.max_size
                        and len(ready_workers) / len(workers)
                        > self.ready_to_total_workers_ratio
                    ):
                        w = BlockWorker.remote(
                            *fn_constructor_args, **fn_constructor_kwargs
                        )
                        workers.append(w)
                        tasks[w.ready.remote()] = w
                        map_bar.set_description(
                            "Map Progress ({} actors {} pending)".format(
                                len(ready_workers), len(workers) - len(ready_workers)
                            )
                        )
                    continue

                [obj_id] = ready
                worker = tasks.pop(obj_id)

                # Process task result.
                if worker in ready_workers:
                    results.append(obj_id)
                    tasks_in_flight[worker] -= 1
                    map_bar.update(1)
                else:
                    ready_workers.add(worker)
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers), len(workers) - len(ready_workers)
                        )
                    )

                # Schedule a new task.
                while (
                    blocks_in
                    and tasks_in_flight[worker] < self.max_tasks_in_flight_per_actor
                ):
                    block, meta = blocks_in.pop()
                    if context.block_splitting_enabled:
                        ref = worker.map_block_split.remote(
                            block,
                            meta.input_files,
                            *fn_args,
                            **fn_kwargs,
                        )
                    else:
                        ref, meta_ref = worker.map_block_nosplit.remote(
                            block,
                            meta.input_files,
                            *fn_args,
                            **fn_kwargs,
                        )
                        metadata_mapping[ref] = meta_ref
                    tasks[ref] = worker
                    block_indices[ref] = len(blocks_in)
                    tasks_in_flight[worker] += 1

            map_bar.close()
            self.num_workers += len(workers)
            new_blocks, new_metadata = [], []
            # Put blocks in input order.
            results.sort(key=block_indices.get)
            if context.block_splitting_enabled:
                for result in ray.get(results):
                    for block, metadata in result:
                        new_blocks.append(block)
                        new_metadata.append(metadata)
            else:
                for block in results:
                    new_blocks.append(block)
                    new_metadata.append(metadata_mapping[block])
                new_metadata = ray.get(new_metadata)
            return BlockList(new_blocks, new_metadata)

        except Exception as e:
            try:
                for worker in workers:
                    ray.kill(worker)
            except Exception as err:
                logger.exception(f"Error killing workers: {err}")
            finally:
                raise e
Exemple #12
0
def run_actor_workload(total_num_cpus, smoke):
    """Run actor-based workload.

    The test checks if actor restart -1 and task_retries -1 works
    as expected. It basically requires many actors to report the
    seqno to the centralized DB actor while there are failures.
    If at least once is guaranteed upon failures, this test
    shouldn't fail.
    """
    @ray.remote(num_cpus=0)
    class DBActor:
        def __init__(self):
            self.letter_dict = set()

        def add(self, letter):
            self.letter_dict.add(letter)

        def get(self):
            return self.letter_dict

    @ray.remote(num_cpus=1, max_restarts=-1, max_task_retries=-1)
    class ReportActor:
        def __init__(self, db_actor):
            self.db_actor = db_actor

        def add(self, letter):
            ray.get(self.db_actor.add.remote(letter))

    NUM_CPUS = int(total_num_cpus)
    multiplier = 2
    # For smoke mode, run less number of tasks
    if smoke:
        multiplier = 1
    TOTAL_TASKS = int(300 * multiplier)
    current_node_ip = ray.worker.global_worker.node_ip_address
    db_actors = [
        DBActor.options(resources={
            f"node:{current_node_ip}": 0.001
        }).remote() for _ in range(NUM_CPUS)
    ]

    pb = ProgressBar("Chaos test", TOTAL_TASKS * NUM_CPUS)
    actors = []
    for db_actor in db_actors:
        actors.append(ReportActor.remote(db_actor))
    results = []
    highest_reported_num = 0
    for a in actors:
        for _ in range(TOTAL_TASKS):
            results.append(a.add.remote(str(highest_reported_num)))
            highest_reported_num += 1
    pb.fetch_until_complete(results)
    pb.close()
    for actor in actors:
        ray.kill(actor)

    # Consistency check
    wait_for_condition(
        lambda: (ray.cluster_resources().get("CPU", 0) == ray.
                 available_resources().get("CPU", 0)),
        timeout=60,
    )
    letter_set = set()
    for db_actor in db_actors:
        letter_set.update(ray.get(db_actor.get.remote()))
    # Make sure the DB actor didn't lose any report.
    # If this assert fails, that means at least once actor task semantic
    # wasn't guaranteed.
    for i in range(highest_reported_num):
        assert str(i) in letter_set, i
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        merge_factor: int = 2,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        logger.info("Using experimental push-based shuffle.")
        # TODO(swang): For jobs whose reduce work is heavier than the map work,
        # we should support fractional merge factors.
        # TODO(swang): For large jobs, we should try to choose the merge factor
        # automatically, e.g., by running one test round of map and merge tasks
        # and comparing their run times.
        # TODO(swang): Add option to automatically reduce write amplification
        # during map-merge stage, by limiting how many partitions can be
        # processed concurrently.
        input_blocks_list = input_blocks.get_blocks()
        # Preemptively clear the blocks list since we will incrementally delete
        # the last remaining references as we submit the dependent map tasks
        # during the map-merge stage.
        if clear_input_blocks:
            input_blocks.clear()

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        # The placement strategy for reduce tasks is overwritten to colocate
        # them with their inputs from the merge stage, so remove any
        # pre-specified scheduling strategy here.
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args.pop("scheduling_strategy", None)

        # Compute all constants used for task scheduling.
        num_cpus_per_node_map = _get_num_cpus_per_node_map()
        stage = self._compute_shuffle_schedule(
            num_cpus_per_node_map,
            len(input_blocks_list),
            merge_factor,
            output_num_blocks,
        )

        map_fn = self._map_partition
        merge_fn = self._merge

        def map_partition(*args, **kwargs):
            return map_fn(self.map, *args, **kwargs)

        def merge(*args, **kwargs):
            return merge_fn(self.reduce, *args, **kwargs)

        shuffle_map = cached_remote_fn(map_partition)
        shuffle_map = shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + stage.num_merge_tasks_per_round,
        )

        map_stage_iter = _MapStageIterator(
            input_blocks_list,
            shuffle_map,
            [output_num_blocks, stage.merge_schedule, *self._map_args],
        )
        map_bar = ProgressBar("Shuffle Map",
                              position=0,
                              total=len(input_blocks_list))
        map_stage_executor = _PipelinedStageExecutor(
            map_stage_iter,
            stage.num_map_tasks_per_round,
            progress_bar=map_bar)

        shuffle_merge = cached_remote_fn(merge)
        merge_stage_iter = _MergeStageIterator(map_stage_iter, shuffle_merge,
                                               stage, self._reduce_args)
        merge_stage_executor = _PipelinedStageExecutor(
            merge_stage_iter,
            stage.num_merge_tasks_per_round,
            max_concurrent_rounds=2)

        # Execute the map-merge stage. This submits tasks in rounds of M map
        # tasks and N merge tasks each. Task execution between map and merge is
        # pipelined, so that while executing merge for one round of inputs, we
        # also execute the map tasks for the following round.
        map_done = False
        merge_done = False
        map_stage_metadata = []
        merge_stage_metadata = []
        while not (map_done and merge_done):
            try:
                map_stage_metadata += next(map_stage_executor)
            except StopIteration:
                map_done = True
                break

            try:
                merge_stage_metadata += next(merge_stage_executor)
            except StopIteration:
                merge_done = True
                break

        map_bar.close()
        all_merge_results = merge_stage_iter.pop_merge_results()

        # Execute and wait for the reduce stage.
        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        shuffle_reduce = cached_remote_fn(self.reduce)
        reduce_stage_iter = _ReduceStageIterator(
            stage,
            shuffle_reduce,
            all_merge_results,
            reduce_ray_remote_args,
            self._reduce_args,
        )

        max_reduce_tasks_in_flight = output_num_blocks
        ctx = DatasetContext.get_current()
        if ctx.pipeline_push_based_shuffle_reduce_tasks:
            # If pipelining is enabled, we should still try to utilize all
            # cores.
            max_reduce_tasks_in_flight = min(
                max_reduce_tasks_in_flight,
                sum(num_cpus_per_node_map.values()))

        reduce_stage_executor = _PipelinedStageExecutor(
            reduce_stage_iter,
            max_reduce_tasks_in_flight,
            max_concurrent_rounds=2,
            progress_bar=reduce_bar,
        )
        reduce_stage_metadata = []
        while True:
            try:
                reduce_stage_metadata += next(reduce_stage_executor)
            except StopIteration:
                break

        new_blocks = reduce_stage_iter.pop_reduce_results()
        sorted_blocks = [(block[0], block[1], reduce_stage_metadata[i])
                         for i, block in enumerate(new_blocks)]
        sorted_blocks.sort(key=lambda x: x[0])
        _, new_blocks, reduce_stage_metadata = zip(*sorted_blocks)
        del sorted_blocks

        assert (
            len(new_blocks) == output_num_blocks
        ), f"Expected {output_num_blocks} outputs, produced {len(new_blocks)}"
        reduce_bar.close()

        stats = {
            "map": map_stage_metadata,
            "merge": merge_stage_metadata,
            "reduce": reduce_stage_metadata,
        }

        return BlockList(list(new_blocks), list(reduce_stage_metadata)), stats