Python PlacementGroupSchedulingStrategyの例、ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy Pythonの例

コード例 #1

0

ファイルを表示

ファイル: key_concepts.py プロジェクト: vishalbelsare/ray

def objective(*args):
    """Run a small Datasets workload inside the caller's placement group.

    Any positional arguments are accepted but not used.
    """
    dataset_ctx = DatasetContext.get_current()
    # Reuse whatever placement group this code is currently running in, so
    # that every Datasets task is scheduled with it.
    current_pg = ray.util.get_current_placement_group()
    dataset_ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(
        current_pg)
    # The read and map tasks of this Dataset now use that placement group.
    ray.data.range(10).show()

コード例 #2

0

ファイルを表示

ファイル: test_scheduling_2.py プロジェクト: novahe/ray

def test_default_scheduling_strategy(ray_start_cluster, connect_to_client):
    """Check node placement when DEFAULT_SCHEDULING_STRATEGY is used.

    The cluster has a "head" node (16 CPUs, no GPUs) and a "worker" node
    (8 CPUs, 8 GPUs). The placement group asks for a GPU in every bundle,
    and only the worker node is declared with GPUs.

    Args:
        ray_start_cluster: Fixture providing the cluster to add nodes to.
        connect_to_client: Passed to ``connect_to_client_or_not`` to choose
            whether the body runs through the client connection.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=16,
                     resources={"head": 1},
                     _system_config={"scheduler_spread_threshold": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()

    ray.init(address=cluster.address)
    pg = ray.util.placement_group(bundles=[{
        "CPU": 1,
        "GPU": 1
    }, {
        "CPU": 1,
        "GPU": 1
    }])
    # Block once until the placement group is ready; a second identical
    # wait would add nothing.
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        # Pin one call to each node through its custom resource to learn
        # the two node ids.
        head_node_id = ray.get(
            get_node_id_1.options(resources={
                "head": 1
            }).remote())
        worker_node_id = ray.get(
            get_node_id_1.options(resources={
                "worker": 1
            }).remote())

        # Without any resource pin, the default strategy is expected to
        # pick the head node.
        assert ray.get(get_node_id_1.remote()) == head_node_id

        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg))
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        # Overriding the decorator's placement group strategy with the
        # default strategy at call time is expected to land on the head node.
        assert ray.get(
            get_node_id_2.options(
                scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY).remote()
        ) == head_node_id

        @ray.remote
        def get_node_id_3():
            return ray.worker.global_worker.current_node_id

        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg,
                        placement_group_capture_child_tasks=True))
        class Actor1():
            def get_node_ids(self):
                return [
                    ray.worker.global_worker.current_node_id,
                    # Use parent's placement group
                    ray.get(get_node_id_3.remote()),
                    # An explicit default strategy opts the child task out
                    # of the captured placement group.
                    ray.get(
                        get_node_id_3.options(
                            scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY).
                        remote())
                ]

        actor1 = Actor1.remote()
        # Expected order: the actor itself, the captured child task, then
        # the child task that opted out.
        assert ray.get(actor1.get_node_ids.remote()) == \
               [worker_node_id, worker_node_id, head_node_id]

コード例 #3

0

ファイルを表示

ファイル: test_scheduling_2.py プロジェクト: novahe/ray

def test_placement_group_scheduling_strategy(ray_start_cluster,
                                             connect_to_client):
    """Exercise PlacementGroupSchedulingStrategy on tasks and actors.

    The cluster has a "head" node (8 CPUs, no GPUs) and a "worker" node
    (8 CPUs, 8 GPUs). Every bundle of the placement group asks for a GPU,
    and only the worker node is declared with GPUs, so work scheduled into
    the placement group is asserted to run on the worker node.

    The trailing ``pytest.raises`` sections check that invalid option
    combinations are rejected with ``ValueError``.

    Args:
        ray_start_cluster: Fixture providing the cluster to add nodes to.
        connect_to_client: Passed to ``connect_to_client_or_not`` to choose
            whether the body runs through the client connection.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=8, resources={"head": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()

    ray.init(address=cluster.address)
    pg = ray.util.placement_group(bundles=[{
        "CPU": 1,
        "GPU": 1
    }, {
        "CPU": 1,
        "GPU": 1
    }])
    # Block until the placement group is ready.
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        # Pin a call to the worker node through its custom resource to
        # learn that node's id.
        worker_node_id = ray.get(
            get_node_id_1.options(resources={
                "worker": 1
            }).remote())

        # Case 1: the strategy is supplied per call through .options(),
        # overriding the decorator's default strategy.
        assert ray.get(
            get_node_id_1.options(
                num_cpus=1,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg)).remote()) == worker_node_id

        # Case 2: the strategy is supplied in the task decorator.
        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg))
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert ray.get(get_node_id_2.remote()) == worker_node_id

        # Case 3: the strategy is supplied in the actor decorator.
        @ray.remote(num_cpus=1,
                    scheduling_strategy=PlacementGroupSchedulingStrategy(
                        placement_group=pg))
        class Actor1():
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_id.remote()) == worker_node_id

        # Case 4: the strategy is supplied per actor through .options().
        @ray.remote
        class Actor2():
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor2 = Actor2.options(
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg)).remote()
        assert ray.get(actor2.get_node_id.remote()) == worker_node_id

    # Passing the placement_group option together with a scheduling_strategy
    # is expected to raise ValueError.
    with pytest.raises(ValueError):

        @ray.remote(scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg))
        def func():
            return 0

        func.options(placement_group=pg).remote()

    # An arbitrary string is expected to be rejected as a scheduling strategy.
    with pytest.raises(ValueError):

        @ray.remote
        def func():
            return 0

        func.options(scheduling_strategy="XXX").remote()

    # A PlacementGroupSchedulingStrategy without a placement group is
    # expected to be rejected.
    with pytest.raises(ValueError):

        @ray.remote
        def func():
            return 0

        func.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=None)).remote()

コード例 #4

0

ファイルを表示

ファイル: remote_function.py プロジェクト: tchordia/ray

    def _remote(self, args=None, kwargs=None, **task_options):
        """Submit the remote function for execution.

        Args:
            args: Positional arguments for the remote function. ``None`` is
                treated as an empty list.
            kwargs: Keyword arguments for the remote function. ``None`` is
                treated as an empty dict.
            **task_options: Per-call task options. Any option listed in
                ``ray_option_utils.task_options`` that is missing here is
                filled in with that option's ``default_value``. A
                ``"max_calls"`` entry is discarded.

        Returns:
            When ``client_mode_should_convert(auto_init=True)`` is true, the
            result of ``client_mode_convert_function``. Otherwise the result
            of the (optionally decorated) invocation: the single object ref
            when exactly one is produced, the list of refs when more than
            one is produced, and ``None`` when none is produced.

        Raises:
            TypeError: If pickling the function raises ``TypeError``; the
                original exception is chained as the cause.
        """
        # We pop the "max_calls" coming from "@ray.remote" here. We no longer need
        # it in "_remote()".
        task_options.pop("max_calls", None)
        if client_mode_should_convert(auto_init=True):
            return client_mode_convert_function(self, args, kwargs,
                                                **task_options)

        worker = ray.worker.global_worker
        worker.check_connected()

        # If this function was not exported in this session and job, we need to
        # export this function again, because the current GCS doesn't have it.
        if (not self._is_cross_language and self._last_export_session_and_job
                != worker.current_session_and_job):
            self._function_descriptor = PythonFunctionDescriptor.from_function(
                self._function, self._uuid)
            # There is an interesting question here. If the remote function is
            # used by a subsequent driver (in the same script), should the
            # second driver pickle the function again? If yes, then the remote
            # function definition can differ in the second driver (e.g., if
            # variables in its closure have changed). We probably want the
            # behavior of the remote function in the second driver to be
            # independent of whether or not the function was invoked by the
            # first driver. This is an argument for repickling the function,
            # which we do here.
            try:
                self._pickled_function = pickle.dumps(self._function)
            except TypeError as e:
                msg = (
                    "Could not serialize the function "
                    f"{self._function_descriptor.repr}. Check "
                    "https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting "  # noqa
                    "for more information.")
                raise TypeError(msg) from e

            # Remember which session/job this export belongs to so that the
            # export is skipped on later calls from the same session/job.
            self._last_export_session_and_job = worker.current_session_and_job
            worker.function_actor_manager.export(self)

        kwargs = {} if kwargs is None else kwargs
        args = [] if args is None else args

        # fill task required options
        for k, v in ray_option_utils.task_options.items():
            task_options[k] = task_options.get(k, v.default_value)
        # "max_calls" already takes effects and should not apply again.
        # Remove the default value here.
        task_options.pop("max_calls", None)

        # TODO(suquark): cleanup these fields
        name = task_options["name"]
        runtime_env = parse_runtime_env(task_options["runtime_env"])
        placement_group = task_options["placement_group"]
        placement_group_bundle_index = task_options[
            "placement_group_bundle_index"]
        placement_group_capture_child_tasks = task_options[
            "placement_group_capture_child_tasks"]
        scheduling_strategy = task_options["scheduling_strategy"]
        num_returns = task_options["num_returns"]
        max_retries = task_options["max_retries"]
        retry_exceptions = task_options["retry_exceptions"]

        resources = ray._private.utils.resources_from_ray_options(task_options)

        # Placement group handling only applies when no strategy was given
        # or when the strategy is a PlacementGroupSchedulingStrategy; any
        # other strategy value is passed through unchanged.
        if scheduling_strategy is None or isinstance(
                scheduling_strategy, PlacementGroupSchedulingStrategy):
            if isinstance(scheduling_strategy,
                          PlacementGroupSchedulingStrategy):
                # The strategy object's fields replace the values read from
                # the individual placement_group* options above.
                placement_group = scheduling_strategy.placement_group
                placement_group_bundle_index = (
                    scheduling_strategy.placement_group_bundle_index)
                placement_group_capture_child_tasks = (
                    scheduling_strategy.placement_group_capture_child_tasks)

            # Fall back to the worker-level setting when the capture flag
            # was left unspecified.
            if placement_group_capture_child_tasks is None:
                placement_group_capture_child_tasks = (
                    worker.should_capture_child_tasks_in_placement_group)
            # NOTE(review): presumably resolves the effective placement
            # group from the options and the current context - confirm
            # against configure_placement_group_based_on_context.
            placement_group = configure_placement_group_based_on_context(
                placement_group_capture_child_tasks,
                placement_group_bundle_index,
                resources,
                {},  # no placement_resources for tasks
                self._function_descriptor.function_name,
                placement_group=placement_group,
            )
            # Rebuild the strategy from the resolved placement group, or use
            # the "DEFAULT" strategy when the resolved group is empty.
            if not placement_group.is_empty:
                scheduling_strategy = PlacementGroupSchedulingStrategy(
                    placement_group,
                    placement_group_bundle_index,
                    placement_group_capture_child_tasks,
                )
            else:
                scheduling_strategy = "DEFAULT"

        # Only build the serialized runtime env info when a runtime env was
        # given; otherwise "{}" is submitted below.
        serialized_runtime_env_info = None
        if runtime_env is not None:
            serialized_runtime_env_info = get_runtime_env_info(
                runtime_env,
                is_job_runtime_env=False,
                serialize=True,
            )

        def invocation(args, kwargs):
            # Build the flat argument list: cross-language functions use
            # their own formatter, and a call with no arguments and no
            # recorded signature skips flattening.
            if self._is_cross_language:
                list_args = cross_language.format_args(worker, args, kwargs)
            elif not args and not kwargs and not self._function_signature:
                list_args = []
            else:
                list_args = ray._private.signature.flatten_args(
                    self._function_signature, args, kwargs)

            if worker.mode == ray.worker.LOCAL_MODE:
                assert (
                    not self._is_cross_language
                ), "Cross language remote function cannot be executed locally."
            object_refs = worker.core_worker.submit_task(
                self._language,
                self._function_descriptor,
                list_args,
                name if name is not None else "",
                num_returns,
                resources,
                max_retries,
                retry_exceptions,
                scheduling_strategy,
                worker.debugger_breakpoint,
                serialized_runtime_env_info or "{}",
            )
            # Reset worker's debug context from the last "remote" command
            # (which applies only to this .remote call).
            worker.debugger_breakpoint = b""
            # One ref is unwrapped, several are returned as-is, and zero
            # refs falls through to an implicit None.
            if len(object_refs) == 1:
                return object_refs[0]
            elif len(object_refs) > 1:
                return object_refs

        # Let an optional decorator wrap the submission step.
        if self._decorator is not None:
            invocation = self._decorator(invocation)

        return invocation(args, kwargs)

コード例 #5

0

ファイルを表示

ファイル: test_warnings.py プロジェクト: vishalbelsare/ray

 def g(*a):
     """Run a tiny Datasets workload in the current placement group.

     Any positional arguments are accepted but not used.
     """
     dataset_ctx = DatasetContext.get_current()
     # Schedule Datasets tasks with the placement group this code runs in.
     active_pg = ray.util.get_current_placement_group()
     dataset_ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(
         active_pg)
     ray.data.range(10).show()

コード例 #6

0

ファイルを表示

ファイル: remote_function.py プロジェクト: ijrsvt/ray

    def _remote(
        self,
        args=None,
        kwargs=None,
        num_returns=None,
        num_cpus=None,
        num_gpus=None,
        memory=None,
        object_store_memory=None,
        accelerator_type=None,
        resources=None,
        max_retries=None,
        retry_exceptions=None,
        placement_group="default",
        placement_group_bundle_index=-1,
        placement_group_capture_child_tasks=None,
        runtime_env=None,
        name="",
        scheduling_strategy: SchedulingStrategyT = None,
    ):
        """Submit the remote function for execution.

        Args:
            args: Positional arguments for the remote function. ``None`` is
                treated as an empty list.
            kwargs: Keyword arguments for the remote function. ``None`` is
                treated as an empty dict.
            num_returns: Per-call override; ``None`` falls back to
                ``self._num_returns``.
            num_cpus, num_gpus, memory, object_store_memory,
            accelerator_type, resources: Per-call resource values, combined
                with the corresponding ``self._*`` values by
                ``resources_from_resource_arguments``.
            max_retries: Per-call override; ``None`` falls back to
                ``self._max_retries``.
            retry_exceptions: Per-call override; ``None`` falls back to
                ``self._retry_exceptions``.
            placement_group: Placement group for this call. The string
                ``"default"`` is a sentinel meaning "not specified", in
                which case ``self._placement_group`` is used.
            placement_group_bundle_index: Bundle index forwarded to the
                placement group handling.
            placement_group_capture_child_tasks: ``None`` falls back to
                ``worker.should_capture_child_tasks_in_placement_group``.
            runtime_env: A falsy value or the string ``"{}"`` falls back to
                ``self._runtime_env``.
            name: Name forwarded to ``submit_task``.
            scheduling_strategy: Per-call override; ``None`` falls back to
                ``self._scheduling_strategy``.

        Returns:
            When ``client_mode_should_convert(auto_init=True)`` is true, the
            result of ``client_mode_convert_function``. Otherwise the result
            of the (optionally decorated) invocation: the single object ref
            when exactly one is produced, the list of refs when more than
            one is produced, and ``None`` when none is produced.

        Raises:
            TypeError: If pickling the function raises ``TypeError``; the
                original exception is chained as the cause.
            ValueError: If ``placement_group`` is not ``"default"`` while a
                scheduling strategy is in effect (given per call or taken
                from ``self._scheduling_strategy``).
        """

        if client_mode_should_convert(auto_init=True):
            return client_mode_convert_function(
                self,
                args,
                kwargs,
                num_returns=num_returns,
                num_cpus=num_cpus,
                num_gpus=num_gpus,
                memory=memory,
                object_store_memory=object_store_memory,
                accelerator_type=accelerator_type,
                resources=resources,
                max_retries=max_retries,
                retry_exceptions=retry_exceptions,
                placement_group=placement_group,
                placement_group_bundle_index=placement_group_bundle_index,
                placement_group_capture_child_tasks=(
                    placement_group_capture_child_tasks),
                runtime_env=runtime_env,
                name=name,
                scheduling_strategy=scheduling_strategy,
            )

        worker = ray.worker.global_worker
        worker.check_connected()

        # If this function was not exported in this session and job, we need to
        # export this function again, because the current GCS doesn't have it.
        if (not self._is_cross_language and self._last_export_session_and_job
                != worker.current_session_and_job):
            self._function_descriptor = PythonFunctionDescriptor.from_function(
                self._function, self._uuid)
            # There is an interesting question here. If the remote function is
            # used by a subsequent driver (in the same script), should the
            # second driver pickle the function again? If yes, then the remote
            # function definition can differ in the second driver (e.g., if
            # variables in its closure have changed). We probably want the
            # behavior of the remote function in the second driver to be
            # independent of whether or not the function was invoked by the
            # first driver. This is an argument for repickling the function,
            # which we do here.
            try:
                self._pickled_function = pickle.dumps(self._function)
            except TypeError as e:
                msg = (
                    "Could not serialize the function "
                    f"{self._function_descriptor.repr}. Check "
                    "https://docs.ray.io/en/master/serialization.html#troubleshooting "  # noqa
                    "for more information.")
                raise TypeError(msg) from e

            # Remember which session/job this export belongs to so that the
            # export is skipped on later calls from the same session/job.
            self._last_export_session_and_job = worker.current_session_and_job
            worker.function_actor_manager.export(self)

        kwargs = {} if kwargs is None else kwargs
        args = [] if args is None else args

        # Per-call values left as None fall back to the values stored on
        # this remote function object.
        if num_returns is None:
            num_returns = self._num_returns
        if max_retries is None:
            max_retries = self._max_retries
        if retry_exceptions is None:
            retry_exceptions = self._retry_exceptions
        if scheduling_strategy is None:
            scheduling_strategy = self._scheduling_strategy

        resources = ray._private.utils.resources_from_resource_arguments(
            self._num_cpus,
            self._num_gpus,
            self._memory,
            self._object_store_memory,
            self._resources,
            self._accelerator_type,
            num_cpus,
            num_gpus,
            memory,
            object_store_memory,
            resources,
            accelerator_type,
        )

        # This check runs after the fallback above, so a strategy stored on
        # the remote function also conflicts with an explicit
        # placement_group argument.
        if (placement_group != "default") and (scheduling_strategy
                                               is not None):
            raise ValueError("Placement groups should be specified via the "
                             "scheduling_strategy option. "
                             "The placement_group option is deprecated.")

        # Placement group handling only applies when no strategy is in
        # effect or when the strategy is a PlacementGroupSchedulingStrategy;
        # any other strategy value is passed through unchanged.
        if scheduling_strategy is None or isinstance(
                scheduling_strategy, PlacementGroupSchedulingStrategy):
            if isinstance(scheduling_strategy,
                          PlacementGroupSchedulingStrategy):
                # The strategy object's fields replace the individual
                # placement_group* arguments.
                placement_group = scheduling_strategy.placement_group
                placement_group_bundle_index = (
                    scheduling_strategy.placement_group_bundle_index)
                placement_group_capture_child_tasks = (
                    scheduling_strategy.placement_group_capture_child_tasks)

            # Fall back to the worker-level setting when the capture flag
            # was left unspecified.
            if placement_group_capture_child_tasks is None:
                placement_group_capture_child_tasks = (
                    worker.should_capture_child_tasks_in_placement_group)
            # "default" means the caller did not pick a placement group.
            if placement_group == "default":
                placement_group = self._placement_group
            # NOTE(review): presumably resolves the effective placement
            # group from the options and the current context - confirm
            # against configure_placement_group_based_on_context.
            placement_group = configure_placement_group_based_on_context(
                placement_group_capture_child_tasks,
                placement_group_bundle_index,
                resources,
                {},  # no placement_resources for tasks
                self._function_descriptor.function_name,
                placement_group=placement_group,
            )
            # Rebuild the strategy from the resolved placement group, or use
            # the default strategy when the resolved group is empty.
            if not placement_group.is_empty:
                scheduling_strategy = PlacementGroupSchedulingStrategy(
                    placement_group,
                    placement_group_bundle_index,
                    placement_group_capture_child_tasks,
                )
            else:
                scheduling_strategy = DEFAULT_SCHEDULING_STRATEGY

        if not runtime_env or runtime_env == "{}":
            runtime_env = self._runtime_env

        def invocation(args, kwargs):
            # Build the flat argument list: cross-language functions use
            # their own formatter, and a call with no arguments and no
            # recorded signature skips flattening.
            if self._is_cross_language:
                list_args = cross_language.format_args(worker, args, kwargs)
            elif not args and not kwargs and not self._function_signature:
                list_args = []
            else:
                list_args = ray._private.signature.flatten_args(
                    self._function_signature, args, kwargs)

            if worker.mode == ray.worker.LOCAL_MODE:
                assert not self._is_cross_language, (
                    "Cross language remote function "
                    "cannot be executed locally.")
            object_refs = worker.core_worker.submit_task(
                self._language,
                self._function_descriptor,
                list_args,
                name,
                num_returns,
                resources,
                max_retries,
                retry_exceptions,
                scheduling_strategy,
                worker.debugger_breakpoint,
                runtime_env or "{}",
            )
            # Reset worker's debug context from the last "remote" command
            # (which applies only to this .remote call).
            worker.debugger_breakpoint = b""
            # One ref is unwrapped, several are returned as-is, and zero
            # refs falls through to an implicit None.
            if len(object_refs) == 1:
                return object_refs[0]
            elif len(object_refs) > 1:
                return object_refs

        # Let an optional decorator wrap the submission step.
        if self._decorator is not None:
            invocation = self._decorator(invocation)

        return invocation(args, kwargs)

コード例 #7

0

ファイルを表示

ray.init(num_cpus=1)
ctx = DatasetContext.get_current()
# Reserve the cluster's only CPU in a one-bundle placement group.
placement_group = ray.util.placement_group(
    bundles=[{"CPU": 1}],
    strategy="SPREAD",
    name="core_hog",
)
ray.get(placement_group.ready())

# Route every Datasets task through that placement group.
ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(placement_group)
# The read and map tasks below therefore run inside the placement group.
ds = ray.data.range(100, parallelism=2).map(lambda x: x + 1)

assert ds.take_all() == list(range(1, 101))
# __resource_allocation_end__
# fmt: on

# fmt: off
# __block_move_begin__
import ray
from ray.data.context import DatasetContext

# Fetch the current DatasetContext and switch its optimize_fuse_stages
# flag off.
ctx = DatasetContext.get_current()
ctx.optimize_fuse_stages = False

コード例 #8

0

ファイルを表示

ファイル: original_resource_unavailable_example.py プロジェクト: vishalbelsare/ray

import ray
from ray.util.placement_group import (
    placement_group, )
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Two "CPU"s are available.
ray.init(num_cpus=2)

# Reserve both CPUs in a single-bundle placement group and wait for it.
pg = placement_group(bundles=[{"CPU": 2}])
ray.get(pg.ready())


# Both CPUs are now held by the placement group, so a plain 2-CPU task
# has nothing left to run on.
@ray.remote(num_cpus=2)
def f():
    """Return True; the task only exists to request two CPUs."""
    return True


# Cannot be scheduled: no 2 free CPUs remain outside the placement group.
f.remote()

# Can be scheduled: it draws on the 2 CPUs reserved by the placement group.
f.options(
    scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
).remote()

コード例 #9

0

ファイルを表示

ファイル: test_exit_observability.py プロジェクト: tchordia/ray

def test_worker_exit_intended_system_exit_and_user_error(ray_start_cluster):
    """Check the recorded exit type and detail for several worker exits.

    INTENDED_SYSTEM_EXIT
    - unused resource removed: not tested (hard to test)
    - placement group removed: tested
    - idle: tested
    USER_ERROR
    - actor init failed: tested
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote
    def f():
        return ray.get(g.remote())

    @ray.remote
    def g():
        return os.getpid()

    # f blocks in ray.get on g.remote(); g borrows the CPU and runs in a
    # newly started worker, which later exits through the idle timeout.
    pid = ray.get(f.remote())

    def verify_exit_by_idle_timeout():
        info = get_worker_by_pid(pid)
        exit_type, exit_detail = info["exit_type"], info["exit_detail"]
        if exit_type != "INTENDED_SYSTEM_EXIT":
            return False
        return "it was idle" in exit_detail

    wait_for_condition(verify_exit_by_idle_timeout)

    @ray.remote
    class A:
        def getpid(self):
            return os.getpid()

    # Run an actor inside a placement group, then remove the group from
    # under it.
    pg = ray.util.placement_group(bundles=[{"CPU": 1}])
    pg_strategy = PlacementGroupSchedulingStrategy(placement_group=pg)
    a = A.options(scheduling_strategy=pg_strategy).remote()
    pid = ray.get(a.getpid.remote())
    ray.util.remove_placement_group(pg)

    def verify_exit_by_pg_removed():
        info = get_worker_by_pid(pid)
        exit_type, exit_detail = info["exit_type"], info["exit_detail"]
        if exit_type != "INTENDED_SYSTEM_EXIT":
            return False
        return "placement group was removed" in exit_detail

    wait_for_condition(verify_exit_by_pg_removed)

    @ray.remote
    class PidDB:
        def __init__(self):
            self.pid = None

        def record_pid(self, pid):
            self.pid = pid

        def get_pid(self):
            return self.pid

    p = PidDB.remote()

    @ray.remote
    class FaultyActor:
        def __init__(self):
            # Report this worker's pid before the constructor fails.
            p.record_pid.remote(os.getpid())
            raise Exception

        def ready(self):
            pass

    a = FaultyActor.remote()
    wait_for_condition(lambda: ray.get(p.get_pid.remote()) is not None)
    pid = ray.get(p.get_pid.remote())

    def verify_exit_by_actor_init_failure():
        info = get_worker_by_pid(pid)
        exit_type, exit_detail = info["exit_type"], info["exit_detail"]
        print(exit_type, exit_detail)
        if exit_type != "USER_ERROR":
            return False
        return "exception in the initialization method" in exit_detail

    wait_for_condition(verify_exit_by_actor_init_failure)

コード例 #10

0

ファイルを表示

ファイル: placement_group_capture_child_tasks_example.py プロジェクト: vishalbelsare/ray

import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init(num_cpus=4)

# Reserve two 2-CPU bundles with the SPREAD strategy and wait for them.
pg = placement_group(bundles=[{"CPU": 2}, {"CPU": 2}], strategy="SPREAD")
ray.get(pg.ready())


@ray.remote(num_cpus=1)
def child():
    """Do nothing; this task only exists to be scheduled."""
    return None


@ray.remote(num_cpus=1)
def parent():
    """Launch ``child`` and wait for it to finish."""
    # With placement_group_capture_child_tasks set to True on the parent,
    # the child is scheduled in the parent's placement group even though
    # child.options(
    #     scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
    # ).remote() is never called explicitly.
    ray.get(child.remote())


ray.get(
    parent.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        )
    ).remote()
)