Beispiel #1
0
    def __init__(self, function, num_cpus, num_gpus, memory,
                 object_store_memory, resources, num_return_vals, max_calls,
                 max_retries):
        self._function = function
        self._function_name = (self._function.__module__ + "." +
                               self._function.__name__)
        self._num_cpus = (DEFAULT_REMOTE_FUNCTION_CPUS
                          if num_cpus is None else num_cpus)
        self._num_gpus = num_gpus
        self._memory = memory
        if object_store_memory is not None:
            raise NotImplementedError(
                "setting object_store_memory is not implemented for tasks")
        self._object_store_memory = None
        self._resources = resources
        self._num_return_vals = (DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS if
                                 num_return_vals is None else num_return_vals)
        self._max_calls = (DEFAULT_REMOTE_FUNCTION_MAX_CALLS
                           if max_calls is None else max_calls)
        self._max_retries = (DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES
                             if max_retries is None else max_retries)
        self._decorator = getattr(function, "__ray_invocation_decorator__",
                                  None)

        self._function_signature = ray.signature.extract_signature(
            self._function)

        self._last_export_session_and_job = None
        # Override task.remote's signature and docstring
        @wraps(function)
        def _remote_proxy(*args, **kwargs):
            return self._remote(args=args, kwargs=kwargs)

        self.remote = _remote_proxy
        self.direct_call_enabled = ray_constants.direct_call_enabled()
Beispiel #2
0
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
               max_reconstructions):
    # Give an error if cls is an old-style class.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        if ray_constants.direct_call_enabled():
            # Allow the actor creation task to be resubmitted automatically
            # by default.
            max_reconstructions = 3
        else:
            max_reconstructions = 0

    if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
            ray_constants.INFINITE_RECONSTRUCTION):
        raise Exception("max_reconstructions must be in range [%d, %d]." %
                        (ray_constants.NO_RECONSTRUCTION,
                         ray_constants.INFINITE_RECONSTRUCTION))

    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
        def __ray_terminate__(self):
            worker = ray.worker.get_global_worker()
            if worker.mode != ray.LOCAL_MODE:
                ray.actor.exit_actor()

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            This task saves the current state of the actor, the current task
            frontier according to the raylet, and the checkpoint index
            (number of tasks executed so far).
            """
            worker = ray.worker.global_worker
            if not isinstance(self, ray.actor.Checkpointable):
                raise Exception(
                    "__ray_checkpoint__.remote() may only be called on actors "
                    "that implement ray.actor.Checkpointable")
            return worker._save_actor_checkpoint()

    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    return ActorClass._ray_from_modified_class(Class,
                                               ActorClassID.from_random(),
                                               max_reconstructions, num_cpus,
                                               num_gpus, memory,
                                               object_store_memory, resources)
Beispiel #3
0
    def testLineageEvictedReconstructionFails(self):
        if ray_constants.direct_call_enabled():
            return  # not relevant

        @ray.remote
        def f(data):
            return 0

        x_id = f.remote(None)
        ray.get(x_id)
        # Hold references to the ray.put objects so they aren't LRU'd.
        oids = []
        for _ in range(400):
            new_oids = [f.remote(np.zeros(10000)) for _ in range(50)]
            oids.extend(new_oids)
            ray.get(new_oids)
        self.assertRaises(ray.exceptions.UnreconstructableError,
                          lambda: ray.get(x_id))
Beispiel #4
0
    def _remote(self,
                args=None,
                kwargs=None,
                num_cpus=None,
                num_gpus=None,
                memory=None,
                object_store_memory=None,
                resources=None,
                is_direct_call=None,
                max_concurrency=None,
                name=None,
                detached=False,
                is_asyncio=False):
        """Create an actor.

        This method allows more flexibility than the remote method because
        resource requirements can be specified and override the defaults in the
        decorator.

        Args:
            args: The arguments to forward to the actor constructor.
            kwargs: The keyword arguments to forward to the actor constructor.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            memory: Restrict the heap memory usage of this actor.
            object_store_memory: Restrict the object store memory used by
                this actor when creating objects.
            resources: The custom resources required by the actor creation
                task.
            is_direct_call: Use direct actor calls.
            max_concurrency: The max number of concurrent calls to allow for
                this actor. This only works with direct actor calls. The max
                concurrency defaults to 1 for threaded execution, and 100 for
                asyncio execution. Note that the execution order is not
                guaranteed when max_concurrency > 1.
            name: The globally unique name for the actor.
            detached: Whether the actor should be kept alive after driver
                exits.
            is_asyncio: Turn on async actor calls. This only works with direct
                actor calls.

        Returns:
            A handle to the newly created actor.
        """
        if args is None:
            args = []
        if kwargs is None:
            kwargs = {}
        if is_direct_call is None:
            is_direct_call = ray_constants.direct_call_enabled()
        if max_concurrency is None:
            if is_asyncio:
                max_concurrency = 100
            else:
                max_concurrency = 1

        if max_concurrency > 1 and not is_direct_call:
            raise ValueError(
                "setting max_concurrency requires is_direct_call=True")
        if max_concurrency < 1:
            raise ValueError("max_concurrency must be >= 1")

        if is_asyncio and not is_direct_call:
            raise ValueError(
                "Setting is_asyncio requires is_direct_call=True.")

        worker = ray.worker.get_global_worker()
        if worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        meta = self.__ray_metadata__

        if detached and name is None:
            raise Exception("Detached actors must be named. "
                            "Please use Actor._remote(name='some_name') "
                            "to associate the name.")

        # Check whether the name is already taken.
        if name is not None:
            try:
                ray.experimental.get_actor(name)
            except ValueError:  # name is not taken, expected.
                pass
            else:
                raise ValueError(
                    "The name {name} is already taken. Please use "
                    "a different name or get existing actor using "
                    "ray.experimental.get_actor('{name}')".format(name=name))

        # Set the actor's default resources if not already set. First three
        # conditions are to check that no resources were specified in the
        # decorator. Last three conditions are to check that no resources were
        # specified when _remote() was called.
        if (meta.num_cpus is None and meta.num_gpus is None
                and meta.resources is None and num_cpus is None
                and num_gpus is None and resources is None):
            # In the default case, actors acquire no resources for
            # their lifetime, and actor methods will require 1 CPU.
            cpus_to_use = ray_constants.DEFAULT_ACTOR_CREATION_CPU_SIMPLE
            actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SIMPLE
        else:
            # If any resources are specified (here or in decorator), then
            # all resources are acquired for the actor's lifetime and no
            # resources are associated with methods.
            cpus_to_use = (ray_constants.DEFAULT_ACTOR_CREATION_CPU_SPECIFIED
                           if meta.num_cpus is None else meta.num_cpus)
            actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SPECIFIED

        function_name = "__init__"
        function_descriptor = FunctionDescriptor(
            meta.modified_class.__module__, function_name,
            meta.modified_class.__name__)

        # Do not export the actor class or the actor if run in LOCAL_MODE
        # Instead, instantiate the actor locally and add it to the worker's
        # dictionary
        if worker.mode == ray.LOCAL_MODE:
            actor_id = ActorID.from_random()
            worker.actors[actor_id] = meta.modified_class(
                *copy.deepcopy(args), **copy.deepcopy(kwargs))
        else:
            # Export the actor.
            if (meta.last_export_session_and_job !=
                    worker.current_session_and_job):
                # If this actor class was not exported in this session and job,
                # we need to export this function again, because current GCS
                # doesn't have it.
                meta.last_export_session_and_job = (
                    worker.current_session_and_job)
                worker.function_actor_manager.export_actor_class(
                    meta.modified_class, meta.actor_method_names)

            resources = ray.utils.resources_from_resource_arguments(
                cpus_to_use, meta.num_gpus, meta.memory,
                meta.object_store_memory, meta.resources, num_cpus, num_gpus,
                memory, object_store_memory, resources)

            # If the actor methods require CPU resources, then set the required
            # placement resources. If actor_placement_resources is empty, then
            # the required placement resources will be the same as resources.
            actor_placement_resources = {}
            assert actor_method_cpu in [0, 1]
            if actor_method_cpu == 1:
                actor_placement_resources = resources.copy()
                actor_placement_resources["CPU"] += 1
            function_signature = meta.method_signatures[function_name]
            creation_args = signature.flatten_args(function_signature, args,
                                                   kwargs)
            actor_id = worker.core_worker.create_actor(
                function_descriptor.get_function_descriptor_list(),
                creation_args, meta.max_reconstructions, resources,
                actor_placement_resources, is_direct_call, max_concurrency,
                detached, is_asyncio)

        actor_handle = ActorHandle(
            actor_id,
            meta.modified_class.__module__,
            meta.class_name,
            meta.actor_method_names,
            meta.method_decorators,
            meta.method_signatures,
            meta.actor_method_num_return_vals,
            actor_method_cpu,
            worker.current_session_and_job,
            original_handle=True)

        if name is not None:
            ray.experimental.register_actor(name, actor_handle)

        return actor_handle
Beispiel #5
0
    # tasks.
    tasks = [f.remote() for _ in range(10)]
    start = time.time()
    ray.get(tasks)
    end = time.time()

    # Submit some more tasks that can only be executed on the remote nodes.
    tasks = [f.remote() for _ in range(10)]
    # Sleep for a bit to let the tasks finish.
    time.sleep((end - start) * 2)
    _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0)
    # All remote tasks should have finished.
    assert len(unready) == 0


@pytest.mark.skipif(ray_constants.direct_call_enabled(), reason="TODO(ekl)")
def test_object_transfer_dump(ray_start_cluster):
    cluster = ray_start_cluster

    num_nodes = 3
    for i in range(num_nodes):
        cluster.add_node(resources={str(i): 1}, object_store_memory=10**9)
    ray.init(address=cluster.address)

    @ray.remote
    def f(x):
        return

    # These objects will live on different nodes.
    object_ids = [
        f._remote(args=[1], resources={str(i): 1}) for i in range(num_nodes)
Beispiel #6
0
                "class_name": tune.grid_search(["a"]),
                "config": {{"lr": tune.grid_search([1, 2])}}
            }},
        }},
        "local_dir": os.path.expanduser("~/tmp")
    }}
}})
print("success")
""".format(address_info["redis_address"])

    for i in range(2):
        out = run_string_as_driver(driver_script)
        assert "success" in out


@pytest.mark.skipif(ray_constants.direct_call_enabled(),
                    reason="fate sharing not implemented yet")
def test_driver_exiting_when_worker_blocked(call_ray_start):
    # This test will create some drivers that submit some tasks and then
    # exit without waiting for the tasks to complete.
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.get`.
    driver_script = """
import time
import ray
ray.init(address="{}")
@ray.remote
Beispiel #7
0
import json
import os
import signal
import sys
import time

import pytest

import ray
import ray.ray_constants as ray_constants
from ray.cluster_utils import Cluster
from ray.test_utils import RayTestTimeoutException

RAY_FORCE_DIRECT = ray_constants.direct_call_enabled()


@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_workers_separate_multinode(request):
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Start the Ray processes.
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(address=cluster.address)

    yield num_nodes, num_initial_workers
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Beispiel #8
0
    # tasks.
    tasks = [f.remote() for _ in range(10)]
    start = time.time()
    ray.get(tasks)
    end = time.time()

    # Submit some more tasks that can only be executed on the remote nodes.
    tasks = [f.remote() for _ in range(10)]
    # Sleep for a bit to let the tasks finish.
    time.sleep((end - start) * 2)
    _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0)
    # All remote tasks should have finished.
    assert len(unready) == 0


@pytest.mark.skipif(ray_constants.direct_call_enabled(), reason="TODO(ekl)")
def test_object_transfer_dump(ray_start_cluster):
    cluster = ray_start_cluster

    num_nodes = 3
    for i in range(num_nodes):
        cluster.add_node(resources={str(i): 1}, object_store_memory=10**9)
    ray.init(address=cluster.address)

    @ray.remote
    def f(x):
        return

    # These objects will live on different nodes.
    object_ids = [
        f._remote(args=[1], resources={str(i): 1}) for i in range(num_nodes)
Beispiel #9
0
    actor_info, = actor_table.values()
    assert actor_info["JobID"] == job_id.hex()
    assert "IPAddress" in actor_info["Address"]
    assert "IPAddress" in actor_info["OwnerAddress"]
    assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"]

    job_table = ray.jobs()

    assert len(job_table) == 1
    assert job_table[0]["JobID"] == job_id.hex()
    assert job_table[0]["NodeManagerAddress"] == node_ip_address


@pytest.mark.skipif(
    ray_constants.direct_call_enabled(),
    reason="object and task API not supported")
def test_global_state_task_object_api(shutdown_only):
    ray.init()

    job_id = ray.utils.compute_job_id_from_driver(
        ray.WorkerID(ray.worker.global_worker.worker_id))
    driver_task_id = ray.worker.global_worker.current_task_id.hex()

    nil_actor_id_hex = ray.ActorID.nil().hex()

    @ray.remote
    def f(*xs):
        return 1

    x_id = ray.put(1)