Beispiel #1
0
def export_actor(actor_id, class_id, class_name, actor_method_names,
                 actor_method_num_return_vals, actor_creation_resources,
                 actor_method_cpus, worker):
    """Register an actor's method signatures and submit its creation task.

    Args:
        actor_id (common.ObjectID): The ID of the actor.
        class_id (str): A random ID for the actor class.
        class_name (str): The actor class name.
        actor_method_names (list): The names of this actor's methods.
        actor_method_num_return_vals: A list of the number of return values
            for each of the actor's methods.
        actor_creation_resources: A dictionary mapping resource name to the
            quantity of that resource required by the actor.
        actor_method_cpus: The number of CPUs required by actor methods.
        worker: The worker used to register the signatures and to submit
            the actor creation task.

    Returns:
        The first element of the value returned by submitting the actor
        creation task.

    Raises:
        Exception: If ``worker.mode`` is None (Ray has not been started).
    """
    ray.worker.check_main_thread()
    if worker.mode is None:
        raise Exception("Actors cannot be created before Ray has been "
                        "started. You can start Ray with 'ray.init()'.")

    # Signatures are registered under the ID of the worker's current driver.
    register_actor_signatures(
        worker,
        worker.task_driver_id.id(),
        class_id,
        class_name,
        actor_method_names,
        actor_method_num_return_vals,
        actor_creation_resources=actor_creation_resources,
        actor_method_cpus=actor_method_cpus)

    # The creation task receives the class ID as its only argument.
    creation_function_id = compute_actor_creation_function_id(class_id)
    creation_returns = worker.submit_task(
        creation_function_id, [class_id], actor_creation_id=actor_id)
    return creation_returns[0]
Beispiel #2
0
    def _actor_method_call(self,
                           method_name,
                           args=None,
                           kwargs=None,
                           num_return_vals=None,
                           dependency=None):
        """Package an actor method invocation as a task and submit it.

        This runs when `actor.method_name.remote(*args, **kwargs)` is
        called. In LOCAL_MODE the method is executed directly on the actor
        instance stored on the worker; otherwise a task is submitted.

        Args:
            method_name: The name of the actor method to execute.
            args: A list of arguments for the actor method.
            kwargs: A dictionary of keyword arguments for the actor method.
            num_return_vals: The number of values the method returns. One
                is added to it for the dummy return object, so outside of
                LOCAL_MODE a number must be supplied despite the None
                default.
            dependency: The object ID that this method is dependent on.
                Defaults to None, for no dependencies. Most tasks should
                pass in the dummy object returned by the preceding task.
                Some tasks, such as checkpoint and terminate methods, have
                no dependencies.

        Returns:
            In LOCAL_MODE, whatever the actor method returns. Otherwise
            the submitted task's object IDs without the trailing dummy
            object: None if there are none, the single ID if there is
            exactly one, else the list of IDs.
        """
        global_worker = ray.worker.get_global_worker()
        global_worker.check_connected()

        method_signature = self._ray_method_signatures[method_name]
        args = [] if args is None else args
        kwargs = {} if kwargs is None else kwargs
        args = signature.extend_args(method_signature, args, kwargs)

        if global_worker.mode == ray.LOCAL_MODE:
            # Run directly on the local actor instance. The arguments are
            # deep-copied so the method cannot mutate the caller's objects.
            local_actor = global_worker.actors[self._ray_actor_id]
            return getattr(local_actor, method_name)(*copy.deepcopy(args))

        # At most one execution dependency is attached to the task.
        execution_dependencies = [] if dependency is None else [dependency]

        if self._ray_actor_handle_id is not None:
            actor_handle_id = self._ray_actor_handle_id
        else:
            # Right now, if the actor handle has been pickled, we create a
            # temporary actor handle id for invocations.
            # TODO(pcm): This still leads to a lot of actor handles being
            # created, there should be a better way to handle pickled
            # actor handles.
            actor_handle_id = compute_actor_handle_id_non_forked(
                self._ray_actor_id, self._ray_previous_actor_handle_id,
                global_worker.current_task_id)
            # A newly generated handle id starts its own task sequence, so
            # the counter restarts from zero.
            if (actor_handle_id !=
                    self._ray_previously_generated_actor_handle_id):
                self._ray_actor_counter = 0
                self._ray_previously_generated_actor_handle_id = (
                    actor_handle_id)

        is_checkpoint_call = (method_name == "__ray_checkpoint__")
        method_function_id = (
            FunctionActorManager.compute_actor_method_function_id(
                self._ray_class_name, method_name))
        returned_ids = global_worker.submit_task(
            method_function_id,
            args,
            actor_id=self._ray_actor_id,
            actor_handle_id=actor_handle_id,
            actor_counter=self._ray_actor_counter,
            is_actor_checkpoint_method=is_checkpoint_call,
            actor_creation_dummy_object_id=(
                self._ray_actor_creation_dummy_object_id),
            execution_dependencies=execution_dependencies,
            # One extra return slot is requested for the dummy object.
            num_return_vals=num_return_vals + 1,
            resources={"CPU": self._ray_actor_method_cpus},
            placement_resources={},
            driver_id=self._ray_actor_driver_id)

        # Record this invocation: bump the counter and keep the trailing
        # dummy object as the cursor instead of handing it to the user.
        self._ray_actor_counter += 1
        self._ray_actor_cursor = returned_ids.pop()

        if len(returned_ids) == 1:
            return returned_ids[0]
        if len(returned_ids) == 0:
            return None
        return returned_ids
Beispiel #3
0
    def _actor_method_call(self,
                           method_name,
                           args=None,
                           kwargs=None,
                           num_return_vals=None):
        """Package an actor method invocation as a task and submit it.

        This runs when `actor.method_name.remote(*args, **kwargs)` is
        called. In LOCAL_MODE the call is handed to the worker's local mode
        manager; otherwise a task is submitted while holding the handle's
        actor lock.

        Args:
            method_name: The name of the actor method to execute.
            args: A list of arguments for the actor method.
            kwargs: A dictionary of keyword arguments for the actor method.
            num_return_vals (int): The number of return values for the
                method. Outside of LOCAL_MODE one is added to it for the
                dummy return object, so a number must be supplied despite
                the None default.

        Returns:
            None if no object IDs remain, the single object ID if there is
            exactly one, otherwise the collection of object IDs.
        """
        global_worker = ray.worker.get_global_worker()
        global_worker.check_connected()

        method_signature = self._ray_method_signatures[method_name]
        args = [] if args is None else args
        kwargs = {} if kwargs is None else kwargs
        args = signature.extend_args(method_signature, args, kwargs)

        descriptor = FunctionDescriptor(
            self._ray_module_name, method_name, self._ray_class_name)

        if global_worker.mode == ray.LOCAL_MODE:
            bound_method = getattr(
                global_worker.actors[self._ray_actor_id], method_name)
            result_ids = global_worker.local_mode_manager.execute(
                bound_method, descriptor, args, num_return_vals)
        else:
            # Submission and the bookkeeping that follows happen under the
            # handle's lock.
            with self._ray_actor_lock:
                result_ids = global_worker.submit_task(
                    descriptor,
                    args,
                    actor_id=self._ray_actor_id,
                    actor_handle_id=self._ray_actor_handle_id,
                    actor_counter=self._ray_actor_counter,
                    actor_creation_dummy_object_id=(
                        self._ray_actor_creation_dummy_object_id),
                    previous_actor_task_dummy_object_id=self._ray_actor_cursor,
                    new_actor_handles=self._ray_new_actor_handles,
                    # One extra return slot is requested for the dummy
                    # object.
                    num_return_vals=num_return_vals + 1,
                    resources={"CPU": self._ray_actor_method_cpus},
                    placement_resources={},
                    job_id=self._ray_actor_job_id,
                )
                # Record this invocation: bump the counter and keep the
                # trailing dummy object as the cursor for the next call
                # rather than returning it to the user.
                self._ray_actor_counter += 1
                self._ray_actor_cursor = result_ids.pop()
                # The pending new handles were passed along with this
                # task, so the list starts over.
                self._ray_new_actor_handles = []

        if len(result_ids) == 1:
            return result_ids[0]
        if len(result_ids) == 0:
            return None
        return result_ids
Beispiel #4
0
    def _remote(self,
                args,
                kwargs,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Create an actor and return a handle to it.

        Unlike the plain remote method, this one accepts resource
        requirements that override the defaults given in the decorator.

        Args:
            args: The arguments to forward to the actor constructor.
            kwargs: The keyword arguments to forward to the actor constructor.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If ``worker.mode`` is None, or if constructor
                arguments are given but the actor has no ``__init__``
                method.
        """
        global_worker = ray.worker.get_global_worker()
        if global_worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ray.ObjectID(_random_string())
        # The cursor is the dummy object of the most recent actor task; it
        # stays None when no creation task is submitted (LOCAL_MODE).
        creation_cursor = None

        if global_worker.mode == ray.LOCAL_MODE:
            # Nothing is exported in LOCAL_MODE: an instance is allocated
            # locally (without running __init__) and stored on the worker.
            global_worker.actors[actor_id] = self._modified_class.__new__(
                self._modified_class)
        else:
            # Export the actor class the first time it is needed.
            if not self._exported:
                global_worker.function_actor_manager.export_actor_class(
                    self._class_id, self._modified_class,
                    self._actor_method_names, self._checkpoint_interval)
                self._exported = True

            creation_resources = ray.utils.resources_from_resource_arguments(
                self._num_cpus, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # If the actor methods require CPU resources, then set the
            # required placement resources. If the placement resources are
            # empty, then they will be the same as the creation resources.
            placement_resources = {}
            assert self._actor_method_cpus in [0, 1]
            if self._actor_method_cpus == 1:
                placement_resources = creation_resources.copy()
                placement_resources["CPU"] += 1

            creation_function_id = compute_actor_creation_function_id(
                self._class_id)
            (creation_cursor,) = global_worker.submit_task(
                creation_function_id,
                [self._class_id],
                actor_creation_id=actor_id,
                num_return_vals=1,
                resources=creation_resources,
                placement_resources=placement_resources)

        # The counter starts at 1 to account for the actor creation task.
        initial_counter = 1
        handle = ActorHandle(
            actor_id, self._class_name, creation_cursor, initial_counter,
            self._actor_method_names, self._method_signatures,
            self._actor_method_num_return_vals, creation_cursor,
            self._actor_method_cpus, global_worker.task_driver_id)

        # Invoke __init__ as a remote method when the actor defines one.
        if "__init__" in handle._ray_actor_method_names:
            handle.__init__.remote(*args, **kwargs)
        elif len(args) != 0 or len(kwargs) != 0:
            raise Exception("Arguments cannot be passed to the actor "
                            "constructor because this actor class has no "
                            "__init__ method.")

        return handle
Beispiel #5
0
    def _remote(self,
                args,
                kwargs,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Create an actor.

        This method allows more flexibility than the remote method because
        resource requirements can be specified and override the defaults in the
        decorator.

        Args:
            args: The arguments to forward to the actor constructor. None is
                treated as an empty list.
            kwargs: The keyword arguments to forward to the actor constructor.
                None is treated as an empty dictionary.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If ``worker.mode`` is None (ray.init() has not been
                called).
        """
        # Normalize the constructor arguments before either branch uses
        # them. The LOCAL_MODE branch unpacks them with * and **, which
        # fails on None.
        if args is None:
            args = []
        if kwargs is None:
            kwargs = {}

        worker = ray.worker.get_global_worker()
        if worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ActorID(_random_string())
        # The actor cursor is a dummy object representing the most recent
        # actor method invocation. For each subsequent method invocation,
        # the current cursor should be added as a dependency, and then
        # updated to reflect the new invocation.
        actor_cursor = None

        # Do not export the actor class or the actor if run in LOCAL_MODE
        # Instead, instantiate the actor locally and add it to the worker's
        # dictionary
        if worker.mode == ray.LOCAL_MODE:
            # Deep copies keep the constructor from mutating the caller's
            # arguments.
            worker.actors[actor_id] = self._modified_class(
                *copy.deepcopy(args), **copy.deepcopy(kwargs))
        else:
            # Export the actor.
            if not self._exported:
                worker.function_actor_manager.export_actor_class(
                    self._modified_class, self._actor_method_names)
                self._exported = True

            resources = ray.utils.resources_from_resource_arguments(
                self._num_cpus, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # If the actor methods require CPU resources, then set the required
            # placement resources. If actor_placement_resources is empty, then
            # the required placement resources will be the same as resources.
            actor_placement_resources = {}
            assert self._actor_method_cpus in [0, 1]
            if self._actor_method_cpus == 1:
                actor_placement_resources = resources.copy()
                actor_placement_resources["CPU"] += 1

            # The creation task runs the class's __init__ with the
            # constructor arguments flattened against its signature.
            function_name = "__init__"
            function_signature = self._method_signatures[function_name]
            creation_args = signature.extend_args(function_signature, args,
                                                  kwargs)
            function_descriptor = FunctionDescriptor(
                self._modified_class.__module__, function_name,
                self._modified_class.__name__)
            [actor_cursor] = worker.submit_task(
                function_descriptor,
                creation_args,
                actor_creation_id=actor_id,
                max_actor_reconstructions=self._max_reconstructions,
                num_return_vals=1,
                resources=resources,
                placement_resources=actor_placement_resources)
            assert isinstance(actor_cursor, ObjectID)

        actor_handle = ActorHandle(
            actor_id, self._modified_class.__module__, self._class_name,
            actor_cursor, self._actor_method_names, self._method_signatures,
            self._actor_method_num_return_vals, actor_cursor,
            self._actor_method_cpus, worker.task_driver_id)
        # We increment the actor counter by 1 to account for the actor creation
        # task.
        actor_handle._ray_actor_counter += 1

        return actor_handle
Beispiel #6
0
    def _remote(self,
                args=None,
                kwargs=None,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Create an actor and return a handle to it.

        Unlike the plain remote method, this one accepts resource
        requirements that override the defaults given in the decorator.

        Args:
            args: The arguments to forward to the actor constructor.
            kwargs: The keyword arguments to forward to the actor constructor.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If ``worker.mode`` is None (ray.init() has not been
                called).
        """
        args = [] if args is None else args
        kwargs = {} if kwargs is None else kwargs

        global_worker = ray.worker.get_global_worker()
        if global_worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ActorID.from_random()
        # The cursor is the dummy object of the most recent actor task; it
        # stays None when no creation task is submitted (LOCAL_MODE).
        creation_cursor = None

        # Resources count as unspecified only when neither the decorator
        # nor this call provided any of them.
        call_site_unset = (num_cpus is None and num_gpus is None
                           and resources is None)
        decorator_unset = (self._num_cpus is None and self._num_gpus is None
                           and self._resources is None)
        if decorator_unset and call_site_unset:
            # Default case: use the "simple" creation and method CPU
            # constants.
            cpus_to_use = ray_constants.DEFAULT_ACTOR_CREATION_CPU_SIMPLE
            actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SIMPLE
        else:
            # Something was specified: use the "specified" constants, with
            # the decorator's CPU count taking precedence over the default
            # creation CPU count.
            if self._num_cpus is None:
                cpus_to_use = (
                    ray_constants.DEFAULT_ACTOR_CREATION_CPU_SPECIFIED)
            else:
                cpus_to_use = self._num_cpus
            actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SPECIFIED

        if global_worker.mode == ray.LOCAL_MODE:
            # Nothing is exported in LOCAL_MODE: the actor is instantiated
            # locally from deep copies of the arguments and stored on the
            # worker.
            global_worker.actors[actor_id] = self._modified_class(
                *copy.deepcopy(args), **copy.deepcopy(kwargs))
        else:
            # Export the class again whenever the worker's session and job
            # differ from the ones recorded at the last export.
            if (self._last_export_session_and_job !=
                    global_worker.current_session_and_job):
                self._last_export_session_and_job = (
                    global_worker.current_session_and_job)
                global_worker.function_actor_manager.export_actor_class(
                    self._modified_class, self._actor_method_names)

            creation_resources = ray.utils.resources_from_resource_arguments(
                cpus_to_use, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # If the actor methods require CPU resources, then set the
            # required placement resources. If the placement resources are
            # empty, then they will be the same as the creation resources.
            placement_resources = {}
            assert actor_method_cpu in [0, 1]
            if actor_method_cpu == 1:
                placement_resources = creation_resources.copy()
                placement_resources["CPU"] += 1

            # The creation task runs the class's __init__ with the
            # constructor arguments flattened against its signature.
            init_name = "__init__"
            init_signature = self._method_signatures[init_name]
            creation_args = signature.extend_args(init_signature, args,
                                                  kwargs)
            init_descriptor = FunctionDescriptor(
                self._modified_class.__module__, init_name,
                self._modified_class.__name__)
            (creation_cursor,) = global_worker.submit_task(
                init_descriptor,
                creation_args,
                actor_creation_id=actor_id,
                max_actor_reconstructions=self._max_reconstructions,
                num_return_vals=1,
                resources=creation_resources,
                placement_resources=placement_resources)
            assert isinstance(creation_cursor, ObjectID)

        handle = ActorHandle(
            actor_id, self._modified_class.__module__, self._class_name,
            creation_cursor, self._actor_method_names,
            self._method_decorators, self._method_signatures,
            self._actor_method_num_return_vals, creation_cursor,
            actor_method_cpu, global_worker.current_job_id,
            global_worker.current_session_and_job)
        # Account for the actor creation task in the handle's counter.
        handle._ray_actor_counter += 1

        return handle
Beispiel #7
0
def reconstruct_actor_state(actor_id, worker):
    """Reconstruct the state of an actor that is being reconstructed.

    The most recent checkpoint, if there is one, is restored first. Every
    task in the global task table that belongs to this actor is then
    resubmitted in actor-counter order. Tasks whose counter is greater than
    the checkpoint index are executed again; earlier ones are submitted
    without arguments and not executed. Finally the worker enters its main
    loop.

    Args:
        actor_id: The ID of the actor being reconstructed.
        worker: The worker object that is running the actor.
    """
    # Get the most recent actor checkpoint.
    checkpoint_index, checkpoint = get_actor_checkpoint(actor_id, worker)
    if checkpoint is not None:
        print(
            "Loading actor state from checkpoint {}".format(checkpoint_index))
        # Wait for the actor to have been defined.
        while not hasattr(worker, "actor_class"):
            time.sleep(0.001)
        # TODO(rkn): Restoring from the checkpoint may fail, so this should be
        # in a try-except block and we should give a good error message.
        worker.actors[actor_id] = (
            worker.actor_class.__ray_restore_from_checkpoint__(checkpoint))

    # TODO(rkn): This call is expensive. It'd be nice to find a way to get only
    # the tasks that are relevant to this actor.
    tasks = ray.global_state.task_table()

    def hex_to_object_id(hex_id):
        return ray.local_scheduler.ObjectID(hex_to_binary(hex_id))

    # Keep only the task specs that belong to this actor.
    relevant_tasks = [
        task_info["TaskSpec"] for task_info in tasks.values()
        if hex_to_binary(task_info["TaskSpec"]["ActorID"]) == actor_id
    ]

    # Sort the tasks by actor counter and check that the counters form the
    # gapless sequence 0, 1, 2, ...
    relevant_tasks.sort(key=lambda task: task["ActorCounter"])
    for expected_counter, task in enumerate(relevant_tasks):
        assert task["ActorCounter"] == expected_counter

    # This is a mini replica of the worker's main_loop. This will loop over all
    # of the tasks that this actor is supposed to rerun. For each task, the
    # worker will submit the task to the local scheduler, retrieve the task
    # from the local scheduler, and execute the task.
    for task_spec_info in relevant_tasks:
        # Create a task spec out of the dictionary of info. This isn't
        # necessary. It is strictly for the purposes of checking that the task
        # we get back from the local scheduler is identical to the one we
        # submit.
        task_spec = ray.local_scheduler.Task(
            hex_to_object_id(task_spec_info["DriverID"]),
            hex_to_object_id(task_spec_info["FunctionID"]),
            task_spec_info["Args"], len(task_spec_info["ReturnObjectIDs"]),
            hex_to_object_id(task_spec_info["ParentTaskID"]),
            task_spec_info["ParentCounter"],
            hex_to_object_id(task_spec_info["ActorID"]),
            task_spec_info["ActorCounter"], [
                task_spec_info["RequiredResources"]["CPUs"],
                task_spec_info["RequiredResources"]["GPUs"],
                task_spec_info["RequiredResources"]["CustomResource"]
            ])

        # Verify that the return object IDs are the same as they were the
        # first time.
        assert task_spec_info["ReturnObjectIDs"] == task_spec.returns()

        # We need to wait for the actor to be imported and for the functions to
        # be defined before we can submit the task.
        worker._wait_for_function(hex_to_binary(task_spec_info["FunctionID"]),
                                  hex_to_binary(task_spec_info["DriverID"]))

        # Set some additional state. During normal operation
        # (non-reconstruction) this state would already be set because tasks
        # are only submitted from drivers or from workers that are in the
        # middle of executing other tasks.
        worker.task_driver_id = hex_to_object_id(task_spec_info["DriverID"])
        worker.current_task_id = hex_to_object_id(
            task_spec_info["ParentTaskID"])
        worker.task_index = task_spec_info["ParentCounter"]

        # Submit the task to the local scheduler. This is important so that the
        # local scheduler does bookkeeping about this actor's resource
        # utilization and things like that. It's also important for updating
        # some state on the worker. Tasks at or before the checkpoint are
        # submitted as dummies with no arguments to avoid having to
        # unnecessarily reconstruct past arguments.
        if task_spec_info["ActorCounter"] > checkpoint_index:
            submit_args = task_spec_info["Args"]
        else:
            submit_args = []
        worker.submit_task(
            hex_to_object_id(task_spec_info["FunctionID"]),
            submit_args,
            actor_id=hex_to_object_id(task_spec_info["ActorID"]))

        # Clear the extra state that we set.
        del worker.task_driver_id
        del worker.current_task_id
        del worker.task_index

        # Get the task from the local scheduler.
        retrieved_task = worker._get_next_task_from_local_scheduler()

        # If the task happened before the most recent checkpoint, ignore it.
        # Otherwise, execute it.
        if retrieved_task.actor_counter() > checkpoint_index:
            # Assert that the retrieved task is the same as the constructed
            # task.
            assert (ray.local_scheduler.task_to_string(task_spec) ==
                    ray.local_scheduler.task_to_string(retrieved_task))
            # Wait for the task to be ready and then execute it.
            worker._wait_for_and_process_task(retrieved_task)

    # Enter the main loop to receive and process tasks.
    worker.main_loop()
Beispiel #8
0
    def _actor_method_call(self,
                           method_name,
                           args=None,
                           kwargs=None,
                           num_return_vals=None):
        """Package an actor method invocation as a task and submit it.

        This runs when `actor.method_name.remote(*args, **kwargs)` is
        called. In LOCAL_MODE the method is executed directly on the actor
        instance stored on the worker; otherwise a task is submitted while
        holding the handle's actor lock.

        Args:
            method_name: The name of the actor method to execute.
            args: A list of arguments for the actor method.
            kwargs: A dictionary of keyword arguments for the actor method.
            num_return_vals (int): The number of return values for the
                method. Outside of LOCAL_MODE one is added to it for the
                dummy return object, so a number must be supplied despite
                the None default.

        Returns:
            In LOCAL_MODE, whatever the actor method returns. Otherwise
            the submitted task's object IDs without the trailing dummy
            object: None if there are none, the single ID if there is
            exactly one, else the list of IDs.
        """
        global_worker = ray.worker.get_global_worker()
        global_worker.check_connected()

        method_signature = self._ray_method_signatures[method_name]
        args = [] if args is None else args
        kwargs = {} if kwargs is None else kwargs
        args = signature.extend_args(method_signature, args, kwargs)

        if global_worker.mode == ray.LOCAL_MODE:
            # Run directly on the local actor instance. The arguments are
            # deep-copied so the method cannot mutate the caller's objects.
            local_actor = global_worker.actors[self._ray_actor_id]
            return getattr(local_actor, method_name)(*copy.deepcopy(args))

        descriptor = FunctionDescriptor(
            self._ray_module_name, method_name, self._ray_class_name)
        # Submission and the bookkeeping that follows happen under the
        # handle's lock.
        with self._ray_actor_lock:
            returned_ids = global_worker.submit_task(
                descriptor,
                args,
                actor_id=self._ray_actor_id,
                actor_handle_id=self._ray_actor_handle_id,
                actor_counter=self._ray_actor_counter,
                actor_creation_dummy_object_id=(
                    self._ray_actor_creation_dummy_object_id),
                execution_dependencies=[self._ray_actor_cursor],
                new_actor_handles=self._ray_new_actor_handles,
                # One extra return slot is requested for the dummy object.
                num_return_vals=num_return_vals + 1,
                resources={"CPU": self._ray_actor_method_cpus},
                placement_resources={},
                driver_id=self._ray_actor_driver_id,
            )
            # Record this invocation: bump the counter and keep the
            # trailing dummy object as the cursor for the next call rather
            # than returning it to the user.
            self._ray_actor_counter += 1
            self._ray_actor_cursor = returned_ids.pop()
            # The pending new handles were passed along with this task, so
            # the list starts over.
            self._ray_new_actor_handles = []

        if len(returned_ids) == 1:
            return returned_ids[0]
        if len(returned_ids) == 0:
            return None
        return returned_ids
Beispiel #9
0
    def _remote(self,
                args=None,
                kwargs=None,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Create an actor and return a handle to it.

        Unlike the plain remote method, this one accepts resource
        requirements that override the defaults given in the decorator.

        Args:
            args: The arguments to forward to the actor constructor.
            kwargs: The keyword arguments to forward to the actor constructor.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If ``worker.mode`` is None (ray.init() has not been
                called).
        """
        args = [] if args is None else args
        kwargs = {} if kwargs is None else kwargs

        global_worker = ray.worker.get_global_worker()
        if global_worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ActorID(_random_string())
        # The cursor is the dummy object of the most recent actor task; it
        # stays None when no creation task is submitted (LOCAL_MODE).
        creation_cursor = None

        if global_worker.mode == ray.LOCAL_MODE:
            # Nothing is exported in LOCAL_MODE: the actor is instantiated
            # locally from deep copies of the arguments and stored on the
            # worker.
            global_worker.actors[actor_id] = self._modified_class(
                *copy.deepcopy(args), **copy.deepcopy(kwargs))
        else:
            # Export the actor class the first time it is needed.
            if not self._exported:
                global_worker.function_actor_manager.export_actor_class(
                    self._modified_class, self._actor_method_names)
                self._exported = True

            creation_resources = ray.utils.resources_from_resource_arguments(
                self._num_cpus, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # If the actor methods require CPU resources, then set the
            # required placement resources. If the placement resources are
            # empty, then they will be the same as the creation resources.
            placement_resources = {}
            assert self._actor_method_cpus in [0, 1]
            if self._actor_method_cpus == 1:
                placement_resources = creation_resources.copy()
                placement_resources["CPU"] += 1

            # The creation task runs the class's __init__ with the
            # constructor arguments flattened against its signature.
            init_name = "__init__"
            init_signature = self._method_signatures[init_name]
            creation_args = signature.extend_args(init_signature, args,
                                                  kwargs)
            init_descriptor = FunctionDescriptor(
                self._modified_class.__module__, init_name,
                self._modified_class.__name__)
            (creation_cursor,) = global_worker.submit_task(
                init_descriptor,
                creation_args,
                actor_creation_id=actor_id,
                max_actor_reconstructions=self._max_reconstructions,
                num_return_vals=1,
                resources=creation_resources,
                placement_resources=placement_resources)
            assert isinstance(creation_cursor, ObjectID)

        handle = ActorHandle(
            actor_id, self._modified_class.__module__, self._class_name,
            creation_cursor, self._actor_method_names,
            self._method_signatures, self._actor_method_num_return_vals,
            creation_cursor, self._actor_method_cpus,
            global_worker.task_driver_id)
        # Account for the actor creation task in the handle's counter.
        handle._ray_actor_counter += 1

        return handle
Beispiel #10
0
def reconstruct_actor_state(actor_id, worker):
    """Replay an actor's recorded tasks to rebuild its state.

    Every task recorded for the actor in the global task table is
    resubmitted to the local scheduler, retrieved back from it and executed,
    in increasing "ActorCounter" order. Once all recorded tasks have been
    replayed, the worker's main loop is entered.

    Args:
        actor_id: The ID of the actor being reconstructed. It is compared
            against ``hex_to_binary`` of each recorded "ActorID".
        worker: The worker object that is running the actor.

    Raises:
        AssertionError: If the recorded actor counters are not exactly
            0, 1, ..., n - 1, if the rebuilt task spec's return object IDs
            differ from the recorded ones, or if the task handed back by the
            local scheduler differs from the rebuilt task spec.
    """
    # TODO(rkn): This call is expensive. It'd be nice to find a way to get only
    # the tasks that are relevant to this actor.
    tasks = ray.global_state.task_table()

    def hex_to_object_id(hex_id):
        return ray.local_scheduler.ObjectID(hex_to_binary(hex_id))

    # Keep only the task specs from the task table that belong to this actor.
    relevant_tasks = [
        task_info["TaskSpec"] for task_info in tasks.values()
        if hex_to_binary(task_info["TaskSpec"]["ActorID"]) == actor_id
    ]

    # Sort the tasks by actor counter, i.e. into their original submission
    # order, and check that the counters form the sequence 0, 1, ..., n - 1.
    relevant_tasks.sort(key=lambda task: task["ActorCounter"])
    for expected_counter, task in enumerate(relevant_tasks):
        assert task["ActorCounter"] == expected_counter

    # This is a mini replica of the worker's main_loop. This will loop over all
    # of the tasks that this actor is supposed to rerun. For each task, the
    # worker will submit the task to the local scheduler, retrieve the task
    # from the local scheduler, and execute the task.
    for task_spec_info in relevant_tasks:
        # Create a task spec out of the dictionary of info. This isn't
        # necessary. It is strictly for the purposes of checking that the task
        # we get back from the local scheduler is identical to the one we
        # submit.
        required_resources = task_spec_info["RequiredResources"]
        task_spec = ray.local_scheduler.Task(
            hex_to_object_id(task_spec_info["DriverID"]),
            hex_to_object_id(task_spec_info["FunctionID"]),
            task_spec_info["Args"], len(task_spec_info["ReturnObjectIDs"]),
            hex_to_object_id(task_spec_info["ParentTaskID"]),
            task_spec_info["ParentCounter"],
            hex_to_object_id(task_spec_info["ActorID"]),
            task_spec_info["ActorCounter"],
            [required_resources["CPUs"], required_resources["GPUs"]])

        # Verify that the return object IDs are the same as they were the
        # first time.
        assert task_spec_info["ReturnObjectIDs"] == task_spec.returns()

        # We need to wait for the actor to be imported and for the functions to
        # be defined before we can submit the task.
        worker._wait_for_function(hex_to_binary(task_spec_info["FunctionID"]),
                                  hex_to_binary(task_spec_info["DriverID"]))

        # Set some additional state. During normal operation
        # (non-reconstruction) this state would already be set because tasks
        # are only submitted from drivers or from workers that are in the
        # middle of executing other tasks.
        worker.task_driver_id = hex_to_object_id(task_spec_info["DriverID"])
        worker.current_task_id = hex_to_object_id(
            task_spec_info["ParentTaskID"])
        worker.task_index = task_spec_info["ParentCounter"]

        try:
            # Submit the task to the local scheduler. This is important so
            # that the local scheduler does bookkeeping about this actor's
            # resource utilization and things like that. It's also important
            # for updating some state on the worker.
            worker.submit_task(
                hex_to_object_id(task_spec_info["FunctionID"]),
                task_spec_info["Args"],
                actor_id=hex_to_object_id(task_spec_info["ActorID"]))
        finally:
            # Clear the extra state that we set, even if the submission
            # failed, so that it never outlives this replay step.
            del worker.task_driver_id
            del worker.current_task_id
            del worker.task_index

        # Get the task from the local scheduler.
        retrieved_task = worker._get_next_task_from_local_scheduler()
        # Assert that the retrieved task is the same as the constructed task.
        assert (ray.local_scheduler.task_to_string(task_spec) ==
                ray.local_scheduler.task_to_string(retrieved_task))

        # Wait for the task to be ready and execute the task.
        worker._wait_for_and_process_task(retrieved_task)

    # Enter the main loop to receive and process tasks.
    worker.main_loop()
Beispiel #11
0
    def _actor_method_call(self,
                           method_name,
                           args=None,
                           kwargs=None,
                           num_return_vals=None):
        """Method execution stub for an actor handle.

        This is the function that executes when
        `actor.method_name.remote(*args, **kwargs)` is called. Instead of
        executing locally, the method is packaged as a task and scheduled
        to the remote actor instance.

        Args:
            method_name: The name of the actor method to execute.
            args: A list of arguments for the actor method. Defaults to an
                empty list.
            kwargs: A dictionary of keyword arguments for the actor method.
                Defaults to an empty dictionary.
            num_return_vals: The number of values the method returns to the
                caller. One extra return value is requested on top of this
                for the dummy object that becomes the handle's new cursor.
                Must not be None unless Ray is running in LOCAL_MODE, where
                it is ignored.

        Returns:
            None if the task yields no object IDs for the caller, the single
            object ID if it yields exactly one, and otherwise the list of
            object IDs. In LOCAL_MODE, the value returned by calling the
            method directly on the local actor instance.

        Raises:
            TypeError: If num_return_vals is None when the method has to be
                submitted as a task.
        """
        worker = ray.worker.get_global_worker()

        worker.check_connected()

        function_signature = self._ray_method_signatures[method_name]
        if args is None:
            args = []
        if kwargs is None:
            kwargs = {}
        args = signature.extend_args(function_signature, args, kwargs)

        # Execute functions locally if Ray is run in LOCAL_MODE
        # Copy args to prevent the function from mutating them.
        if worker.mode == ray.LOCAL_MODE:
            return getattr(worker.actors[self._ray_actor_id],
                           method_name)(*copy.deepcopy(args))

        # Fail with an explicit message instead of the opaque
        # "unsupported operand" TypeError that `None + 1` would raise below.
        if num_return_vals is None:
            raise TypeError(
                "num_return_vals must be an integer when submitting the "
                "actor method '{}' as a task; got None.".format(method_name))

        is_actor_checkpoint_method = (method_name == "__ray_checkpoint__")

        function_descriptor = FunctionDescriptor(self._ray_module_name,
                                                 method_name,
                                                 self._ray_class_name)
        # The submission and the counter/cursor updates happen under the
        # handle's lock so that they are applied together.
        with self._ray_actor_lock:
            object_ids = worker.submit_task(
                function_descriptor,
                args,
                actor_id=self._ray_actor_id,
                actor_handle_id=self._ray_actor_handle_id,
                actor_counter=self._ray_actor_counter,
                is_actor_checkpoint_method=is_actor_checkpoint_method,
                actor_creation_dummy_object_id=(
                    self._ray_actor_creation_dummy_object_id),
                execution_dependencies=[self._ray_actor_cursor],
                new_actor_handles=self._ray_new_actor_handles,
                # We add one for the dummy return ID.
                num_return_vals=num_return_vals + 1,
                resources={"CPU": self._ray_actor_method_cpus},
                placement_resources={},
                driver_id=self._ray_actor_driver_id,
            )
            # Update the actor counter and cursor to reflect the most recent
            # invocation.
            self._ray_actor_counter += 1
            # The last object returned is the dummy object that should be
            # passed in to the next actor method. Do not return it to the user.
            self._ray_actor_cursor = object_ids.pop()
            # We have notified the backend of the new actor handles to expect
            # since the last task was submitted, so clear the list.
            self._ray_new_actor_handles = []

        # Unwrap the result: nothing -> None, one ID -> the ID itself.
        if not object_ids:
            return None
        if len(object_ids) == 1:
            return object_ids[0]
        return object_ids