def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory,
               resources, max_restarts, max_task_retries):
    """Build an ``ActorClass`` from a user-defined class.

    Args:
        cls: The user class; it is first passed through ``modify_class``.
        num_cpus: Forwarded unchanged to the ``ActorClass`` factory.
        num_gpus: Forwarded unchanged to the ``ActorClass`` factory.
        memory: Forwarded unchanged to the ``ActorClass`` factory.
        object_store_memory: Forwarded unchanged to the ``ActorClass``
            factory.
        resources: Forwarded unchanged to the ``ActorClass`` factory.
        max_restarts: Restart budget. ``None`` is treated as 0 and -1
            indicates infinite restarts. Non-negative values are capped at
            ``ray_constants.MAX_INT64_VALUE``.
        max_task_retries: ``None`` is treated as 0. Must be 0 whenever
            ``max_restarts`` is 0.

    Returns:
        The result of ``ActorClass._ray_from_modified_class``.

    Raises:
        ValueError: If ``max_restarts`` is less than -1, or if
            ``max_task_retries`` is non-zero while ``max_restarts`` is 0.
    """
    Class = modify_class(cls)

    if max_restarts is None:
        max_restarts = 0
    if max_task_retries is None:
        max_task_retries = 0

    # -1 is the "restart forever" sentinel and is passed through untouched.
    if max_restarts != -1:
        if max_restarts < 0:
            raise ValueError("max_restarts must be an integer >= -1; "
                             "-1 indicates infinite restarts")
        # Make sure we don't pass too big of an int to C++, causing
        # an overflow.
        max_restarts = min(max_restarts, ray_constants.MAX_INT64_VALUE)

    if max_restarts == 0 and max_task_retries != 0:
        raise ValueError(
            "max_task_retries cannot be set if max_restarts is 0.")

    return ActorClass._ray_from_modified_class(
        Class, ActorClassID.from_random(), max_restarts, max_task_retries,
        num_cpus, num_gpus, memory, object_store_memory, resources)
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory,
               resources, max_reconstructions):
    """Wrap a user class in a subclass with Ray hooks and build an ActorClass.

    Args:
        cls: The user class. Must be a new-style class and, if it derives
            from ``Checkpointable``, must not be abstract.
        num_cpus: Forwarded unchanged to the ``ActorClass`` factory.
        num_gpus: Forwarded unchanged to the ``ActorClass`` factory.
        memory: Forwarded unchanged to the ``ActorClass`` factory.
        object_store_memory: Forwarded unchanged to the ``ActorClass``
            factory.
        resources: Forwarded unchanged to the ``ActorClass`` factory.
        max_reconstructions: ``None`` defaults to 3 when
            ``ray_constants.direct_call_enabled()`` is true and to 0
            otherwise. Must lie between ``ray_constants.NO_RECONSTRUCTION``
            and ``ray_constants.INFINITE_RECONSTRUCTION`` inclusive.

    Returns:
        The result of ``ActorClass._ray_from_modified_class``.

    Raises:
        TypeError: If ``cls`` is an old-style class or an abstract
            ``Checkpointable``.
        ValueError: If ``max_reconstructions`` is out of range.
    """
    # Give an error if cls is an old-style class.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        if ray_constants.direct_call_enabled():
            # Allow the actor creation task to be resubmitted automatically
            # by default.
            max_reconstructions = 3
        else:
            max_reconstructions = 0

    if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
            ray_constants.INFINITE_RECONSTRUCTION):
        # ValueError (a subclass of Exception) is the conventional type for
        # a bad argument value; handlers catching Exception still match.
        raise ValueError("max_reconstructions must be in range [%d, %d]." %
                         (ray_constants.NO_RECONSTRUCTION,
                          ray_constants.INFINITE_RECONSTRUCTION))

    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
        def __ray_terminate__(self):
            worker = ray.worker.get_global_worker()
            if worker.mode != ray.LOCAL_MODE:
                ray.actor.exit_actor()

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            This task saves the current state of the actor, the current task
            frontier according to the raylet, and the checkpoint index
            (number of tasks executed so far).
            """
            worker = ray.worker.global_worker
            if not isinstance(self, ray.actor.Checkpointable):
                raise Exception(
                    "__ray_checkpoint__.remote() may only be called on actors "
                    "that implement ray.actor.Checkpointable")
            return worker._save_actor_checkpoint()

    # Make the wrapper subclass report the user class's module and name.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    return ActorClass._ray_from_modified_class(
        Class, ActorClassID.from_random(), max_reconstructions, num_cpus,
        num_gpus, memory, object_store_memory, resources)
def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               max_reconstructions):
    """Wrap a user class in a subclass with Ray hooks and build an ActorClass.

    Args:
        cls: The user class. Must be a new-style class and, if it derives
            from ``Checkpointable``, must not be abstract.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        max_reconstructions: ``None`` is treated as 0. Must lie between
            ``ray_constants.NO_RECONSTRUCTION`` and
            ``ray_constants.INFINITE_RECONSTRUCTION`` inclusive.

    Returns:
        An ``ActorClass`` built from the wrapper subclass and a freshly
        generated ``ActorClassID``.

    Raises:
        TypeError: If ``cls`` is an old-style class or an abstract
            ``Checkpointable``.
        ValueError: If ``max_reconstructions`` is out of range.
    """
    # Give an error if cls is an old-style class.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        max_reconstructions = 0

    if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
            ray_constants.INFINITE_RECONSTRUCTION):
        # ValueError (a subclass of Exception) is the conventional type for
        # a bad argument value; handlers catching Exception still match.
        raise ValueError("max_reconstructions must be in range [%d, %d]." %
                         (ray_constants.NO_RECONSTRUCTION,
                          ray_constants.INFINITE_RECONSTRUCTION))

    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
        def __ray_terminate__(self):
            worker = ray.worker.get_global_worker()
            if worker.mode != ray.LOCAL_MODE:
                # Disconnect the worker from the local scheduler. The point of
                # this is so that when the worker kills itself below, the local
                # scheduler won't push an error message to the driver.
                worker.raylet_client.disconnect()
                # sys.exit raises SystemExit, so nothing after it can run.
                sys.exit(0)

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            This task saves the current state of the actor, the current task
            frontier according to the local scheduler, and the checkpoint
            index (number of tasks executed so far).
            """
            worker = ray.worker.global_worker
            if not isinstance(self, ray.actor.Checkpointable):
                raise Exception(
                    "__ray_checkpoint__.remote() may only be called on actors "
                    "that implement ray.actor.Checkpointable")
            return worker._save_actor_checkpoint()

    # Make the wrapper subclass report the user class's module and name.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    class_id = ActorClassID(_random_string())

    return ActorClass(Class, class_id, max_reconstructions, num_cpus,
                      num_gpus, resources, actor_method_cpus)
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory,
               resources, max_reconstructions):
    """Turn a user class into an ``ActorClass``.

    The class is first passed through ``modify_class``. A
    ``max_reconstructions`` of ``None`` is treated as 0, and the value must
    lie between ``ray_constants.NO_RECONSTRUCTION`` and
    ``ray_constants.INFINITE_RECONSTRUCTION`` inclusive.

    Returns:
        The result of ``ActorClass._ray_from_modified_class``, called with a
        randomly generated ``ActorClassID`` and the remaining arguments
        forwarded unchanged.

    Raises:
        ValueError: If ``max_reconstructions`` is outside the allowed range.
    """
    modified_class = modify_class(cls)

    reconstructions = 0 if max_reconstructions is None else max_reconstructions

    within_bounds = (ray_constants.NO_RECONSTRUCTION <= reconstructions <=
                     ray_constants.INFINITE_RECONSTRUCTION)
    if not within_bounds:
        bounds = (ray_constants.NO_RECONSTRUCTION,
                  ray_constants.INFINITE_RECONSTRUCTION)
        raise ValueError(
            "max_reconstructions must be in range [%d, %d]." % bounds)

    class_id = ActorClassID.from_random()
    return ActorClass._ray_from_modified_class(
        modified_class, class_id, reconstructions, num_cpus, num_gpus,
        memory, object_store_memory, resources)
def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               checkpoint_interval, max_reconstructions):
    """Wrap a user class in a subclass with Ray hooks and build an ActorClass.

    The wrapper subclass adds methods for terminating the worker and for
    saving and restoring actor checkpoints.

    Args:
        cls: The user class. Must be a new-style class.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        checkpoint_interval: ``None`` is treated as -1. A value of 0 is
            rejected.
        max_reconstructions: ``None`` is treated as 0. Must lie between
            ``ray_constants.NO_RECONSTRUCTION`` and
            ``ray_constants.INFINITE_RECONSTRUCTION`` inclusive.

    Returns:
        An ``ActorClass`` built from the wrapper subclass and a freshly
        generated ``ActorClassID``.

    Raises:
        TypeError: If ``cls`` is an old-style class.
        ValueError: If ``checkpoint_interval`` is 0 or
            ``max_reconstructions`` is out of range.
    """
    # Give an error if cls is an old-style class.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    if checkpoint_interval is None:
        checkpoint_interval = -1
    if max_reconstructions is None:
        max_reconstructions = 0

    # ValueError (a subclass of Exception) is the conventional type for a bad
    # argument value; handlers catching Exception still match.
    if checkpoint_interval == 0:
        raise ValueError("checkpoint_interval must be greater than 0.")
    if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
            ray_constants.INFINITE_RECONSTRUCTION):
        raise ValueError("max_reconstructions must be in range [%d, %d]." %
                         (ray_constants.NO_RECONSTRUCTION,
                          ray_constants.INFINITE_RECONSTRUCTION))

    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
        def __ray_terminate__(self):
            worker = ray.worker.get_global_worker()
            if worker.mode != ray.LOCAL_MODE:
                # Disconnect the worker from the local scheduler. The point of
                # this is so that when the worker kills itself below, the local
                # scheduler won't push an error message to the driver.
                worker.raylet_client.disconnect()
                # sys.exit raises SystemExit, so nothing after it can run.
                sys.exit(0)

        def __ray_save_checkpoint__(self):
            # Prefer the user-supplied __ray_save__ hook; otherwise pickle
            # the actor object itself.
            if hasattr(self, "__ray_save__"):
                object_to_serialize = self.__ray_save__()
            else:
                object_to_serialize = self
            return pickle.dumps(object_to_serialize)

        @classmethod
        def __ray_restore_from_checkpoint__(cls, pickled_checkpoint):
            # NOTE(review): pickle.loads can execute arbitrary code, so the
            # checkpoint bytes must come from a trusted store -- confirm.
            checkpoint = pickle.loads(pickled_checkpoint)
            if hasattr(cls, "__ray_restore__"):
                actor_object = cls.__new__(cls)
                actor_object.__ray_restore__(checkpoint)
            else:
                # TODO(rkn): It's possible that this will cause problems. When
                # you unpickle the same object twice, the two objects will not
                # have the same class.
                actor_object = checkpoint
            return actor_object

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            This task saves the current state of the actor, the current task
            frontier according to the local scheduler, and the checkpoint
            index (number of tasks executed so far).
            """
            worker = ray.worker.global_worker
            checkpoint_index = worker.actor_task_counter
            # Get the state to save.
            checkpoint = self.__ray_save_checkpoint__()
            # Get the current task frontier, per actor handle.
            # NOTE(swang): This only includes actor handles that the local
            # scheduler has seen. Handle IDs for which no task has yet reached
            # the local scheduler will not be included, and may not be runnable
            # on checkpoint resumption.
            actor_id = worker.actor_id
            frontier = worker.raylet_client.get_actor_frontier(actor_id)
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints
            # should not be stored in Redis. Fix this.
            set_actor_checkpoint(worker, worker.actor_id, checkpoint_index,
                                 checkpoint, frontier)

        def __ray_checkpoint_restore__(self):
            """Restore a checkpoint.

            This task looks for a saved checkpoint and if found, restores the
            state of the actor, the task frontier in the local scheduler, and
            the checkpoint index (number of tasks executed so far).

            Returns:
                A bool indicating whether a checkpoint was resumed.
            """
            worker = ray.worker.global_worker
            # Get the most recent checkpoint stored, if any.
            checkpoint_index, checkpoint, frontier = get_actor_checkpoint(
                worker, worker.actor_id)
            # Try to resume from the checkpoint.
            checkpoint_resumed = False
            if checkpoint_index is not None:
                # Load the actor state from the checkpoint.
                worker.actors[worker.actor_id] = (
                    worker.actor_class.__ray_restore_from_checkpoint__(
                        checkpoint))
                # Set the number of tasks executed so far.
                worker.actor_task_counter = checkpoint_index
                # Set the actor frontier in the local scheduler.
                worker.raylet_client.set_actor_frontier(frontier)
                checkpoint_resumed = True

            return checkpoint_resumed

    # Make the wrapper subclass report the user class's module and name.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    class_id = ActorClassID(_random_string())

    return ActorClass(Class, class_id, checkpoint_interval,
                      max_reconstructions, num_cpus, num_gpus, resources,
                      actor_method_cpus)