Exemple #1
0
def register_actor_signatures(worker, driver_id, class_name,
                              actor_method_names,
                              actor_method_num_return_vals):
    """Record FunctionProperties for every method of an actor class.

    Each method's function ID is computed from the class name and the method
    name, and the resulting entry is stored in
    worker.function_properties[driver_id].

    Args:
        worker: The worker whose function_properties table is updated.
        driver_id: The ID of the driver that this actor is associated with.
        class_name: The name of the actor class; used together with each
            method name to compute that method's function ID.
        actor_method_names: The names of the methods to register.
        actor_method_num_return_vals: The number of return values of each
            method, in the same order as actor_method_names.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    assert len(actor_method_names) == len(actor_method_num_return_vals)

    method_specs = zip(actor_method_names, actor_method_num_return_vals)
    for method_name, declared_returns in method_specs:
        # TODO(rkn): When we create a second actor, we are probably overwriting
        # the values from the first actor here. This may or may not be a
        # problem.
        method_function_id = compute_actor_method_function_id(
            class_name, method_name)
        properties_key = method_function_id.id()

        # One more return value than declared: the extra one is an actor
        # dummy object.
        method_properties = FunctionProperties(
            num_return_vals=declared_returns + 1,
            resources={"CPU": 1},
            max_calls=0)
        worker.function_properties[driver_id][properties_key] = (
            method_properties)
Exemple #2
0
def register_actor_signatures(worker,
                              driver_id,
                              class_id,
                              class_name,
                              actor_method_names,
                              actor_method_num_return_vals,
                              actor_creation_resources=None,
                              actor_method_cpus=None):
    """Record FunctionProperties for an actor class on a worker.

    An entry is stored in worker.function_properties[driver_id] for every
    actor method and, when actor_creation_resources is given, for the actor
    creation task as well.

    Args:
        worker: The worker whose function_properties table is updated.
        driver_id: The ID of the driver that this actor is associated with.
        class_id: The ID of the actor class; used to compute the function ID
            of the actor creation task.
        class_name: The name of the actor class; used together with each
            method name to compute that method's function ID.
        actor_method_names: The names of the methods to register.
        actor_method_num_return_vals: The number of return values of each
            method, in the same order as actor_method_names.
        actor_creation_resources: The resources required by the actor creation
            task. If None, the creation task is not registered.
        actor_method_cpus: The number of CPUs required by each actor method.

    Raises:
        AssertionError: If the two method lists differ in length.
    """
    assert len(actor_method_names) == len(actor_method_num_return_vals)

    method_specs = zip(actor_method_names, actor_method_num_return_vals)
    for method_name, declared_returns in method_specs:
        # TODO(rkn): When we create a second actor, we are probably overwriting
        # the values from the first actor here. This may or may not be a
        # problem.
        method_function_id = compute_actor_method_function_id(
            class_name, method_name)
        properties_key = method_function_id.id()

        # One more return value than declared: the extra one is an actor
        # dummy object. When actor_method_cpus is None, the "CPU" value
        # should never be used.
        method_properties = FunctionProperties(
            num_return_vals=declared_returns + 1,
            resources={"CPU": actor_method_cpus},
            max_calls=0)
        worker.function_properties[driver_id][properties_key] = (
            method_properties)

    if actor_creation_resources is None:
        # Nothing more to do: the creation task is only registered when its
        # resource requirements are supplied.
        return

    # Also register the actor creation task. It has no real return values;
    # the single one registered is the actor dummy object.
    creation_function_id = compute_actor_creation_function_id(class_id)
    creation_properties = FunctionProperties(
        num_return_vals=1,
        resources=actor_creation_resources,
        max_calls=0)
    worker.function_properties[driver_id][creation_function_id.id()] = (
        creation_properties)
Exemple #3
0
def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
                 worker):
  """Register an actor's methods, store the actor in Redis and announce it.

  The steps are: record FunctionProperties for each actor method on this
  worker, choose a local scheduler among the non-deleted ones in the client
  table, write the actor's hash under the Redis key b"Actor:" + actor ID,
  and finally publish a message on the "actor_notifications" channel.

  Args:
    actor_id: The ID of the actor.
    class_id: The ID of the actor's class; stored in the actor's Redis hash.
    actor_method_names (list): A list of the names of this actor's methods.
    num_cpus (int): The number of CPUs that this actor requires. Not read by
      this function.
    num_gpus (int): The number of GPUs that this actor requires. Passed to
      the local scheduler selection and stored in the actor's Redis hash.
    worker: The worker exporting the actor; supplies the mode, the driver
      ID, the function_properties table and the Redis client.

  Raises:
    Exception: If worker.mode is None (Ray has not been started).
  """
  ray.worker.check_main_thread()
  if worker.mode is None:
    raise Exception("Actors cannot be created before Ray has been started. "
                    "You can start Ray with 'ray.init()'.")
  actor_key = b"Actor:" + actor_id.id()
  driver_id = worker.task_driver_id.id()

  # For now, all actor methods have 1 return value.
  for method_name in actor_method_names:
    # TODO(rkn): When we create a second actor, we are probably overwriting
    # the values from the first actor here. This may or may not be a problem.
    properties_key = get_actor_method_function_id(method_name).id()
    method_properties = FunctionProperties(
        num_return_vals=1, num_cpus=1, num_gpus=0, max_calls=0)
    worker.function_properties[driver_id][properties_key] = method_properties

  # Collect every local scheduler in the client table that has not been
  # marked as deleted.
  live_local_schedulers = [
      client
      for _ip_address, clients in ray.global_state.client_table().items()
      for client in clients
      if client["ClientType"] == "local_scheduler" and not client["Deleted"]
  ]

  # Select a local scheduler for the actor.
  local_scheduler_id = select_local_scheduler(live_local_schedulers,
                                              num_gpus, worker)
  assert local_scheduler_id is not None

  # The actor information has to be in Redis before the notification goes
  # out, so that it is already there when the newly created actor tries to
  # fetch it.
  actor_info = {"class_id": class_id, "num_gpus": num_gpus}
  worker.redis_client.hmset(actor_key, actor_info)

  # Really we should encode this message as a flatbuffer object. However, we're
  # having trouble getting that to work. It almost works, but in Python 2.7,
  # builder.CreateString fails on byte strings that contain characters outside
  # range(128).

  # TODO(rkn): There is actually no guarantee that the local scheduler that we
  # are publishing to has already subscribed to the actor_notifications
  # channel. Therefore, this message may be missed and the workload will hang.
  # This is a bug.
  notification = actor_id.id() + driver_id + local_scheduler_id
  worker.redis_client.publish("actor_notifications", notification)
Exemple #4
0
def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
                 worker):
    """Register an actor's methods, store the actor in Redis and announce it.

    The steps are: record FunctionProperties for each actor method on this
    worker, choose a local scheduler, write the actor's hash under the Redis
    key b"Actor:" + actor ID, and finally call
    ray.utils.publish_actor_creation for the chosen local scheduler.

    Args:
        actor_id: The ID of the actor.
        class_id: The ID of the actor's class; stored in the actor's Redis
            hash.
        actor_method_names (list): A list of the names of this actor's methods.
        num_cpus (int): The number of CPUs that this actor requires. Not read
            by this function.
        num_gpus (int): The number of GPUs that this actor requires. Passed to
            the local scheduler selection and stored in the actor's Redis
            hash.
        worker: The worker exporting the actor; supplies the mode, the driver
            ID, the function_properties table and the Redis client.

    Raises:
        Exception: If worker.mode is None (Ray has not been started).
    """
    ray.worker.check_main_thread()
    if worker.mode is None:
        raise Exception("Actors cannot be created before Ray has been "
                        "started. You can start Ray with 'ray.init()'.")
    actor_key = b"Actor:" + actor_id.id()
    driver_id = worker.task_driver_id.id()

    # For now, all actor methods have 1 return value.
    for method_name in actor_method_names:
        # TODO(rkn): When we create a second actor, we are probably overwriting
        # the values from the first actor here. This may or may not be a
        # problem.
        properties_key = get_actor_method_function_id(method_name).id()
        method_properties = FunctionProperties(
            num_return_vals=1,
            num_cpus=1,
            num_gpus=0,
            num_custom_resource=0,
            max_calls=0)
        worker.function_properties[driver_id][properties_key] = (
            method_properties)

    # Select a local scheduler for the actor.
    selecting_driver_id = worker.task_driver_id.id()
    candidate_schedulers = ray.global_state.local_schedulers()
    local_scheduler_id = select_local_scheduler(
        selecting_driver_id, candidate_schedulers, num_gpus,
        worker.redis_client)
    assert local_scheduler_id is not None

    # The actor information has to be in Redis before the notification goes
    # out, so that it is already there when the newly created actor tries to
    # fetch it.
    actor_info = {
        "class_id": class_id,
        "driver_id": driver_id,
        "local_scheduler_id": local_scheduler_id,
        "num_gpus": num_gpus,
        "removed": False
    }
    worker.redis_client.hmset(actor_key, actor_info)

    # TODO(rkn): There is actually no guarantee that the local scheduler that
    # we are publishing to has already subscribed to the actor_notifications
    # channel. Therefore, this message may be missed and the workload will
    # hang. This is a bug.
    # NOTE(review): the meaning of the positional False is not visible here;
    # check the signature of ray.utils.publish_actor_creation.
    ray.utils.publish_actor_creation(
        actor_id.id(), driver_id, local_scheduler_id, False,
        worker.redis_client)
Exemple #5
0
def fetch_and_register_actor(actor_class_key, worker):
  """Load an actor class from Redis and install it on this worker.

  This will be called by the worker's import thread when the worker receives
  the actor_class export, assuming that the worker is an actor for that class.

  Placeholder methods that raise are installed first; they are replaced by
  the real methods only if the class unpickles successfully. If unpickling
  fails, the formatted traceback is pushed to the driver and the placeholders
  stay in place.

  Args:
    actor_class_key: The Redis key of the hash describing the actor class.
    worker: The worker that hosts the actor; its actors, functions,
      function_properties and num_task_executions tables are updated.
  """
  actor_id_str = worker.actor_id

  class_fields = ["driver_id", "class_id", "class_name", "module", "class",
                  "actor_method_names"]
  # class_id is fetched along with the other fields but is not used below.
  (driver_id, class_id, class_name, module, pickled_class,
   actor_method_names) = worker.redis_client.hmget(actor_class_key,
                                                   class_fields)

  actor_name = class_name.decode("ascii")
  module = module.decode("ascii")
  actor_method_names = json.loads(actor_method_names.decode("ascii"))

  # Install a stand-in actor whose methods all raise, so that if the real
  # class cannot be unpickled there is still something to produce error
  # messages and to keep the driver from hanging.
  class TemporaryActor(object):
    pass
  worker.actors[actor_id_str] = TemporaryActor()

  def temporary_actor_method(*xs):
    raise Exception("The actor with name {} failed to be imported, and so "
                    "cannot execute this method".format(actor_name))

  for method_name in actor_method_names:
    placeholder_key = get_actor_method_function_id(method_name).id()
    worker.functions[driver_id][placeholder_key] = (method_name,
                                                    temporary_actor_method)
    placeholder_properties = FunctionProperties(
        num_return_vals=1, num_cpus=1, num_gpus=0, max_calls=0)
    worker.function_properties[driver_id][placeholder_key] = (
        placeholder_properties)
    worker.num_task_executions[driver_id][placeholder_key] = 0

  def is_function_or_method(member):
    # Selects the class members that are registered as actor methods.
    return inspect.isfunction(member) or inspect.ismethod(member)

  try:
    unpickled_class = pickle.loads(pickled_class)
  except Exception:
    # The import failed: record the traceback and report it to the driver.
    traceback_str = ray.worker.format_error_message(traceback.format_exc())
    worker.push_error_to_driver(driver_id, "register_actor", traceback_str,
                                data={"actor_id": actor_id_str})
  else:
    # TODO(pcm): Why is the below line necessary?
    unpickled_class.__module__ = module
    # The instance is created with __new__ only; __init__ is not called here.
    worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)
    real_methods = inspect.getmembers(unpickled_class,
                                      predicate=is_function_or_method)
    for method_name, method in real_methods:
      method_key = get_actor_method_function_id(method_name).id()
      worker.functions[driver_id][method_key] = (method_name, method)
Exemple #6
0
def register_actor_signatures(worker, driver_id, class_name,
                              actor_method_names):
    """Record FunctionProperties for every method of an actor class.

    Each method's function ID is computed from the class name and the method
    name, and the resulting entry is stored in
    worker.function_properties[driver_id].

    Args:
        worker: The worker whose function_properties table is updated.
        driver_id: The ID of the driver that this actor is associated with.
        class_name: The name of the actor class; used together with each
            method name to compute that method's function ID.
        actor_method_names: The names of the methods to register.
    """
    for method_name in actor_method_names:
        # TODO(rkn): When we create a second actor, we are probably overwriting
        # the values from the first actor here. This may or may not be a
        # problem.
        method_function_id = compute_actor_method_function_id(
            class_name, method_name)
        properties_key = method_function_id.id()

        # Every method is registered with the same fixed properties.
        # NOTE(review): the original comment said all actor methods have 1
        # return value while 2 are registered; presumably the second is an
        # extra bookkeeping value -- confirm.
        method_properties = FunctionProperties(
            num_return_vals=2,
            num_cpus=1,
            num_gpus=0,
            num_custom_resource=0,
            max_calls=0)
        worker.function_properties[driver_id][properties_key] = (
            method_properties)
Exemple #7
0
def fetch_and_register_actor(actor_class_key, worker):
    """Load an actor class from Redis and install it on this worker.

    This will be called by the worker's import thread when the worker receives
    the actor_class export, assuming that the worker is an actor for that
    class.

    Placeholder method executors that raise are installed first; they are
    replaced by executors for the real methods only if the class unpickles
    successfully. If unpickling fails, the formatted traceback is pushed to
    the driver and the placeholders stay in place.

    Args:
        actor_class_key: The Redis key of the hash describing the actor class.
        worker: The worker that hosts the actor; its actors, functions,
            function_properties and num_task_executions tables and several
            attributes are updated.
    """
    actor_id_str = worker.actor_id

    class_fields = ["driver_id", "class_id", "class_name", "module", "class",
                    "checkpoint_interval", "actor_method_names"]
    # class_id is fetched along with the other fields but is not used below.
    (driver_id, class_id, class_name, module, pickled_class,
     checkpoint_interval, actor_method_names) = worker.redis_client.hmget(
         actor_class_key, class_fields)

    actor_name = class_name.decode("ascii")
    module = module.decode("ascii")
    checkpoint_interval = int(checkpoint_interval)
    actor_method_names = json.loads(actor_method_names.decode("ascii"))

    # Install a stand-in actor whose methods all raise, so that if the real
    # class cannot be unpickled there is still something to produce error
    # messages and to keep the driver from hanging.
    class TemporaryActor(object):
        pass
    worker.actors[actor_id_str] = TemporaryActor()
    worker.actor_checkpoint_interval = checkpoint_interval

    def temporary_actor_method(*xs):
        raise Exception("The actor with name {} failed to be imported, and so "
                        "cannot execute this method".format(actor_name))

    for method_name in actor_method_names:
        placeholder_key = get_actor_method_function_id(method_name).id()
        placeholder_executor = make_actor_method_executor(
            worker, method_name, temporary_actor_method)
        worker.functions[driver_id][placeholder_key] = (method_name,
                                                        placeholder_executor)
        placeholder_properties = FunctionProperties(
            num_return_vals=2,
            num_cpus=1,
            num_gpus=0,
            num_custom_resource=0,
            max_calls=0)
        worker.function_properties[driver_id][placeholder_key] = (
            placeholder_properties)
        worker.num_task_executions[driver_id][placeholder_key] = 0

    def is_function_or_method(member):
        # Selects the class members that are registered as actor methods.
        return inspect.isfunction(member) or inspect.ismethod(member)

    try:
        unpickled_class = pickle.loads(pickled_class)
        worker.actor_class = unpickled_class
    except Exception:
        # The import failed: record the traceback and report it to the
        # driver.
        traceback_str = ray.worker.format_error_message(traceback.format_exc())
        worker.push_error_to_driver(driver_id, "register_actor", traceback_str,
                                    data={"actor_id": actor_id_str})
        # TODO(rkn): In the future, it might make sense to have the worker exit
        # here. However, currently that would lead to hanging if someone calls
        # ray.get on a method invoked on the actor.
    else:
        # TODO(pcm): Why is the below line necessary?
        unpickled_class.__module__ = module
        # The instance is created with __new__ only; __init__ is not called
        # here.
        worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)

        real_methods = inspect.getmembers(unpickled_class,
                                          predicate=is_function_or_method)
        for method_name, method in real_methods:
            method_key = get_actor_method_function_id(method_name).id()
            method_executor = make_actor_method_executor(
                worker, method_name, method)
            worker.functions[driver_id][method_key] = (method_name,
                                                       method_executor)
            # worker.function_properties[driver_id][method_key] is
            # deliberately not set here.
            # NOTE(review): the original comment read "we currently do need
            # the actor worker to submit new tasks for the actor", which
            # looks like a typo for "do not need" -- confirm.

        # Store some extra information that will be used when the actor exits
        # to release GPU resources.
        worker.driver_id = binary_to_hex(driver_id)
        actor_key = b"Actor:" + actor_id_str
        local_scheduler_id = worker.redis_client.hget(actor_key,
                                                      "local_scheduler_id")
        worker.local_scheduler_id = binary_to_hex(local_scheduler_id)