Example #1
def driver(redis_address, driver_index):
    """The script run by a driver.

    The driver first waits until enough earlier drivers have finished, then
    creates ``num_gpus_per_driver`` actors that each require one GPU, calls
    ``check_ids`` on all of them 100 times, and finally announces that it is
    done.

    Args:
        redis_address: Address forwarded to ``_wait_for_event`` and
            ``_broadcast_event``.
        driver_index: Index of this driver. It selects which earlier
            "DRIVER_<i>_DONE" events to wait for and names the event that is
            broadcast when this driver finishes.

    Raises:
        Exception: If an actor cannot be created before the timeout expires.
    """
    # Wait for all the nodes to join the cluster.
    # NOTE(review): the statement(s) that performed this wait (and any Ray
    # initialization) are missing from this excerpt -- restore them from the
    # original source before running.

    # Limit the number of drivers running concurrently.
    for i in range(driver_index - max_concurrent_drivers + 1):
        _wait_for_event("DRIVER_{}_DONE".format(i), redis_address)

    def try_to_create_actor(actor_class, timeout=100):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote()
            except Exception:
                # Creation failed; back off briefly before retrying so the
                # loop does not spin.
                # NOTE(review): the original back-off is not visible in this
                # excerpt; 0.1s is an assumption -- confirm.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Create some actors that require one GPU.
    # NOTE(review): the loop body was missing from this excerpt; the actor
    # class is presumably Actor1 -- confirm against the original source.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
Example #2
def driver(redis_address, driver_index):
    """The script for all drivers.

    The driver first waits until enough earlier drivers have finished, then
    creates ``num_gpus_per_driver`` actors that each require one GPU, calls
    ``check_ids`` on all of them 100 times, and finally announces that it is
    done.

    Args:
        redis_address: Address forwarded to ``_wait_for_event`` and
            ``_broadcast_event``.
        driver_index: Index of this driver. It selects which earlier
            "DRIVER_<i>_DONE" events to wait for and names the event that is
            broadcast when this driver finishes.

    Raises:
        Exception: If an actor cannot be created before the timeout expires.
    """
    # Wait for all the nodes to join the cluster.
    # NOTE(review): the statement(s) that performed this wait (and any Ray
    # initialization) are missing from this excerpt -- restore them from the
    # original source before running.

    # Limit the number of drivers running concurrently.
    for i in range(driver_index - max_concurrent_drivers + 1):
        _wait_for_event("DRIVER_{}_DONE".format(i), redis_address)

    def try_to_create_actor(actor_class, timeout=500):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote()
            except Exception:
                # Creation failed; back off briefly before retrying so the
                # loop does not spin.
                # NOTE(review): the original back-off is not visible in this
                # excerpt; 0.1s is an assumption -- confirm.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Create some actors that require one GPU.
    # NOTE(review): the loop body was missing from this excerpt; the actor
    # class is presumably Actor1 -- confirm against the original source.
    actors_one_gpu = [
        try_to_create_actor(Actor1) for _ in range(num_gpus_per_driver)
    ]

    for _ in range(100):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
Example #3
def cleanup_driver(redis_address, driver_index):
    """The script for drivers 2 through 6.

    This driver should wait for the first two drivers to finish. Then it should
    create some actors that use a total of ten GPUs (only the driver with
    index 2 actually creates actors). It also counts how many of the tasks and
    actors reported by drivers 0 and 1 lived on this node.

    Args:
        redis_address: Address forwarded to ``_wait_for_event``,
            ``_broadcast_event`` and the actor constructors.
        driver_index: Index of this driver. Only index 2 creates and uses
            actors; the index also names the event that is broadcast when
            this driver finishes.

    Raises:
        Exception: If an actor cannot be created before the timeout expires.
    """
    # NOTE(review): any Ray initialization that preceded this point is
    # missing from this excerpt -- restore it from the original source.

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # We go ahead and create some actors that don't require any GPUs. We
        # don't need to wait for the other drivers to finish. We call methods
        # on these actors later to make sure they haven't been killed.
        actors_no_gpus = [Actor0.remote(driver_index, i, redis_address)
                          for i in range(10)]

    _wait_for_event("DRIVER_0_DONE", redis_address)
    _wait_for_event("DRIVER_1_DONE", redis_address)

    # NOTE(review): the default timeout was cut off in this excerpt; 20 is an
    # assumption -- confirm against the original source.
    def try_to_create_actor(actor_class, driver_index, actor_index,
                            timeout=20):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote(driver_index, actor_index,
                                           redis_address)
            except Exception:
                # Creation failed; back off briefly before retrying so the
                # loop does not spin.
                # NOTE(review): the original back-off is not visible in this
                # excerpt; 0.1s is an assumption -- confirm.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # Create some actors that require two GPUs.
        actors_two_gpus = []
        for i in range(3):
            actors_two_gpus.append(try_to_create_actor(Actor2, driver_index,
                                                       10 + i))
        # Create some actors that require one GPU.
        actors_one_gpu = []
        for i in range(4):
            actors_one_gpu.append(try_to_create_actor(Actor1, driver_index,
                                                      10 + 3 + i))

    removed_workers = 0

    # NOTE(review): in the loops below `pid` is unpacked but never checked;
    # the liveness check that the comments describe appears to be missing
    # from this excerpt -- restore it from the original source.

    # Make sure that the PIDs for the long-running tasks from driver 0 and
    # driver 1 have been killed.
    for i in range(num_long_running_tasks_per_driver):
        node_ip_address, pid = _wait_for_event(
            remote_function_event_name(0, i), redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1
    for i in range(num_long_running_tasks_per_driver):
        node_ip_address, pid = _wait_for_event(
            remote_function_event_name(1, i), redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1
    # Make sure that the PIDs for the actors from driver 0 and driver 1 have
    # been killed.
    for i in range(10):
        node_ip_address, pid = _wait_for_event(actor_event_name(0, i),
                                               redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1
    for i in range(9):
        node_ip_address, pid = _wait_for_event(actor_event_name(1, i),
                                               redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1

    print("{} workers/actors were removed on this node."
          .format(removed_workers))

    # Only one of the cleanup drivers should create and use more actors.
    if driver_index == 2:
        for _ in range(1000):
            ray.get([actor.check_ids.remote() for actor in actors_two_gpus])
            ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
            ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)
Example #4
def cleanup_driver(redis_address, driver_index):
    """The script for drivers 2 through 6.

    This driver should wait for the first two drivers to finish. Then it should
    create some actors that use a total of ten GPUs (only the driver with
    index 2 actually creates actors). It also counts how many of the tasks and
    actors reported by drivers 0 and 1 lived on this node.

    Args:
        redis_address: Address forwarded to ``_wait_for_event``,
            ``_broadcast_event`` and the actor constructors.
        driver_index: Index of this driver. Only index 2 creates and uses
            actors; the index also names the event that is broadcast when
            this driver finishes.

    Raises:
        Exception: If an actor cannot be created before the timeout expires.
    """
    # NOTE(review): any Ray initialization that preceded this point is
    # missing from this excerpt -- restore it from the original source.

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # We go ahead and create some actors that don't require any GPUs. We
        # don't need to wait for the other drivers to finish. We call methods
        # on these actors later to make sure they haven't been killed.
        actors_no_gpus = [
            Actor0.remote(driver_index, i, redis_address) for i in range(10)
        ]

    _wait_for_event("DRIVER_0_DONE", redis_address)
    _wait_for_event("DRIVER_1_DONE", redis_address)

    # NOTE(review): everything after `actor_class` in the signature was cut
    # off in this excerpt. The remaining parameters follow from the body and
    # the call site below; the default timeout of 20 is an assumption --
    # confirm against the original source.
    def try_to_create_actor(actor_class,
                            driver_index,
                            actor_index,
                            timeout=20):
        # Try to create an actor, but allow failures while we wait for the
        # monitor to release the resources for the removed drivers.
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                actor = actor_class.remote(driver_index, actor_index,
                                           redis_address)
            except Exception:
                # Creation failed; back off briefly before retrying so the
                # loop does not spin.
                # NOTE(review): the original back-off is not visible in this
                # excerpt; 0.1s is an assumption -- confirm.
                time.sleep(0.1)
            else:
                return actor
        # If we are here, then we timed out while looping.
        raise Exception("Timed out while trying to create actor.")

    # Only one of the cleanup drivers should create more actors.
    if driver_index == 2:
        # Create some actors that require one GPU.
        actors_one_gpu = []
        for i in range(10):
            actors_one_gpu.append(
                try_to_create_actor(Actor1, driver_index, 10 + 3 + i))

    removed_workers = 0

    # NOTE(review): in the loops below `pid` is unpacked but never checked;
    # the liveness check that the comments describe appears to be missing
    # from this excerpt -- restore it from the original source.

    # Make sure that the PIDs for the long-running tasks from driver 0 and
    # driver 1 have been killed.
    for i in range(num_long_running_tasks_per_driver):
        node_ip_address, pid = _wait_for_event(
            remote_function_event_name(0, i), redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1
    for i in range(num_long_running_tasks_per_driver):
        node_ip_address, pid = _wait_for_event(
            remote_function_event_name(1, i), redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1
    # Make sure that the PIDs for the actors from driver 0 and driver 1 have
    # been killed.
    for i in range(10):
        node_ip_address, pid = _wait_for_event(actor_event_name(0, i),
                                               redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1
    for i in range(9):
        node_ip_address, pid = _wait_for_event(actor_event_name(1, i),
                                               redis_address)
        if node_ip_address == ray.services.get_node_ip_address():
            removed_workers += 1

    print(
        "{} workers/actors were removed on this node.".format(removed_workers))

    # Only one of the cleanup drivers should create and use more actors.
    if driver_index == 2:
        for _ in range(1000):
            ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
            ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address)