Example #1
def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(initialize_head=True,
                      head_node_args={
                          "num_cpus": 10,
                          "redis_max_memory": 10**7
                      })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    yield num_nodes, num_workers_per_scheduler, cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
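The fixture above reads `request.param`, so it is presumably parametrized through its decorator, which the snippet does not show. A minimal sketch of how such a fixture might be declared with pytest's standard parametrization (the decorator and the specific parameter tuples below are illustrative assumptions, not values from the original test suite):

import pytest

# Hypothetical parametrization: each tuple is (num_nodes, num_workers_per_scheduler).
@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_combination(request):
    num_nodes, num_workers_per_scheduler = request.param
    ...  # start the cluster and yield, as in the example above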
Example #2
def test_namespace():
    """
    Most of the "checks" in this test case rely on the fact that
    `run_string_as_driver` will throw an exception if the driver string exits
    with a non-zero exit code (e.g. when the driver script throws an
    exception). Since all of these drivers start named, detached actors, the
    most likely failure case would be a collision of named actors if they're
    put in the same namespace.

    This test checks that:
    * When two drivers don't specify a namespace, they are placed in different
      anonymous namespaces.
    * When two drivers specify a namespace, they collide.
    * The namespace name (as provided by the runtime context) is correct.
    """
    cluster = Cluster()
    cluster.add_node(num_cpus=4, ray_client_server_port=50055)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.client("localhost:50055").namespace({namespace}).connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(lifetime="detached", name="abc").remote()
ray.get(a.ping.remote())
print(ray.get_runtime_context().namespace)
    """

    anon_driver = template.format(namespace="None")
    run_string_as_driver(anon_driver)
    # This second run will fail if the actors don't run in separate anonymous
    # namespaces.
    run_string_as_driver(anon_driver)

    run_in_namespace = template.format(namespace="'namespace'")
    script_namespace = run_string_as_driver(run_in_namespace)
    # The second run fails because the actors are run in the same namespace.
    with pytest.raises(subprocess.CalledProcessError):
        run_string_as_driver(run_in_namespace)

    assert script_namespace.strip() == "namespace"
Example #3
def setup_local_single_node_cluster(num_nodes):
    """Setup ray cluster locally via ray.init() and Cluster()

    Each actor is simulated in local process on single node,
    thus smaller scale by default.
    """
    cluster = Cluster()
    for i in range(num_nodes):
        cluster.add_node(
            redis_port=6379 if i == 0 else None,
            num_cpus=NUM_CPU_PER_NODE,
            num_gpus=0,
            resources={str(i): 2},
        )
    ray.init(address=cluster.address, dashboard_host="0.0.0.0")
    serve_client = serve.start(http_options=dict(location="EveryNode"))

    return serve_client
Example #4
def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except OSError:
        pass  # It could have been removed by Ray.
    cluster = Cluster(True)
    cluster.add_node(plasma_store_socket_name="/tmp/i_am_a_temp_socket_2")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket_2"), "Specified socket path not found."
    cluster.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket_2")
    except OSError:
        pass  # It could have been removed by Ray.
Example #5
def test_ray_status_multinode():
    from ray.cluster_utils import Cluster
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    def output_ready():
        result = runner.invoke(scripts.status)
        result.stdout
        return not result.exception and "memory" in result.output

    wait_for_condition(output_ready)

    result = runner.invoke(scripts.status, [])
    _check_output_via_pattern("test_ray_status_multinode.txt", result)
    ray.shutdown()
    cluster.shutdown()
Example #6
def test_raylet_socket_name(shutdown_only):
    sock1 = unix_socket_create_path("i_am_a_temp_socket_1")
    ray.init(raylet_socket_name=sock1)
    unix_socket_verify(sock1)
    ray.shutdown()
    try:
        unix_socket_delete(sock1)
    except OSError:
        pass  # It could have been removed by Ray.
    cluster = Cluster(True)
    sock2 = unix_socket_create_path("i_am_a_temp_socket_2")
    cluster.add_node(raylet_socket_name=sock2)
    unix_socket_verify(sock2)
    cluster.shutdown()
    try:
        unix_socket_delete(sock2)
    except OSError:
        pass  # It could have been removed by Ray.
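For reference, the unix_socket_* helpers used above are test utilities that the snippet does not define. A rough, hypothetical sketch of what such helpers could look like (an illustrative assumption, not Ray's actual implementation):

import os
import sys
import tempfile

def unix_socket_create_path(name):
    # Hypothetical: place the socket under a fresh temporary directory.
    return os.path.join(tempfile.mkdtemp(), name)

def unix_socket_verify(socket_path):
    # Hypothetical: on POSIX systems, check that the socket file was created.
    if sys.platform != "win32":
        assert os.path.exists(socket_path), f"Socket {socket_path} not found."

def unix_socket_delete(socket_path):
    # Hypothetical: remove the socket file; callers tolerate OSError.
    os.remove(socket_path)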
Example #7
def test_pull_bundles_admission_control_dynamic(shutdown_only):
    # This test is the same as test_pull_bundles_admission_control, except that
    # the object store's capacity starts off higher and is later consumed
    # dynamically by concurrent workers.
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 20
    num_tasks = 20
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0,
                     object_store_memory=2 * num_tasks * num_objects *
                     object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can fit 2 tasks at a time.
    cluster.add_node(num_cpus=1,
                     object_store_memory=2.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(i, *args):
        print("foo", i)
        return

    @ray.remote
    def allocate(i):
        print("allocate", i)
        return np.zeros(object_size, dtype=np.uint8)

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    allocated = [allocate.remote(i) for i in range(num_objects)]
    ray.get(allocated)

    tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)]
    ray.get(tasks)
    del allocated
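The sizing comments in this test follow directly from its constants; a quick back-of-the-envelope check using the same values (pure arithmetic, for clarity only, not part of the original test):

object_size = int(6e6)   # 6 MB per object
num_objects = 20         # objects per task
num_tasks = 20

task_footprint = num_objects * object_size                   # 120 MB of arguments per task
head_capacity = 2 * num_tasks * num_objects * object_size    # 4.8 GB, fits every argument at once
worker_capacity = 2.5 * num_objects * object_size            # 300 MB, roughly two tasks' worth of args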
Example #8
def test_system_config_when_connecting(ray_start_cluster):
    config = {"object_timeout_milliseconds": 200}
    cluster = Cluster()
    cluster.add_node(_system_config=config, object_store_memory=100 * 1024 * 1024)
    cluster.wait_for_nodes()

    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    for _ in range(5):
        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
    del put_ref

    ray.get(obj_ref)
Example #9
def test_ray_status_multinode():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    def output_ready():
        result = runner.invoke(scripts.status)
        result.stdout
        if not result.exception and "memory" in result.output:
            return True
        raise RuntimeError(f"result.exception={result.exception} "
                           f"result.output={result.output}")

    wait_for_condition(output_ready)

    result = runner.invoke(scripts.status, [])
    _check_output_via_pattern("test_ray_status_multinode.txt", result)
    ray.shutdown()
    cluster.shutdown()
Example #10
def run_multi_nodes():
    c = Cluster()
    c.add_node(num_cpus=4,
               object_store_memory=object_store_size,
               _system_config=system_config)
    ray.init(address=c.address)
    for _ in range(num_nodes - 1):  # subtract one for the head node.
        c.add_node(num_cpus=4, object_store_memory=object_store_size)
    c.wait_for_nodes()

    # Run shuffle.
    print(
        f"\n\nTest streaming shuffle with {num_nodes} nodes.\n"
        f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}"
        "GB")
    run_shuffle()
    time.sleep(5)
    display_spilling_info(c.address)
    ray.shutdown()
    c.shutdown()
    time.sleep(5)
Example #11
def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    cluster.add_node(
        num_cpus=0,
        object_store_memory=75 * 1024 * 1024,
        _system_config={"object_spilling_config": object_spilling_config},
    )
    ray.init(address=cluster.address)
    node2 = cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # This task will run on node 2 because node 1 has no CPU resource
    @ray.remote(num_cpus=1)
    def run_workload():
        ids = []
        for _ in range(2):
            arr = np.random.rand(5 * 1024 * 1024)  # 40 MB
            ids.append(ray.put(arr))
        return ids

    ids = ray.get(run_workload.remote())
    assert not is_dir_empty(temp_folder)

    # Kill node 2
    cluster.remove_node(node2)

    # Verify that the spill folder is not empty
    assert not is_dir_empty(temp_folder)

    # Start a new node
    cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # Verify that the spill folder is now cleaned up
    assert is_dir_empty(temp_folder)

    # The object refs were held until now to prevent the objects from being deleted.
    del ids
    ray.shutdown()
    cluster.shutdown()
Example #12
def test_pull_bundles_pinning(shutdown_only):
    cluster = Cluster()
    object_size = int(50e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0, object_store_memory=1000e6)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node cannot even fit a single task.
    cluster.add_node(num_cpus=1, object_store_memory=200e6)
    cluster.wait_for_nodes()

    @ray.remote(num_cpus=1)
    def foo(*args):
        return

    task_args = [
        ray.put(np.zeros(object_size, dtype=np.uint8))
        for _ in range(num_objects)
    ]
    ray.get(foo.remote(*task_args))
Example #13
def setup_local_single_node_cluster(num_nodes):
    """Setup ray cluster locally via ray.init() and Cluster()

    Each actor is simulated in local process on single node,
    thus smaller scale by default.
    """
    cluster = Cluster()
    for i in range(num_nodes):
        cluster.add_node(
            redis_port=6379 if i == 0 else None,
            num_redis_shards=NUM_REDIS_SHARDS if i == 0 else None,
            num_cpus=NUM_CPU_PER_NODE,
            num_gpus=0,
            resources={str(i): 2},
            object_store_memory=OBJECT_STORE_MEMORY,
            redis_max_memory=REDIS_MAX_MEMORY,
            dashboard_host="0.0.0.0",
        )
    ray.init(address=cluster.address, dashboard_host="0.0.0.0")
    serve_client = serve.start()

    return serve_client
Example #14
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return x

    obj = ray.put(np.zeros(10**7, dtype=np.uint8))
    result = dependent_task.options(resources={"node1": 1}).remote(obj)
    ray.get(result)
    del obj

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)

    for _ in range(20):
        ray.put(np.zeros(10**7, dtype=np.uint8))

    if reconstruction_enabled:
        ray.get(result)
    else:
        with pytest.raises(ray.exceptions.UnreconstructableError):
            ray.get(result)
Example #15
def ray_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 1
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    yield
    ray.shutdown()
Example #16
def main():
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={"object_store_memory": 20 * 1024 * 1024 * 1024, "num_cpus": 16},
    )
    cluster.add_node(
        object_store_memory=20 * 1024 * 1024 * 1024, num_gpus=1, num_cpus=16
    )

    object_ref_list = []
    for i in range(0, 10):
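        # Each array holds 131072 x 1024 float64 values, i.e. 1 GiB of data.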
        object_ref = ray.put(np.random.rand(1024 * 128, 1024))
        object_ref_list.append(object_ref)

    @ray.remote(num_gpus=1)
    def f(object_ref_list):
        diffs = []
        for object_ref in object_ref_list:
            before = time.time()
            ray.get(object_ref)
            after = time.time()
            diffs.append(after - before)
            time.sleep(1)
        return np.mean(diffs), np.std(diffs)

    time_diff, time_diff_std = ray.get(f.remote(object_ref_list))

    print(
        "latency to get an 1G object over network",
        round(time_diff, 2),
        "+-",
        round(time_diff_std, 2),
    )

    ray.shutdown()
    cluster.shutdown()
Example #17
def test_cluster_handle_affinity():
    cluster = Cluster()
    # HACK: use two different IP addresses so that the placement constraint
    # for the resource check later will work.
    head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=4)
    cluster.add_node(node_ip_address="0.0.0.0", num_cpus=4)

    ray.init(head_node.address)

    # Make sure we have two nodes.
    node_ids = [n["NodeID"] for n in ray.nodes()]
    assert len(node_ids) == 2

    # Start the backend.
    client = serve.start(http_port=randint(10000, 30000), detached=True)
    client.create_backend("hi:v0", lambda _: "hi")
    client.create_endpoint("hi", backend="hi:v0")

    # Try to retrieve the handle from both head and worker node, check the
    # router's node id.
    @ray.remote
    def check_handle_router_id():
        client = serve.connect()
        handle = client.get_handle("hi")
        return get_node_id_for_actor(handle.router_handle)

    router_node_ids = ray.get([
        check_handle_router_id.options(resources={
            node_id: 0.01
        }).remote() for node_id in ray.state.node_ids()
    ])

    assert set(router_node_ids) == set(node_ids)

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
Example #18
def test_ray_get_task_args_deadlock(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0,
                     object_store_memory=4 * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(num_cpus=1,
                     object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    @ray.remote
    def test_deadlock(get_args, task_args):
        foo.remote(*task_args)
        ray.get(get_args)

    for i in range(5):
        start = time.time()
        get_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        ray.get(test_deadlock.remote(get_args, task_args))
        print(f"round {i} finished in {time.time() - start}")
Example #19
def ray_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    mo.setup_cluster(address_to_resources=TEST_ADDRESS_TO_RESOURCES)

    yield

    RayActorDriver.stop_cluster()
    ray.shutdown()
    cluster.shutdown()
Example #20
def ray_start_regular_shared():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init()
    if hasattr(ray.util, "get_placement_group"):
        pg = ray.util.placement_group(name=pg_name,
                                      bundles=[{
                                          'CPU': n_process
                                      }],
                                      strategy="SPREAD")
        ray.get(pg.ready())
    yield
    ray.shutdown()
Example #21
def test_cached_object(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8)
    assert wait_for_condition(lambda: not all(node["Alive"]
                                              for node in ray.nodes()),
                              timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
Example #22
def ray_large_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    register_ray_serializers()
    try:
        yield
    finally:
        unregister_ray_serializers()
        Router.set_instance(None)
        RayServer.clear()
        ray.shutdown()
        cluster.shutdown()
        if 'COV_CORE_SOURCE' in os.environ:
            # Remove this when https://github.com/ray-project/ray/issues/16802 is fixed.
            subprocess.check_call(["ray", "stop", "--force"])
Example #23
def _ray_start_cluster(**kwargs):
    init_kwargs = get_default_fixture_ray_kwargs()
    num_nodes = 0
    do_init = False
    # num_nodes & do_init are not arguments for ray.init, so delete them.
    if "num_nodes" in kwargs:
        num_nodes = kwargs["num_nodes"]
        del kwargs["num_nodes"]
    if "do_init" in kwargs:
        do_init = kwargs["do_init"]
        del kwargs["do_init"]
    elif num_nodes > 0:
        do_init = True
    init_kwargs.update(kwargs)
    cluster = Cluster()
    remote_nodes = []
    for _ in range(num_nodes):
        remote_nodes.append(cluster.add_node(**init_kwargs))
    if do_init:
        ray.init(address=cluster.address)
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
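A generator such as _ray_start_cluster is typically wrapped by pytest fixtures; a minimal sketch of such a wrapper (the fixture name and arguments below are illustrative assumptions):

import pytest

@pytest.fixture
def ray_start_cluster_2_nodes():
    # Hypothetical wrapper: delegate setup and teardown to the generator above.
    yield from _ray_start_cluster(num_nodes=2, do_init=True)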
Example #24
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
    }
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    # TODO(swang): This test does not pass if max_restarts > 0 for the
    # raylet codepath. Add this parameter once the GCS actor service is enabled
    # by default.
    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent process.
        os.kill(pid, 9)
        wait_for_condition(child_resource_available)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        wait_for_condition(child_resource_available)
        return node_to_kill

    if node_failure:
        test_node_failure(node_to_kill, use_actors)
    else:
        test_process_failure(use_actors)
Example #25
object_store_memory = 10**8
num_nodes = 10

message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory <
        ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.

cluster = Cluster()
for i in range(num_nodes):
    cluster.add_node(redis_port=6379 if i == 0 else None,
                     num_redis_shards=num_redis_shards if i == 0 else None,
                     num_cpus=2,
                     num_gpus=0,
                     resources={str(i): 2},
                     object_store_memory=object_store_memory,
                     redis_max_memory=redis_max_memory,
                     webui_host="0.0.0.0")
ray.init(address=cluster.address)

# Run the workload.


@ray.remote
def f(*xs):
    return np.zeros(1024, dtype=np.uint8)


iteration = 0
ids = []
Example #26
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "num_cpus": 1,
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        self.trial_executor = RayTrialExecutor(queue_trials=True,
                                               refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that reset handles NotImplemented properly."""
        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        #  trials with different resource requirements are not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))
Example #27
def test_shutdown():
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    g.shutdown()
    assert not any(n.any_processes_alive() for n in [node, node2])
Example #28
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    client = serve.start(http_port=8005)  # noqa: F841

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
Example #29
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.torch:
            exp["config"]["use_pytorch"] = True
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 2
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3
        if args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        ### Add Custom Callbacks
        exp["config"]["callbacks"] = CustomCallbacks

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(num_cpus=args.ray_num_cpus or 1,
                             num_gpus=args.ray_num_gpus or 0,
                             object_store_memory=args.ray_object_store_memory,
                             memory=args.ray_memory,
                             redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(address=args.ray_address,
                 object_store_memory=args.ray_object_store_memory,
                 memory=args.ray_memory,
                 redis_max_memory=args.ray_redis_max_memory,
                 num_cpus=args.ray_num_cpus,
                 num_gpus=args.ray_num_gpus)

    # NOTE: custom loggers.
    for exp in experiments.values():
        exp["loggers"] = make_loggers(args)

    # launch training
    run_experiments(experiments,
                    scheduler=_make_scheduler(args),
                    queue_trials=args.queue_trials,
                    resume=args.resume,
                    verbose=verbose,
                    concurrent=True)
Example #30
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial
                    and resources_to_json(args.resources_per_trial)
                ),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "sync_config": {
                    "upload_dir": args.upload_dir,
                },
            }
        }

    # Ray UI.
    if args.no_ray_ui:
        deprecation_warning(old="--no-ray-ui", new="--ray-ui", error=False)
        args.ray_ui = False

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        input_ = exp.get("config", {}).get("input")

        if input_ and input_ != "sampler":
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent

            def patch_path(path):
                if isinstance(path, list):
                    return [patch_path(i) for i in path]
                elif isinstance(path, dict):
                    return {
                        patch_path(k): patch_path(v)
                        for k, v in path.items()
                    }
                elif isinstance(path, str):
                    if os.path.exists(path):
                        return path
                    else:
                        abs_path = str(rllib_dir.absolute().joinpath(path))
                        return abs_path if os.path.exists(abs_path) else path
                else:
                    return path

            exp["config"]["input"] = patch_path(input_)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            deprecation_warning("--torch", "--framework=torch")
            exp["config"]["framework"] = "torch"
        elif args.eager:
            deprecation_warning("--eager", "--framework=[tf2|tfe]")
            exp["config"]["framework"] = "tfe"
        elif args.framework is not None:
            exp["config"]["framework"] = args.framework

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result

    if args.ray_num_nodes:
        # Import this only here so that train.py also works with
        # older versions (when the user doesn't use `--ray-num-nodes`).
        from ray.cluster_utils import Cluster

        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
            )
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=args.ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode,
        )

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(print_intermediate_tables=verbose >= 1)

    run_experiments(
        experiments,
        scheduler=create_scheduler(args.scheduler, **args.scheduler_config),
        resume=args.resume,
        verbose=verbose,
        progress_reporter=progress_reporter,
        concurrent=True,
    )

    ray.shutdown()