Example #1
0
def test_multi_node_metrics_export_port_discovery(ray_start_cluster):
    """Match each node from ``ray.nodes()`` to an added node by metrics port.

    Three nodes are added to the cluster fixture. Every entry returned by
    ``ray.nodes()`` must map, through its ``MetricsExportPort``, to the
    address info of one of the added nodes with the same raylet socket
    name. The port is then polled over HTTP on localhost for a 200.
    """
    node_count = 3
    cluster = ray_start_cluster

    added_nodes = []
    for _ in range(node_count):
        added_nodes.append(cluster.add_node())

    # Index the address info of every added node by its metrics export port.
    address_info_by_port = {}
    for added in added_nodes:
        port_key = added.address_info["metrics_export_port"]
        address_info_by_port[port_key] = added.address_info

    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    for reported in ray.nodes():
        metrics_export_port = reported["MetricsExportPort"]
        expected = address_info_by_port[metrics_export_port]
        assert (expected["raylet_socket_name"] ==
                reported["RayletSocketName"])

        # Make sure we can ping Prometheus endpoints.
        def prometheus_endpoint_is_up():
            url = "http://localhost:{}".format(metrics_export_port)
            return requests.get(url).status_code == 200

        # Connection errors are retried by the helper.
        # NOTE(review): the helper's return value is discarded here, so
        # presumably a timeout does not fail the test -- confirm intent.
        wait_until_succeeded_without_exception(
            prometheus_endpoint_is_up,
            (requests.exceptions.ConnectionError, ))
Example #2
0
def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the summary counts reported by the dashboard memory table.

    Creates one driver-side object and two actors that each hold an object
    reference, enables memory fetching through ``/memory/set_fetch``, and
    then polls ``/memory/memory_table`` until the summary shows the
    expected actor-handle and local-reference counts.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert (wait_until_server_available(raw_url))

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    # These locals are kept only so the references stay alive while the
    # memory table is inspected.
    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    pending = [actor.get_obj.remote() for actor in actors]
    results = ray.get(pending)  # noqa

    webui_url = format_web_url(raw_url)
    fetch_params = {"shouldFetch": "true"}
    set_fetch_resp = requests.get(webui_url + "/memory/set_fetch",
                                  params=fetch_params)
    set_fetch_resp.raise_for_status()

    def check_mem_table():
        payload = requests.get(f"{webui_url}/memory/memory_table").json()
        assert payload["result"]
        summary = payload["data"]["memoryTable"]["summary"]
        # 1 ref per handle and per object the actor has a ref to
        expected_handles = len(actors) * 2
        assert summary["totalActorHandles"] == expected_handles
        # 1 ref for my_obj
        assert summary["totalLocalRefCount"] == 1

    # NOTE(review): the helper's return value is not checked -- confirm
    # whether a timeout is meant to fail this test.
    wait_until_succeeded_without_exception(
        check_mem_table, (AssertionError, ), timeout_ms=1000)
Example #3
0
def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    """Submit a job through the dashboard REST API and wait until it runs.

    The job is POSTed to ``/jobs`` once. The check then polls the job
    summary and the job detail endpoint until the submitted job is reported
    as alive with at least one actor and one worker. The last exception is
    re-raised if that does not happen within 30 seconds.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    job = _prepare_job_for_test(webui_url)
    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    # Start from a clean job directory; it may not exist yet.
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_submitted = False

    def _ok_json(resp):
        """Return the decoded body of ``resp`` after checking it succeeded."""
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        return result

    def _check_running():
        nonlocal job_submitted
        # Submit only once; later retries just re-check the job state.
        if not job_submitted:
            _ok_json(requests.post(f"{webui_url}/jobs", json=job))
            job_submitted = True

        result = _ok_json(requests.get(f"{webui_url}/jobs?view=summary"))
        summary = result["data"]["summary"]
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id when POST /jobs
        # The larger job id is the one we submitted.
        job_ids = sorted(s["jobId"] for s in summary)
        job_id = job_ids[1]

        # One detail fetch is enough: every assertion below is evaluated
        # against the same snapshot of the job.
        result = _ok_json(requests.get(f"{webui_url}/jobs/{job_id}"))
        detail = result["data"]["detail"]
        job_info = detail["jobInfo"]
        assert job_info["jobId"] == job_id
        assert job_info["isDead"] is False
        assert len(detail["jobActors"]) > 0
        assert len(detail["jobWorkers"]) > 0

    wait_until_succeeded_without_exception(_check_running,
                                           exceptions=(AssertionError,
                                                       KeyError, IndexError),
                                           timeout_ms=30 * 1000,
                                           raise_last_ex=True)
Example #4
0
def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    """Check that actor stdout is served by the ``/node_logs`` endpoint.

    Two actors print 4 lines and 1 line respectively. The endpoint is then
    polled, first for the whole node and then filtered by the first actor's
    pid, until the per-pid log counts match.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = format_web_url(cluster.webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            for i in range(n):
                print(f"On number {i}")

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    # Pids are converted to str because they are used as keys into the
    # decoded JSON response below.
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        logs_by_pid = node_logs["data"]["logs"]
        assert isinstance(logs_by_pid, dict)
        assert all(pid in logs_by_pid for pid in (la_pid, la2_pid))
        assert len(logs_by_pid[la2_pid]) == 1

        actor_one_logs_response = requests.get(
            f"{webui_url}/node_logs",
            params={
                "ip": node_ip,
                "pid": la_pid
            })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        actor_one_by_pid = actor_one_logs["data"]["logs"]
        assert isinstance(actor_one_by_pid, dict)
        assert len(actor_one_by_pid[la_pid]) == 4

    # The exceptions argument must be a real tuple: ``(AssertionError)``
    # without the trailing comma is just the class itself.
    # NOTE(review): the helper's return value is not checked -- confirm
    # whether a timeout is meant to fail this test.
    wait_until_succeeded_without_exception(
        check_logs, (AssertionError, ), timeout_ms=1000)
Example #5
0
 def test_pending_actor(ray_addresses):
     """Query ``/api/raylet_info`` and require exactly one actor entry.

     NOTE(review): this block looks like an incomplete excerpt of a nested
     check function -- confirm against the original test before relying on
     it. Two things stand out and are flagged inline below.
     """
     # NOTE(review): ``addresses`` is not defined in this block; the
     # parameter is named ``ray_addresses``. Presumably it was meant to come
     # from an enclosing scope -- confirm.
     assert (wait_until_server_available(addresses["webui_url"]) is True)
     webui_url = ray_addresses["webui_url"].replace("localhost",
                                                    "http://127.0.0.1")
     raylet_info = requests.get(webui_url + "/api/raylet_info").json()
     actor_info = raylet_info["result"]["actors"]
     assert len(actor_info) == 1
     # The single remaining entry is taken out of the dict; its value is not
     # used further in this block.
     _, infeasible_actor_info = actor_info.popitem()
     # NOTE(review): the function passes itself to the retry helper from
     # inside its own body, so each successful pass schedules another call.
     # This call presumably belongs to the enclosing test -- confirm.
     wait_until_succeeded_without_exception(
         test_pending_actor,
         (AssertionError, requests.exceptions.ConnectionError),
         addresses,
         timeout_ms=30000,
         retry_interval_ms=1000)
Example #6
0
def test_raylet_infeasible_tasks(shutdown_only):
    """
    This test creates an actor that requires 5 GPUs
    but a ray cluster only has 3 GPUs. As a result,
    the new actor should be an infeasible actor.

    The dashboard's ``/api/raylet_info`` endpoint is polled for up to
    30 seconds until it reports that single actor with state ``-1`` and
    ``invalidStateType == "infeasibleActor"``.
    """
    addresses = ray.init(num_gpus=3)

    @ray.remote(num_gpus=5)
    class ActorRequiringGPU:
        def __init__(self):
            pass

    ActorRequiringGPU.remote()

    def test_infeasible_actor(ray_addresses):
        # Use the argument supplied by the retry helper throughout instead
        # of mixing it with the enclosing ``addresses`` variable.
        assert (wait_until_server_available(ray_addresses["webui_url"])
                is True)
        webui_url = ray_addresses["webui_url"].replace("localhost",
                                                       "http://127.0.0.1")
        raylet_info = requests.get(webui_url + "/api/raylet_info").json()
        actor_info = raylet_info["result"]["actors"]
        assert len(actor_info) == 1

        _, infeasible_actor_info = actor_info.popitem()
        assert infeasible_actor_info["state"] == -1
        assert infeasible_actor_info["invalidStateType"] == "infeasibleActor"

    # Assertion failures and connection errors are retried by the helper;
    # ``addresses`` is forwarded to the check as its positional argument.
    assert (wait_until_succeeded_without_exception(
        test_infeasible_actor,
        (AssertionError, requests.exceptions.ConnectionError),
        addresses,
        timeout_ms=30000,
        retry_interval_ms=1000) is True)
Example #7
0
def test_get_cluster_status(ray_start_with_dashboard):
    """Exercise ``/api/cluster_status`` before and after Redis is populated.

    First the endpoint must answer with ``autoscalingStatus`` and
    ``autoscalingError`` both ``None``. Then the two debug keys are written
    to Redis and the endpoint must return the written values.
    """
    address_info = ray_start_with_dashboard
    assert (wait_until_server_available(address_info["webui_url"]) is True)
    webui_url = format_web_url(address_info["webui_url"])

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        body = response.json()
        print(body)
        assert body["result"]
        data = body["data"]
        assert "autoscalingStatus" in data
        assert data["autoscalingStatus"] is None
        assert "autoscalingError" in data
        assert data["autoscalingError"] is None
        assert "clusterStatus" in data
        assert "loadMetricsReport" in data["clusterStatus"]

    # Only request-level failures are retried; assertion errors propagate.
    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    host_and_port = address_info["redis_address"].split(":")
    assert len(host_and_port) == 2
    redis_host, redis_port = host_and_port

    client = redis.StrictRedis(
        host=redis_host,
        port=int(redis_port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)

    client.hset(DEBUG_AUTOSCALING_STATUS_LEGACY, "value", "hello")
    client.hset(DEBUG_AUTOSCALING_ERROR, "value", "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    body = response.json()
    assert body["result"]
    data = body["data"]
    assert "autoscalingStatus" in data
    assert data["autoscalingStatus"] == "hello"
    assert "autoscalingError" in data
    assert data["autoscalingError"] == "world"
    assert "clusterStatus" in data
    assert "loadMetricsReport" in data["clusterStatus"]
Example #8
0
def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    """Check that an actor's raised error is served under ``errors``.

    An actor method raises ``ValueError`` once. The ``/node_logs`` endpoint
    is then polled, first for the whole node and then filtered by the
    actor's pid, and the ``errors`` mapping is inspected for that pid.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = format_web_url(cluster.webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor():
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    # Resolve the remote call and convert to str: the pid is used as a key
    # into the decoded JSON response and as a query parameter. Without
    # ``ray.get`` this would be the unresolved result of ``.remote()``.
    ea_pid = str(ray.get(ea.get_pid.remote()))
    # Fire and forget: the call is expected to raise inside the actor.
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        errs_by_pid = node_errs["data"]["errors"]
        assert isinstance(errs_by_pid, dict)
        assert ea_pid in errs_by_pid
        assert len(errs_by_pid[ea_pid]) == 1

        actor_err_response = requests.get(
            f"{webui_url}/node_logs",
            params={
                "ip": node_ip,
                "pid": ea_pid
            })
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        actor_errs_by_pid = actor_errs["data"]["errors"]
        assert isinstance(actor_errs_by_pid, dict)
        # NOTE(review): the unfiltered query above expects 1 error for this
        # pid while this filtered query expects 4 -- confirm the intended
        # count.
        assert len(actor_errs_by_pid[ea_pid]) == 4

    # The exceptions argument must be a real tuple: ``(AssertionError)``
    # without the trailing comma is just the class itself.
    # NOTE(review): the helper's return value is not checked -- confirm
    # whether a timeout is meant to fail this test.
    wait_until_succeeded_without_exception(
        check_errs, (AssertionError, ), timeout_ms=1000)
Example #9
0
def test_raylet_pending_tasks(shutdown_only):
    """Check that an actor that cannot get a GPU is reported as pending.

    A parent actor spawns four actors that each require one GPU on a
    cluster with three GPUs. The dashboard's ``/api/raylet_info`` endpoint
    is polled for up to 30 seconds until exactly one of the four children
    is reported with ``invalidStateType == "pendingActor"``.
    """
    # Make sure to specify num_cpus. Otherwise, the test can be broken
    # when the number of cores is less than the number of spawned actors.
    addresses = ray.init(num_gpus=3, num_cpus=4)

    @ray.remote(num_gpus=1)
    class ActorRequiringGPU:
        def __init__(self):
            pass

    @ray.remote
    class ParentActor:
        def __init__(self):
            self.a = [ActorRequiringGPU.remote() for _ in range(4)]

    # If we do not get ParentActor actor handler, reference counter will
    # terminate ParentActor.
    parent_actor = ParentActor.remote()
    assert parent_actor is not None

    def test_pending_actor(ray_addresses):
        # Use the argument supplied by the retry helper throughout instead
        # of mixing it with the enclosing ``addresses`` variable.
        assert (wait_until_server_available(ray_addresses["webui_url"])
                is True)
        webui_url = ray_addresses["webui_url"].replace("localhost",
                                                       "http://127.0.0.1")
        raylet_info = requests.get(webui_url + "/api/raylet_info").json()
        actor_info = raylet_info["result"]["actors"]
        # Only the parent actor is expected at the top level.
        assert len(actor_info) == 1
        _, parent_actor_info = actor_info.popitem()

        # Verify there are 4 spawned actors.
        children = parent_actor_info["children"]
        assert len(children) == 4

        pending_actor_detected = sum(
            1 for child in children.values()
            if child.get("invalidStateType") == "pendingActor")
        # 4 GPUActors are spawned although there are only 3 GPUs.
        # One actor should be in the pending state.
        assert pending_actor_detected == 1

    # Assertion failures and connection errors are retried by the helper;
    # ``addresses`` is forwarded to the check as its positional argument.
    assert (wait_until_succeeded_without_exception(
        test_pending_actor,
        (AssertionError, requests.exceptions.ConnectionError),
        addresses,
        timeout_ms=30000,
        retry_interval_ms=1000) is True)