Example #1
def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    job = _prepare_job_for_test(webui_url)
    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_id = None
    job_submitted = False

    def _check_running():
        nonlocal job_id
        nonlocal job_submitted
        if not job_submitted:
            resp = requests.post(f"{webui_url}/jobs", json=job)
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True, resp.text
            job_submitted = True

        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        summary = result["data"]["summary"]
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id in the response of POST /jobs.
        # Until then, the larger of the two job ids is the one we submitted.
        job_ids = sorted(s["jobId"] for s in summary)
        job_id = job_ids[1]

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_info = result["data"]["detail"]["jobInfo"]
        assert job_info["jobId"] == job_id

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_info = result["data"]["detail"]["jobInfo"]
        assert job_info["isDead"] is False
        job_actors = result["data"]["detail"]["jobActors"]
        job_workers = result["data"]["detail"]["jobWorkers"]
        assert len(job_actors) > 0
        assert len(job_workers) > 0

    wait_until_succeeded_without_exception(
        _check_running,
        exceptions=(AssertionError, KeyError, IndexError),
        timeout_ms=30 * 1000,
        raise_last_ex=True)
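
Example #1 and most of the examples that follow poll their checks through wait_until_succeeded_without_exception, which comes from Ray's test utilities and is not shown in this section. A minimal sketch of the retry loop it is assumed to implement, with the signature inferred from the calls in these examples (the real implementation may differ), is:

import time


def wait_until_succeeded_without_exception(func, exceptions, *args,
                                           timeout_ms=1000,
                                           retry_interval_ms=100,
                                           raise_last_ex=False):
    # Call `func` repeatedly, swallowing only the listed exception types,
    # until it returns without raising or the timeout elapses. Returns True
    # on success and False on timeout; if `raise_last_ex` is set, the most
    # recent exception is re-raised instead of returning False.
    deadline = time.monotonic() + timeout_ms / 1000.0
    last_ex = None
    while time.monotonic() < deadline:
        try:
            func(*args)
            return True
        except exceptions as ex:
            last_ex = ex
            time.sleep(retry_interval_ms / 1000.0)
    if raise_last_ex and last_ex is not None:
        raise last_ex
    return False

Note that a timeout only surfaces if the caller asserts the return value (as Examples #4 and #8 do) or passes raise_last_ex=True (as Example #1 does); the other examples would pass silently if their checks never succeeded.
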
Example #2
def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]))

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    results = ray.get([actor.get_obj.remote() for actor in actors])  # noqa
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    resp = requests.get(webui_url + "/memory/set_fetch",
                        params={"shouldFetch": "true"})
    resp.raise_for_status()

    def check_mem_table():
        resp = requests.get(f"{webui_url}/memory/memory_table")
        resp_data = resp.json()
        assert resp_data["result"]
        latest_memory_table = resp_data["data"]["memoryTable"]
        summary = latest_memory_table["summary"]
        # One entry per actor handle and one per object each actor holds a
        # ref to.
        assert summary["totalActorHandles"] == len(actors) * 2
        # 1 ref for my_obj
        assert summary["totalLocalRefCount"] == 1

    wait_until_succeeded_without_exception(check_mem_table, (AssertionError, ),
                                           timeout_ms=1000)
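
For reference, the shape of the /memory/memory_table response that check_mem_table relies on, reconstructed from the fields it reads (the values are illustrative and other fields are omitted), is assumed to be:

example_memory_table_response = {
    "result": True,
    "data": {
        "memoryTable": {
            "summary": {
                "totalActorHandles": 4,   # len(actors) * 2 in the test above
                "totalLocalRefCount": 1,  # the driver's ref to my_obj
                # ...other aggregate counters
            },
            # ...per-object rows omitted
        },
    },
}
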
Example #3
def test_multi_node_metrics_export_port_discovery(ray_start_cluster):
    NUM_NODES = 3
    cluster = ray_start_cluster
    nodes = [cluster.add_node() for _ in range(NUM_NODES)]
    nodes = {
        node.address_info["metrics_export_port"]: node.address_info
        for node in nodes
    }
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)
    node_info_list = ray.nodes()

    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        address_info = nodes[metrics_export_port]
        assert (address_info["raylet_socket_name"] ==
                node_info["RayletSocketName"])

        # Make sure we can ping Prometheus endpoints.
        def test_prometheus_endpoint():
            response = requests.get(
                "http://localhost:{}".format(metrics_export_port))
            return response.status_code == 200

        wait_until_succeeded_without_exception(
            test_prometheus_endpoint, (requests.exceptions.ConnectionError, ))
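
The check above only verifies that each metrics port answers with HTTP 200. A slightly stronger variant could parse the Prometheus text exposition format and look for expected metric names; fetch_prometheus_metric_names is a hypothetical helper, and the ray_ prefix it looks for is an assumption the original test does not make.

import requests


def fetch_prometheus_metric_names(port):
    # Download the exporter's plain-text output and collect metric names,
    # skipping comment lines (# HELP / # TYPE).
    response = requests.get(f"http://localhost:{port}")
    response.raise_for_status()
    names = set()
    for line in response.text.splitlines():
        if line and not line.startswith("#"):
            names.add(line.split("{")[0].split(" ")[0])
    return names

# Possible usage inside the loop above:
#     assert any(name.startswith("ray_")
#                for name in fetch_prometheus_metric_names(metrics_export_port))
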
Example #4
def test_logs_clean_up(
    enable_test_module, disable_aiohttp_cache, ray_start_cluster_head
):
    """Check if logs from the dead pids are GC'ed."""
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    ray.get(la.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip}
        )
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert la_pid in node_logs["data"]["logs"]

    assert wait_until_succeeded_without_exception(
        check_logs, (AssertionError,), timeout_ms=1000
    )
    ray.kill(la)

    def check_logs_not_exist():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip}
        )
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert la_pid not in node_logs["data"]["logs"]

    assert wait_until_succeeded_without_exception(
        check_logs_not_exist, (AssertionError,), timeout_ms=10000
    )
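
The node_logs checks in this and the following examples all read the same response layout. Reconstructed from the assertions, it is assumed to look roughly like this, with worker pids as string keys and lists of captured stdout lines as values:

example_node_logs_response = {
    "result": True,
    "data": {
        "logs": {
            "12345": ["On number 0", "On number 1"],  # keyed by worker pid
            # ...one entry per worker that has printed output
        },
    },
}
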
Example #5
def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(f"{webui_url}/node_logs",
                                          params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert type(node_logs["data"]["logs"]) is dict
        assert all(pid in node_logs["data"]["logs"]
                   for pid in (la_pid, la2_pid))
        assert len(node_logs["data"]["logs"][la2_pid]) == 1

        actor_one_logs_response = requests.get(f"{webui_url}/node_logs",
                                               params={
                                                   "ip": node_ip,
                                                   "pid": str(la_pid)
                                               })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert type(actor_one_logs["data"]["logs"]) is dict
        assert len(actor_one_logs["data"]["logs"][la_pid]) == 4

    wait_until_succeeded_without_exception(check_logs, (AssertionError, ),
                                           timeout_ms=1000)
Example #6
def test_get_cluster_status(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    # Check that the cluster_status endpoint responds even before the
    # underlying data has been written to the GCS; the autoscaler fields
    # should simply be empty.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        print(response.json())
        assert response.json()["result"]
        assert "autoscalingStatus" in response.json()["data"]
        assert "autoscalingError" in response.json()["data"]
        assert response.json()["data"]["autoscalingError"] is None
        assert "clusterStatus" in response.json()["data"]
        assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(host=address[0],
                               port=int(address[1]),
                               password=ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_client = ray._private.gcs_utils.GcsClient.create_from_redis(client)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    ray.experimental.internal_kv._internal_kv_put(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "hello")
    ray.experimental.internal_kv._internal_kv_put(DEBUG_AUTOSCALING_ERROR,
                                                  "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    assert response.json()["result"]
    assert "autoscalingStatus" in response.json()["data"]
    assert response.json()["data"]["autoscalingStatus"] == "hello"
    assert "autoscalingError" in response.json()["data"]
    assert response.json()["data"]["autoscalingError"] == "world"
    assert "clusterStatus" in response.json()["data"]
    assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]
Example #7
def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor:
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    ea_pid = str(ray.get(ea.get_pid.remote()))
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(f"{webui_url}/node_logs",
                                          params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        assert "errors" in node_errs["data"]
        assert type(node_errs["data"]["errors"]) is dict
        assert ea_pid in node_errs["data"]["errors"]
        assert len(node_errs["data"]["errors"][ea_pid]) == 1

        actor_err_response = requests.get(f"{webui_url}/node_logs",
                                          params={
                                              "ip": node_ip,
                                              "pid": str(ea_pid)
                                          })
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        assert type(actor_errs["data"]["errors"]) is dict
        # Only one call to go(), so exactly one error is recorded.
        assert len(actor_errs["data"]["errors"][ea_pid]) == 1

    wait_until_succeeded_without_exception(check_errs, (AssertionError, ),
                                           timeout_ms=1000)
Example #8
def test_logs_max_count(enable_test_module, disable_aiohttp_cache,
                        ray_start_cluster_head):
    """Test that each Ray worker cannot cache more than 1000 logs at a time.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    ray.get(la.go.remote(MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD))

    def check_logs():
        node_logs_response = requests.get(f"{webui_url}/node_logs",
                                          params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert type(node_logs["data"]["logs"]) is dict
        assert la_pid in node_logs["data"]["logs"]
        log_lengths = len(node_logs["data"]["logs"][la_pid])
        assert log_lengths >= MAX_LOGS_TO_CACHE
        assert log_lengths <= MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD

        actor_one_logs_response = requests.get(f"{webui_url}/node_logs",
                                               params={
                                                   "ip": node_ip,
                                                   "pid": str(la_pid)
                                               })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert type(actor_one_logs["data"]["logs"]) is dict
        log_lengths = len(actor_one_logs["data"]["logs"][la_pid])
        assert log_lengths >= MAX_LOGS_TO_CACHE
        assert log_lengths <= MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD

    assert wait_until_succeeded_without_exception(check_logs,
                                                  (AssertionError, ),
                                                  timeout_ms=10000)
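
The bounds asserted above are consistent with a simple prune policy: append lines until a worker's buffer exceeds MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD, then drop the oldest lines so that MAX_LOGS_TO_CACHE remain. The sketch below illustrates that assumed policy only; it is not the dashboard's actual implementation, and the 1.25 default is an assumption.

def append_log_lines(buffer, new_lines, max_logs=1000, prune_threshold=1.25):
    # Append the new lines; once the buffer grows past
    # max_logs * prune_threshold, discard the oldest entries so that exactly
    # max_logs remain. After at least max_logs lines have been appended, the
    # cached count therefore stays within
    # [max_logs, max_logs * prune_threshold], the range the test asserts.
    buffer.extend(new_lines)
    if len(buffer) > max_logs * prune_threshold:
        del buffer[:len(buffer) - max_logs]
    return buffer
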
Example #9
def test_get_cluster_status(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    # Check that the cluster_status endpoint responds even before the
    # underlying data has been written to the GCS; the autoscaler fields
    # should simply be empty.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        print(response.json())
        assert response.json()["result"]
        assert "autoscalingStatus" in response.json()["data"]
        assert "autoscalingError" in response.json()["data"]
        assert response.json()["data"]["autoscalingError"] is None
        assert "clusterStatus" in response.json()["data"]
        assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    gcs_client = make_gcs_client(address_info)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    ray.experimental.internal_kv._internal_kv_put(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "hello")
    ray.experimental.internal_kv._internal_kv_put(DEBUG_AUTOSCALING_ERROR,
                                                  "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    assert response.json()["result"]
    assert "autoscalingStatus" in response.json()["data"]
    assert response.json()["data"]["autoscalingStatus"] == "hello"
    assert "autoscalingError" in response.json()["data"]
    assert response.json()["data"]["autoscalingError"] == "world"
    assert "clusterStatus" in response.json()["data"]
    assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]