Example #1
def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    job = _prepare_job_for_test(webui_url)
    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_id = None
    job_submitted = False

    def _check_running():
        nonlocal job_id
        nonlocal job_submitted
        if not job_submitted:
            resp = requests.post(f"{webui_url}/jobs", json=job)
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True, resp.text
            job_submitted = True

        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        summary = result["data"]["summary"]
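        # Two jobs are expected: one for the test driver and one for the
        # job submitted above.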
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id when POST /jobs
        # The larger job id is the one we submitted.
        job_ids = sorted(s["jobId"] for s in summary)
        job_id = job_ids[1]

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_info = result["data"]["detail"]["jobInfo"]
        assert job_info["jobId"] == job_id

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_info = result["data"]["detail"]["jobInfo"]
        assert job_info["isDead"] is False
        job_actors = result["data"]["detail"]["jobActors"]
        job_workers = result["data"]["detail"]["jobWorkers"]
        assert len(job_actors) > 0
        assert len(job_workers) > 0

    wait_until_succeeded_without_exception(
        _check_running,
        exceptions=(AssertionError, KeyError, IndexError),
        timeout_ms=30 * 1000,
        raise_last_ex=True)
Example #2
def test_multi_nodes_info(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    cluster: Cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    cluster.add_node()
    cluster.add_node()

    def _check_nodes():
        try:
            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            summary = summary["data"]["summary"]
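            # The head node plus the two worker nodes added above.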
            assert len(summary) == 3
            for node_info in summary:
                node_id = node_info["raylet"]["nodeId"]
                response = requests.get(webui_url + f"/nodes/{node_id}")
                response.raise_for_status()
                detail = response.json()
                assert detail["result"] is True, detail["msg"]
                detail = detail["data"]["detail"]
                assert detail["raylet"]["state"] == "ALIVE"
            response = requests.get(webui_url + "/test/dump?key=agents")
            response.raise_for_status()
            agents = response.json()
            assert len(agents["data"]["agents"]) == 3
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_nodes, timeout=15)
Example #3
def test_kill_actor(ray_start_with_dashboard):
    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa

    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    def actor_killed(pid):
        """Check For the existence of a unix pid."""
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        else:
            return False

    def get_actor():
        resp = requests.get(f"{webui_url}/logical/actor_groups")
        resp.raise_for_status()
        actor_groups_resp = resp.json()
        assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
        actor_groups = actor_groups_resp["data"]["actorGroups"]
        actor = actor_groups["Actor"]["entries"][0]
        return actor

    def kill_actor_using_dashboard(actor):
        resp = requests.get(
            webui_url + "/logical/kill_actor",
            params={
                "actorId": actor["actorId"],
                "ipAddress": actor["ipAddress"],
                "port": actor["port"],
            },
        )
        resp.raise_for_status()
        resp_json = resp.json()
        assert resp_json["result"] is True, "msg" in resp_json

    start = time.time()
    last_exc = None
    while time.time() - start <= 10:
        try:
            actor = get_actor()
            kill_actor_using_dashboard(actor)
            last_exc = None
            break
        except (KeyError, AssertionError) as e:
            last_exc = e
            time.sleep(0.1)
    assert last_exc is None
Example #4
def test_profiling(shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    c = Actor.remote()
    actor_pid = ray.get(c.getpid.remote())

    webui_url = addresses["webui_url"]
    assert (wait_until_server_available(webui_url) is True)
    webui_url = format_web_url(webui_url)

    start_time = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        if time.time() - start_time > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}")
        launch_profiling = requests.get(
            webui_url + "/api/launch_profiling",
            params={
                "ip": ray.nodes()[0]["NodeManagerAddress"],
                "pid": actor_pid,
                "duration": 5
            }).json()
        if launch_profiling["result"]:
            profiling_info = launch_profiling["data"]["profilingInfo"]
            break
        time.sleep(1)
    logger.info(profiling_info)
Example #5
def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    address = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    entrypoint_cmd = ("python -c\""
                      "import ray;"
                      "ray.init();"
                      "import time;"
                      "time.sleep(5);"
                      "import sys;"
                      "sys.exit(1);"
                      "\"")
    client = JobSubmissionClient(address)
    job_id = client.submit_job(entrypoint=entrypoint_cmd)

    def wait_for_job_to_fail():
        data = _get_snapshot(address)
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert job_entry["statusMessage"] is not None
                return job_entry["status"] == "FAILED"

        return False

    wait_for_condition(wait_for_job_to_fail, timeout=30)
Example #6
def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    entrypoint_cmd = ('python -c"'
                      "import ray;"
                      "ray.init();"
                      "import time;"
                      f"time.sleep({job_sleep_time_s});"
                      "import sys;"
                      "sys.exit(1);"
                      '"')
    start_time_s = int(time.time())
    client = JobSubmissionClient(address)
    runtime_env = {"env_vars": {"RAY_TEST_456": "456"}}
    metadata = {"ray_test_789": "789"}
    job_id = client.submit_job(entrypoint=entrypoint_cmd,
                               metadata=metadata,
                               runtime_env=runtime_env)

    def wait_for_job_to_fail():
        data = _get_snapshot(address)

        legacy_job_failed = False
        job_failed = False

        # Test legacy job snapshot (one driver per job).
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert job_entry["statusMessage"] is not None
                legacy_job_failed = job_entry["status"] == "FAILED"

        # Test new jobs snapshot (0 to N drivers per job).
        for job_submission_id, entry in data["data"]["snapshot"][
                "jobSubmission"].items():
            if entry["status"] is not None:
                assert entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert entry["message"] is not None
                # TODO(architkulkarni): Disable automatic camelcase.
                assert entry["runtimeEnv"] == {
                    "envVars": {
                        "RAYTest456": "456"
                    }
                }
                assert entry["metadata"] == {"rayTest789": "789"}
                assert entry["errorType"] is None
                assert abs(entry["startTime"] - start_time_s) <= 2
                if entry["status"] == "FAILED":
                    job_failed = True
                    assert entry[
                        "endTime"] >= entry["startTime"] + job_sleep_time_s
        return legacy_job_failed and job_failed

    wait_for_condition(wait_for_job_to_fail, timeout=10)
Example #7
def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only):
    address_info = ray.init(num_cpus=1, include_dashboard=True)
    assert (wait_until_server_available(address_info["webui_url"]) is True)

    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/test/dump",
                                    proxies={
                                        "http": None,
                                        "https": None
                                    })
            response.raise_for_status()
            try:
                response.json()
                assert response.ok
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")
Example #8
def test_log_proxy(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            # Test range request.
            response = requests.get(
                f"{webui_url}/log_proxy?url={webui_url}/logs/dashboard.log",
                headers={"Range": "bytes=44-52"})
            response.raise_for_status()
            assert response.text == "Dashboard"
            # Test 404.
            response = requests.get(f"{webui_url}/log_proxy?"
                                    f"url={webui_url}/logs/not_exist_file.log")
            assert response.status_code == 404
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example #9
def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    results = ray.get([actor.get_obj.remote() for actor in actors])  # noqa
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    resp = requests.get(webui_url + "/memory/set_fetch", params={"shouldFetch": "true"})
    resp.raise_for_status()

    def check_mem_table():
        resp = requests.get(f"{webui_url}/memory/memory_table")
        resp_data = resp.json()
        assert resp_data["result"]
        latest_memory_table = resp_data["data"]["memoryTable"]
        summary = latest_memory_table["summary"]
        # 1 ref per handle and per object the actor has a ref to
        assert summary["totalActorHandles"] == len(actors) * 2
    # 1 local ref for my_obj plus 1 ref for self.obj_ref in each of
    # the 2 actors.
        assert summary["totalLocalRefCount"] == 3

    assert wait_until_succeeded_without_exception(
        check_mem_table, (AssertionError,), timeout_ms=10000
    )
Example #10
def test_event_message_limit(small_event_line_limit, disable_aiohttp_cache,
                             ray_start_with_dashboard):
    event_read_line_length_limit = small_event_line_limit
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    session_dir = ray_start_with_dashboard["session_dir"]
    event_dir = os.path.join(session_dir, "logs", "events")
    job_id = ray.JobID.from_int(100).hex()
    events = []
    # Sample events whose serialized length exactly equals the limit.
    sample_event = _get_event("", job_id=job_id)
    message_len = event_read_line_length_limit - len(json.dumps(sample_event))
    for i in range(10):
        sample_event = copy.deepcopy(sample_event)
        sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
        sample_event["message"] = str(i) * message_len
        assert len(json.dumps(sample_event)) == event_read_line_length_limit
        events.append(sample_event)
    # Sample event longer than limit.
    sample_event = copy.deepcopy(sample_event)
    sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
    sample_event["message"] = "2" * (message_len + 1)
    assert len(json.dumps(sample_event)) > event_read_line_length_limit
    events.append(sample_event)

    for i in range(event_consts.EVENT_READ_LINE_COUNT_LIMIT):
        events.append(_get_event(str(i), job_id=job_id))

    with open(os.path.join(event_dir, "tmp.log"), "w") as f:
        f.writelines([(json.dumps(e) + "\n") for e in events])

    try:
        os.remove(os.path.join(event_dir, "event_GCS.log"))
    except Exception:
        pass
    os.rename(os.path.join(event_dir, "tmp.log"),
              os.path.join(event_dir, "event_GCS.log"))

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            result = resp.json()
            all_events = result["data"]["events"]
            assert (len(all_events[job_id]) >=
                    event_consts.EVENT_READ_LINE_COUNT_LIMIT + 10)
            messages = [e["message"] for e in all_events[job_id]]
            for i in range(10):
                assert str(i) * message_len in messages
            assert "2" * (message_len + 1) not in messages
            assert str(event_consts.EVENT_READ_LINE_COUNT_LIMIT -
                       1) in messages
            return True
        except Exception as ex:
            logger.exception(ex)
            return False

    wait_for_condition(_check_events, timeout=15)
Example #11
def test_snapshot(ray_start_with_dashboard):
    driver_template = """
import ray

ray.init(address="{address}", namespace="my_namespace")

@ray.remote
class Pinger:
    def ping(self):
        return "pong"

a = Pinger.options(lifetime={lifetime}, name={name}).remote()
ray.get(a.ping.remote())
    """

    detached_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="'detached'",
        name="'abc'")
    named_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="'xyz'")
    unnamed_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="None")

    run_string_as_driver(detached_driver)
    run_string_as_driver(named_driver)
    run_string_as_driver(unnamed_driver)

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(os.path.dirname(dashboard.__file__),
                               "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))

    assert len(data["data"]["snapshot"]["actors"]) == 3
    assert len(data["data"]["snapshot"]["jobs"]) == 4
    assert len(data["data"]["snapshot"]["deployments"]) == 0

    for actor_id, entry in data["data"]["snapshot"]["actors"].items():
        assert entry["jobId"] in data["data"]["snapshot"]["jobs"]
        assert entry["actorClass"] == "Pinger"
        assert entry["startTime"] >= 0
        if entry["isDetached"]:
            assert entry["endTime"] == 0, entry
        else:
            assert entry["endTime"] > 0, entry
        assert "runtimeEnv" in entry
    assert data["data"]["snapshot"]["rayCommit"] == ray.__commit__
    assert data["data"]["snapshot"]["rayVersion"] == ray.__version__
Example #12
def test_logs_max_count(enable_test_module, disable_aiohttp_cache,
                        ray_start_cluster_head):
    """Test that each Ray worker cannot cache more than 1000 logs at a time.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
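    # Print more log lines than the per-worker cache limit so that pruning
    # is exercised.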
    ray.get(la.go.remote(MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD))

    def check_logs():
        node_logs_response = requests.get(f"{webui_url}/node_logs",
                                          params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert type(node_logs["data"]["logs"]) is dict
        assert la_pid in node_logs["data"]["logs"]
        log_lengths = len(node_logs["data"]["logs"][la_pid])
        assert log_lengths >= MAX_LOGS_TO_CACHE
        assert log_lengths <= MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD

        actor_one_logs_response = requests.get(f"{webui_url}/node_logs",
                                               params={
                                                   "ip": node_ip,
                                                   "pid": str(la_pid)
                                               })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert type(actor_one_logs["data"]["logs"]) is dict
        log_lengths = len(actor_one_logs["data"]["logs"][la_pid])
        assert log_lengths >= MAX_LOGS_TO_CACHE
        assert log_lengths <= MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD

    assert wait_until_succeeded_without_exception(check_logs,
                                                  (AssertionError, ),
                                                  timeout_ms=10000)
Example #13
def test_actors(disable_aiohttp_cache, ray_start_with_dashboard):
    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [actor.do_task.remote() for actor in foo_actors]  # noqa
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            resp = requests.get(f"{webui_url}/logical/actors")
            resp_json = resp.json()
            resp_data = resp_json["data"]
            actors = resp_data["actors"]
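            # Two Foo actors plus the InfeasibleActor.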
            assert len(actors) == 3
            one_entry = list(actors.values())[0]
            assert "jobId" in one_entry
            assert "taskSpec" in one_entry
            assert "functionDescriptor" in one_entry["taskSpec"]
            assert type(one_entry["taskSpec"]["functionDescriptor"]) is dict
            assert "address" in one_entry
            assert type(one_entry["address"]) is dict
            assert "state" in one_entry
            assert "name" in one_entry
            assert "numRestarts" in one_entry
            assert "pid" in one_entry
            all_pids = {entry["pid"] for entry in actors.values()}
            assert 0 in all_pids  # The infeasible actor
            assert len(all_pids) > 1
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example #14
def test_get_all_node_details(disable_aiohttp_cache, ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])

    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            print("I also log a line")
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    timeout_seconds = 20
    start_time = time.time()
    last_ex = None

    def check_node_details():
        resp = requests.get(f"{webui_url}/nodes?view=details")
        resp_json = resp.json()
        resp_data = resp_json["data"]
        clients = resp_data["clients"]
        node = clients[0]
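        # A single-node cluster, so exactly one client entry is expected.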
        assert len(clients) == 1
        assert len(node.get("actors")) == 2
        # Workers information should be in the detailed payload
        assert "workers" in node
        assert "logCount" in node
        # Two lines printed by ActorWithObjs
        assert node["logCount"] >= 2
        print(node["workers"])
        assert len(node["workers"]) == 2
        assert node["workers"][0]["logCount"] == 1

    while True:
        time.sleep(1)
        try:
            check_node_details()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (
                    traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__
                    )
                    if last_ex
                    else []
                )
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example #15
def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(f"{webui_url}/node_logs",
                                          params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert type(node_logs["data"]["logs"]) is dict
        assert all(pid in node_logs["data"]["logs"]
                   for pid in (la_pid, la2_pid))
        assert len(node_logs["data"]["logs"][la2_pid]) == 1

        actor_one_logs_response = requests.get(f"{webui_url}/node_logs",
                                               params={
                                                   "ip": node_ip,
                                                   "pid": str(la_pid)
                                               })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert type(actor_one_logs["data"]["logs"]) is dict
        assert len(actor_one_logs["data"]["logs"][la_pid]) == 4

    assert wait_until_succeeded_without_exception(check_logs,
                                                  (AssertionError, ),
                                                  timeout_ms=1000)
Example #16
def test_logs_clean_up(
    enable_test_module, disable_aiohttp_cache, ray_start_cluster_head
):
    """Check if logs from the dead pids are GC'ed."""
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    ray.get(la.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip}
        )
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert la_pid in node_logs["data"]["logs"]

    assert wait_until_succeeded_without_exception(
        check_logs, (AssertionError,), timeout_ms=1000
    )
    ray.kill(la)

    def check_logs_not_exist():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip}
        )
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert la_pid not in node_logs["data"]["logs"]

    assert wait_until_succeeded_without_exception(
        check_logs_not_exist, (AssertionError,), timeout_ms=10000
    )
Example #17
def test_logs_experimental_list(ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    # Test that the logs listing is comprehensive.
    response = requests.get(webui_url + "/api/experimental/logs/list")
    response.raise_for_status()
    logs = json.loads(response.text)
    assert len(logs) == 1
    node_id = next(iter(logs))

    # test worker logs
    outs = logs[node_id]["worker_outs"]
    errs = logs[node_id]["worker_errors"]
    core_worker_logs = logs[node_id]["python_core_worker_logs"]

    assert len(outs) == len(errs) == len(core_worker_logs)
    assert len(outs) > 0

    for file in ["debug_state_gcs.txt", "gcs_server.out", "gcs_server.err"]:
        assert file in logs[node_id]["gcs_server"]
    for file in ["raylet.out", "raylet.err"]:
        assert file in logs[node_id]["raylet"]
    for file in ["dashboard_agent.log", "dashboard.log"]:
        assert file in logs[node_id]["dashboard"]

    # Test that logs/list can be filtered
    response = requests.get(webui_url + "/api/experimental/logs/list?filters=gcs")
    response.raise_for_status()
    logs = json.loads(response.text)
    assert len(logs) == 1
    node_id = next(iter(logs))
    assert "gcs_server" in logs[node_id]
    for category in logs[node_id]:
        if category != "gcs_server":
            assert len(logs[node_id][category]) == 0

    response = requests.get(webui_url + "/api/experimental/logs/list?filters=worker")
    response.raise_for_status()
    logs = json.loads(response.text)
    assert len(logs) == 1
    node_id = next(iter(logs))
    worker_log_categories = [
        "python_core_worker_logs",
        "worker_outs",
        "worker_errors",
    ]
    assert all([cat in logs[node_id] for cat in worker_log_categories])
    for category in logs[node_id]:
        if category not in worker_log_categories:
            assert len(logs[node_id][category]) == 0
Example #18
def test_event_basic(disable_aiohttp_cache, ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    session_dir = ray_start_with_dashboard["session_dir"]
    event_dir = os.path.join(session_dir, "logs", "events")
    job_id = ray.JobID.from_int(100).hex()

    source_type_gcs = event_pb2.Event.SourceType.Name(event_pb2.Event.GCS)
    source_type_raylet = event_pb2.Event.SourceType.Name(
        event_pb2.Event.RAYLET)
    test_count = 20

    for source_type in [source_type_gcs, source_type_raylet]:
        test_log_file = os.path.join(event_dir, f"event_{source_type}.log")
        test_logger = _test_logger(
            __name__ + str(random.random()),
            test_log_file,
            max_bytes=2000,
            backup_count=1000,
        )
        for i in range(test_count):
            sample_event = _get_event(str(i),
                                      job_id=job_id,
                                      source_type=source_type)
            test_logger.info("%s", json.dumps(sample_event))

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            result = resp.json()
            all_events = result["data"]["events"]
            job_events = all_events[job_id]
            assert len(job_events) >= test_count * 2
            source_messages = {}
            for e in job_events:
                source_type = e["sourceType"]
                message = e["message"]
                source_messages.setdefault(source_type, set()).add(message)
            assert len(source_messages[source_type_gcs]) >= test_count
            assert len(source_messages[source_type_raylet]) >= test_count
            data = {str(i) for i in range(test_count)}
            assert data & source_messages[source_type_gcs] == data
            assert data & source_messages[source_type_raylet] == data
            return True
        except Exception as ex:
            logger.exception(ex)
            return False

    wait_for_condition(_check_events, timeout=15)
Example #19
def test_get_cluster_status(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        print(response.json())
        assert response.json()["result"]
        assert "autoscalingStatus" in response.json()["data"]
        assert "autoscalingError" in response.json()["data"]
        assert response.json()["data"]["autoscalingError"] is None
        assert "clusterStatus" in response.json()["data"]
        assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(host=address[0],
                               port=int(address[1]),
                               password=ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_client = ray._private.gcs_utils.GcsClient.create_from_redis(client)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    ray.experimental.internal_kv._internal_kv_put(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "hello")
    ray.experimental.internal_kv._internal_kv_put(DEBUG_AUTOSCALING_ERROR,
                                                  "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    assert response.json()["result"]
    assert "autoscalingStatus" in response.json()["data"]
    assert response.json()["data"]["autoscalingStatus"] == "hello"
    assert "autoscalingError" in response.json()["data"]
    assert response.json()["data"]["autoscalingError"] == "world"
    assert "clusterStatus" in response.json()["data"]
    assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]
Example #20
def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor():
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    ea_pid = str(ray.get(ea.get_pid.remote()))
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(f"{webui_url}/node_logs",
                                          params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        assert "errors" in node_errs["data"]
        assert type(node_errs["data"]["errors"]) is dict
        assert ea_pid in node_errs["data"]["errors"]
        assert len(node_errs["data"]["errors"][ea_pid]) == 1

        actor_err_response = requests.get(f"{webui_url}/node_logs",
                                          params={
                                              "ip": node_ip,
                                              "pid": str(ea_pid)
                                          })
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        assert type(actor_errs["data"]["errors"]) is dict
        assert len(actor_errs["data"]["errors"][ea_pid]) == 4

    wait_until_succeeded_without_exception(check_errs, (AssertionError, ),
                                           timeout_ms=1000)
Example #21
def test_nodes_update(enable_test_module, ray_start_with_dashboard):
    assert wait_until_server_available(
        ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/test/dump")
            response.raise_for_status()
            try:
                dump_info = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert dump_info["result"] is True
            dump_data = dump_info["data"]
            assert len(dump_data["nodes"]) == 1
            assert len(dump_data["agents"]) == 1
            assert len(dump_data["nodeIdToIp"]) == 1
            assert len(dump_data["nodeIdToHostname"]) == 1
            assert dump_data["nodes"].keys(
            ) == dump_data["nodeIdToHostname"].keys()

            response = requests.get(webui_url + "/test/notified_agents")
            response.raise_for_status()
            try:
                notified_agents = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert notified_agents["result"] is True
            notified_agents = notified_agents["data"]
            assert len(notified_agents) == 1
            assert notified_agents == dump_data["agents"]
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")
Example #22
def test_http_get(enable_test_module, ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    target_url = webui_url + "/test/dump"

    timeout_seconds = 30
    start_time = time.time()
    while True:
        time.sleep(3)
        try:
            response = requests.get(webui_url + "/test/http_get?url=" +
                                    target_url)
            response.raise_for_status()
            try:
                dump_info = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert dump_info["result"] is True
            dump_data = dump_info["data"]
            assert len(dump_data["agents"]) == 1
            node_id, ports = next(iter(dump_data["agents"].items()))
            ip = ray_start_with_dashboard["node_ip_address"]
            http_port, grpc_port = ports

            response = requests.get(
                f"http://{ip}:{http_port}"
                f"/test/http_get_from_agent?url={target_url}")
            response.raise_for_status()
            try:
                dump_info = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert dump_info["result"] is True
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")
Example #23
def test_log_proxy(ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    test_log_text = "test_log_text"
    test_file = "test.log"
    with open(
        f"{ray.worker.global_worker.node.get_logs_dir_path()}/{test_file}", "w"
    ) as f:
        f.write(test_log_text)
    while True:
        time.sleep(1)
        try:
            # Test range request.
            response = requests.get(
                f"{webui_url}/log_proxy?url={webui_url}/logs/{test_file}",
                headers={"Range": "bytes=2-5"},
            )
            response.raise_for_status()
            assert response.text == test_log_text[2:6]
            # Test 404.
            response = requests.get(
                f"{webui_url}/log_proxy?" f"url={webui_url}/logs/not_exist_file.log"
            )
            assert response.status_code == 404
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (
                    traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__
                    )
                    if last_ex
                    else []
                )
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example #24
def test_active_component_activities(ray_start_with_dashboard):
    # Verify that drivers whose namespace does not start with
    # "_ray_internal_job_info_" are considered active.

    driver_template = """
import ray

ray.init(address="auto", namespace="{namespace}")
    """
    run_string_as_driver_nonblocking(
        driver_template.format(namespace="my_namespace"))
    run_string_as_driver_nonblocking(
        driver_template.format(namespace="my_namespace"))
    run_string_as_driver_nonblocking(
        driver_template.format(namespace="_ray_internal_job_info_id1"))

    # Wait 1 sec for drivers to start
    time.sleep(1)

    # Verify drivers are considered active after running script
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/component_activities")
    response.raise_for_status()

    # Validate schema of response
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/component_activities_schema.json",
    )
    pprint.pprint(data)
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))

    # Validate ray_activity_response field can be cast to RayActivityResponse object
    driver_ray_activity_response = RayActivityResponse(**data["driver"])

    assert driver_ray_activity_response.is_active == "ACTIVE"
    # Drivers with a namespace starting with "_ray_internal_job_info_" are
    # not counted as active. The three active drivers are the two started
    # with namespace "my_namespace" and the one started by
    # ray_start_with_dashboard.
    assert driver_ray_activity_response.reason == "Number of active drivers: 3"
Example #25
def test_temporary_uri_reference(monkeypatch, expiration_s):
    """Test that temporary GCS URI references are deleted after expiration_s."""
    monkeypatch.setenv("RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S",
                       str(expiration_s))
    # We can't use a fixture with a shared Ray runtime because we need to set the
    # expiration_s env var before Ray starts.
    with _ray_start(include_dashboard=True, num_cpus=1) as ctx:
        headers = {
            "Connection": "keep-alive",
            "Authorization": "TOK:<MY_TOKEN>"
        }
        address = ctx.address_info["webui_url"]
        assert wait_until_server_available(address)
        client = JobSubmissionClient(format_web_url(address), headers=headers)
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir)

            hello_file = path / "hi.txt"
            with hello_file.open(mode="w") as f:
                f.write("hi\n")

            start = time.time()

            client.submit_job(entrypoint="echo hi",
                              runtime_env={"working_dir": tmp_dir})

            # Give time for deletion to occur if expiration_s is 0.
            time.sleep(2)
            # Need to connect to Ray to check internal_kv.
            # ray.init(address="auto")

            print("Starting Internal KV checks at time ", time.time() - start)
            if expiration_s > 0:
                assert not check_internal_kv_gced()
                wait_for_condition(check_internal_kv_gced,
                                   timeout=2 * expiration_s)
                assert expiration_s < time.time() - start < 2 * expiration_s
                print("Internal KV was GC'ed at time ", time.time() - start)
            else:
                wait_for_condition(check_internal_kv_gced)
                print("Internal KV was GC'ed at time ", time.time() - start)
Example #26
def test_multi_node_churn(
    enable_test_module, disable_aiohttp_cache, ray_start_cluster_head
):
    cluster: Cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)

    def cluster_chaos_monkey():
        worker_nodes = []
        while True:
            time.sleep(5)
            if len(worker_nodes) < 2:
                worker_nodes.append(cluster.add_node())
                continue
            should_add_node = random.randint(0, 1)
            if should_add_node:
                worker_nodes.append(cluster.add_node())
            else:
                node_index = random.randrange(0, len(worker_nodes))
                node_to_remove = worker_nodes.pop(node_index)
                cluster.remove_node(node_to_remove)

    def get_index():
        resp = requests.get(webui_url)
        resp.raise_for_status()

    def get_nodes():
        resp = requests.get(webui_url + "/nodes?view=summary")
        resp.raise_for_status()
        summary = resp.json()
        assert summary["result"] is True, summary["msg"]
        assert summary["data"]["summary"]

    t = threading.Thread(target=cluster_chaos_monkey, daemon=True)
    t.start()

    t_st = datetime.now()
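    # Poll the dashboard index for about a minute while the chaos monkey
    # adds and removes nodes in the background.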
    duration = timedelta(seconds=60)
    while datetime.now() < t_st + duration:
        get_index()
        time.sleep(2)
Example #27
def test_node_physical_stats(enable_test_module, shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = ray.get([actor.getpid.remote() for actor in actors])
    actor_pids = set(actor_pids)

    webui_url = addresses["webui_url"]
    assert (wait_until_server_available(webui_url) is True)
    webui_url = format_web_url(webui_url)

    def _check_workers():
        try:
            resp = requests.get(webui_url +
                                "/test/dump?key=node_physical_stats")
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True
            node_physical_stats = result["data"]["nodePhysicalStats"]
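            # Only the single local node should be reported.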
            assert len(node_physical_stats) == 1
            current_stats = node_physical_stats[addresses["node_id"]]
            # Check Actor workers
            current_actor_pids = set()
            for worker in current_stats["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    current_actor_pids.add(worker["pid"])
            assert current_actor_pids == actor_pids
            # Check raylet cmdline
            assert "raylet" in current_stats["cmdline"][0]
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_workers, timeout=10)
Example #28
def test_kill_actor_gcs(ray_start_with_dashboard):
    # Start the dashboard
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    @ray.remote
    class Actor:
        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

        def loop(self):
            while True:
                time.sleep(1)
                print("Looping...")

    # Create an actor
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa
    actor_id = a._ray_actor_id.hex()

    # Kill the actor
    _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False)
    assert _actor_killed_loop(worker_pid)

    # Create an actor and have it loop
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa
    actor_id = a._ray_actor_id.hex()
    a.loop.remote()

    # Try to kill the actor, it should not die since a task is running
    _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False)
    assert not _actor_killed_loop(worker_pid, timeout_secs=1)

    # Force kill the actor
    _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=True)
    assert _actor_killed_loop(worker_pid)
Example #29
def test_get_cluster_status(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        print(response.json())
        assert response.json()["result"]
        assert "autoscalingStatus" in response.json()["data"]
        assert "autoscalingError" in response.json()["data"]
        assert response.json()["data"]["autoscalingError"] is None
        assert "clusterStatus" in response.json()["data"]
        assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    gcs_client = make_gcs_client(address_info)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    ray.experimental.internal_kv._internal_kv_put(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "hello")
    ray.experimental.internal_kv._internal_kv_put(DEBUG_AUTOSCALING_ERROR,
                                                  "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    assert response.json()["result"]
    assert "autoscalingStatus" in response.json()["data"]
    assert response.json()["data"]["autoscalingStatus"] == "hello"
    assert "autoscalingError" in response.json()["data"]
    assert response.json()["data"]["autoscalingError"] == "world"
    assert "clusterStatus" in response.json()["data"]
    assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]
Example #30
def test_nil_node(enable_test_module, disable_aiohttp_cache,
                  ray_start_with_dashboard):
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    infeasible_actor = InfeasibleActor.remote()  # noqa

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            resp = requests.get(f"{webui_url}/logical/actors")
            resp_json = resp.json()
            resp_data = resp_json["data"]
            actors = resp_data["actors"]
            assert len(actors) == 1
            response = requests.get(webui_url + "/test/dump?key=node_actors")
            response.raise_for_status()
            result = response.json()
            assert actor_consts.NIL_NODE_ID not in result["data"]["nodeActors"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (traceback.format_exception(type(last_ex), last_ex,
                                                       last_ex.__traceback__)
                            if last_ex else [])
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")