Example 1
def test_basic(ray_start_with_dashboard):
    """Dashboard test that starts a Ray cluster with a dashboard server running,
    then hits the dashboard API and asserts that it receives sensible data."""
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    node_id = address_info["node_id"]
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(host=address[0],
                               port=int(address[1]),
                               password=ray_constants.REDIS_DEFAULT_PASSWORD)

    all_processes = ray.worker._global_node.all_processes
    assert ray_constants.PROCESS_TYPE_DASHBOARD in all_processes
    assert ray_constants.PROCESS_TYPE_REPORTER not in all_processes
    dashboard_proc_info = all_processes[
        ray_constants.PROCESS_TYPE_DASHBOARD][0]
    dashboard_proc = psutil.Process(dashboard_proc_info.process.pid)
    assert dashboard_proc.status() in [
        psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING
    ]
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    def _search_agent(processes):
        for p in processes:
            try:
                for c in p.cmdline():
                    if "new_dashboard/agent.py" in c:
                        return p
            except Exception:
                pass

    # Test for bad imports; the agent should be restarted.
    logger.info("Test for bad imports.")
    agent_proc = _search_agent(raylet_proc.children())
    prepare_test_files()
    agent_pids = set()
    try:
        assert agent_proc is not None
        agent_proc.kill()
        agent_proc.wait()
        # The agent will be restarted because of the import failure.
        for x in range(40):
            agent_proc = _search_agent(raylet_proc.children())
            if agent_proc:
                agent_pids.add(agent_proc.pid)
            time.sleep(0.1)
    finally:
        cleanup_test_files()
    assert len(agent_pids) > 1, agent_pids

    agent_proc = _search_agent(raylet_proc.children())
    if agent_proc:
        agent_proc.kill()
        agent_proc.wait()

    logger.info("Test agent register is OK.")
    wait_for_condition(lambda: _search_agent(raylet_proc.children()))
    assert dashboard_proc.status() in [
        psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING
    ]
    agent_proc = _search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid

    # Check that the agent stays registered (pid unchanged).
    for x in range(5):
        logger.info("Check agent is alive.")
        agent_proc = _search_agent(raylet_proc.children())
        assert agent_proc.pid == agent_pid
        time.sleep(1)

    # Check redis keys are set.
    logger.info("Check redis keys are set.")
    dashboard_address = client.get(dashboard_consts.REDIS_KEY_DASHBOARD)
    assert dashboard_address is not None
    dashboard_rpc_address = client.get(
        dashboard_consts.REDIS_KEY_DASHBOARD_RPC)
    assert dashboard_rpc_address is not None
    key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}"
    agent_ports = client.get(key)
    assert agent_ports is not None
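
Several of these tests rely on a wait_for_condition helper imported from Ray's test utilities (used above at the agent-register check, and heavily in Example 5). As a rough sketch only, with the signature, defaults, and timeout units assumed rather than taken from Ray, it polls a predicate until it returns a truthy value:

import time

def wait_for_condition(condition, timeout=10, retry_interval_ms=100):
    """Poll `condition` until it returns a truthy value or `timeout` seconds
    elapse. Simplified stand-in for Ray's test utility; defaults are assumed."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition():
            return True
        time.sleep(retry_interval_ms / 1000.0)
    return False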
Example 2
def test_raylet_info_endpoint(shutdown_only):
    addresses = ray.init(include_webui=True, num_cpus=6)

    @ray.remote
    def f():
        return "test"

    @ray.remote(num_cpus=1)
    class ActorA:
        def __init__(self):
            pass

    @ray.remote(resources={"CustomResource": 1})
    class ActorB:
        def __init__(self):
            pass

    @ray.remote(num_cpus=2)
    class ActorC:
        def __init__(self):
            self.children = [ActorA.remote(), ActorB.remote()]

        def local_store(self):
            self.local_storage = [f.remote() for _ in range(10)]

        def remote_store(self):
            self.remote_storage = ray.put("test")

        def getpid(self):
            return os.getpid()

    c = ActorC.remote()
    actor_pid = ray.get(c.getpid.remote())
    c.local_store.remote()
    c.remote_store.remote()

    assert (wait_until_server_available(addresses["webui_url"]) is True)

    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            webui_url = addresses["webui_url"]
            webui_url = webui_url.replace("localhost", "http://127.0.0.1")
            response = requests.get(webui_url + "/api/raylet_info")
            response.raise_for_status()
            try:
                raylet_info = response.json()
            except Exception as ex:
                print("failed response: {}".format(response.text))
                raise ex
            actor_info = raylet_info["result"]["actors"]
            try:
                assert len(actor_info) == 1
                _, parent_actor_info = actor_info.popitem()
                assert parent_actor_info["numObjectIdsInScope"] == 13
                assert parent_actor_info["numLocalObjects"] == 10
                children = parent_actor_info["children"]
                assert len(children) == 2
                break
            except AssertionError:
                if time.time() > start_time + 30:
                    raise Exception("Timed out while waiting for actor info \
                        or object store info update.")
        except requests.exceptions.ConnectionError:
            if time.time() > start_time + 30:
                raise Exception(
                    "Timed out while waiting for dashboard to start.")

    assert parent_actor_info["usedResources"]["CPU"] == 2
    assert parent_actor_info["numExecutedTasks"] == 4
    for _, child_actor_info in children.items():
        if child_actor_info["state"] == -1:
            assert child_actor_info["requiredResources"]["CustomResource"] == 1
        else:
            assert child_actor_info["state"] == 1
            assert len(child_actor_info["children"]) == 0
            assert child_actor_info["usedResources"]["CPU"] == 1

    profiling_id = requests.get(webui_url + "/api/launch_profiling",
                                params={
                                    "node_id": ray.nodes()[0]["NodeID"],
                                    "pid": actor_pid,
                                    "duration": 5
                                }).json()["result"]
    start_time = time.time()
    while True:
        # Profiling may need some time to start up.
        if time.time() - start_time > 30:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats.")
        profiling_info = requests.get(webui_url +
                                      "/api/check_profiling_status",
                                      params={
                                          "profiling_id": profiling_id,
                                      }).json()
        status = profiling_info["result"]["status"]
        assert status in ("finished", "pending", "error")
        if status in ("finished", "error"):
            break
        time.sleep(1)
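
This example builds the HTTP base URL by hand with webui_url.replace("localhost", "http://127.0.0.1"); the later examples call a format_web_url helper for the same purpose. A minimal sketch of such a helper, assuming only the behavior visible in these tests (the real implementation lives in Ray's test utilities and may differ):

def format_web_url(url):
    # Normalize the dashboard address returned by ray.init() into a full
    # HTTP URL. Assumed behavior; the real helper may differ.
    url = url.replace("localhost", "127.0.0.1")
    if not url.startswith("http://"):
        url = "http://" + url
    return url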
Example 3
def test_node_info(disable_aiohttp_cache, ray_start_with_dashboard):
    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote(), Actor.remote()]
    actor_pids = [actor.getpid.remote() for actor in actors]
    actor_pids = set(ray.get(actor_pids))

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/nodes?view=hostnamelist")
            response.raise_for_status()
            hostname_list = response.json()
            assert hostname_list["result"] is True, hostname_list["msg"]
            hostname_list = hostname_list["data"]["hostNameList"]
            assert len(hostname_list) == 1

            hostname = hostname_list[0]
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            detail = response.json()
            assert detail["result"] is True, detail["msg"]
            detail = detail["data"]["detail"]
            assert detail["hostname"] == hostname
            assert detail["raylet"]["state"] == "ALIVE"
            assert "raylet" in detail["cmdline"][0]
            assert len(detail["workers"]) >= 2
            assert len(detail["actors"]) == 2, detail["actors"]
            assert len(detail["raylet"]["viewData"]) > 0

            actor_worker_pids = set()
            for worker in detail["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    actor_worker_pids.add(worker["pid"])
            assert actor_worker_pids == actor_pids

            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            assert len(summary["data"]["summary"]) == 1
            summary = summary["data"]["summary"][0]
            assert summary["hostname"] == hostname
            assert summary["raylet"]["state"] == "ALIVE"
            assert "raylet" in summary["cmdline"][0]
            assert "workers" not in summary
            assert "actors" not in summary
            assert "viewData" not in summary["raylet"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example 4
def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard):
    timeout = 5
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2

    client = redis.StrictRedis(host=address[0],
                               port=int(address[1]),
                               password=ray_constants.REDIS_DEFAULT_PASSWORD)

    p = client.pubsub(ignore_subscribe_messages=True)
    p.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN)

    @ray.remote
    class DummyActor:
        def __init__(self):
            pass

    # Create a dummy actor.
    a = DummyActor.remote()

    def handle_pub_messages(client, msgs, timeout, expect_num):
        start_time = time.time()
        while time.time() - start_time < timeout and len(msgs) < expect_num:
            msg = client.get_message()
            if msg is None:
                time.sleep(0.01)
                continue
            pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"])
            actor_data = ray.gcs_utils.ActorTableData.FromString(
                pubsub_msg.data)
            msgs.append(actor_data)

    msgs = []
    handle_pub_messages(p, msgs, timeout, 2)

    # Assert we received published actor messages with state
    # DEPENDENCIES_UNREADY and ALIVE.
    assert len(msgs) == 2

    # Kill actor.
    ray.kill(a)
    handle_pub_messages(p, msgs, timeout, 3)

    # Assert we received published actor messages with state DEAD.
    assert len(msgs) == 3

    def actor_table_data_to_dict(message):
        return dashboard_utils.message_to_dict(
            message, {
                "actorId", "parentId", "jobId", "workerId", "rayletId",
                "actorCreationDummyObjectId", "callerId", "taskId",
                "parentTaskId", "sourceActorId", "placementGroupId"
            },
            including_default_value_fields=False)

    non_state_keys = ("actorId", "jobId", "taskSpec")
    for msg in msgs:
        actor_data_dict = actor_table_data_to_dict(msg)
        # DEPENDENCIES_UNREADY is 0, which is dropped from the dict, so we
        # need to check the original message's state value instead.
        if msg.state == 0:
            assert len(actor_data_dict) > 5
            for k in non_state_keys:
                assert k in actor_data_dict
        # For states other than DEPENDENCIES_UNREADY, only the state fields
        # are published.
        elif actor_data_dict["state"] in ("ALIVE", "DEAD"):
            assert actor_data_dict.keys() == {
                "state", "address", "timestamp", "pid"
            }
        else:
            raise Exception("Unknown state: {}".format(
                actor_data_dict["state"]))
Example 5
def test_memory_dashboard(shutdown_only):
    """Test Memory table.

    These tests verify examples in this document.
    https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory
    """
    addresses = ray.init(num_cpus=2)
    webui_url = addresses["webui_url"].replace("localhost", "http://127.0.0.1")
    assert (wait_until_server_available(addresses["webui_url"]) is True)

    def get_memory_table():
        memory_table = requests.get(webui_url + "/api/memory_table").json()
        return memory_table["result"]

    def memory_table_ready():
        """Wait until the new fresh memory table is ready."""
        global prev_memory_table
        memory_table = get_memory_table()
        is_ready = memory_table["group"] != prev_memory_table
        prev_memory_table = memory_table["group"]
        return is_ready

    def stop_memory_table():
        requests.get(webui_url + "/api/stop_memory_table").json()

    def test_local_reference():
        @ray.remote
        def f(arg):
            return arg

        # a and b are local references.
        a = ray.put(None)  # Noqa F841
        b = f.remote(None)  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 2
        for table in group.values():
            for entry in table["entries"]:
                assert (
                    entry["reference_type"] == ReferenceType.LOCAL_REFERENCE)
        stop_memory_table()
        return True

    def test_object_pinned_in_memory():
        import numpy as np

        a = ray.put(np.zeros(1))
        b = ray.get(a)  # Noqa F841
        del a

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 1
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 0
        for table in group.values():
            for entry in table["entries"]:
                assert (
                    entry["reference_type"] == ReferenceType.PINNED_IN_MEMORY)
        stop_memory_table()
        return True

    def test_pending_task_references():
        @ray.remote
        def f(arg):
            time.sleep(1)

        a = ray.put(None)  # Noqa F841
        b = f.remote(a)  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 1
        assert summary["total_used_by_pending_task"] == 1
        assert summary["total_local_ref_count"] == 1
        # Make sure the task f has finished before moving on to the next test;
        # otherwise its lingering entry would corrupt the next test's
        # memory table counts.
        ray.get(b)
        stop_memory_table()
        return True

    def test_serialized_object_id_reference():
        @ray.remote
        def f(arg):
            time.sleep(1)

        a = ray.put(None)  # Noqa F841
        b = f.remote([a])  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 1
        assert summary["total_local_ref_count"] == 2
        # Make sure the task f has finished before moving on to the next test;
        # otherwise its lingering entry would corrupt the next test's
        # memory table counts.
        ray.get(b)
        stop_memory_table()
        return True

    def test_captured_object_id_reference():
        a = ray.put(None)
        b = ray.put([a])  # Noqa F841
        del a

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        assert summary["total_captured_in_objects"] == 1
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 1
        stop_memory_table()
        return True

    def test_actor_handle_reference():
        @ray.remote
        class Actor:
            pass

        a = Actor.remote()  # Noqa F841
        b = Actor.remote()  # Noqa F841
        c = Actor.remote()  # Noqa F841

        wait_for_condition(memory_table_ready)
        memory_table = get_memory_table()
        summary = memory_table["summary"]
        group = memory_table["group"]
        assert summary["total_captured_in_objects"] == 0
        assert summary["total_pinned_in_memory"] == 0
        assert summary["total_used_by_pending_task"] == 0
        assert summary["total_local_ref_count"] == 0
        assert summary["total_actor_handles"] == 3
        for table in group.values():
            for entry in table["entries"]:
                assert (entry["reference_type"] == ReferenceType.ACTOR_HANDLE)
        stop_memory_table()
        return True

    # These tests are retried because it takes at least one second to get a
    # fresh memory table: the table is only updated when the raylet and node
    # info are refreshed, which happens about once per second.
    assert (wait_for_condition(
        test_local_reference, timeout=30000, retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_object_pinned_in_memory,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_pending_task_references,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_serialized_object_id_reference,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_captured_object_id_reference,
                               timeout=30000,
                               retry_interval_ms=1000) is True)

    assert (wait_for_condition(test_actor_handle_reference,
                               timeout=30000,
                               retry_interval_ms=1000) is True)
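
Note that memory_table_ready declares `global prev_memory_table`, so this test only runs if the test module defines that variable at import time; something like the following module-level line is assumed to exist alongside the test:

# Assumed module-level state consulted by memory_table_ready(); the original
# test module is expected to define an equivalent before the test runs.
prev_memory_table = ""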
Example 6
def test_aiohttp_cache(enable_test_module, ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    value1_timestamps = []
    while True:
        time.sleep(1)
        try:
            for x in range(10):
                response = requests.get(webui_url +
                                        "/test/aiohttp_cache/t1?value=1")
                response.raise_for_status()
                timestamp = response.json()["data"]["timestamp"]
                value1_timestamps.append(timestamp)
            assert len(collections.Counter(value1_timestamps)) > 1
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")

    sub_path_timestamps = []
    for x in range(10):
        response = requests.get(webui_url +
                                f"/test/aiohttp_cache/tt{x}?value=1")
        response.raise_for_status()
        timestamp = response.json()["data"]["timestamp"]
        sub_path_timestamps.append(timestamp)
    assert len(collections.Counter(sub_path_timestamps)) == 10

    volatile_value_timestamps = []
    for x in range(10):
        response = requests.get(webui_url +
                                f"/test/aiohttp_cache/tt?value={x}")
        response.raise_for_status()
        timestamp = response.json()["data"]["timestamp"]
        volatile_value_timestamps.append(timestamp)
    assert len(collections.Counter(volatile_value_timestamps)) == 10

    response = requests.get(webui_url + "/test/aiohttp_cache/raise_exception")
    response.raise_for_status()
    result = response.json()
    assert result["result"] is False
    assert "KeyError" in result["msg"]

    volatile_value_timestamps = []
    for x in range(10):
        response = requests.get(webui_url +
                                f"/test/aiohttp_cache_lru/tt{x % 4}")
        response.raise_for_status()
        timestamp = response.json()["data"]["timestamp"]
        volatile_value_timestamps.append(timestamp)
    assert len(collections.Counter(volatile_value_timestamps)) == 4

    volatile_value_timestamps = []
    data = collections.defaultdict(set)
    for x in [0, 1, 2, 3, 4, 5, 2, 1, 0, 3]:
        response = requests.get(webui_url +
                                f"/test/aiohttp_cache_lru/t1?value={x}")
        response.raise_for_status()
        timestamp = response.json()["data"]["timestamp"]
        data[x].add(timestamp)
        volatile_value_timestamps.append(timestamp)
    assert len(collections.Counter(volatile_value_timestamps)) == 8
    assert len(data[3]) == 2
    assert len(data[0]) == 2
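
The endpoints exercised here are presumably wrapped in a caching decorator on the dashboard side. The following is only an illustration of the idea the test checks (repeated requests with the same key return the same cached timestamp until the entry expires); it is not Ray's actual aiohttp_cache implementation:

import functools
import time

def ttl_cache(ttl_seconds=2.0):
    # Illustrative TTL memoizer for async functions, keyed on the call
    # arguments. Same key within the TTL -> same cached result.
    def decorator(fn):
        cache = {}

        @functools.wraps(fn)
        async def wrapper(*args, **kwargs):
            key = (args, tuple(sorted(kwargs.items())))
            now = time.time()
            hit = cache.get(key)
            if hit is not None and now - hit[0] < ttl_seconds:
                return hit[1]
            value = await fn(*args, **kwargs)
            cache[key] = (now, value)
            return value
        return wrapper
    return decorator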
Example 7
def test_worker_stats(shutdown_only):
    addresses = ray.init(num_cpus=1, include_webui=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    ray.nodes()[0]["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "conda" in process
                    or "travis" in process or "runner" in process
                    or "ray" in process)
        break

    # Test kill_actor.
    def actor_killed(pid):
        """Return True if no process with the given pid exists."""
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        else:
            return False

    assert (wait_until_server_available(addresses["webui_url"]) is True)

    webui_url = addresses["webui_url"]
    webui_url = webui_url.replace("localhost", "http://127.0.0.1")
    for worker in reply.workers_stats:
        if worker.is_driver:
            continue
        requests.get(webui_url + "/api/kill_actor",
                     params={
                         "actor_id":
                         ray.utils.binary_to_hex(
                             worker.core_worker_stats.actor_id),
                         "ip_address":
                         worker.core_worker_stats.ip_address,
                         "port":
                         worker.core_worker_stats.port
                     })
    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException("Timed out while killing actors")
        if all(
                actor_killed(worker.pid) for worker in reply.workers_stats
                if not worker.is_driver):
            break
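
The actor_killed helper probes liveness with the classic os.kill(pid, 0) trick. An equivalent check using psutil, which these tests already depend on, would be:

import psutil

def actor_killed_via_psutil(pid):
    # True once no process with `pid` exists, i.e. the actor process is gone.
    return not psutil.pid_exists(pid)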
Example 8
def test_get_job_info(disable_aiohttp_cache, ray_start_with_dashboard):
    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actor = Actor.remote()
    actor_pid = ray.get(actor.getpid.remote())
    actor_id = actor._actor_id.hex()

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    ip = ray._private.services.get_node_ip_address()

    def _check():
        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_summary = result["data"]["summary"]
        assert len(job_summary) == 1, resp.text
        one_job = job_summary[0]
        assert "jobId" in one_job
        job_id = one_job["jobId"]
        assert ray._raylet.JobID(hex_to_binary(one_job["jobId"]))
        assert "driverIpAddress" in one_job
        assert one_job["driverIpAddress"] == ip
        assert "driverPid" in one_job
        assert one_job["driverPid"] == str(os.getpid())
        assert "config" in one_job
        assert type(one_job["config"]) is dict
        assert "isDead" in one_job
        assert one_job["isDead"] is False
        assert "timestamp" in one_job
        one_job_summary_keys = one_job.keys()

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_detail = result["data"]["detail"]
        assert "jobInfo" in job_detail
        assert len(one_job_summary_keys - job_detail["jobInfo"].keys()) == 0
        assert "jobActors" in job_detail
        job_actors = job_detail["jobActors"]
        assert len(job_actors) == 1, resp.text
        one_job_actor = job_actors[actor_id]
        assert "taskSpec" in one_job_actor
        assert type(one_job_actor["taskSpec"]) is dict
        assert "functionDescriptor" in one_job_actor["taskSpec"]
        assert type(one_job_actor["taskSpec"]["functionDescriptor"]) is dict
        assert "pid" in one_job_actor
        assert one_job_actor["pid"] == actor_pid
        check_actor_keys = [
            "name", "timestamp", "address", "actorId", "jobId", "state"
        ]
        for k in check_actor_keys:
            assert k in one_job_actor
        assert "jobWorkers" in job_detail
        job_workers = job_detail["jobWorkers"]
        assert len(job_workers) == 1, resp.text
        one_job_worker = job_workers[0]
        check_worker_keys = [
            "cmdline", "pid", "cpuTimes", "memoryInfo", "cpuPercent",
            "coreWorkerStats", "language", "jobId"
        ]
        for k in check_worker_keys:
            assert k in one_job_worker

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            _check()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example 9
def test_actor_groups(ray_start_with_dashboard):
    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [actor.do_task.remote() for actor in foo_actors]  # noqa
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/logical/actor_groups")
            response.raise_for_status()
            actor_groups_resp = response.json()
            assert actor_groups_resp["result"] is True, actor_groups_resp[
                "msg"]
            actor_groups = actor_groups_resp["data"]["actorGroups"]
            assert "Foo" in actor_groups
            summary = actor_groups["Foo"]["summary"]
            # 2 __init__ tasks and 2 do_task tasks
            assert summary["numExecutedTasks"] == 4
            assert summary["stateToCount"]["ALIVE"] == 2

            entries = actor_groups["Foo"]["entries"]
            foo_entry = entries[0]
            assert type(foo_entry["gpus"]) is list
            assert "timestamp" in foo_entry
            assert "actorConstructor" in foo_entry
            assert "actorClass" in foo_entry
            assert "actorId" in foo_entry
            assert "ipAddress" in foo_entry
            assert len(entries) == 2
            assert "InfeasibleActor" in actor_groups

            entries = actor_groups["InfeasibleActor"]["entries"]
            assert "requiredResources" in entries[0]
            assert "GPU" in entries[0]["requiredResources"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example 10
def test_submit_job_validation(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    def _ensure_available_nodes():
        resp = requests.post(f"{webui_url}/jobs")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is False
        return "no nodes available" not in result["msg"]

    wait_for_condition(_ensure_available_nodes, timeout=5)

    # Invalid value.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": "Unsupported",
                             "runtime_env": {
                                 "working_dir": "http://xxx/yyy.zip"
                             },
                             "driver_entry": "python_file_name_without_ext",
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert "language" in msg and "Unsupported" in msg, resp.text

    # Missing required field.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": job_consts.PYTHON,
                             "runtime_env": {
                                 "working_dir": "http://xxx/yyy.zip"
                             },
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert all(p in msg for p in ["missing", "driver_entry"]), resp.text

    # Incorrect value type.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": job_consts.PYTHON,
                             "runtime_env": {
                                 "working_dir": ["http://xxx/yyy.zip"]
                             },
                             "driver_entry": "python_file_name_without_ext",
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert all(p in msg for p in ["working_dir", "str"]), resp.text

    # Invalid key.
    resp = requests.post(f"{webui_url}/jobs",
                         json={
                             "language": job_consts.PYTHON,
                             "runtime_env": {
                                 "working_dir": "http://xxx/yyy.zip"
                             },
                             "driver_entry": "python_file_name_without_ext",
                             "invalid_key": 1,
                         })
    resp.raise_for_status()
    result = resp.json()
    assert result["result"] is False
    msg = result["msg"]
    assert all(p in msg for p in ["unexpected", "invalid_key"]), resp.text
Example 11
def test_log(disable_aiohttp_cache, ray_start_with_dashboard):
    @ray.remote
    def write_log(s):
        print(s)

    test_log_text = "test_log_text"
    ray.get(write_log.remote(test_log_text))
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/log_index")
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)
            all_nodes_log_urls = parser.get_urls()
            assert len(all_nodes_log_urls) == 1

            response = requests.get(all_nodes_log_urls[0])
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)

            # Search test_log_text from all worker logs.
            parsed_url = urllib.parse.urlparse(all_nodes_log_urls[0])
            paths = parser.get_urls()
            urls = []
            for p in paths:
                if "worker" in p:
                    urls.append(parsed_url._replace(path=p).geturl())

            for u in urls:
                response = requests.get(u)
                response.raise_for_status()
                if test_log_text in response.text:
                    break
            else:
                raise Exception(f"Can't find {test_log_text} from {urls}")

            # Test range request.
            response = requests.get(webui_url + "/logs/dashboard.log",
                                    headers={"Range": "bytes=43-51"})
            response.raise_for_status()
            assert response.text == "Dashboard"

            # Test logUrl in node info.
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            node_info = response.json()
            assert node_info["result"] is True
            node_info = node_info["data"]["detail"]
            assert "logUrl" in node_info
            assert node_info["logUrl"] in all_nodes_log_urls
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")