コード例 #1
0
def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    """Exercise the dashboard's ``/memory/memory_table`` endpoint.

    Puts one object from the driver, starts two actors that each put an
    object of their own, switches memory fetching on through
    ``/memory/set_fetch`` and then polls the memory table, checking the
    ``totalActorHandles`` and ``totalLocalRefCount`` summary counters.
    """
    dashboard_address = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(dashboard_address)

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    # The three names below are bound but otherwise unused (hence ``noqa``);
    # presumably they keep the refs alive while the table is inspected.
    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    results = ray.get([actor.get_obj.remote() for actor in actors])  # noqa

    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    fetch_resp = requests.get(
        webui_url + "/memory/set_fetch", params={"shouldFetch": "true"})
    fetch_resp.raise_for_status()

    def check_mem_table():
        table_resp = requests.get(f"{webui_url}/memory/memory_table")
        payload = table_resp.json()
        assert payload["result"]
        summary = payload["data"]["memoryTable"]["summary"]
        # 1 ref per handle and per object the actor has a ref to
        expected_handles = len(actors) * 2
        assert summary["totalActorHandles"] == expected_handles
        # 1 ref for my_obj
        assert summary["totalLocalRefCount"] == 1

    # NOTE(review): the helper's return value is discarded here; if it
    # signals a timeout by returning a falsy value, this test cannot fail on
    # a bad memory table. Confirm and consider asserting the result.
    wait_until_succeeded_without_exception(
        check_mem_table, (AssertionError, ), timeout_ms=1000)
コード例 #2
0
ファイル: test_dashboard.py プロジェクト: zivzone/ray
def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only):
    """Check that ``/test/dump`` answers when proxies are explicitly disabled.

    Starts Ray with the dashboard and polls the endpoint once per second,
    passing ``None`` for both the ``http`` and ``https`` proxies. Assertion
    and connection errors are logged and retried; once more than ten seconds
    have elapsed the loop raises a generic ``Exception``.
    """
    address_info = ray.init(num_cpus=1, include_dashboard=True)
    raw_url = address_info["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    no_proxies = {"http": None, "https": None}
    retryable = (AssertionError, requests.exceptions.ConnectionError)
    timeout_seconds = 10
    deadline = time.time() + timeout_seconds

    while True:
        time.sleep(1)
        try:
            dump_resp = requests.get(
                webui_url + "/test/dump", proxies=no_proxies)
            dump_resp.raise_for_status()
            try:
                # The body only has to be decodable JSON; its content is not
                # inspected.
                dump_resp.json()
                assert dump_resp.ok
            except Exception:
                logger.info("failed response: %s", dump_resp.text)
                raise
            break
        except retryable as e:
            logger.info("Retry because of %s", e)
        finally:
            # Evaluated on every way out of the ``try``, including ``break``.
            if time.time() > deadline:
                raise Exception("Timed out while testing.")
コード例 #3
0
def test_get_all_node_details(disable_aiohttp_cache, ray_start_with_dashboard):
    """Poll ``/nodes?view=details`` until it describes one node, two actors.

    Two ``ActorWithObjs`` actors are started; the detailed node view is then
    expected to list exactly one client whose payload has two actors, a
    ``logCount`` field and two workers.
    """
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])

    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa

    def check_node_details():
        # The request, JSON decoding and the "data" lookup sit outside the
        # ``try`` so failures there propagate instead of being retried.
        payload = requests.get(f"{webui_url}/nodes?view=details").json()
        details = payload["data"]
        try:
            clients = details["clients"]
            node = clients[0]
            assert len(clients) == 1
            assert len(node.get("actors")) == 2
            # Workers information should be in the detailed payload
            assert "workers" in node
            assert "logCount" in node
            assert len(node["workers"]) == 2
        except (AssertionError, KeyError, IndexError):
            return False
        return True

    wait_for_condition(check_node_details, 15)
コード例 #4
0
ファイル: test_log.py プロジェクト: zoltanszekely21/ray
def test_log_proxy(ray_start_with_dashboard):
    """Check the ``/log_proxy`` endpoint for a ranged read and a 404.

    Each attempt requests bytes 43-51 of ``dashboard.log`` through the proxy
    and expects the text ``Dashboard``, then requests a log file name that is
    expected to yield status 404. Any exception triggers a retry; after more
    than five seconds the last exception's traceback is embedded in the
    raised timeout error.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    # Common prefix of both proxied URLs.
    proxied_logs = f"{webui_url}/log_proxy?url={webui_url}/logs/"

    timeout_seconds = 5
    deadline = time.time() + timeout_seconds
    last_ex = None
    while True:
        time.sleep(1)
        try:
            # Test range request.
            ranged = requests.get(
                proxied_logs + "dashboard.log",
                headers={"Range": "bytes=43-51"})
            ranged.raise_for_status()
            assert ranged.text == "Dashboard"
            # Test 404.
            missing = requests.get(proxied_logs + "not_exist_file.log")
            assert missing.status_code == 404
            break
        except Exception as ex:
            last_ex = ex
        finally:
            # Evaluated on every way out of the ``try``, including ``break``.
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
コード例 #5
0
ファイル: test_reporter.py プロジェクト: zjutoe/ray
def test_profiling(shutdown_only):
    """Request a profiling run for an actor process through the dashboard.

    Starts Ray with the dashboard, creates one actor and looks up its pid,
    then repeatedly calls ``/api/launch_profiling`` for that pid until the
    reply's ``result`` field is truthy. Raises ``RayTestTimeoutException``
    if that has not happened within 15 seconds of the first attempt.
    """
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    profiled_actor = Actor.remote()
    actor_pid = ray.get(profiled_actor.getpid.remote())

    raw_url = addresses["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    start_time = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        elapsed = time.time() - start_time
        if elapsed > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}")
        query = {
            "ip": ray.nodes()[0]["NodeManagerAddress"],
            "pid": actor_pid,
            "duration": 5
        }
        reply = requests.get(webui_url + "/api/launch_profiling", params=query)
        launch_profiling = reply.json()
        if not launch_profiling["result"]:
            time.sleep(1)
            continue
        profiling_info = launch_profiling["data"]["profilingInfo"]
        break
    logger.info(profiling_info)
コード例 #6
0
ファイル: test_stats_collector.py プロジェクト: yukingx/ray
def test_multi_nodes_info(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    """Check node summary, per-node detail and agent dump on a 3-node cluster.

    Two nodes are added to the head-only cluster. The polled condition
    requires three entries in ``/nodes?view=summary``, an ``ALIVE`` raylet
    state in every ``/nodes/<id>`` detail, and three agents in
    ``/test/dump?key=agents``.
    """
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)
    for _ in range(2):
        cluster.add_node()

    def _check_nodes():
        try:
            summary_resp = requests.get(webui_url + "/nodes?view=summary")
            summary_resp.raise_for_status()
            summary_payload = summary_resp.json()
            assert summary_payload["result"] is True, summary_payload["msg"]
            node_summaries = summary_payload["data"]["summary"]
            assert len(node_summaries) == 3

            for node_info in node_summaries:
                node_id = node_info["raylet"]["nodeId"]
                detail_resp = requests.get(webui_url + f"/nodes/{node_id}")
                detail_resp.raise_for_status()
                detail_payload = detail_resp.json()
                assert detail_payload["result"] is True, detail_payload["msg"]
                raylet = detail_payload["data"]["detail"]["raylet"]
                assert raylet["state"] == "ALIVE"

            agents_resp = requests.get(webui_url + "/test/dump?key=agents")
            agents_resp.raise_for_status()
            agents_payload = agents_resp.json()
            assert len(agents_payload["data"]["agents"]) == 3
        except Exception as ex:
            # Any failure counts as "not ready yet" and is only logged.
            logger.info(ex)
            return False
        return True

    wait_for_condition(_check_nodes, timeout=10)
コード例 #7
0
ファイル: test_job.py プロジェクト: nikitavemuri/ray
def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    """Submit a job through ``POST /jobs`` and wait until it is running.

    The job is posted once; every polling attempt then reads the job
    summary, takes the larger of the two job ids as the submitted job, and
    checks its detail view twice: first that the id matches, then that the
    job is not dead and has at least one actor and one worker.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    job = _prepare_job_for_test(webui_url)
    session_parent = os.path.dirname(ray_start_with_dashboard["session_dir"])
    job_root_dir = os.path.join(session_parent, "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_id = None
    job_submitted = False

    def _get_ok(url):
        """GET ``url`` and return its JSON after checking ``result``."""
        resp = requests.get(url)
        resp.raise_for_status()
        payload = resp.json()
        assert payload["result"] is True, resp.text
        return payload

    def _check_running():
        nonlocal job_id, job_submitted
        # Post the job only on the first attempt that gets this far.
        if not job_submitted:
            submit_resp = requests.post(f"{webui_url}/jobs", json=job)
            submit_resp.raise_for_status()
            submit_payload = submit_resp.json()
            assert submit_payload["result"] is True, submit_resp.text
            job_submitted = True

        summary_payload = _get_ok(f"{webui_url}/jobs?view=summary")
        summary = summary_payload["data"]["summary"]
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id when POST /jobs
        # The larger job id is the one we submitted.
        job_id = sorted(s["jobId"] for s in summary)[1]

        first_detail = _get_ok(f"{webui_url}/jobs/{job_id}")
        job_info = first_detail["data"]["detail"]["jobInfo"]
        assert job_info["jobId"] == job_id

        second_detail = _get_ok(f"{webui_url}/jobs/{job_id}")
        detail = second_detail["data"]["detail"]
        assert detail["jobInfo"]["isDead"] is False
        job_actors = detail["jobActors"]
        job_workers = detail["jobWorkers"]
        assert len(job_actors) > 0
        assert len(job_workers) > 0

    retry_on = (AssertionError, KeyError, IndexError)
    wait_until_succeeded_without_exception(_check_running,
                                           exceptions=retry_on,
                                           timeout_ms=30 * 1000,
                                           raise_last_ex=True)
コード例 #8
0
ファイル: test_actor.py プロジェクト: yncxcw/ray
def test_kill_actor(ray_start_with_dashboard):
    """Kill an actor through the dashboard's ``/logical/kill_actor`` endpoint.

    An actor is created and called once, then the test looks it up in
    ``/logical/actor_groups`` and issues the kill request. Lookup and kill
    are retried for up to ten seconds on ``KeyError``/``AssertionError``;
    the test fails if the last attempt still raised.
    """

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa

    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    # NOTE(review): this helper (and ``worker_pid`` above) is never used, so
    # the test does not verify that the actor process actually exits.
    def actor_killed(pid):
        """Return True if signal 0 cannot be delivered to ``pid``."""
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        else:
            return False

    def get_actor():
        """Return the first entry of the ``Actor`` group from the dashboard."""
        resp = requests.get(f"{webui_url}/logical/actor_groups")
        resp.raise_for_status()
        actor_groups_resp = resp.json()
        assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
        actor_groups = actor_groups_resp["data"]["actorGroups"]
        actor = actor_groups["Actor"]["entries"][0]
        return actor

    def kill_actor_using_dashboard(actor):
        """Ask the dashboard to kill ``actor`` and check the reply."""
        resp = requests.get(
            webui_url + "/logical/kill_actor",
            params={
                "actorId": actor["actorId"],
                "ipAddress": actor["ipAddress"],
                "port": actor["port"]
            })
        resp.raise_for_status()
        resp_json = resp.json()
        # Report the server's message on failure; the previous code used
        # ``"msg" in resp_json``, which only ever produced True/False.
        assert resp_json["result"] is True, resp_json.get("msg")

    start = time.time()
    last_exc = None
    while time.time() - start <= 10:
        try:
            actor = get_actor()
            kill_actor_using_dashboard(actor)
            last_exc = None
            break
        except (KeyError, AssertionError) as e:
            last_exc = e
            time.sleep(.1)
    assert last_exc is None
コード例 #9
0
ファイル: test_event.py プロジェクト: nikitavemuri/ray
def test_event_message_limit(enable_event_module, small_event_line_limit,
                             disable_aiohttp_cache, ray_start_with_dashboard):
    """Check how ``/events`` handles lines at and above the length limit.

    Writes an ``event_GCS.log`` file containing, in order: ten events whose
    JSON serialization is exactly ``small_event_line_limit`` characters long,
    one event that is longer than the limit, and
    ``event_consts.EVENT_READ_LINE_COUNT_LIMIT`` further events. The polled
    condition expects the ten at-limit messages and the last of the further
    events to be returned, and the over-limit message to be absent.
    """
    event_read_line_length_limit = small_event_line_limit
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]))
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    session_dir = ray_start_with_dashboard["session_dir"]
    event_dir = os.path.join(session_dir, "logs", "events")
    job_id = ray.JobID.from_int(100).hex()
    events = []
    # Sample events whose serialized length equals the limit.
    sample_event = _get_event("", job_id=job_id)
    # Number of message characters needed to pad an empty-message event up to
    # exactly the limit.
    message_len = event_read_line_length_limit - len(json.dumps(sample_event))
    for i in range(10):
        # Each event is copied from the previous one; only the id and the
        # message are replaced.
        sample_event = copy.deepcopy(sample_event)
        sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
        sample_event["message"] = str(i) * message_len
        assert len(json.dumps(sample_event)) == event_read_line_length_limit
        events.append(sample_event)
    # Sample event longer than the limit (one extra message character).
    sample_event = copy.deepcopy(sample_event)
    sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
    sample_event["message"] = "2" * (message_len + 1)
    assert len(json.dumps(sample_event)) > event_read_line_length_limit
    events.append(sample_event)

    # Follow up with EVENT_READ_LINE_COUNT_LIMIT short events whose messages
    # are "0", "1", ...
    for i in range(event_consts.EVENT_READ_LINE_COUNT_LIMIT):
        events.append(_get_event(str(i), job_id=job_id))

    # Write everything to a temporary file first, one JSON document per line.
    with open(os.path.join(event_dir, "tmp.log"), "w") as f:
        f.writelines([(json.dumps(e) + "\n") for e in events])

    # Best-effort removal of an existing log; any failure is ignored.
    try:
        os.remove(os.path.join(event_dir, "event_GCS.log"))
    except Exception:
        pass
    # Move the finished file into place under the final name.
    os.rename(os.path.join(event_dir, "tmp.log"),
              os.path.join(event_dir, "event_GCS.log"))

    def _check_events():
        # Returns True once all expectations hold; any exception is logged
        # and reported as False so the caller polls again.
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            result = resp.json()
            all_events = result["data"]["events"]
            assert len(all_events[job_id]
                       ) >= event_consts.EVENT_READ_LINE_COUNT_LIMIT + 10
            messages = [e["message"] for e in all_events[job_id]]
            # All ten at-limit messages must be present.
            for i in range(10):
                assert str(i) * message_len in messages
            # The over-limit message must not be present.
            assert "2" * (message_len + 1) not in messages
            # The last of the short events must be present.
            assert str(event_consts.EVENT_READ_LINE_COUNT_LIMIT -
                       1) in messages
            return True
        except Exception as ex:
            logger.exception(ex)
            return False

    wait_for_condition(_check_events, timeout=15)
コード例 #10
0
ファイル: test_snapshot.py プロジェクト: hngenc/ray
def test_snapshot(ray_start_with_dashboard):
    """Validate the ``/api/snapshot`` payload after running three drivers.

    Three driver scripts each create one ``Pinger`` actor: a detached named
    one, a non-detached named one and a non-detached unnamed one. The
    snapshot is validated against ``snapshot_schema.json`` and is expected
    to contain three actors, four jobs and no deployments.
    """
    driver_template = """
import ray

ray.init(address="{address}", namespace="my_namespace")

@ray.remote
class Pinger:
    def ping(self):
        return "pong"

a = Pinger.options(lifetime={lifetime}, name={name}).remote()
ray.get(a.ping.remote())
    """

    detached_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="'detached'",
        name="'abc'")
    named_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="'xyz'")
    unnamed_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="None")

    run_string_as_driver(detached_driver)
    run_string_as_driver(named_driver)
    run_string_as_driver(unnamed_driver)

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(os.path.dirname(dashboard.__file__),
                               "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    # Use a context manager so the schema file is closed deterministically
    # instead of leaking the handle from a bare ``open()``.
    with open(schema_path) as schema_file:
        schema = json.load(schema_file)
    jsonschema.validate(instance=data, schema=schema)

    snapshot = data["data"]["snapshot"]
    assert len(snapshot["actors"]) == 3
    assert len(snapshot["jobs"]) == 4
    assert len(snapshot["deployments"]) == 0

    # Only the entries are inspected; the actor ids themselves are not needed.
    for entry in snapshot["actors"].values():
        assert entry["jobId"] in snapshot["jobs"]
        assert entry["actorClass"] == "Pinger"
        assert entry["startTime"] >= 0
        if entry["isDetached"]:
            assert entry["endTime"] == 0, entry
        else:
            assert entry["endTime"] > 0, entry
        assert "runtimeEnv" in entry
    assert snapshot["rayCommit"] == ray.__commit__
    assert snapshot["rayVersion"] == ray.__version__
コード例 #11
0
ファイル: test_logical_view_head.py プロジェクト: yynst2/ray
def test_actor_groups(ray_start_with_dashboard):
    """Check the ``/logical/actor_groups`` view for two actor classes.

    Two ``Foo`` actors each run one task and one ``InfeasibleActor`` that
    requests a GPU is created. The endpoint is polled once per second until
    the ``Foo`` group reports four executed tasks, two ``ALIVE`` actors and
    two entries, and the ``InfeasibleActor`` entry lists ``GPU`` among its
    required resources. Gives up after more than five seconds.
    """

    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [actor.do_task.remote() for actor in foo_actors]  # noqa

    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    timeout_seconds = 5
    deadline = time.time() + timeout_seconds
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/logical/actor_groups")
            response.raise_for_status()
            payload = response.json()
            assert payload["result"] is True, payload["msg"]
            groups = payload["data"]["actorGroups"]

            assert "Foo" in groups
            foo_group = groups["Foo"]
            foo_summary = foo_group["summary"]
            # 2 __init__ tasks and 2 do_task tasks
            assert foo_summary["numExecutedTasks"] == 4
            assert foo_summary["stateToCount"]["ALIVE"] == 2
            assert len(foo_group["entries"]) == 2

            assert "InfeasibleActor" in groups
            infeasible_entries = groups["InfeasibleActor"]["entries"]
            assert "requiredResources" in infeasible_entries[0]
            assert "GPU" in infeasible_entries[0]["requiredResources"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            # Evaluated on every way out of the ``try``, including ``break``.
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
コード例 #12
0
ファイル: test_actor.py プロジェクト: yncxcw/ray
def test_actors(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the shape of the ``/logical/actors`` payload.

    With two ``Foo`` actors and one GPU-requesting ``InfeasibleActor``
    created, the endpoint is polled once per second until it lists three
    actors, the first entry carries the expected keys, and the set of pids
    contains ``0`` plus at least one other value. Gives up after more than
    five seconds.
    """

    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [actor.do_task.remote() for actor in foo_actors]  # noqa

    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)
    webui_url = format_web_url(raw_url)

    timeout_seconds = 5
    deadline = time.time() + timeout_seconds
    last_ex = None
    while True:
        time.sleep(1)
        try:
            payload = requests.get(f"{webui_url}/logical/actors").json()
            actors = payload["data"]["actors"]
            assert len(actors) == 3

            one_entry = list(actors.values())[0]
            assert "jobId" in one_entry
            assert "taskSpec" in one_entry
            task_spec = one_entry["taskSpec"]
            assert "functionDescriptor" in task_spec
            assert type(task_spec["functionDescriptor"]) is dict
            assert "address" in one_entry
            assert type(one_entry["address"]) is dict
            for required_key in ("state", "name", "numRestarts", "pid"):
                assert required_key in one_entry

            all_pids = {entry["pid"] for entry in actors.values()}
            assert 0 in all_pids  # The infeasible actor
            assert len(all_pids) > 1
            break
        except Exception as ex:
            last_ex = ex
        finally:
            # Evaluated on every way out of the ``try``, including ``break``.
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
コード例 #13
0
def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    """Check that actor stdout lines are served by ``/node_logs``.

    Two ``LoggingActor`` instances print four lines and one line
    respectively. The per-node log listing is expected to contain both
    actor pids, with one line for the second actor; the pid-filtered
    listing is expected to contain four lines for the first actor.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    # Pids are kept as strings because they are used as keys into the JSON
    # payload below.
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert isinstance(node_logs["data"]["logs"], dict)
        assert all(
            pid in node_logs["data"]["logs"] for pid in (la_pid, la2_pid))
        assert len(node_logs["data"]["logs"][la2_pid]) == 1

        actor_one_logs_response = requests.get(
            f"{webui_url}/node_logs",
            params={
                "ip": node_ip,
                "pid": la_pid
            })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert isinstance(actor_one_logs["data"]["logs"], dict)
        assert len(actor_one_logs["data"]["logs"][la_pid]) == 4

    # ``(AssertionError)`` without a trailing comma is just the class, not a
    # tuple; pass a real one-element tuple of retryable exceptions.
    # NOTE(review): the helper's return value is discarded here; if it
    # signals a timeout by returning a falsy value, this test cannot fail on
    # wrong log contents. Confirm and consider asserting the result.
    wait_until_succeeded_without_exception(
        check_logs, (AssertionError, ), timeout_ms=1000)
コード例 #14
0
def test_get_all_node_details(disable_aiohttp_cache, ray_start_with_dashboard):
    """Poll ``/nodes?view=details`` for actor, worker and log-count details.

    Two ``ActorWithObjs`` actors, each printing one line on construction,
    are started. The detailed view is checked once per second until it
    shows one client with two actors, a node ``logCount`` above two, two
    workers, and a ``logCount`` of one on the first worker. Gives up after
    more than twenty seconds.
    """
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])

    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            print("I also log a line")
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    timeout_seconds = 20
    deadline = time.time() + timeout_seconds
    last_ex = None

    def check_node_details():
        payload = requests.get(f"{webui_url}/nodes?view=details").json()
        clients = payload["data"]["clients"]
        node = clients[0]
        assert len(clients) == 1
        assert len(node.get("actors")) == 2
        # Workers information should be in the detailed payload
        assert "workers" in node
        assert "logCount" in node
        # Two lines printed by ActorWithObjs
        # One line printed by autoscaler: monitor.py:118 -- Monitor: Started
        assert node["logCount"] > 2
        workers = node["workers"]
        print(workers)
        assert len(workers) == 2
        assert workers[0]["logCount"] == 1

    retryable = (AssertionError, KeyError, IndexError)
    while True:
        time.sleep(1)
        try:
            check_node_details()
            break
        except retryable as ex:
            last_ex = ex
        finally:
            # Evaluated on every way out of the ``try``, including ``break``.
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
コード例 #15
0
ファイル: test_event.py プロジェクト: nikitavemuri/ray
def test_event_basic(enable_event_module, disable_aiohttp_cache,
                     ray_start_with_dashboard):
    """Check that events logged for GCS and RAYLET show up in ``/events``.

    For each of the two source types, twenty events with messages ``"0"``
    to ``"19"`` are written to ``event_<SOURCE>.log`` in the session's
    events directory. The polled condition requires every one of those
    messages to be reported for both source types under the test job id.
    """
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    event_dir = os.path.join(ray_start_with_dashboard["session_dir"], "logs",
                             "events")
    job_id = ray.JobID.from_int(100).hex()

    source_type_name = event_pb2.Event.SourceType.Name
    source_type_gcs = source_type_name(event_pb2.Event.GCS)
    source_type_raylet = source_type_name(event_pb2.Event.RAYLET)
    test_count = 20

    for source_type in (source_type_gcs, source_type_raylet):
        log_path = os.path.join(event_dir, f"event_{source_type}.log")
        # A random suffix gives every source its own logger name.
        event_logger = _test_logger(__name__ + str(random.random()),
                                    log_path,
                                    max_bytes=2000,
                                    backup_count=1000)
        for index in range(test_count):
            event = _get_event(str(index),
                               job_id=job_id,
                               source_type=source_type)
            event_logger.info("%s", json.dumps(event))

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            job_events = resp.json()["data"]["events"][job_id]
            assert len(job_events) >= test_count * 2

            # Group the distinct messages by the source that reported them.
            messages_by_source = {}
            for event in job_events:
                reported_source = event["sourceType"]
                text = event["message"]
                messages_by_source.setdefault(reported_source, set()).add(text)

            assert len(messages_by_source[source_type_gcs]) >= test_count
            assert len(messages_by_source[source_type_raylet]) >= test_count
            expected = {str(index) for index in range(test_count)}
            assert expected <= messages_by_source[source_type_gcs]
            assert expected <= messages_by_source[source_type_raylet]
        except Exception as ex:
            # Any failure counts as "not ready yet"; log it and poll again.
            logger.exception(ex)
            return False
        return True

    wait_for_condition(_check_events, timeout=15)
コード例 #16
0
ファイル: test_dashboard.py プロジェクト: zivzone/ray
def test_get_cluster_status(ray_start_with_dashboard):
    """Check ``/api/cluster_status`` before and after populating Redis.

    First the endpoint must report ``None`` for both the autoscaling status
    and error while still exposing a load metrics report. Then the two debug
    keys are written into Redis and the endpoint must echo the new values.
    """
    address_info = ray_start_with_dashboard
    assert wait_until_server_available(address_info["webui_url"]) is True
    webui_url = format_web_url(address_info["webui_url"])
    status_url = f"{webui_url}/api/cluster_status"

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(status_url)
        response.raise_for_status()
        payload = response.json()
        print(payload)
        assert payload["result"]
        data = payload["data"]
        assert "autoscalingStatus" in data
        assert data["autoscalingStatus"] is None
        assert "autoscalingError" in data
        assert data["autoscalingError"] is None
        assert "clusterStatus" in data
        assert "loadMetricsReport" in data["clusterStatus"]

    # Only request errors are retried; a failed assertion propagates.
    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    address_parts = address_info["redis_address"].split(":")
    assert len(address_parts) == 2
    redis_host, redis_port = address_parts

    client = redis.StrictRedis(
        host=redis_host,
        port=int(redis_port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)

    client.hset(DEBUG_AUTOSCALING_STATUS_LEGACY, "value", "hello")
    client.hset(DEBUG_AUTOSCALING_ERROR, "value", "world")

    response = requests.get(status_url)
    response.raise_for_status()
    payload = response.json()
    assert payload["result"]
    data = payload["data"]
    assert "autoscalingStatus" in data
    assert data["autoscalingStatus"] == "hello"
    assert "autoscalingError" in data
    assert data["autoscalingError"] == "world"
    assert "clusterStatus" in data
    assert "loadMetricsReport" in data["clusterStatus"]
コード例 #17
0
def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    """Check that an actor's raised error is served by ``/node_logs``.

    An ``ErrorActor`` raises a ``ValueError`` from ``go``. The per-node
    listing and the pid-filtered listing are then expected to contain error
    entries keyed by that actor's pid.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor():
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    # Resolve the remote call and stringify the pid: it is used as a key
    # into the JSON payload and as a query parameter. The previous code kept
    # the unresolved object ref, which can never match a pid key.
    ea_pid = str(ray.get(ea.get_pid.remote()))
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        assert isinstance(node_errs["data"]["errors"], dict)
        assert ea_pid in node_errs["data"]["errors"]
        assert len(node_errs["data"]["errors"][ea_pid]) == 1

        actor_err_response = requests.get(
            f"{webui_url}/node_logs",
            params={
                "ip": node_ip,
                "pid": ea_pid
            })
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        assert isinstance(actor_errs["data"]["errors"], dict)
        # NOTE(review): the node-wide listing above expects 1 error for this
        # pid while the filtered listing expects 4; one of the two counts
        # looks wrong. Left unchanged pending confirmation.
        assert len(actor_errs["data"]["errors"][ea_pid]) == 4

    # ``(AssertionError)`` without a trailing comma is just the class, not a
    # tuple; pass a real one-element tuple of retryable exceptions.
    # NOTE(review): the helper's return value is discarded here; if it
    # signals a timeout by returning a falsy value, this test cannot fail on
    # wrong error contents. Confirm and consider asserting the result.
    wait_until_succeeded_without_exception(
        check_errs, (AssertionError, ), timeout_ms=1000)
コード例 #18
0
ファイル: test_dashboard.py プロジェクト: zzmcdc/ray
def test_http_get(enable_test_module, ray_start_with_dashboard):
    """Fetch ``/test/dump`` indirectly through the head and through an agent.

    Each attempt asks the dashboard's ``/test/http_get`` to fetch the dump
    URL, reads the single agent's HTTP port from the result, and then asks
    that agent's ``/test/http_get_from_agent`` to fetch the same URL.
    Assertion and connection errors are logged and retried; once more than
    ten seconds have elapsed the loop raises a generic ``Exception``.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    target_url = webui_url + "/test/dump"

    def _decode_json(response):
        """Return the JSON body, logging the raw text if decoding fails."""
        try:
            return response.json()
        except Exception:
            logger.info("failed response: %s", response.text)
            raise

    retryable = (AssertionError, requests.exceptions.ConnectionError)
    timeout_seconds = 10
    deadline = time.time() + timeout_seconds
    while True:
        time.sleep(1)
        try:
            head_resp = requests.get(
                webui_url + "/test/http_get?url=" + target_url)
            head_resp.raise_for_status()
            dump_info = _decode_json(head_resp)
            assert dump_info["result"] is True
            agents = dump_info["data"]["agents"]
            assert len(agents) == 1
            _node_id, ports = next(iter(agents.items()))
            ip = ray_start_with_dashboard["node_ip_address"]
            http_port, _grpc_port = ports

            agent_resp = requests.get(
                f"http://{ip}:{http_port}"
                f"/test/http_get_from_agent?url={target_url}")
            agent_resp.raise_for_status()
            agent_dump = _decode_json(agent_resp)
            assert agent_dump["result"] is True
            break
        except retryable as e:
            logger.info("Retry because of %s", e)
        finally:
            # Evaluated on every way out of the ``try``, including ``break``.
            if time.time() > deadline:
                raise Exception("Timed out while testing.")
コード例 #19
0
ファイル: test_dashboard.py プロジェクト: zzmcdc/ray
def test_nodes_update(enable_test_module, ray_start_with_dashboard):
    """Check that the dashboard dump and notified agents agree on one node.

    Each attempt reads ``/test/dump`` and asserts that it lists exactly one
    node, agent, IP mapping and hostname mapping, then reads
    ``/test/notified_agents`` and asserts that it equals the dump's agents.
    Attempts are made one second apart; an attempt that fails with an
    ``AssertionError`` or a connection error is retried.

    Raises:
        Exception: if an attempt fails after ``timeout_seconds`` have passed.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    def _get_json(url):
        # Fetch `url` and decode its body as JSON; the raw body is logged
        # before re-raising when it cannot be decoded.
        response = requests.get(url)
        response.raise_for_status()
        try:
            return response.json()
        except Exception:
            logger.info("failed response: %s", response.text)
            raise

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            dump_info = _get_json(webui_url + "/test/dump")
            assert dump_info["result"] is True
            dump_data = dump_info["data"]
            assert len(dump_data["nodes"]) == 1
            assert len(dump_data["agents"]) == 1
            assert len(dump_data["nodeIdToIp"]) == 1
            assert len(dump_data["nodeIdToHostname"]) == 1
            assert dump_data["nodes"].keys(
            ) == dump_data["nodeIdToHostname"].keys()

            notified_agents = _get_json(webui_url + "/test/notified_agents")
            assert notified_agents["result"] is True
            notified_agents = notified_agents["data"]
            assert len(notified_agents) == 1
            assert notified_agents == dump_data["agents"]
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
            # The deadline is checked only after a failed attempt, so a
            # passing attempt is never turned into a timeout and unrelated
            # exceptions are not replaced by the timeout error.
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.") from e
コード例 #20
0
ファイル: test_reporter.py プロジェクト: zjutoe/ray
def test_node_physical_stats(enable_test_module, shutdown_only):
    """Check that the reported physical stats list the actor worker PIDs.

    Starts Ray with six CPUs and six one-CPU actors, records the actors'
    PIDs, and then polls ``/test/dump?key=node_physical_stats`` until the
    workers whose command line contains ``ray::Actor`` have exactly those
    PIDs and the node's own command line mentions ``raylet``.
    """
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = set(ray.get([a.getpid.remote() for a in actors]))

    webui_url = addresses["webui_url"]
    assert (wait_until_server_available(webui_url) is True)
    webui_url = format_web_url(webui_url)
    stats_url = webui_url + "/test/dump?key=node_physical_stats"

    def _check_workers():
        # Any failure (HTTP error, missing key, failed assertion) is logged
        # and reported as "not yet", so the caller keeps polling.
        try:
            resp = requests.get(stats_url)
            resp.raise_for_status()
            payload = resp.json()
            assert payload["result"] is True
            all_stats = payload["data"]["nodePhysicalStats"]
            assert len(all_stats) == 1
            node_stats = all_stats[addresses["node_id"]]
            # Check Actor workers
            reported_pids = {
                w["pid"]
                for w in node_stats["workers"]
                if "ray::Actor" in w["cmdline"][0]
            }
            assert reported_pids == actor_pids
            # Check raylet cmdline
            assert "raylet" in node_stats["cmdline"][0]
        except Exception as ex:
            logger.info(ex)
            return False
        return True

    wait_for_condition(_check_workers, timeout=10)
コード例 #21
0
def test_nil_node(enable_test_module, disable_aiohttp_cache,
                  ray_start_with_dashboard):
    """Check that the nil node ID never shows up in the node-actors dump.

    Creates one actor that requests a GPU (named ``InfeasibleActor``;
    presumably the test cluster has no GPU, so it cannot be scheduled), then
    polls until ``/logical/actors`` reports exactly one actor and
    ``/test/dump?key=node_actors`` has no entry keyed by
    ``stats_collector_consts.NIL_NODE_ID``. Attempts are made one second
    apart and any exception triggers a retry.

    Raises:
        Exception: if an attempt fails after ``timeout_seconds`` have passed;
            the message carries the traceback of that failed attempt.
    """
    webui_url = ray_start_with_dashboard["webui_url"]
    assert (wait_until_server_available(webui_url) is True)
    webui_url = format_web_url(webui_url)

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    # Keep a reference so the actor handle stays alive during the test.
    infeasible_actor = InfeasibleActor.remote()  # noqa

    timeout_seconds = 5
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            resp = requests.get(f"{webui_url}/logical/actors")
            resp.raise_for_status()
            resp_json = resp.json()
            resp_data = resp_json["data"]
            actors = resp_data["actors"]
            assert len(actors) == 1
            response = requests.get(webui_url + "/test/dump?key=node_actors")
            response.raise_for_status()
            result = response.json()
            assert stats_collector_consts.NIL_NODE_ID not in result["data"][
                "nodeActors"]
            break
        except Exception as ex:
            # The deadline is checked only after a failed attempt, so a
            # passing attempt is never reported as a timeout and the
            # traceback in the message always belongs to the latest failure.
            if time.time() > start_time + timeout_seconds:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
コード例 #22
0
def test_multi_node_churn(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    """Poll the dashboard index page while worker nodes come and go.

    A daemon thread adds or removes a worker node every five seconds (it
    always adds while fewer than two workers exist). Meanwhile the main
    thread requests the dashboard index every two seconds for sixty seconds
    and fails on any HTTP error status.
    """
    cluster: Cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = format_web_url(cluster.webui_url)

    def cluster_chaos_monkey():
        workers = []
        while True:
            time.sleep(5)
            # Grow unconditionally until two workers exist; after that a
            # coin flip decides between adding and removing a node.
            if len(workers) < 2 or random.randint(0, 1):
                workers.append(cluster.add_node())
            else:
                victim = workers.pop(random.randrange(0, len(workers)))
                cluster.remove_node(victim)

    def get_index():
        index_resp = requests.get(webui_url)
        index_resp.raise_for_status()

    # NOTE(review): get_nodes is defined but never called below; confirm
    # whether the polling loop was meant to check the node summary as well.
    def get_nodes():
        nodes_resp = requests.get(webui_url + "/nodes?view=summary")
        nodes_resp.raise_for_status()
        summary = nodes_resp.json()
        assert summary["result"] is True, summary["msg"]
        assert summary["data"]["summary"]

    chaos_thread = threading.Thread(target=cluster_chaos_monkey, daemon=True)
    chaos_thread.start()

    deadline = datetime.now() + timedelta(seconds=60)
    while datetime.now() < deadline:
        get_index()
        time.sleep(2)
コード例 #23
0
ファイル: test_snapshot.py プロジェクト: hngenc/ray
def test_serve_snapshot(ray_start_with_dashboard):
    """Test detached and nondetached Serve instances running concurrently.

    A detached Serve instance is started from a separate driver script in the
    ``serve`` namespace and a second, non-detached instance is started in this
    process on port 8123. The dashboard's ``/api/snapshot`` response is then
    validated against ``snapshot_schema.json`` and both deployments are
    checked field by field.
    """

    detached_serve_driver_script = f"""
import ray
from ray import serve

ray.init(
    address="{ray_start_with_dashboard['redis_address']}",
    namespace="serve")

serve.start(detached=True)

@serve.deployment(version="v1")
def my_func(request):
  return "hello"

my_func.deploy()
    """

    run_string_as_driver(detached_serve_driver_script)
    assert requests.get("http://127.0.0.1:8000/my_func").text == "hello"

    # Use a new port to avoid clobbering the first Serve instance.
    serve.start(http_options={"port": 8123})

    @serve.deployment(version="v1")
    def my_func_nondetached(request):
        return "hello"

    my_func_nondetached.deploy()

    assert requests.get(
        "http://127.0.0.1:8123/my_func_nondetached").text == "hello"

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(os.path.dirname(dashboard.__file__),
                               "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    # Load the schema through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(schema_path) as schema_file:
        schema = json.load(schema_file)
    jsonschema.validate(instance=data, schema=schema)

    snapshot = data["data"]["snapshot"]
    assert len(snapshot["deployments"]) == 2

    def _check_deployment(key, name, namespace):
        # Assert the snapshot entry stored under `key` describes a running
        # "v1" deployment called `name` with exactly one replica actor.
        entry = snapshot["deployments"][key]
        assert entry["name"] == name
        assert entry["version"] == "v1"
        assert entry["namespace"] == namespace
        assert entry["httpRoute"] == f"/{name}"
        assert entry["className"] == name
        assert entry["status"] == "RUNNING"
        assert entry["rayJobId"] is not None
        assert entry["startTime"] > 0
        assert entry["endTime"] == 0

        assert len(entry["actors"]) == 1
        actor_id = next(iter(entry["actors"]))
        metadata = snapshot["actors"][actor_id]["metadata"]["serve"]
        assert metadata["deploymentName"] == name
        assert metadata["version"] == "v1"
        assert len(metadata["replicaTag"]) > 0

    _check_deployment("myFunc", "my_func", "serve")
    _check_deployment("myFuncNondetached", "my_func_nondetached", "")
コード例 #24
0
ファイル: test_stats_collector.py プロジェクト: yukingx/ray
def test_node_info(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the hostname list, node detail and node summary endpoints.

    Two actors are started and their PIDs recorded. Each attempt then reads
    ``/nodes?view=hostnamelist``, ``/nodes/{node_id}`` and
    ``/nodes?view=summary`` and asserts that they describe a single alive
    node whose detail view lists the actor workers, while the summary view
    omits ``workers``, ``actors`` and the raylet ``viewData``. Attempts are
    made one second apart and any exception triggers a retry.

    Raises:
        Exception: if an attempt fails after ``timeout_seconds`` have passed;
            the message carries the traceback of that failed attempt.
    """
    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote(), Actor.remote()]
    actor_pids = [actor.getpid.remote() for actor in actors]
    actor_pids = set(ray.get(actor_pids))

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/nodes?view=hostnamelist")
            response.raise_for_status()
            hostname_list = response.json()
            assert hostname_list["result"] is True, hostname_list["msg"]
            hostname_list = hostname_list["data"]["hostNameList"]
            assert len(hostname_list) == 1

            hostname = hostname_list[0]
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            detail = response.json()
            assert detail["result"] is True, detail["msg"]
            detail = detail["data"]["detail"]
            assert detail["hostname"] == hostname
            assert detail["raylet"]["state"] == "ALIVE"
            assert "raylet" in detail["cmdline"][0]
            assert len(detail["workers"]) >= 2
            assert len(detail["actors"]) == 2, detail["actors"]
            assert len(detail["raylet"]["viewData"]) > 0

            # Workers are matched to the actors by their process title.
            actor_worker_pids = {
                worker["pid"]
                for worker in detail["workers"]
                if "ray::Actor" in worker["cmdline"][0]
            }
            assert actor_worker_pids == actor_pids

            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            assert len(summary["data"]["summary"]) == 1
            summary = summary["data"]["summary"][0]
            assert summary["hostname"] == hostname
            assert summary["raylet"]["state"] == "ALIVE"
            assert "raylet" in summary["cmdline"][0]
            assert "workers" not in summary
            assert "actors" not in summary
            assert "viewData" not in summary["raylet"]
            break
        except Exception as ex:
            # The deadline is checked only after a failed attempt, so a
            # passing attempt is never reported as a timeout and the
            # traceback in the message always belongs to the latest failure.
            if time.time() > start_time + timeout_seconds:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
コード例 #25
0
ファイル: test_log.py プロジェクト: zoltanszekely21/ray
def test_log(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check that worker output can be found through the dashboard log pages.

    A remote task prints a marker string. Each attempt then walks from
    ``/log_index`` to the single node's log listing, fetches every listed
    path containing ``worker`` until one contains the marker, issues a byte
    range request against ``/logs/dashboard.log``, and checks that the node
    detail's ``logUrl`` is one of the indexed URLs. Attempts are made one
    second apart and any exception triggers a retry.

    Raises:
        Exception: if an attempt fails after ``timeout_seconds`` have passed;
            the message carries the traceback of that failed attempt.
    """
    @ray.remote
    def write_log(s):
        print(s)

    test_log_text = "test_log_text"
    ray.get(write_log.remote(test_log_text))
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/log_index")
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)
            all_nodes_log_urls = parser.get_urls()
            assert len(all_nodes_log_urls) == 1

            response = requests.get(all_nodes_log_urls[0])
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)

            # Search test_log_text from all worker logs.
            parsed_url = urllib.parse.urlparse(all_nodes_log_urls[0])
            urls = [
                parsed_url._replace(path=p).geturl()
                for p in parser.get_urls() if "worker" in p
            ]

            for u in urls:
                response = requests.get(u)
                response.raise_for_status()
                if test_log_text in response.text:
                    break
            else:
                raise Exception(f"Can't find {test_log_text} from {urls}")

            # Test range request.
            response = requests.get(webui_url + "/logs/dashboard.log",
                                    headers={"Range": "bytes=43-51"})
            response.raise_for_status()
            assert response.text == "Dashboard"

            # Test logUrl in node info.
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            node_info = response.json()
            assert node_info["result"] is True
            node_info = node_info["data"]["detail"]
            assert "logUrl" in node_info
            assert node_info["logUrl"] in all_nodes_log_urls
            break
        except Exception as ex:
            # The deadline is checked only after a failed attempt, so a
            # passing attempt is never reported as a timeout and the
            # traceback in the message always belongs to the latest failure.
            if time.time() > start_time + timeout_seconds:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
コード例 #26
0
def test_aiohttp_cache(enable_test_module, ray_start_with_dashboard):
    """Check the response timestamps of the dashboard's cache test routes.

    Every request returns a ``timestamp`` field; the number of distinct
    timestamps seen over a series of requests is what each step asserts:

    * repeated identical requests eventually yield more than one timestamp
      (retried for up to ``timeout_seconds``),
    * ten different sub-paths, and ten different query values, each yield
      ten timestamps,
    * the ``raise_exception`` route answers with ``result`` false and a
      message mentioning ``KeyError``,
    * on the ``aiohttp_cache_lru`` routes, four distinct paths requested ten
      times yield four timestamps, and the fixed query-value sequence below
      yields eight, with values 0 and 3 each seen with two timestamps.

    Raises:
        Exception: if the first step still fails after ``timeout_seconds``.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    def _fetch_timestamp(path):
        # Request `path` on the dashboard and return its reported timestamp.
        response = requests.get(webui_url + path)
        response.raise_for_status()
        return response.json()["data"]["timestamp"]

    timeout_seconds = 5
    start_time = time.time()
    # Deliberately shared across retries: samples accumulate until at least
    # two distinct timestamps have been observed.
    value1_timestamps = []
    while True:
        time.sleep(1)
        try:
            for _ in range(10):
                value1_timestamps.append(
                    _fetch_timestamp("/test/aiohttp_cache/t1?value=1"))
            assert len(set(value1_timestamps)) > 1
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
            # The deadline is checked only after a failed attempt, so a
            # passing attempt is never turned into a timeout and unrelated
            # exceptions are not replaced by the timeout error.
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.") from e

    sub_path_timestamps = [
        _fetch_timestamp(f"/test/aiohttp_cache/tt{x}?value=1")
        for x in range(10)
    ]
    assert len(set(sub_path_timestamps)) == 10

    volatile_value_timestamps = [
        _fetch_timestamp(f"/test/aiohttp_cache/tt?value={x}")
        for x in range(10)
    ]
    assert len(set(volatile_value_timestamps)) == 10

    response = requests.get(webui_url + "/test/aiohttp_cache/raise_exception")
    response.raise_for_status()
    result = response.json()
    assert result["result"] is False
    assert "KeyError" in result["msg"]

    volatile_value_timestamps = [
        _fetch_timestamp(f"/test/aiohttp_cache_lru/tt{x % 4}")
        for x in range(10)
    ]
    assert len(set(volatile_value_timestamps)) == 4

    volatile_value_timestamps = []
    data = collections.defaultdict(set)
    for x in [0, 1, 2, 3, 4, 5, 2, 1, 0, 3]:
        timestamp = _fetch_timestamp(f"/test/aiohttp_cache_lru/t1?value={x}")
        data[x].add(timestamp)
        volatile_value_timestamps.append(timestamp)
    assert len(set(volatile_value_timestamps)) == 8
    assert len(data[3]) == 2
    assert len(data[0]) == 2
コード例 #27
0
def test_get_job_info(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the job summary and job detail endpoints for the current driver.

    One actor is started and its PID and actor ID recorded. Each attempt then
    reads ``/jobs?view=summary`` and ``/jobs/{job_id}`` and asserts that they
    describe a single live job owned by this process, with one actor and one
    worker carrying the expected keys. Attempts are made one second apart; an
    attempt that fails with ``AssertionError``, ``KeyError`` or
    ``IndexError`` is retried.

    Raises:
        Exception: if an attempt fails after ``timeout_seconds`` have passed;
            the message carries the traceback of that failed attempt.
    """
    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actor = Actor.remote()
    actor_pid = ray.get(actor.getpid.remote())
    actor_id = actor._actor_id.hex()

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    ip = ray._private.services.get_node_ip_address()

    def _check():
        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_summary = result["data"]["summary"]
        assert len(job_summary) == 1, resp.text
        one_job = job_summary[0]
        assert "jobId" in one_job
        job_id = one_job["jobId"]
        assert ray._raylet.JobID(hex_to_binary(one_job["jobId"]))
        assert "driverIpAddress" in one_job
        assert one_job["driverIpAddress"] == ip
        assert "driverPid" in one_job
        assert one_job["driverPid"] == str(os.getpid())
        assert "config" in one_job
        assert isinstance(one_job["config"], dict)
        assert "isDead" in one_job
        assert one_job["isDead"] is False
        assert "timestamp" in one_job
        one_job_summary_keys = one_job.keys()

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_detail = result["data"]["detail"]
        assert "jobInfo" in job_detail
        # Every key of the summary entry must also be present in the detail.
        assert one_job_summary_keys <= job_detail["jobInfo"].keys()
        assert "jobActors" in job_detail
        job_actors = job_detail["jobActors"]
        assert len(job_actors) == 1, resp.text
        one_job_actor = job_actors[actor_id]
        assert "taskSpec" in one_job_actor
        assert isinstance(one_job_actor["taskSpec"], dict)
        assert "functionDescriptor" in one_job_actor["taskSpec"]
        assert isinstance(one_job_actor["taskSpec"]["functionDescriptor"],
                          dict)
        assert "pid" in one_job_actor
        assert one_job_actor["pid"] == actor_pid
        check_actor_keys = [
            "name", "timestamp", "address", "actorId", "jobId", "state"
        ]
        for k in check_actor_keys:
            assert k in one_job_actor
        assert "jobWorkers" in job_detail
        job_workers = job_detail["jobWorkers"]
        assert len(job_workers) == 1, resp.text
        one_job_worker = job_workers[0]
        check_worker_keys = [
            "cmdline", "pid", "cpuTimes", "memoryInfo", "cpuPercent",
            "coreWorkerStats", "language", "jobId"
        ]
        for k in check_worker_keys:
            assert k in one_job_worker

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            _check()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            # The deadline is checked only after a failed attempt, so a
            # passing attempt is never reported as a timeout and unrelated
            # exceptions are not replaced by the timeout error.
            if time.time() > start_time + timeout_seconds:
                ex_stack = "".join(
                    traceback.format_exception(
                        type(ex), ex, ex.__traceback__))
                raise Exception(
                    f"Timed out while testing, {ex_stack}") from ex
コード例 #28
0
ファイル: test_job.py プロジェクト: nikitavemuri/ray
def test_submit_job_validation(ray_start_with_dashboard):
    """Check that malformed job submissions are rejected with a clear message.

    After waiting until an empty POST to ``/jobs`` no longer answers
    "no nodes available", four invalid payloads are posted (unsupported
    language, missing ``driver_entry``, wrongly typed ``working_dir``, and
    an unexpected key). Each must come back with ``result`` false and a
    message naming the offending field.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    jobs_url = f"{webui_url}/jobs"

    # Start from a clean job directory next to the session directory.
    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    def _ensure_available_nodes():
        probe = requests.post(jobs_url)
        probe.raise_for_status()
        probe_result = probe.json()
        assert probe_result["result"] is False
        return "no nodes available" not in probe_result["msg"]

    wait_for_condition(_ensure_available_nodes, timeout=5)

    def _submit_rejected(payload):
        # Post `payload`, require a rejection, and hand back the rejection
        # message together with the raw body for assertion diagnostics.
        reply = requests.post(jobs_url, json=payload)
        reply.raise_for_status()
        body = reply.json()
        assert body["result"] is False
        return body["msg"], reply.text

    # Invalid value.
    msg, raw = _submit_rejected({
        "language": "Unsupported",
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
        "driver_entry": "python_file_name_without_ext",
    })
    assert all(p in msg for p in ["language", "Unsupported"]), raw

    # Missing required field.
    msg, raw = _submit_rejected({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
    })
    assert all(p in msg for p in ["missing", "driver_entry"]), raw

    # Incorrect value type.
    msg, raw = _submit_rejected({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": ["http://xxx/yyy.zip"]
        },
        "driver_entry": "python_file_name_without_ext",
    })
    assert all(p in msg for p in ["working_dir", "str"]), raw

    # Invalid key.
    msg, raw = _submit_rejected({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
        "driver_entry": "python_file_name_without_ext",
        "invalid_key": 1,
    })
    assert all(p in msg for p in ["unexpected", "invalid_key"]), raw