def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the summary section of the dashboard memory table.

    Creates two actors that each ``ray.put`` one object, puts one more
    object from the driver, turns on memory fetching through
    ``/memory/set_fetch`` and then polls ``/memory/memory_table`` for the
    expected actor-handle and local-ref totals.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    # ``my_obj`` and ``results`` are never read by name (hence the noqa
    # markers); presumably they stay bound so the refs remain alive while
    # the dashboard counts them -- confirm before removing.
    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    results = ray.get([handle.get_obj.remote() for handle in actors])  # noqa

    webui_url = format_web_url(raw_url)
    fetch_resp = requests.get(
        f"{webui_url}/memory/set_fetch", params={"shouldFetch": "true"})
    fetch_resp.raise_for_status()

    def check_mem_table():
        payload = requests.get(f"{webui_url}/memory/memory_table").json()
        assert payload["result"]
        summary = payload["data"]["memoryTable"]["summary"]
        # 1 ref per handle and per object the actor has a ref to
        assert summary["totalActorHandles"] == 2 * len(actors)
        # 1 ref for my_obj
        assert summary["totalLocalRefCount"] == 1

    wait_until_succeeded_without_exception(
        check_mem_table, (AssertionError, ), timeout_ms=1000)
def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only):
    """Check that ``/test/dump`` is reachable when proxies are disabled.

    The request is retried once per second on ``AssertionError`` or a
    connection error; the ``finally`` clause raises a timeout exception
    once more than 10 seconds have elapsed.
    """
    cluster_info = ray.init(num_cpus=1, include_dashboard=True)
    assert wait_until_server_available(cluster_info["webui_url"]) is True

    dump_url = format_web_url(cluster_info["webui_url"]) + "/test/dump"

    def _probe_dump():
        # A fresh proxies mapping is built for every request.
        response = requests.get(
            dump_url, proxies={
                "http": None,
                "https": None
            })
        response.raise_for_status()
        try:
            response.json()
            assert response.ok
        except Exception:
            logger.info("failed response: %s", response.text)
            raise

    deadline = time.time() + 10
    while True:
        time.sleep(1)
        try:
            _probe_dump()
            break
        except (AssertionError, requests.exceptions.ConnectionError) as err:
            logger.info("Retry because of %s", err)
        finally:
            # Runs on every iteration, including the successful one.
            if time.time() > deadline:
                raise Exception("Timed out while testing.")
def test_get_all_node_details(disable_aiohttp_cache, ray_start_with_dashboard):
    """Poll ``/nodes?view=details`` until one node reports two actors.

    The detailed payload is also expected to carry ``workers`` (two
    entries) and ``logCount`` for the node.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)
    webui_url = format_web_url(raw_url)

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa

    def check_node_details():
        # Decoding and the "data" lookup stay outside the try block, so
        # failures there propagate instead of being reported as "not yet".
        details = requests.get(f"{webui_url}/nodes?view=details").json()
        details = details["data"]
        try:
            clients = details["clients"]
            node = clients[0]
            assert len(clients) == 1
            assert len(node.get("actors")) == 2
            # Workers information should be in the detailed payload
            assert "workers" in node
            assert "logCount" in node
            assert len(node["workers"]) == 2
        except (AssertionError, KeyError, IndexError):
            return False
        return True

    wait_for_condition(check_node_details, 15)
def test_log_proxy(ray_start_with_dashboard):
    """Exercise the ``/log_proxy`` endpoint with a range request and a 404.

    Retries once per second; after 5 seconds the ``finally`` clause raises
    with the formatted traceback of the last recorded failure, if any.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    range_url = f"{webui_url}/log_proxy?url={webui_url}/logs/dashboard.log"
    missing_url = (f"{webui_url}/log_proxy?"
                   f"url={webui_url}/logs/not_exist_file.log")

    deadline = time.time() + 5
    last_ex = None
    while True:
        time.sleep(1)
        try:
            # Test range request.
            ranged = requests.get(range_url, headers={"Range": "bytes=43-51"})
            ranged.raise_for_status()
            assert ranged.text == "Dashboard"
            # Test 404.
            missing = requests.get(missing_url)
            assert missing.status_code == 404
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
def test_profiling(shutdown_only):
    """Launch profiling of an actor process through the dashboard API.

    Polls ``/api/launch_profiling`` once per second until the response
    reports a truthy ``result``, raising ``RayTestTimeoutException`` once
    more than 15 seconds have passed.
    """
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    c = Actor.remote()
    actor_pid = ray.get(c.getpid.remote())

    raw_url = addresses["webui_url"]
    assert wait_until_server_available(raw_url) is True
    profiling_url = format_web_url(raw_url) + "/api/launch_profiling"

    began = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        if time.time() - began > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}")
        query = {
            "ip": ray.nodes()[0]["NodeManagerAddress"],
            "pid": actor_pid,
            "duration": 5
        }
        launch_profiling = requests.get(profiling_url, params=query).json()
        if launch_profiling["result"]:
            profiling_info = launch_profiling["data"]["profilingInfo"]
            break
        time.sleep(1)
    logger.info(profiling_info)
def test_multi_nodes_info(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    """Check node summary, node detail and agent dump on a 3-node cluster.

    Two nodes are added to the head node, then the dashboard is polled
    until it lists three nodes, each with an ``ALIVE`` raylet, and three
    agents.
    """
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)
    for _ in range(2):
        cluster.add_node()

    def _get_json(path):
        reply = requests.get(webui_url + path)
        reply.raise_for_status()
        return reply.json()

    def _check_nodes():
        try:
            summary = _get_json("/nodes?view=summary")
            assert summary["result"] is True, summary["msg"]
            node_summaries = summary["data"]["summary"]
            assert len(node_summaries) == 3
            for node_info in node_summaries:
                node_id = node_info["raylet"]["nodeId"]
                detail = _get_json(f"/nodes/{node_id}")
                assert detail["result"] is True, detail["msg"]
                raylet = detail["data"]["detail"]["raylet"]
                assert raylet["state"] == "ALIVE"
            agents = _get_json("/test/dump?key=agents")
            assert len(agents["data"]["agents"]) == 3
        except Exception as ex:
            logger.info(ex)
            return False
        return True

    wait_for_condition(_check_nodes, timeout=10)
def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    """Submit a job through ``POST /jobs`` and wait until it is running.

    The job is posted once; the summary and detail endpoints are then
    polled for up to 30 seconds until the submitted job is alive and has
    at least one actor and one worker.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    job = _prepare_job_for_test(webui_url)
    session_parent = os.path.dirname(ray_start_with_dashboard["session_dir"])
    job_root_dir = os.path.join(session_parent, "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_id = None
    job_submitted = False

    def _checked_json(resp):
        # Shared tail of every request: HTTP status, decode, "result" flag.
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        return result

    def _check_running():
        nonlocal job_id
        nonlocal job_submitted
        if not job_submitted:
            _checked_json(requests.post(f"{webui_url}/jobs", json=job))
            job_submitted = True

        listing = _checked_json(requests.get(f"{webui_url}/jobs?view=summary"))
        summary = listing["data"]["summary"]
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id when POST /jobs
        # The larger job id is the one we submitted.
        job_id = sorted(s["jobId"] for s in summary)[1]

        first = _checked_json(requests.get(f"{webui_url}/jobs/{job_id}"))
        assert first["data"]["detail"]["jobInfo"]["jobId"] == job_id

        second = _checked_json(requests.get(f"{webui_url}/jobs/{job_id}"))
        detail = second["data"]["detail"]
        assert detail["jobInfo"]["isDead"] is False
        job_actors = detail["jobActors"]
        job_workers = detail["jobWorkers"]
        assert len(job_actors) > 0
        assert len(job_workers) > 0

    wait_until_succeeded_without_exception(
        _check_running,
        exceptions=(AssertionError, KeyError, IndexError),
        timeout_ms=30 * 1000,
        raise_last_ex=True)
def test_kill_actor(ray_start_with_dashboard):
    """Kill an actor through the dashboard's ``/logical/kill_actor`` endpoint.

    Looks the actor up via ``/logical/actor_groups`` and issues the kill
    request, retrying for up to 10 seconds on ``KeyError`` or
    ``AssertionError``. The test passes when the last attempt raised
    neither of those.
    """

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa

    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    # NOTE(review): neither ``actor_killed`` nor ``worker_pid`` is used
    # below, so this test only checks that the endpoint reports success,
    # not that the worker process is gone.
    def actor_killed(pid):
        """Return True if sending signal 0 to ``pid`` raises ``OSError``."""
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        else:
            return False

    def get_actor():
        resp = requests.get(f"{webui_url}/logical/actor_groups")
        resp.raise_for_status()
        actor_groups_resp = resp.json()
        assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
        actor_groups = actor_groups_resp["data"]["actorGroups"]
        return actor_groups["Actor"]["entries"][0]

    def kill_actor_using_dashboard(actor):
        resp = requests.get(
            webui_url + "/logical/kill_actor",
            params={
                "actorId": actor["actorId"],
                "ipAddress": actor["ipAddress"],
                "port": actor["port"]
            })
        resp.raise_for_status()
        resp_json = resp.json()
        # Report the server's message on failure. The previous expression
        # (``"msg" in resp_json``) only produced True/False.
        assert resp_json["result"] is True, resp_json.get("msg")

    start = time.time()
    last_exc = None
    while time.time() - start <= 10:
        try:
            actor = get_actor()
            kill_actor_using_dashboard(actor)
            last_exc = None
            break
        except (KeyError, AssertionError) as e:
            last_exc = e
            time.sleep(.1)
    assert last_exc is None
def test_event_message_limit(enable_event_module, small_event_line_limit,
                             disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the event line-length limit enforced by the ``/events`` endpoint.

    Writes an ``event_GCS.log`` file containing:

    * ten events whose JSON encoding is exactly ``small_event_line_limit``
      characters long,
    * one event that is longer than the limit, and
    * ``EVENT_READ_LINE_COUNT_LIMIT`` short events,

    then polls ``/events`` until the ten at-limit messages and the last
    short message are present and the over-limit message is absent.
    """
    event_read_line_length_limit = small_event_line_limit
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]))
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    session_dir = ray_start_with_dashboard["session_dir"]
    event_dir = os.path.join(session_dir, "logs", "events")
    job_id = ray.JobID.from_int(100).hex()
    events = []
    # Sample event equals with limit.
    sample_event = _get_event("", job_id=job_id)
    # Padding needed so that the encoded event is exactly at the limit.
    message_len = event_read_line_length_limit - len(json.dumps(sample_event))
    for i in range(10):
        sample_event = copy.deepcopy(sample_event)
        sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
        sample_event["message"] = str(i) * message_len
        assert len(json.dumps(sample_event)) == event_read_line_length_limit
        events.append(sample_event)
    # Sample event longer than limit.
    sample_event = copy.deepcopy(sample_event)
    sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
    sample_event["message"] = "2" * (message_len + 1)
    assert len(json.dumps(sample_event)) > event_read_line_length_limit
    events.append(sample_event)

    events.extend(
        _get_event(str(i), job_id=job_id)
        for i in range(event_consts.EVENT_READ_LINE_COUNT_LIMIT))

    tmp_path = os.path.join(event_dir, "tmp.log")
    final_path = os.path.join(event_dir, "event_GCS.log")
    with open(tmp_path, "w") as f:
        f.writelines(json.dumps(e) + "\n" for e in events)

    # os.replace overwrites an existing destination on every platform, so
    # the former "remove and ignore any error, then rename" dance (and its
    # blanket ``except Exception: pass``) is no longer needed.
    os.replace(tmp_path, final_path)

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            result = resp.json()
            job_events = result["data"]["events"][job_id]
            assert (len(job_events) >=
                    event_consts.EVENT_READ_LINE_COUNT_LIMIT + 10)
            messages = [e["message"] for e in job_events]
            for i in range(10):
                assert str(i) * message_len in messages
            assert "2" * (message_len + 1) not in messages
            assert str(event_consts.EVENT_READ_LINE_COUNT_LIMIT -
                       1) in messages
            return True
        except Exception as ex:
            logger.exception(ex)
            return False

    wait_for_condition(_check_events, timeout=15)
def test_snapshot(ray_start_with_dashboard):
    """Validate ``/api/snapshot`` after running three short-lived drivers.

    Each driver creates one ``Pinger`` actor: one detached and named, one
    named, one unnamed. The snapshot must validate against
    ``snapshot_schema.json`` and contain three actors, four jobs and no
    deployments.
    """
    driver_template = """
import ray

ray.init(address="{address}", namespace="my_namespace")

@ray.remote
class Pinger:
    def ping(self):
        return "pong"

a = Pinger.options(lifetime={lifetime}, name={name}).remote()
ray.get(a.ping.remote())
    """

    detached_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="'detached'",
        name="'abc'")
    named_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="'xyz'")
    unnamed_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="None")

    run_string_as_driver(detached_driver)
    run_string_as_driver(named_driver)
    run_string_as_driver(unnamed_driver)

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    # Read the schema inside a context manager so the file handle is
    # closed deterministically instead of being left to the GC.
    with open(schema_path) as schema_file:
        schema = json.load(schema_file)
    jsonschema.validate(instance=data, schema=schema)

    snapshot = data["data"]["snapshot"]
    assert len(snapshot["actors"]) == 3
    assert len(snapshot["jobs"]) == 4
    assert len(snapshot["deployments"]) == 0

    for entry in snapshot["actors"].values():
        assert entry["jobId"] in snapshot["jobs"]
        assert entry["actorClass"] == "Pinger"
        assert entry["startTime"] >= 0
        if entry["isDetached"]:
            assert entry["endTime"] == 0, entry
        else:
            assert entry["endTime"] > 0, entry
        assert "runtimeEnv" in entry
    assert snapshot["rayCommit"] == ray.__commit__
    assert snapshot["rayVersion"] == ray.__version__
def test_actor_groups(ray_start_with_dashboard):
    """Check the ``/logical/actor_groups`` payload for two actor classes.

    Two ``Foo`` actors each run one task, and an ``InfeasibleActor`` that
    requests a GPU is created. The endpoint is retried once per second;
    after 5 seconds the ``finally`` clause raises with the traceback of
    the last recorded failure, if any.
    """

    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [handle.do_task.remote() for handle in foo_actors]  # noqa

    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    groups_url = format_web_url(raw_url) + "/logical/actor_groups"

    deadline = time.time() + 5
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(groups_url)
            response.raise_for_status()
            body = response.json()
            assert body["result"] is True, body["msg"]
            actor_groups = body["data"]["actorGroups"]

            assert "Foo" in actor_groups
            foo_group = actor_groups["Foo"]
            foo_summary = foo_group["summary"]
            # 2 __init__ tasks and 2 do_task tasks
            assert foo_summary["numExecutedTasks"] == 4
            assert foo_summary["stateToCount"]["ALIVE"] == 2
            assert len(foo_group["entries"]) == 2

            assert "InfeasibleActor" in actor_groups
            first_infeasible = actor_groups["InfeasibleActor"]["entries"][0]
            assert "requiredResources" in first_infeasible
            assert "GPU" in first_infeasible["requiredResources"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
def test_actors(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the shape of the entries returned by ``/logical/actors``.

    Two ``Foo`` actors and one ``InfeasibleActor`` (requesting a GPU) are
    created; three entries are expected, and one of them must report pid 0.
    Retries once per second and times out after 5 seconds.
    """

    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [handle.do_task.remote() for handle in foo_actors]  # noqa

    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)
    webui_url = format_web_url(raw_url)

    required_keys = ("state", "name", "numRestarts", "pid")

    deadline = time.time() + 5
    last_ex = None
    while True:
        time.sleep(1)
        try:
            payload = requests.get(f"{webui_url}/logical/actors").json()
            actors = payload["data"]["actors"]
            assert len(actors) == 3

            one_entry = list(actors.values())[0]
            assert "jobId" in one_entry
            assert "taskSpec" in one_entry
            task_spec = one_entry["taskSpec"]
            assert "functionDescriptor" in task_spec
            assert type(task_spec["functionDescriptor"]) is dict
            assert "address" in one_entry
            assert type(one_entry["address"]) is dict
            for key in required_keys:
                assert key in one_entry

            all_pids = {entry["pid"] for entry in actors.values()}
            assert 0 in all_pids  # The infeasible actor
            assert len(all_pids) > 1
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    """Check per-process log lines returned by the ``/node_logs`` endpoint.

    Two ``LoggingActor`` instances print 4 lines and 1 line respectively.
    The endpoint is then queried for the whole node and for the first
    actor's pid only.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    # Pids are kept as strings because they are compared against JSON
    # object keys, which are always strings.
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert isinstance(node_logs["data"]["logs"], dict)
        assert all(
            pid in node_logs["data"]["logs"] for pid in (la_pid, la2_pid))
        assert len(node_logs["data"]["logs"][la2_pid]) == 1

        actor_one_logs_response = requests.get(
            f"{webui_url}/node_logs",
            params={
                "ip": node_ip,
                "pid": la_pid
            })
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert isinstance(actor_one_logs["data"]["logs"], dict)
        assert len(actor_one_logs["data"]["logs"][la_pid]) == 4

    # ``(AssertionError)`` without a trailing comma is just the class, not
    # a tuple; pass a real one-element tuple.
    # NOTE(review): the helper's return value is still discarded here, so
    # a check that never succeeds may not fail the test -- confirm against
    # the helper's contract.
    wait_until_succeeded_without_exception(
        check_logs, (AssertionError, ), timeout_ms=1000)
def test_get_all_node_details(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check node details, including log counts, from ``/nodes?view=details``.

    Two actors each print one line on construction. The details endpoint
    is retried once per second for up to 20 seconds on ``AssertionError``,
    ``KeyError`` or ``IndexError``.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)
    webui_url = format_web_url(raw_url)

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            print("I also log a line")
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    deadline = time.time() + 20
    last_ex = None

    def check_node_details():
        payload = requests.get(f"{webui_url}/nodes?view=details").json()
        clients = payload["data"]["clients"]
        node = clients[0]
        assert len(clients) == 1
        assert len(node.get("actors")) == 2
        # Workers information should be in the detailed payload
        assert "workers" in node
        assert "logCount" in node
        # Two lines printed by ActorWithObjs
        # One line printed by autoscaler: monitor.py:118 -- Monitor: Started
        assert node["logCount"] > 2
        workers = node["workers"]
        print(workers)
        assert len(workers) == 2
        assert workers[0]["logCount"] == 1

    while True:
        time.sleep(1)
        try:
            check_node_details()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            last_ex = ex
        finally:
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
def test_event_basic(enable_event_module, disable_aiohttp_cache,
                     ray_start_with_dashboard):
    """Write GCS and RAYLET events to log files and read them via ``/events``.

    Twenty events per source type are logged through a rotating test
    logger; the endpoint is polled until every message ``"0"``..``"19"``
    is reported for both source types.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url)
    webui_url = format_web_url(raw_url)
    event_dir = os.path.join(ray_start_with_dashboard["session_dir"], "logs",
                             "events")
    job_id = ray.JobID.from_int(100).hex()

    source_type_gcs = event_pb2.Event.SourceType.Name(event_pb2.Event.GCS)
    source_type_raylet = event_pb2.Event.SourceType.Name(
        event_pb2.Event.RAYLET)
    test_count = 20

    for source_type in (source_type_gcs, source_type_raylet):
        log_path = os.path.join(event_dir, f"event_{source_type}.log")
        # A random suffix gives each source type its own logger name.
        event_logger = _test_logger(
            __name__ + str(random.random()),
            log_path,
            max_bytes=2000,
            backup_count=1000)
        for idx in range(test_count):
            event = _get_event(
                str(idx), job_id=job_id, source_type=source_type)
            event_logger.info("%s", json.dumps(event))

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            job_events = resp.json()["data"]["events"][job_id]
            assert len(job_events) >= test_count * 2

            # Group the distinct messages by the source that emitted them.
            by_source = {}
            for item in job_events:
                kind = item["sourceType"]
                text = item["message"]
                by_source.setdefault(kind, set()).add(text)
            assert len(by_source[source_type_gcs]) >= test_count
            assert len(by_source[source_type_raylet]) >= test_count

            expected = {str(idx) for idx in range(test_count)}
            assert expected.issubset(by_source[source_type_gcs])
            assert expected.issubset(by_source[source_type_raylet])
        except Exception as ex:
            logger.exception(ex)
            return False
        return True

    wait_for_condition(_check_events, timeout=15)
def test_get_cluster_status(ray_start_with_dashboard):
    """Check ``/api/cluster_status`` before and after populating Redis.

    First the endpoint must answer with ``None`` for both autoscaling
    fields; after the two debug hashes are written to Redis the same
    fields must return the written values.
    """
    address_info = ray_start_with_dashboard
    assert wait_until_server_available(address_info["webui_url"]) is True
    status_url = format_web_url(address_info["webui_url"]) \
        + "/api/cluster_status"

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(status_url)
        response.raise_for_status()
        payload = response.json()
        print(payload)
        assert payload["result"]
        data = payload["data"]
        assert "autoscalingStatus" in data
        assert data["autoscalingStatus"] is None
        assert "autoscalingError" in data
        assert data["autoscalingError"] is None
        assert "clusterStatus" in data
        assert "loadMetricsReport" in data["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException, ))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    host_and_port = address_info["redis_address"].split(":")
    assert len(host_and_port) == 2
    redis_host, redis_port = host_and_port

    client = redis.StrictRedis(
        host=redis_host,
        port=int(redis_port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)

    client.hset(DEBUG_AUTOSCALING_STATUS_LEGACY, "value", "hello")
    client.hset(DEBUG_AUTOSCALING_ERROR, "value", "world")

    response = requests.get(status_url)
    response.raise_for_status()
    payload = response.json()
    assert payload["result"]
    data = payload["data"]
    assert "autoscalingStatus" in data
    assert data["autoscalingStatus"] == "hello"
    assert "autoscalingError" in data
    assert data["autoscalingError"] == "world"
    assert "clusterStatus" in data
    assert "loadMetricsReport" in data["clusterStatus"]
def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    """Check per-process error entries returned by ``/node_logs``.

    An ``ErrorActor`` raises ``ValueError`` from one of its methods; the
    endpoint is then queried for the whole node and for the actor's pid.
    """
    cluster = ray_start_cluster_head
    assert (wait_until_server_available(cluster.webui_url) is True)
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor():
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    # Resolve the ObjectRef to the real pid and keep it as a string: it is
    # compared against JSON object keys (always strings) and sent as a
    # query parameter. Previously the unresolved ObjectRef itself was used,
    # which can never match a key in the decoded response.
    ea_pid = str(ray.get(ea.get_pid.remote()))
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        assert isinstance(node_errs["data"]["errors"], dict)
        assert ea_pid in node_errs["data"]["errors"]
        assert len(node_errs["data"]["errors"][ea_pid]) == 1

        actor_err_response = requests.get(
            f"{webui_url}/node_logs",
            params={
                "ip": node_ip,
                "pid": ea_pid
            })
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        assert isinstance(actor_errs["data"]["errors"], dict)
        # NOTE(review): the node-wide query above expects 1 entry for this
        # pid while this one expects 4 -- confirm the intended count.
        assert len(actor_errs["data"]["errors"][ea_pid]) == 4

    # ``(AssertionError)`` without a trailing comma is just the class, not
    # a tuple; pass a real one-element tuple.
    # NOTE(review): the helper's return value is still discarded here, so
    # a check that never succeeds may not fail the test.
    wait_until_succeeded_without_exception(
        check_errs, (AssertionError, ), timeout_ms=1000)
def test_http_get(enable_test_module, ray_start_with_dashboard):
    """Fetch ``/test/dump`` indirectly via the head and via an agent.

    The head's ``/test/http_get`` and the agent's
    ``/test/http_get_from_agent`` are both asked to fetch the dump URL.
    Retries once per second on ``AssertionError`` or a connection error,
    and times out after 10 seconds.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    target_url = webui_url + "/test/dump"

    def _decode(response):
        # Log the raw body before re-raising when it is not valid JSON.
        try:
            return response.json()
        except Exception:
            logger.info("failed response: %s", response.text)
            raise

    def _attempt():
        via_head = requests.get(webui_url + "/test/http_get?url=" +
                                target_url)
        via_head.raise_for_status()
        dump_info = _decode(via_head)
        assert dump_info["result"] is True
        agents = dump_info["data"]["agents"]
        assert len(agents) == 1
        _, ports = next(iter(agents.items()))
        ip = ray_start_with_dashboard["node_ip_address"]
        http_port, _ = ports

        via_agent = requests.get(
            f"http://{ip}:{http_port}"
            f"/test/http_get_from_agent?url={target_url}")
        via_agent.raise_for_status()
        agent_info = _decode(via_agent)
        assert agent_info["result"] is True

    deadline = time.time() + 10
    while True:
        time.sleep(1)
        try:
            _attempt()
            break
        except (AssertionError, requests.exceptions.ConnectionError) as err:
            logger.info("Retry because of %s", err)
        finally:
            # Runs on every iteration, including the successful one.
            if time.time() > deadline:
                raise Exception("Timed out while testing.")
def test_nodes_update(enable_test_module, ray_start_with_dashboard):
    """Check that the dump and notified-agents endpoints agree for one node.

    Retries once per second on ``AssertionError`` or a connection error,
    and times out after 10 seconds.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(raw_url) is True
    webui_url = format_web_url(raw_url)

    def _fetch_json(path):
        response = requests.get(webui_url + path)
        response.raise_for_status()
        try:
            return response.json()
        except Exception:
            # Log the raw body before re-raising when it is not valid JSON.
            logger.info("failed response: %s", response.text)
            raise

    def _attempt():
        dump_info = _fetch_json("/test/dump")
        assert dump_info["result"] is True
        dump_data = dump_info["data"]
        assert len(dump_data["nodes"]) == 1
        assert len(dump_data["agents"]) == 1
        assert len(dump_data["nodeIdToIp"]) == 1
        assert len(dump_data["nodeIdToHostname"]) == 1
        node_keys = dump_data["nodes"].keys()
        assert node_keys == dump_data["nodeIdToHostname"].keys()

        notified = _fetch_json("/test/notified_agents")
        assert notified["result"] is True
        notified_agents = notified["data"]
        assert len(notified_agents) == 1
        assert notified_agents == dump_data["agents"]

    deadline = time.time() + 10
    while True:
        time.sleep(1)
        try:
            _attempt()
            break
        except (AssertionError, requests.exceptions.ConnectionError) as err:
            logger.info("Retry because of %s", err)
        finally:
            # Runs on every iteration, including the successful one.
            if time.time() > deadline:
                raise Exception("Timed out while testing.")
def test_node_physical_stats(enable_test_module, shutdown_only):
    """Check that node physical stats list every actor worker process.

    Six actors are started; the pids of workers whose command line
    contains ``ray::Actor`` must equal the pids the actors report.
    """
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = set(ray.get([handle.getpid.remote() for handle in actors]))

    raw_url = addresses["webui_url"]
    assert wait_until_server_available(raw_url) is True
    stats_url = format_web_url(raw_url) + "/test/dump?key=node_physical_stats"

    def _check_workers():
        try:
            resp = requests.get(stats_url)
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True
            node_physical_stats = result["data"]["nodePhysicalStats"]
            assert len(node_physical_stats) == 1
            current_stats = node_physical_stats[addresses["node_id"]]
            # Check Actor workers
            current_actor_pids = {
                worker["pid"]
                for worker in current_stats["workers"]
                if "ray::Actor" in worker["cmdline"][0]
            }
            assert current_actor_pids == actor_pids
            # Check raylet cmdline
            assert "raylet" in current_stats["cmdline"][0]
        except Exception as ex:
            logger.info(ex)
            return False
        return True

    wait_for_condition(_check_workers, timeout=10)
def test_nil_node(enable_test_module, disable_aiohttp_cache,
                  ray_start_with_dashboard):
    """Check that an infeasible actor is not filed under the nil node id.

    An actor requesting a GPU is created; it must appear in
    ``/logical/actors`` while ``NIL_NODE_ID`` must be absent from the
    ``nodeActors`` dump. Retries once per second, timing out after 5
    seconds.
    """
    raw_url = ray_start_with_dashboard["webui_url"]
    # The availability check is performed twice, as in the original test.
    assert wait_until_server_available(raw_url) is True
    assert wait_until_server_available(raw_url)
    webui_url = format_web_url(raw_url)

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    infeasible_actor = InfeasibleActor.remote()  # noqa

    deadline = time.time() + 5
    last_ex = None
    while True:
        time.sleep(1)
        try:
            listing = requests.get(f"{webui_url}/logical/actors").json()
            assert len(listing["data"]["actors"]) == 1

            dump = requests.get(webui_url + "/test/dump?key=node_actors")
            dump.raise_for_status()
            node_actors = dump.json()["data"]["nodeActors"]
            assert stats_collector_consts.NIL_NODE_ID not in node_actors
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > deadline:
                if last_ex:
                    frames = traceback.format_exception(
                        type(last_ex), last_ex, last_ex.__traceback__)
                else:
                    frames = []
                ex_stack = "".join(frames)
                raise Exception(f"Timed out while testing, {ex_stack}")
def test_multi_node_churn(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    """Keep requesting the dashboard index while nodes are added and removed.

    A daemon thread adds or removes a worker node every 5 seconds; the
    main thread requests the index page every 2 seconds for 60 seconds.
    """
    cluster: Cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)

    def cluster_chaos_monkey():
        live_workers = []
        while True:
            time.sleep(5)
            # Always grow to two workers first, then add/remove at random.
            if len(live_workers) >= 2 and not random.randint(0, 1):
                victim_index = random.randrange(0, len(live_workers))
                cluster.remove_node(live_workers.pop(victim_index))
            else:
                live_workers.append(cluster.add_node())

    def get_index():
        resp = requests.get(webui_url)
        resp.raise_for_status()

    # NOTE(review): get_nodes is defined but never called below.
    def get_nodes():
        resp = requests.get(webui_url + "/nodes?view=summary")
        resp.raise_for_status()
        summary = resp.json()
        assert summary["result"] is True, summary["msg"]
        assert summary["data"]["summary"]

    monkey = threading.Thread(target=cluster_chaos_monkey, daemon=True)
    monkey.start()

    stop_at = datetime.now() + timedelta(seconds=60)
    while datetime.now() < stop_at:
        get_index()
        time.sleep(2)
def test_serve_snapshot(ray_start_with_dashboard):
    """Test detached and nondetached Serve instances running concurrently.

    A detached Serve instance is started from a separate driver script and
    a non-detached one from this process on port 8123. ``/api/snapshot``
    must then validate against ``snapshot_schema.json`` and describe both
    deployments and their replica actors.
    """

    detached_serve_driver_script = f"""
import ray
from ray import serve

ray.init(
    address="{ray_start_with_dashboard['redis_address']}",
    namespace="serve")

serve.start(detached=True)

@serve.deployment(version="v1")
def my_func(request):
    return "hello"

my_func.deploy()
    """

    run_string_as_driver(detached_serve_driver_script)
    assert requests.get("http://127.0.0.1:8000/my_func").text == "hello"

    # Use a new port to avoid clobbering the first Serve instance.
    serve.start(http_options={"port": 8123})

    @serve.deployment(version="v1")
    def my_func_nondetached(request):
        return "hello"

    my_func_nondetached.deploy()

    assert requests.get(
        "http://127.0.0.1:8123/my_func_nondetached").text == "hello"

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    # Read the schema inside a context manager so the file handle is
    # closed deterministically instead of being left to the GC.
    with open(schema_path) as schema_file:
        schema = json.load(schema_file)
    jsonschema.validate(instance=data, schema=schema)

    snapshot = data["data"]["snapshot"]
    assert len(snapshot["deployments"]) == 2

    entry = snapshot["deployments"]["myFunc"]
    assert entry["name"] == "my_func"
    assert entry["version"] == "v1"
    assert entry["namespace"] == "serve"
    assert entry["httpRoute"] == "/my_func"
    assert entry["className"] == "my_func"
    assert entry["status"] == "RUNNING"
    assert entry["rayJobId"] is not None
    assert entry["startTime"] > 0
    assert entry["endTime"] == 0

    assert len(entry["actors"]) == 1
    actor_id = next(iter(entry["actors"]))
    metadata = snapshot["actors"][actor_id]["metadata"]["serve"]
    assert metadata["deploymentName"] == "my_func"
    assert metadata["version"] == "v1"
    assert len(metadata["replicaTag"]) > 0

    entry_nondetached = snapshot["deployments"]["myFuncNondetached"]
    assert entry_nondetached["name"] == "my_func_nondetached"
    assert entry_nondetached["version"] == "v1"
    assert entry_nondetached["namespace"] == ""
    assert entry_nondetached["httpRoute"] == "/my_func_nondetached"
    assert entry_nondetached["className"] == "my_func_nondetached"
    assert entry_nondetached["status"] == "RUNNING"
    assert entry_nondetached["rayJobId"] is not None
    assert entry_nondetached["startTime"] > 0
    assert entry_nondetached["endTime"] == 0

    assert len(entry_nondetached["actors"]) == 1
    actor_id = next(iter(entry_nondetached["actors"]))
    metadata = snapshot["actors"][actor_id]["metadata"]["serve"]
    assert metadata["deploymentName"] == "my_func_nondetached"
    assert metadata["version"] == "v1"
    assert len(metadata["replicaTag"]) > 0
def test_node_info(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the hostname-list, node-detail and node-summary endpoints.

    Two actors are started so that the node detail can be checked for two
    actors and for worker processes whose pids match the actors' pids.
    The checks are retried once per second until they pass or until
    ``timeout_seconds`` have elapsed.

    Raises:
        Exception: if the checks still fail after the timeout; the message
            embeds the traceback of the last failed attempt.
    """

    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote(), Actor.remote()]
    actor_pids = [actor.getpid.remote() for actor in actors]
    actor_pids = set(ray.get(actor_pids))

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/nodes?view=hostnamelist")
            response.raise_for_status()
            hostname_list = response.json()
            assert hostname_list["result"] is True, hostname_list["msg"]
            hostname_list = hostname_list["data"]["hostNameList"]
            assert len(hostname_list) == 1

            hostname = hostname_list[0]
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            detail = response.json()
            assert detail["result"] is True, detail["msg"]
            detail = detail["data"]["detail"]
            assert detail["hostname"] == hostname
            assert detail["raylet"]["state"] == "ALIVE"
            assert "raylet" in detail["cmdline"][0]
            assert len(detail["workers"]) >= 2
            assert len(detail["actors"]) == 2, detail["actors"]
            assert len(detail["raylet"]["viewData"]) > 0

            # The workers hosting the actors must be exactly the processes
            # whose pids the actors reported.
            actor_worker_pids = {
                worker["pid"]
                for worker in detail["workers"]
                if "ray::Actor" in worker["cmdline"][0]
            }
            assert actor_worker_pids == actor_pids

            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            assert len(summary["data"]["summary"]) == 1
            summary = summary["data"]["summary"][0]
            assert summary["hostname"] == hostname
            assert summary["raylet"]["state"] == "ALIVE"
            assert "raylet" in summary["cmdline"][0]
            # The summary view must not carry the heavy detail fields.
            assert "workers" not in summary
            assert "actors" not in summary
            assert "viewData" not in summary["raylet"]
            break
        except Exception as ex:
            last_ex = ex
        # Only a failed attempt can time out.  Checking the deadline in a
        # `finally` clause would also run on the successful `break` and
        # turn a late pass into a spurious timeout failure.
        if time.time() > start_time + timeout_seconds:
            ex_stack = "".join(
                traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__))
            raise Exception(
                f"Timed out while testing, {ex_stack}") from last_ex
def test_log(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check that worker output is reachable through the log endpoints.

    A remote task prints a marker string; the test then walks the
    ``/log_index`` page, follows the single node's log listing, and looks
    for the marker in the worker log files.  It also checks an HTTP range
    request against ``dashboard.log`` and that the node detail exposes a
    ``logUrl`` contained in the log index.  The checks are retried once
    per second until they pass or until ``timeout_seconds`` have elapsed.

    Raises:
        Exception: if the checks still fail after the timeout; the message
            embeds the traceback of the last failed attempt.
    """

    @ray.remote
    def write_log(s):
        print(s)

    test_log_text = "test_log_text"
    ray.get(write_log.remote(test_log_text))
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/log_index")
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)
            all_nodes_log_urls = parser.get_urls()
            assert len(all_nodes_log_urls) == 1

            response = requests.get(all_nodes_log_urls[0])
            response.raise_for_status()
            parser = LogUrlParser()
            parser.feed(response.text)

            # Search test_log_text from all worker logs.
            parsed_url = urllib.parse.urlparse(all_nodes_log_urls[0])
            urls = [
                parsed_url._replace(path=p).geturl()
                for p in parser.get_urls() if "worker" in p
            ]

            for u in urls:
                response = requests.get(u)
                response.raise_for_status()
                if test_log_text in response.text:
                    break
            else:
                # No worker log contained the marker (or there were none).
                raise Exception(f"Can't find {test_log_text} from {urls}")

            # Test range request.
            response = requests.get(
                webui_url + "/logs/dashboard.log",
                headers={"Range": "bytes=43-51"})
            response.raise_for_status()
            assert response.text == "Dashboard"

            # Test logUrl in node info.
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            node_info = response.json()
            assert node_info["result"] is True
            node_info = node_info["data"]["detail"]
            assert "logUrl" in node_info
            assert node_info["logUrl"] in all_nodes_log_urls
            break
        except Exception as ex:
            last_ex = ex
        # Only a failed attempt can time out.  Checking the deadline in a
        # `finally` clause would also run on the successful `break` and
        # turn a late pass into a spurious timeout failure.
        if time.time() > start_time + timeout_seconds:
            ex_stack = "".join(
                traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__))
            raise Exception(
                f"Timed out while testing, {ex_stack}") from last_ex
def test_aiohttp_cache(enable_test_module, ray_start_with_dashboard):
    """Exercise the test module's cached HTTP endpoints.

    The timestamp returned by each endpoint is used to tell distinct
    responses apart; the assertions below count how many distinct
    timestamps come back for repeated requests, for differing sub paths,
    for differing query values, and for the ``aiohttp_cache_lru``
    endpoints.

    Raises:
        Exception: if the first check (more than one distinct timestamp
            for the same URL) does not pass within ``timeout_seconds``.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    def _fetch_timestamp(path):
        """GET `path` on the dashboard and return the payload timestamp."""
        response = requests.get(webui_url + path)
        response.raise_for_status()
        return response.json()["data"]["timestamp"]

    timeout_seconds = 5
    start_time = time.time()
    # Deliberately kept across retries: timestamps from earlier attempts
    # count towards the "more than one distinct value" check.
    value1_timestamps = []
    while True:
        time.sleep(1)
        try:
            for _ in range(10):
                value1_timestamps.append(
                    _fetch_timestamp("/test/aiohttp_cache/t1?value=1"))
            assert len(set(value1_timestamps)) > 1
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        # Only a failed attempt can time out.  Checking the deadline in a
        # `finally` clause would also run on the successful `break` and
        # turn a late pass into a spurious timeout failure.
        if time.time() > start_time + timeout_seconds:
            raise Exception("Timed out while testing.")

    # Different sub paths must each produce their own response.
    sub_path_timestamps = [
        _fetch_timestamp(f"/test/aiohttp_cache/tt{x}?value=1")
        for x in range(10)
    ]
    assert len(set(sub_path_timestamps)) == 10

    # Different query values must each produce their own response.
    volatile_value_timestamps = [
        _fetch_timestamp(f"/test/aiohttp_cache/tt?value={x}")
        for x in range(10)
    ]
    assert len(set(volatile_value_timestamps)) == 10

    response = requests.get(webui_url + "/test/aiohttp_cache/raise_exception")
    response.raise_for_status()
    result = response.json()
    assert result["result"] is False
    assert "KeyError" in result["msg"]

    # Ten requests over four distinct sub paths yield four timestamps.
    volatile_value_timestamps = [
        _fetch_timestamp(f"/test/aiohttp_cache_lru/tt{x % 4}")
        for x in range(10)
    ]
    assert len(set(volatile_value_timestamps)) == 4

    volatile_value_timestamps = []
    data = collections.defaultdict(set)
    for x in [0, 1, 2, 3, 4, 5, 2, 1, 0, 3]:
        timestamp = _fetch_timestamp(f"/test/aiohttp_cache_lru/t1?value={x}")
        data[x].add(timestamp)
        volatile_value_timestamps.append(timestamp)
    assert len(set(volatile_value_timestamps)) == 8
    # Values 3 and 0 are each expected to have been served twice with
    # different timestamps over the request sequence above.
    assert len(data[3]) == 2
    assert len(data[0]) == 2
def test_get_job_info(disable_aiohttp_cache, ray_start_with_dashboard):
    """Check the job summary and job detail endpoints for the driver job.

    One actor is started so that the job detail can be checked for exactly
    one actor and one worker.  ``_check`` is retried once per second until
    it passes or until ``timeout_seconds`` have elapsed; only
    ``AssertionError``, ``KeyError`` and ``IndexError`` are retried, any
    other exception propagates immediately.

    Raises:
        Exception: if the checks still fail after the timeout; the message
            embeds the traceback of the last failed attempt.
    """

    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actor = Actor.remote()
    actor_pid = ray.get(actor.getpid.remote())
    actor_id = actor._actor_id.hex()

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    ip = ray._private.services.get_node_ip_address()

    def _check():
        """Run one round of assertions against the jobs endpoints."""
        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_summary = result["data"]["summary"]
        assert len(job_summary) == 1, resp.text
        one_job = job_summary[0]
        assert "jobId" in one_job
        job_id = one_job["jobId"]
        assert ray._raylet.JobID(hex_to_binary(job_id))
        assert "driverIpAddress" in one_job
        assert one_job["driverIpAddress"] == ip
        assert "driverPid" in one_job
        assert one_job["driverPid"] == str(os.getpid())
        assert "config" in one_job
        assert isinstance(one_job["config"], dict)
        assert "isDead" in one_job
        assert one_job["isDead"] is False
        assert "timestamp" in one_job

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_detail = result["data"]["detail"]
        assert "jobInfo" in job_detail
        # Every key of the summary entry must also appear in the detail.
        assert one_job.keys() <= job_detail["jobInfo"].keys()
        assert "jobActors" in job_detail
        job_actors = job_detail["jobActors"]
        assert len(job_actors) == 1, resp.text
        one_job_actor = job_actors[actor_id]
        assert "taskSpec" in one_job_actor
        assert isinstance(one_job_actor["taskSpec"], dict)
        assert "functionDescriptor" in one_job_actor["taskSpec"]
        assert isinstance(one_job_actor["taskSpec"]["functionDescriptor"],
                          dict)
        assert "pid" in one_job_actor
        assert one_job_actor["pid"] == actor_pid
        check_actor_keys = [
            "name", "timestamp", "address", "actorId", "jobId", "state"
        ]
        for k in check_actor_keys:
            assert k in one_job_actor
        assert "jobWorkers" in job_detail
        job_workers = job_detail["jobWorkers"]
        assert len(job_workers) == 1, resp.text
        one_job_worker = job_workers[0]
        check_worker_keys = [
            "cmdline", "pid", "cpuTimes", "memoryInfo", "cpuPercent",
            "coreWorkerStats", "language", "jobId"
        ]
        for k in check_worker_keys:
            assert k in one_job_worker

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            _check()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            last_ex = ex
        # Only a failed attempt can time out.  Checking the deadline in a
        # `finally` clause would also run on the successful `break` (and
        # on unrelated exceptions), masking the real outcome.
        if time.time() > start_time + timeout_seconds:
            ex_stack = "".join(
                traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__))
            raise Exception(
                f"Timed out while testing, {ex_stack}") from last_ex
def test_submit_job_validation(ray_start_with_dashboard):
    """Check that malformed job submissions are rejected with a clear msg.

    After waiting until the jobs endpoint no longer answers with
    "no nodes available", four invalid payloads are posted: an unsupported
    language, a missing required field, a wrongly typed value and an
    unexpected key.  Each must come back with ``result`` False and a
    message naming the problem.
    """
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    session_parent = os.path.dirname(ray_start_with_dashboard["session_dir"])
    job_root_dir = os.path.join(session_parent, "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    def _ensure_available_nodes():
        reply = requests.post(f"{webui_url}/jobs")
        reply.raise_for_status()
        body = reply.json()
        assert body["result"] is False
        return "no nodes available" not in body["msg"]

    wait_for_condition(_ensure_available_nodes, timeout=5)

    def _post_rejected_job(payload):
        """Post `payload`, assert it was rejected, return (resp, msg)."""
        reply = requests.post(f"{webui_url}/jobs", json=payload)
        reply.raise_for_status()
        body = reply.json()
        assert body["result"] is False
        return reply, body["msg"]

    # Invalid value.
    resp, msg = _post_rejected_job({
        "language": "Unsupported",
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
        "driver_entry": "python_file_name_without_ext",
    })
    assert all(p in msg for p in ["language", "Unsupported"]), resp.text

    # Missing required field.
    resp, msg = _post_rejected_job({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
    })
    assert all(p in msg for p in ["missing", "driver_entry"]), resp.text

    # Incorrect value type.
    resp, msg = _post_rejected_job({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": ["http://xxx/yyy.zip"]
        },
        "driver_entry": "python_file_name_without_ext",
    })
    assert all(p in msg for p in ["working_dir", "str"]), resp.text

    # Invalid key.
    resp, msg = _post_rejected_job({
        "language": job_consts.PYTHON,
        "runtime_env": {
            "working_dir": "http://xxx/yyy.zip"
        },
        "driver_entry": "python_file_name_without_ext",
        "invalid_key": 1,
    })
    assert all(p in msg for p in ["unexpected", "invalid_key"]), resp.text