def test_submit_job(disable_aiohttp_cache, enable_test_module,
                    ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    job = _prepare_job_for_test(webui_url)
    job_root_dir = os.path.join(
        os.path.dirname(ray_start_with_dashboard["session_dir"]), "job")
    shutil.rmtree(job_root_dir, ignore_errors=True)

    job_id = None
    job_submitted = False

    def _check_running():
        nonlocal job_id
        nonlocal job_submitted
        if not job_submitted:
            resp = requests.post(f"{webui_url}/jobs", json=job)
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True, resp.text
            job_submitted = True

        resp = requests.get(f"{webui_url}/jobs?view=summary")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        summary = result["data"]["summary"]
        assert len(summary) == 2

        # TODO(fyrestone): Return a job id when POST /jobs
        # The larger job id is the one we submitted.
        job_ids = sorted(s["jobId"] for s in summary)
        job_id = job_ids[1]

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_info = result["data"]["detail"]["jobInfo"]
        assert job_info["jobId"] == job_id

        resp = requests.get(f"{webui_url}/jobs/{job_id}")
        resp.raise_for_status()
        result = resp.json()
        assert result["result"] is True, resp.text
        job_info = result["data"]["detail"]["jobInfo"]
        assert job_info["isDead"] is False
        job_actors = result["data"]["detail"]["jobActors"]
        job_workers = result["data"]["detail"]["jobWorkers"]
        assert len(job_actors) > 0
        assert len(job_workers) > 0

    wait_until_succeeded_without_exception(
        _check_running,
        exceptions=(AssertionError, KeyError, IndexError),
        timeout_ms=30 * 1000,
        raise_last_ex=True)

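# `_prepare_job_for_test` is defined elsewhere in this module. The sketch
# below is a hypothetical minimal version, assuming POST /jobs accepts a JSON
# payload describing the driver to launch; the field names here are
# illustrative assumptions, not the confirmed API.
def _prepare_job_for_test(webui_url):
    # A trivial Python driver that connects to the cluster and exits.
    return {
        "language": "PYTHON",
        "runtimeEnv": {},
        "driverEntry": "python -c 'import ray; ray.init()'",
    }
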
def test_multi_nodes_info(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    cluster: Cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    cluster.add_node()
    cluster.add_node()

    def _check_nodes():
        try:
            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            summary = summary["data"]["summary"]
            assert len(summary) == 3
            for node_info in summary:
                node_id = node_info["raylet"]["nodeId"]
                response = requests.get(webui_url + f"/nodes/{node_id}")
                response.raise_for_status()
                detail = response.json()
                assert detail["result"] is True, detail["msg"]
                detail = detail["data"]["detail"]
                assert detail["raylet"]["state"] == "ALIVE"
            response = requests.get(webui_url + "/test/dump?key=agents")
            response.raise_for_status()
            agents = response.json()
            assert len(agents["data"]["agents"]) == 3
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_nodes, timeout=15)

def test_kill_actor(ray_start_with_dashboard):
    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa

    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    def actor_killed(pid):
        """Check for the existence of a unix pid."""
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        else:
            return False

    def get_actor():
        resp = requests.get(f"{webui_url}/logical/actor_groups")
        resp.raise_for_status()
        actor_groups_resp = resp.json()
        assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
        actor_groups = actor_groups_resp["data"]["actorGroups"]
        actor = actor_groups["Actor"]["entries"][0]
        return actor

    def kill_actor_using_dashboard(actor):
        resp = requests.get(
            webui_url + "/logical/kill_actor",
            params={
                "actorId": actor["actorId"],
                "ipAddress": actor["ipAddress"],
                "port": actor["port"],
            },
        )
        resp.raise_for_status()
        resp_json = resp.json()
        assert resp_json["result"] is True, "msg" in resp_json

    start = time.time()
    last_exc = None
    while time.time() - start <= 10:
        try:
            actor = get_actor()
            kill_actor_using_dashboard(actor)
            last_exc = None
            break
        except (KeyError, AssertionError) as e:
            last_exc = e
            time.sleep(0.1)
    assert last_exc is None

def test_profiling(shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    c = Actor.remote()
    actor_pid = ray.get(c.getpid.remote())

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    start_time = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        if time.time() - start_time > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}")
        launch_profiling = requests.get(
            webui_url + "/api/launch_profiling",
            params={
                "ip": ray.nodes()[0]["NodeManagerAddress"],
                "pid": actor_pid,
                "duration": 5
            }).json()
        if launch_profiling["result"]:
            profiling_info = launch_profiling["data"]["profilingInfo"]
            break
        time.sleep(1)
    logger.info(profiling_info)

def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    address = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    entrypoint_cmd = ('python -c"'
                      "import ray;"
                      "ray.init();"
                      "import time;"
                      "time.sleep(5);"
                      "import sys;"
                      "sys.exit(1);"
                      '"')
    client = JobSubmissionClient(address)
    job_id = client.submit_job(entrypoint=entrypoint_cmd)

    def wait_for_job_to_fail():
        data = _get_snapshot(address)
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert job_entry["statusMessage"] is not None
                return job_entry["status"] == "FAILED"
        return False

    wait_for_condition(wait_for_job_to_fail, timeout=30)

def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    entrypoint_cmd = ('python -c"'
                      "import ray;"
                      "ray.init();"
                      "import time;"
                      f"time.sleep({job_sleep_time_s});"
                      "import sys;"
                      "sys.exit(1);"
                      '"')
    start_time_s = int(time.time())
    client = JobSubmissionClient(address)
    runtime_env = {"env_vars": {"RAY_TEST_456": "456"}}
    metadata = {"ray_test_789": "789"}
    job_id = client.submit_job(
        entrypoint=entrypoint_cmd, metadata=metadata, runtime_env=runtime_env)

    def wait_for_job_to_fail():
        data = _get_snapshot(address)

        legacy_job_failed = False
        job_failed = False

        # Test legacy job snapshot (one driver per job).
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert job_entry["statusMessage"] is not None
                legacy_job_failed = job_entry["status"] == "FAILED"

        # Test new jobs snapshot (0 to N drivers per job).
        for job_submission_id, entry in data["data"]["snapshot"][
                "jobSubmission"].items():
            if entry["status"] is not None:
                assert entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert entry["message"] is not None
                # TODO(architkulkarni): Disable automatic camelcase.
                assert entry["runtimeEnv"] == {"envVars": {"RAYTest456": "456"}}
                assert entry["metadata"] == {"rayTest789": "789"}
                assert entry["errorType"] is None
                assert abs(entry["startTime"] - start_time_s) <= 2
                if entry["status"] == "FAILED":
                    job_failed = True
                    assert (entry["endTime"] >=
                            entry["startTime"] + job_sleep_time_s)

        return legacy_job_failed and job_failed

    wait_for_condition(wait_for_job_to_fail, timeout=10)

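# `_get_snapshot` is used by the two job-status tests above but defined
# outside this excerpt. A minimal sketch, assuming it fetches /api/snapshot
# and validates the payload against the snapshot schema, mirroring what
# test_snapshot does inline; the real helper may differ.
def _get_snapshot(address: str):
    response = requests.get(f"{address}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/snapshot_schema.json")
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))
    return data
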
def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only):
    address_info = ray.init(num_cpus=1, include_dashboard=True)
    assert wait_until_server_available(address_info["webui_url"]) is True

    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            # Bypass the proxy for the request issued by the test itself.
            response = requests.get(
                webui_url + "/test/dump",
                proxies={"http": None, "https": None})
            response.raise_for_status()
            try:
                response.json()
                assert response.ok
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")

def test_log_proxy(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            # Test range request.
            response = requests.get(
                f"{webui_url}/log_proxy?url={webui_url}/logs/dashboard.log",
                headers={"Range": "bytes=44-52"})
            response.raise_for_status()
            assert response.text == "Dashboard"
            # Test 404.
            response = requests.get(f"{webui_url}/log_proxy?"
                                    f"url={webui_url}/logs/not_exist_file.log")
            assert response.status_code == 404
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__)
                            if last_ex else [])
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")

def test_memory_table(disable_aiohttp_cache, ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    my_obj = ray.put([1, 2, 3] * 100)  # noqa
    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    results = ray.get([actor.get_obj.remote() for actor in actors])  # noqa
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    resp = requests.get(
        webui_url + "/memory/set_fetch", params={"shouldFetch": "true"})
    resp.raise_for_status()

    def check_mem_table():
        resp = requests.get(f"{webui_url}/memory/memory_table")
        resp_data = resp.json()
        assert resp_data["result"]
        latest_memory_table = resp_data["data"]["memoryTable"]
        summary = latest_memory_table["summary"]
        # 1 ref per handle and per object the actor has a ref to.
        assert summary["totalActorHandles"] == len(actors) * 2
        # 1 ref for my_obj, plus 1 ref for self.obj_ref in each of the
        # 2 actors.
        assert summary["totalLocalRefCount"] == 3

    assert wait_until_succeeded_without_exception(
        check_mem_table, (AssertionError,), timeout_ms=10000)

def test_event_message_limit(small_event_line_limit, disable_aiohttp_cache,
                             ray_start_with_dashboard):
    event_read_line_length_limit = small_event_line_limit
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    session_dir = ray_start_with_dashboard["session_dir"]
    event_dir = os.path.join(session_dir, "logs", "events")
    job_id = ray.JobID.from_int(100).hex()

    events = []
    # Sample events whose serialized length equals the line-length limit.
    sample_event = _get_event("", job_id=job_id)
    message_len = event_read_line_length_limit - len(json.dumps(sample_event))
    for i in range(10):
        sample_event = copy.deepcopy(sample_event)
        sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
        sample_event["message"] = str(i) * message_len
        assert len(json.dumps(sample_event)) == event_read_line_length_limit
        events.append(sample_event)
    # One sample event longer than the limit; the reader should drop it.
    sample_event = copy.deepcopy(sample_event)
    sample_event["event_id"] = binary_to_hex(np.random.bytes(18))
    sample_event["message"] = "2" * (message_len + 1)
    assert len(json.dumps(sample_event)) > event_read_line_length_limit
    events.append(sample_event)

    for i in range(event_consts.EVENT_READ_LINE_COUNT_LIMIT):
        events.append(_get_event(str(i), job_id=job_id))

    with open(os.path.join(event_dir, "tmp.log"), "w") as f:
        f.writelines([(json.dumps(e) + "\n") for e in events])
    try:
        os.remove(os.path.join(event_dir, "event_GCS.log"))
    except Exception:
        pass
    os.rename(
        os.path.join(event_dir, "tmp.log"),
        os.path.join(event_dir, "event_GCS.log"))

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            result = resp.json()
            all_events = result["data"]["events"]
            assert (len(all_events[job_id]) >=
                    event_consts.EVENT_READ_LINE_COUNT_LIMIT + 10)
            messages = [e["message"] for e in all_events[job_id]]
            for i in range(10):
                assert str(i) * message_len in messages
            assert "2" * (message_len + 1) not in messages
            assert str(event_consts.EVENT_READ_LINE_COUNT_LIMIT - 1) \
                in messages
            return True
        except Exception as ex:
            logger.exception(ex)
            return False

    wait_for_condition(_check_events, timeout=15)

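# `_get_event` builds one event dict in the on-disk event-log format; it is
# defined outside this excerpt. The sketch below is an assumption-laden
# reconstruction: the field set is inferred from the assertions above
# (event_id, message, a source type, and a job_id that /events groups by);
# the real helper may carry more or differently named fields.
def _get_event(msg="", job_id=None, source_type=None):
    return {
        "event_id": binary_to_hex(np.random.bytes(18)),
        "source_type": source_type
        or event_pb2.Event.SourceType.Name(event_pb2.Event.GCS),
        "host_name": "localhost",
        "pid": os.getpid(),
        "label": "",
        "message": msg,
        "timestamp": time.time(),
        "severity": "INFO",
        "custom_fields": {
            "job_id": job_id,
            "node_id": "",
            "task_id": "",
        },
    }
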
def test_snapshot(ray_start_with_dashboard):
    driver_template = """
import ray

ray.init(address="{address}", namespace="my_namespace")

@ray.remote
class Pinger:
    def ping(self):
        return "pong"

a = Pinger.options(lifetime={lifetime}, name={name}).remote()
ray.get(a.ping.remote())
"""
    detached_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="'detached'",
        name="'abc'")
    named_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="'xyz'")
    unnamed_driver = driver_template.format(
        address=ray_start_with_dashboard["redis_address"],
        lifetime="None",
        name="None")

    run_string_as_driver(detached_driver)
    run_string_as_driver(named_driver)
    run_string_as_driver(unnamed_driver)

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))

    assert len(data["data"]["snapshot"]["actors"]) == 3
    assert len(data["data"]["snapshot"]["jobs"]) == 4
    assert len(data["data"]["snapshot"]["deployments"]) == 0

    for actor_id, entry in data["data"]["snapshot"]["actors"].items():
        assert entry["jobId"] in data["data"]["snapshot"]["jobs"]
        assert entry["actorClass"] == "Pinger"
        assert entry["startTime"] >= 0
        if entry["isDetached"]:
            assert entry["endTime"] == 0, entry
        else:
            assert entry["endTime"] > 0, entry
        assert "runtimeEnv" in entry
    assert data["data"]["snapshot"]["rayCommit"] == ray.__commit__
    assert data["data"]["snapshot"]["rayVersion"] == ray.__version__

def test_logs_max_count(enable_test_module, disable_aiohttp_cache,
                        ray_start_cluster_head):
    """Test that each Ray worker cannot cache more than 1000 logs at a time."""
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    ray.get(la.go.remote(MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert type(node_logs["data"]["logs"]) is dict
        assert la_pid in node_logs["data"]["logs"]
        log_lengths = len(node_logs["data"]["logs"][la_pid])
        assert log_lengths >= MAX_LOGS_TO_CACHE
        assert log_lengths <= MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD

        actor_one_logs_response = requests.get(
            f"{webui_url}/node_logs",
            params={"ip": node_ip, "pid": str(la_pid)})
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert type(actor_one_logs["data"]["logs"]) is dict
        log_lengths = len(actor_one_logs["data"]["logs"][la_pid])
        assert log_lengths >= MAX_LOGS_TO_CACHE
        assert log_lengths <= MAX_LOGS_TO_CACHE * LOG_PRUNE_THREASHOLD

    assert wait_until_succeeded_without_exception(
        check_logs, (AssertionError,), timeout_ms=10000)

def test_actors(disable_aiohttp_cache, ray_start_with_dashboard):
    @ray.remote
    class Foo:
        def __init__(self, num):
            self.num = num

        def do_task(self):
            return self.num

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    foo_actors = [Foo.remote(4), Foo.remote(5)]
    infeasible_actor = InfeasibleActor.remote()  # noqa
    results = [actor.do_task.remote() for actor in foo_actors]  # noqa
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            resp = requests.get(f"{webui_url}/logical/actors")
            resp_json = resp.json()
            resp_data = resp_json["data"]
            actors = resp_data["actors"]
            assert len(actors) == 3
            one_entry = list(actors.values())[0]
            assert "jobId" in one_entry
            assert "taskSpec" in one_entry
            assert "functionDescriptor" in one_entry["taskSpec"]
            assert type(one_entry["taskSpec"]["functionDescriptor"]) is dict
            assert "address" in one_entry
            assert type(one_entry["address"]) is dict
            assert "state" in one_entry
            assert "name" in one_entry
            assert "numRestarts" in one_entry
            assert "pid" in one_entry
            all_pids = {entry["pid"] for entry in actors.values()}
            assert 0 in all_pids  # The infeasible actor
            assert len(all_pids) > 1
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__)
                            if last_ex else [])
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")

def test_get_all_node_details(disable_aiohttp_cache, ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])

    @ray.remote
    class ActorWithObjs:
        def __init__(self):
            print("I also log a line")
            self.obj_ref = ray.put([1, 2, 3])

        def get_obj(self):
            return ray.get(self.obj_ref)

    actors = [ActorWithObjs.remote() for _ in range(2)]  # noqa
    timeout_seconds = 20
    start_time = time.time()
    last_ex = None

    def check_node_details():
        resp = requests.get(f"{webui_url}/nodes?view=details")
        resp_json = resp.json()
        resp_data = resp_json["data"]
        clients = resp_data["clients"]
        node = clients[0]
        assert len(clients) == 1
        assert len(node.get("actors")) == 2
        # Workers information should be in the detailed payload.
        assert "workers" in node
        assert "logCount" in node
        # Two lines printed by ActorWithObjs.
        assert node["logCount"] >= 2
        print(node["workers"])
        assert len(node["workers"]) == 2
        assert node["workers"][0]["logCount"] == 1

    while True:
        time.sleep(1)
        try:
            check_node_details()
            break
        except (AssertionError, KeyError, IndexError) as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__)
                            if last_ex else [])
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")

def test_logs(enable_test_module, disable_aiohttp_cache,
              ray_start_cluster_head):
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la2 = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    la2_pid = str(ray.get(la2.get_pid.remote()))
    ray.get(la.go.remote(4))
    ray.get(la2.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert type(node_logs["data"]["logs"]) is dict
        assert all(pid in node_logs["data"]["logs"]
                   for pid in (la_pid, la2_pid))
        assert len(node_logs["data"]["logs"][la2_pid]) == 1

        actor_one_logs_response = requests.get(
            f"{webui_url}/node_logs",
            params={"ip": node_ip, "pid": str(la_pid)})
        actor_one_logs_response.raise_for_status()
        actor_one_logs = actor_one_logs_response.json()
        assert actor_one_logs["result"]
        assert type(actor_one_logs["data"]["logs"]) is dict
        assert len(actor_one_logs["data"]["logs"][la_pid]) == 4

    assert wait_until_succeeded_without_exception(
        check_logs, (AssertionError,), timeout_ms=1000)

def test_logs_clean_up(enable_test_module, disable_aiohttp_cache,
                       ray_start_cluster_head):
    """Check if logs from the dead pids are GC'ed."""
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class LoggingActor:
        def go(self, n):
            i = 0
            while i < n:
                print(f"On number {i}")
                i += 1

        def get_pid(self):
            return os.getpid()

    la = LoggingActor.remote()
    la_pid = str(ray.get(la.get_pid.remote()))
    ray.get(la.go.remote(1))

    def check_logs():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert la_pid in node_logs["data"]["logs"]

    assert wait_until_succeeded_without_exception(
        check_logs, (AssertionError,), timeout_ms=1000)
    ray.kill(la)

    def check_logs_not_exist():
        node_logs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_logs_response.raise_for_status()
        node_logs = node_logs_response.json()
        assert node_logs["result"]
        assert la_pid not in node_logs["data"]["logs"]

    assert wait_until_succeeded_without_exception(
        check_logs_not_exist, (AssertionError,), timeout_ms=10000)

def test_logs_experimental_list(ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    # Test that list logs is comprehensive.
    response = requests.get(webui_url + "/api/experimental/logs/list")
    response.raise_for_status()
    logs = json.loads(response.text)
    assert len(logs) == 1
    node_id = next(iter(logs))

    # Test worker logs.
    outs = logs[node_id]["worker_outs"]
    errs = logs[node_id]["worker_errors"]
    core_worker_logs = logs[node_id]["python_core_worker_logs"]
    assert len(outs) == len(errs) == len(core_worker_logs)
    assert len(outs) > 0

    for file in ["debug_state_gcs.txt", "gcs_server.out", "gcs_server.err"]:
        assert file in logs[node_id]["gcs_server"]
    for file in ["raylet.out", "raylet.err"]:
        assert file in logs[node_id]["raylet"]
    for file in ["dashboard_agent.log", "dashboard.log"]:
        assert file in logs[node_id]["dashboard"]

    # Test that logs/list can be filtered.
    response = requests.get(webui_url + "/api/experimental/logs/list?filters=gcs")
    response.raise_for_status()
    logs = json.loads(response.text)
    assert len(logs) == 1
    node_id = next(iter(logs))
    assert "gcs_server" in logs[node_id]
    for category in logs[node_id]:
        if category != "gcs_server":
            assert len(logs[node_id][category]) == 0

    response = requests.get(webui_url + "/api/experimental/logs/list?filters=worker")
    response.raise_for_status()
    logs = json.loads(response.text)
    assert len(logs) == 1
    node_id = next(iter(logs))
    worker_log_categories = [
        "python_core_worker_logs",
        "worker_outs",
        "worker_errors",
    ]
    assert all([cat in logs[node_id] for cat in worker_log_categories])
    for category in logs[node_id]:
        if category not in worker_log_categories:
            assert len(logs[node_id][category]) == 0

def test_event_basic(disable_aiohttp_cache, ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"])
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    session_dir = ray_start_with_dashboard["session_dir"]
    event_dir = os.path.join(session_dir, "logs", "events")
    job_id = ray.JobID.from_int(100).hex()
    source_type_gcs = event_pb2.Event.SourceType.Name(event_pb2.Event.GCS)
    source_type_raylet = event_pb2.Event.SourceType.Name(
        event_pb2.Event.RAYLET)
    test_count = 20

    for source_type in [source_type_gcs, source_type_raylet]:
        test_log_file = os.path.join(event_dir, f"event_{source_type}.log")
        test_logger = _test_logger(
            __name__ + str(random.random()),
            test_log_file,
            max_bytes=2000,
            backup_count=1000)
        for i in range(test_count):
            sample_event = _get_event(
                str(i), job_id=job_id, source_type=source_type)
            test_logger.info("%s", json.dumps(sample_event))

    def _check_events():
        try:
            resp = requests.get(f"{webui_url}/events")
            resp.raise_for_status()
            result = resp.json()
            all_events = result["data"]["events"]
            job_events = all_events[job_id]
            assert len(job_events) >= test_count * 2
            source_messages = {}
            for e in job_events:
                source_type = e["sourceType"]
                message = e["message"]
                source_messages.setdefault(source_type, set()).add(message)
            assert len(source_messages[source_type_gcs]) >= test_count
            assert len(source_messages[source_type_raylet]) >= test_count
            data = {str(i) for i in range(test_count)}
            assert data & source_messages[source_type_gcs] == data
            assert data & source_messages[source_type_raylet] == data
            return True
        except Exception as ex:
            logger.exception(ex)
            return False

    wait_for_condition(_check_events, timeout=15)

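# `_test_logger` is also defined outside this excerpt. A plausible sketch,
# assuming it returns an isolated logger that writes raw messages to a
# rotating file, so each line stays parseable as JSON by the event module
# (a formatter that prefixed level or time would break that):
import logging
import logging.handlers


def _test_logger(name, log_file, max_bytes, backup_count):
    handler = logging.handlers.RotatingFileHandler(
        log_file, maxBytes=max_bytes, backupCount=backup_count)
    handler.setFormatter(logging.Formatter("%(message)s"))
    test_logger = logging.getLogger(name)
    test_logger.propagate = False  # keep test events out of the root logger
    test_logger.setLevel(logging.INFO)
    test_logger.addHandler(handler)
    return test_logger
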
def test_get_cluster_status(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        print(response.json())
        assert response.json()["result"]
        assert "autoscalingStatus" in response.json()["data"]
        assert "autoscalingError" in response.json()["data"]
        assert response.json()["data"]["autoscalingError"] is None
        assert "clusterStatus" in response.json()["data"]
        assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException,))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    address = address_info["redis_address"]
    address = address.split(":")
    assert len(address) == 2
    client = redis.StrictRedis(
        host=address[0],
        port=int(address[1]),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_client = ray._private.gcs_utils.GcsClient.create_from_redis(client)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    ray.experimental.internal_kv._internal_kv_put(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "hello")
    ray.experimental.internal_kv._internal_kv_put(DEBUG_AUTOSCALING_ERROR,
                                                  "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    assert response.json()["result"]
    assert "autoscalingStatus" in response.json()["data"]
    assert response.json()["data"]["autoscalingStatus"] == "hello"
    assert "autoscalingError" in response.json()["data"]
    assert response.json()["data"]["autoscalingError"] == "world"
    assert "clusterStatus" in response.json()["data"]
    assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

def test_errors(enable_test_module, disable_aiohttp_cache,
                ray_start_cluster_head):
    cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = cluster.webui_url
    webui_url = format_web_url(webui_url)
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_ip = nodes[0]["NodeManagerAddress"]

    @ray.remote
    class ErrorActor:
        def go(self):
            raise ValueError("This is an error")

        def get_pid(self):
            return os.getpid()

    ea = ErrorActor.remote()
    ea_pid = str(ray.get(ea.get_pid.remote()))
    ea.go.remote()

    def check_errs():
        node_errs_response = requests.get(
            f"{webui_url}/node_logs", params={"ip": node_ip})
        node_errs_response.raise_for_status()
        node_errs = node_errs_response.json()
        assert node_errs["result"]
        assert "errors" in node_errs["data"]
        assert type(node_errs["data"]["errors"]) is dict
        assert ea_pid in node_errs["data"]["errors"]
        assert len(node_errs["data"]["errors"][ea_pid]) == 1

        actor_err_response = requests.get(
            f"{webui_url}/node_logs",
            params={"ip": node_ip, "pid": str(ea_pid)})
        actor_err_response.raise_for_status()
        actor_errs = actor_err_response.json()
        assert actor_errs["result"]
        assert type(actor_errs["data"]["errors"]) is dict
        assert len(actor_errs["data"]["errors"][ea_pid]) == 4

    wait_until_succeeded_without_exception(check_errs, (AssertionError,),
                                           timeout_ms=1000)

def test_nodes_update(enable_test_module, ray_start_with_dashboard):
    assert wait_until_server_available(
        ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 10
    start_time = time.time()
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/test/dump")
            response.raise_for_status()
            try:
                dump_info = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert dump_info["result"] is True
            dump_data = dump_info["data"]
            assert len(dump_data["nodes"]) == 1
            assert len(dump_data["agents"]) == 1
            assert len(dump_data["nodeIdToIp"]) == 1
            assert len(dump_data["nodeIdToHostname"]) == 1
            assert dump_data["nodes"].keys() == dump_data[
                "nodeIdToHostname"].keys()

            response = requests.get(webui_url + "/test/notified_agents")
            response.raise_for_status()
            try:
                notified_agents = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert notified_agents["result"] is True
            notified_agents = notified_agents["data"]
            assert len(notified_agents) == 1
            assert notified_agents == dump_data["agents"]
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")

def test_http_get(enable_test_module, ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    target_url = webui_url + "/test/dump"

    timeout_seconds = 30
    start_time = time.time()
    while True:
        time.sleep(3)
        try:
            response = requests.get(webui_url + "/test/http_get?url=" +
                                    target_url)
            response.raise_for_status()
            try:
                dump_info = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert dump_info["result"] is True
            dump_data = dump_info["data"]
            assert len(dump_data["agents"]) == 1
            node_id, ports = next(iter(dump_data["agents"].items()))
            ip = ray_start_with_dashboard["node_ip_address"]
            http_port, grpc_port = ports

            response = requests.get(
                f"http://{ip}:{http_port}"
                f"/test/http_get_from_agent?url={target_url}")
            response.raise_for_status()
            try:
                dump_info = response.json()
            except Exception as ex:
                logger.info("failed response: %s", response.text)
                raise ex
            assert dump_info["result"] is True
            break
        except (AssertionError, requests.exceptions.ConnectionError) as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")

def test_log_proxy(ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    test_log_text = "test_log_text"
    test_file = "test.log"
    with open(
            f"{ray.worker.global_worker.node.get_logs_dir_path()}/{test_file}",
            "w") as f:
        f.write(test_log_text)
    while True:
        time.sleep(1)
        try:
            # Test range request.
            response = requests.get(
                f"{webui_url}/log_proxy?url={webui_url}/logs/{test_file}",
                headers={"Range": "bytes=2-5"})
            response.raise_for_status()
            assert response.text == test_log_text[2:6]
            # Test 404.
            response = requests.get(f"{webui_url}/log_proxy?"
                                    f"url={webui_url}/logs/not_exist_file.log")
            assert response.status_code == 404
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__)
                            if last_ex else [])
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")

def test_active_component_activities(ray_start_with_dashboard):
    # Verify that drivers whose namespace does not start with
    # "_ray_internal_job_info_" are considered active.
    driver_template = """
import ray

ray.init(address="auto", namespace="{namespace}")
"""
    run_string_as_driver_nonblocking(
        driver_template.format(namespace="my_namespace"))
    run_string_as_driver_nonblocking(
        driver_template.format(namespace="my_namespace"))
    run_string_as_driver_nonblocking(
        driver_template.format(namespace="_ray_internal_job_info_id1"))
    # Wait 1 sec for drivers to start.
    time.sleep(1)

    # Verify drivers are considered active after running the scripts.
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/component_activities")
    response.raise_for_status()

    # Validate the schema of the response.
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/component_activities_schema.json")
    pprint.pprint(data)
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))

    # Validate that the ray_activity_response field can be cast to a
    # RayActivityResponse object.
    driver_ray_activity_response = RayActivityResponse(**data["driver"])
    assert driver_ray_activity_response.is_active == "ACTIVE"
    # Drivers with a namespace starting with "_ray_internal_job_info_" are not
    # considered active drivers. The three active drivers are the two run with
    # namespace "my_namespace" and the one started by ray_start_with_dashboard.
    assert driver_ray_activity_response.reason == "Number of active drivers: 3"

def test_temporary_uri_reference(monkeypatch, expiration_s):
    """Test that temporary GCS URI references are deleted after expiration_s."""
    monkeypatch.setenv("RAY_RUNTIME_ENV_TEMPORARY_REFERENCE_EXPIRATION_S",
                       str(expiration_s))
    # We can't use a fixture with a shared Ray runtime because we need to set
    # the expiration_s env var before Ray starts.
    with _ray_start(include_dashboard=True, num_cpus=1) as ctx:
        headers = {
            "Connection": "keep-alive",
            "Authorization": "TOK:<MY_TOKEN>"
        }
        address = ctx.address_info["webui_url"]
        assert wait_until_server_available(address)
        client = JobSubmissionClient(format_web_url(address), headers=headers)

        with tempfile.TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir)
            hello_file = path / "hi.txt"
            with hello_file.open(mode="w") as f:
                f.write("hi\n")

            start = time.time()
            client.submit_job(
                entrypoint="echo hi", runtime_env={"working_dir": tmp_dir})

            # Give time for deletion to occur if expiration_s is 0.
            time.sleep(2)
            # Need to connect to Ray to check internal_kv.
            # ray.init(address="auto")

            print("Starting Internal KV checks at time ", time.time() - start)
            if expiration_s > 0:
                assert not check_internal_kv_gced()
                wait_for_condition(
                    check_internal_kv_gced, timeout=2 * expiration_s)
                assert expiration_s < time.time() - start < 2 * expiration_s
                print("Internal KV was GC'ed at time ", time.time() - start)
            else:
                wait_for_condition(check_internal_kv_gced)
                print("Internal KV was GC'ed at time ", time.time() - start)

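# `check_internal_kv_gced` comes from the runtime_env test utilities and is
# not defined in this excerpt. A minimal sketch, assuming "GC'ed" means no
# `gcs://` package keys remain in the internal KV store:
def check_internal_kv_gced():
    import ray.experimental.internal_kv as internal_kv
    return len(internal_kv._internal_kv_list("gcs://")) == 0
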
def test_multi_node_churn(enable_test_module, disable_aiohttp_cache,
                          ray_start_cluster_head):
    cluster: Cluster = ray_start_cluster_head
    assert wait_until_server_available(cluster.webui_url) is True
    webui_url = format_web_url(cluster.webui_url)

    def cluster_chaos_monkey():
        worker_nodes = []
        while True:
            time.sleep(5)
            if len(worker_nodes) < 2:
                worker_nodes.append(cluster.add_node())
                continue
            should_add_node = random.randint(0, 1)
            if should_add_node:
                worker_nodes.append(cluster.add_node())
            else:
                node_index = random.randrange(0, len(worker_nodes))
                node_to_remove = worker_nodes.pop(node_index)
                cluster.remove_node(node_to_remove)

    def get_index():
        resp = requests.get(webui_url)
        resp.raise_for_status()

    def get_nodes():
        resp = requests.get(webui_url + "/nodes?view=summary")
        resp.raise_for_status()
        summary = resp.json()
        assert summary["result"] is True, summary["msg"]
        assert summary["data"]["summary"]

    t = threading.Thread(target=cluster_chaos_monkey, daemon=True)
    t.start()

    t_st = datetime.now()
    duration = timedelta(seconds=60)
    while datetime.now() < t_st + duration:
        get_index()
        time.sleep(2)

def test_node_physical_stats(enable_test_module, shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = ray.get([actor.getpid.remote() for actor in actors])
    actor_pids = set(actor_pids)

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    def _check_workers():
        try:
            resp = requests.get(webui_url +
                                "/test/dump?key=node_physical_stats")
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True
            node_physical_stats = result["data"]["nodePhysicalStats"]
            assert len(node_physical_stats) == 1
            current_stats = node_physical_stats[addresses["node_id"]]
            # Check actor workers.
            current_actor_pids = set()
            for worker in current_stats["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    current_actor_pids.add(worker["pid"])
            assert current_actor_pids == actor_pids
            # Check raylet cmdline.
            assert "raylet" in current_stats["cmdline"][0]
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_workers, timeout=10)

def test_kill_actor_gcs(ray_start_with_dashboard):
    # Start the dashboard.
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url)
    webui_url = format_web_url(webui_url)

    @ray.remote
    class Actor:
        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

        def loop(self):
            while True:
                time.sleep(1)
                print("Looping...")

    # Create an actor.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa
    actor_id = a._ray_actor_id.hex()

    # Kill the actor.
    _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False)
    assert _actor_killed_loop(worker_pid)

    # Create an actor and have it loop.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())  # noqa
    actor_id = a._ray_actor_id.hex()
    a.loop.remote()

    # Try to kill the actor; it should not die since a task is running.
    _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False)
    assert not _actor_killed_loop(worker_pid, timeout_secs=1)

    # Force kill the actor.
    _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=True)
    assert _actor_killed_loop(worker_pid)

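# Both helpers used above are defined elsewhere; hedged sketches follow.
# `_actor_killed_loop` polls with signal 0 (an OSError means the process is
# gone). `_kill_actor_using_dashboard_gcs` is assumed to hit a GCS-backed
# kill endpoint; the route and parameter names below are guesses, not the
# confirmed API.
def _actor_killed_loop(pid, timeout_secs=5):
    start = time.time()
    while time.time() - start <= timeout_secs:
        try:
            os.kill(pid, 0)  # raises OSError once the process is dead
        except OSError:
            return True
        time.sleep(0.1)
    return False


def _kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False):
    resp = requests.get(
        webui_url + "/logical/kill_actor_gcs",  # assumed route
        params={"actorId": actor_id, "forceKill": int(force_kill)},
    )
    resp.raise_for_status()
    assert resp.json()["result"] is True
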
def test_get_cluster_status(ray_start_with_dashboard):
    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    address_info = ray_start_with_dashboard
    webui_url = address_info["webui_url"]
    webui_url = format_web_url(webui_url)

    # Check that the cluster_status endpoint works without the underlying data
    # from the GCS, but returns nothing.
    def get_cluster_status():
        response = requests.get(f"{webui_url}/api/cluster_status")
        response.raise_for_status()
        print(response.json())
        assert response.json()["result"]
        assert "autoscalingStatus" in response.json()["data"]
        assert "autoscalingError" in response.json()["data"]
        assert response.json()["data"]["autoscalingError"] is None
        assert "clusterStatus" in response.json()["data"]
        assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

    wait_until_succeeded_without_exception(get_cluster_status,
                                           (requests.RequestException,))

    # Populate the GCS field, check that the data is returned from the
    # endpoint.
    gcs_client = make_gcs_client(address_info)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    ray.experimental.internal_kv._internal_kv_put(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "hello")
    ray.experimental.internal_kv._internal_kv_put(DEBUG_AUTOSCALING_ERROR,
                                                  "world")

    response = requests.get(f"{webui_url}/api/cluster_status")
    response.raise_for_status()
    assert response.json()["result"]
    assert "autoscalingStatus" in response.json()["data"]
    assert response.json()["data"]["autoscalingStatus"] == "hello"
    assert "autoscalingError" in response.json()["data"]
    assert response.json()["data"]["autoscalingError"] == "world"
    assert "clusterStatus" in response.json()["data"]
    assert "loadMetricsReport" in response.json()["data"]["clusterStatus"]

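# `make_gcs_client` is defined elsewhere in the suite. A sketch under the
# assumptions that the address info dict exposes "gcs_address" and that
# GcsClient accepts an address string (this constructor varies across Ray
# versions):
def make_gcs_client(address_info):
    address = address_info["gcs_address"]
    return ray._private.gcs_utils.GcsClient(address=address)
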
def test_nil_node(enable_test_module, disable_aiohttp_cache,
                  ray_start_with_dashboard):
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    @ray.remote(num_gpus=1)
    class InfeasibleActor:
        pass

    infeasible_actor = InfeasibleActor.remote()  # noqa

    timeout_seconds = 5
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            resp = requests.get(f"{webui_url}/logical/actors")
            resp_json = resp.json()
            resp_data = resp_json["data"]
            actors = resp_data["actors"]
            assert len(actors) == 1
            response = requests.get(webui_url + "/test/dump?key=node_actors")
            response.raise_for_status()
            result = response.json()
            assert actor_consts.NIL_NODE_ID not in result["data"]["nodeActors"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = (traceback.format_exception(
                    type(last_ex), last_ex, last_ex.__traceback__)
                            if last_ex else [])
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")