def nodes(ctx, format: str):
    """CLI entry point: fetch the cluster's nodes and print them in *format*."""
    api_server_url = ctx.obj["api_server_url"]
    output_format = AvailableFormat(format)
    node_data = list_nodes(
        api_server_url=api_server_url,
        _explain=_should_explain(output_format),
    )
    print(get_state_api_output_to_print(node_data, format=output_format))
def test_list_nodes(shutdown_only):
    """Verify list_nodes reports the started node as ALIVE with a valid id."""
    ray.init()

    def verify():
        # Inspect the single node entry returned by the state API.
        node_info = next(iter(list_nodes().values()))
        alive = node_info["state"] == "ALIVE"
        hex_id = is_hex(node_info["node_id"])
        # The state API id must agree with ray.nodes()' view of the node.
        id_matches = ray.nodes()[0]["NodeID"] == node_info["node_id"]
        return alive and hex_id and id_matches

    wait_for_condition(verify)
    print(list_nodes())
def nodes(ctx, format: str, filter: List[Tuple[str, str]]):
    """CLI entry point: print nodes, optionally narrowed by key/value filters."""
    api_server_url = ctx.obj["api_server_url"]
    output_format = AvailableFormat(format)
    node_data = list_nodes(
        api_server_url=api_server_url,
        filters=filter,
        _explain=_should_explain(output_format),
    )
    print(get_state_api_output_to_print(node_data, format=output_format))
def verify():
    """Check list_logs returns every category, and that glob filtering narrows it."""
    head_node = list_nodes()[0]
    # Without a glob filter, every known log category should be present.
    all_logs = list_logs(node_id=head_node["node_id"])
    for category in (
        "raylet",
        "gcs_server",
        "dashboard",
        "agent",
        "internal",
        "driver",
        "autoscaler",
    ):
        assert category in all_logs
    # A glob filter should narrow the listing to the single matching category.
    filtered = list_logs(node_id=head_node["node_id"], glob_filter="raylet*")
    assert len(filtered) == 1
    return True
def test_log_get(ray_start_cluster):
    """End-to-end checks of get_log by filename, by node ip + pid, and by actor id."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=0)
    ray.init(address=cluster.address)
    head_node = list_nodes()[0]
    cluster.add_node(num_cpus=1)

    @ray.remote(num_cpus=1)
    class Actor:
        def print(self, i):
            for _ in range(i):
                print("1")

        def getpid(self):
            import os

            return os.getpid()

    # --- Filename match ---
    def verify():
        # By default, node id should be configured to the head node.
        for log in get_log(
            node_id=head_node["node_id"], filename="raylet.out", tail=10
        ):
            # 10 tailed lines plus the trailing empty line after the final newline.
            assert len(log.split("\n")) == 11
        return True

    wait_for_condition(verify)

    # --- Worker pid / IP match ---
    a = Actor.remote()
    pid = ray.get(a.getpid.remote())
    ray.get(a.print.remote(20))

    def verify():
        for log in get_log(node_ip=head_node["node_ip"], pid=pid, tail=10):
            # 10 tailed lines plus the trailing empty line.
            assert len(log.split("\n")) == 11
        return True

    wait_for_condition(verify)

    # --- Actor logs ---
    actor_id = a._actor_id.hex()

    def verify():
        for log in get_log(actor_id=actor_id, tail=10):
            # 10 tailed lines plus the trailing empty line.
            assert len(log.split("\n")) == 11
        return True

    wait_for_condition(verify)

    # Fetching logs by task id is not supported yet.
    with pytest.raises(NotImplementedError):
        for _ in get_log(task_id=123, tail=10):
            pass
def test_logs_stream_and_tail(ray_start_with_dashboard):
    """Exercise the dashboard /logs/file and /logs/stream HTTP endpoints."""
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = format_web_url(ray_start_with_dashboard["webui_url"])
    node_id = list_nodes()[0]["node_id"]

    def verify_basic():
        stream_response = requests.get(
            webui_url
            + f"/api/v0/logs/file?node_id={node_id}&filename=gcs_server.out&lines=5",
            stream=True,
        )
        if stream_response.status_code != 200:
            raise ValueError(stream_response.content.decode("utf-8"))
        lines = [raw.decode("utf-8") for raw in stream_response.iter_lines()]
        # 5 requested lines, possibly plus one trailing empty line.
        return len(lines) in (5, 6)

    wait_for_condition(verify_basic)

    @ray.remote
    class Actor:
        def write_log(self, strings):
            for s in strings:
                print(s)

        def getpid(self):
            return os.getpid()

    test_log_text = "test_log_text_日志_{}"
    actor = Actor.remote()
    ray.get(actor.write_log.remote([test_log_text.format("XXXXXX")]))

    # Test streaming and fetching by actor id.
    stream_response = requests.get(
        webui_url
        + "/api/v0/logs/stream?&lines=2"
        + f"&actor_id={actor._ray_actor_id.hex()}",
        stream=True,
    )
    if stream_response.status_code != 200:
        raise ValueError(stream_response.content.decode("utf-8"))
    stream_iterator = stream_response.iter_content(chunk_size=None)
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert (
        next(stream_iterator).decode("utf-8")
        == "1:actor_name:Actor\n" + test_log_text.format("XXXXXX") + "\n"
    )

    streamed_string = ""
    for i in range(5):
        strings = [test_log_text.format(f"{100*i + j:06d}") for j in range(100)]
        ray.get(actor.write_log.remote(strings))
        string = "".join(s + "\n" for s in strings)
        streamed_string += string
        # NOTE: Prefix 1 indicates the stream has succeeded.
        assert next(stream_iterator).decode("utf-8") == "1" + string
    del stream_response

    # Test tailing log by actor id.
    LINES = 150
    file_response = requests.get(
        webui_url
        + f"/api/v0/logs/file?&lines={LINES}"
        + "&actor_id="
        + actor._ray_actor_id.hex(),
    ).content.decode("utf-8")
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert file_response == "1" + "\n".join(
        streamed_string.split("\n")[-(LINES + 1) :]
    )

    # Test query by pid & node_ip instead of actor id.
    node_ip = list(ray.nodes())[0]["NodeManagerAddress"]
    pid = ray.get(actor.getpid.remote())
    file_response = requests.get(
        webui_url
        + f"/api/v0/logs/file?node_ip={node_ip}&lines={LINES}"
        + f"&pid={pid}",
    ).content.decode("utf-8")
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert file_response == "1" + "\n".join(
        streamed_string.split("\n")[-(LINES + 1) :]
    )
def test_logs_list(ray_start_with_dashboard):
    """Check the dashboard /api/v0/logs listing, with and without glob filters.

    Fix: the final worker-logs assertion previously compared
    ``len(logs["worker_out"])`` against itself (``worker_out == worker_err ==
    worker_out``), making the third comparison a no-op.  It now compares
    out/err/core_worker counts, matching the equivalent check in ``verify()``.
    """
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = list_nodes()[0]["node_id"]

    def verify():
        # Unfiltered listing should contain every expected log category.
        response = requests.get(webui_url + f"/api/v0/logs?node_id={node_id}")
        response.raise_for_status()
        result = json.loads(response.text)
        assert result["result"]
        logs = result["data"]["result"]
        # Test worker logs: one out/err/core_worker entry per worker.
        outs = logs["worker_out"]
        errs = logs["worker_err"]
        core_worker_logs = logs["core_worker"]
        assert len(outs) == len(errs) == len(core_worker_logs)
        assert len(outs) > 0
        # Test gcs / raylet / dashboard
        for file in ["gcs_server.out", "gcs_server.err"]:
            assert file in logs["gcs_server"]
        for file in ["raylet.out", "raylet.err"]:
            assert file in logs["raylet"]
        for file in ["dashboard.log"]:
            assert file in logs["dashboard"]
        for file in ["dashboard_agent.log"]:
            assert file in logs["agent"]
        return True

    wait_for_condition(verify)

    def verify_filter():
        # Test that logs/list can be filtered by a glob pattern.
        response = requests.get(
            webui_url + f"/api/v0/logs?node_id={node_id}&glob=*gcs*"
        )
        response.raise_for_status()
        result = json.loads(response.text)
        assert result["result"]
        logs = result["data"]["result"]
        assert "gcs_server" in logs
        assert "internal" in logs
        assert len(logs) == 2
        assert "gcs_server.out" in logs["gcs_server"]
        assert "gcs_server.err" in logs["gcs_server"]
        assert "debug_state_gcs.txt" in logs["internal"]
        return True

    wait_for_condition(verify_filter)

    def verify_worker_logs():
        # Glob for worker files and cross-check counts against list_workers().
        response = requests.get(
            webui_url + f"/api/v0/logs?node_id={node_id}&glob=*worker*"
        )
        response.raise_for_status()
        result = json.loads(response.text)
        assert result["result"]
        logs = result["data"]["result"]
        worker_log_categories = [
            "core_worker",
            "worker_out",
            "worker_err",
        ]
        assert all([cat in logs for cat in worker_log_categories])
        num_workers = len(
            list(filter(lambda w: w["worker_type"] == "WORKER", list_workers()))
        )
        # BUG FIX: third term was logs["worker_out"] again, so the core_worker
        # count was never actually compared.
        assert (
            len(logs["worker_out"])
            == len(logs["worker_err"])
            == len(logs["core_worker"])
        )
        assert num_workers == len(logs["worker_out"])
        return True

    wait_for_condition(verify_worker_logs)
def nodes(ctx):
    """CLI entry point: pretty-print the raw node data from the API server."""
    api_server_url = ctx.obj["api_server_url"]
    pprint(list_nodes(api_server_url))
def verify():
    """Return True iff the single listed node is ALIVE with a valid, matching id."""
    node_info = next(iter(list_nodes().values()))
    # Evaluate each condition eagerly, then combine.
    alive = node_info["state"] == "ALIVE"
    hex_id = is_hex(node_info["node_id"])
    id_matches = ray.nodes()[0]["NodeID"] == node_info["node_id"]
    return alive and hex_id and id_matches
def nodes(ctx, format: str):
    """CLI entry point: print nodes from the configured API server in *format*."""
    api_server_url = ctx.obj["api_server_url"]
    output = get_state_api_output_to_print(
        list_nodes(api_server_url=api_server_url),
        format=AvailableFormat(format),
    )
    print(output)