def node_stats(node_manager_address=None,
               node_manager_port=None,
               include_memory_info=True):
    """Returns NodeStats object describing memory usage in the cluster."""
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    assert (node_manager_address is not None
            and node_manager_port is not None)
    raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    node_stats = stub.GetNodeStats(
        node_manager_pb2.GetNodeStatsRequest(
            include_memory_info=include_memory_info),
        timeout=30.0)
    return node_stats

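# Usage sketch for node_stats(), hedged: assumes a running Ray cluster,
# that this module defines MAX_MESSAGE_LENGTH, and that the first entry of
# ray.nodes() is a live raylet whose address/port we can pass in.
import ray

ray.init(address="auto")
head = ray.nodes()[0]
stats = node_stats(
    node_manager_address=head["NodeManagerAddress"],
    node_manager_port=head["NodeManagerPort"])
print(stats)
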
def memory_summary(node_manager_address=None,
                   node_manager_port=None,
                   stats_only=False):
    """Returns a formatted string describing memory usage in the cluster."""
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info; that Raylet
    # internally asks all nodes in the cluster for memory stats.
    if node_manager_address is None or node_manager_port is None:
        raylet = ray.nodes()[0]
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
    if stats_only:
        return store_stats_summary(reply)
    return reply.memory_summary + "\n" + store_stats_summary(reply)

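# Usage sketch for memory_summary(), hedged: assumes a connected driver
# and that store_stats_summary() and MAX_MESSAGE_LENGTH exist in this
# module. With no address/port it falls back to the first node returned
# by ray.nodes().
import ray

ray.init(address="auto")
print(memory_summary(stats_only=True))
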
def kill_raylet(ip, port, graceful=False):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    print(f"Sending a shutdown request to {ip}:{port}")
    stub.ShutdownRaylet(
        node_manager_pb2.ShutdownRayletRequest(graceful=graceful))

def get_store_stats(state, node_manager_address=None, node_manager_port=None):
    """Returns a formatted string describing memory usage in the cluster."""
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info; that Raylet
    # internally asks all nodes in the cluster for memory stats.
    if node_manager_address is None or node_manager_port is None:
        # We should ask for a raylet that is alive.
        raylet = None
        for node in state.node_table():
            if node["Alive"]:
                raylet = node
                break
        assert raylet is not None, "Every raylet is dead"
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)

    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(
            include_memory_info=False),
        timeout=30.0)
    return store_stats_summary(reply)

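# Usage sketch for get_store_stats(), hedged: assumes a Ray version where
# ray.state.state is the connected GlobalState backing ray.nodes(); if it
# is not, pass whichever state object exposes node_table() instead.
import ray

ray.init(address="auto")
print(get_store_stats(ray.state.state))
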
def memory_summary(node_manager_address=None,
                   node_manager_port=None,
                   stats_only=False):
    """Returns a formatted string describing memory usage in the cluster."""
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info; that Raylet
    # internally asks all nodes in the cluster for memory stats.
    if node_manager_address is None or node_manager_port is None:
        raylet = ray.nodes()[0]
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)

    store_summary = "--- Aggregate object store stats across all nodes ---\n"
    store_summary += (
        "Plasma memory usage {} MiB, {} objects, {}% full\n".format(
            int(reply.store_stats.object_store_bytes_used / (1024 * 1024)),
            reply.store_stats.num_local_objects,
            round(
                100 * reply.store_stats.object_store_bytes_used /
                reply.store_stats.object_store_bytes_avail, 2)))
    if reply.store_stats.spill_time_total_s > 0:
        store_summary += (
            "Spilled {} MiB, {} objects, avg write throughput {} MiB/s\n"
            .format(
                int(reply.store_stats.spilled_bytes_total / (1024 * 1024)),
                reply.store_stats.spilled_objects_total,
                int(reply.store_stats.spilled_bytes_total / (1024 * 1024) /
                    reply.store_stats.spill_time_total_s)))
    if reply.store_stats.restore_time_total_s > 0:
        store_summary += (
            "Restored {} MiB, {} objects, avg read throughput {} MiB/s\n"
            .format(
                int(reply.store_stats.restored_bytes_total / (1024 * 1024)),
                reply.store_stats.restored_objects_total,
                int(reply.store_stats.restored_bytes_total / (1024 * 1024) /
                    reply.store_stats.restore_time_total_s)))
    if reply.store_stats.consumed_bytes > 0:
        store_summary += ("Objects consumed by Ray tasks: {} MiB.".format(
            int(reply.store_stats.consumed_bytes / (1024 * 1024))))

    if stats_only:
        return store_summary
    return reply.memory_summary + "\n" + store_summary

def _kill_raylet(self, ip, port, graceful=False):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    try:
        stub.ShutdownRaylet(
            node_manager_pb2.ShutdownRayletRequest(graceful=graceful))
    except _InactiveRpcError:
        assert not graceful

def update_nodes(self):
    with self.nodes_lock:
        self.nodes = ray.nodes()
        self.stubs = []
        for node in self.nodes:
            channel = grpc.insecure_channel("{}:{}".format(
                node["NodeManagerAddress"], node["NodeManagerPort"]))
            stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
            self.stubs.append(stub)

def kill_raylet(ip, port, graceful=True):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    print(f"Sending a shutdown request to {ip}:{port}")
    try:
        stub.ShutdownRaylet(
            node_manager_pb2.ShutdownRayletRequest(graceful=graceful))
    except _InactiveRpcError:
        # A non-graceful shutdown can kill the raylet before the RPC
        # completes, so a dropped channel is only acceptable in that case.
        assert not graceful

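# Usage sketch for kill_raylet(), hedged: assumes grpc, node_manager_pb2,
# node_manager_pb2_grpc, and _InactiveRpcError are imported at module
# level as in the surrounding snippets, and 62365 is a hypothetical
# stand-in for a real NodeManagerPort taken from ray.nodes().
kill_raylet("127.0.0.1", 62365, graceful=True)
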
async def _update_stubs(self, change):
    if change.old:
        ip, port = change.old
        self._stubs.pop(ip)
    if change.new:
        ip, node_info = change.new
        address = "{}:{}".format(ip, int(node_info["nodeManagerPort"]))
        channel = aiogrpc.insecure_channel(address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        self._stubs[ip] = stub

def get_num_workers():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return len([
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).workers_stats
        if not worker.is_driver
    ])

def get_workers():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return [
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).core_workers_stats
        if worker.worker_type != common_pb2.DRIVER
    ]

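# Usage sketch for get_workers(), hedged: assumes a connected driver and
# module-level imports of grpc, common_pb2, node_manager_pb2, and
# node_manager_pb2_grpc, as in the surrounding snippets.
import ray

ray.init(address="auto")
print("non-driver workers:", len(get_workers()))
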
async def _update_stubs(self, change):
    if change.old:
        node_id, node_info = change.old
        self._stubs.pop(node_id)
    if change.new:
        node_id, node_info = change.new
        address = "{}:{}".format(node_info["nodeManagerAddress"],
                                 int(node_info["nodeManagerPort"]))
        options = (("grpc.enable_http_proxy", 0), )
        channel = aiogrpc.insecure_channel(address, options=options)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        self._stubs[node_id] = stub

async def _update_stubs(self, change):
    if change.old:
        node_id, node_info = change.old
        self._stubs.pop(node_id)
    if change.new:
        # TODO(fyrestone): Handle exceptions.
        node_id, node_info = change.new
        address = "{}:{}".format(node_info["nodeManagerAddress"],
                                 int(node_info["nodeManagerPort"]))
        options = (("grpc.enable_http_proxy", 0), )
        channel = ray._private.utils.init_grpc_channel(
            address, options, asynchronous=True)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        self._stubs[node_id] = stub

def _get_raylet_stub():
    global _STUB
    if _STUB is None:
        import grpc
        from ray.core.generated import node_manager_pb2_grpc

        raylet_address = _get_raylet_address()
        channel = grpc.insecure_channel(
            raylet_address,
            options=[
                ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
                ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
            ],
        )
        _STUB = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return _STUB

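# Usage sketch for _get_raylet_stub(), hedged: assumes this module defines
# _STUB = None, _get_raylet_address(), and MAX_MESSAGE_LENGTH. Repeated
# calls reuse the cached stub instead of opening a new channel.
from ray.core.generated import node_manager_pb2

stub = _get_raylet_stub()
reply = stub.GetNodeStats(
    node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
    timeout=5.0)
print(reply)
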
def test_initial_workers(shutdown_only):
    # `num_cpus` should be <= 2 because a Travis CI machine only has 2 CPU
    # cores.
    ray.init(
        num_cpus=1,
        include_dashboard=True,
        _internal_config=json.dumps({"enable_multi_tenancy": True}))
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    wait_for_condition(
        lambda: len([
            worker for worker in stub.GetNodeStats(
                node_manager_pb2.GetNodeStatsRequest()).workers_stats
            if not worker.is_driver
        ]) == 1,
        timeout=10)

def memory_summary():
    """Returns a formatted string describing memory usage in the cluster."""
    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
    return reply.memory_summary

def test_worker_stats(ray_start_regular):
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        print(reply)
        assert len(reply.workers_stats) >= num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "redis_latency" in views
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl): what is with travis/mi when running in Travis?
            assert ("python" in process or "ray" in process
                    or "travis/mi" in process)
        break

def update_nodes(self):
    with self.nodes_lock:
        self.nodes = ray.nodes()
        node_ids = [node["NodeID"] for node in self.nodes]

        # First remove node connections of disconnected nodes. Iterate
        # over a copy of the keys, since we pop entries from the dict.
        # Dropping the stub also drops the only reference to its channel.
        for node_id in list(self.stubs.keys()):
            if node_id not in node_ids:
                self.stubs.pop(node_id)

        # Now add node connections of new nodes.
        for node in self.nodes:
            node_id = node["NodeID"]
            if node_id not in self.stubs:
                channel = grpc.insecure_channel("{}:{}".format(
                    node["NodeManagerAddress"], node["NodeManagerPort"]))
                stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
                self.stubs[node_id] = stub

def stat(address):
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))
        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest(), timeout=2.0)
        print(reply)

def stat(address):
    """Get the current metrics protobuf from a Ray cluster (developer tool)."""
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))
        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
            timeout=2.0)
        print(reply)

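# Usage sketch for stat(), hedged: intended as a developer helper run
# against a live cluster; an empty address makes it discover the cluster
# through services.find_redis_address_or_die() before querying every
# raylet in turn.
stat("")
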
def _update_nodes(self):
    with self.nodes_lock:
        self.nodes = ray.nodes()
        node_ids = [node["NodeID"] for node in self.nodes]

        # First remove node connections of disconnected nodes. Iterate
        # over a copy of the keys, since we pop entries from the dict.
        # Dropping the stubs also drops the references to their channels.
        for node_id in list(self.stubs.keys()):
            if node_id not in node_ids:
                self.stubs.pop(node_id)
                self.reporter_stubs.pop(node_id)

        # Now add node connections of new nodes.
        for node in self.nodes:
            node_id = node["NodeID"]
            if node_id not in self.stubs:
                node_ip = node["NodeManagerAddress"]
                channel = grpc.insecure_channel("{}:{}".format(
                    node_ip, node["NodeManagerPort"]))
                stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
                self.stubs[node_id] = stub
                # Block wait until the reporter for the node starts.
                while True:
                    reporter_port = self.redis_client.get(
                        "REPORTER_PORT:{}".format(node_ip))
                    if reporter_port:
                        break
                reporter_channel = grpc.insecure_channel("{}:{}".format(
                    node_ip, int(reporter_port)))
                reporter_stub = reporter_pb2_grpc.ReporterServiceStub(
                    reporter_channel)
                self.reporter_stubs[node_id] = reporter_stub

        assert len(self.stubs) == len(self.reporter_stubs), (
            self.stubs.keys(), self.reporter_stubs.keys())

def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_webui=False)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.NodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor(object):
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "redis_latency" in views
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "ray" in process
                    or "travis" in process)
        break

def _kill_raylet(self, ip, port, graceful=False):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    stub.ShutdownRaylet(
        node_manager_pb2.ShutdownRayletRequest(graceful=graceful))

def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_dashboard=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = init_grpc_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [
        worker for worker in reply.core_workers_stats
        if worker.worker_type == common_pb2.DRIVER
    ]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.worker.show_in_dashboard("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    # Test show_in_dashboard for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert stats.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_dashboard for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    if _WIN32:
        timeout_seconds = 40
    else:
        timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.core_workers_stats) < num_cpus + 2:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.core_workers_stats) == num_cpus + 2
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.core_workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "mini" in process
                    or "conda" in process or "travis" in process
                    or "runner" in process or "pytest" in process
                    or "ray" in process), process
        break

def test_worker_stats(shutdown_only):
    addresses = ray.init(num_cpus=1, include_webui=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "ray" in process
                    or "travis" in process)
        break

    # Test kill_actor.
    def actor_killed(PID):
        """Check for the existence of a unix pid."""
        try:
            os.kill(PID, 0)
        except OSError:
            return True
        else:
            return False

    assert wait_until_server_available(addresses["webui_url"]) is True

    webui_url = addresses["webui_url"]
    webui_url = webui_url.replace("localhost", "http://127.0.0.1")
    for worker in reply.workers_stats:
        if worker.is_driver:
            continue
        requests.get(
            webui_url + "/api/kill_actor",
            params={
                "actor_id": ray.utils.binary_to_hex(
                    worker.core_worker_stats.actor_id),
                "ip_address": worker.core_worker_stats.ip_address,
                "port": worker.core_worker_stats.port
            })
    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException("Timed out while killing actors")
        if all(
                actor_killed(worker.pid) for worker in reply.workers_stats
                if not worker.is_driver):
            break