def current_node_id(self) -> "NodeID":
    """Build a ``NodeID`` from the runtime context's ``node_id``.

    Returns:
        A ``ray.NodeID`` constructed from the ``node_id`` attribute of the
        object returned by ``self._fetch_runtime_context()``.
    """
    # Resolved at call time, so ``ray`` is only imported when this runs.
    from ray import NodeID

    runtime_context = self._fetch_runtime_context()
    raw_node_id = runtime_context.node_id
    return NodeID(raw_node_id)
def test_gcs_drain(ray_start_cluster_head, error_pubsub):
    """Drain worker nodes through the GCS and verify the resulting state.

    The test adds ``NUM_NODES`` worker nodes, sends one batched
    ``DrainNode`` request that covers every worker, and then checks that:

    * no ``REMOVED_NODE_ERROR`` message is delivered to the driver,
    * re-sending the same request repeatedly does not produce one either,
    * ``ray.nodes()`` reports the head as alive and every worker as dead,
    * a zero-CPU actor can still be created and called afterwards.

    Args:
        ray_start_cluster_head: Used as the cluster handle; ``add_node`` and
            ``gcs_address`` are accessed on it.
        error_pubsub: Passed to ``get_error_message`` to poll for errors
            published to the driver.
    """

    def assert_no_removed_node_error():
        # Error shouldn't be printed to the driver.
        errors = get_error_message(
            error_pubsub, 1, ray_constants.REMOVED_NODE_ERROR, timeout=5)
        assert len(errors) == 0

    # ---- Prepare the cluster. ----
    cluster = ray_start_cluster_head
    # The first node reported at this point is treated as the head node.
    head_node_id = ray.nodes()[0]["NodeID"]
    NUM_NODES = 2
    for _ in range(NUM_NODES):
        cluster.add_node(num_cpus=1)
    worker_node_ids = [
        n["NodeID"] for n in ray.nodes() if n["NodeID"] != head_node_id
    ]

    # ---- Warm up the cluster. ----
    @ray.remote(num_cpus=1)
    class A:
        def ready(self):
            pass

    actors = [A.remote() for _ in range(NUM_NODES)]
    ray.get([actor.ready.remote() for actor in actors])

    # ---- Test batch drain. ----
    gcs_server_addr = cluster.gcs_address
    options = (("grpc.enable_http_proxy", 0), )
    # Use the channel as a context manager so it is closed even when an
    # assertion below fails; the original never closed it.
    with grpc.insecure_channel(gcs_server_addr, options) as channel:
        stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(channel)

        # Prepare one request that drains every worker node at once.
        r = gcs_service_pb2.DrainNodeRequest()
        for worker_id in worker_node_ids:
            data = r.drain_node_data.add()
            data.node_id = NodeID.from_hex(worker_id).binary()
        stub.DrainNode(r)

        assert_no_removed_node_error()

        # There should be only a head node since we drained worker nodes.
        # NOTE: In the current implementation we kill nodes when draining
        # them. This check should be removed once we implement
        # the proper drain behavior.
        # Deliberately best-effort: a failure here is only reported, it
        # does not fail the test.
        try:
            wait_for_condition(lambda: len(search_raylet(cluster)) == 1)
        except Exception:
            print("More than one raylets are detected.")
            print(search_raylet(cluster))

        # ---- Make sure the API is idempotent. ----
        for _ in range(10):
            stub.DrainNode(r)
        # Checked once, after all repeated requests have been sent.
        assert_no_removed_node_error()

    # ---- Make sure the GCS states are updated properly. ----
    for n in ray.nodes():
        node_id = n["NodeID"]
        is_alive = n["Alive"]
        if node_id == head_node_id:
            assert is_alive
        if node_id in worker_node_ids:
            assert not is_alive

    # ---- Make sure head node is not dead and functional. ----
    # Workers are drained, so request zero CPUs for this actor.
    a = A.options(num_cpus=0).remote()
    ray.get(a.ready.remote())