Esempio n. 1
0
    def current_node_id(self) -> "NodeID":
        """Return the node ID from the fetched runtime context as a ``NodeID``.

        Returns:
            A ``ray.NodeID`` built from the ``node_id`` attribute of the
            object returned by ``self._fetch_runtime_context()``.
        """
        # Imported inside the method rather than at module level.
        from ray import NodeID

        runtime_context = self._fetch_runtime_context()
        raw_node_id = runtime_context.node_id
        return NodeID(raw_node_id)
Esempio n. 2
0
def test_gcs_drain(ray_start_cluster_head, error_pubsub):
    """Drain every worker node through the GCS and check the aftermath.

    The test sends one batched ``DrainNode`` request covering all worker
    nodes and asserts that no ``REMOVED_NODE_ERROR`` message is received via
    ``error_pubsub``, that sending the same request ten more times still
    yields no such message, that ``ray.nodes()`` reports the workers as not
    alive and the head as alive, and that an actor can still be started
    afterwards.

    Args:
        ray_start_cluster_head: Fixture providing the cluster object (used
            here for ``add_node`` and ``gcs_address``).
        error_pubsub: Fixture passed to ``get_error_message`` to poll for
            error messages.
    """
    # Prepare the cluster.
    cluster = ray_start_cluster_head
    # Read before any worker is added, so the first entry is the head node.
    head_node_id = ray.nodes()[0]["NodeID"]
    NUM_NODES = 2
    for _ in range(NUM_NODES):
        cluster.add_node(num_cpus=1)
    worker_node_ids = [
        n["NodeID"] for n in ray.nodes() if n["NodeID"] != head_node_id
    ]

    # Warm up the cluster.
    @ray.remote(num_cpus=1)
    class A:
        def ready(self):
            pass

    actors = [A.remote() for _ in range(NUM_NODES)]
    ray.get([actor.ready.remote() for actor in actors])

    # Test batch drain.
    # Prepare requests.
    gcs_server_addr = cluster.gcs_address
    options = (("grpc.enable_http_proxy", 0), )
    # Scope the channel with a context manager so it is closed even when an
    # assertion below fails; previously it was never closed.
    with grpc.insecure_channel(gcs_server_addr, options) as channel:
        stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(channel)
        r = gcs_service_pb2.DrainNodeRequest()
        for worker_id in worker_node_ids:
            data = r.drain_node_data.add()
            data.node_id = NodeID.from_hex(worker_id).binary()
        stub.DrainNode(r)

        p = error_pubsub
        # Error shouldn't be printed to the driver.
        errors = get_error_message(p,
                                   1,
                                   ray_constants.REMOVED_NODE_ERROR,
                                   timeout=5)
        assert len(errors) == 0
        # There should be only a head node since we drained worker nodes.
        # NOTE: In the current implementation we kill nodes when draining
        # them. This check should be removed once we implement
        # the proper drain behavior.
        try:
            wait_for_condition(lambda: len(search_raylet(cluster)) == 1)
        except Exception:
            # Deliberately best-effort: report the leftover raylets instead
            # of failing the test here.
            print("More than one raylets are detected.")
            print(search_raylet(cluster))

        # Make sure the API is idempotent.
        for _ in range(10):
            stub.DrainNode(r)

    # Error shouldn't be printed to the driver.
    errors = get_error_message(p,
                               1,
                               ray_constants.REMOVED_NODE_ERROR,
                               timeout=5)
    assert len(errors) == 0

    # Make sure the GCS states are updated properly.
    for n in ray.nodes():
        node_id = n["NodeID"]
        is_alive = n["Alive"]
        if node_id == head_node_id:
            assert is_alive
        if node_id in worker_node_ids:
            assert not is_alive

    # Make sure head node is not dead and functional.
    # num_cpus=0 overrides the class-level num_cpus=1 requirement.
    a = A.options(num_cpus=0).remote()
    ray.get(a.ready.remote())