def test_middleware():
    """Check that Starlette middlewares passed to ``serve.start`` are applied.

    Starts Serve on a port obtained from ``new_port()`` with a
    ``CORSMiddleware`` that allows every origin and method, then verifies
    that both an ``OPTIONS`` request to the root URL and a ``GET`` request
    to ``/-/routes`` carry ``access-control-allow-origin: *``.
    """
    from starlette.middleware import Middleware
    from starlette.middleware.cors import CORSMiddleware

    port = new_port()
    try:
        serve.start(
            http_port=port,
            http_middlewares=[
                Middleware(
                    CORSMiddleware, allow_origins=["*"], allow_methods=["*"])
            ])
        ray.get(
            block_until_http_ready.remote(
                f"http://127.0.0.1:{port}/-/routes"))

        # Snatched several test cases from Starlette
        # https://github.com/encode/starlette/blob/master/tests/
        # middleware/test_cors.py
        headers = {
            "Origin": "https://example.org",
            "Access-Control-Request-Method": "GET",
        }
        root = f"http://localhost:{port}"

        resp = requests.options(root, headers=headers)
        assert resp.headers["access-control-allow-origin"] == "*"

        resp = requests.get(f"{root}/-/routes", headers=headers)
        assert resp.headers["access-control-allow-origin"] == "*"
    finally:
        # Runs even when a request or assertion above fails, so a failing
        # test does not leave Ray running for whatever executes next.
        ray.shutdown()
def test_serve_metrics(serve_instance):
    """Check that the expected Serve metric names are published.

    Creates a batched backend behind a ``/metrics`` endpoint, runs ten
    concurrent ``block_until_http_ready`` tasks against that route, and
    then polls ``http://127.0.0.1:9999`` until every expected metric name
    appears in the response text.
    """
    client = serve_instance

    @serve.accept_batch
    def batcher(starlette_requests):
        return ["hello"] * len(starlette_requests)

    client.create_backend("metrics", batcher)
    client.create_endpoint("metrics", backend="metrics", route="/metrics")

    # send 10 concurrent requests
    url = "http://127.0.0.1:8000/metrics"
    ray.get([block_until_http_ready.remote(url) for _ in range(10)])

    def verify_metrics(do_assert=False):
        """Return True when every expected metric name is in the response.

        Returns False when the request raises ``ConnectionError``. With
        ``do_assert=False`` a missing metric yields False; with
        ``do_assert=True`` it fails an ``assert`` instead, so the failure
        names the missing metric.
        """
        try:
            resp = requests.get("http://127.0.0.1:9999").text
        # Requests will fail if we are crashing the controller
        except requests.ConnectionError:
            return False

        expected_metrics = [
            # counter
            "num_router_requests_total",
            "num_http_requests_total",
            "backend_queued_queries_total",
            "backend_request_counter_requests_total",
            "backend_worker_starts_restarts_total",
            # histogram
            "backend_processing_latency_ms_bucket",
            "backend_processing_latency_ms_count",
            "backend_processing_latency_ms_sum",
            "backend_queuing_latency_ms_bucket",
            "backend_queuing_latency_ms_count",
            "backend_queuing_latency_ms_sum",
            # gauge
            "replica_processing_queries",
            "replica_queued_queries",
            # handle
            "serve_handle_request_counter",
            # ReplicaSet
            "backend_queued_queries",
        ]
        for metric in expected_metrics:
            # For the final error round
            if do_assert:
                assert metric in resp
            # For the wait_for_condition
            elif metric not in resp:
                return False
        return True

    try:
        wait_for_condition(verify_metrics, retry_interval_ms=500)
    except RuntimeError:
        # Polling gave up. Run one last round in asserting mode so the test
        # actually fails and reports which metric is missing. Calling
        # verify_metrics() with its default here would silently return
        # False and let the test pass.
        verify_metrics(do_assert=True)
def test_serve_metrics_for_successful_connection(serve_instance):
    """Verify the metric names published after a deployment serves traffic.

    Deploys a trivial ``metrics`` deployment, drives it through ten
    ``block_until_http_ready`` tasks and ten handle calls, and then polls
    ``http://127.0.0.1:9999`` until all expected metric names are present
    in the response text.
    """

    @serve.deployment(name="metrics")
    async def f(request):
        return "hello"

    f.deploy()

    # send 10 concurrent requests
    target = "http://127.0.0.1:8000/metrics"
    handle = f.get_handle()

    http_refs = [block_until_http_ready.remote(target) for _ in range(10)]
    ray.get(http_refs)
    handle_refs = [handle.remote(target) for _ in range(10)]
    ray.get(handle_refs)

    def verify_metrics(do_assert=False):
        """Report whether every expected metric name is in the response.

        A ``ConnectionError`` yields False. A missing metric yields False
        in polling mode and a failed ``assert`` when ``do_assert`` is set.
        """
        try:
            body = requests.get("http://127.0.0.1:9999").text
        except requests.ConnectionError:
            # Requests will fail if we are crashing the controller
            return False

        expected_metrics = [
            # counter
            "serve_num_router_requests",
            "serve_num_http_requests",
            "serve_deployment_queued_queries",
            "serve_deployment_request_counter",
            "serve_deployment_replica_starts",
            # histogram
            "deployment_processing_latency_ms_bucket",
            "deployment_processing_latency_ms_count",
            "deployment_processing_latency_ms_sum",
            "serve_deployment_processing_latency_ms",
            # gauge
            "serve_replica_processing_queries",
            # handle
            "serve_handle_request_counter",
        ]
        for name in expected_metrics:
            if do_assert:
                # Final error round: fail loudly on the missing metric.
                assert name in body
            elif name not in body:
                # Polling round for wait_for_condition: just report "not yet".
                return False
        return True

    try:
        wait_for_condition(verify_metrics, retry_interval_ms=500)
    except RuntimeError:
        verify_metrics(do_assert=True)
def test_multiple_routers():
    """Check that Serve proxy actors follow nodes joining and leaving.

    Builds a two-node cluster, starts Serve on port 8005 and expects one
    named proxy actor per node. It then kills one proxy, adds a third node
    and removes it again, checking after each step that the matching actor
    appears or disappears and that ``/-/routes`` still answers.
    """
    cluster = Cluster()
    try:
        head_node = cluster.add_node()
        cluster.add_node()

        ray.init(head_node.address)
        node_ids = ray.state.node_ids()
        assert len(node_ids) == 2
        client = serve.start(http_port=8005)

        def get_proxy_names():
            """Return the expected proxy actor name for every known node."""
            return [
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id)
                for node_id, _ in get_all_node_ids()
            ]

        wait_for_condition(lambda: len(get_proxy_names()) == 2)
        original_proxy_names = get_proxy_names()

        # Two actors should be started.
        def get_first_two_actors():
            try:
                ray.get_actor(original_proxy_names[0])
                ray.get_actor(original_proxy_names[1])
                return True
            except ValueError:
                return False

        wait_for_condition(get_first_two_actors)

        # Wait for the actors to come up.
        ray.get(
            block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

        # Kill one of the servers, the HTTP server should still function.
        ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
        ray.get(
            block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

        # Add a new node to the cluster. This should trigger a new router to
        # get started.
        new_node = cluster.add_node()

        wait_for_condition(lambda: len(get_proxy_names()) == 3)
        # Identify the new proxy by set difference rather than by position:
        # nothing here guarantees that get_all_node_ids() lists the newly
        # added node last.
        (third_proxy,) = set(get_proxy_names()) - set(original_proxy_names)

        def get_third_actor():
            try:
                ray.get_actor(third_proxy)
                return True
            # NOTE(review): IndexError is kept from the original handler.
            # Nothing is indexed inside this function, so ValueError is the
            # only failure expected here - confirm before narrowing.
            except (IndexError, ValueError):
                return False

        wait_for_condition(get_third_actor)

        # Remove the newly-added node from the cluster. The corresponding
        # actor should be removed as well.
        cluster.remove_node(new_node)

        def third_actor_removed():
            try:
                ray.get_actor(third_proxy)
                return False
            except ValueError:
                return True

        # Check that the actor is gone and the HTTP server still functions.
        wait_for_condition(third_actor_removed)
        ray.get(
            block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
    finally:
        # Clean up the nodes (otherwise Ray will segfault). Done in
        # ``finally`` so a failing check above cannot skip the cleanup.
        ray.shutdown()
        cluster.shutdown()
def test_multiple_routers(ray_cluster):
    """Exercise the per-node HTTP proxies while the cluster changes shape.

    With two 4-CPU nodes and Serve started on port 8005 with
    ``location="EveryNode"``, the test expects two named proxy actors. It
    kills one, adds a node, waits for that node's proxy, removes the node,
    and waits for the proxy to vanish, probing ``/-/routes`` along the way.
    """
    routes_url = "http://127.0.0.1:8005/-/routes"

    cluster = ray_cluster
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray._private.state.node_ids()
    assert len(node_ids) == 2
    serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        """Build the expected proxy actor name for each known node."""
        return [
            format_actor_name(
                SERVE_PROXY_NAME,
                serve.context._global_client._controller_name,
                node_id,
            )
            for node_id, _ in get_all_node_ids()
        ]

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    initial_proxies = get_proxy_names()

    # Both initial proxy actors must be resolvable by name.
    def first_two_proxies_exist():
        try:
            ray.get_actor(initial_proxies[0], namespace=SERVE_NAMESPACE)
            ray.get_actor(initial_proxies[1], namespace=SERVE_NAMESPACE)
        except ValueError:
            return False
        return True

    wait_for_condition(first_two_proxies_exist)

    # Block until the HTTP endpoint is answering.
    ray.get(block_until_http_ready.remote(routes_url))

    # Take one proxy down for good; HTTP must keep working regardless.
    victim = ray.get_actor(get_proxy_names()[0], namespace=SERVE_NAMESPACE)
    ray.kill(victim, no_restart=True)
    ray.get(block_until_http_ready.remote(routes_url))

    # Growing the cluster by one node should bring up one more router.
    extra_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    # Exactly one name must be new; the unpacking enforces that.
    (added_proxy,) = set(get_proxy_names()) - set(initial_proxies)

    def added_proxy_exists():
        try:
            ray.get_actor(added_proxy, namespace=SERVE_NAMESPACE)
        # IndexErrors covers when cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False
        return True

    wait_for_condition(added_proxy_exists)

    # Dropping the extra node again should take its proxy actor with it.
    cluster.remove_node(extra_node)

    def added_proxy_gone():
        try:
            ray.get_actor(added_proxy, namespace=SERVE_NAMESPACE)
        except ValueError:
            return True
        return False

    # The actor must disappear while the HTTP server keeps functioning.
    wait_for_condition(added_proxy_gone)
    ray.get(block_until_http_ready.remote(routes_url))
def test_multiple_routers():
    """Check that proxy actors are started and removed as nodes change.

    Builds a two-node cluster, initialises Serve on port 8005 and expects
    proxy actors with indices 0 and 1. It then kills actor 0, adds a node
    and waits for actor 2, removes that node and waits for actor 2 to
    disappear, checking ``/-/routes`` after each disruptive step.
    """
    cluster = Cluster()
    try:
        head_node = cluster.add_node()
        cluster.add_node()

        ray.init(head_node.address)
        node_ids = ray.state.node_ids()
        assert len(node_ids) == 2
        serve.init(http_port=8005)

        def actor_name(index):
            """Return the proxy actor name for ``index``.

            Every name is built from the first node id plus the index.
            """
            return f"{SERVE_PROXY_NAME}-{node_ids[0]}-{index}"

        # Two actors should be started.
        def get_first_two_actors():
            try:
                ray.get_actor(actor_name(0))
                ray.get_actor(actor_name(1))
                return True
            except ValueError:
                return False

        wait_for_condition(get_first_two_actors)

        # Wait for the actors to come up.
        ray.get(
            block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

        # Kill one of the servers, the HTTP server should still function.
        ray.kill(ray.get_actor(actor_name(0)), no_restart=True)
        ray.get(
            block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

        # Add a new node to the cluster. This should trigger a new router to
        # get started.
        new_node = cluster.add_node()

        def get_third_actor():
            try:
                ray.get_actor(actor_name(2))
                return True
            except ValueError:
                return False

        wait_for_condition(get_third_actor)

        # Remove the newly-added node from the cluster. The corresponding
        # actor should be removed as well.
        cluster.remove_node(new_node)

        def third_actor_removed():
            try:
                ray.get_actor(actor_name(2))
                return False
            except ValueError:
                return True

        # Check that the actor is gone and the HTTP server still functions.
        wait_for_condition(third_actor_removed)
        ray.get(
            block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
    finally:
        # Clean up the nodes (otherwise Ray will segfault). Done in
        # ``finally`` so a failing check above cannot skip the cleanup.
        ray.shutdown()
        cluster.shutdown()