def test_start_idempotent(serve_instance):
    """Repeated serve.start() calls must be no-ops that keep deployments."""

    @serve.deployment(name="start")
    def noop(*args):
        pass

    noop.deploy()
    assert "start" in serve.list_deployments()

    # Calling start() again — detached or not — must not disturb state.
    serve.start(detached=True)
    serve.start()
    serve.start(detached=True)
    serve.start()

    assert "start" in serve.list_deployments()
def run(self, logger: Union[Logger, _CliLogger] = logger):
    """Deploy every deployment in this Application and report status.

    Loops forever, logging deployment statuses every ten seconds, so it
    must be killed manually (e.g. with ctrl-C). On receiving the kill
    signal it tears down the deployments it deployed and, if none are
    left, shuts down the rest of Serve (controller included). Meant to
    help interactive development.

    Args:
        logger: Any Python object that implements the standard logger
            interface.
    """
    try:
        serve.start(detached=True)
        self.deploy()
        logger.info("\nDeployed successfully!\n")

        # Poll and print status until interrupted.
        while True:
            status_json = serve_application_status_to_schema(
                get_deployment_statuses()
            ).json(indent=4)
            logger.info(f"{status_json}")
            time.sleep(10)
    except KeyboardInterrupt:
        logger.info("Got SIGINT (KeyboardInterrupt). Removing deployments.")
        for owned_deployment in self._deployments.values():
            owned_deployment.delete()
        if len(serve.list_deployments()) == 0:
            logger.info("No deployments left. Shutting down Serve.")
            serve.shutdown()
        sys.exit()
def serve_instance(_shared_serve_instance):
    """Yield the shared Serve instance, then wipe state for test isolation."""
    yield _shared_serve_instance
    # Remove every deployment so later tests see no naming collisions.
    for existing in list(serve.list_deployments().values()):
        existing.delete()
    # Drop cached ServeHandles so they do not pile up across tests.
    _shared_serve_instance.handle_cache.clear()
def test_serve_namespace(ray_start_stop):
    """
    Check that the Dashboard's Serve can interact with the Python API
    when they both start in the "serve namespace"
    """
    deployment_config = {
        "name": "one",
        "num_replicas": 1,
        "route_prefix": "/one",
        "ray_actor_options": {
            "runtime_env": {"py_modules": [test_module_uri]}
        },
        "import_path": "test_module.test.one",
    }
    put_response = requests.put(
        GET_OR_PUT_URL, json={"deployments": [deployment_config]}, timeout=30
    )
    assert put_response.status_code == 200

    # Attach the Python API to the same namespace the dashboard used.
    ray.init(address="auto", namespace="serve")
    serve.start()

    deployments = serve.list_deployments()
    assert len(deployments) == 1
    assert "one" in deployments

    serve.shutdown()
async def get_all_deployments(self, req: Request) -> Response:
    """Return all Serve deployments serialized as a JSON application schema."""
    schema = serve_application_to_schema(
        deployments=list(serve.list_deployments().values())
    )
    return Response(
        text=schema.json(),
        content_type="application/json",
    )
def serve_instance(_shared_serve_instance):
    """Yield the shared Serve client, then reset global state between tests."""
    yield _shared_serve_instance
    # Bulk-delete every deployment so names cannot collide across tests.
    _shared_serve_instance.delete_deployments(serve.list_deployments().keys())
    # Empty the ServeHandle cache so handles do not accumulate.
    _shared_serve_instance.handle_cache.clear()
    # Reset the shared deployment-name generation state.
    DeploymentNameGenerator.reset()
def test_shutdown(ray_shutdown):
    """serve.shutdown() must kill the controller and proxy actors."""
    ray.init(num_cpus=16)
    serve.start(http_options=dict(port=8003))

    @serve.deployment
    def f():
        pass

    serve.run(f.bind())

    controller_name = serve.context._global_client._controller_name
    actor_names = [
        controller_name,
        format_actor_name(
            SERVE_PROXY_NAME,
            serve.context._global_client._controller_name,
            get_all_node_ids()[0][0],
        ),
    ]

    def _actor_exists(actor_name):
        # An actor is alive iff ray.get_actor() can resolve it by name.
        try:
            ray.get_actor(actor_name, namespace=SERVE_NAMESPACE)
            return True
        except ValueError:
            return False

    # Wait until both the controller and the proxy are up.
    wait_for_condition(lambda: all(map(_actor_exists, actor_names)))

    serve.shutdown()
    # API calls must fail once Serve has been shut down.
    with pytest.raises(RayServeException):
        serve.list_deployments()

    # Wait until both actors are gone.
    wait_for_condition(lambda: not any(map(_actor_exists, actor_names)))
def test_run_delete_old_deployments(serve_instance):
    """Check that serve.run() can remove all old deployments"""

    @serve.deployment(name="f", route_prefix="/test1")
    def f():
        return "got f"

    @serve.deployment(name="g", route_prefix="/test2")
    def g():
        return "got g"

    handle = serve.run(f.bind())
    assert ray.get(handle.remote()) == "got f"

    # Running a second app must tear down the first one's deployments.
    handle = serve.run(g.bind())
    assert ray.get(handle.remote()) == "got g"

    deployments = serve.list_deployments()
    assert "g" in deployments
    assert "f" not in deployments
def main(num_replicas: Optional[int], trial_length: Optional[str],
         max_batch_size: Optional[int]):
    """Run the single-deployment wrk benchmark and save aggregated metrics.

    Args:
        num_replicas: Target replica count; falls back to the smoke/full
            test default when None.
        trial_length: wrk trial duration string; defaulted the same way.
        max_batch_size: Forwarded to deploy_replicas().
    """
    # Give default cluster parameter values based on smoke_test config
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        num_replicas = num_replicas or DEFAULT_SMOKE_TEST_NUM_REPLICA
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(
            f"Running local / smoke test with {num_replicas} replicas ..\n")

        # Choose cluster setup based on user config. Local test uses Cluster()
        # to mock actors that requires # of nodes to be specified, but ray
        # client doesn't need to
        num_nodes = int(math.ceil(num_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes ..\n")
        serve_client = setup_local_single_node_cluster(num_nodes)[0]
    else:
        num_replicas = num_replicas or DEFAULT_FULL_TEST_NUM_REPLICA
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with {num_replicas} replicas ..\n")
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with {num_replicas} target replicas ....\n")
    deploy_replicas(num_replicas, max_batch_size)

    logger.info("Warming up cluster ....\n")
    # Fire-and-forget warmup task against the "echo" endpoint.
    warm_up_one_cluster.remote(10, http_host, http_port, "echo")

    logger.info(f"Starting wrk trial on all nodes for {trial_length} ....\n")
    # For detailed discussion, see https://github.com/wg/wrk/issues/205
    # TODO:(jiaodong) What's the best number to use here ?
    all_endpoints = list(serve.list_deployments().keys())
    all_metrics, all_wrk_stdout = run_wrk_on_all_nodes(
        trial_length,
        NUM_CONNECTIONS,
        http_host,
        http_port,
        all_endpoints=all_endpoints)

    aggregated_metrics = aggregate_all_metrics(all_metrics)
    logger.info("Wrk stdout on each node: ")
    for wrk_stdout in all_wrk_stdout:
        logger.info(wrk_stdout)
    logger.info("Final aggregated metrics: ")
    for key, val in aggregated_metrics.items():
        logger.info(f"{key}: {val}")
    save_test_results(
        aggregated_metrics,
        default_output_file="/tmp/single_deployment_1k_noop_replica.json")
def test_serve_shutdown(ray_shutdown):
    """After shutdown, a restarted Serve instance must begin empty."""
    serve.start(detached=True)

    @serve.deployment
    class A:
        def __call__(self, *args):
            return "hi"

    A.deploy()
    assert len(serve.list_deployments()) == 1

    # A fresh start after shutdown should see no deployments.
    serve.shutdown()
    serve.start(detached=True)
    assert len(serve.list_deployments()) == 0

    # Deploying again works on the fresh instance.
    A.deploy()
    assert len(serve.list_deployments()) == 1
async def get_all_deployments(self, req: Request) -> Response:
    """Return every Serve deployment as a JSON application schema.

    Args:
        req: Incoming HTTP request (unused).

    Returns:
        Response whose body is the application schema serialized as JSON.
    """
    deployments = list(serve.list_deployments().values())
    serve_application_schema = serve_application_to_schema(deployments=deployments)
    return Response(
        # .json() already returns a JSON string; wrapping it in
        # json.dumps() (as before) double-encoded the payload into a
        # quoted string literal that clients could not parse as an
        # object. The sibling GET handler returns .json() directly.
        text=serve_application_schema.json(),
        content_type="application/json",
    )
def get_random_async_handle(self):
    """Return a random async handle, lazily building the handle cache."""
    # sync get_handle() is expected to be called only a few times during
    # deployment warmup, so that every deployment ends up holding
    # references to all other handles for recursive inference calls.
    if len(self.all_deployment_async_handles) < len(all_deployment_names):
        self.all_deployment_async_handles = [
            d.get_handle(sync=False)
            for d in serve.list_deployments().values()
        ]
    return random.choice(self.all_deployment_async_handles)
def test_serve_shutdown(ray_shutdown):
    """Shutting down Serve must clear deployments; restart begins empty."""
    ray.init(namespace="serve")
    serve.start(detached=True)

    @serve.deployment
    class A:
        def __call__(self, *args):
            return "hi"

    serve.run(A.bind())
    assert len(serve.list_deployments()) == 1

    # Restarting after a shutdown should show no deployments.
    serve.shutdown()
    serve.start(detached=True)
    assert len(serve.list_deployments()) == 0

    # The fresh instance accepts the same app again.
    serve.run(A.bind())
    assert len(serve.list_deployments()) == 1
def test_connect(detached, ray_shutdown):
    """API calls must work from within a deployment, detached or not."""
    ray.init(num_cpus=16, namespace="serve")
    serve.start(detached=detached)

    @serve.deployment
    def connect_in_deployment(*args):
        # Deploy another deployment from inside this one.
        connect_in_deployment.options(name="deployment-ception").deploy()

    connect_in_deployment.deploy()
    ray.get(connect_in_deployment.get_handle().remote())

    assert "deployment-ception" in serve.list_deployments()
async def put_all_deployments(self, req: Request) -> Response:
    """Deploy the posted application and delete deployments not in it."""
    app = Application.from_dict(await req.json())
    serve.run(app, _blocking=False)

    # Anything currently running that is not part of the new app is stale.
    new_names = {deployment.name for deployment in app.deployments.values()}
    existing_names = set(serve.list_deployments().keys())
    internal_get_global_client().delete_deployments(existing_names - new_names)

    return Response()
async def put_all_deployments(self, req: Request) -> Response:
    """Deploy the posted application, then remove stale deployments."""
    app = Application.from_dict(await req.json())
    app.deploy(blocking=False)

    # Delete every running deployment that the new app does not declare.
    new_names = {deployment.name for deployment in app}
    all_deployments = serve.list_deployments()
    for stale_name in set(all_deployments.keys()) - new_names:
        all_deployments[stale_name].delete()

    return Response()
def test_idempotence_after_controller_death(ray_start_stop, use_command: bool):
    """Check that CLI is idempotent even if controller dies."""
    config_path = os.path.join(
        os.path.dirname(__file__), "test_config_files", "two_deployments.yaml"
    )
    expected_fragment = b"Sent deploy request successfully!"

    first_response = subprocess.check_output(["serve", "deploy", config_path])
    assert expected_fragment in first_response

    ray.init(address="auto", namespace="serve")
    serve.start(detached=True)
    assert len(serve.list_deployments()) == 2

    # Kill the controller through either the CLI or the Python API.
    if use_command:
        subprocess.check_output(["serve", "shutdown"])
    else:
        serve.shutdown()

    # With no controller, the CLI should report an empty deployment list.
    info = yaml.safe_load(subprocess.check_output(["serve", "config"]))
    assert "deployments" in info
    assert len(info["deployments"]) == 0

    # Deploying the same config again must succeed just like the first time.
    second_response = subprocess.check_output(["serve", "deploy", config_path])
    assert expected_fragment in second_response

    # Restore testing controller
    serve.start(detached=True)
    assert len(serve.list_deployments()) == 2
    serve.shutdown()
    ray.shutdown()
async def put_all_deployments(self, req: Request) -> Response:
    """Deploy the posted application schema and prune stale deployments."""
    body_text = await req.text()
    schema = ServeApplicationSchema.parse_raw(
        body_text, content_type="application/json")
    deploy_group(schema_to_serve_application(schema), _blocking=False)

    # Remove every running deployment the new schema does not declare.
    new_names = {d.name for d in schema.deployments}
    all_deployments = serve.list_deployments()
    for stale_name in set(all_deployments.keys()) - new_names:
        all_deployments[stale_name].delete()

    return Response()
def get_deployment(self, name, use_list_api):
    """Fetch a deployment by name via either the list API or direct lookup."""
    if use_list_api:
        return serve.list_deployments()[name]
    return serve.get_deployment(name)
def test_deploy(ray_start_stop):
    # Deploys some valid config files and checks that the deployments work
    # Initialize serve in test to enable calling serve.list_deployments()
    ray.init(address="auto", namespace=RAY_INTERNAL_DASHBOARD_NAMESPACE)
    serve.start(detached=True)

    # Create absolute file names to YAML config files
    three_deployments = os.path.join(
        os.path.dirname(__file__), "test_config_files", "three_deployments.yaml"
    )
    two_deployments = os.path.join(
        os.path.dirname(__file__), "test_config_files", "two_deployments.yaml"
    )
    deny_deployment = os.path.join(
        os.path.dirname(__file__), "test_config_files", "deny_access.yaml"
    )

    # Dictionary mapping test config file names to expected deployment names
    # and configurations. These should match the values specified in the YAML
    # files.
    configs = {
        three_deployments: {
            "shallow": {
                "num_replicas": 1,
                "response": "Hello shallow world!",
            },
            "deep": {
                "num_replicas": 1,
                "response": "Hello deep world!",
            },
            "one": {
                "num_replicas": 2,
                "response": "2",
            },
        },
        two_deployments: {
            "shallow": {
                "num_replicas": 3,
                "response": "Hello shallow world!",
            },
            "one": {
                "num_replicas": 2,
                "response": "2",
            },
        },
    }

    request_url = "http://localhost:8000/"
    success_message_fragment = b"Sent deploy request successfully!"

    # Check idempotence: deploying the same configs a second time must
    # succeed and converge to the same running state.
    for _ in range(2):
        for config_file_name, expected_deployments in configs.items():
            deploy_response = subprocess.check_output(
                ["serve", "deploy", config_file_name]
            )
            assert success_message_fragment in deploy_response

            # Each deployment must eventually answer HTTP with its
            # expected response body.
            for name, deployment_config in expected_deployments.items():
                wait_for_condition(
                    lambda: (
                        requests.get(f"{request_url}{name}").text
                        == deployment_config["response"]
                    ),
                    timeout=15,
                )

            running_deployments = serve.list_deployments()

            # Check that running deployment names match expected deployment names
            assert set(running_deployments.keys()) == expected_deployments.keys()

            # Replica counts must match what the YAML requested.
            for name, deployment in running_deployments.items():
                assert (
                    deployment.num_replicas
                    == expected_deployments[name]["num_replicas"]
                )

    # Deploy a deployment without HTTP access
    deploy_response = subprocess.check_output(["serve", "deploy", deny_deployment])
    assert success_message_fragment in deploy_response

    wait_for_condition(
        lambda: requests.get(f"{request_url}shallow").status_code == 404, timeout=15
    )
    # The deployment must still be reachable through its ServeHandle even
    # though its HTTP route was removed.
    assert (
        ray.get(serve.get_deployment("shallow").get_handle().remote())
        == "Hello shallow world!"
    )

    ray.shutdown()
def main(
    min_replicas: Optional[int],
    max_replicas: Optional[int],
    num_deployments: Optional[int],
    trial_length: Optional[str],
):
    """Run the multi-deployment autoscaling wrk benchmark and save metrics.

    Args:
        min_replicas: Autoscaling lower bound; defaulted per smoke/full test.
        max_replicas: Autoscaling upper bound; defaulted the same way.
        num_deployments: Number of deployments to create; defaulted likewise.
        trial_length: wrk trial duration string; defaulted likewise.
    """
    # Give default cluster parameter values based on smoke_test config
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    if is_smoke_test():
        min_replicas = min_replicas or DEFAULT_SMOKE_TEST_MIN_NUM_REPLICA
        max_replicas = max_replicas or DEFAULT_SMOKE_TEST_MAX_NUM_REPLICA
        num_deployments = num_deployments or DEFAULT_SMOKE_TEST_NUM_DEPLOYMENTS
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(f"Running smoke test with min {min_replicas} and max "
                    f"{max_replicas} replicas, {num_deployments} deployments "
                    f".. \n")
        # Choose cluster setup based on user config. Local test uses Cluster()
        # to mock actors that requires # of nodes to be specified, but ray
        # client doesn't need to
        num_nodes = int(math.ceil(max_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes .. \n")
        serve_client = setup_local_single_node_cluster(num_nodes)[0]
    else:
        min_replicas = min_replicas or DEFAULT_FULL_TEST_MIN_NUM_REPLICA
        max_replicas = max_replicas or DEFAULT_FULL_TEST_MAX_NUM_REPLICA
        num_deployments = num_deployments or DEFAULT_FULL_TEST_NUM_DEPLOYMENTS
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with min {min_replicas} and max "
                    f"{max_replicas} replicas, {num_deployments} deployments "
                    f".. \n")
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with min {min_replicas} and max {max_replicas}"
                f"target replicas ....\n")
    setup_multi_deployment_replicas(min_replicas, max_replicas, num_deployments)

    logger.info("Warming up cluster ....\n")
    # Warm every endpoint in parallel; num_cpus=0 so the warmup tasks do
    # not compete with replicas for CPU.
    endpoint_refs = []
    all_endpoints = list(serve.list_deployments().keys())
    for endpoint in all_endpoints:
        endpoint_refs.append(
            warm_up_one_cluster.options(num_cpus=0).remote(
                10, http_host, http_port, endpoint))
    for endpoint in ray.get(endpoint_refs):
        logger.info(f"Finished warming up {endpoint}")

    logger.info(f"Starting wrk trial on all nodes for {trial_length} ....\n")
    # For detailed discussion, see https://github.com/wg/wrk/issues/205
    # TODO:(jiaodong) What's the best number to use here ?
    all_metrics, all_wrk_stdout = run_wrk_on_all_nodes(
        trial_length,
        NUM_CONNECTIONS,
        http_host,
        http_port,
        all_endpoints=all_endpoints)

    aggregated_metrics = aggregate_all_metrics(all_metrics)
    logger.info("Wrk stdout on each node: ")
    for wrk_stdout in all_wrk_stdout:
        logger.info(wrk_stdout)
    logger.info("Final aggregated metrics: ")
    for key, val in aggregated_metrics.items():
        logger.info(f"{key}: {val}")
    save_test_results(
        aggregated_metrics,
        default_output_file="/tmp/autoscaling_multi_deployment.json")
async def get_all_deployments(self, req: Request) -> Response:
    """Serialize the current deployments into an Application JSON payload."""
    current_deployments = list(serve.list_deployments().values())
    app = Application(current_deployments)
    return Response(
        text=json.dumps(app.to_dict()),
        content_type="application/json",
    )
def serve_instance(_shared_serve_instance):
    """Yield the shared Serve instance; delete all deployments afterwards."""
    yield _shared_serve_instance
    # Clear all state between tests to avoid naming collisions.
    for remaining in list(serve.list_deployments().values()):
        remaining.delete()
if __name__ == '__main__':
    # Start a Ray Serve instance. This will automatically start
    # or connect to an existing Ray cluster.
    serve.start()

    # Create two distinct deployments of the same class as
    # two replicas. Associate each deployment with a unique 'name'.
    # This name can be used as to fetch its respective serve handle.
    # See code below for method 1.
    Deployment.options(name="rep-1", num_replicas=2).deploy("/model/rep-1.pkl")
    Deployment.options(name="rep-2", num_replicas=2).deploy("/model/rep-2.pkl")

    # Get the current list of deployments
    print(serve.list_deployments())

    print("ServerHandle API responses: " + "--" * 5)

    # Method 1) Access each deployment using the ServerHandle API
    for _ in range(2):
        for d_name in ["rep-1", "rep-2"]:
            # Get handle to the each deployment and invoke its method.
            # Which replica the request is dispatched to is determined
            # by the Router actor.
            handle = serve.get_deployment(d_name).get_handle()
            print(f"handle name : {d_name}")
            # random() supplies an arbitrary input for the prediction call.
            print(f"prediction : {ray.get(handle.remote(random()))}")
            print("-" * 2)

    print("HTTP responses: " + "--" * 5)
def list_deployments(self, **kwargs):
    """Return every Serve deployment as a {"name", "info"} record."""
    current = serve.list_deployments()
    return [
        {"name": deployment_name, "info": deployment_info}
        for deployment_name, deployment_info in current.items()
    ]
def get_deployment(self, name):
    """Fetch a single Serve deployment record by name.

    Raises:
        MlflowException: if no deployment named ``name`` exists.
    """
    try:
        info = serve.list_deployments()[name]
    except KeyError:
        raise MlflowException(f"No deployment with name {name} found")
    return {"name": name, "info": info}
def test_deploy(ray_start_stop):
    # Deploys two valid config files and checks that the deployments work
    # Initialize serve in test to enable calling serve.list_deployments()
    ray.init(address="auto", namespace=RAY_INTERNAL_DASHBOARD_NAMESPACE)
    serve.start(detached=True)

    # Create absolute file names to YAML config files
    three_deployments = os.path.join(os.path.dirname(__file__),
                                     "test_config_files",
                                     "three_deployments.yaml")
    two_deployments = os.path.join(os.path.dirname(__file__),
                                   "test_config_files", "two_deployments.yaml")

    # Dictionary mapping test config file names to expected deployment names
    # and configurations. These should match the values specified in the YAML
    # files.
    configs = {
        three_deployments: {
            "shallow": {
                "num_replicas": 1,
                "response": "Hello shallow world!",
            },
            "deep": {
                "num_replicas": 1,
                "response": "Hello deep world!",
            },
            "one": {
                "num_replicas": 2,
                "response": "2",
            },
        },
        two_deployments: {
            "shallow": {
                "num_replicas": 3,
                "response": "Hello shallow world!",
            },
            "one": {
                "num_replicas": 2,
                "response": "2",
            },
        },
    }

    request_url = "http://localhost:8000/"
    success_message_fragment = b"Sent deploy request successfully!"
    for config_file_name, expected_deployments in configs.items():
        # The CLI prints this fragment on a successful deploy request.
        deploy_response = subprocess.check_output(
            ["serve", "deploy", config_file_name])
        assert success_message_fragment in deploy_response

        running_deployments = serve.list_deployments()

        # Check that running deployment names match expected deployment names
        assert set(running_deployments.keys()) == expected_deployments.keys()

        # Replica counts must match what the YAML requested.
        for name, deployment in running_deployments.items():
            assert deployment.num_replicas == expected_deployments[name][
                "num_replicas"]

        # Each deployment must answer HTTP with its expected response body.
        for name, deployment_config in expected_deployments.items():
            assert (requests.get(f"{request_url}{name}").text ==
                    deployment_config["response"])
def test_ray_client(ray_client_instance):
    # Connect this test process, then run driver scripts over Ray Client to
    # verify Serve can be started and used from remote drivers.
    ray.util.connect(ray_client_instance, namespace="default_test_namespace")

    start = """
import ray
ray.util.connect("{}", namespace="default_test_namespace")

from ray import serve

serve.start(detached=True)
""".format(ray_client_instance)
    run_string_as_driver(start)

    deploy = """
import ray
ray.util.connect("{}", namespace="default_test_namespace")

from ray import serve

@serve.deployment(name="test1", route_prefix="/hello")
def f(*args):
    return "hello"

f.deploy()
""".format(ray_client_instance)
    run_string_as_driver(deploy)

    assert "test1" in serve.list_deployments()
    # NOTE(review): the span below appears corrupted/redacted in this source —
    # credential-style "*****:*****" masking has swallowed the original URL
    # assertion and the opening of the `fastapi` driver script (its imports,
    # the FastAPI() construction, and the `fastapi = \"\"\"` assignment).
    # Restore this region from version control before running; it is
    # reproduced verbatim here.
    assert requests.get("http://*****:*****@app.get("/")
def hello():
    return "hello"

@serve.deployment
@serve.ingress(app)
class A:
    pass

A.deploy()
""".format(ray_client_instance)
    run_string_as_driver(fastapi)
    assert requests.get("http://localhost:8000/A").json() == "hello"

    serve.shutdown()
    ray.util.disconnect()