def test_not_killing_replicas(serve_instance):
    class BatchSimple:
        def __init__(self):
            self.count = 0

        @serve.accept_batch
        def __call__(self, flask_request, temp=None):
            batch_size = serve.context.batch_size
            return [1] * batch_size

    serve.create_endpoint("bsimple", "/bsimple")
    b_config = BackendConfig(num_replicas=3, max_batch_size=2)
    serve.create_backend(BatchSimple, "bsimple:v1", backend_config=b_config)
    global_state = serve.api._get_global_state()
    old_replica_tag_list = global_state.backend_table.list_replicas(
        "bsimple:v1")

    bnew_config = serve.get_backend_config("bsimple:v1")
    # Change the config. max_batch_size is a runtime parameter, so updating
    # it should not require restarting the replicas.
    bnew_config.max_batch_size = 5
    # Set the config.
    serve.set_backend_config("bsimple:v1", bnew_config)
    new_replica_tag_list = global_state.backend_table.list_replicas(
        "bsimple:v1")
    global_state.refresh_actor_handle_cache()
    new_all_tag_list = list(global_state.actor_handle_cache.keys())

    # The old and new replica tag lists should be identical,
    # and both should be a subset of all_tag_list.
    assert set(old_replica_tag_list) <= set(new_all_tag_list)
    assert set(old_replica_tag_list) == set(new_replica_tag_list)
def test_killing_replicas(serve_instance):
    class Simple:
        def __init__(self):
            self.count = 0

        def __call__(self, flask_request, temp=None):
            return temp

    serve.create_endpoint("simple", "/simple")
    b_config = BackendConfig(num_replicas=3, num_cpus=2)
    serve.create_backend(Simple, "simple:v1", backend_config=b_config)
    global_state = serve.api._get_global_state()
    old_replica_tag_list = global_state.backend_table.list_replicas(
        "simple:v1")

    bnew_config = serve.get_backend_config("simple:v1")
    # Change the config. num_cpus changes the replica actors' resource
    # requirements, so the old replicas must be killed and replaced.
    bnew_config.num_cpus = 1
    # Set the config.
    serve.set_backend_config("simple:v1", bnew_config)
    new_replica_tag_list = global_state.backend_table.list_replicas(
        "simple:v1")
    global_state.refresh_actor_handle_cache()
    new_all_tag_list = list(global_state.actor_handle_cache.keys())

    # The new_replica_tag_list must be a subset of all_tag_list.
    assert set(new_replica_tag_list) <= set(new_all_tag_list)
    # The old_replica_tag_list must not be a subset of all_tag_list.
    assert not set(old_replica_tag_list) <= set(new_all_tag_list)
def test_scaling_replicas(serve_instance):
    class Counter:
        def __init__(self):
            self.count = 0

        def __call__(self, _):
            self.count += 1
            return self.count

    serve.create_endpoint("counter", "/increment")

    # Keep checking the routing table until /increment is populated.
    while "/increment" not in requests.get("http://127.0.0.1:8000/").json():
        time.sleep(0.2)

    b_config = BackendConfig(num_replicas=2)
    serve.create_backend(Counter, "counter:v1", backend_config=b_config)
    serve.link("counter", "counter:v1")

    counter_result = []
    for _ in range(10):
        resp = requests.get(
            "http://127.0.0.1:8000/increment").json()["result"]
        counter_result.append(resp)
    # If the load is shared between the two replicas, neither counter can
    # reach 10, so the max result cannot be 10.
    assert max(counter_result) < 10

    b_config = serve.get_backend_config("counter:v1")
    b_config.num_replicas = 1
    serve.set_backend_config("counter:v1", b_config)

    counter_result = []
    for _ in range(10):
        resp = requests.get(
            "http://127.0.0.1:8000/increment").json()["result"]
        counter_result.append(resp)
    # Give the extra replica some time to spin down, but the majority of
    # the requests should be served by the only remaining replica.
    assert max(counter_result) - min(counter_result) > 6
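# The three tests above assume a `serve_instance` pytest fixture defined
# elsewhere (typically the suite's conftest.py). Below is a minimal sketch of
# what such a fixture might look like; the import paths here (and the
# BackendConfig import the tests need) are assumptions based on this era of
# Ray Serve, not the project's actual conftest.
import pytest

import ray
from ray.experimental import serve


@pytest.fixture
def serve_instance():
    # Start Ray and Serve for the test; a real fixture would typically
    # reuse a session-scoped cluster and clean up afterwards.
    if not ray.is_initialized():
        ray.init()
    serve.init()
    yield serve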
])
ray.init(address=cluster.address, include_webui=True, webui_host="0.0.0.0")
serve.init(blocking=True,
           kv_store_connector=lambda ns: RayInternalKVStore(ns))


@serve.route("/echo")
@serve.accept_batch
def echo(_):
    time.sleep(0.01)  # Sleep for 10ms.
    ray.show_in_webui(str(serve.context.batch_size), key="Current batch size")
    return ["hi {}".format(i) for i in range(serve.context.batch_size)]


print("Scaling to 30 replicas")
config = serve.get_backend_config("echo:v0")
config.num_replicas = 30
config.max_batch_size = 16
serve.set_backend_config("echo:v0", config)

print("Warming up")
for _ in range(5):
    resp = requests.get("http://127.0.0.1:8000/echo").json()
    print(resp)
    time.sleep(0.5)

# Size the load at 75% of theoretical capacity:
# 30 replicas * 16 max batch size * 0.75 = 360 concurrent connections.
connections = int(config.num_replicas * config.max_batch_size * 0.75)
proc = subprocess.Popen([
    "./hey_linux_amd64", "-c", str(connections), "-z", "360m",
    "http://127.0.0.1:8000/echo"
],
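    # (The Popen call is truncated here in the original; the remaining
    # arguments and the wait below are a hedged guess at how the driver
    # script plausibly continues, not its actual code.)
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE)

# `hey` was started with "-z 360m" above, so it drives load for six hours;
# the driver can simply block on it while the webui shows batch sizes.
print("Load generator running with {} connections".format(connections))
proc.wait()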
serve.init(blocking=True)

# An endpoint is associated with an HTTP URL.
serve.create_endpoint("my_endpoint", "/echo")


# A backend can be a function or a class.
# It can be invoked both from the web and from Python.
def echo_v1(flask_request, response="hello from python!"):
    if serve.context.web:
        response = flask_request.url
    return response


serve.create_backend(echo_v1, "echo:v1")
backend_config_v1 = serve.get_backend_config("echo:v1")

# We can link an endpoint to a backend, which means all the traffic
# going to my_endpoint will now go to the echo:v1 backend.
serve.link("my_endpoint", "echo:v1")

print(requests.get("http://127.0.0.1:8000/echo").json())
# The service is reachable over HTTP...

print(ray.get(serve.get_handle("my_endpoint").remote(response="hello")))
# ...as well as from within the Ray system.


# We can also add a new backend and split the traffic.
def echo_v2(flask_request):
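    # (The original snippet is cut off at the definition above. What follows
    # is a hedged sketch of how the example plausibly continues, not the
    # file's actual code: this body is hypothetical, and serve.split's
    # signature is assumed from this era of the Serve API.)
    return "something new"


serve.create_backend(echo_v2, "echo:v2")
# Split traffic evenly between the two backends (illustrative weights).
serve.split("my_endpoint", {"echo:v1": 0.5, "echo:v2": 0.5})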