def scale(backend_tag, num_replicas):
    """Set the number of replicas for a backend.

    Connects to a Ray cluster with ``address="auto"``, initializes serve,
    and forwards the request to ``serve.scale``.

    Args:
        backend_tag: Tag of the backend whose replica count is changed.
        num_replicas: Requested replica count; must be greater than 0.

    Raises:
        click.Abort: If ``num_replicas`` is less than or equal to 0. No
            connection to Ray is attempted in that case.
    """
    if num_replicas <= 0:
        message = "Cannot set number of replicas to be smaller or equal to 0."
        # click's handler for Abort prints only "Aborted!", so emit the
        # reason on stderr before raising.
        click.echo(message, err=True)
        # The exception has to be raised; constructing it alone is a no-op
        # and would let the invalid value reach serve.scale below.
        raise click.Abort(message)

    ray.init(address="auto")
    serve.init()
    serve.scale(backend_tag, num_replicas)
def test_scaling_replicas(serve_instance):
    """Check that request load follows the replica count of a backend.

    A stateful counter backend is scaled to two replicas and then back to
    one; the counts returned over HTTP are used to infer how the requests
    were distributed.
    """
    increment_url = "http://127.0.0.1:8000/increment"

    class Counter:
        def __init__(self):
            self.count = 0

        def __call__(self, _):
            self.count += 1
            return self.count

    def collect_counts(num_requests):
        # Issue the requests sequentially and gather each returned count.
        return [
            requests.get(increment_url).json()["result"]
            for _ in range(num_requests)
        ]

    serve.create_endpoint("counter", "/increment")

    # Poll the routing table until the /increment route shows up.
    while True:
        routing_table = requests.get("http://127.0.0.1:8000/").json()
        if "/increment" in routing_table:
            break
        time.sleep(0.2)

    serve.create_backend(Counter, "counter:v1")
    serve.link("counter", "counter:v1")

    serve.scale("counter:v1", 2)
    two_replica_counts = collect_counts(10)
    # With the load shared by two replicas, no single counter can reach 10.
    highest_count = max(two_replica_counts)
    assert highest_count < 10

    serve.scale("counter:v1", 1)
    one_replica_counts = collect_counts(10)
    # A replica may still be spinning down, but most of the requests should
    # land on the single remaining replica, giving a wide spread of counts.
    spread = max(one_replica_counts) - min(one_replica_counts)
    assert spread > 6
# The endpoint can be called over HTTP ...
echo_url = "http://127.0.0.1:8000/echo"
print(requests.get(echo_url).json())

# ... and also through a handle from within the Ray system.
endpoint_handle = serve.get_handle("my_endpoint")
print(ray.get(endpoint_handle.remote(response="hello")))


# A second backend can be added so that traffic is split between the two.
def echo_v2(flask_request):
    # The request argument is not used; the reply is always the same.
    return "something new"


serve.create_backend(echo_v2, "echo:v2")

# Assign each of the two backends 50% of the endpoint's traffic.
traffic_weights = {"echo:v1": 0.5, "echo:v2": 0.5}
serve.split("my_endpoint", traffic_weights)

# Send a few requests to observe replies coming from both backends.
for _ in range(10):
    reply = requests.get(echo_url)
    print(reply.json())
    time.sleep(0.5)

# Each backend can be scaled independently.
for backend_tag in ("echo:v1", "echo:v2"):
    serve.scale(backend_tag, 2)

# System metrics can be retrieved as well.
serve_stats = serve.stat()
print(pformat_color_json(serve_stats))