def test_not_killing_replicas(serve_instance):
    """Raising max_batch_size must reconfigure replicas in place, not
    restart them: the replica tags before and after the config push must
    be identical and all still registered with the master actor."""

    class BatchSimple:
        def __init__(self):
            self.count = 0

        @serve.accept_batch
        def __call__(self, flask_request, temp=None):
            return [1] * serve.context.batch_size

    serve.create_endpoint("bsimple", "/bsimple")
    initial_config = BackendConfig(num_replicas=3, max_batch_size=2)
    serve.create_backend(
        BatchSimple, "bsimple:v1", backend_config=initial_config)

    master_actor = serve.api._get_master_actor()
    replicas_before = ray.get(
        master_actor._list_replicas.remote("bsimple:v1"))

    # Bump max_batch_size and push the updated config.
    updated_config = serve.get_backend_config("bsimple:v1")
    updated_config.max_batch_size = 5
    serve.set_backend_config("bsimple:v1", updated_config)

    replicas_after = ray.get(
        master_actor._list_replicas.remote("bsimple:v1"))
    live_tags = [
        tag for worker_dict in ray.get(
            master_actor.get_all_worker_handles.remote()).values()
        for tag in worker_dict
    ]

    # The old tags are still among the live worker handles, and the
    # replica set did not change at all.
    assert set(replicas_before) <= set(live_tags)
    assert set(replicas_before) == set(replicas_after)
def test_not_killing_replicas(serve_instance):
    """Raising max_batch_size must reconfigure replicas in place, not
    restart them: the replica tags before and after the config push must
    be identical and all still present in the actor handle cache."""

    class BatchSimple:
        def __init__(self):
            self.count = 0

        @serve.accept_batch
        def __call__(self, flask_request, temp=None):
            return [1] * serve.context.batch_size

    serve.create_endpoint("bsimple", "/bsimple")
    initial_config = BackendConfig(num_replicas=3, max_batch_size=2)
    serve.create_backend(
        BatchSimple, "bsimple:v1", backend_config=initial_config)

    global_state = serve.api._get_global_state()
    replicas_before = global_state.backend_table.list_replicas("bsimple:v1")

    # Bump max_batch_size and push the updated config.
    updated_config = serve.get_backend_config("bsimple:v1")
    updated_config.max_batch_size = 5
    serve.set_backend_config("bsimple:v1", updated_config)

    replicas_after = global_state.backend_table.list_replicas("bsimple:v1")
    global_state.refresh_actor_handle_cache()
    live_tags = list(global_state.actor_handle_cache.keys())

    # The old tags are still among the live actor handles, and the
    # replica set did not change at all.
    assert set(replicas_before) <= set(live_tags)
    assert set(replicas_before) == set(replicas_after)
def test_killing_replicas(serve_instance):
    """Changing num_cpus cannot be applied in place, so every replica must
    be replaced: the new tags are live, none of the old tags survive."""

    class Simple:
        def __init__(self):
            self.count = 0

        def __call__(self, flask_request, temp=None):
            return temp

    serve.create_endpoint("simple", "/simple")
    initial_config = BackendConfig(num_replicas=3, num_cpus=2)
    serve.create_backend(Simple, "simple:v1", backend_config=initial_config)

    master_actor = serve.api._get_master_actor()
    replicas_before = ray.get(
        master_actor._list_replicas.remote("simple:v1"))

    # Lower num_cpus, which forces a replica restart.
    updated_config = serve.get_backend_config("simple:v1")
    updated_config.num_cpus = 1
    serve.set_backend_config("simple:v1", updated_config)

    replicas_after = ray.get(
        master_actor._list_replicas.remote("simple:v1"))
    live_tags = [
        tag for worker_dict in ray.get(
            master_actor.get_all_worker_handles.remote()).values()
        for tag in worker_dict
    ]

    # The replacement replicas are live worker handles...
    assert set(replicas_after) <= set(live_tags)
    # ...while the original replicas have all been torn down.
    assert not set(replicas_before) <= set(live_tags)
def test_killing_replicas(serve_instance):
    """Changing num_cpus cannot be applied in place, so every replica must
    be replaced: the new tags are cached handles, none of the old tags
    survive."""

    class Simple:
        def __init__(self):
            self.count = 0

        def __call__(self, flask_request, temp=None):
            return temp

    serve.create_endpoint("simple", "/simple")
    initial_config = BackendConfig(num_replicas=3, num_cpus=2)
    serve.create_backend(Simple, "simple:v1", backend_config=initial_config)

    global_state = serve.api._get_global_state()
    replicas_before = global_state.backend_table.list_replicas("simple:v1")

    # Lower num_cpus, which forces a replica restart.
    updated_config = serve.get_backend_config("simple:v1")
    updated_config.num_cpus = 1
    serve.set_backend_config("simple:v1", updated_config)

    replicas_after = global_state.backend_table.list_replicas("simple:v1")
    global_state.refresh_actor_handle_cache()
    live_tags = list(global_state.actor_handle_cache.keys())

    # The replacement replicas are in the handle cache...
    assert set(replicas_after) <= set(live_tags)
    # ...while the original replicas have all been torn down.
    assert not set(replicas_before) <= set(live_tags)
def test_worker_replica_failure(serve_instance):
    """With two replicas, killing one must not stop the endpoint from
    serving: the surviving replica handles all requests while the
    restarted one deliberately hangs in its constructor."""
    serve.http_proxy.MAX_ACTOR_DEAD_RETRIES = 0
    serve.init()
    serve.create_endpoint(
        "replica_failure", "/replica_failure", methods=["GET"])

    class Worker:
        # Assumes that two replicas are started. Will hang forever in the
        # constructor for any workers that are restarted. A shared file at
        # `path` counts constructor invocations: the third construction
        # (a restart after the kill below) spins forever.
        def __init__(self, path):
            self.should_hang = False
            if not os.path.exists(path):
                # First replica: seed the counter file.
                with open(path, "w") as f:
                    f.write("1")
            else:
                with open(path, "r") as f:
                    num = int(f.read())
                with open(path, "w") as f:
                    if num == 2:
                        # Third construction => a restarted worker.
                        self.should_hang = True
                    else:
                        f.write(str(num + 1))
            if self.should_hang:
                while True:
                    pass

        def __call__(self):
            pass

    # Random file name so repeated test runs don't share a counter.
    temp_path = tempfile.gettempdir() + "/" + serve.utils.get_random_letters()
    serve.create_backend(Worker, "replica_failure", temp_path)
    backend_config = serve.get_backend_config("replica_failure")
    backend_config.num_replicas = 2
    serve.set_backend_config("replica_failure", backend_config)
    serve.set_traffic("replica_failure", {"replica_failure": 1.0})

    # Wait until both replicas have been started.
    # NOTE(review): `responses` starts empty, so `len(responses) == 1` is
    # False on entry and this loop body never runs — the "wait" is a no-op.
    # A literal fix (`len(responses) < 2`) could hang forever, since
    # Worker.__call__ returns None for every replica and the response
    # texts may be indistinguishable. Needs a deliberate fix — confirm
    # intended behavior with the test author.
    responses = set()
    while len(responses) == 1:
        responses.add(
            request_with_retries("/replica_failure", timeout=0.1).text)
        time.sleep(0.1)

    # Kill one of the replicas.
    handles = _get_worker_handles("replica_failure")
    assert len(handles) == 2
    ray.kill(handles[0])

    # Check that the other replica still serves requests.
    for _ in range(10):
        while True:
            try:
                # The timeout needs to be small here because the request to
                # the restarting worker will hang.
                request_with_retries("/replica_failure", timeout=0.1)
                break
            except TimeoutError:
                time.sleep(0.1)
def test_scaling_replicas(serve_instance):
    """Scaling from two replicas down to one should funnel (nearly) all
    traffic to the single surviving replica's counter."""

    class Counter:
        def __init__(self):
            self.count = 0

        def __call__(self, _):
            self.count += 1
            return self.count

    serve.create_endpoint("counter", "/increment")

    # Block until the route table advertises /increment.
    while "/increment" not in requests.get(
            "http://127.0.0.1:8000/-/routes").json():
        time.sleep(0.2)

    two_replica_config = BackendConfig(num_replicas=2)
    serve.create_backend(
        Counter, "counter:v1", backend_config=two_replica_config)
    serve.set_traffic("counter", {"counter:v1": 1.0})

    results = [
        requests.get("http://127.0.0.1:8000/increment").json()
        for _ in range(10)
    ]
    # Two replicas share the 10 requests, so no single counter hits 10.
    assert max(results) < 10

    # Scale down to a single replica.
    scaled_config = serve.get_backend_config("counter:v1")
    scaled_config.num_replicas = 1
    serve.set_backend_config("counter:v1", scaled_config)

    results = [
        requests.get("http://127.0.0.1:8000/increment").json()
        for _ in range(10)
    ]
    # Give some time for a replica to spin down; the majority of requests
    # should land on the lone remaining replica, so its counter values
    # span most of the 10 requests.
    assert max(results) - min(results) > 6
def set_max_batch_size(self, new_max_batch_size, backend_tag=None):
    """Set max_batch_size on the (unique) backend's config and push it.

    If backend_tag is None, the single backend behind this handle is used.
    """
    tag = self._ensure_backend_unique(backend_tag)
    updated = serve.get_backend_config(tag)
    updated.max_batch_size = new_max_batch_size
    serve.set_backend_config(tag, updated)
def scale(self, new_num_replicas, backend_tag=None):
    """Set num_replicas on the (unique) backend's config and push it.

    If backend_tag is None, the single backend behind this handle is used.
    """
    tag = self._ensure_backend_unique(backend_tag)
    updated = serve.get_backend_config(tag)
    updated.num_replicas = new_num_replicas
    serve.set_backend_config(tag, updated)
# Load-test driver: scale the "echo:v0" backend up and hammer it with the
# `hey` HTTP load generator. Assumes serve/ray/requests/RayInternalKVStore/
# PIPE are imported earlier in the file — confirm.
serve.init(blocking=True,
           kv_store_connector=lambda ns: RayInternalKVStore(ns))


@serve.route("/echo")
@serve.accept_batch
def echo(_):
    # Simulate per-batch work and surface the batch size in the dashboard.
    time.sleep(0.01)  # Sleep for 10ms
    ray.show_in_webui(str(serve.context.batch_size), key="Current batch size")
    return ["hi {}".format(i) for i in range(serve.context.batch_size)]


print("Scaling to 30 replicas")
config = serve.get_backend_config("echo:v0")
config.num_replicas = 30
config.max_batch_size = 16
serve.set_backend_config("echo:v0", config)

# Issue a few requests so all replicas are up before the benchmark.
print("Warming up")
for _ in range(5):
    resp = requests.get("http://127.0.0.1:8000/echo").json()
    print(resp)
    time.sleep(0.5)

# 75% of theoretical capacity (replicas * batch size) as concurrency.
connections = int(config.num_replicas * config.max_batch_size * 0.75)
# Run `hey` for 360 minutes against the endpoint, capturing its output.
proc = subprocess.Popen(
    [
        "./hey_linux_amd64", "-c",
        str(connections), "-z", "360m", "http://127.0.0.1:8000/echo"
    ],
    stdout=PIPE,
    stderr=PIPE)
print("started load testing")
# as well as within the ray system. # We can also add a new backend and split the traffic. def echo_v2(flask_request): # magic, only from web. return "something new" serve.create_backend(echo_v2, "echo:v2") backend_config_v2 = serve.get_backend_config("echo:v2") # The two backend will now split the traffic 50%-50%. serve.split("my_endpoint", {"echo:v1": 0.5, "echo:v2": 0.5}) # Observe requests are now split between two backends. for _ in range(10): print(requests.get("http://127.0.0.1:8000/echo").text) time.sleep(0.5) # You can also change number of replicas # for each backend independently. backend_config_v1.num_replicas = 2 serve.set_backend_config("echo:v1", backend_config_v1) backend_config_v2.num_replicas = 2 serve.set_backend_config("echo:v2", backend_config_v2) # As well as retrieving relevant system metrics print(pformat_color_json(serve.stat()))