def run(self):
    tensor_data = construct_tensor(self.config)
    for batch_size, pipeline_length in product(
            self.config["max_batch_sizes"], self.config["pipeline_lengths"]):
        df_row = dict(
            batch_size=batch_size,
            pipeline_length=pipeline_length,
            tensor_type=self.config["tensor_type"],
            tensor_shape="x".join(
                [str(shape) for shape in self.config["tensor_shape"]]),
            serving_type=self.config["serving_type"],
            arrival_process=self.config["arrival_process"],
        )

        # Initialize serve without its HTTP server; requests go through
        # Ray handles directly.
        serve_benchmark.init(start_server=False)
        chain_pipeline = Chain(
            max_batch_size=batch_size, pipeline_length=pipeline_length)

        # Warmup: run 200 requests through the chain before measuring.
        ray.wait(
            [chain_pipeline.remote(tensor_data) for _ in range(200)],
            num_returns=200)

        # Throughput calculation: issue all requests at once and wait for
        # every result to complete.
        start_time = time.perf_counter()
        ray.wait(
            [
                chain_pipeline.remote(tensor_data)
                for _ in range(self.config["num_requests"])
            ],
            num_returns=self.config["num_requests"],
        )
        end_time = time.perf_counter()
        duration = end_time - start_time
        qps = self.config["num_requests"] / duration
        df_row.update(throughput_qps=qps)
        serve_benchmark.clear_trace()

        # Closed-loop latency calculation: one request in flight at a time.
        closed_loop_latencies = list()
        for _ in range(self.config["num_requests"]):
            start_time = time.perf_counter()
            ray.wait([chain_pipeline.remote(tensor_data)], num_returns=1)
            end_time = time.perf_counter()
            latency = end_time - start_time
            closed_loop_latencies.append(latency)

        pprint(df_row)
        df_row.update(latency_s=closed_loop_latencies)
        self._df = self._df.append(df_row, ignore_index=True)

        # Cleanup.
        del closed_loop_latencies, chain_pipeline
        serve_benchmark.shutdown()
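# The raw per-request latencies stored above in ``latency_s`` are easier to
# compare once reduced to percentiles. A minimal sketch of such a summary,
# assuming NumPy is available; ``summarize_latencies`` is a hypothetical
# helper, not part of the benchmark harness.
import numpy as np


def summarize_latencies(latencies_s):
    """Reduce raw per-request latencies (seconds) to percentiles in ms."""
    arr = np.asarray(latencies_s)
    return {
        "p50_ms": float(np.percentile(arr, 50)) * 1000,
        "p95_ms": float(np.percentile(arr, 95)) * 1000,
        "p99_ms": float(np.percentile(arr, 99)) * 1000,
    }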
def test_e2e(serve_instance):
    serve_benchmark.init()  # so we have access to global state
    serve_benchmark.create_endpoint(
        "endpoint", "/api", methods=["GET", "POST"])
    result = serve_benchmark.api._get_global_state().route_table.list_service()
    assert result["/api"] == "endpoint"

    # Poll the route table with exponential backoff until it reflects the
    # newly created endpoint.
    retry_count = 5
    timeout_sleep = 0.5
    while True:
        try:
            resp = requests.get(
                "http://127.0.0.1:8000/-/routes", timeout=0.5).json()
            assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
            break
        except Exception as e:
            time.sleep(timeout_sleep)
            timeout_sleep *= 2
            retry_count -= 1
            if retry_count == 0:
                assert False, ("Route table hasn't been updated after 5 "
                               "tries. The latest error was {}").format(e)

    def function(flask_request):
        return {"method": flask_request.method}

    serve_benchmark.create_backend(function, "echo:v1")
    serve_benchmark.link("endpoint", "echo:v1")

    resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "GET"

    resp = requests.post("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "POST"
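# The polling loop in ``test_e2e`` implements exponential backoff inline. The
# same pattern as a reusable sketch; ``retry_with_backoff`` is a hypothetical
# helper, not part of the test suite.
import time


def retry_with_backoff(fn, retries=5, initial_sleep_s=0.5):
    """Call ``fn`` until it succeeds, doubling the sleep between attempts."""
    sleep_s = initial_sleep_s
    for attempt in range(retries):
        try:
            return fn()
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(sleep_s)
            sleep_s *= 2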
@pytest.fixture
def serve_instance():
    # Back the instance with a throwaway kv-store file.
    _, new_db_path = tempfile.mkstemp(suffix=".test.db")
    serve_benchmark.init(
        kv_store_path=new_db_path,
        blocking=True,
        ray_init_kwargs={"num_cpus": 36},
    )
    yield
    # Teardown: remove the temporary kv-store file once the test finishes.
    os.remove(new_db_path)
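# Usage sketch: pytest injects fixtures by argument name, so a test that
# declares a ``serve_instance`` parameter (as ``test_e2e`` above does) runs
# against this temporary kv-store-backed instance, with the database file
# removed on teardown. A hypothetical minimal consumer:
def test_uses_serve_instance(serve_instance):
    assert serve_benchmark.api._get_global_state() is not None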
def run(self):
    for vertex_config in self.config["vertex_configs"]:
        serve_benchmark.init(start_server=False)
        pipeline = ImagePrepocPipeline(vertex_config,
                                       self.config["model_type"])
        vertex_config_name = json.dumps(vertex_config)
        df_row = dict(
            vertex_config=vertex_config_name,
            serving_type=self.config["serving_type"],
            arrival_process=self.config["arrival_process"],
        )

        image_path = os.path.join(ROOT_DIR, self.config["image_file_path"])
        tensor_data = base64.b64encode(open(image_path, "rb").read())

        throughput_qps = self._throughput_calculation(
            pipeline, tensor_data, self.config["num_requests"])
        df_row.update(throughput_qps=throughput_qps)
        pprint(df_row)

        # Closed-loop latency calculation: one request in flight at a time.
        closed_loop_latencies = list()
        for _ in tqdm(range(30)):
            start_time = time.perf_counter()
            ready, _ = ray.wait([pipeline.remote(tensor_data)], num_returns=1)
            end_time = time.perf_counter()
            latency = end_time - start_time
            closed_loop_latencies.append(latency)

        df_row.update(latency_s=closed_loop_latencies)
        self._df = self._df.append(df_row, ignore_index=True)

        # Cleanup.
        del closed_loop_latencies, pipeline
        serve_benchmark.shutdown()
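# ``_throughput_calculation`` is defined elsewhere on the benchmark class.
# Based on the inline measurement in the Chain benchmark above, a sketch of
# what it plausibly does; this is an assumption, not the actual
# implementation.
import time

import ray


def _throughput_calculation(pipeline, data, num_requests):
    """Fire ``num_requests`` concurrently and return completed QPS."""
    start_time = time.perf_counter()
    ray.wait(
        [pipeline.remote(data) for _ in range(num_requests)],
        num_returns=num_requests,
    )
    duration = time.perf_counter() - start_time
    return num_requests / duration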
def run(self):
    for vertex_config in self.config["vertex_configs"]:
        for arrival_config in self.config["arrival_config"]:
            serve_benchmark.init(start_server=False)
            filename_query = "arrival_trace.jsonl"
            route = "/prepoc"
            pipeline = ImagePrepocPipeline(vertex_config,
                                           self.config["model_type"])
            vertex_config_name = json.dumps(vertex_config)
            df_row = dict(
                vertex_config=vertex_config_name,
                serving_type=self.config["serving_type"],
                arrival_process=json.dumps(arrival_config),
            )

            image_path = os.path.join(ROOT_DIR,
                                      self.config["image_file_path"])
            throughput_qps = self._throughput_calculation(
                pipeline, image_path, arrival_config["num_requests"])
            df_row.update(throughput_qps=throughput_qps)
            pprint(df_row)

            # Route open-loop traffic through the HTTP proxy, which logs
            # per-request timings to ``filename_query``.
            http_actor = HTTPProxyActor.remote(
                host="127.0.0.1",
                port=8000,
                serving_backend=self.config["serving_type"],
                filename=filename_query,
            )
            ray.get(
                http_actor.register_route.remote(route,
                                                 pipeline.chain_handle))
            go_client_path = os.path.join(ROOT_DIR,
                                          self.config["client_path"])

            arrival_curve = generate_fixed_arrival_process(
                **arrival_config).tolist()
            arrival_curve_str = [str(x) for x in arrival_curve]
            print(f"arrival_curve length: {len(arrival_curve_str)}")

            # Drive the workload with the Go client and wait for it to exit.
            client_proc = subprocess.Popen([
                "go",
                "run",
                go_client_path,
                image_path,
                route,
                *arrival_curve_str,
            ])
            client_proc.communicate()

            latency_s = get_latency(filename_query)
            os.remove(filename_query)
            df_row.update(latency_s=latency_s)
            self._df = self._df.append(df_row, ignore_index=True)

            # Cleanup.
            del latency_s, pipeline, arrival_curve, arrival_curve_str
            serve_benchmark.shutdown()
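# ``get_latency`` reads the per-request trace the HTTP proxy writes to
# ``arrival_trace.jsonl``. A hedged sketch, assuming one JSON record per line
# with ``start`` and ``end`` timestamps in seconds; the field names are
# assumptions, not the proxy's actual schema.
import json


def get_latency(trace_filename):
    """Return per-request latencies (seconds) parsed from a JSONL trace."""
    latencies_s = []
    with open(trace_filename) as f:
        for line in f:
            record = json.loads(line)
            latencies_s.append(record["end"] - record["start"])
    return latencies_s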
import time
import tempfile
import json
import base64

import click
import ray
import torch

from benchmarking import serve_benchmark

serve_benchmark.init(start_server=False)

batch_size = 1
num_queries = 2000

# raw_image_data = base64.b64encode(open("./elephant.jpg", "rb").read())
image_data = torch.zeros((224, 224, 3))


@serve_benchmark.accept_batch
def noop(_, data):
    return data


@click.command()
@click.option("--num-replicas", type=int, default=1)
@click.option("--batch-size", type=int, default=1)
@click.option(
    "--method", type=click.Choice(["chain", "group"]), default="chain")
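# A hypothetical sketch of the command these options decorate; the actual
# body (presumably dispatching to the chain or group benchmark) is not shown
# in this fragment.
def main(num_replicas, batch_size, method):
    print(f"replicas={num_replicas}, batch={batch_size}, method={method}")


if __name__ == "__main__":
    main()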