def run(self):
    for vertex_config in self.config["vertex_configs"]:
        for arrival_config in self.config["arrival_config"]:
            serve_reference.init(start_server=False)
            filename_query = "arrival_trace.jsonl"
            route = "/prepoc"
            pipeline = ImagePrepocPipeline(vertex_config,
                                           self.config["model_type"])
            vertex_config_name = json.dumps(vertex_config)
            df_row = dict(
                vertex_config=vertex_config_name,
                serving_type=self.config["serving_type"],
                arrival_process=json.dumps(arrival_config),
            )
            image_path = os.path.join(ROOT_DIR, self.config["image_file_path"])
            throughput_qps = self._throughput_calculation(
                pipeline, image_path, arrival_config["num_requests"])
            df_row.update(throughput_qps=throughput_qps)
            pprint(df_row)

            http_actor = HTTPProxyActor.remote(
                host="127.0.0.1",
                port=8000,
                serving_backend=self.config["serving_type"],
                filename=filename_query,
            )
            ray.get(
                http_actor.register_route.remote(route, pipeline.chain_handle))
            go_client_path = os.path.join(ROOT_DIR, self.config["client_path"])

            arrival_curve = generate_fixed_arrival_process(
                **arrival_config).tolist()
            arrival_curve_str = [str(x) for x in arrival_curve]
            ls_output = subprocess.Popen([
                "go",
                "run",
                go_client_path,
                image_path,
                route,
                *arrival_curve_str,
            ])
            ls_output.communicate()

            latency_s = get_latency(filename_query)
            os.remove(filename_query)
            df_row.update(latency_s=latency_s)

            self._df = self._df.append(df_row, ignore_index=True)

            # cleanup
            del latency_s, pipeline, arrival_curve, arrival_curve_str
            serve_reference.shutdown()
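# Hypothetical helper (not shown in this file): one way get_latency() might
# parse the per-request trace that the Go client appends to
# arrival_trace.jsonl. The JSONL schema ("start"/"end" fields) is an
# assumption for illustration only.
import json


def get_latency_sketch(filename):
    latencies = []
    with open(filename) as f:
        for line in f:
            record = json.loads(line)
            # Assumed schema: one JSON object per request with wall-clock
            # start and end timestamps in seconds.
            latencies.append(record["end"] - record["start"])
    return latencies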
def test_e2e(serve_instance):
    serve_reference.init()  # so we have access to global state
    serve_reference.create_endpoint("endpoint", "/api", methods=["GET", "POST"])
    result = serve_reference.api._get_global_state().route_table.list_service()
    assert result["/api"] == "endpoint"

    retry_count = 5
    timeout_sleep = 0.5
    while True:
        try:
            resp = requests.get(
                "http://127.0.0.1:8000/-/routes", timeout=0.5).json()
            assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
            break
        except Exception as e:
            time.sleep(timeout_sleep)
            timeout_sleep *= 2
            retry_count -= 1
            if retry_count == 0:
                assert False, ("Route table hasn't been updated after 5 "
                               "tries. The latest error was {}").format(e)

    def function(flask_request):
        return {"method": flask_request.method}

    serve_reference.create_backend(function, "echo:v1")
    serve_reference.link("endpoint", "echo:v1")

    resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "GET"

    resp = requests.post("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "POST"
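# The retry loop above is a common poll-with-exponential-backoff pattern; a
# reusable sketch of it, assuming only `requests` and `time`. Not part of the
# original test suite.
import time

import requests


def wait_for_route_table(expected, url="http://127.0.0.1:8000/-/routes",
                         retries=5, initial_sleep=0.5):
    sleep_s = initial_sleep
    last_error = None
    for _ in range(retries):
        try:
            assert requests.get(url, timeout=0.5).json() == expected
            return
        except Exception as e:
            last_error = e
            time.sleep(sleep_s)
            sleep_s *= 2  # double the wait between polls
    raise AssertionError(
        "Route table hasn't been updated after {} tries. "
        "The latest error was {}".format(retries, last_error))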
def main(batch_size, num_warmups, num_queries, return_type):
    serve_reference.init()

    def noop(_, *args, **kwargs):
        bs = serve_reference.context.batch_size
        assert bs == batch_size, (
            f"worker received {bs}, which is not what was expected")
        result = ""
        if return_type == "torch":
            result = torch.zeros((3, 224, 224))

        if bs is None:  # No batching
            return result
        else:
            return [result] * bs

    if batch_size:
        noop = serve_reference.accept_batch(noop)

    with serve_reference.using_router("noop"):
        serve_reference.create_endpoint("noop", "/noop")
        config = serve_reference.BackendConfig(max_batch_size=batch_size)
        serve_reference.create_backend(noop, "noop", backend_config=config)
        serve_reference.link("noop", "noop")
        handle = serve_reference.get_handle("noop")

    latency = []
    for i in tqdm(range(num_warmups + num_queries)):
        if i == num_warmups:
            serve_reference.clear_trace()

        start = time.perf_counter()

        if not batch_size:
            ray.get(
                # This is how to pass higher-level metadata into the tracing
                # context.
                handle.options(tracing_metadata={
                    "demo": "pipeline-id"
                }).remote())
        else:
            ray.get(handle.enqueue_batch(val=[1] * batch_size))
            # ray.get([handle.remote() for _ in range(batch_size)])

        end = time.perf_counter()
        latency.append(end - start)

    # Remove initial samples
    latency = latency[num_warmups:]

    series = pd.Series(latency) * 1000
    print("Latency for single noop backend (ms)")
    print(series.describe(percentiles=[0.5, 0.9, 0.95, 0.99]))

    _, trace_file = tempfile.mkstemp(suffix=".json")
    with open(trace_file, "w") as f:
        json.dump(serve_reference.get_trace(), f)
    print(f"trace file written to {trace_file}")
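# A minimal sketch of how main() above is presumably exposed as a CLI,
# mirroring the click pattern used elsewhere in this repo; the option names
# and defaults here are assumptions.
import click


@click.command()
@click.option("--batch-size", type=int, default=None)
@click.option("--num-warmups", type=int, default=200)
@click.option("--num-queries", type=int, default=2000)
@click.option(
    "--return-type", type=click.Choice(["string", "torch"]), default="string")
def cli(batch_size, num_warmups, num_queries, return_type):
    main(batch_size, num_warmups, num_queries, return_type)


if __name__ == "__main__":
    cli()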
@pytest.fixture  # assumed: consumed as a pytest fixture by test_e2e above
def serve_instance():
    _, new_db_path = tempfile.mkstemp(suffix=".test.db")
    serve_reference.init(
        kv_store_path=new_db_path,
        blocking=True,
        ray_init_kwargs={"num_cpus": 36},
    )
    yield
    os.remove(new_db_path)
def run(self):
    tensor_data = construct_tensor(self.config)
    for batch_size, pipeline_length in product(
            self.config["max_batch_sizes"], self.config["pipeline_lengths"]):
        df_row = dict(
            batch_size=batch_size,
            pipeline_length=pipeline_length,
            tensor_type=self.config["tensor_type"],
            tensor_shape="x".join(
                [str(shape) for shape in self.config["tensor_shape"]]),
            serving_type=self.config["serving_type"],
            arrival_process=self.config["arrival_process"],
        )

        # initialize serve
        serve_reference.init(start_server=False)

        chain_pipeline = Chain(max_batch_size=batch_size,
                               pipeline_length=pipeline_length)

        # warmup
        ready_refs, _ = ray.wait(
            [chain_pipeline.remote(tensor_data) for _ in range(200)], 200)
        ray.wait(ready_refs, num_returns=200)
        del ready_refs

        qps = self._throughput_calculation(chain_pipeline, tensor_data)
        df_row.update(throughput_qps=qps)

        serve_reference.clear_trace()

        # closed loop latency calculation
        closed_loop_latencies = list()
        for _ in range(self.config["num_requests"]):
            start_time = time.perf_counter()
            ready, _ = ray.wait([chain_pipeline.remote(tensor_data)], 1)
            ray.wait(ready, 1)
            end_time = time.perf_counter()

            latency = end_time - start_time
            closed_loop_latencies.append(latency)

        pprint(df_row)
        df_row.update(latency_s=closed_loop_latencies)

        self._df = self._df.append(df_row, ignore_index=True)

        # cleanup
        del closed_loop_latencies, chain_pipeline
        serve_reference.shutdown()
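# The closed-loop measurement above, factored into a standalone helper: issue
# one request at a time, block until it finishes, and record its wall-clock
# latency. A sketch mirroring the double ray.wait() used in the loop above;
# not from the source.
import time

import ray


def measure_closed_loop_latencies(pipeline, data, num_requests):
    latencies = []
    for _ in range(num_requests):
        start = time.perf_counter()
        ready, _ = ray.wait([pipeline.remote(data)], 1)
        ray.wait(ready, 1)  # second wait mirrors the benchmark loop above
        latencies.append(time.perf_counter() - start)
    return latencies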
def main():
    TAG = "Resnet18"
    min_img_size = 224
    transform = transforms.Compose([
        transforms.Resize(min_img_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])

    for num_replica in range(1, 9):
        # initialize serve
        serve_reference.init(start_server=False)

        serve_handle = None
        with serve_reference.using_router(TAG):
            serve_reference.create_endpoint(TAG)
            config = serve_reference.BackendConfig(
                max_batch_size=8, num_replicas=num_replica, num_gpus=1)
            serve_reference.create_backend(
                PredictModelPytorch,
                TAG,
                transform,
                "resnet18",
                True,
                backend_config=config,
            )
            serve_reference.link(TAG, TAG)
            serve_handle = serve_reference.get_handle(TAG)

        img = base64.b64encode(open("elephant.jpg", "rb").read())

        # warmup
        ready_refs, _ = ray.wait(
            [serve_handle.remote(data=img) for _ in range(200)], 200)
        complete_oids, _ = ray.wait(ray.get(ready_refs), num_returns=200)
        del ready_refs
        del complete_oids

        qps = throughput_calculation(serve_handle, {"data": img}, 2000)
        print(f"[Resnet18] Batch Size: 8 Replica: {num_replica} "
              f"Throughput: {qps} QPS")

        serve_reference.shutdown()
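# throughput_calculation() is not defined in this file. A plausible sketch,
# assuming it fires a fixed number of requests at the handle and divides by
# elapsed wall-clock time; the signature and internals are assumptions.
import time

import ray


def throughput_calculation_sketch(serve_handle, kwargs, num_queries):
    start = time.perf_counter()
    refs = [serve_handle.remote(**kwargs) for _ in range(num_queries)]
    # Mirror the warmup above: each result is itself an ObjectID, so wait on
    # the inner IDs to ensure every request has fully completed.
    ray.wait(ray.get(refs), num_returns=num_queries)
    elapsed = time.perf_counter() - start
    return num_queries / elapsed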
def run(self):
    for vertex_config in self.config["vertex_configs"]:
        serve_reference.init(start_server=False)
        filename_query = "arrival_trace.jsonl"
        route = "/prepoc"
        pipeline = ImagePrepocPipeline(vertex_config,
                                       self.config["model_type"])
        vertex_config_name = json.dumps(vertex_config)
        df_row = dict(
            vertex_config=vertex_config_name,
            serving_type=self.config["serving_type"],
            arrival_process=self.config["arrival_process"],
        )

        image_path = os.path.join(ROOT_DIR, self.config["image_file_path"])
        tensor_data = base64.b64encode(open(image_path, "rb").read())

        throughput_qps = self._throughput_calculation(
            pipeline, tensor_data, self.config["num_requests"])
        df_row.update(throughput_qps=throughput_qps)
        pprint(df_row)

        # closed loop latency calculation
        closed_loop_latencies = list()
        for _ in range(self.config["num_requests"]):
            start_time = time.perf_counter()
            ready, _ = ray.wait([pipeline.remote(tensor_data)], 1)
            ray.wait(ready, 1)
            end_time = time.perf_counter()

            latency = end_time - start_time
            closed_loop_latencies.append(latency)

        df_row.update(latency_s=closed_loop_latencies)

        self._df = self._df.append(df_row, ignore_index=True)

        # cleanup
        del closed_loop_latencies, pipeline
        serve_reference.shutdown()
import base64
import json
import tempfile
import time

import click
import ray
import torch

from benchmarking import serve_reference

serve_reference.init(start_server=False)

batch_size = 1
num_queries = 2000

raw_image_data = base64.b64encode(open("./elephant.jpg", "rb").read())
image_data = ray.put(raw_image_data)


@serve_reference.accept_batch
def noop(_, sleep_time=[], data=[]):
    time.sleep(sleep_time[0])
    return [torch.ones((1, 224, 224, 3))] * serve_reference.context.batch_size


@click.command()
@click.option("--num-replicas", type=int, default=1)
@click.option(
    "--method", type=click.Choice(["chain", "group"]), default="chain")
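# The click entrypoint is truncated above; a minimal stub showing how the
# decorators attach to a function. The body here is an assumption, not the
# original benchmark logic.
def main(num_replicas, method):
    # Dispatch to a chain- or group-style noop pipeline benchmark.
    print(f"num_replicas={num_replicas}, method={method}")


if __name__ == "__main__":
    main()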