    def run(self):
        tensor_data = construct_tensor(self.config)
        for batch_size, pipeline_length in product(
                self.config["max_batch_sizes"],
                self.config["pipeline_lengths"]):
            df_row = dict(
                batch_size=batch_size,
                pipeline_length=pipeline_length,
                tensor_type=self.config["tensor_type"],
                tensor_shape="x".join(
                    [str(shape) for shape in self.config["tensor_shape"]]),
                serving_type=self.config["serving_type"],
                arrival_process=self.config["arrival_process"],
            )

            # initialize serve
            serve_benchmark.init(start_server=False)

            chain_pipeline = Chain(max_batch_size=batch_size,
                                   pipeline_length=pipeline_length)

            # warmup: send 200 requests through the chain before measuring
            ray.wait(
                [chain_pipeline.remote(tensor_data) for _ in range(200)],
                num_returns=200,
            )

            # throughput calculation: time num_requests concurrent requests
            start_time = time.perf_counter()
            ray.wait(
                [
                    chain_pipeline.remote(tensor_data)
                    for _ in range(self.config["num_requests"])
                ],
                num_returns=self.config["num_requests"],
            )
            end_time = time.perf_counter()
            duration = end_time - start_time
            qps = self.config["num_requests"] / duration
            df_row.update(throughput_qps=qps)

            serve_benchmark.clear_trace()

            # closed-loop latency calculation: one request at a time
            closed_loop_latencies = list()
            for _ in range(self.config["num_requests"]):
                start_time = time.perf_counter()
                ray.wait([chain_pipeline.remote(tensor_data)], num_returns=1)
                end_time = time.perf_counter()
                latency = end_time - start_time
                closed_loop_latencies.append(latency)

            pprint(df_row)
            df_row.update(latency_s=closed_loop_latencies)

            self._df = self._df.append(df_row, ignore_index=True)

            # cleanup
            del closed_loop_latencies, chain_pipeline
            serve_benchmark.shutdown()
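
The loop above stores the raw closed-loop latency samples in the results row. A percentile summary can be derived from those samples; the sketch below assumes NumPy is available, and the summarize_latencies helper is hypothetical, not part of this codebase:

import numpy as np

def summarize_latencies(latencies):
    # Reduce raw latency samples to common tail percentiles.
    p50, p95, p99 = np.percentile(latencies, [50, 95, 99])
    return {"p50": p50, "p95": p95, "p99": p99}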
Example #2
def test_e2e(serve_instance):
    serve_benchmark.init()  # so we have access to global state
    serve_benchmark.create_endpoint("endpoint",
                                    "/api",
                                    methods=["GET", "POST"])
    result = serve_benchmark.api._get_global_state().route_table.list_service()
    assert result["/api"] == "endpoint"

    retry_count = 5
    timeout_sleep = 0.5
    while True:
        try:
            resp = requests.get("http://127.0.0.1:8000/-/routes",
                                timeout=0.5).json()
            assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
            break
        except Exception as e:
            time.sleep(timeout_sleep)
            timeout_sleep *= 2
            retry_count -= 1
            if retry_count == 0:
                assert False, ("Route table hasn't been updated after 3 tries."
                               "The latest error was {}").format(e)

    def function(flask_request):
        return {"method": flask_request.method}

    serve_benchmark.create_backend(function, "echo:v1")
    serve_benchmark.link("endpoint", "echo:v1")

    resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "GET"

    resp = requests.post("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "POST"
Example #3
@pytest.fixture
def serve_instance():
    _, new_db_path = tempfile.mkstemp(suffix=".test.db")
    serve_benchmark.init(
        kv_store_path=new_db_path,
        blocking=True,
        ray_init_kwargs={"num_cpus": 36},
    )
    yield
    os.remove(new_db_path)
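
This fixture initializes serve_benchmark against a throwaway kv-store file and removes it after the test. pytest injects it by parameter name, as test_e2e above shows; a minimal usage sketch (the test name and endpoint are hypothetical):

def test_routes_registered(serve_instance):
    # The fixture has already called serve_benchmark.init(), so the
    # global serve state is available inside the test body.
    serve_benchmark.create_endpoint("demo", "/demo", methods=["GET"])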
Example #4
    def run(self):
        for vertex_config in self.config["vertex_configs"]:

            serve_benchmark.init(start_server=False)
            filename_query = "arrival_trace.jsonl"
            route = "/prepoc"

            pipeline = ImagePrepocPipeline(vertex_config,
                                           self.config["model_type"])
            vertex_config_name = json.dumps(vertex_config)
            df_row = dict(
                vertex_config=vertex_config_name,
                serving_type=self.config["serving_type"],
                arrival_process=self.config["arrival_process"],
            )

            image_path = os.path.join(ROOT_DIR, self.config["image_file_path"])
            with open(image_path, "rb") as f:
                tensor_data = base64.b64encode(f.read())

            throughput_qps = self._throughput_calculation(
                pipeline, tensor_data, self.config["num_requests"])
            df_row.update(throughput_qps=throughput_qps)

            pprint(df_row)

            # closed-loop latency calculation: one request at a time
            closed_loop_latencies = list()
            for _ in tqdm(range(30)):
                start_time = time.perf_counter()
                ray.wait([pipeline.remote(tensor_data)], num_returns=1)
                end_time = time.perf_counter()
                latency = end_time - start_time
                closed_loop_latencies.append(latency)

            df_row.update(latency_s=closed_loop_latencies)

            self._df = self._df.append(df_row, ignore_index=True)

            # cleanup
            del closed_loop_latencies, pipeline
            serve_benchmark.shutdown()
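
_throughput_calculation is not shown in these snippets. Judging from the timing loop in Example 1, it plausibly fires all requests at once and divides by the elapsed time; the body below is an assumption, not the repository's implementation:

def _throughput_calculation(self, pipeline, data, num_requests):
    # Submit every request up front, wait for all of them, report QPS.
    start_time = time.perf_counter()
    ray.wait(
        [pipeline.remote(data) for _ in range(num_requests)],
        num_returns=num_requests,
    )
    duration = time.perf_counter() - start_time
    return num_requests / duration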
Example #5
    def run(self):
        for vertex_config in self.config["vertex_configs"]:
            for arrival_config in self.config["arrival_config"]:

                serve_benchmark.init(start_server=False)
                filename_query = "arrival_trace.jsonl"
                route = "/prepoc"

                pipeline = ImagePrepocPipeline(vertex_config,
                                               self.config["model_type"])
                vertex_config_name = json.dumps(vertex_config)
                df_row = dict(
                    vertex_config=vertex_config_name,
                    serving_type=self.config["serving_type"],
                    arrival_process=json.dumps(arrival_config),
                )

                image_path = os.path.join(ROOT_DIR,
                                          self.config["image_file_path"])

                throughput_qps = self._throughput_calculation(
                    pipeline, image_path, arrival_config["num_requests"])
                df_row.update(throughput_qps=throughput_qps)

                pprint(df_row)

                http_actor = HTTPProxyActor.remote(
                    host="127.0.0.1",
                    port=8000,
                    serving_backend=self.config["serving_type"],
                    filename=filename_query,
                )
                ray.get(
                    http_actor.register_route.remote(route,
                                                     pipeline.chain_handle))
                go_client_path = os.path.join(ROOT_DIR,
                                              self.config["client_path"])

                arrival_curve = generate_fixed_arrival_process(
                    **arrival_config).tolist()
                arrival_curve_str = [str(x) for x in arrival_curve]
                print(f"arrival_curve lenght: {len(arrival_curve_str)}")
                client_proc = subprocess.Popen([
                    "go",
                    "run",
                    go_client_path,
                    image_path,
                    route,
                    *arrival_curve_str,
                ])
                client_proc.communicate()

                latency_s = get_latency(filename_query)
                os.remove(filename_query)

                df_row.update(latency_s=latency_s)
                self._df = self._df.append(df_row, ignore_index=True)

                # cleanup
                del latency_s, pipeline, arrival_curve, arrival_curve_str
                serve_benchmark.shutdown()
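
get_latency is also not shown. Since the HTTP proxy actor writes one JSON record per request to arrival_trace.jsonl, a plausible reader looks like the sketch below; the "start" and "end" field names are assumptions about the trace schema:

import json

def get_latency(filename):
    # Parse the JSONL trace and return per-request latencies in seconds.
    latencies = []
    with open(filename) as f:
        for line in f:
            record = json.loads(line)
            latencies.append(record["end"] - record["start"])
    return latencies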
Example #6
import base64
import json
import tempfile
import time

import click
import ray
import torch

from benchmarking import serve_benchmark

serve_benchmark.init(start_server=False)

batch_size = 1
num_queries = 2000

# raw_image_data = base64.b64encode(open("./elephant.jpg", "rb").read())
image_data = torch.zeros((224, 224, 3))


@serve_benchmark.accept_batch
def noop(_, data):
    # Batched no-op backend: returns the input batch unchanged.
    return data


@click.command()
@click.option("--num-replicas", type=int, default=1)
@click.option("--batch-size", type=int, default=1)
@click.option(
    "--method", type=click.Choice(["chain", "group"]), default="chain"
)
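
The snippet ends mid-decorator stack, so the decorated command function itself is not included. A hypothetical signature consistent with the options above would be:

def main(num_replicas, batch_size, method):
    # click maps --num-replicas/--batch-size/--method onto these parameters.
    ...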