Example #1
    def run(self):
        for vertex_config in self.config["vertex_configs"]:
            for arrival_config in self.config["arrival_config"]:

                serve_reference.init(start_server=False)
                filename_query = "arrival_trace.jsonl"
                route = "/prepoc"

                pipeline = ImagePrepocPipeline(vertex_config,
                                               self.config["model_type"])
                vertex_config_name = json.dumps(vertex_config)
                df_row = dict(
                    vertex_config=vertex_config_name,
                    serving_type=self.config["serving_type"],
                    arrival_process=json.dumps(arrival_config),
                )

                image_path = os.path.join(ROOT_DIR,
                                          self.config["image_file_path"])

                throughput_qps = self._throughput_calculation(
                    pipeline, image_path, arrival_config["num_requests"])
                df_row.update(throughput_qps=throughput_qps)

                pprint(df_row)

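                # launch an HTTP proxy actor and register the pipeline under the
                # route so the Go client below can drive it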
                http_actor = HTTPProxyActor.remote(
                    host="127.0.0.1",
                    port=8000,
                    serving_backend=self.config["serving_type"],
                    filename=filename_query,
                )
                ray.get(
                    http_actor.register_route.remote(route,
                                                     pipeline.chain_handle))
                go_client_path = os.path.join(ROOT_DIR,
                                              self.config["client_path"])

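                # generate the arrival process and replay it against the route
                # via the Go client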
                arrival_curve = generate_fixed_arrival_process(
                    **arrival_config).tolist()
                arrival_curve_str = [str(x) for x in arrival_curve]
                ls_output = subprocess.Popen([
                    "go",
                    "run",
                    go_client_path,
                    image_path,
                    route,
                    *arrival_curve_str,
                ])
                ls_output.communicate()

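                # read the per-request latencies recorded by the proxy, then
                # delete the trace file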
                latency_s = get_latency(filename_query)
                os.remove(filename_query)

                df_row.update(latency_s=latency_s)
                self._df = self._df.append(df_row, ignore_index=True)

                # cleanup
                del latency_s, pipeline, arrival_curve, arrival_curve_str
                serve_reference.shutdown()
Example #2
def test_e2e(serve_instance):
    serve_reference.init()  # so we have access to global state
    serve_reference.create_endpoint("endpoint",
                                    "/api",
                                    methods=["GET", "POST"])
    result = serve_reference.api._get_global_state().route_table.list_service()
    assert result["/api"] == "endpoint"

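    # poll the proxy until the new route shows up, backing off exponentially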
    retry_count = 5
    timeout_sleep = 0.5
    while True:
        try:
            resp = requests.get("http://127.0.0.1:8000/-/routes",
                                timeout=0.5).json()
            assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
            break
        except Exception as e:
            time.sleep(timeout_sleep)
            timeout_sleep *= 2
            retry_count -= 1
            if retry_count == 0:
                assert False, ("Route table hasn't been updated after 3 tries."
                               "The latest error was {}").format(e)

    def function(flask_request):
        return {"method": flask_request.method}

    serve_reference.create_backend(function, "echo:v1")
    serve_reference.link("endpoint", "echo:v1")

    resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "GET"

    resp = requests.post("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "POST"
Example #3
def main(batch_size, num_warmups, num_queries, return_type):
    serve_reference.init()

    def noop(_, *args, **kwargs):
        bs = serve_reference.context.batch_size
        assert (bs == batch_size
                ), f"worker received {bs}, which is not the expected batch size"
        result = ""

        if return_type == "torch":
            result = torch.zeros((3, 224, 224))

        if bs is None:  # No batching
            return result
        else:
            return [result] * bs

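    # wrap the backend so it accepts batched calls when batching is enabled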
    if batch_size:
        noop = serve_reference.accept_batch(noop)

    with serve_reference.using_router("noop"):
        serve_reference.create_endpoint("noop", "/noop")
        config = serve_reference.BackendConfig(max_batch_size=batch_size)
        serve_reference.create_backend(noop, "noop", backend_config=config)
        serve_reference.link("noop", "noop")
        handle = serve_reference.get_handle("noop")

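    # issue warmup plus measured queries; warmup samples are dropped below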
    latency = []
    for i in tqdm(range(num_warmups + num_queries)):
        if i == num_warmups:
            serve_reference.clear_trace()

        start = time.perf_counter()

        if not batch_size:
            ray.get(
                # This is how to pass higher-level metadata to the tracing
                # context
                handle.options(tracing_metadata={
                    "demo": "pipeline-id"
                }).remote())
        else:
            ray.get(handle.enqueue_batch(val=[1] * batch_size))
            # ray.get([handle.remote() for _ in range(batch_size)])

        end = time.perf_counter()
        latency.append(end - start)

    # Remove initial samples
    latency = latency[num_warmups:]

    series = pd.Series(latency) * 1000
    print("Latency for single noop backend (ms)")
    print(series.describe(percentiles=[0.5, 0.9, 0.95, 0.99]))

    _, trace_file = tempfile.mkstemp(suffix=".json")
    with open(trace_file, "w") as f:
        json.dump(serve_reference.get_trace(), f)
    print(f"trace file written to {trace_file}")
Example #4
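# start serve against a temporary kv-store file and remove the file afterwards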
def serve_instance():
    _, new_db_path = tempfile.mkstemp(suffix=".test.db")
    serve_reference.init(
        kv_store_path=new_db_path,
        blocking=True,
        ray_init_kwargs={"num_cpus": 36},
    )
    yield
    os.remove(new_db_path)
Example #5
    def run(self):

        tensor_data = construct_tensor(self.config)
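        # sweep every (max batch size, pipeline length) combination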
        for batch_size, pipeline_length in product(
                self.config["max_batch_sizes"],
                self.config["pipeline_lengths"]):
            df_row = dict(
                batch_size=batch_size,
                pipeline_length=pipeline_length,
                tensor_type=self.config["tensor_type"],
                tensor_shape="x".join(
                    [str(shape) for shape in self.config["tensor_shape"]]),
                serving_type=self.config["serving_type"],
                arrival_process=self.config["arrival_process"],
            )

            # initialize serve
            serve_reference.init(start_server=False)

            chain_pipeline = Chain(max_batch_size=batch_size,
                                   pipeline_length=pipeline_length)

            # warmup
            ready_refs, _ = ray.wait(
                [chain_pipeline.remote(tensor_data) for _ in range(200)], 200)
            ray.wait(ready_refs, num_returns=200)
            del ready_refs

            qps = self._throughput_calculation(chain_pipeline, tensor_data)
            df_row.update(throughput_qps=qps)

            serve_reference.clear_trace()

            # closed loop latency calculation
            closed_loop_latencies = list()
            for _ in range(self.config["num_requests"]):
                start_time = time.perf_counter()
                ready, _ = ray.wait([chain_pipeline.remote(tensor_data)], 1)
                ray.wait(ready, 1)
                end_time = time.perf_counter()
                latency = end_time - start_time
                closed_loop_latencies.append(latency)

            pprint(df_row)
            # percentile_values =
            df_row.update(latency_s=closed_loop_latencies)

            self._df = self._df.append(df_row, ignore_index=True)

            # cleanup
            del closed_loop_latencies, chain_pipeline
            serve_reference.shutdown()
Example #6
def main():

    TAG = "Resnet18"
    min_img_size = 224
    transform = transforms.Compose([
        transforms.Resize(min_img_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])

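    # measure throughput with 1 to 8 replicas of the ResNet-18 backend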
    for num_replica in range(1, 9):
        # initialize serve
        serve_reference.init(start_server=False)

        serve_handle = None
        with serve_reference.using_router(TAG):
            serve_reference.create_endpoint(TAG)
            config = serve_reference.BackendConfig(max_batch_size=8,
                                                   num_replicas=num_replica,
                                                   num_gpus=1)
            serve_reference.create_backend(
                PredictModelPytorch,
                TAG,
                transform,
                "resnet18",
                True,
                backend_config=config,
            )
            serve_reference.link(TAG, TAG)
            serve_handle = serve_reference.get_handle(TAG)

        img = base64.b64encode(open("elephant.jpg", "rb").read())

        # warmup
        ready_refs, _ = ray.wait(
            [serve_handle.remote(data=img) for _ in range(200)], 200)
        complete_oids, _ = ray.wait(ray.get(ready_refs), num_returns=200)
        del ready_refs
        del complete_oids

        qps = throughput_calculation(serve_handle, {"data": img}, 2000)
        print(f"[Resnet18] Batch Size: 8 Replica: {num_replica} "
              f"Throughput: {qps} QPS")

        serve_reference.shutdown()
Example #7
    def run(self):
        for vertex_config in self.config["vertex_configs"]:

            serve_reference.init(start_server=False)
            filename_query = "arrival_trace.jsonl"
            route = "/prepoc"

            pipeline = ImagePrepocPipeline(vertex_config,
                                           self.config["model_type"])
            vertex_config_name = json.dumps(vertex_config)
            df_row = dict(
                vertex_config=vertex_config_name,
                serving_type=self.config["serving_type"],
                arrival_process=self.config["arrival_process"],
            )

            image_path = os.path.join(ROOT_DIR, self.config["image_file_path"])
            tensor_data = base64.b64encode(open(image_path, "rb").read())

            throughput_qps = self._throughput_calculation(
                pipeline, tensor_data, self.config["num_requests"])
            df_row.update(throughput_qps=throughput_qps)

            pprint(df_row)

            # closed loop latency calculation
            closed_loop_latencies = list()
            for _ in range(self.config["num_requests"]):
                start_time = time.perf_counter()
                ready, _ = ray.wait([pipeline.remote(tensor_data)], 1)
                ray.wait(ready, 1)
                end_time = time.perf_counter()
                latency = end_time - start_time
                closed_loop_latencies.append(latency)

            df_row.update(latency_s=closed_loop_latencies)

            self._df = self._df.append(df_row, ignore_index=True)

            # cleanup
            del closed_loop_latencies, pipeline
            serve_reference.shutdown()
Example #8
import base64
import json
import tempfile
import time

import click
import ray
import torch

from benchmarking import serve_reference

serve_reference.init(start_server=False)

batch_size = 1
num_queries = 2000

raw_image_data = base64.b64encode(open("./elephant.jpg", "rb").read())
image_data = ray.put(raw_image_data)


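# batched no-op backend: sleep for the requested time, then return one dummy
# tensor per request in the batch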
@serve_reference.accept_batch
def noop(_, sleep_time=[], data=[]):
    time.sleep(sleep_time[0])
    return [torch.ones((1, 224, 224, 3))] * serve_reference.context.batch_size


@click.command()
@click.option("--num-replicas", type=int, default=1)
@click.option("--method",
              type=click.Choice(["chain", "group"]),
              default="chain")