Example #1
0
def test_not_killing_replicas(serve_instance):
    """Updating only ``max_batch_size`` must keep the same replicas.

    A batched backend is created with three replicas and a max batch size
    of 2.  The max batch size is then raised to 5, and the replica tags
    listed before and after the update are compared.

    Args:
        serve_instance: pytest fixture; not used directly in the body
            (presumably provides a running Serve instance -- confirm).
    """

    class BatchSimple:
        def __init__(self):
            self.count = 0

        @serve.accept_batch
        def __call__(self, flask_request, temp=None):
            # One result element for every request in the current batch.
            return [1 for _ in range(serve.context.batch_size)]

    backend_tag = "bsimple:v1"

    serve.create_endpoint("bsimple", "/bsimple")
    serve.create_backend(
        BatchSimple,
        backend_tag,
        backend_config=BackendConfig(num_replicas=3, max_batch_size=2))

    global_state = serve.api._get_global_state()
    tags_before = global_state.backend_table.list_replicas(backend_tag)

    # Fetch the stored config, bump the batch size, and push it back.
    updated_config = serve.get_backend_config(backend_tag)
    updated_config.max_batch_size = 5
    serve.set_backend_config(backend_tag, updated_config)

    tags_after = global_state.backend_table.list_replicas(backend_tag)
    global_state.refresh_actor_handle_cache()
    cached_actor_tags = list(global_state.actor_handle_cache.keys())

    # Every original replica must still have a cached actor handle ...
    assert set(tags_before).issubset(cached_actor_tags)
    # ... and the backend must list exactly the same replicas as before.
    assert set(tags_after) == set(tags_before)
Example #2
0
def test_killing_replicas(serve_instance):
    """Updating ``num_cpus`` must leave some old replica without a handle.

    A backend is created with three replicas at ``num_cpus=2``; the config
    is then changed to ``num_cpus=1``.  Afterwards the currently listed
    replicas must all have cached actor handles, while the originally
    listed replicas must not all be present in that cache.

    Args:
        serve_instance: pytest fixture; not used directly in the body
            (presumably provides a running Serve instance -- confirm).
    """

    class Simple:
        def __init__(self):
            self.count = 0

        def __call__(self, flask_request, temp=None):
            return temp

    backend_tag = "simple:v1"

    serve.create_endpoint("simple", "/simple")
    serve.create_backend(
        Simple,
        backend_tag,
        backend_config=BackendConfig(num_replicas=3, num_cpus=2))

    global_state = serve.api._get_global_state()
    tags_before = global_state.backend_table.list_replicas(backend_tag)

    # Fetch the stored config, lower the CPU request, and push it back.
    updated_config = serve.get_backend_config(backend_tag)
    updated_config.num_cpus = 1
    serve.set_backend_config(backend_tag, updated_config)

    tags_after = global_state.backend_table.list_replicas(backend_tag)
    global_state.refresh_actor_handle_cache()
    cached_actor_tags = list(global_state.actor_handle_cache.keys())

    # All replicas listed now must have a cached actor handle.
    assert set(tags_after).issubset(cached_actor_tags)

    # At least one of the original replicas must be gone from the cache.
    assert not set(tags_before).issubset(cached_actor_tags)
Example #3
0
def test_scaling_replicas(serve_instance):
    """Check that request counts reflect the backend's replica count.

    A stateful ``Counter`` backend is started with two replicas and sent
    ten requests; because each replica keeps its own count, no single
    response may reach 10.  The backend is then scaled down to one replica
    and ten more requests are sent; the spread between the largest and
    smallest returned count must exceed 6.

    Args:
        serve_instance: pytest fixture; not used directly in the body
            (presumably provides a running Serve instance -- confirm).

    Raises:
        AssertionError: if the ``/increment`` route does not show up in the
            routing table within 30 seconds, or if either load check fails.
    """

    class Counter:
        def __init__(self):
            self.count = 0

        def __call__(self, _):
            self.count += 1
            return self.count

    def collect_counts(num_requests=10):
        # Issue the requests sequentially and gather each returned count.
        return [
            requests.get("http://127.0.0.1:8000/increment").json()["result"]
            for _ in range(num_requests)
        ]

    serve.create_endpoint("counter", "/increment")

    # Keep checking the routing table until /increment is populated, but
    # fail with a clear message instead of hanging forever if it never is.
    route_timeout_s = 30
    deadline = time.monotonic() + route_timeout_s
    while "/increment" not in requests.get("http://127.0.0.1:8000/").json():
        assert time.monotonic() < deadline, (
            "route /increment was not registered within "
            "{} seconds".format(route_timeout_s))
        time.sleep(0.2)

    b_config = BackendConfig(num_replicas=2)
    serve.create_backend(Counter, "counter:v1", backend_config=b_config)
    serve.link("counter", "counter:v1")

    counter_result = collect_counts()

    # If the load is shared among two replicas. The max result cannot be 10.
    assert max(counter_result) < 10

    b_config = serve.get_backend_config("counter:v1")
    b_config.num_replicas = 1
    serve.set_backend_config("counter:v1", b_config)

    counter_result = collect_counts()

    # Give some time for a replica to spin down. But majority of the request
    # should be served by the only remaining replica.
    assert max(counter_result) - min(counter_result) > 6
Example #4
0
# Initialise Serve with blocking=True and a connector that wraps its single
# argument in a RayInternalKVStore.
# NOTE(review): `ns` is presumably a key-value namespace, and `blocking`
# presumably waits until Serve is ready -- confirm against serve.init.
serve.init(blocking=True, kv_store_connector=lambda ns: RayInternalKVStore(ns))


@serve.route("/echo")
@serve.accept_batch
def echo(_):
    """Batched handler for ``/echo``: return ``"hi <i>"`` per batch slot.

    The incoming argument is ignored.  The current batch size is read from
    ``serve.context`` and also published to the Ray web UI.
    """
    time.sleep(0.01)  # Sleep for 10 ms on every call.
    batch_size = serve.context.batch_size
    ray.show_in_webui(str(batch_size), key="Current batch size")
    # One reply per index in the batch: "hi 0", "hi 1", ...
    return list(map("hi {}".format, range(batch_size)))


# Backend tag and URL shared by the scaling, warm-up and load-test steps.
ECHO_BACKEND = "echo:v0"
ECHO_URL = "http://127.0.0.1:8000/echo"

print("Scaling to 30 replicas")
config = serve.get_backend_config(ECHO_BACKEND)
config.num_replicas = 30
config.max_batch_size = 16
serve.set_backend_config(ECHO_BACKEND, config)

# Send a handful of spaced-out requests before starting the real load.
print("Warming up")
for _ in range(5):
    resp = requests.get(ECHO_URL).json()
    print(resp)
    time.sleep(0.5)

# Use 75% of (replicas x max batch size) as the connection count.
connections = int(config.num_replicas * config.max_batch_size * 0.75)

# `-c` receives the connection count computed above; `-z 360m` presumably
# sets the run duration -- see the usage text of the `hey` binary.
hey_command = [
    "./hey_linux_amd64",
    "-c",
    str(connections),
    "-z",
    "360m",
    ECHO_URL,
]
# The load generator runs in the background; its output goes to pipes.
proc = subprocess.Popen(hey_command, stdout=PIPE, stderr=PIPE)
print("started load testing")
Example #5
0
# as well as within the ray system.


# We can also add a new backend and split the traffic.
def echo_v2(flask_request):
    """Second echo backend: ignore the request and return a fixed reply.

    Args:
        flask_request: the incoming request object; unused.
            NOTE(review): the original comment read "magic, only from
            web", presumably meaning this argument is only supplied for
            HTTP calls -- confirm.

    Returns:
        The constant string ``"something new"``.
    """
    fixed_reply = "something new"
    return fixed_reply


# Register echo_v2 as backend "echo:v2" and fetch its config so the replica
# count can be adjusted further down.
serve.create_backend(echo_v2, "echo:v2")
backend_config_v2 = serve.get_backend_config("echo:v2")

# The two backends will now split the traffic 50%-50%.
serve.split("my_endpoint", {"echo:v1": 0.5, "echo:v2": 0.5})

# Observe that requests are now split between the two backends.
for _ in range(10):
    print(requests.get("http://127.0.0.1:8000/echo").json())
    time.sleep(0.5)

# You can also change the number of replicas
# for each backend independently.
# NOTE(review): backend_config_v1 is not defined in this excerpt; it is
# presumably fetched earlier in the file -- confirm.
backend_config_v1.num_replicas = 2
serve.set_backend_config("echo:v1", backend_config_v1)
backend_config_v2.num_replicas = 2
serve.set_backend_config("echo:v2", backend_config_v2)

# You can also retrieve the relevant system metrics.
print(pformat_color_json(serve.stat()))