Example #1
0
def scale(backend_tag, num_replicas):
    if num_replicas <= 0:
        click.Abort(
            "Cannot set number of replicas to be smaller or equal to 0.")
    ray.init(address="auto")
    serve.init()

    serve.scale(backend_tag, num_replicas)
Example #2
0
File: test_api.py Project: ujvl/ray
def test_scaling_replicas(serve_instance):
    class Counter:
        def __init__(self):
            self.count = 0

        def __call__(self, _):
            self.count += 1
            return self.count

    serve.create_endpoint("counter", "/increment")

    # Keep checking the routing table until /increment is populated
    while "/increment" not in requests.get("http://127.0.0.1:8000/").json():
        time.sleep(0.2)

    serve.create_backend(Counter, "counter:v1")
    serve.link("counter", "counter:v1")

    serve.scale("counter:v1", 2)

    counter_result = []
    for _ in range(10):
        resp = requests.get("http://127.0.0.1:8000/increment").json()["result"]
        counter_result.append(resp)

    # If the load is shared among two replicas. The max result cannot be 10.
    assert max(counter_result) < 10

    serve.scale("counter:v1", 1)

    counter_result = []
    for _ in range(10):
        resp = requests.get("http://127.0.0.1:8000/increment").json()["result"]
        counter_result.append(resp)
    # Give some time for a replica to spin down. But majority of the request
    # should be served by the only remaining replica.
    assert max(counter_result) - min(counter_result) > 6
Example #3
0
print(requests.get("http://127.0.0.1:8000/echo").json())
# The service will be reachable from http

print(ray.get(serve.get_handle("my_endpoint").remote(response="hello")))

# as well as within the ray system.


# We can also add a new backend and split the traffic.
def echo_v2(flask_request):
    # magic, only from web.
    return "something new"


serve.create_backend(echo_v2, "echo:v2")

# The two backend will now split the traffic 50%-50%.
serve.split("my_endpoint", {"echo:v1": 0.5, "echo:v2": 0.5})

# Observe requests are now split between two backends.
for _ in range(10):
    print(requests.get("http://127.0.0.1:8000/echo").json())
    time.sleep(0.5)

# You can also scale each backend independently.
serve.scale("echo:v1", 2)
serve.scale("echo:v2", 2)

# As well as retrieving relevant system metrics
print(pformat_color_json(serve.stat()))