Ejemplo n.º 1
0
    def test_bounds_checking(self):
        num_replicas = 10
        max_replicas = 11
        min_replicas = 9
        config = AutoscalingConfig(
            max_replicas=max_replicas,
            min_replicas=min_replicas,
            target_num_ongoing_requests_per_replica=100,
        )

        desired_num_replicas = calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=[150] * num_replicas)
        assert desired_num_replicas == max_replicas

        desired_num_replicas = calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=[50] * num_replicas)
        assert desired_num_replicas == min_replicas

        for i in range(50, 150):
            desired_num_replicas = calculate_desired_num_replicas(
                autoscaling_config=config,
                current_num_ongoing_requests=[i] * num_replicas,
            )
            assert min_replicas <= desired_num_replicas <= max_replicas
Ejemplo n.º 2
0
def test_replicas_delayed_startup():
    """Unit test simulating replicas taking time to start up."""
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=200,
        target_num_ongoing_requests_per_replica=1,
        upscale_delay_s=0,
        downscale_delay_s=100000,
    )

    policy = BasicAutoscalingPolicy(config)

    new_num_replicas = policy.get_decision_num_replicas([100], 1)
    assert new_num_replicas == 100

    # New target is 100, but no new replicas finished spinning up during this
    # timestep.
    new_num_replicas = policy.get_decision_num_replicas([100], 100)
    assert new_num_replicas == 100

    # Two new replicas spun up during this timestep.
    new_num_replicas = policy.get_decision_num_replicas([100, 20, 3], 100)
    assert new_num_replicas == 123

    # A lot of queries got drained and a lot of replicas started up, but
    # new_num_replicas should not decrease, because of the downscale delay.
    new_num_replicas = policy.get_decision_num_replicas([6, 2, 1, 1], 123)
    assert new_num_replicas == 123
Ejemplo n.º 3
0
    def test_mutually_exclusive_num_replicas_and_autoscaling_config(self):
        # num_replicas and autoscaling_config cannot be set at the same time
        deployment_schema = self.get_minimal_deployment_schema()

        deployment_schema["num_replicas"] = 5
        deployment_schema["autoscaling_config"] = None
        DeploymentSchema.parse_obj(deployment_schema)

        deployment_schema["num_replicas"] = None
        deployment_schema["autoscaling_config"] = AutoscalingConfig().dict()
        DeploymentSchema.parse_obj(deployment_schema)

        deployment_schema["num_replicas"] = 5
        deployment_schema["autoscaling_config"] = AutoscalingConfig().dict()
        with pytest.raises(ValueError):
            DeploymentSchema.parse_obj(deployment_schema)
Ejemplo n.º 4
0
def test_imbalanced_replicas(ongoing_requests):
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=5,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0,
    )

    policy = BasicAutoscalingPolicy(config)

    # Check that as long as the average number of ongoing requests equals
    # the target_num_ongoing_requests_per_replica, the number of replicas
    # stays the same
    if (
        sum(ongoing_requests) / len(ongoing_requests)
        == config.target_num_ongoing_requests_per_replica
    ):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=ongoing_requests,
            curr_target_num_replicas=4,
            current_handle_queued_queries=0,
        )
        assert new_num_replicas == 4

    # Check downscaling behavior when average number of requests
    # is lower than target_num_ongoing_requests_per_replica
    elif (
        sum(ongoing_requests) / len(ongoing_requests)
        < config.target_num_ongoing_requests_per_replica
    ):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=ongoing_requests,
            curr_target_num_replicas=4,
            current_handle_queued_queries=0,
        )

        if (
            config.target_num_ongoing_requests_per_replica
            - sum(ongoing_requests) / len(ongoing_requests)
            <= 1
        ):
            # Autoscaling uses a ceiling operator, which means a slightly low
            # current_num_ongoing_requests value is insufficient to downscale
            assert new_num_replicas == 4
        else:
            assert new_num_replicas == 3

    # Check upscaling behavior when average number of requests
    # is higher than target_num_ongoing_requests_per_replica
    else:
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=ongoing_requests,
            curr_target_num_replicas=4,
            current_handle_queued_queries=0,
        )
        assert new_num_replicas == 5
Ejemplo n.º 5
0
 def test_scale_down(self):
     config = AutoscalingConfig(min_replicas=0,
                                max_replicas=100,
                                target_num_ongoing_requests_per_replica=1)
     num_replicas = 10
     num_ongoing_requests = [0.5] * num_replicas
     desired_num_replicas = calculate_desired_num_replicas(
         autoscaling_config=config,
         current_num_ongoing_requests=num_ongoing_requests)
     assert 4 <= desired_num_replicas <= 6  # 10 * 0.5 = 5
Ejemplo n.º 6
0
 def test_scale_up(self):
     config = AutoscalingConfig(min_replicas=0,
                                max_replicas=100,
                                target_num_ongoing_requests_per_replica=1)
     num_replicas = 10
     num_ongoing_requests = [2.0] * num_replicas
     desired_num_replicas = calculate_desired_num_replicas(
         autoscaling_config=config,
         current_num_ongoing_requests=num_ongoing_requests)
     assert 19 <= desired_num_replicas <= 21  # 10 * 2 = 20
Ejemplo n.º 7
0
def test_autoscaling_config_validation():
    # Check validation over publicly exposed options

    with pytest.raises(ValidationError):
        # min_replicas must be nonnegative
        AutoscalingConfig(min_replicas=-1)

    with pytest.raises(ValidationError):
        # max_replicas must be positive
        AutoscalingConfig(max_replicas=0)

    with pytest.raises(ValidationError):
        # max_replicas must be nonnegative
        AutoscalingConfig(target_num_ongoing_requests_per_replica=-1)

    with pytest.raises(ValueError):
        # max_replicas must be greater than or equal to min_replicas
        AutoscalingConfig(min_replicas=100, max_replicas=1)

    # Default values should not raise an error
    AutoscalingConfig()
Ejemplo n.º 8
0
def test_single_replica_receives_all_requests(ongoing_requests):
    target_requests = 5

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=50,
        target_num_ongoing_requests_per_replica=target_requests,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0)

    policy = BasicAutoscalingPolicy(config)

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=ongoing_requests,
        curr_target_num_replicas=4)
    assert new_num_replicas == sum(ongoing_requests) / target_requests
Ejemplo n.º 9
0
def test_fluctuating_ongoing_requests(delay_s):
    """
    Simulates a workload that switches between too many and too few
    ongoing requests.
    """

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=50,
        upscale_delay_s=delay_s,
        downscale_delay_s=delay_s,
    )

    policy = BasicAutoscalingPolicy(config)

    if delay_s > 0:
        wait_periods = int(delay_s / CONTROL_LOOP_PERIOD_S)
        assert wait_periods > 1

    underload_requests, overload_requests = [20, 20], [100]
    trials = 1000

    new_num_replicas = None
    for trial in range(trials):
        if trial % 2 == 0:
            new_num_replicas = policy.get_decision_num_replicas(
                current_num_ongoing_requests=overload_requests,
                curr_target_num_replicas=1,
                current_handle_queued_queries=0,
            )
            if delay_s > 0:
                assert new_num_replicas == 1, trial
            else:
                assert new_num_replicas == 2, trial
        else:
            new_num_replicas = policy.get_decision_num_replicas(
                current_num_ongoing_requests=underload_requests,
                curr_target_num_replicas=2,
                current_handle_queued_queries=0,
            )
            if delay_s > 0:
                assert new_num_replicas == 2, trial
            else:
                assert new_num_replicas == 1, trial
    def test_smoothing_factor(self):
        config = AutoscalingConfig(min_replicas=0,
                                   max_replicas=100,
                                   target_num_ongoing_requests_per_replica=1,
                                   smoothing_factor=0.5)
        num_replicas = 10

        num_ongoing_requests = [4.0] * num_replicas
        desired_num_replicas = calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=num_ongoing_requests)
        assert 24 <= desired_num_replicas <= 26  # 10 + 0.5 * (40 - 10) = 25

        num_ongoing_requests = [0.25] * num_replicas
        desired_num_replicas = calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=num_ongoing_requests)
        assert 5 <= desired_num_replicas <= 8  # 10 + 0.5 * (2.5 - 10) = 6.25
Ejemplo n.º 11
0
def test_zero_default_proto():
    # Test that options set to zero (protobuf default value) still retain their
    # original value after being serialized and deserialized.
    config = DeploymentConfig(
        autoscaling_config={
            "min_replicas": 1,
            "max_replicas": 2,
            "smoothing_factor": 0.123,
            "downscale_delay_s": 0
        })
    serialized_config = config.to_proto_bytes()
    deserialized_config = DeploymentConfig.from_proto_bytes(serialized_config)
    new_delay_s = deserialized_config.autoscaling_config.downscale_delay_s
    assert new_delay_s == 0

    # Check that this test is not spuriously passing.
    default_downscale_delay_s = AutoscalingConfig().downscale_delay_s
    assert new_delay_s != default_downscale_delay_s
Ejemplo n.º 12
0
def test_upscale_downscale_delay():
    """Unit test for upscale_delay_s and downscale_delay_s."""

    upscale_delay_s = 30.0
    downscale_delay_s = 600.0

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=2,
        target_num_ongoing_requests_per_replica=1,
        upscale_delay_s=30.0,
        downscale_delay_s=600.0,
    )

    policy = BasicAutoscalingPolicy(config)

    upscale_wait_periods = int(upscale_delay_s / CONTROL_LOOP_PERIOD_S)
    downscale_wait_periods = int(downscale_delay_s / CONTROL_LOOP_PERIOD_S)

    overload_requests = [100]

    # We should scale up only after enough consecutive scale-up decisions.
    for i in range(upscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=overload_requests,
        curr_target_num_replicas=1)
    assert new_num_replicas == 2

    no_requests = [0, 0]

    # We should scale down only after enough consecutive scale-down decisions.
    for i in range(downscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=no_requests, curr_target_num_replicas=2)
    assert new_num_replicas == 1

    # Get some scale-up decisions, but not enough to trigger a scale up.
    for i in range(int(upscale_wait_periods / 2)):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    # Interrupt with a scale-down decision.
    policy.get_decision_num_replicas(current_num_ongoing_requests=[0],
                                     curr_target_num_replicas=1)

    # The counter should be reset, so it should require `upscale_wait_periods`
    # more periods before we actually scale up.
    for i in range(upscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=overload_requests,
        curr_target_num_replicas=1)
    assert new_num_replicas == 2

    # Get some scale-down decisions, but not enough to trigger a scale down.
    for i in range(int(downscale_wait_periods / 2)):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    # Interrupt with a scale-up decision.
    policy.get_decision_num_replicas(current_num_ongoing_requests=[100, 100],
                                     curr_target_num_replicas=2)

    # The counter should be reset so it should require `downscale_wait_periods`
    # more periods before we actually scale down.
    for i in range(downscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=no_requests, curr_target_num_replicas=2)
    assert new_num_replicas == 1
Ejemplo n.º 13
0
def deployment(
    _func_or_class: Optional[Callable] = None,
    name: Optional[str] = None,
    version: Optional[str] = None,
    prev_version: Optional[str] = None,
    num_replicas: Optional[int] = None,
    init_args: Optional[Tuple[Any]] = None,
    route_prefix: Optional[str] = None,
    ray_actor_options: Optional[Dict] = None,
    user_config: Optional[Any] = None,
    max_concurrent_queries: Optional[int] = None,
    _autoscaling_config: Optional[dict] = None,
) -> Callable[[Callable], Deployment]:
    """Define a Serve deployment.

    Args:
        name (Optional[str]): Globally-unique name identifying this deployment.
            If not provided, the name of the class or function will be used.
        version (Optional[str]): Version of the deployment. This is used to
            indicate a code change for the deployment; when it is re-deployed
            with a version change, a rolling update of the replicas will be
            performed. If not provided, every deployment will be treated as a
            new version.
        prev_version (Optional[str]): Version of the existing deployment which
            is used as a precondition for the next deployment. If prev_version
            does not match with the existing deployment's version, the
            deployment will fail. If not provided, deployment procedure will
            not check the existing deployment's version.
        num_replicas (Optional[int]): The number of processes to start up that
            will handle requests to this deployment. Defaults to 1.
        init_args (Optional[Tuple]): Arguments to be passed to the class
            constructor when starting up deployment replicas. These can also be
            passed when you call `.deploy()` on the returned Deployment.
        route_prefix (Optional[str]): Requests to paths under this HTTP path
            prefix will be routed to this deployment. Defaults to '/{name}'.
            Routing is done based on longest-prefix match, so if you have
            deployment A with a prefix of '/a' and deployment B with a prefix
            of '/a/b', requests to '/a', '/a/', and '/a/c' go to A and requests
            to '/a/b', '/a/b/', and '/a/b/c' go to B. Routes must not end with
            a '/' unless they're the root (just '/'), which acts as a
            catch-all.
        ray_actor_options (dict): Options to be passed to the Ray actor
            constructor such as resource requirements.
        user_config (Optional[Any]): [experimental] Config to pass to the
            reconfigure method of the deployment. This can be updated
            dynamically without changing the version of the deployment and
            restarting its replicas. The user_config needs to be hashable to
            keep track of updates, so it must only contain hashable types, or
            hashable types nested in lists and dictionaries.
        max_concurrent_queries (Optional[int]): The maximum number of queries
            that will be sent to a replica of this deployment without receiving
            a response. Defaults to 100.

    Example:

    >>> @serve.deployment(name="deployment1", version="v1")
        class MyDeployment:
            pass

    >>> MyDeployment.deploy(*init_args)
    >>> MyDeployment.options(num_replicas=2, init_args=init_args).deploy()

    Returns:
        Deployment
    """

    config = BackendConfig()
    if num_replicas is not None:
        config.num_replicas = num_replicas

    if user_config is not None:
        config.user_config = user_config

    if max_concurrent_queries is not None:
        config.max_concurrent_queries = max_concurrent_queries

    if _autoscaling_config is not None:
        config.autoscaling_config = AutoscalingConfig.parse_obj(
            _autoscaling_config)

    def decorator(_func_or_class):
        return Deployment(
            _func_or_class,
            name if name is not None else _func_or_class.__name__,
            config,
            version=version,
            prev_version=prev_version,
            init_args=init_args,
            route_prefix=route_prefix,
            ray_actor_options=ray_actor_options,
            _internal=True,
        )

    # This handles both parametrized and non-parametrized usage of the
    # decorator. See the @serve.batch code for more details.
    return decorator(_func_or_class) if callable(_func_or_class) else decorator