def test_bounds_checking(self):
    """The desired replica count must always be clamped to [min, max]."""
    replica_count = 10
    upper, lower = 11, 9
    config = AutoscalingConfig(
        max_replicas=upper,
        min_replicas=lower,
        target_num_ongoing_requests_per_replica=100,
    )

    # Heavy overload: the result is capped at max_replicas.
    assert (
        calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=[150] * replica_count,
        )
        == upper
    )

    # Light load: the result is floored at min_replicas.
    assert (
        calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=[50] * replica_count,
        )
        == lower
    )

    # Every load level in between must stay within the configured bounds.
    for load in range(50, 150):
        desired = calculate_desired_num_replicas(
            autoscaling_config=config,
            current_num_ongoing_requests=[load] * replica_count,
        )
        assert lower <= desired <= upper
def test_replicas_delayed_startup():
    """Unit test simulating replicas taking time to start up."""
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=200,
        target_num_ongoing_requests_per_replica=1,
        upscale_delay_s=0,
        downscale_delay_s=100000,
    )
    policy = BasicAutoscalingPolicy(config)

    assert policy.get_decision_num_replicas([100], 1) == 100
    # New target is 100, but no new replicas finished spinning up during this
    # timestep.
    assert policy.get_decision_num_replicas([100], 100) == 100
    # Two new replicas spun up during this timestep.
    assert policy.get_decision_num_replicas([100, 20, 3], 100) == 123
    # A lot of queries got drained and a lot of replicas started up, but the
    # decision should not decrease, because of the downscale delay.
    assert policy.get_decision_num_replicas([6, 2, 1, 1], 123) == 123
def test_mutually_exclusive_num_replicas_and_autoscaling_config(self):
    """num_replicas and autoscaling_config cannot be set at the same time."""
    schema = self.get_minimal_deployment_schema()

    # Only num_replicas set: accepted.
    schema["num_replicas"] = 5
    schema["autoscaling_config"] = None
    DeploymentSchema.parse_obj(schema)

    # Only autoscaling_config set: accepted.
    schema["num_replicas"] = None
    schema["autoscaling_config"] = AutoscalingConfig().dict()
    DeploymentSchema.parse_obj(schema)

    # Both set simultaneously: must be rejected.
    schema["num_replicas"] = 5
    schema["autoscaling_config"] = AutoscalingConfig().dict()
    with pytest.raises(ValueError):
        DeploymentSchema.parse_obj(schema)
def test_imbalanced_replicas(ongoing_requests):
    """Scaling decisions should depend on the average load, not its spread."""
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=5,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0,
    )
    policy = BasicAutoscalingPolicy(config)

    mean_requests = sum(ongoing_requests) / len(ongoing_requests)
    target = config.target_num_ongoing_requests_per_replica

    # The same decision call is evaluated for every load scenario below.
    decision = policy.get_decision_num_replicas(
        current_num_ongoing_requests=ongoing_requests,
        curr_target_num_replicas=4,
        current_handle_queued_queries=0,
    )

    if mean_requests == target:
        # Average load exactly at target: the replica count stays the same.
        assert decision == 4
    elif mean_requests < target:
        # Average load below target: downscaling behavior.
        if target - mean_requests <= 1:
            # Autoscaling uses a ceiling operator, which means a slightly low
            # current_num_ongoing_requests value is insufficient to downscale
            assert decision == 4
        else:
            assert decision == 3
    else:
        # Average load above target: upscaling behavior.
        assert decision == 5
def test_scale_down(self):
    """Replicas running below target load should trigger a scale-down."""
    config = AutoscalingConfig(
        min_replicas=0,
        max_replicas=100,
        target_num_ongoing_requests_per_replica=1,
    )
    light_load = [0.5] * 10
    desired = calculate_desired_num_replicas(
        autoscaling_config=config,
        current_num_ongoing_requests=light_load,
    )
    assert 4 <= desired <= 6  # 10 * 0.5 = 5
def test_scale_up(self):
    """Replicas running above target load should trigger a scale-up."""
    config = AutoscalingConfig(
        min_replicas=0,
        max_replicas=100,
        target_num_ongoing_requests_per_replica=1,
    )
    heavy_load = [2.0] * 10
    desired = calculate_desired_num_replicas(
        autoscaling_config=config,
        current_num_ongoing_requests=heavy_load,
    )
    assert 19 <= desired <= 21  # 10 * 2 = 20
def test_autoscaling_config_validation():
    # Check validation over publicly exposed options
    with pytest.raises(ValidationError):
        # min_replicas must be nonnegative
        AutoscalingConfig(min_replicas=-1)
    with pytest.raises(ValidationError):
        # max_replicas must be positive
        AutoscalingConfig(max_replicas=0)
    with pytest.raises(ValidationError):
        # target_num_ongoing_requests_per_replica must be nonnegative
        AutoscalingConfig(target_num_ongoing_requests_per_replica=-1)
    with pytest.raises(ValueError):
        # max_replicas must be greater than or equal to min_replicas
        AutoscalingConfig(min_replicas=100, max_replicas=1)
    # Default values should not raise an error
    AutoscalingConfig()
def test_single_replica_receives_all_requests(ongoing_requests):
    """The decision should track total load even when it is concentrated."""
    target_requests = 5
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=50,
        target_num_ongoing_requests_per_replica=target_requests,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0,
    )
    decision = BasicAutoscalingPolicy(config).get_decision_num_replicas(
        current_num_ongoing_requests=ongoing_requests,
        curr_target_num_replicas=4,
    )
    assert decision == sum(ongoing_requests) / target_requests
def test_fluctuating_ongoing_requests(delay_s):
    """
    Simulates a workload that switches between too many and too few
    ongoing requests.
    """
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=50,
        upscale_delay_s=delay_s,
        downscale_delay_s=delay_s,
    )
    policy = BasicAutoscalingPolicy(config)

    if delay_s > 0:
        wait_periods = int(delay_s / CONTROL_LOOP_PERIOD_S)
        assert wait_periods > 1

    underload, overload = [20, 20], [100]
    for trial in range(1000):
        # Even trials simulate overload against a target of 1; odd trials
        # simulate underload against a target of 2.
        is_overloaded = trial % 2 == 0
        decision = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload if is_overloaded else underload,
            curr_target_num_replicas=1 if is_overloaded else 2,
            current_handle_queued_queries=0,
        )
        if delay_s > 0:
            # The flip-flop never persists long enough to beat the delay,
            # so each decision leaves the current target unchanged.
            assert decision == (1 if is_overloaded else 2), trial
        else:
            # With no delay, every decision takes effect immediately.
            assert decision == (2 if is_overloaded else 1), trial
def test_smoothing_factor(self):
    """smoothing_factor should dampen how far a single decision moves."""
    config = AutoscalingConfig(
        min_replicas=0,
        max_replicas=100,
        target_num_ongoing_requests_per_replica=1,
        smoothing_factor=0.5,
    )
    replica_count = 10

    # Overloaded replicas: only half the raw gap is applied.
    desired = calculate_desired_num_replicas(
        autoscaling_config=config,
        current_num_ongoing_requests=[4.0] * replica_count,
    )
    assert 24 <= desired <= 26  # 10 + 0.5 * (40 - 10) = 25

    # Underloaded replicas: likewise only half the gap downward.
    desired = calculate_desired_num_replicas(
        autoscaling_config=config,
        current_num_ongoing_requests=[0.25] * replica_count,
    )
    assert 5 <= desired <= 8  # 10 + 0.5 * (2.5 - 10) = 6.25
def test_zero_default_proto():
    # Test that options set to zero (protobuf default value) still retain their
    # original value after being serialized and deserialized.
    config = DeploymentConfig(
        autoscaling_config={
            "min_replicas": 1,
            "max_replicas": 2,
            "smoothing_factor": 0.123,
            "downscale_delay_s": 0
        })
    round_tripped = DeploymentConfig.from_proto_bytes(config.to_proto_bytes())
    delay_after_round_trip = round_tripped.autoscaling_config.downscale_delay_s
    assert delay_after_round_trip == 0

    # Check that this test is not spuriously passing.
    assert delay_after_round_trip != AutoscalingConfig().downscale_delay_s
def test_upscale_downscale_delay():
    """Unit test for upscale_delay_s and downscale_delay_s.

    The policy must observe `delay / CONTROL_LOOP_PERIOD_S` consecutive
    decisions in the same direction before it changes the replica target,
    and a single decision in the opposite direction resets that counter.
    """
    upscale_delay_s = 30.0
    downscale_delay_s = 600.0

    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=2,
        target_num_ongoing_requests_per_replica=1,
        # Pass the local variables (rather than repeating the literals 30.0 /
        # 600.0) so the wait-period math below always matches the configured
        # delays if either value is changed.
        upscale_delay_s=upscale_delay_s,
        downscale_delay_s=downscale_delay_s,
    )

    policy = BasicAutoscalingPolicy(config)

    upscale_wait_periods = int(upscale_delay_s / CONTROL_LOOP_PERIOD_S)
    downscale_wait_periods = int(downscale_delay_s / CONTROL_LOOP_PERIOD_S)

    overload_requests = [100]

    # We should scale up only after enough consecutive scale-up decisions.
    for i in range(upscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=overload_requests,
        curr_target_num_replicas=1)
    assert new_num_replicas == 2

    no_requests = [0, 0]

    # We should scale down only after enough consecutive scale-down decisions.
    for i in range(downscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=no_requests,
        curr_target_num_replicas=2)
    assert new_num_replicas == 1

    # Get some scale-up decisions, but not enough to trigger a scale up.
    for i in range(int(upscale_wait_periods / 2)):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    # Interrupt with a scale-down decision.
    policy.get_decision_num_replicas(
        current_num_ongoing_requests=[0],
        curr_target_num_replicas=1)

    # The counter should be reset, so it should require `upscale_wait_periods`
    # more periods before we actually scale up.
    for i in range(upscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=overload_requests,
            curr_target_num_replicas=1)
        assert new_num_replicas == 1, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=overload_requests,
        curr_target_num_replicas=1)
    assert new_num_replicas == 2

    # Get some scale-down decisions, but not enough to trigger a scale down.
    for i in range(int(downscale_wait_periods / 2)):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    # Interrupt with a scale-up decision.
    policy.get_decision_num_replicas(
        current_num_ongoing_requests=[100, 100],
        curr_target_num_replicas=2)

    # The counter should be reset so it should require `downscale_wait_periods`
    # more periods before we actually scale down.
    for i in range(downscale_wait_periods):
        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=no_requests,
            curr_target_num_replicas=2)
        assert new_num_replicas == 2, i

    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=no_requests,
        curr_target_num_replicas=2)
    assert new_num_replicas == 1
def deployment(
    _func_or_class: Optional[Callable] = None,
    name: Optional[str] = None,
    version: Optional[str] = None,
    prev_version: Optional[str] = None,
    num_replicas: Optional[int] = None,
    init_args: Optional[Tuple[Any, ...]] = None,
    route_prefix: Optional[str] = None,
    ray_actor_options: Optional[Dict] = None,
    user_config: Optional[Any] = None,
    max_concurrent_queries: Optional[int] = None,
    _autoscaling_config: Optional[dict] = None,
) -> Callable[[Callable], Deployment]:
    """Define a Serve deployment.

    Works both as a bare decorator (``@serve.deployment``) and as a
    parametrized decorator factory (``@serve.deployment(...)``).

    Args:
        name (Optional[str]): Globally-unique name identifying this
            deployment. If not provided, the name of the class or function
            will be used.
        version (Optional[str]): Version of the deployment. This is used to
            indicate a code change for the deployment; when it is re-deployed
            with a version change, a rolling update of the replicas will be
            performed. If not provided, every deployment will be treated as a
            new version.
        prev_version (Optional[str]): Version of the existing deployment
            which is used as a precondition for the next deployment. If
            prev_version does not match with the existing deployment's
            version, the deployment will fail. If not provided, deployment
            procedure will not check the existing deployment's version.
        num_replicas (Optional[int]): The number of processes to start up
            that will handle requests to this deployment. Defaults to 1.
        init_args (Optional[Tuple]): Arguments to be passed to the class
            constructor when starting up deployment replicas. These can also
            be passed when you call `.deploy()` on the returned Deployment.
        route_prefix (Optional[str]): Requests to paths under this HTTP path
            prefix will be routed to this deployment. Defaults to '/{name}'.
            Routing is done based on longest-prefix match, so if you have
            deployment A with a prefix of '/a' and deployment B with a prefix
            of '/a/b', requests to '/a', '/a/', and '/a/c' go to A and
            requests to '/a/b', '/a/b/', and '/a/b/c' go to B. Routes must
            not end with a '/' unless they're the root (just '/'), which
            acts as a catch-all.
        ray_actor_options (Optional[Dict]): Options to be passed to the Ray
            actor constructor such as resource requirements.
        user_config (Optional[Any]): [experimental] Config to pass to the
            reconfigure method of the deployment. This can be updated
            dynamically without changing the version of the deployment and
            restarting its replicas. The user_config needs to be hashable to
            keep track of updates, so it must only contain hashable types,
            or hashable types nested in lists and dictionaries.
        max_concurrent_queries (Optional[int]): The maximum number of queries
            that will be sent to a replica of this deployment without
            receiving a response. Defaults to 100.
        _autoscaling_config (Optional[dict]): [private] Dictionary of
            autoscaling options; it is validated by being parsed into an
            AutoscalingConfig and stored on the deployment's config.

    Example:
    >>> @serve.deployment(name="deployment1", version="v1")
        class MyDeployment:
            pass

    >>> MyDeployment.deploy(*init_args)
    >>> MyDeployment.options(num_replicas=2, init_args=init_args).deploy()

    Returns:
        Deployment
    """
    # Collect only the explicitly-provided options so BackendConfig's own
    # defaults apply for anything the caller left as None.
    config = BackendConfig()
    if num_replicas is not None:
        config.num_replicas = num_replicas

    if user_config is not None:
        config.user_config = user_config

    if max_concurrent_queries is not None:
        config.max_concurrent_queries = max_concurrent_queries

    if _autoscaling_config is not None:
        # parse_obj validates the raw dict against the AutoscalingConfig model.
        config.autoscaling_config = AutoscalingConfig.parse_obj(
            _autoscaling_config)

    def decorator(_func_or_class):
        # Fall back to the wrapped object's own name when none was given.
        return Deployment(
            _func_or_class,
            name if name is not None else _func_or_class.__name__,
            config,
            version=version,
            prev_version=prev_version,
            init_args=init_args,
            route_prefix=route_prefix,
            ray_actor_options=ray_actor_options,
            _internal=True,
        )

    # This handles both parametrized and non-parametrized usage of the
    # decorator. See the @serve.batch code for more details.
    return decorator(_func_or_class) if callable(_func_or_class) else decorator