def test_imbalanced_replicas(ongoing_requests):
    """Check scaling decisions when requests are spread unevenly.

    The expected decision depends only on how the average number of ongoing
    requests per replica compares with
    ``target_num_ongoing_requests_per_replica``; the policy is always queried
    with a current target of 4 replicas.
    """
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=5,
        upscale_delay_s=0.0,
        downscale_delay_s=0.0,
    )
    policy = BasicAutoscalingPolicy(config)

    average = sum(ongoing_requests) / len(ongoing_requests)
    target = config.target_num_ongoing_requests_per_replica

    # The query is the same whatever the load looks like; only the expected
    # answer differs.
    new_num_replicas = policy.get_decision_num_replicas(
        current_num_ongoing_requests=ongoing_requests,
        curr_target_num_replicas=4,
        current_handle_queued_queries=0,
    )

    if average == target:
        # As long as the average equals the target, the number of replicas
        # stays the same.
        expected = 4
    elif average < target:
        # Autoscaling uses a ceiling operator, which means a slightly low
        # average is insufficient to downscale.
        expected = 4 if target - average <= 1 else 3
    else:
        # Average above the target: upscale.
        expected = 5

    assert new_num_replicas == expected
async def create_backend(self, backend_tag, backend_config, replica_config):
    """Register a new backend under the specified tag.

    While holding ``self.write_lock``: stores the backend info, registers an
    autoscaling policy when the config carries one, scales to the configured
    replica count, checkpoints, starts pending replicas and finally pushes
    the backend config to every router.
    """
    async with self.write_lock:
        # Keep the creator that starts replicas together with the arguments
        # to pass in and the configuration for the backend.
        worker_cls = create_backend_worker(replica_config.func_or_class)
        self.backends[backend_tag] = BackendInfo(
            worker_cls, backend_config, replica_config)

        autoscaling_config = backend_config.autoscaling_config
        if autoscaling_config is not None:
            self.autoscaling_policies[backend_tag] = BasicAutoscalingPolicy(
                backend_tag, autoscaling_config)

        self._scale_replicas(backend_tag, backend_config.num_replicas)

        # NOTE(edoakes): we must write a checkpoint before starting new
        # or pushing the updated config to avoid inconsistent state if we
        # crash while making the change.
        self._checkpoint()
        await self._start_pending_replicas()

        # Set the backend config inside the router
        # (particularly for max-batch-size).
        config_pushes = [
            router.set_backend_config.remote(backend_tag, backend_config)
            for router in self.routers.values()
        ]
        await asyncio.gather(*config_pushes)
        await self.broadcast_backend_config(backend_tag)
def deploy(
        self,
        name: str,
        deployment_config_proto_bytes: bytes,
        replica_config: ReplicaConfig,
        version: Optional[str],
        prev_version: Optional[str],
        route_prefix: Optional[str],
        deployer_job_id: "ray._raylet.JobID",
) -> Tuple[Optional[GoalId], bool]:
    """Build a ``DeploymentInfo`` for ``name`` and pass it to the state manager.

    When the deployment config carries an autoscaling config, ``num_replicas``
    is overwritten with ``min_replicas`` and a ``BasicAutoscalingPolicy`` is
    attached to the deployment info.

    Returns:
        The ``(goal_id, updating)`` pair returned by
        ``self.deployment_state_manager.deploy``.

    Raises:
        AssertionError: if ``route_prefix`` is given and does not start
            with "/".
        ValueError: if ``prev_version`` is given but no versioned deployment
            exists, or its version differs from ``prev_version``.
    """
    has_route = route_prefix is not None
    if has_route:
        assert route_prefix.startswith("/")

    deployment_config = DeploymentConfig.from_proto_bytes(
        deployment_config_proto_bytes)

    if prev_version is not None:
        current = self.deployment_state_manager.get_deployment(name)
        if current is None or not current.version:
            raise ValueError(
                f"prev_version '{prev_version}' is specified but "
                "there is no existing deployment.")
        if current.version != prev_version:
            raise ValueError(
                f"prev_version '{prev_version}' "
                "does not match with the existing "
                f"version '{current.version}'.")

    autoscaling_policy = None
    autoscaling_config = deployment_config.autoscaling_config
    if autoscaling_config is not None:
        # TODO: is this the desired behaviour? Should this be a setting?
        deployment_config.num_replicas = autoscaling_config.min_replicas
        autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config)

    # TODO(architkulkarni): When a deployment is redeployed, even if
    # the only change was num_replicas, the start_time_ms is refreshed.
    # Is this the desired behaviour?
    deployment_info = DeploymentInfo(
        actor_name=name,
        serialized_deployment_def=replica_config.serialized_deployment_def,
        version=version,
        deployment_config=deployment_config,
        replica_config=replica_config,
        deployer_job_id=deployer_job_id,
        start_time_ms=int(time.time() * 1000),
        autoscaling_policy=autoscaling_policy,
    )
    goal_id, updating = self.deployment_state_manager.deploy(
        name, deployment_info)

    if has_route:
        self.endpoint_state.update_endpoint(
            name, EndpointInfo(route=route_prefix))

    return goal_id, updating
def test_single_replica_receives_all_requests(ongoing_requests):
    """Check the decision equals total ongoing requests divided by the target.

    The policy is queried with a current target of 4 replicas and no
    upscale/downscale delay.
    """
    target_requests = 5

    policy = BasicAutoscalingPolicy(
        AutoscalingConfig(
            min_replicas=1,
            max_replicas=50,
            target_num_ongoing_requests_per_replica=target_requests,
            upscale_delay_s=0.0,
            downscale_delay_s=0.0,
        ))

    decision = policy.get_decision_num_replicas(
        current_num_ongoing_requests=ongoing_requests,
        curr_target_num_replicas=4,
    )

    expected = sum(ongoing_requests) / target_requests
    assert decision == expected
def test_fluctuating_ongoing_requests(delay_s):
    """
    Simulates a workload that switches between too many and too few
    ongoing requests.

    With a positive delay the decision is expected to stay at the current
    target on every trial; with no delay it is expected to flip between 1
    and 2 replicas.
    """
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=10,
        target_num_ongoing_requests_per_replica=50,
        upscale_delay_s=delay_s,
        downscale_delay_s=delay_s,
    )
    policy = BasicAutoscalingPolicy(config)

    delayed = delay_s > 0
    if delayed:
        wait_periods = int(delay_s / CONTROL_LOOP_PERIOD_S)
        assert wait_periods > 1

    underload_requests, overload_requests = [20, 20], [100]
    trials = 1000

    for trial in range(trials):
        # Even trials are overloaded at 1 replica, odd trials underloaded
        # at 2 replicas.
        overloaded = trial % 2 == 0
        if overloaded:
            requests, current_target, undelayed_decision = (
                overload_requests, 1, 2)
        else:
            requests, current_target, undelayed_decision = (
                underload_requests, 2, 1)

        new_num_replicas = policy.get_decision_num_replicas(
            current_num_ongoing_requests=requests,
            curr_target_num_replicas=current_target,
            current_handle_queued_queries=0,
        )

        expected = current_target if delayed else undelayed_decision
        assert new_num_replicas == expected, trial
async def create_backend(self, backend_tag: BackendTag,
                         backend_config: BackendConfig,
                         replica_config: ReplicaConfig) -> UUID:
    """Register a new backend under the specified tag.

    Returns the value produced by ``self._create_event_with_result`` for the
    new backend. If a backend with the same tag, backend config and replica
    config is already registered, nothing is done.

    NOTE(review): the already-registered path returns ``None`` although the
    annotation says ``UUID`` -- confirm what callers expect.

    Raises:
        RayServeException: re-raised from the replica scaling step, after
            the backend entry has been removed again.
    """
    async with self.write_lock:
        # Ensures this method is idempotent.
        existing = self.backend_state.get_backend(backend_tag)
        already_registered = (
            existing is not None
            and existing.backend_config == backend_config
            and existing.replica_config == replica_config)
        if already_registered:
            return

        # Save creator that starts replicas, the arguments to be passed in,
        # and the configuration for the backends.
        replica_cls = create_backend_replica(replica_config.func_or_class)
        backend_info = BackendInfo(
            worker_class=replica_cls,
            backend_config=backend_config,
            replica_config=replica_config)
        self.backend_state.add_backend(backend_tag, backend_info)

        autoscaling_config = (
            backend_config.internal_metadata.autoscaling_config)
        if autoscaling_config is not None:
            self.autoscaling_policies[backend_tag] = BasicAutoscalingPolicy(
                backend_tag, autoscaling_config)

        try:
            # This call should be to run control loop
            self.actor_reconciler._scale_backend_replicas(
                self.backend_state.backends, backend_tag,
                backend_config.num_replicas)
        except RayServeException as err:
            # Undo the registration made above before propagating.
            del self.backend_state.backends[backend_tag]
            raise err

        return_uuid = self._create_event_with_result(
            {backend_tag: backend_info})

        # NOTE(edoakes): we must write a checkpoint before starting new
        # or pushing the updated config to avoid inconsistent state if we
        # crash while making the change.
        self._checkpoint()
        reconciler = self.actor_reconciler
        await reconciler._enqueue_pending_scale_changes_loop(
            self.backend_state)
        await reconciler.backend_control_loop()

        self.notify_replica_handles_changed()

        # Set the backend config inside routers
        # (particularly for max_concurrent_queries).
        self.notify_backend_configs_changed()
        return return_uuid
def test_replicas_delayed_startup():
    """Unit test simulating replicas taking time to start up."""
    policy = BasicAutoscalingPolicy(
        AutoscalingConfig(
            min_replicas=1,
            max_replicas=200,
            target_num_ongoing_requests_per_replica=1,
            upscale_delay_s=0,
            downscale_delay_s=100000,
        ))

    # (ongoing requests per started replica, current target, expected)
    timesteps = [
        # Initial overload on the single replica.
        ([100], 1, 100),
        # New target is 100, but no new replicas finished spinning up
        # during this timestep.
        ([100], 100, 100),
        # Two new replicas spun up during this timestep.
        ([100, 20, 3], 100, 123),
        # A lot of queries got drained and a lot of replicas started up,
        # but the decision should not decrease, because of the downscale
        # delay.
        ([6, 2, 1, 1], 123, 123),
    ]

    for ongoing, current_target, expected in timesteps:
        new_num_replicas = policy.get_decision_num_replicas(
            ongoing, current_target)
        assert new_num_replicas == expected
def deploy(
        self,
        name: str,
        deployment_config_proto_bytes: bytes,
        replica_config_proto_bytes: bytes,
        route_prefix: Optional[str],
        deployer_job_id: Union["ray._raylet.JobID", bytes],
) -> bool:
    """Build a ``DeploymentInfo`` from proto bytes and hand it to the manager.

    Both configs are decoded from their serialized form. When the deployment
    config carries an autoscaling config, ``num_replicas`` is overwritten
    with ``min_replicas`` and a ``BasicAutoscalingPolicy`` is attached. A
    ``bytes`` job id is converted to a ``ray.JobID`` (little-endian integer).
    The endpoint for ``name`` is updated when ``route_prefix`` is given and
    deleted otherwise.

    Returns:
        The value returned by ``self.deployment_state_manager.deploy``.

    Raises:
        AssertionError: if ``route_prefix`` is given and does not start
            with "/".
    """
    has_route = route_prefix is not None
    if has_route:
        assert route_prefix.startswith("/")

    deployment_config = DeploymentConfig.from_proto_bytes(
        deployment_config_proto_bytes)
    version = deployment_config.version
    replica_config = ReplicaConfig.from_proto_bytes(
        replica_config_proto_bytes, deployment_config.needs_pickle())

    autoscaling_policy = None
    autoscaling_config = deployment_config.autoscaling_config
    if autoscaling_config is not None:
        # TODO: is this the desired behaviour? Should this be a setting?
        deployment_config.num_replicas = autoscaling_config.min_replicas
        autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config)

    job_id = deployer_job_id
    if isinstance(job_id, bytes):
        job_id = ray.JobID.from_int(int.from_bytes(job_id, "little"))

    # TODO(architkulkarni): When a deployment is redeployed, even if
    # the only change was num_replicas, the start_time_ms is refreshed.
    # Is this the desired behaviour?
    deployment_info = DeploymentInfo(
        actor_name=name,
        version=version,
        deployment_config=deployment_config,
        replica_config=replica_config,
        deployer_job_id=job_id,
        start_time_ms=int(time.time() * 1000),
        autoscaling_policy=autoscaling_policy,
    )
    updating = self.deployment_state_manager.deploy(name, deployment_info)

    if has_route:
        self.endpoint_state.update_endpoint(
            name, EndpointInfo(route=route_prefix))
    else:
        self.endpoint_state.delete_endpoint(name)

    return updating
async def create_backend(self, backend_tag: BackendTag,
                         backend_config: BackendConfig,
                         replica_config: ReplicaConfig) -> None:
    """Register a new backend under the specified tag.

    Does nothing if a backend with the same tag, backend config and replica
    config is already stored. Otherwise stores the backend, registers an
    autoscaling policy when configured, scales replicas, checkpoints, starts
    pending replicas and pushes the config to every router.

    Raises:
        RayServeException: re-raised from the replica scaling step, after
            the backend entry has been removed again.
    """
    async with self.write_lock:
        # Ensures this method is idempotent.
        existing = self.configuration_store.get_backend(backend_tag)
        already_registered = (
            existing is not None
            and existing.backend_config == backend_config
            and existing.replica_config == replica_config)
        if already_registered:
            return

        # Save creator that starts replicas, the arguments to be passed in,
        # and the configuration for the backends.
        worker_cls = create_backend_worker(replica_config.func_or_class)
        backend_info = BackendInfo(
            worker_class=worker_cls,
            backend_config=backend_config,
            replica_config=replica_config)
        self.configuration_store.add_backend(backend_tag, backend_info)

        autoscaling_config = (
            backend_config.internal_metadata.autoscaling_config)
        if autoscaling_config is not None:
            self.autoscaling_policies[backend_tag] = BasicAutoscalingPolicy(
                backend_tag, autoscaling_config)

        try:
            self.actor_reconciler._scale_replicas(
                self.configuration_store.backends, backend_tag,
                backend_config.num_replicas)
        except RayServeException as err:
            # Undo the registration made above before propagating.
            del self.configuration_store.backends[backend_tag]
            raise err

        # NOTE(edoakes): we must write a checkpoint before starting new
        # or pushing the updated config to avoid inconsistent state if we
        # crash while making the change.
        self._checkpoint()
        await self.actor_reconciler._start_pending_replicas(
            self.configuration_store)

        # Set the backend config inside the router
        # (particularly for max-batch-size).
        config_pushes = [
            router.set_backend_config.remote(backend_tag, backend_config)
            for router in self.actor_reconciler.router_handles()
        ]
        await asyncio.gather(*config_pushes)
        await self.broadcast_backend_config(backend_tag)
async def _recover_from_checkpoint(
        self, config_store: ConfigurationStore, controller: "ServeController"
) -> Dict[BackendTag, BasicAutoscalingPolicy]:
    """Re-push checkpointed state to the routers and finish pending work.

    After recovering actor handles, pushes traffic policies, known workers,
    backend configs and the route table to every router, then starts/stops
    pending replicas and removes pending backends and endpoints.

    Returns:
        A mapping from backend tag to a freshly built
        ``BasicAutoscalingPolicy`` for every backend whose internal metadata
        carries an autoscaling config.
    """
    self._recover_actor_handles()
    policies = {}

    # Push configuration state to the router.
    # TODO(edoakes): should we make this a pull-only model for simplicity?
    for endpoint, traffic_policy in config_store.traffic_policies.items():
        traffic_pushes = [
            router.set_traffic.remote(endpoint, traffic_policy)
            for router in self.router_handles()
        ]
        await asyncio.gather(*traffic_pushes)

    for backend_tag, replica_dict in self.workers.items():
        for replica_tag, worker in replica_dict.items():
            worker_pushes = [
                router.add_new_worker.remote(backend_tag, replica_tag, worker)
                for router in self.router_handles()
            ]
            await asyncio.gather(*worker_pushes)

    for backend, info in config_store.backends.items():
        config_pushes = [
            router.set_backend_config.remote(backend, info.backend_config)
            for router in self.router_handles()
        ]
        await asyncio.gather(*config_pushes)
        await controller.broadcast_backend_config(backend)

        autoscaling_config = (
            info.backend_config.internal_metadata.autoscaling_config)
        if autoscaling_config is not None:
            policies[backend] = BasicAutoscalingPolicy(
                backend, autoscaling_config)

    # Push configuration state to the routers.
    route_pushes = [
        router.set_route_table.remote(config_store.routes)
        for router in self.router_handles()
    ]
    await asyncio.gather(*route_pushes)

    # Start/stop any pending backend replicas.
    await self._start_pending_replicas(config_store)
    await self._stop_pending_replicas()

    # Remove any pending backends and endpoints.
    await self._remove_pending_backends()
    await self._remove_pending_endpoints()

    return policies
async def _recover_from_checkpoint(
        self, backend_state: BackendState, controller: "ServeController"
) -> Dict[BackendTag, BasicAutoscalingPolicy]:
    """Recover actor handles and rebuild autoscaling policies.

    ``controller`` is accepted but not used by this implementation.

    Returns:
        A mapping from backend tag to a freshly built
        ``BasicAutoscalingPolicy`` for every backend whose internal metadata
        carries an autoscaling config.
    """
    self._recover_actor_handles()

    policies = {}
    for backend, info in backend_state.backends.items():
        autoscaling_config = (
            info.backend_config.internal_metadata.autoscaling_config)
        if autoscaling_config is None:
            continue
        policies[backend] = BasicAutoscalingPolicy(
            backend, autoscaling_config)

    # Start/stop any pending backend replicas.
    await self._enqueue_pending_scale_changes_loop(backend_state)

    return policies
async def _recover_from_checkpoint(
        self, config_store: ConfigurationStore, controller: "ServeController"
) -> Dict[BackendTag, BasicAutoscalingPolicy]:
    """Recover actor handles, rebuild policies and finish replica changes.

    ``controller`` is accepted but not used by this implementation.

    Returns:
        A mapping from backend tag to a freshly built
        ``BasicAutoscalingPolicy`` for every backend whose internal metadata
        carries an autoscaling config.
    """
    self._recover_actor_handles()

    policies = {}
    for backend, info in config_store.backends.items():
        autoscaling_config = (
            info.backend_config.internal_metadata.autoscaling_config)
        if autoscaling_config is None:
            continue
        policies[backend] = BasicAutoscalingPolicy(
            backend, autoscaling_config)

    # Start/stop any pending backend replicas.
    await self._start_pending_backend_replicas(config_store)
    await self._stop_pending_backend_replicas()

    return policies
def test_upscale_downscale_delay():
    """Unit test for upscale_delay_s and downscale_delay_s.

    Verifies that a scaling decision only takes effect after enough
    consecutive decisions in the same direction, and that a decision in the
    opposite direction resets the count.
    """
    upscale_delay_s = 30.0
    downscale_delay_s = 600.0

    # Build the config from the same values the wait periods are derived
    # from, so the two cannot drift apart.
    config = AutoscalingConfig(
        min_replicas=1,
        max_replicas=2,
        target_num_ongoing_requests_per_replica=1,
        upscale_delay_s=upscale_delay_s,
        downscale_delay_s=downscale_delay_s,
    )
    policy = BasicAutoscalingPolicy(config)

    upscale_wait_periods = int(upscale_delay_s / CONTROL_LOOP_PERIOD_S)
    downscale_wait_periods = int(downscale_delay_s / CONTROL_LOOP_PERIOD_S)

    overload_requests = [100]
    no_requests = [0, 0]

    def decide(requests, current_target):
        """Ask the policy for a decision at the given load and target."""
        return policy.get_decision_num_replicas(
            current_num_ongoing_requests=requests,
            curr_target_num_replicas=current_target)

    def assert_unchanged(requests, current_target, periods):
        """Expect `periods` consecutive decisions to keep the target."""
        for i in range(periods):
            assert decide(requests, current_target) == current_target, i

    def assert_changes_after(requests, current_target, periods, expected):
        """Expect `periods` unchanged decisions, then a move to `expected`."""
        assert_unchanged(requests, current_target, periods)
        assert decide(requests, current_target) == expected

    # We should scale up only after enough consecutive scale-up decisions.
    assert_changes_after(overload_requests, 1, upscale_wait_periods, 2)

    # We should scale down only after enough consecutive scale-down decisions.
    assert_changes_after(no_requests, 2, downscale_wait_periods, 1)

    # Get some scale-up decisions, but not enough to trigger a scale up.
    assert_unchanged(overload_requests, 1, int(upscale_wait_periods / 2))

    # Interrupt with a scale-down decision.
    decide([0], 1)

    # The counter should be reset, so it should require `upscale_wait_periods`
    # more periods before we actually scale up.
    assert_changes_after(overload_requests, 1, upscale_wait_periods, 2)

    # Get some scale-down decisions, but not enough to trigger a scale down.
    assert_unchanged(no_requests, 2, int(downscale_wait_periods / 2))

    # Interrupt with a scale-up decision.
    decide([100, 100], 2)

    # The counter should be reset so it should require `downscale_wait_periods`
    # more periods before we actually scale down.
    assert_changes_after(no_requests, 2, downscale_wait_periods, 1)
async def _recover_from_checkpoint(self, checkpoint_bytes):
    """Recover the instance state from the provided checkpoint.

    Performs the following operations:
        1) Deserializes the internal state from the checkpoint.
        2) Pushes the latest configuration to the routers
           in case we crashed before updating them.
        3) Starts/stops any worker replicas that are pending creation or
           deletion.

    NOTE: this requires that self.write_lock is already acquired and will
    release it before returning. The lock is also released when recovery
    raises, so a failed recovery does not leave it held forever; the
    exception still propagates to the caller.
    """
    # Checked outside the try block: if the lock is not held there is
    # nothing for this method to release.
    assert self.write_lock.locked()
    try:
        start = time.time()
        logger.info("Recovering from checkpoint")

        # Load internal state from the checkpoint data.
        # NOTE(review): pickle.loads can execute arbitrary code; this
        # presumably relies on the checkpoint coming from a trusted store --
        # confirm.
        (
            self.routes,
            router_node_ids,
            self.backends,
            self.traffic_policies,
            self.replicas,
            self.replicas_to_start,
            self.replicas_to_stop,
            self.backends_to_remove,
            self.endpoints_to_remove,
        ) = pickle.loads(checkpoint_bytes)

        for node_id in router_node_ids:
            router_name = format_actor_name(SERVE_PROXY_NAME,
                                            self.instance_name, node_id)
            self.routers[node_id] = ray.get_actor(router_name)

        # Fetch actor handles for all of the backend replicas in the system.
        # All of these workers are guaranteed to already exist because they
        # would not be written to a checkpoint in self.workers until they
        # were created.
        for backend_tag, replica_tags in self.replicas.items():
            for replica_tag in replica_tags:
                replica_name = format_actor_name(replica_tag,
                                                 self.instance_name)
                self.workers[backend_tag][replica_tag] = ray.get_actor(
                    replica_name)

        # Push configuration state to the router.
        # TODO(edoakes): should we make this a pull-only model for
        # simplicity?
        for endpoint, traffic_policy in self.traffic_policies.items():
            await asyncio.gather(*[
                router.set_traffic.remote(endpoint, traffic_policy)
                for router in self.routers.values()
            ])

        for backend_tag, replica_dict in self.workers.items():
            for replica_tag, worker in replica_dict.items():
                await asyncio.gather(*[
                    router.add_new_worker.remote(backend_tag, replica_tag,
                                                 worker)
                    for router in self.routers.values()
                ])

        for backend, info in self.backends.items():
            await asyncio.gather(*[
                router.set_backend_config.remote(backend,
                                                 info.backend_config)
                for router in self.routers.values()
            ])
            await self.broadcast_backend_config(backend)
            if info.backend_config.autoscaling_config is not None:
                self.autoscaling_policies[backend] = BasicAutoscalingPolicy(
                    backend, info.backend_config.autoscaling_config)

        # Push configuration state to the routers.
        await asyncio.gather(*[
            router.set_route_table.remote(self.routes)
            for router in self.routers.values()
        ])

        # Start/stop any pending backend replicas.
        await self._start_pending_replicas()
        await self._stop_pending_replicas()

        # Remove any pending backends and endpoints.
        await self._remove_pending_backends()
        await self._remove_pending_endpoints()

        logger.info(
            "Recovered from checkpoint in {:.3f}s".format(time.time() -
                                                          start))
    finally:
        # Release on every exit path so that a failure above cannot leave
        # the write lock held.
        self.write_lock.release()