import asyncio
import os
import sys
import time

from ray.serve.long_poll import LongPollerHost


# Test helper that wraps a LongPollerHost. Its exit() kills the process so
# tests can exercise listeners surviving a host restart.
class RestartableLongPollerHost:
    def __init__(self) -> None:
        print("actor started")
        self.host = LongPollerHost()
        self.host.notify_changed("timer", time.time())

    async def listen_for_change(self, key_to_ids):
        # Delay the response so callers are reliably blocked in the long
        # poll when the host is killed.
        await asyncio.sleep(0.5)
        return await self.host.listen_for_change(key_to_ids)

    async def exit(self):
        sys.exit(1)

# Variant of the host above that defers its death: set_exit() only records
# the intention, and exit_if_set() performs the actual process exit.
class RestartableLongPollerHost:
    def __init__(self) -> None:
        print("actor started")
        self.host = LongPollerHost()
        self.host.notify_changed("timer", time.time())
        self.should_exit = False

    async def listen_for_change(self, key_to_ids):
        print("listening for change ", key_to_ids)
        return await self.host.listen_for_change(key_to_ids)

    async def set_exit(self):
        self.should_exit = True

    async def exit_if_set(self):
        if self.should_exit:
            print("actor exit")
            os._exit(1)

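# LongPollerHost itself is only used, never shown, in these snippets. As a
# rough mental model, here is a minimal in-process sketch of the protocol
# the callers above assume (the name SketchLongPollHost and all details are
# illustrative, not Ray Serve's actual implementation): notify_changed()
# stores a value and bumps a per-key snapshot id; listen_for_change()
# returns immediately for any watched key whose stored snapshot id is newer
# than the id the client sent, and otherwise blocks until one changes.
import asyncio
from typing import Any, Dict, Tuple


class SketchLongPollHost:
    def __init__(self) -> None:
        self.snapshot_ids: Dict[str, int] = {}
        self.values: Dict[str, Any] = {}
        self.changed = asyncio.Event()

    def notify_changed(self, key: str, value: Any) -> None:
        self.snapshot_ids[key] = self.snapshot_ids.get(key, -1) + 1
        self.values[key] = value
        # Wake every blocked listener; each one re-checks its own keys.
        self.changed.set()

    async def listen_for_change(
            self, keys_to_snapshot_ids: Dict[str, int]
    ) -> Dict[str, Tuple[int, Any]]:
        while True:
            updated = {
                key: (self.snapshot_ids[key], self.values[key])
                for key, client_id in keys_to_snapshot_ids.items()
                if self.snapshot_ids.get(key, -1) > client_id
            }
            if updated:
                return updated
            # No watched key is newer than the client's snapshot: block
            # until the next notify_changed() call, then re-check.
            self.changed.clear()
            await self.changed.wait()
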
from collections import defaultdict

from ray.serve.config import BackendConfig


# Minimal stand-in for the Serve controller used in tests: it exposes the
# same long-poll interface and pushes a notification on every state change.
class MockControllerActor:
    def __init__(self):
        from ray.serve.long_poll import LongPollerHost
        self.host = LongPollerHost()
        self.backend_replicas = defaultdict(list)
        self.backend_configs = dict()
        self.clear()

    def clear(self):
        # Publish empty snapshots for every broadcast key.
        self.host.notify_changed("worker_handles", {})
        self.host.notify_changed("traffic_policies", {})
        self.host.notify_changed("backend_configs", {})

    async def listen_for_change(self, snapshot_ids):
        return await self.host.listen_for_change(snapshot_ids)

    def set_traffic(self, endpoint, traffic_policy):
        self.host.notify_changed("traffic_policies",
                                 {endpoint: traffic_policy})

    def add_new_replica(self,
                        backend_tag,
                        runner_actor,
                        backend_config=BackendConfig()):
        self.backend_replicas[backend_tag].append(runner_actor)
        self.backend_configs[backend_tag] = backend_config
        self.host.notify_changed("worker_handles", self.backend_replicas)
        self.host.notify_changed("backend_configs", self.backend_configs)

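# A hypothetical in-process driver for the mock above, assuming the
# snapshot-id semantics sketched earlier (an id of -1 is older than any
# stored snapshot, so the first listen returns immediately with the empty
# snapshots published by clear()). The (snapshot_id, value) return shape
# follows SketchLongPollHost, not necessarily Ray Serve's real format.
import asyncio


async def demo_poll() -> None:
    controller = MockControllerActor()
    snapshot_ids = {"traffic_policies": -1}

    # First listen: returns the initial empty traffic policies.
    updates = await controller.listen_for_change(snapshot_ids)
    for key, (snapshot_id, value) in updates.items():
        snapshot_ids[key] = snapshot_id  # only newer snapshots from now on

    # Second listen blocks until set_traffic() publishes a change.
    listen = asyncio.ensure_future(controller.listen_for_change(snapshot_ids))
    controller.set_traffic("my_endpoint", {"backend_a": 1.0})
    print(await listen)


if __name__ == "__main__":
    asyncio.run(demo_poll())
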
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in a
    new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted
    via these checkpoints. Soft state required by other components is
    fetched by those actors from this actor on startup and updates are
    pushed out from this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
    - If the call succeeds, the change was made and will be reflected in
      the system even if the controller or other actors die unexpectedly.
    - If the call fails, the change may have been made but isn't guaranteed
      to have been. The client should retry in this case. Note that this
      requires all implementations here to be idempotent.
    """

    async def __init__(self,
                       controller_name: str,
                       http_host: str,
                       http_port: str,
                       http_middlewares: List[Any],
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)
        # ConfigurationStore
        self.configuration_store = ConfigurationStore()
        # ActorStateReconciler
        self.actor_reconciler = ActorStateReconciler(controller_name,
                                                     detached)
        # backend -> AutoscalingPolicy
        self.autoscaling_policies = dict()
        # Dictionary of backend_tag -> router_name -> most recent queue
        # length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))
        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.http_host = http_host
        self.http_port = http_port
        self.http_middlewares = http_middlewares
        # Referenced by _checkpoint() below, so it must be stored here.
        self.detached = detached

        # If starting the actor for the first time, starts up the other
        # system components. If recovering, fetches their actor handles.
        self.actor_reconciler._start_routers_if_needed(
            self.http_host, self.http_port, self.http_middlewares)

        # NOTE(edoakes): unfortunately, we can't completely recover from a
        # checkpoint in the constructor because we block while waiting for
        # other actors to start up, and those actors fetch soft state from
        # this actor. Because no other tasks will start executing until
        # after the constructor finishes, if we were to run this logic in
        # the constructor it could lead to deadlock between this actor and
        # a child. However we do need to guarantee that we have fully
        # recovered from a checkpoint before any other state-changing calls
        # run. We address this by acquiring the write_lock and then posting
        # the task to recover from a checkpoint to the event loop. Other
        # state-changing calls acquire this lock and will be blocked until
        # recovering from the checkpoint finishes.
        checkpoint = self.kv_store.get(CHECKPOINT_KEY)
        if checkpoint is None:
            logger.debug("No checkpoint found")
        else:
            await self.write_lock.acquire()
            asyncio.get_event_loop().create_task(
                self._recover_from_checkpoint(checkpoint))

        # NOTE(simon): Currently we do all-to-all broadcast. This means
        # any listeners will receive notifications for all changes. This
        # can be a problem at scale, e.g. updating a single backend config
        # will send over the entire set of configs. In the future, we
        # should optimize the logic to support subscription by key.
        self.long_poll_host = LongPollerHost()

        # Publish the initial snapshots so clients can fetch them
        # immediately on startup.
        self.notify_backend_configs_changed()
        self.notify_replica_handles_changed()
        self.notify_traffic_policies_changed()

        asyncio.get_event_loop().create_task(self.run_control_loop())

    def notify_replica_handles_changed(self):
        self.long_poll_host.notify_changed(
            "worker_handles", {
                backend_tag: list(replica_dict.values())
                for backend_tag, replica_dict in
                self.actor_reconciler.backend_replicas.items()
            })

    def notify_traffic_policies_changed(self):
        self.long_poll_host.notify_changed(
            "traffic_policies", self.configuration_store.traffic_policies)

    def notify_backend_configs_changed(self):
        self.long_poll_host.notify_changed(
            "backend_configs", self.configuration_store.get_backend_configs())

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy the long poll client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
                determine whether or not the host should immediately return
                the data or wait for the value to be changed.
        """
        return await self.long_poll_host.listen_for_change(
            keys_to_snapshot_ids)

    def get_routers(self) -> Dict[str, ActorHandle]:
        """Returns a dictionary of node ID to router actor handles."""
        return self.actor_reconciler.routers_cache

    def get_router_config(self
                          ) -> Dict[str, Dict[str, Tuple[str, List[str]]]]:
        """Called by the router on startup to fetch required state."""
        return self.configuration_store.routes

    def _checkpoint(self) -> None:
        """Checkpoint internal state and write it to the KV store."""
        assert self.write_lock.locked()
        logger.debug("Writing checkpoint")
        start = time.time()
        checkpoint = pickle.dumps(
            Checkpoint(self.configuration_store, self.actor_reconciler))

        self.kv_store.put(CHECKPOINT_KEY, checkpoint)
        logger.debug(
            "Wrote checkpoint in {:.2f}s".format(time.time() - start))

        if (random.random() < _CRASH_AFTER_CHECKPOINT_PROBABILITY
                and self.detached):
            logger.warning("Intentionally crashing after checkpoint")
            os._exit(0)

    async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
        """Recover the instance state from the provided checkpoint.

        Performs the following operations:
            1) Deserializes the internal state from the checkpoint.
            2) Pushes the latest configuration to the routers in case we
               crashed before updating them.
            3) Starts/stops any replicas that are pending creation or
               deletion.

        NOTE: this requires that self.write_lock is already acquired and
        will release it before returning.
""" assert self.write_lock.locked() start = time.time() logger.info("Recovering from checkpoint") restored_checkpoint: Checkpoint = pickle.loads(checkpoint_bytes) # Restore ConfigurationStore self.configuration_store = restored_checkpoint.config # Restore ActorStateReconciler self.actor_reconciler = restored_checkpoint.reconciler self.autoscaling_policies = await self.actor_reconciler.\ _recover_from_checkpoint(self.configuration_store, self) logger.info("Recovered from checkpoint in {:.3f}s".format(time.time() - start)) self.write_lock.release() async def do_autoscale(self) -> None: for backend, info in self.configuration_store.backends.items(): if backend not in self.autoscaling_policies: continue new_num_replicas = self.autoscaling_policies[backend].scale( self.backend_stats[backend], info.backend_config.num_replicas) if new_num_replicas > 0: await self.update_backend_config( backend, BackendConfig(num_replicas=new_num_replicas)) async def run_control_loop(self) -> None: while True: await self.do_autoscale() async with self.write_lock: self.actor_reconciler._start_routers_if_needed( self.http_host, self.http_port, self.http_middlewares) checkpoint_required = self.actor_reconciler.\ _stop_routers_if_needed() if checkpoint_required: self._checkpoint() await asyncio.sleep(CONTROL_LOOP_PERIOD_S) def get_backend_configs(self) -> Dict[str, BackendConfig]: """Fetched by the router on startup.""" return self.configuration_store.get_backend_configs() def get_traffic_policies(self) -> Dict[str, TrafficPolicy]: """Fetched by the router on startup.""" return self.configuration_store.traffic_policies def _list_replicas(self, backend_tag: BackendTag) -> List[ReplicaTag]: """Used only for testing.""" return list(self.actor_reconciler.backend_replicas[backend_tag].keys()) def get_traffic_policy(self, endpoint: str) -> TrafficPolicy: """Fetched by serve handles.""" return self.configuration_store.traffic_policies[endpoint] def get_all_replica_handles(self) -> Dict[str, Dict[str, ActorHandle]]: """Fetched by the router on startup.""" return self.actor_reconciler.backend_replicas def get_all_backends(self) -> Dict[str, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" return self.configuration_store.get_backend_configs() def get_all_endpoints(self) -> Dict[str, Dict[str, Any]]: """Returns a dictionary of endpoint to endpoint config.""" endpoints = {} for route, (endpoint, methods) in self.configuration_store.routes.items(): if endpoint in self.configuration_store.traffic_policies: traffic_policy = self.configuration_store.traffic_policies[ endpoint] traffic_dict = traffic_policy.traffic_dict shadow_dict = traffic_policy.shadow_dict else: traffic_dict = {} shadow_dict = {} endpoints[endpoint] = { "route": route if route.startswith("/") else None, "methods": methods, "traffic": traffic_dict, "shadows": shadow_dict, } return endpoints async def _set_traffic(self, endpoint_name: str, traffic_dict: Dict[str, float]) -> None: if endpoint_name not in self.get_all_endpoints(): raise ValueError("Attempted to assign traffic for an endpoint '{}'" " that is not registered.".format(endpoint_name)) assert isinstance(traffic_dict, dict), "Traffic policy must be a dictionary." 
        for backend in traffic_dict:
            if self.configuration_store.get_backend(backend) is None:
                raise ValueError(
                    "Attempted to assign traffic to a backend '{}' that "
                    "is not registered.".format(backend))

        traffic_policy = TrafficPolicy(traffic_dict)
        self.configuration_store.traffic_policies[
            endpoint_name] = traffic_policy

        # NOTE(edoakes): we must write a checkpoint before pushing the
        # update to avoid inconsistent state if we crash after pushing the
        # update.
        self._checkpoint()
        self.notify_traffic_policies_changed()

    async def set_traffic(self, endpoint_name: str,
                          traffic_dict: Dict[str, float]) -> None:
        """Sets the traffic policy for the specified endpoint."""
        async with self.write_lock:
            await self._set_traffic(endpoint_name, traffic_dict)

    async def shadow_traffic(self, endpoint_name: str,
                             backend_tag: BackendTag,
                             proportion: float) -> None:
        """Shadow traffic from the endpoint to the backend."""
        async with self.write_lock:
            if endpoint_name not in self.get_all_endpoints():
                raise ValueError("Attempted to shadow traffic from an "
                                 "endpoint '{}' that is not registered.".
                                 format(endpoint_name))

            if self.configuration_store.get_backend(backend_tag) is None:
                raise ValueError(
                    "Attempted to shadow traffic to a backend '{}' that "
                    "is not registered.".format(backend_tag))

            self.configuration_store.traffic_policies[
                endpoint_name].set_shadow(backend_tag, proportion)

            # NOTE(edoakes): we must write a checkpoint before pushing the
            # update to avoid inconsistent state if we crash after pushing
            # the update.
            self._checkpoint()
            self.notify_traffic_policies_changed()

    # TODO(architkulkarni): add Optional for route after cloudpickle upgrade
    async def create_endpoint(self, endpoint: str,
                              traffic_dict: Dict[str, float], route,
                              methods) -> None:
        """Create a new endpoint with the specified route and methods.

        If the route is None, this is a "headless" endpoint that will not
        be exposed over HTTP and can only be accessed via a handle.
        """
        async with self.write_lock:
            # If this is a headless endpoint with no route, key the
            # endpoint based on its name.
            # TODO(edoakes): we should probably just store routes and
            # endpoints separately.
            if route is None:
                route = endpoint

            # TODO(edoakes): move this to the client side.
            err_prefix = "Cannot create endpoint."
            if route in self.configuration_store.routes:
                # Ensures this method is idempotent.
                if self.configuration_store.routes[route] == (endpoint,
                                                              methods):
                    return
                else:
                    raise ValueError(
                        "{} Route '{}' is already registered.".format(
                            err_prefix, route))

            if endpoint in self.get_all_endpoints():
                raise ValueError(
                    "{} Endpoint '{}' is already registered.".format(
                        err_prefix, endpoint))

            logger.info(
                "Registering route '{}' to endpoint '{}' with methods '{}'.".
                format(route, endpoint, methods))

            self.configuration_store.routes[route] = (endpoint, methods)

            # NOTE(edoakes): checkpoint is written in self._set_traffic.
            await self._set_traffic(endpoint, traffic_dict)
            await asyncio.gather(*[
                router.set_route_table.remote(self.configuration_store.routes)
                for router in self.actor_reconciler.router_handles()
            ])

    async def delete_endpoint(self, endpoint: str) -> None:
        """Delete the specified endpoint.

        Does not modify any corresponding backends.
        """
        logger.info("Deleting endpoint '{}'".format(endpoint))
        async with self.write_lock:
            # This method must be idempotent. We should validate that the
            # specified endpoint exists on the client.
            for route, (route_endpoint,
                        _) in self.configuration_store.routes.items():
                if route_endpoint == endpoint:
                    route_to_delete = route
                    break
            else:
                logger.info("Endpoint '{}' doesn't exist".format(endpoint))
                return

            # Remove the routing entry.
            del self.configuration_store.routes[route_to_delete]

            # Remove the traffic policy entry if it exists.
            if endpoint in self.configuration_store.traffic_policies:
                del self.configuration_store.traffic_policies[endpoint]

            self.actor_reconciler.endpoints_to_remove.append(endpoint)

            # NOTE(edoakes): we must write a checkpoint before pushing the
            # updates to the routers to avoid inconsistent state if we
            # crash after pushing the update.
            self._checkpoint()
            await asyncio.gather(*[
                router.set_route_table.remote(self.configuration_store.routes)
                for router in self.actor_reconciler.router_handles()
            ])

    async def create_backend(self, backend_tag: BackendTag,
                             backend_config: BackendConfig,
                             replica_config: ReplicaConfig) -> None:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            # Ensures this method is idempotent.
            backend_info = self.configuration_store.get_backend(backend_tag)
            if backend_info is not None:
                if (backend_info.backend_config == backend_config
                        and backend_info.replica_config == replica_config):
                    return

            backend_replica = create_backend_replica(
                replica_config.func_or_class)

            # Save the creator that starts replicas, the arguments to be
            # passed in, and the configuration for the backends.
            self.configuration_store.add_backend(
                backend_tag,
                BackendInfo(
                    worker_class=backend_replica,
                    backend_config=backend_config,
                    replica_config=replica_config))
            metadata = backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                self.autoscaling_policies[
                    backend_tag] = BasicAutoscalingPolicy(
                        backend_tag, metadata.autoscaling_config)

            try:
                self.actor_reconciler._scale_backend_replicas(
                    self.configuration_store.backends, backend_tag,
                    backend_config.num_replicas)
            except RayServeException as e:
                del self.configuration_store.backends[backend_tag]
                raise e

            # NOTE(edoakes): we must write a checkpoint before starting new
            # replicas or pushing the updated config to avoid inconsistent
            # state if we crash while making the change.
            self._checkpoint()
            await self.actor_reconciler._start_pending_backend_replicas(
                self.configuration_store)

            self.notify_replica_handles_changed()

            # Set the backend config inside the router
            # (particularly for max_concurrent_queries).
            self.notify_backend_configs_changed()
            await self.broadcast_backend_config(backend_tag)

    async def delete_backend(self, backend_tag: BackendTag) -> None:
        async with self.write_lock:
            # This method must be idempotent. We should validate that the
            # specified backend exists on the client.
            if self.configuration_store.get_backend(backend_tag) is None:
                return

            # Check that the specified backend isn't used by any endpoints.
            for endpoint, traffic_policy in self.configuration_store.\
                    traffic_policies.items():
                if (backend_tag in traffic_policy.traffic_dict
                        or backend_tag in traffic_policy.shadow_dict):
                    raise ValueError("Backend '{}' is used by endpoint '{}' "
                                     "and cannot be deleted. Please remove "
                                     "the backend from all endpoints and try "
                                     "again.".format(backend_tag, endpoint))

            # Scale its replicas down to 0. This will also remove the
            # backend from self.configuration_store.backends and
            # self.actor_reconciler.backend_replicas.
            self.actor_reconciler._scale_backend_replicas(
                self.configuration_store.backends, backend_tag, 0)

            # Remove the backend's metadata.
            del self.configuration_store.backends[backend_tag]
            if backend_tag in self.autoscaling_policies:
                del self.autoscaling_policies[backend_tag]

            # Add the intention to remove the backend from the router.
            self.actor_reconciler.backends_to_remove.append(backend_tag)

            # NOTE(edoakes): we must write a checkpoint before removing the
            # backend from the router to avoid inconsistent state if we
            # crash after pushing the update.
            self._checkpoint()
            await self.actor_reconciler._stop_pending_backend_replicas()

            self.notify_replica_handles_changed()

    async def update_backend_config(self, backend_tag: BackendTag,
                                    config_options: BackendConfig) -> None:
        """Set the config for the specified backend."""
        async with self.write_lock:
            assert (self.configuration_store.get_backend(backend_tag)
                    ), "Backend {} is not registered.".format(backend_tag)
            assert isinstance(config_options, BackendConfig)

            stored_backend_config = self.configuration_store.get_backend(
                backend_tag).backend_config
            backend_config = stored_backend_config.copy(
                update=config_options.dict(exclude_unset=True))
            backend_config._validate_complete()
            self.configuration_store.get_backend(
                backend_tag).backend_config = backend_config

            # Scale the replicas with the new configuration.
            self.actor_reconciler._scale_backend_replicas(
                self.configuration_store.backends, backend_tag,
                backend_config.num_replicas)

            # NOTE(edoakes): we must write a checkpoint before pushing the
            # update to avoid inconsistent state if we crash after pushing
            # the update.
            self._checkpoint()

            # Inform the router about the change in configuration
            # (particularly for setting max_batch_size).
            await self.actor_reconciler._start_pending_backend_replicas(
                self.configuration_store)
            await self.actor_reconciler._stop_pending_backend_replicas()

            self.notify_replica_handles_changed()
            self.notify_backend_configs_changed()
            await self.broadcast_backend_config(backend_tag)

    async def broadcast_backend_config(self,
                                       backend_tag: BackendTag) -> None:
        backend_config = self.configuration_store.get_backend(
            backend_tag).backend_config
        broadcast_futures = [
            replica.update_config.remote(backend_config).as_future()
            for replica in
            self.actor_reconciler.get_replica_handles_for_backend(backend_tag)
        ]
        await asyncio.gather(*broadcast_futures)

    def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
        """Get the current config for the specified backend."""
        assert (self.configuration_store.get_backend(backend_tag)
                ), "Backend {} is not registered.".format(backend_tag)
        return self.configuration_store.get_backend(
            backend_tag).backend_config

    async def shutdown(self) -> None:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            for router in self.actor_reconciler.router_handles():
                ray.kill(router, no_restart=True)
            for replica in self.actor_reconciler.get_replica_handles():
                ray.kill(replica, no_restart=True)
            self.kv_store.delete(CHECKPOINT_KEY)

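# _checkpoint() and _recover_from_checkpoint() above only ever touch two
# fields of the pickled Checkpoint object, so its shape is (at least) the
# following. This dataclass is an inferred sketch for illustration, not
# Ray Serve's actual Checkpoint definition.
from dataclasses import dataclass


@dataclass
class CheckpointSketch:
    # Positional order matches Checkpoint(self.configuration_store,
    # self.actor_reconciler) in _checkpoint().
    config: "ConfigurationStore"  # restored into self.configuration_store
    reconciler: "ActorStateReconciler"  # restored into self.actor_reconciler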
def __init__(self) -> None: print("actor started") self.host = LongPollerHost() self.host.notify_changed("timer", time.time())