def __init__(self, backend_tag):
    self.backend_tag = backend_tag
    # NOTE(simon): We have to do this because max_concurrent_queries
    # and the replica handles come from different long poll keys.
    self.max_concurrent_queries: int = 8
    self.in_flight_queries: Dict[ActorHandle, set] = dict()
    # The iterator used for load balancing among replicas. Using
    # itertools.cycle, we implement a round-robin policy, skipping
    # overloaded replicas.
    # NOTE(simon): We can make this more pluggable and consider different
    # policies like: min load, pick min of two replicas, pick replicas on
    # the same node.
    self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())
    # Used to unblock this replica set when it is waiting for free
    # replicas. A newly added replica or an updated max_concurrent_queries
    # value means a query waiting on a free replica might be unblocked.
    self.config_updated_event = asyncio.Event()
    self.num_queued_queries = 0
    self.num_queued_queries_gauge = metrics.Gauge(
        "serve_backend_queued_queries",
        description=(
            "The current number of queries to this backend waiting"
            " to be assigned to a replica."),
        tag_keys=("backend", "endpoint"))
    self.num_queued_queries_gauge.set_default_tags({
        "backend": self.backend_tag
    })
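
# --- Hedged sketch (illustrative; not part of the original source) ---
# The comments above describe a round-robin policy that skips overloaded
# replicas. A minimal standalone version of that selection loop, assuming
# only the attributes defined in __init__; the names `try_assign_replica`
# and `query` are hypothetical:
def try_assign_replica(replica_set, query):
    # At most one full pass over the cycle; if every replica is saturated,
    # give up so the caller can block on config_updated_event and retry.
    for _ in range(len(replica_set.in_flight_queries)):
        replica = next(replica_set.replica_iterator)
        if (len(replica_set.in_flight_queries[replica]) <
                replica_set.max_concurrent_queries):
            # This replica has spare capacity: track the query in flight.
            replica_set.in_flight_queries[replica].add(query)
            return replica
    return None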
def __init__(
        self,
        deployment_name,
        event_loop: asyncio.AbstractEventLoop,
):
    self.deployment_name = deployment_name
    self.in_flight_queries: Dict[RunningReplicaInfo, set] = dict()
    # The iterator used for load balancing among replicas. Using
    # itertools.cycle, we implement a round-robin policy, skipping
    # overloaded replicas.
    # NOTE(simon): We can make this more pluggable and consider different
    # policies like: min load, pick min of two replicas, pick replicas on
    # the same node.
    self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())
    # Used to unblock this replica set when it is waiting for free
    # replicas. A newly added replica or an updated max_concurrent_queries
    # value means a query waiting on a free replica might be unblocked.
    # Python 3.8 deprecated the 'loop' parameter, and Python 3.10 removed
    # it altogether. Call accordingly.
    if sys.version_info >= (3, 10):
        self.config_updated_event = asyncio.Event()
    else:
        self.config_updated_event = asyncio.Event(loop=event_loop)
    self.num_queued_queries = 0
    self.num_queued_queries_gauge = metrics.Gauge(
        "serve_deployment_queued_queries",
        description=(
            "The current number of queries to this deployment waiting"
            " to be assigned to a replica."),
        tag_keys=("deployment", "endpoint"))
    self.num_queued_queries_gauge.set_default_tags(
        {"deployment": self.deployment_name})
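
# --- Hedged sketch (illustrative; not part of the original source) ---
# config_updated_event is described above as the wake-up signal for queries
# waiting on a free replica. A plausible replica-update path, under the
# hypothetical name `update_running_replicas`, rebuilds the round-robin
# cycle and sets the event so blocked assignment loops re-check capacity:
import itertools


def update_running_replicas(replica_set, running_replicas):
    # Keep in-flight bookkeeping for replicas that survived the update.
    replica_set.in_flight_queries = {
        replica: replica_set.in_flight_queries.get(replica, set())
        for replica in running_replicas
    }
    replica_set.replica_iterator = itertools.cycle(
        replica_set.in_flight_queries.keys())
    # Wake any pending queries waiting for a free replica.
    replica_set.config_updated_event.set()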
def __init__(
        self,
        controller_handle,
        backend_tag,
        event_loop: asyncio.AbstractEventLoop,
):
    self.backend_tag = backend_tag
    # NOTE(simon): We have to do this because max_concurrent_queries
    # and the replica handles come from different long poll keys.
    self.max_concurrent_queries: int = 8
    self.in_flight_queries: Dict[ActorHandle, set] = dict()
    # The iterator used for load balancing among replicas. Using
    # itertools.cycle, we implement a round-robin policy, skipping
    # overloaded replicas.
    # NOTE(simon): We can make this more pluggable and consider different
    # policies like: min load, pick min of two replicas, pick replicas on
    # the same node.
    self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())
    # Used to unblock this replica set when it is waiting for free
    # replicas. A newly added replica or an updated max_concurrent_queries
    # value means a query waiting on a free replica might be unblocked.
    self.config_updated_event = asyncio.Event(loop=event_loop)
    self.num_queued_queries = 0
    self.num_queued_queries_gauge = metrics.Gauge(
        "serve_deployment_queued_queries",
        description=(
            "The current number of queries to this deployment waiting"
            " to be assigned to a replica."),
        tag_keys=("deployment", "endpoint"))
    self.num_queued_queries_gauge.set_default_tags({
        "deployment": self.backend_tag
    })

    self.long_poll_client = LongPollClient(
        controller_handle,
        {
            (LongPollNamespace.BACKEND_CONFIGS, backend_tag):
            self.set_max_concurrent_queries,
            (LongPollNamespace.REPLICA_HANDLES, backend_tag):
            self.update_worker_replicas,
        },
        call_in_event_loop=event_loop,
    )
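
# --- Hedged sketch (illustrative; not part of the original source) ---
# The two callbacks registered with LongPollClient above are not shown in
# this snippet. Per the NOTE(simon) comment, max_concurrent_queries arrives
# via the BACKEND_CONFIGS key, so the first callback plausibly looks like
# the following; the exact signature and payload type are assumptions.
# `update_worker_replicas` would mirror the `update_running_replicas`
# sketch shown earlier.
def set_max_concurrent_queries(replica_set, backend_config):
    replica_set.max_concurrent_queries = backend_config.max_concurrent_queries
    # A raised limit may free up capacity, so unblock waiting queries.
    replica_set.config_updated_event.set()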
def __init__(
        self,
        _callable: Callable,
        deployment_name: str,
        replica_tag: ReplicaTag,
        deployment_config: DeploymentConfig,
        user_config: Any,
        version: DeploymentVersion,
        is_function: bool,
        controller_handle: ActorHandle,
) -> None:
    self.deployment_config = deployment_config
    self.deployment_name = deployment_name
    self.replica_tag = replica_tag
    self.callable = _callable
    self.is_function = is_function
    self.user_config = user_config
    self.version = version
    self.rwlock = aiorwlock.RWLock()

    user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None)
    if not callable(user_health_check):

        def user_health_check():
            pass

    self.user_health_check = sync_to_async(user_health_check)

    self.num_ongoing_requests = 0

    self.request_counter = metrics.Counter(
        "serve_deployment_request_counter",
        description=("The number of queries that have been "
                     "processed in this replica."),
        tag_keys=("deployment", "replica"),
    )
    self.request_counter.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.error_counter = metrics.Counter(
        "serve_deployment_error_counter",
        description=("The number of exceptions that have "
                     "occurred in this replica."),
        tag_keys=("deployment", "replica"),
    )
    self.error_counter.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.restart_counter = metrics.Counter(
        "serve_deployment_replica_starts",
        description=("The number of times this replica has been "
                     "restarted due to failure."),
        tag_keys=("deployment", "replica"),
    )
    self.restart_counter.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.processing_latency_tracker = metrics.Histogram(
        "serve_deployment_processing_latency_ms",
        description="The latency for queries to be processed.",
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("deployment", "replica"),
    )
    self.processing_latency_tracker.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.num_processing_items = metrics.Gauge(
        "serve_replica_processing_queries",
        description="The current number of queries being processed.",
        tag_keys=("deployment", "replica"),
    )
    self.num_processing_items.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.restart_counter.inc()

    self._shutdown_wait_loop_s = (
        deployment_config.graceful_shutdown_wait_loop_s)

    if deployment_config.autoscaling_config:
        process_remote_func = (
            controller_handle.record_autoscaling_metrics.remote)
        config = deployment_config.autoscaling_config
        start_metrics_pusher(
            interval_s=config.metrics_interval_s,
            collection_callback=self._collect_autoscaling_metrics,
            metrics_process_func=process_remote_func,
        )

    # NOTE(edoakes): we used to recommend that users use the "ray" logger
    # and tagged the logs with metadata as below. We now recommend using
    # the "ray.serve" 'component logger' (as of Ray 1.13). This is left to
    # maintain backwards compatibility with users who were using the
    # existing logger. We can consider removing it in Ray 2.0.
    ray_logger = logging.getLogger("ray")
    for handler in ray_logger.handlers:
        handler.setFormatter(
            logging.Formatter(
                handler.formatter._fmt +
                f" component=serve deployment={self.deployment_name} "
                f"replica={self.replica_tag}"))
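
# --- Hedged sketch (illustrative; not part of the original source) ---
# _collect_autoscaling_metrics is passed to start_metrics_pusher above but
# is not shown in this snippet. Since the pusher forwards results to the
# controller's record_autoscaling_metrics, a minimal collection callback
# could report this replica's in-flight count keyed by its tag; the return
# shape is an assumption:
def _collect_autoscaling_metrics(replica):
    # One data point per push interval: how many requests this replica is
    # currently serving.
    return {replica.replica_tag: replica.num_ongoing_requests}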
def __init__(self, _callable: Callable, backend_config: BackendConfig,
             is_function: bool, controller_handle: ActorHandle) -> None:
    self.backend_tag = ray.serve.api.get_replica_context().backend_tag
    self.replica_tag = ray.serve.api.get_replica_context().replica_tag
    self.callable = _callable
    self.is_function = is_function
    self.config = backend_config
    self.batch_queue = _BatchQueue(self.config.max_batch_size or 1,
                                   self.config.batch_wait_timeout)
    self.reconfigure(self.config.user_config)

    self.num_ongoing_requests = 0

    self.request_counter = metrics.Counter(
        "serve_backend_request_counter",
        description=("The number of queries that have been "
                     "processed in this replica."),
        tag_keys=("backend", ))
    self.request_counter.set_default_tags({"backend": self.backend_tag})

    self.long_poll_client = LongPollAsyncClient(controller_handle, {
        LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
    })

    self.error_counter = metrics.Counter(
        "serve_backend_error_counter",
        description=("The number of exceptions that have "
                     "occurred in the backend."),
        tag_keys=("backend", ))
    self.error_counter.set_default_tags({"backend": self.backend_tag})

    self.restart_counter = metrics.Counter(
        "serve_backend_replica_starts",
        description=("The number of times this replica "
                     "has been restarted due to failure."),
        tag_keys=("backend", "replica"))
    self.restart_counter.set_default_tags({
        "backend": self.backend_tag,
        "replica": self.replica_tag
    })

    self.queuing_latency_tracker = metrics.Histogram(
        "serve_backend_queuing_latency_ms",
        description=("The latency for queries in the replica's queue "
                     "waiting to be processed or batched."),
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("backend", "replica"))
    self.queuing_latency_tracker.set_default_tags({
        "backend": self.backend_tag,
        "replica": self.replica_tag
    })

    self.processing_latency_tracker = metrics.Histogram(
        "serve_backend_processing_latency_ms",
        description="The latency for queries to be processed.",
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("backend", "replica", "batch_size"))
    self.processing_latency_tracker.set_default_tags({
        "backend": self.backend_tag,
        "replica": self.replica_tag
    })

    self.num_queued_items = metrics.Gauge(
        "serve_replica_queued_queries",
        description=("The current number of queries queued in "
                     "the backend replicas."),
        tag_keys=("backend", "replica"))
    self.num_queued_items.set_default_tags({
        "backend": self.backend_tag,
        "replica": self.replica_tag
    })

    self.num_processing_items = metrics.Gauge(
        "serve_replica_processing_queries",
        description="The current number of queries being processed.",
        tag_keys=("backend", "replica"))
    self.num_processing_items.set_default_tags({
        "backend": self.backend_tag,
        "replica": self.replica_tag
    })

    self.restart_counter.inc()

    ray_logger = logging.getLogger("ray")
    for handler in ray_logger.handlers:
        handler.setFormatter(
            logging.Formatter(
                handler.formatter._fmt +
                f" component=serve backend={self.backend_tag} "
                f"replica={self.replica_tag}"))

    asyncio.get_event_loop().create_task(self.main_loop())
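
# --- Hedged sketch (illustrative; not part of the original source) ---
# _BatchQueue(max_batch_size, batch_wait_timeout) is constructed above but
# its implementation is not in this snippet. The usual shape of such a
# queue: collect up to max_batch_size items, but flush early once
# batch_wait_timeout elapses. A minimal asyncio version under those
# assumptions (`SketchBatchQueue` is a hypothetical name):
import asyncio


class SketchBatchQueue:
    def __init__(self, max_batch_size: int, batch_wait_timeout: float):
        self.queue: asyncio.Queue = asyncio.Queue()
        self.max_batch_size = max_batch_size
        self.batch_wait_timeout = batch_wait_timeout

    def put(self, item) -> None:
        self.queue.put_nowait(item)

    async def wait_for_batch(self) -> list:
        # Block for the first item, then greedily fill the batch until it
        # is full or the wait timeout expires.
        batch = [await self.queue.get()]
        loop = asyncio.get_event_loop()
        deadline = loop.time() + self.batch_wait_timeout
        while len(batch) < self.max_batch_size:
            remaining = deadline - loop.time()
            if remaining <= 0:
                break
            try:
                batch.append(
                    await asyncio.wait_for(self.queue.get(), remaining))
            except asyncio.TimeoutError:
                break
        return batch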
def __init__(
        self,
        _callable: Callable,
        deployment_name: str,
        replica_tag: ReplicaTag,
        deployment_config: DeploymentConfig,
        user_config: Any,
        version: DeploymentVersion,
        is_function: bool,
        controller_handle: ActorHandle,
) -> None:
    self.deployment_config = deployment_config
    self.deployment_name = deployment_name
    self.replica_tag = replica_tag
    self.callable = _callable
    self.is_function = is_function
    self.user_config = user_config
    self.version = version
    self.rwlock = aiorwlock.RWLock()

    user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None)
    if not callable(user_health_check):

        def user_health_check():
            pass

    self.user_health_check = sync_to_async(user_health_check)

    self.num_ongoing_requests = 0

    self.request_counter = metrics.Counter(
        "serve_deployment_request_counter",
        description=("The number of queries that have been "
                     "processed in this replica."),
        tag_keys=("deployment", "replica"),
    )
    self.request_counter.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.error_counter = metrics.Counter(
        "serve_deployment_error_counter",
        description=("The number of exceptions that have "
                     "occurred in this replica."),
        tag_keys=("deployment", "replica"),
    )
    self.error_counter.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.restart_counter = metrics.Counter(
        "serve_deployment_replica_starts",
        description=("The number of times this replica "
                     "has been restarted due to failure."),
        tag_keys=("deployment", "replica"),
    )
    self.restart_counter.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.processing_latency_tracker = metrics.Histogram(
        "serve_deployment_processing_latency_ms",
        description="The latency for queries to be processed.",
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("deployment", "replica"),
    )
    self.processing_latency_tracker.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.num_processing_items = metrics.Gauge(
        "serve_replica_processing_queries",
        description="The current number of queries being processed.",
        tag_keys=("deployment", "replica"),
    )
    self.num_processing_items.set_default_tags({
        "deployment": self.deployment_name,
        "replica": self.replica_tag
    })

    self.restart_counter.inc()

    self._shutdown_wait_loop_s = (
        deployment_config.graceful_shutdown_wait_loop_s)

    if deployment_config.autoscaling_config:
        config = deployment_config.autoscaling_config
        start_metrics_pusher(
            interval_s=config.metrics_interval_s,
            collection_callback=self._collect_autoscaling_metrics,
            controller_handle=controller_handle,
        )

    ray_logger = logging.getLogger("ray")
    for handler in ray_logger.handlers:
        handler.setFormatter(
            logging.Formatter(
                handler.formatter._fmt +
                f" component=serve deployment={self.deployment_name} "
                f"replica={self.replica_tag}"))
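
# --- Hedged sketch (illustrative; not part of the original source) ---
# sync_to_async wraps the user health check above so it can always be
# awaited, whether the user defined it as sync or async. The real helper
# may differ; a minimal version under that assumption:
import asyncio
import functools


def sync_to_async(func):
    # Already a coroutine function: callers can await it as-is.
    if asyncio.iscoroutinefunction(func):
        return func

    # Otherwise wrap the sync callable in a coroutine so `await` works.
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    return wrapper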
def __init__(self, _callable: Callable, backend_config: BackendConfig,
             is_function: bool, controller_handle: ActorHandle) -> None:
    self.backend_tag = ray.serve.api.get_replica_context().deployment
    self.replica_tag = ray.serve.api.get_replica_context().replica_tag
    self.callable = _callable
    self.is_function = is_function
    self.config = backend_config

    self.num_ongoing_requests = 0

    self.request_counter = metrics.Counter(
        "serve_deployment_request_counter",
        description=("The number of queries that have been "
                     "processed in this replica."),
        tag_keys=("deployment", "replica"))
    self.request_counter.set_default_tags({
        "deployment": self.backend_tag,
        "replica": self.replica_tag
    })

    self.loop = asyncio.get_event_loop()
    self.long_poll_client = LongPollClient(
        controller_handle,
        {
            (LongPollNamespace.BACKEND_CONFIGS, self.backend_tag):
            self._update_backend_configs,
        },
        call_in_event_loop=self.loop,
    )

    self.error_counter = metrics.Counter(
        "serve_deployment_error_counter",
        description=("The number of exceptions that have "
                     "occurred in this replica."),
        tag_keys=("deployment", "replica"))
    self.error_counter.set_default_tags({
        "deployment": self.backend_tag,
        "replica": self.replica_tag
    })

    self.restart_counter = metrics.Counter(
        "serve_deployment_replica_starts",
        description=("The number of times this replica "
                     "has been restarted due to failure."),
        tag_keys=("deployment", "replica"))
    self.restart_counter.set_default_tags({
        "deployment": self.backend_tag,
        "replica": self.replica_tag
    })

    self.processing_latency_tracker = metrics.Histogram(
        "serve_deployment_processing_latency_ms",
        description="The latency for queries to be processed.",
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("deployment", "replica"))
    self.processing_latency_tracker.set_default_tags({
        "deployment": self.backend_tag,
        "replica": self.replica_tag
    })

    self.num_processing_items = metrics.Gauge(
        "serve_replica_processing_queries",
        description="The current number of queries being processed.",
        tag_keys=("deployment", "replica"))
    self.num_processing_items.set_default_tags({
        "deployment": self.backend_tag,
        "replica": self.replica_tag
    })

    self.restart_counter.inc()

    ray_logger = logging.getLogger("ray")
    for handler in ray_logger.handlers:
        handler.setFormatter(
            logging.Formatter(
                handler.formatter._fmt +
                f" component=serve deployment={self.backend_tag} "
                f"replica={self.replica_tag}"))
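
# --- Hedged sketch (illustrative; not part of the original source) ---
# The formatter patching at the end of the constructors above appends Serve
# metadata to every record emitted through the "ray" logger. A
# self-contained demonstration of the same pattern using only the standard
# library (logger/handler names are made up for the demo):
import logging

demo_logger = logging.getLogger("demo")
demo_handler = logging.StreamHandler()
demo_handler.setFormatter(logging.Formatter("%(levelname)s %(message)s"))
demo_logger.addHandler(demo_handler)

# Extend each handler's existing format string in place, exactly as above.
for h in demo_logger.handlers:
    h.setFormatter(
        logging.Formatter(h.formatter._fmt +
                          " component=serve deployment=demo replica=r1"))

# Emits: "WARNING hello component=serve deployment=demo replica=r1"
demo_logger.warning("hello")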
def __init__(self, backend_tag: str, replica_tag: str, _callable: Callable,
             backend_config: BackendConfig, is_function: bool) -> None:
    self.backend_tag = backend_tag
    self.replica_tag = replica_tag
    self.callable = _callable
    self.is_function = is_function
    self.config = backend_config
    self.batch_queue = BatchQueue(self.config.max_batch_size or 1,
                                  self.config.batch_wait_timeout)

    self.num_ongoing_requests = 0

    self.request_counter = metrics.Count(
        "backend_request_counter",
        description=("Number of queries that have been "
                     "processed in this replica"),
        tag_keys=("backend", ))
    self.request_counter.set_default_tags({"backend": self.backend_tag})

    self.error_counter = metrics.Count(
        "backend_error_counter",
        description=("Number of exceptions that have "
                     "occurred in the backend"),
        tag_keys=("backend", ))
    self.error_counter.set_default_tags({"backend": self.backend_tag})

    self.restart_counter = metrics.Count(
        "backend_worker_starts",
        description=("The number of times this replica worker "
                     "has been restarted due to failure."),
        tag_keys=("backend", "replica_tag"))
    self.restart_counter.set_default_tags({
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag
    })

    self.queuing_latency_tracker = metrics.Histogram(
        "backend_queuing_latency_ms",
        description=("The latency for queries in the replica's queue "
                     "waiting to be processed or batched."),
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("backend", "replica_tag"))
    self.queuing_latency_tracker.set_default_tags({
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag
    })

    self.processing_latency_tracker = metrics.Histogram(
        "backend_processing_latency_ms",
        description="The latency for queries to be processed",
        boundaries=DEFAULT_LATENCY_BUCKET_MS,
        tag_keys=("backend", "replica_tag", "batch_size"))
    self.processing_latency_tracker.set_default_tags({
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag
    })

    self.num_queued_items = metrics.Gauge(
        "replica_queued_queries",
        description=("Current number of queries queued in "
                     "the backend replicas"),
        tag_keys=("backend", "replica_tag"))
    self.num_queued_items.set_default_tags({
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag
    })

    self.num_processing_items = metrics.Gauge(
        "replica_processing_queries",
        description="Current number of queries being processed",
        tag_keys=("backend", "replica_tag"))
    self.num_processing_items.set_default_tags({
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag
    })

    self.restart_counter.record(1)

    asyncio.get_event_loop().create_task(self.main_loop())
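
# --- Hedged sketch (illustrative; not part of the original source) ---
# main_loop is scheduled above but not shown in this snippet. Given the
# BatchQueue constructed in __init__, it plausibly pulls batches and runs
# the wrapped callable; every name in the body besides batch_queue is an
# assumption made for illustration:
async def main_loop(replica):
    while True:
        # Blocks until a batch is ready (full, or the wait timeout fired).
        batch = await replica.batch_queue.wait_for_batch()
        for query in batch:
            # Hypothetical invocation; the real code dispatches to the
            # user callable and records the latency and error metrics
            # registered above.
            replica.callable(query)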
async def setup(self, name, controller_name):
    # Note: Several queues are used in the router:
    # - When a request comes in, it's placed inside its corresponding
    #   endpoint_queue.
    # - The endpoint_queue is dequeued during the flush operation, which
    #   moves the queries to the backend buffer_queue. Here we match a
    #   request for an endpoint to a backend given some policy.
    # - The worker_queue is used to collect idle actor handles. These
    #   handles are dequeued during the second stage of the flush
    #   operation, which assigns queries in the buffer_queue to actor
    #   handles.
    self.name = name

    # -- Queues -- #

    # endpoint_name -> request queue
    # We use FIFO (left to right) ordering. New items should be added
    # using appendleft. Old items should be removed via pop().
    self.endpoint_queues: DefaultDict[deque[Query]] = defaultdict(deque)
    # backend_name -> worker replica tag queue
    self.worker_queues: DefaultDict[deque[str]] = defaultdict(deque)
    # backend_name -> worker payload queue
    self.backend_queues = defaultdict(deque)

    # -- Metadata -- #

    # endpoint_name -> traffic_policy
    self.traffic = dict()
    # backend_name -> backend_config
    self.backend_info = dict()
    # replica tag -> worker_handle
    self.replicas = dict()
    # backend_name -> replica_tag -> concurrent queries counter
    self.queries_counter = defaultdict(lambda: defaultdict(int))

    # -- Synchronization -- #

    # This lock guarantees that only one flush operation can happen at a
    # time. Without the lock, multiple flush operations could pop from the
    # same buffer_queue and worker_queue and create a deadlock, e.g. one
    # operation holding the only query while the other holds the only idle
    # replica. Additionally, allowing only one flush operation at a time
    # simplifies the design overhead for custom queuing and batching
    # policies.
    self.flush_lock = asyncio.Lock()

    # -- State Restoration -- #

    # Fetch the worker handles, traffic policies, and backend configs from
    # the controller. We use a "pull-based" approach instead of pushing
    # them from the controller so that the router can transparently
    # recover from failure.
    self.controller = ray.get_actor(controller_name)

    traffic_policies = ray.get(
        self.controller.get_traffic_policies.remote())
    for endpoint, traffic_policy in traffic_policies.items():
        await self.set_traffic(endpoint, traffic_policy)

    backend_dict = ray.get(
        self.controller.get_all_worker_handles.remote())
    for backend_tag, replica_dict in backend_dict.items():
        for replica_tag, worker in replica_dict.items():
            await self.add_new_worker(backend_tag, replica_tag, worker)

    backend_configs = ray.get(
        self.controller.get_backend_configs.remote())
    for backend, backend_config in backend_configs.items():
        await self.set_backend_config(backend, backend_config)

    # -- Metrics Registration -- #
    self.num_router_requests = metrics.Count(
        "num_router_requests",
        description="Number of requests processed by the router.",
        tag_keys=("endpoint", ))
    self.num_error_endpoint_requests = metrics.Count(
        "num_error_endpoint_requests",
        description=(
            "Number of requests that errored when getting results "
            "for the endpoint."),
        tag_keys=("endpoint", ))
    self.num_error_backend_requests = metrics.Count(
        "num_error_backend_requests",
        description=("Number of requests that errored when getting "
                     "result from the backend."),
        tag_keys=("backend", ))

    self.backend_queue_size = metrics.Gauge(
        "backend_queued_queries",
        description=("Current number of queries queued "
                     "in the router for a backend"),
        tag_keys=("backend", ))

    asyncio.get_event_loop().create_task(self.report_queue_lengths())
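
# --- Hedged sketch (illustrative; not part of the original source) ---
# The comments above describe a two-stage flush: endpoint queues drain into
# per-backend buffer queues according to the traffic policy, then buffered
# queries are matched with idle worker replicas. A minimal version of the
# second stage, held under flush_lock as the deadlock comment requires;
# `flush_backend` and the actor method `handle_request` are hypothetical
# names:
import asyncio


async def flush_backend(router, backend_name):
    async with router.flush_lock:
        buffer_queue = router.backend_queues[backend_name]
        worker_queue = router.worker_queues[backend_name]
        # Pair the oldest buffered query with the oldest idle replica
        # until one side runs dry; pop() removes the oldest (FIFO).
        while len(buffer_queue) and len(worker_queue):
            query = buffer_queue.pop()
            replica_tag = worker_queue.pop()
            router.queries_counter[backend_name][replica_tag] += 1
            router.replicas[replica_tag].handle_request.remote(query)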