def __init__(self, backend_tag: str, replica_tag: str, _callable: Callable,
             backend_config: BackendConfig, is_function: bool) -> None:
    """Store replica state, register metrics and schedule the main loop.

    Args:
        backend_tag: Tag of the backend; also used as the "backend" label
            value when the restart metric is recorded below.
        replica_tag: Tag of this replica; also used as the "replica_tag"
            label value when the restart metric is recorded below.
        _callable: Object stored as ``self.callable``.
        backend_config: Configuration stored as ``self.config``. Its
            ``max_batch_size`` and ``batch_wait_timeout`` fields are used
            to build the batch queue.
        is_function: Flag stored as ``self.is_function`` (presumably tells
            whether ``_callable`` is a plain function -- confirm against
            the code that consumes it).
    """
    self.backend_tag = backend_tag
    self.replica_tag = replica_tag
    self.callable = _callable
    self.is_function = is_function

    self.config = backend_config
    # A falsy max_batch_size (None or 0) falls back to a batch size of 1.
    self.batch_queue = BatchQueue(self.config.max_batch_size or 1,
                                  self.config.batch_wait_timeout)

    self.num_ongoing_requests = 0

    # -- Metrics Registration -- #
    self.request_counter = metrics.Count(
        "backend_request_counter",
        ("Number of queries that have been "
         "processed in this replica"),
        "requests",
        ["backend"])

    self.error_counter = metrics.Count(
        "backend_error_counter",
        ("Number of exceptions that have "
         "occurred in the backend"),
        "errors",
        ["backend"])

    self.restart_counter = metrics.Count(
        "backend_worker_starts",
        ("The number of time this replica workers "
         "has been restarted due to failure."),
        "restarts",
        ["backend", "replica_tag"])

    self.queuing_latency_tracker = metrics.Histogram(
        "backend_queuing_latency_ms",
        ("The latency for queries waiting in the replica's queue "
         "waiting to be processed or batched."),
        "ms",
        DEFAULT_LATENCY_BUCKET_MS,
        ["backend", "replica_tag"])

    self.processing_latency_tracker = metrics.Histogram(
        "backend_processing_latency_ms",
        "The latency for queries to be processed",
        "ms",
        DEFAULT_LATENCY_BUCKET_MS,
        ["backend", "replica_tag", "batch_size"])

    self.num_queued_items = metrics.Gauge(
        "replica_queued_queries",
        "Current number of queries queued in the the backend replicas",
        "requests",
        ["backend", "replica_tag"])

    self.num_processing_items = metrics.Gauge(
        "replica_processing_queries",
        "Current number of queries being processed",
        "requests",
        ["backend", "replica_tag"])

    # Every construction of this object records one start for this
    # (backend, replica) pair.
    self.restart_counter.record(1, {
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag
    })

    # The event loop only keeps weak references to the tasks it runs, so a
    # task whose handle is dropped may be garbage-collected before it
    # finishes. Keep a strong reference on the instance for its lifetime.
    self._main_loop_task = asyncio.get_event_loop().create_task(
        self.main_loop())
async def setup(self, name, controller_name):
    """Initialize router state, restore it from the controller, start metrics.

    Args:
        name: Value stored as ``self.name``.
        controller_name: Name passed to ``ray.get_actor`` to look up the
            controller actor that state is pulled from.
    """
    # Note: Several queues are used in the router
    # - When a request comes in, it's placed inside its corresponding
    #   endpoint_queue.
    # - The endpoint_queue is dequeued during the flush operation, which
    #   moves the queries to the backend buffer_queue. Here we match a
    #   request for an endpoint to a backend given some policy.
    # - The worker_queue is used to collect idle actor handles. These
    #   handles are dequeued during the second stage of the flush
    #   operation, which assigns queries in buffer_queue to actor handles.
    self.name = name

    # -- Queues -- #

    # endpoint_name -> request queue
    # We use FIFO (left to right) ordering. The new items should be added
    # using appendleft. Old items should be removed via pop().
    self.endpoint_queues: DefaultDict[str, deque[Query]] = defaultdict(
        deque)
    # backend_name -> worker replica tag queue
    self.worker_queues: DefaultDict[str, deque[str]] = defaultdict(deque)
    # backend_name -> worker payload queue
    self.backend_queues = defaultdict(deque)

    # -- Metadata -- #

    # endpoint_name -> traffic_policy
    self.traffic = dict()
    # backend_name -> backend_config
    self.backend_info = dict()
    # replica tag -> worker_handle
    self.replicas = dict()
    # backend_name -> replica_tag -> concurrent queries counter
    self.queries_counter = defaultdict(lambda: defaultdict(int))

    # -- Synchronization -- #

    # This lock guarantees that only one flush operation can happen at a
    # time. Without the lock, multiple flush operations can pop from the
    # same buffer_queue and worker_queue and create a deadlock. For
    # example, one operation holding the only query and the other flush
    # operation holding the only idle replica. Additionally, allowing only
    # one flush operation at a time simplifies design overhead for custom
    # queuing and batching policies.
    self.flush_lock = asyncio.Lock()

    # -- State Restoration -- #
    # Fetch the worker handles, traffic policies, and backend configs from
    # the controller. We use a "pull-based" approach instead of pushing
    # them from the controller so that the router can transparently recover
    # from failure.
    # NOTE(review): the ray.get calls below block until the controller
    # replies; inside this coroutine that presumably stalls the event loop
    # for the duration. Left unchanged because awaiting instead could let
    # other calls run before the metrics further down exist -- confirm.
    self.controller = ray.get_actor(controller_name)

    traffic_policies = ray.get(
        self.controller.get_traffic_policies.remote())
    for endpoint, traffic_policy in traffic_policies.items():
        await self.set_traffic(endpoint, traffic_policy)

    backend_dict = ray.get(self.controller.get_all_worker_handles.remote())
    for backend_tag, replica_dict in backend_dict.items():
        for replica_tag, worker in replica_dict.items():
            await self.add_new_worker(backend_tag, replica_tag, worker)

    backend_configs = ray.get(self.controller.get_backend_configs.remote())
    for backend, backend_config in backend_configs.items():
        await self.set_backend_config(backend, backend_config)

    # -- Metrics Registration -- #
    self.num_router_requests = metrics.Count(
        "num_router_requests",
        "Number of requests processed by the router.",
        "requests",
        ["endpoint"])
    self.num_error_endpoint_requests = metrics.Count(
        "num_error_endpoint_requests",
        ("Number of requests that errored when getting results "
         "for the endpoint."),
        "requests",
        ["endpoint"])
    self.num_error_backend_requests = metrics.Count(
        "num_error_backend_requests",
        ("Number of requests that errored when getting result "
         "from the backend."),
        "requests",
        ["backend"])

    self.backend_queue_size = metrics.Gauge(
        "backend_queued_queries",
        "Current number of queries queued in the router for a backend",
        "requests",
        ["backend"])

    # The event loop only keeps weak references to the tasks it runs, so a
    # task whose handle is dropped may be garbage-collected before it
    # finishes. Keep a strong reference on the instance for its lifetime.
    self._report_queue_lengths_task = (
        asyncio.get_event_loop().create_task(self.report_queue_lengths()))