Code Example #1
    def __init__(self, backend_tag):
        self.backend_tag = backend_tag
        # NOTE(simon): We have to do this because max_concurrent_queries
        # and the replica handles come from different long poll keys.
        self.max_concurrent_queries: int = 8
        self.in_flight_queries: Dict[ActorHandle, set] = dict()
        # The iterator used for load balancing among replicas. Using
        # itertools.cycle, we implement a round-robin policy, skipping
        # overloaded replicas.
        # NOTE(simon): We can make this more pluggable and consider different
        # policies like: min load, pick min of two replicas, pick replicas on
        # the same node.
        self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())

        # Used to unblock this replica set when it is waiting for free
        # replicas. A newly added replica or an updated max_concurrent_queries
        # value means a query waiting for a free replica may be unblocked.
        self.config_updated_event = asyncio.Event()
        self.num_queued_queries = 0
        self.num_queued_queries_gauge = metrics.Gauge(
            "serve_backend_queued_queries",
            description=(
                "The current number of queries to this backend waiting"
                " to be assigned to a replica."),
            tag_keys=("backend", "endpoint"))
        self.num_queued_queries_gauge.set_default_tags({
            "backend": self.backend_tag
        })
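
The itertools.cycle pattern above deserves a closer look: the scheduler walks the cycle and skips any replica whose in-flight set has already reached max_concurrent_queries. Below is a minimal, self-contained sketch of that selection loop under the fields defined above; try_assign_replica is an illustrative free function, not the actual Ray method.

import itertools

def try_assign_replica(replica_iterator, in_flight_queries,
                       max_concurrent_queries, query):
    # Visit each replica at most once per pass, in round-robin order.
    for _ in range(len(in_flight_queries)):
        replica = next(replica_iterator)
        if len(in_flight_queries[replica]) < max_concurrent_queries:
            # This replica has spare capacity; track the query against it.
            in_flight_queries[replica].add(query)
            return replica
    # Every replica is at capacity; the caller must wait and retry.
    return None

replicas = {"replica_a": set(), "replica_b": set()}
iterator = itertools.cycle(replicas.keys())
assert try_assign_replica(iterator, replicas, 8, "query-1") == "replica_a"

One caveat: itertools.cycle caches the items it has seen, so replicas added to the dict later never enter an existing cycle. That is why the iterator is rebuilt whenever replica membership changes (see the long-poll callbacks sketched after Example #3).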
Code Example #2
File: router.py Project: stjordanis/ray
    def __init__(
        self,
        deployment_name,
        event_loop: asyncio.AbstractEventLoop,
    ):
        self.deployment_name = deployment_name
        self.in_flight_queries: Dict[RunningReplicaInfo, set] = dict()
        # The iterator used for load balancing among replicas. Using
        # itertools.cycle, we implement a round-robin policy, skipping
        # overloaded replicas.
        # NOTE(simon): We can make this more pluggable and consider different
        # policies like: min load, pick min of two replicas, pick replicas on
        # the same node.
        self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())

        # Used to unblock this replica set when it is waiting for free
        # replicas. A newly added replica or an updated max_concurrent_queries
        # value means a query waiting for a free replica may be unblocked.

        # Python 3.8 deprecated the 'loop' parameter, and Python 3.10 removed
        # it altogether. Call accordingly.
        if sys.version_info >= (3, 10):
            self.config_updated_event = asyncio.Event()
        else:
            self.config_updated_event = asyncio.Event(loop=event_loop)

        self.num_queued_queries = 0
        self.num_queued_queries_gauge = metrics.Gauge(
            "serve_deployment_queued_queries",
            description=(
                "The current number of queries to this deployment waiting"
                " to be assigned to a replica."),
            tag_keys=("deployment", "endpoint"))
        self.num_queued_queries_gauge.set_default_tags(
            {"deployment": self.deployment_name})
Code Example #3
File: router.py Project: hngenc/ray
    def __init__(
            self,
            controller_handle,
            backend_tag,
            event_loop: asyncio.AbstractEventLoop,
    ):
        self.backend_tag = backend_tag
        # NOTE(simon): We have to do this because max_concurrent_queries
        # and the replica handles come from different long poll keys.
        self.max_concurrent_queries: int = 8
        self.in_flight_queries: Dict[ActorHandle, set] = dict()
        # The iterator used for load balancing among replicas. Using
        # itertools.cycle, we implement a round-robin policy, skipping
        # overloaded replicas.
        # NOTE(simon): We can make this more pluggable and consider different
        # policies like: min load, pick min of two replicas, pick replicas on
        # the same node.
        self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())

        # Used to unblock this replica set when it is waiting for free
        # replicas. A newly added replica or an updated max_concurrent_queries
        # value means a query waiting for a free replica may be unblocked.
        self.config_updated_event = asyncio.Event(loop=event_loop)
        self.num_queued_queries = 0
        self.num_queued_queries_gauge = metrics.Gauge(
            "serve_deployment_queued_queries",
            description=(
                "The current number of queries to this deployment waiting"
                " to be assigned to a replica."),
            tag_keys=("deployment", "endpoint"))
        self.num_queued_queries_gauge.set_default_tags({
            "deployment": self.backend_tag
        })

        self.long_poll_client = LongPollClient(
            controller_handle,
            {
                (LongPollNamespace.BACKEND_CONFIGS, backend_tag):
                self.set_max_concurrent_queries,
                (LongPollNamespace.REPLICA_HANDLES, backend_tag):
                self.update_worker_replicas,
            },
            call_in_event_loop=event_loop,
        )
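
Example #3 registers two long-poll callbacks but does not show their bodies. Reconstructed from the fields above, they plausibly look like the sketch below; the bodies are illustrative, not copied from Ray.

    def set_max_concurrent_queries(self, backend_config):
        # Invoked when the backend's config changes on the controller.
        if backend_config.max_concurrent_queries != self.max_concurrent_queries:
            self.max_concurrent_queries = backend_config.max_concurrent_queries
            # A raised limit may free capacity, so wake any waiting queries.
            self.config_updated_event.set()

    def update_worker_replicas(self, worker_replicas):
        # Invoked when the set of replica actor handles changes.
        added = set(worker_replicas) - set(self.in_flight_queries)
        removed = set(self.in_flight_queries) - set(worker_replicas)
        for replica in added:
            self.in_flight_queries[replica] = set()
        for replica in removed:
            del self.in_flight_queries[replica]
        # itertools.cycle caches what it has seen, so the iterator must be
        # rebuilt whenever replica membership changes.
        self.replica_iterator = itertools.cycle(self.in_flight_queries.keys())
        self.config_updated_event.set()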
Code Example #4
    def __init__(
        self,
        _callable: Callable,
        deployment_name: str,
        replica_tag: ReplicaTag,
        deployment_config: DeploymentConfig,
        user_config: Any,
        version: DeploymentVersion,
        is_function: bool,
        controller_handle: ActorHandle,
    ) -> None:
        self.deployment_config = deployment_config
        self.deployment_name = deployment_name
        self.replica_tag = replica_tag
        self.callable = _callable
        self.is_function = is_function
        self.user_config = user_config
        self.version = version
        self.rwlock = aiorwlock.RWLock()

        user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None)
        if not callable(user_health_check):

            def user_health_check():
                pass

        self.user_health_check = sync_to_async(user_health_check)

        self.num_ongoing_requests = 0

        self.request_counter = metrics.Counter(
            "serve_deployment_request_counter",
            description=("The number of queries that have been "
                         "processed in this replica."),
            tag_keys=("deployment", "replica"),
        )
        self.request_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.error_counter = metrics.Counter(
            "serve_deployment_error_counter",
            description=(
                "The number of exceptions that have occurred in this replica."
            ),
            tag_keys=("deployment", "replica"),
        )
        self.error_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.restart_counter = metrics.Counter(
            "serve_deployment_replica_starts",
            description=("The number of times this replica has been "
                         "restarted due to failure."),
            tag_keys=("deployment", "replica"),
        )
        self.restart_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "serve_deployment_processing_latency_ms",
            description="The latency for queries to be processed.",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("deployment", "replica"),
        )
        self.processing_latency_tracker.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "serve_replica_processing_queries",
            description="The current number of queries being processed.",
            tag_keys=("deployment", "replica"),
        )
        self.num_processing_items.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.restart_counter.inc()

        self._shutdown_wait_loop_s = deployment_config.graceful_shutdown_wait_loop_s

        if deployment_config.autoscaling_config:
            process_remote_func = controller_handle.record_autoscaling_metrics.remote
            config = deployment_config.autoscaling_config
            start_metrics_pusher(
                interval_s=config.metrics_interval_s,
                collection_callback=self._collect_autoscaling_metrics,
                metrics_process_func=process_remote_func,
            )

        # NOTE(edoakes): we used to recommend that users use the "ray" logger
        # and tagged the logs with metadata as below. We now recommend using
        # the "ray.serve" 'component logger' (as of Ray 1.13). This is left to
        # maintain backwards compatibility with users who were using the
        # existing logger. We can consider removing it in Ray 2.0.
        ray_logger = logging.getLogger("ray")
        for handler in ray_logger.handlers:
            handler.setFormatter(
                logging.Formatter(
                    handler.formatter._fmt +
                    f" component=serve deployment={self.deployment_name} "
                    f"replica={self.replica_tag}"))
Code Example #5
File: backend_worker.py Project: zhangbushi10/ray
    def __init__(self, _callable: Callable, backend_config: BackendConfig,
                 is_function: bool, controller_handle: ActorHandle) -> None:
        self.backend_tag = ray.serve.api.get_replica_context().backend_tag
        self.replica_tag = ray.serve.api.get_replica_context().replica_tag
        self.callable = _callable
        self.is_function = is_function

        self.config = backend_config
        self.batch_queue = _BatchQueue(self.config.max_batch_size or 1,
                                       self.config.batch_wait_timeout)
        self.reconfigure(self.config.user_config)

        self.num_ongoing_requests = 0

        self.request_counter = metrics.Counter(
            "serve_backend_request_counter",
            description=("The number of queries that have been "
                         "processed in this replica."),
            tag_keys=("backend", ))
        self.request_counter.set_default_tags({"backend": self.backend_tag})

        self.long_poll_client = LongPollAsyncClient(controller_handle, {
            LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
        })

        self.error_counter = metrics.Counter(
            "serve_backend_error_counter",
            description=("The number of exceptions that have "
                         "occurred in the backend."),
            tag_keys=("backend", ))
        self.error_counter.set_default_tags({"backend": self.backend_tag})

        self.restart_counter = metrics.Counter(
            "serve_backend_replica_starts",
            description=("The number of times this replica "
                         "has been restarted due to failure."),
            tag_keys=("backend", "replica"))
        self.restart_counter.set_default_tags({
            "backend": self.backend_tag,
            "replica": self.replica_tag
        })

        self.queuing_latency_tracker = metrics.Histogram(
            "serve_backend_queuing_latency_ms",
            description=("The latency for queries in the replica's queue "
                         "waiting to be processed or batched."),
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("backend", "replica"))
        self.queuing_latency_tracker.set_default_tags({
            "backend": self.backend_tag,
            "replica": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "serve_backend_processing_latency_ms",
            description="The latency for queries to be processed.",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("backend", "replica", "batch_size"))
        self.processing_latency_tracker.set_default_tags({
            "backend": self.backend_tag,
            "replica": self.replica_tag
        })

        self.num_queued_items = metrics.Gauge(
            "serve_replica_queued_queries",
            description=("The current number of queries queued in "
                         "the backend replicas."),
            tag_keys=("backend", "replica"))
        self.num_queued_items.set_default_tags({
            "backend": self.backend_tag,
            "replica": self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "serve_replica_processing_queries",
            description="The current number of queries being processed.",
            tag_keys=("backend", "replica"))
        self.num_processing_items.set_default_tags({
            "backend": self.backend_tag,
            "replica": self.replica_tag
        })

        self.restart_counter.inc()

        ray_logger = logging.getLogger("ray")
        for handler in ray_logger.handlers:
            handler.setFormatter(
                logging.Formatter(
                    handler.formatter._fmt +
                    f" component=serve backend={self.backend_tag} "
                    f"replica={self.replica_tag}"))

        asyncio.get_event_loop().create_task(self.main_loop())
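
Example #5 ends by scheduling self.main_loop() on the event loop. Here is a hedged sketch of what such a loop does with the batch queue and the metrics registered above; the wait_for_batch API, the invoke_batch helper, and the error handling are assumptions, and time is assumed to be imported.

    async def main_loop(self):
        while True:
            # Block until max_batch_size items arrive or batch_wait_timeout
            # elapses with at least one item queued (assumed _BatchQueue API).
            batch = await self.batch_queue.wait_for_batch()
            self.num_processing_items.set(len(batch))
            start = time.time()
            try:
                await self.invoke_batch(batch)  # hypothetical dispatch helper
                self.request_counter.inc(len(batch))
            except Exception:
                self.error_counter.inc()
            finally:
                latency_ms = (time.time() - start) * 1000
                self.processing_latency_tracker.observe(latency_ms)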
Code Example #6
File: replica.py Project: wuisawesome/ray
    def __init__(
        self,
        _callable: Callable,
        deployment_name: str,
        replica_tag: ReplicaTag,
        deployment_config: DeploymentConfig,
        user_config: Any,
        version: DeploymentVersion,
        is_function: bool,
        controller_handle: ActorHandle,
    ) -> None:
        self.deployment_config = deployment_config
        self.deployment_name = deployment_name
        self.replica_tag = replica_tag
        self.callable = _callable
        self.is_function = is_function
        self.user_config = user_config
        self.version = version
        self.rwlock = aiorwlock.RWLock()

        user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None)
        if not callable(user_health_check):

            def user_health_check():
                pass

        self.user_health_check = sync_to_async(user_health_check)

        self.num_ongoing_requests = 0

        self.request_counter = metrics.Counter(
            "serve_deployment_request_counter",
            description=("The number of queries that have been "
                         "processed in this replica."),
            tag_keys=("deployment", "replica"),
        )
        self.request_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.error_counter = metrics.Counter(
            "serve_deployment_error_counter",
            description=("The number of exceptions that have "
                         "occurred in this replica."),
            tag_keys=("deployment", "replica"),
        )
        self.error_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.restart_counter = metrics.Counter(
            "serve_deployment_replica_starts",
            description=("The number of times this replica "
                         "has been restarted due to failure."),
            tag_keys=("deployment", "replica"),
        )
        self.restart_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "serve_deployment_processing_latency_ms",
            description="The latency for queries to be processed.",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("deployment", "replica"),
        )
        self.processing_latency_tracker.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "serve_replica_processing_queries",
            description="The current number of queries being processed.",
            tag_keys=("deployment", "replica"),
        )
        self.num_processing_items.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.restart_counter.inc()

        self._shutdown_wait_loop_s = deployment_config.graceful_shutdown_wait_loop_s

        if deployment_config.autoscaling_config:
            config = deployment_config.autoscaling_config
            start_metrics_pusher(
                interval_s=config.metrics_interval_s,
                collection_callback=self._collect_autoscaling_metrics,
                controller_handle=controller_handle,
            )

        ray_logger = logging.getLogger("ray")
        for handler in ray_logger.handlers:
            handler.setFormatter(
                logging.Formatter(
                    handler.formatter._fmt +
                    f" component=serve deployment={self.deployment_name} "
                    f"replica={self.replica_tag}"))
Code Example #7
    def __init__(self, _callable: Callable, backend_config: BackendConfig,
                 is_function: bool, controller_handle: ActorHandle) -> None:
        self.backend_tag = ray.serve.api.get_replica_context().deployment
        self.replica_tag = ray.serve.api.get_replica_context().replica_tag
        self.callable = _callable
        self.is_function = is_function

        self.config = backend_config

        self.num_ongoing_requests = 0

        self.request_counter = metrics.Counter(
            "serve_deployment_request_counter",
            description=("The number of queries that have been "
                         "processed in this replica."),
            tag_keys=("deployment", "replica"))
        self.request_counter.set_default_tags({
            "deployment": self.backend_tag,
            "replica": self.replica_tag
        })

        self.loop = asyncio.get_event_loop()
        self.long_poll_client = LongPollClient(
            controller_handle,
            {
                (LongPollNamespace.BACKEND_CONFIGS, self.backend_tag):
                self._update_backend_configs,
            },
            call_in_event_loop=self.loop,
        )

        self.error_counter = metrics.Counter(
            "serve_deployment_error_counter",
            description=("The number of exceptions that have "
                         "occurred in this replica."),
            tag_keys=("deployment", "replica"))
        self.error_counter.set_default_tags({
            "deployment": self.backend_tag,
            "replica": self.replica_tag
        })

        self.restart_counter = metrics.Counter(
            "serve_deployment_replica_starts",
            description=("The number of times this replica "
                         "has been restarted due to failure."),
            tag_keys=("deployment", "replica"))
        self.restart_counter.set_default_tags({
            "deployment": self.backend_tag,
            "replica": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "serve_deployment_processing_latency_ms",
            description="The latency for queries to be processed.",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("deployment", "replica"))
        self.processing_latency_tracker.set_default_tags({
            "deployment": self.backend_tag,
            "replica": self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "serve_replica_processing_queries",
            description="The current number of queries being processed.",
            tag_keys=("deployment", "replica"))
        self.num_processing_items.set_default_tags({
            "deployment": self.backend_tag,
            "replica": self.replica_tag
        })

        self.restart_counter.inc()

        ray_logger = logging.getLogger("ray")
        for handler in ray_logger.handlers:
            handler.setFormatter(
                logging.Formatter(
                    handler.formatter._fmt +
                    f" component=serve deployment={self.backend_tag} "
                    f"replica={self.replica_tag}"))
Code Example #8
    def __init__(self, backend_tag: str, replica_tag: str, _callable: Callable,
                 backend_config: BackendConfig, is_function: bool) -> None:
        self.backend_tag = backend_tag
        self.replica_tag = replica_tag
        self.callable = _callable
        self.is_function = is_function

        self.config = backend_config
        self.batch_queue = BatchQueue(self.config.max_batch_size or 1,
                                      self.config.batch_wait_timeout)

        self.num_ongoing_requests = 0

        self.request_counter = metrics.Count(
            "backend_request_counter",
            description=("Number of queries that have been "
                         "processed in this replica"),
            tag_keys=("backend", ))
        self.request_counter.set_default_tags({"backend": self.backend_tag})

        self.error_counter = metrics.Count(
            "backend_error_counter",
            description=("Number of exceptions that have "
                         "occurred in the backend"),
            tag_keys=("backend", ))
        self.error_counter.set_default_tags({"backend": self.backend_tag})

        self.restart_counter = metrics.Count(
            "backend_worker_starts",
            description=("The number of time this replica workers "
                         "has been restarted due to failure."),
            tag_keys=("backend", "replica_tag"))
        self.restart_counter.set_default_tags({
            "backend": self.backend_tag,
            "replica_tag": self.replica_tag
        })

        self.queuing_latency_tracker = metrics.Histogram(
            "backend_queuing_latency_ms",
            description=(
                "The latency for queries waiting in the replica's queue "
                "waiting to be processed or batched."),
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("backend", "replica_tag"))
        self.queuing_latency_tracker.set_default_tags({
            "backend": self.backend_tag,
            "replica_tag": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "backend_processing_latency_ms",
            description="The latency for queries to be processed",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("backend", "replica_tag", "batch_size"))
        self.processing_latency_tracker.set_default_tags({
            "backend": self.backend_tag,
            "replica_tag": self.replica_tag
        })

        self.num_queued_items = metrics.Gauge(
            "replica_queued_queries",
            description=("Current number of queries queued in the "
                         "the backend replicas"),
            tag_keys=("backend", "replica_tag"))
        self.num_queued_items.set_default_tags({
            "backend": self.backend_tag,
            "replica_tag": self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "replica_processing_queries",
            description="Current number of queries being processed",
            tag_keys=("backend", "replica_tag"))
        self.num_processing_items.set_default_tags({
            "backend": self.backend_tag,
            "replica_tag": self.replica_tag
        })

        self.restart_counter.record(1)

        asyncio.get_event_loop().create_task(self.main_loop())
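
Note the older metrics API in this example: counters are created with metrics.Count and incremented with record(1), whereas the later snippets (Examples #4 through #7) use metrics.Counter with inc(). The two are otherwise used the same way here.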
Code Example #9
    async def setup(self, name, controller_name):
        # Note: Several queues are used in the router:
        # - When a request comes in, it is placed inside its corresponding
        #   endpoint_queue.
        # - The endpoint_queue is dequeued during a flush operation, which
        #   moves the queries to the backend buffer_queue. Here we match a
        #   request for an endpoint to a backend given some policy.
        # - The worker_queue is used to collect idle actor handles. These
        #   handles are dequeued during the second stage of the flush
        #   operation, which assigns queries in buffer_queue to actor handles.

        self.name = name

        # -- Queues -- #

        # endpoint_name -> request queue
        # We use FIFO (left to right) ordering. The new items should be added
        # using appendleft. Old items should be removed via pop().
        self.endpoint_queues: DefaultDict[str, Deque[Query]] = defaultdict(deque)
        # backend_name -> worker replica tag queue
        self.worker_queues: DefaultDict[str, Deque[str]] = defaultdict(deque)
        # backend_name -> worker payload queue
        self.backend_queues = defaultdict(deque)

        # -- Metadata -- #

        # endpoint_name -> traffic_policy
        self.traffic = dict()
        # backend_name -> backend_config
        self.backend_info = dict()
        # replica tag -> worker_handle
        self.replicas = dict()
        # backend_name -> replica_tag -> concurrent queries counter
        self.queries_counter = defaultdict(lambda: defaultdict(int))

        # -- Synchronization -- #

        # This lock guarantees that only one flush operation can happen at a
        # time. Without the lock, multiple flush operations could pop from the
        # same buffer_queue and worker_queue and deadlock: for example, one
        # operation holding the only query while the other holds the only idle
        # replica. Additionally, allowing only one flush operation at a time
        # simplifies the design of custom queuing and batching policies.
        self.flush_lock = asyncio.Lock()

        # -- State Restoration -- #
        # Fetch the worker handles, traffic policies, and backend configs from
        # the controller. We use a "pull-based" approach instead of pushing
        # them from the controller so that the router can transparently recover
        # from failure.
        self.controller = ray.get_actor(controller_name)

        traffic_policies = ray.get(
            self.controller.get_traffic_policies.remote())
        for endpoint, traffic_policy in traffic_policies.items():
            await self.set_traffic(endpoint, traffic_policy)

        backend_dict = ray.get(self.controller.get_all_worker_handles.remote())
        for backend_tag, replica_dict in backend_dict.items():
            for replica_tag, worker in replica_dict.items():
                await self.add_new_worker(backend_tag, replica_tag, worker)

        backend_configs = ray.get(self.controller.get_backend_configs.remote())
        for backend, backend_config in backend_configs.items():
            await self.set_backend_config(backend, backend_config)

        # -- Metrics Registration -- #
        self.num_router_requests = metrics.Count(
            "num_router_requests",
            description="Number of requests processed by the router.",
            tag_keys=("endpoint", ))
        self.num_error_endpoint_requests = metrics.Count(
            "num_error_endpoint_requests",
            description=(
                "Number of requests that errored when getting results "
                "for the endpoint."),
            tag_keys=("endpoint", ))
        self.num_error_backend_requests = metrics.Count(
            "num_error_backend_requests",
            description=("Number of requests that errored when getting result "
                         "from the backend."),
            tag_keys=("backend", ))

        self.backend_queue_size = metrics.Gauge(
            "backend_queued_queries",
            description=("Current number of queries queued "
                         "in the router for a backend"),
            tag_keys=("backend", ))

        asyncio.get_event_loop().create_task(self.report_queue_lengths())
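
The comments at the top of Example #9 describe a two-stage flush over the endpoint, backend, and worker queues, guarded by flush_lock. Below is a hedged sketch of that flush, reconstructed from those comments; flush_endpoint_queues, _select_backend, and handle_request are illustrative names, not the actual Ray methods.

    async def flush_endpoint_queues(self):
        async with self.flush_lock:
            # Stage 1: drain each endpoint queue into a backend buffer queue
            # according to the endpoint's traffic policy.
            for endpoint, queue in self.endpoint_queues.items():
                while queue:
                    query = queue.pop()  # FIFO: appendleft() in, pop() out
                    backend = self._select_backend(endpoint)  # policy hook
                    self.backend_queues[backend].appendleft(query)
            # Stage 2: pair buffered queries with idle worker replicas.
            for backend, buffer_queue in self.backend_queues.items():
                worker_queue = self.worker_queues[backend]
                while buffer_queue and worker_queue:
                    replica_tag = worker_queue.pop()
                    query = buffer_queue.pop()
                    self.queries_counter[backend][replica_tag] += 1
                    worker = self.replicas[replica_tag]
                    worker.handle_request.remote(query)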