Example 1
0
    async def invoke_single(self, request_item: Query) -> Any:
        """Run a single request through the user's method and return the result.

        Exceptions raised by user code are captured and returned wrapped as a
        Ray error object rather than propagated, so this coroutine always
        returns. Processing latency is recorded regardless of outcome.
        """
        logger.debug("Replica {} started executing request {}".format(
            self.replica_tag, request_item.metadata.request_id))
        args, kwargs = parse_request_item(request_item)

        start_time = time.time()
        method_to_call = None
        try:
            runner_method = self.get_runner_method(request_item)
            # Wrap so sync and async user methods can be awaited uniformly.
            method_to_call = sync_to_async(runner_method)
            # Zero-parameter methods are invoked without the request payload.
            takes_arguments = bool(
                inspect.signature(runner_method).parameters)
            if takes_arguments:
                result = await method_to_call(*args, **kwargs)
            else:
                result = await method_to_call()

            result = await self.ensure_serializable_response(result)
            self.request_counter.inc()
        except Exception as exc:
            import os

            # RAY_PDB enables interactive post-mortem debugging of failures.
            if "RAY_PDB" in os.environ:
                ray.util.pdb.post_mortem()
            function_name = ("unknown"
                             if method_to_call is None else
                             method_to_call.__name__)
            result = wrap_to_ray_error(function_name, exc)
            self.error_counter.inc()

        latency_ms = (time.time() - start_time) * 1000
        self.processing_latency_tracker.observe(latency_ms)

        return result
Example 2
0
    async def invoke_single(self, request_item: Query) -> Any:
        """Execute one request against the user method, wrapping failures.

        Any exception from user code is converted into a Ray error object and
        returned in place of a normal result; latency is always recorded.
        """
        logger.debug("Replica {} started executing request {}".format(
            self.replica_tag, request_item.metadata.request_id))
        args, kwargs = parse_request_item(request_item)

        start_time = time.time()
        method_to_call = None
        try:
            runner_method = self.get_runner_method(request_item)
            method_to_call = sync_to_async(runner_method)
            result = await method_to_call(*args, **kwargs)
            result = await self.ensure_serializable_response(result)
            self.request_counter.inc()
        except Exception as exc:
            import os
            # RAY_PDB enables interactive post-mortem debugging of failures.
            if "RAY_PDB" in os.environ:
                ray.util.pdb.post_mortem()
            function_name = ("unknown"
                             if method_to_call is None else
                             method_to_call.__name__)
            result = wrap_to_ray_error(function_name, exc)
            self.error_counter.inc()

        latency_ms = (time.time() - start_time) * 1000
        self.processing_latency_tracker.observe(latency_ms)

        return result
Example 3
0
    async def invoke_batch(self, request_item_list: List[Query]) -> List[Any]:
        """Execute a batch of requests with a single call to the user method.

        All queries in the batch must target the same runner method. On any
        failure, every slot of the returned list is filled with the same
        wrapped error so the output length always equals the batch size.

        Args:
            request_item_list: The queries to execute as one batch.

        Returns:
            A list of results (or wrapped errors) of length len(request_item_list).
        """
        args = []
        call_methods = set()
        batch_size = len(request_item_list)

        # Construct the batch of requests
        for item in request_item_list:
            logger.debug("Replica {} started executing request {}".format(
                self.replica_tag, item.metadata.request_id))
            args.append(parse_request_item(item))
            call_methods.add(self.get_runner_method(item))

        timing_start = time.time()
        # Initialized before the try block so the except handler can safely
        # reference it even when the failure happens before assignment
        # (e.g. the mixed-calling-methods check below). Previously the
        # handler read call_method.__name__ unconditionally, raising
        # UnboundLocalError and masking the original exception.
        call_method = None
        try:
            if len(call_methods) != 1:
                raise RayServeException(
                    f"Queries contain mixed calling methods: {call_methods}. "
                    "Please only send the same type of requests in batching "
                    "mode.")

            self.request_counter.inc(batch_size)

            call_method = sync_to_async(call_methods.pop())
            result_list = await call_method(args)

            # The user method must return an ordered iterable (dicts and sets
            # have no reliable ordering to map results back onto requests).
            if not isinstance(result_list, Iterable) or isinstance(
                    result_list, (dict, set)):
                error_message = ("RayServe expects an ordered iterable object "
                                 "but the replica returned a {}".format(
                                     type(result_list)))
                raise RayServeException(error_message)

            # Normalize the result into a list type. This operation is fast
            # in Python because it doesn't copy anything.
            result_list = list(result_list)

            if (len(result_list) != batch_size):
                error_message = ("Worker doesn't preserve batch size. The "
                                 "input has length {} but the returned list "
                                 "has length {}. Please return a list of "
                                 "results with length equal to the batch size"
                                 ".".format(batch_size, len(result_list)))
                raise RayServeException(error_message)
            for i, result in enumerate(result_list):
                result_list[i] = (await
                                  self.ensure_serializable_response(result))
        except Exception as e:
            function_name = "unknown"
            if call_method is not None:
                function_name = call_method.__name__
            wrapped_exception = wrap_to_ray_error(function_name, e)
            self.error_counter.inc()
            # Replicate the error so callers still get one entry per request.
            result_list = [wrapped_exception for _ in range(batch_size)]

        latency_ms = (time.time() - timing_start) * 1000
        self.processing_latency_tracker.observe(
            latency_ms, tags={"batch_size": str(batch_size)})

        return result_list
Example 4
0
 async def reconfigure(self, user_config) -> None:
     """Forward a new user_config to the backend's reconfigure hook.

     No-op when user_config is falsy. Raises ValueError for function
     backends and RayServeException when the backend class lacks the
     reconfigure method.
     """
     if not user_config:
         return
     if self.is_function:
         raise ValueError(
             "backend_def must be a class to use user_config")
     if not hasattr(self.callable, BACKEND_RECONFIGURE_METHOD):
         raise RayServeException("user_config specified but backend " +
                                 self.backend_tag + " missing " +
                                 BACKEND_RECONFIGURE_METHOD + " method")
     # User hooks may be sync; wrap so either kind can be awaited.
     await sync_to_async(
         getattr(self.callable, BACKEND_RECONFIGURE_METHOD))(user_config)
Example 5
0
 async def reconfigure(self, user_config: Any):
     """Apply a new user_config to this replica and bump its version.

     Args:
         user_config: Arbitrary user-supplied config passed to the
             deployment's reconfigure method.

     Raises:
         ValueError: if the deployment is a plain function (user_config
             requires a class-based deployment).
         RayServeException: if the callable does not define the
             reconfigure method.
     """
     # Validate BEFORE mutating replica state. Previously user_config and
     # version were updated first, so an invalid config left the replica
     # recording a config/version that was never actually applied.
     if self.is_function:
         raise ValueError(
             "deployment_def must be a class to use user_config")
     elif not hasattr(self.callable, RECONFIGURE_METHOD):
         raise RayServeException("user_config specified but deployment " +
                                 self.deployment_name + " missing " +
                                 RECONFIGURE_METHOD + " method")

     self.user_config = user_config
     self.version = DeploymentVersion(self.version.code_version,
                                      user_config=user_config)

     reconfigure_method = sync_to_async(
         getattr(self.callable, RECONFIGURE_METHOD))
     await reconfigure_method(user_config)
Example 6
0
    async def invoke_single(self, request_item: Query) -> Tuple[Any, bool]:
        """Executes the provided request on this replica.

        Returns the user-provided output and a boolean indicating if the
        request succeeded (user code didn't raise an exception).
        """
        logger.debug(
            "Replica {} started executing request {}".format(
                self.replica_tag, request_item.metadata.request_id
            )
        )
        args, kwargs = parse_request_item(request_item)

        method_to_call = None
        success = True
        try:
            runner_method = self.get_runner_method(request_item)
            # Wrap so sync and async user methods can be awaited uniformly.
            method_to_call = sync_to_async(runner_method)
            result = None
            if len(inspect.signature(runner_method).parameters) > 0:
                result = await method_to_call(*args, **kwargs)
            else:
                # When access via http http_arg_is_pickled with no args:
                # args = (<starlette.requests.Request object at 0x7fe900694cc0>,)
                # When access via python with no args:
                # args = ()
                if len(args) == 1 and isinstance(args[0], starlette.requests.Request):
                    # The method doesn't take in anything, including the request
                    # information, so we pass nothing into it
                    result = await method_to_call()
                else:
                    # Will throw due to signature mismatch if user attempts to
                    # call with non-empty args
                    result = await method_to_call(*args, **kwargs)

            # Make sure the returned object can be sent through Ray before
            # handing it back to the caller.
            result = await self.ensure_serializable_response(result)
            self.request_counter.inc()
        except Exception as e:
            logger.exception(f"Request failed due to {type(e).__name__}:")
            success = False
            # RAY_PDB enables interactive post-mortem debugging of failures.
            if "RAY_PDB" in os.environ:
                ray.util.pdb.post_mortem()
            function_name = "unknown"
            if method_to_call is not None:
                function_name = method_to_call.__name__
            # Return the wrapped error as the result (with success=False)
            # instead of raising out of this coroutine.
            result = wrap_to_ray_error(function_name, e)
            self.error_counter.inc()

        return result, success
Example 7
0
    async def invoke_single(self, request_item: Query) -> Any:
        """Execute one request, via the ASGI protocol or the runner method.

        Returns the result, or a wrapped Ray error object if user code
        raised. Latency is recorded either way.
        """
        logger.debug("Replica {} started executing request {}".format(
            self.replica_tag, request_item.metadata.request_id))
        arg = parse_request_item(request_item)

        start = time.time()
        method_to_call = None
        try:
            # TODO(simon): Split this section out when invoke_batch is removed.
            if self.config.internal_metadata.is_asgi_app:
                # ASGI-app backends (e.g. FastAPI) are driven through the
                # ASGI protocol directly rather than a runner method.
                request: Request = arg
                scope = request.scope
                root_path = self.config.internal_metadata.path_prefix

                # The incoming scope["path"] contains prefixed path and it
                # won't be stripped by FastAPI.
                request.scope["path"] = scope["path"].replace(root_path, "", 1)
                # root_path is used such that the reverse look up and
                # redirection works.
                request.scope["root_path"] = root_path

                # Collect the ASGI response events and convert them into a
                # single Starlette response object.
                sender = ASGIHTTPSender()
                await self.callable._serve_asgi_app(
                    request.scope,
                    request._receive,
                    sender,
                )
                result = sender.build_starlette_response()
            else:
                method_to_call = sync_to_async(
                    self.get_runner_method(request_item))
                result = await method_to_call(arg)
            result = await self.ensure_serializable_response(result)
            self.request_counter.inc()
        except Exception as e:
            import os
            # RAY_PDB enables interactive post-mortem debugging of failures.
            if "RAY_PDB" in os.environ:
                ray.util.pdb.post_mortem()
            # method_to_call stays None on the ASGI path or if resolution
            # failed, so fall back to "unknown".
            function_name = "unknown"
            if method_to_call is not None:
                function_name = method_to_call.__name__
            result = wrap_to_ray_error(function_name, e)
            self.error_counter.inc()

        latency_ms = (time.time() - start) * 1000
        self.processing_latency_tracker.observe(latency_ms,
                                                tags={"batch_size": "1"})

        return result
Example 8
0
    async def invoke_single(self, request_item: Query) -> Any:
        """Execute one request on this replica.

        ASGI-app backends are driven through the ASGI protocol directly;
        other backends dispatch to the resolved runner method. User-code
        exceptions are returned wrapped as Ray error objects.
        """
        logger.debug("Replica {} started executing request {}".format(
            self.replica_tag, request_item.metadata.request_id))
        args, kwargs = parse_request_item(request_item)

        start_time = time.time()
        method_to_call = None
        try:
            # TODO(simon): Split this section out when invoke_batch is removed.
            if self.config.internal_metadata.is_asgi_app:
                asgi_request: Request = args[0]
                sender = ASGIHTTPSender()
                await self.callable._serve_asgi_app(
                    asgi_request.scope,
                    asgi_request._receive,
                    sender,
                )
                result = sender.build_starlette_response()
            else:
                method_to_call = sync_to_async(
                    self.get_runner_method(request_item))
                result = await method_to_call(*args, **kwargs)
            result = await self.ensure_serializable_response(result)
            self.request_counter.inc()
        except Exception as exc:
            import os
            # RAY_PDB enables interactive post-mortem debugging of failures.
            if "RAY_PDB" in os.environ:
                ray.util.pdb.post_mortem()
            function_name = ("unknown"
                             if method_to_call is None else
                             method_to_call.__name__)
            result = wrap_to_ray_error(function_name, exc)
            self.error_counter.inc()

        latency_ms = (time.time() - start_time) * 1000
        self.processing_latency_tracker.observe(latency_ms)

        return result
Example 9
0
    async def invoke_single(self, request_item: Query) -> Any:
        """Execute a single request and return the (serializable) result.

        Exceptions raised while resolving or running the user method are
        wrapped into a Ray error object and returned instead of raised.
        """
        logger.debug("Replica {} started executing request {}".format(
            self.replica_tag, request_item.metadata.request_id))
        arg = parse_request_item(request_item)

        start = time.time()
        # Resolve the runner method INSIDE the try block so that failures in
        # get_runner_method (e.g. an unknown method name) are wrapped and
        # returned like any other user-code error instead of escaping this
        # coroutine unhandled — consistent with the other invoke_single
        # variants.
        method_to_call = None
        try:
            method_to_call = sync_to_async(
                self.get_runner_method(request_item))
            result = await method_to_call(arg)
            result = await self.ensure_serializable_response(result)
            self.request_counter.inc()
        except Exception as e:
            import os
            # RAY_PDB enables interactive post-mortem debugging of failures.
            if "RAY_PDB" in os.environ:
                ray.util.pdb.post_mortem()
            function_name = "unknown"
            if method_to_call is not None:
                function_name = method_to_call.__name__
            result = wrap_to_ray_error(function_name, e)
            self.error_counter.inc()

        latency_ms = (time.time() - start) * 1000
        self.processing_latency_tracker.observe(
            latency_ms, tags={"batch_size": "1"})

        return result
Example 10
0
    def __init__(
        self,
        _callable: Callable,
        deployment_name: str,
        replica_tag: ReplicaTag,
        deployment_config: DeploymentConfig,
        user_config: Any,
        version: DeploymentVersion,
        is_function: bool,
        controller_handle: ActorHandle,
    ) -> None:
        """Initialize replica state, metrics, and autoscaling reporting.

        Args:
            _callable: The instantiated user deployment (class instance or
                function) this replica runs requests against.
            deployment_name: Name of the deployment this replica belongs to.
            replica_tag: Unique tag identifying this replica.
            deployment_config: Config providing graceful-shutdown and
                autoscaling settings.
            user_config: User-supplied config object stored on the replica.
            version: Deployment version associated with this replica.
            is_function: Whether the deployment is a plain function rather
                than a class.
            controller_handle: Actor handle whose record_autoscaling_metrics
                remote method receives this replica's metrics.
        """
        self.deployment_config = deployment_config
        self.deployment_name = deployment_name
        self.replica_tag = replica_tag
        self.callable = _callable
        self.is_function = is_function
        self.user_config = user_config
        self.version = version
        self.rwlock = aiorwlock.RWLock()

        # Fall back to a no-op health check when the user callable doesn't
        # define one (or defines it as a non-callable attribute).
        user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None)
        if not callable(user_health_check):

            def user_health_check():
                pass

        self.user_health_check = sync_to_async(user_health_check)

        self.num_ongoing_requests = 0

        # Per-replica metrics; all are tagged with deployment and replica
        # via default tags so call sites don't need to pass them.
        self.request_counter = metrics.Counter(
            "serve_deployment_request_counter",
            description=
            ("The number of queries that have been processed in this replica."
             ),
            tag_keys=("deployment", "replica"),
        )
        self.request_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.error_counter = metrics.Counter(
            "serve_deployment_error_counter",
            description=(
                "The number of exceptions that have occurred in this replica."
            ),
            tag_keys=("deployment", "replica"),
        )
        self.error_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.restart_counter = metrics.Counter(
            "serve_deployment_replica_starts",
            description=
            ("The number of times this replica has been restarted due to failure."
             ),
            tag_keys=("deployment", "replica"),
        )
        self.restart_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "serve_deployment_processing_latency_ms",
            description="The latency for queries to be processed.",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("deployment", "replica"),
        )
        self.processing_latency_tracker.set_default_tags({
            "deployment":
            self.deployment_name,
            "replica":
            self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "serve_replica_processing_queries",
            description="The current number of queries being processed.",
            tag_keys=("deployment", "replica"),
        )
        self.num_processing_items.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        # Counted once per construction: every (re)start bumps the metric.
        self.restart_counter.inc()

        self._shutdown_wait_loop_s = deployment_config.graceful_shutdown_wait_loop_s

        # When autoscaling is enabled, periodically push this replica's
        # metrics to the controller.
        if deployment_config.autoscaling_config:
            process_remote_func = controller_handle.record_autoscaling_metrics.remote
            config = deployment_config.autoscaling_config
            start_metrics_pusher(
                interval_s=config.metrics_interval_s,
                collection_callback=self._collect_autoscaling_metrics,
                metrics_process_func=process_remote_func,
            )

        # NOTE(edoakes): we used to recommend that users use the "ray" logger
        # and tagged the logs with metadata as below. We now recommend using
        # the "ray.serve" 'component logger' (as of Ray 1.13). This is left to
        # maintain backwards compatibility with users who were using the
        # existing logger. We can consider removing it in Ray 2.0.
        ray_logger = logging.getLogger("ray")
        for handler in ray_logger.handlers:
            handler.setFormatter(
                logging.Formatter(
                    handler.formatter._fmt +
                    f" component=serve deployment={self.deployment_name} "
                    f"replica={self.replica_tag}"))
Example 11
0
    def __init__(
        self,
        _callable: Callable,
        deployment_name: str,
        replica_tag: ReplicaTag,
        deployment_config: DeploymentConfig,
        user_config: Any,
        version: DeploymentVersion,
        is_function: bool,
        controller_handle: ActorHandle,
    ) -> None:
        """Set up replica state, per-replica metrics, and metrics pushing.

        Args:
            _callable: The instantiated user deployment (class instance or
                function) this replica runs requests against.
            deployment_name: Name of the deployment this replica belongs to.
            replica_tag: Unique tag identifying this replica.
            deployment_config: Config providing graceful-shutdown and
                autoscaling settings.
            user_config: User-supplied config object stored on the replica.
            version: Deployment version associated with this replica.
            is_function: Whether the deployment is a plain function rather
                than a class.
            controller_handle: Actor handle passed to the metrics pusher for
                autoscaling reporting.
        """
        self.deployment_config = deployment_config
        self.deployment_name = deployment_name
        self.replica_tag = replica_tag
        self.callable = _callable
        self.is_function = is_function
        self.user_config = user_config
        self.version = version
        self.rwlock = aiorwlock.RWLock()

        # Fall back to a no-op health check when the user callable doesn't
        # define one (or defines it as a non-callable attribute).
        user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None)
        if not callable(user_health_check):

            def user_health_check():
                pass

        self.user_health_check = sync_to_async(user_health_check)

        self.num_ongoing_requests = 0

        # Per-replica metrics; default tags mean call sites can inc/observe
        # without repeating deployment/replica identifiers.
        self.request_counter = metrics.Counter(
            "serve_deployment_request_counter",
            description=("The number of queries that have been "
                         "processed in this replica."),
            tag_keys=("deployment", "replica"),
        )
        self.request_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.error_counter = metrics.Counter(
            "serve_deployment_error_counter",
            description=("The number of exceptions that have "
                         "occurred in this replica."),
            tag_keys=("deployment", "replica"),
        )
        self.error_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.restart_counter = metrics.Counter(
            "serve_deployment_replica_starts",
            description=("The number of times this replica "
                         "has been restarted due to failure."),
            tag_keys=("deployment", "replica"),
        )
        self.restart_counter.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        self.processing_latency_tracker = metrics.Histogram(
            "serve_deployment_processing_latency_ms",
            description="The latency for queries to be processed.",
            boundaries=DEFAULT_LATENCY_BUCKET_MS,
            tag_keys=("deployment", "replica"),
        )
        self.processing_latency_tracker.set_default_tags({
            "deployment":
            self.deployment_name,
            "replica":
            self.replica_tag
        })

        self.num_processing_items = metrics.Gauge(
            "serve_replica_processing_queries",
            description="The current number of queries being processed.",
            tag_keys=("deployment", "replica"),
        )
        self.num_processing_items.set_default_tags({
            "deployment": self.deployment_name,
            "replica": self.replica_tag
        })

        # Counted once per construction: every (re)start bumps the metric.
        self.restart_counter.inc()

        self._shutdown_wait_loop_s = deployment_config.graceful_shutdown_wait_loop_s

        # When autoscaling is enabled, periodically push this replica's
        # metrics to the controller.
        if deployment_config.autoscaling_config:
            config = deployment_config.autoscaling_config
            start_metrics_pusher(
                interval_s=config.metrics_interval_s,
                collection_callback=self._collect_autoscaling_metrics,
                controller_handle=controller_handle,
            )

        # Tag the legacy "ray" logger's output with this replica's identity
        # for backwards compatibility with users of that logger.
        ray_logger = logging.getLogger("ray")
        for handler in ray_logger.handlers:
            handler.setFormatter(
                logging.Formatter(
                    handler.formatter._fmt +
                    f" component=serve deployment={self.deployment_name} "
                    f"replica={self.replica_tag}"))