async def invoke_single(self, request_item: Query) -> Any:
    """Run a single request through this replica's runner method.

    The parsed positional and keyword arguments are forwarded to the
    runner method, and its output is passed through
    ``ensure_serializable_response``.  Any exception is converted with
    ``wrap_to_ray_error`` and returned instead of being raised.  The
    elapsed time in milliseconds is always reported to the latency
    tracker.
    """
    message = "Replica {} started executing request {}".format(
        self.replica_tag, request_item.metadata.request_id)
    logger.debug(message)

    args, kwargs = parse_request_item(request_item)
    started_at = time.time()

    # Stays None until the runner method has been resolved and wrapped, so
    # the error path can tell whether a method name is available.
    async_method = None
    try:
        runner = self.get_runner_method(request_item)
        async_method = sync_to_async(runner)
        raw_result = await async_method(*args, **kwargs)
        result = await self.ensure_serializable_response(raw_result)
        self.request_counter.inc()
    except Exception as e:
        import os
        if "RAY_PDB" in os.environ:
            ray.util.pdb.post_mortem()
        function_name = ("unknown" if async_method is None else
                         async_method.__name__)
        result = wrap_to_ray_error(function_name, e)
        self.error_counter.inc()

    elapsed_ms = (time.time() - started_at) * 1000
    self.processing_latency_tracker.observe(elapsed_ms)
    return result
async def invoke_single(self, request_item: Query) -> Any:
    """Run a single request through this replica's runner method.

    A runner method whose signature declares no parameters is awaited
    with no arguments; otherwise the parsed args and kwargs are
    forwarded.  The output goes through ``ensure_serializable_response``.
    Any exception is converted with ``wrap_to_ray_error`` and returned.
    The elapsed time in milliseconds is always reported to the latency
    tracker.
    """
    message = "Replica {} started executing request {}".format(
        self.replica_tag, request_item.metadata.request_id)
    logger.debug(message)

    args, kwargs = parse_request_item(request_item)
    started_at = time.time()

    # Stays None until the runner method has been resolved and wrapped.
    async_method = None
    try:
        runner_method = self.get_runner_method(request_item)
        async_method = sync_to_async(runner_method)
        declared_params = inspect.signature(runner_method).parameters
        if len(declared_params) == 0:
            # The method accepts nothing at all (not even the request),
            # so it is invoked without arguments.
            raw_result = await async_method()
        else:
            raw_result = await async_method(*args, **kwargs)
        result = await self.ensure_serializable_response(raw_result)
        self.request_counter.inc()
    except Exception as e:
        import os
        if "RAY_PDB" in os.environ:
            ray.util.pdb.post_mortem()
        function_name = ("unknown" if async_method is None else
                         async_method.__name__)
        result = wrap_to_ray_error(function_name, e)
        self.error_counter.inc()

    elapsed_ms = (time.time() - started_at) * 1000
    self.processing_latency_tracker.observe(elapsed_ms)
    return result
async def invoke_single(self, request_item: Query) -> Any:
    """Execute one request and record request, error and latency metrics.

    ``serve_context.web`` is set from the parsed request.  Positional
    arguments are dropped when ``has_positional_args`` reports that the
    runner method takes none.  An exception from the call is converted
    with ``wrap_to_ray_error`` and returned; the context is reset either
    way.
    """
    positional, kwargs, is_web_context = parse_request_item(request_item)
    serve_context.web = is_web_context

    runner = self.get_runner_method(request_item)
    if not self.has_positional_args(runner):
        positional = []
    async_runner = ensure_async(runner)

    started_at = time.time()
    try:
        result = await async_runner(*positional, **kwargs)
        self.request_counter.record(1, {"backend": self.backend_tag})
    except Exception as e:
        result = wrap_to_ray_error(e)
        self.error_counter.record(1, {"backend": self.backend_tag})
    finally:
        self._reset_context()

    elapsed_ms = (time.time() - started_at) * 1000
    latency_tags = {
        "backend": self.backend_tag,
        "replica": self.replica_tag,
        "batch_size": "1"
    }
    self.processing_latency_tracker.record(elapsed_ms, latency_tags)
    return result
async def invoke_batch(self, request_item_list: List[Query]) -> List[Any]:
    """Execute a batch of requests with one call to the runner method.

    Every request is parsed with ``parse_request_item`` and the parsed
    items are handed to the runner method as a single list.  The method
    must return an ordered iterable holding exactly one result per
    request; each result is then passed through
    ``ensure_serializable_response``.

    Args:
        request_item_list: The queries that make up the batch.

    Returns:
        A list with one entry per request.  If anything fails inside the
        guarded section (mixed runner methods, a badly shaped return
        value, or an exception raised by the method), every entry is the
        same error object produced by ``wrap_to_ray_error``.
    """
    args = []
    call_methods = set()
    batch_size = len(request_item_list)

    # Construct the batch of requests
    for item in request_item_list:
        logger.debug("Replica {} started executing request {}".format(
            self.replica_tag, item.metadata.request_id))
        args.append(parse_request_item(item))
        call_methods.add(self.get_runner_method(item))

    timing_start = time.time()
    # Bound before the try block: the mixed-methods check below can raise
    # before a method is selected, and the except handler must still be
    # able to build an error without hitting an unbound local.
    call_method = None
    try:
        if len(call_methods) != 1:
            raise RayServeException(
                f"Queries contain mixed calling methods: {call_methods}. "
                "Please only send the same type of requests in batching "
                "mode.")

        self.request_counter.inc(batch_size)

        call_method = sync_to_async(call_methods.pop())
        result_list = await call_method(args)

        # dict and set are iterable but carry no positional ordering that
        # could be matched back to the requests.
        if not isinstance(result_list, Iterable) or isinstance(
                result_list, (dict, set)):
            error_message = ("RayServe expects an ordered iterable object "
                             "but the replica returned a {}".format(
                                 type(result_list)))
            raise RayServeException(error_message)

        # Normalize the result into a list type. This operation is fast
        # in Python because it doesn't copy anything.
        result_list = list(result_list)

        if (len(result_list) != batch_size):
            error_message = ("Worker doesn't preserve batch size. The "
                             "input has length {} but the returned list "
                             "has length {}. Please return a list of "
                             "results with length equal to the batch size"
                             ".".format(batch_size, len(result_list)))
            raise RayServeException(error_message)

        for i, result in enumerate(result_list):
            result_list[i] = (await
                              self.ensure_serializable_response(result))
    except Exception as e:
        function_name = "unknown"
        if call_method is not None:
            function_name = call_method.__name__
        wrapped_exception = wrap_to_ray_error(function_name, e)
        self.error_counter.inc()
        result_list = [wrapped_exception for _ in range(batch_size)]

    latency_ms = (time.time() - timing_start) * 1000
    self.processing_latency_tracker.observe(
        latency_ms, tags={"batch_size": str(batch_size)})
    return result_list
async def invoke_batch(self, request_item_list: List[Query]) -> List[Any]:
    """Execute a batch of requests with one call to the runner method.

    The parsed requests are passed to the runner method as a single
    list.  The method must return an ordered iterable with one result
    per request.  On any failure inside the guarded section every entry
    of the returned list is the same ``wrap_to_ray_error`` object.  The
    elapsed time in milliseconds is always recorded.
    """
    batch_size = len(request_item_list)
    parsed_items = []
    runner_methods = set()

    # Parse each query and collect the distinct runner methods requested.
    for query in request_item_list:
        parsed_items.append(parse_request_item(query))
        runner_methods.add(self.get_runner_method(query))

    timing_start = time.time()
    try:
        if len(runner_methods) != 1:
            raise RayServeException(
                f"Queries contain mixed calling methods: {runner_methods}. "
                "Please only send the same type of requests in batching "
                "mode.")

        self.request_counter.record(batch_size,
                                    {"backend": self.backend_tag})

        batch_method = ensure_async(runner_methods.pop())
        result_list = await batch_method(parsed_items)

        # dict and set are iterable but are rejected here as unordered.
        is_ordered_iterable = (isinstance(result_list, Iterable)
                               and not isinstance(result_list, (dict, set)))
        if not is_ordered_iterable:
            raise RayServeException(
                "RayServe expects an ordered iterable object "
                "but the worker returned a {}".format(type(result_list)))

        # Materialize the iterable so it can be measured and returned.
        result_list = list(result_list)

        returned_size = len(result_list)
        if returned_size != batch_size:
            raise RayServeException(
                "Worker doesn't preserve batch size. The "
                "input has length {} but the returned list "
                "has length {}. Please return a list of "
                "results with length equal to the batch size"
                ".".format(batch_size, returned_size))
    except Exception as e:
        wrapped_exception = wrap_to_ray_error(e)
        self.error_counter.record(1, {"backend": self.backend_tag})
        result_list = [wrapped_exception] * batch_size

    elapsed_ms = (time.time() - timing_start) * 1000
    latency_tags = {
        "backend": self.backend_tag,
        "replica_tag": self.replica_tag,
        "batch_size": str(batch_size)
    }
    self.processing_latency_tracker.record(elapsed_ms, latency_tags)
    return result_list
def invoke_single(self, request_item):
    """Call this object once with the arguments parsed from the request.

    ``serve_context.web`` is set from the parsed request.  An exception
    raised by the call is converted with ``wrap_to_ray_error``, counted,
    and returned as the result.  The elapsed time (in seconds, as
    returned by ``time.time`` differences) is always appended to the
    latency list.
    """
    positional, keyword, from_web = parse_request_item(request_item)
    serve_context.web = from_web

    began = time.time()
    try:
        outcome = self.__call__(*positional, **keyword)
    except Exception as e:
        outcome = wrap_to_ray_error(e)
        self._serve_metric_error_counter += 1

    elapsed = time.time() - began
    self._serve_metric_latency_list.append(elapsed)
    return outcome
async def invoke_single(self, request_item: Query) -> Any:
    """Run a single request, either through the ASGI app or a runner method.

    When the replica is configured as an ASGI app, the request's scope
    has the configured path prefix removed from ``path`` and stored as
    ``root_path`` before the app is awaited; the response is rebuilt
    from the sender.  Otherwise the runner method is awaited with the
    parsed argument.  The result goes through
    ``ensure_serializable_response``.  Any exception is converted with
    ``wrap_to_ray_error`` and returned.  The latency in milliseconds is
    always observed.
    """
    message = "Replica {} started executing request {}".format(
        self.replica_tag, request_item.metadata.request_id)
    logger.debug(message)

    arg = parse_request_item(request_item)
    started_at = time.time()

    # Only set on the non-ASGI path, once the runner method is wrapped.
    async_method = None
    try:
        # TODO(simon): Split this section out when invoke_batch is removed.
        if not self.config.internal_metadata.is_asgi_app:
            async_method = sync_to_async(
                self.get_runner_method(request_item))
            result = await async_method(arg)
        else:
            request: Request = arg
            scope = request.scope
            root_path = self.config.internal_metadata.path_prefix

            # scope["path"] arrives with the prefix included, and FastAPI
            # will not strip it, so remove its first occurrence here.
            unprefixed_path = scope["path"].replace(root_path, "", 1)
            request.scope["path"] = unprefixed_path
            # root_path is set so that reverse lookup and redirection
            # work.
            request.scope["root_path"] = root_path

            sender = ASGIHTTPSender()
            await self.callable._serve_asgi_app(
                request.scope,
                request._receive,
                sender,
            )
            result = sender.build_starlette_response()

        result = await self.ensure_serializable_response(result)
        self.request_counter.inc()
    except Exception as e:
        import os
        if "RAY_PDB" in os.environ:
            ray.util.pdb.post_mortem()
        function_name = ("unknown" if async_method is None else
                         async_method.__name__)
        result = wrap_to_ray_error(function_name, e)
        self.error_counter.inc()

    elapsed_ms = (time.time() - started_at) * 1000
    self.processing_latency_tracker.observe(
        elapsed_ms, tags={"batch_size": "1"})
    return result
async def invoke_single(self, request_item: Query) -> Tuple[Any, bool]:
    """Execute the given request on this replica.

    Returns a ``(result, success)`` pair.  ``success`` is False when an
    exception was caught while running the request, in which case
    ``result`` is the object produced by ``wrap_to_ray_error``.
    """
    message = "Replica {} started executing request {}".format(
        self.replica_tag, request_item.metadata.request_id
    )
    logger.debug(message)

    args, kwargs = parse_request_item(request_item)

    # Stays None until the runner method has been resolved and wrapped.
    async_method = None
    success = True
    try:
        runner_method = self.get_runner_method(request_item)
        async_method = sync_to_async(runner_method)

        # A method that declares no parameters is called with nothing
        # only when the sole argument is the starlette Request object.
        # In every other case the arguments are forwarded unchanged, so a
        # caller passing real arguments to a zero-parameter method gets
        # the signature-mismatch error.
        drop_request_arg = (
            len(inspect.signature(runner_method).parameters) == 0
            and len(args) == 1
            and isinstance(args[0], starlette.requests.Request)
        )
        if drop_request_arg:
            raw_result = await async_method()
        else:
            raw_result = await async_method(*args, **kwargs)

        result = await self.ensure_serializable_response(raw_result)
        self.request_counter.inc()
    except Exception as e:
        logger.exception(f"Request failed due to {type(e).__name__}:")
        success = False
        if "RAY_PDB" in os.environ:
            ray.util.pdb.post_mortem()
        function_name = (
            "unknown" if async_method is None else async_method.__name__
        )
        result = wrap_to_ray_error(function_name, e)
        self.error_counter.inc()

    return result, success
async def invoke_single(self, request_item):
    """Execute one request and count it as a success or an error.

    ``serve_context.web`` is set from the parsed request.  Positional
    arguments are dropped when ``has_positional_args`` reports that the
    runner method takes none.  An exception from the call is converted
    with ``wrap_to_ray_error`` and returned.
    """
    positional, kwargs, is_web_context = parse_request_item(request_item)
    serve_context.web = is_web_context

    runner = self.get_runner_method(request_item)
    if not self.has_positional_args(runner):
        positional = []
    async_runner = ensure_async(runner)

    try:
        outcome = await async_runner(*positional, **kwargs)
        self.request_counter.add()
    except Exception as e:
        outcome = wrap_to_ray_error(e)
        self.error_counter.add()
    return outcome
async def invoke_single(self, request_item: Query) -> Any:
    """Await the runner method with the parsed request and record metrics.

    An exception from the call is converted with ``wrap_to_ray_error``
    and returned.  The latency in milliseconds is always recorded with a
    ``batch_size`` tag of ``"1"``.
    """
    runner = self.get_runner_method(request_item)
    async_runner = ensure_async(runner)
    parsed_arg = parse_request_item(request_item)

    started_at = time.time()
    try:
        outcome = await async_runner(parsed_arg)
        self.request_counter.record(1)
    except Exception as e:
        outcome = wrap_to_ray_error(e)
        self.error_counter.record(1)

    elapsed_ms = (time.time() - started_at) * 1000
    self.processing_latency_tracker.record(
        elapsed_ms, tags={"batch_size": "1"})
    return outcome
async def invoke_single(self, request_item):
    """Execute one request, tracking errors and latency on the instance.

    ``serve_context.web`` is set from the parsed request.  Positional
    arguments are dropped when ``has_positional_args`` reports that the
    runner method takes none.  An exception from the call is converted
    with ``wrap_to_ray_error``, counted, and returned.  The elapsed time
    is always appended to ``latency_list``; the timer starts before the
    runner method is resolved.
    """
    positional, kwargs, is_web_context = parse_request_item(request_item)
    serve_context.web = is_web_context

    began = time.time()

    runner = self.get_runner_method(request_item)
    if not self.has_positional_args(runner):
        positional = []
    async_runner = ensure_async(runner)

    try:
        outcome = await async_runner(*positional, **kwargs)
    except Exception as e:
        outcome = wrap_to_ray_error(e)
        self.error_counter += 1

    elapsed = time.time() - began
    self.latency_list.append(elapsed)
    return outcome
async def invoke_single(self, request_item: Query) -> Any:
    """Execute one request and record it as a success or an error.

    ``serve_context.web`` is set from the parsed request.  Positional
    arguments are dropped when ``has_positional_args`` reports that the
    runner method takes none.  An exception from the call is converted
    with ``wrap_to_ray_error`` and returned; the context is reset either
    way.
    """
    positional, kwargs, is_web_context = parse_request_item(request_item)
    serve_context.web = is_web_context

    runner = self.get_runner_method(request_item)
    if not self.has_positional_args(runner):
        positional = []
    async_runner = ensure_async(runner)

    try:
        outcome = await async_runner(*positional, **kwargs)
        self.request_counter.record(1, {"backend": self.backend_tag})
    except Exception as e:
        outcome = wrap_to_ray_error(e)
        self.error_counter.record(1, {"backend": self.backend_tag})
    finally:
        self._reset_context()

    return outcome
async def invoke_single(self, request_item: Query) -> Any:
    """Await the runner method with the parsed request and record metrics.

    An exception from the call is converted with ``wrap_to_ray_error``
    and returned; when ``RAY_PDB`` is set in the environment a
    post-mortem debugger is started first.  The latency in milliseconds
    is always recorded with a ``batch_size`` tag of ``"1"``.
    """
    message = "Replica {} started executing request {}".format(
        self.replica_tag, request_item.metadata.request_id)
    logger.debug(message)

    runner = self.get_runner_method(request_item)
    async_runner = ensure_async(runner)
    parsed_arg = parse_request_item(request_item)

    started_at = time.time()
    try:
        outcome = await async_runner(parsed_arg)
        self.request_counter.record(1)
    except Exception as e:
        import os
        if "RAY_PDB" in os.environ:
            ray.util.pdb.post_mortem()
        outcome = wrap_to_ray_error(e)
        self.error_counter.record(1)

    elapsed_ms = (time.time() - started_at) * 1000
    self.processing_latency_tracker.record(
        elapsed_ms, tags={"batch_size": "1"})
    return outcome
async def invoke_single(self, request_item: Query) -> Any:
    """Run a single request, either through the ASGI app or a runner method.

    When the replica is configured as an ASGI app, the first parsed
    positional argument is treated as the request and its scope and
    receive callable are handed to the app; the response is rebuilt from
    the sender.  Otherwise the runner method is awaited with the parsed
    args and kwargs.  The result goes through
    ``ensure_serializable_response``.  Any exception is converted with
    ``wrap_to_ray_error`` and returned.  The latency in milliseconds is
    always observed.
    """
    message = "Replica {} started executing request {}".format(
        self.replica_tag, request_item.metadata.request_id)
    logger.debug(message)

    args, kwargs = parse_request_item(request_item)
    started_at = time.time()

    # Only set on the non-ASGI path, once the runner method is wrapped.
    async_method = None
    try:
        # TODO(simon): Split this section out when invoke_batch is removed.
        if not self.config.internal_metadata.is_asgi_app:
            async_method = sync_to_async(
                self.get_runner_method(request_item))
            result = await async_method(*args, **kwargs)
        else:
            request: Request = args[0]
            sender = ASGIHTTPSender()
            await self.callable._serve_asgi_app(
                request.scope,
                request._receive,
                sender,
            )
            result = sender.build_starlette_response()

        result = await self.ensure_serializable_response(result)
        self.request_counter.inc()
    except Exception as e:
        import os
        if "RAY_PDB" in os.environ:
            ray.util.pdb.post_mortem()
        function_name = ("unknown" if async_method is None else
                         async_method.__name__)
        result = wrap_to_ray_error(function_name, e)
        self.error_counter.inc()

    elapsed_ms = (time.time() - started_at) * 1000
    self.processing_latency_tracker.observe(elapsed_ms)
    return result
def invoke_batch(self, request_item_list):
    """Call this object once for a whole batch of requests.

    Arguments are batched as follows, where n is the batch size:
      * web context:    each request's first positional argument is
        collected, giving one list [val1, ..., valn];
      * Python context: each kwarg ``key: val`` is collected into
        ``key: [val1, ..., valn]`` and a ``FakeFlaskRequest`` stands in
        for the positional request.

    All requests in a batch must share the same context.  The call must
    return a list of length n.  On any failure inside the guarded
    section a list of n references to the same ``wrap_to_ray_error``
    object is returned and the error counter grows by n.
    """
    # TODO(alind) : create no-http services. The enqueues
    # from such services will always be TaskContext.Python.
    batch_size = len(request_item_list)
    request_args = []
    batched_kwargs = defaultdict(list)
    seen_contexts = set()

    for query in request_item_list:
        args, kwargs, is_web_context = parse_request_item(query)
        seen_contexts.add(is_web_context)
        if is_web_context:
            # Web requests carry the flask request as their first
            # positional argument.
            request_args.append(args[0])
            continue

        # Python requests carry keyword arguments; gather each value
        # under its key.
        for key, value in kwargs.items():
            batched_kwargs[key].append(value)
        # Placeholder so the positional request list still receives one
        # entry per request in batching mode.
        request_args.append(FakeFlaskRequest())

    try:
        # A batch must not mix web and Python requests.
        if len(seen_contexts) != 1:
            raise RayServeException(
                "Batched queries contain mixed context. Please only send "
                "the same type of requests in batching mode.")
        serve_context.web = seen_contexts.pop()
        serve_context.batch_size = batch_size

        # The collected requests are passed to __call__ as a single list
        # argument.
        began = time.time()
        result_list = self.__call__(request_args, **batched_kwargs)
        self._serve_metric_latency_list.append(time.time() - began)

        preserves_batch = (isinstance(result_list, list)
                           and len(result_list) == batch_size)
        if not preserves_batch:
            raise RayServeException("__call__ function "
                                    "doesn't preserve batch-size. "
                                    "Please return a list of result "
                                    "with length equals to the batch "
                                    "size.")
        return result_list
    except Exception as e:
        wrapped_exception = wrap_to_ray_error(e)
        self._serve_metric_error_counter += batch_size
        return [wrapped_exception] * batch_size
async def invoke_batch(self, request_item_list):
    """Await the runner method once for a whole batch of requests.

    Web requests contribute their first positional argument.  Python
    requests contribute their kwargs (each key collecting a list of
    values) plus a ``FakeFlaskRequest`` placeholder when
    ``has_positional_args`` is true for the runner method.  The batch
    must use one context and one runner method, and the method must
    return an ordered iterable with one result per request.  On any
    failure inside the guarded section a list of references to the same
    ``wrap_to_ray_error`` object is returned.  The context is reset
    before returning on both paths.
    """
    batch_size = len(request_item_list)
    request_args = []
    batched_kwargs = defaultdict(list)
    seen_contexts = set()
    runner_methods = set()

    for query in request_item_list:
        args, kwargs, is_web_context = parse_request_item(query)
        seen_contexts.add(is_web_context)

        runner = self.get_runner_method(query)
        runner_methods.add(runner)

        if is_web_context:
            # Web requests carry the flask request as their first
            # positional argument.
            request_args.append(args[0])
            continue

        # Python requests carry keyword arguments; gather each value
        # under its key.
        for key, value in kwargs.items():
            batched_kwargs[key].append(value)
        # Add a placeholder request only if the method accepts
        # positional arguments.
        if self.has_positional_args(runner):
            request_args.append(FakeFlaskRequest())

    try:
        # A batch must not mix web and Python requests.
        if len(seen_contexts) != 1:
            raise RayServeException(
                "Batched queries contain mixed context. Please only send "
                "the same type of requests in batching mode.")
        serve_context.web = seen_contexts.pop()

        if len(runner_methods) != 1:
            raise RayServeException(
                "Queries contain mixed calling methods. Please only send "
                "the same type of requests in batching mode.")
        batch_method = ensure_async(runner_methods.pop())

        serve_context.batch_size = batch_size
        self.request_counter.add(batch_size)

        # The collected requests are passed as a single list argument.
        result_list = await batch_method(request_args, **batched_kwargs)

        # dict and set are iterable but are rejected here as unordered.
        is_ordered_iterable = (isinstance(result_list, Iterable)
                               and not isinstance(result_list, (dict, set)))
        if not is_ordered_iterable:
            raise RayServeException(
                "RayServe expects an ordered iterable object "
                "but the worker returned a {}".format(type(result_list)))

        # Materialize the iterable so it can be measured and returned.
        result_list = list(result_list)

        returned_size = len(result_list)
        if returned_size != batch_size:
            raise RayServeException(
                "Worker doesn't preserve batch size. The "
                "input has length {} but the returned list "
                "has length {}. Please return a list of "
                "results with length equal to the batch size"
                ".".format(batch_size, returned_size))

        self._reset_context()
        return result_list
    except Exception as e:
        wrapped_exception = wrap_to_ray_error(e)
        self.error_counter.add()
        self._reset_context()
        return [wrapped_exception] * batch_size
async def invoke_batch(self, request_item_list):
    """Await the runner method once for a whole batch of requests.

    Arguments are batched as follows, where n is the batch size:
      * web context:    each request's first positional argument is
        collected, giving one list [val1, ..., valn];
      * Python context: each kwarg ``key: val`` is collected into
        ``key: [val1, ..., valn]``, and a ``FakeFlaskRequest``
        placeholder is added when ``has_positional_args`` is true for
        the runner method.

    The batch must use one context and one runner method, and the method
    must return a list of length n.  On any failure inside the guarded
    section a list of n references to the same ``wrap_to_ray_error``
    object is returned and the error counter grows by n.
    """
    # TODO(alind) : create no-http services. The enqueues
    # from such services will always be TaskContext.Python.
    batch_size = len(request_item_list)
    request_args = []
    batched_kwargs = defaultdict(list)
    seen_contexts = set()
    runner_methods = set()

    for query in request_item_list:
        args, kwargs, is_web_context = parse_request_item(query)
        seen_contexts.add(is_web_context)

        runner = self.get_runner_method(query)
        runner_methods.add(runner)

        if is_web_context:
            # Web requests carry the flask request as their first
            # positional argument.
            request_args.append(args[0])
            continue

        # Python requests carry keyword arguments; gather each value
        # under its key.
        for key, value in kwargs.items():
            batched_kwargs[key].append(value)
        # Add a placeholder request only if the method accepts
        # positional arguments.
        if self.has_positional_args(runner):
            request_args.append(FakeFlaskRequest())

    try:
        # A batch must not mix web and Python requests.
        if len(seen_contexts) != 1:
            raise RayServeException(
                "Batched queries contain mixed context. Please only send "
                "the same type of requests in batching mode.")
        serve_context.web = seen_contexts.pop()

        if len(runner_methods) != 1:
            raise RayServeException(
                "Queries contain mixed calling methods. Please only send "
                "the same type of requests in batching mode.")
        batch_method = ensure_async(runner_methods.pop())

        serve_context.batch_size = batch_size

        # The collected requests are passed as a single list argument;
        # only the call itself is timed.
        began = time.time()
        result_list = await batch_method(request_args, **batched_kwargs)
        self.latency_list.append(time.time() - began)

        preserves_batch = (isinstance(result_list, list)
                           and len(result_list) == batch_size)
        if not preserves_batch:
            raise RayServeException(
                "Worker doesn't preserve batch size. The "
                "input has length {} but the returned list "
                "has length {}. Please return a list of "
                "results with length equal to the batch size"
                ".".format(batch_size, len(result_list)))
        return result_list
    except Exception as e:
        wrapped_exception = wrap_to_ray_error(e)
        self.error_counter += batch_size
        return [wrapped_exception] * batch_size