def serve(port, bento=None, with_conda=False, enable_microbatch=False):
    """Start an API server for the given bento bundle on ``port``.

    With ``with_conda`` the command re-executes itself inside the bundle's
    conda environment instead of serving in-process.  With
    ``enable_microbatch`` a marshal (micro-batching) proxy listens on
    ``port`` and forwards merged requests to the real API server, which is
    bound to a freshly reserved local port.
    """
    track_cli('serve')
    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )
    bento_service = load(bento_service_bundle_path)

    if with_conda:
        # Delegate to a fresh `bentoml serve` run inside the conda env.
        flags = "--enable-microbatch" if enable_microbatch else ""
        command = 'bentoml serve {bento} --port {port} {flags}'.format(
            bento=bento_service_bundle_path, port=port, flags=flags,
        )
        run_with_conda_env(bento_service_bundle_path, command)
        return

    if not enable_microbatch:
        # Plain mode: the API server itself listens on the requested port.
        BentoAPIServer(bento_service, port=port).start()
        return

    with reserve_free_port() as api_server_port:
        # start server right after port released
        # to reduce potential race
        marshal_server = MarshalService(
            bento_service_bundle_path,
            outbound_host="localhost",
            outbound_port=api_server_port,
            outbound_workers=1,
        )
        api_server = BentoAPIServer(bento_service, port=api_server_port)
    marshal_server.async_start(port=port)
    api_server.start()
def load(self):
    """Build and return the WSGI app for the marshal (micro-batch) proxy.

    Constructs a MarshalService pointed at the configured outbound
    host/port and worker count, then hands back its application object.
    """
    marshal = MarshalService(
        self.bento_service_bundle_path,
        self.outbound_host,
        self.outbound_port,
        outbound_workers=self.outbound_workers,
    )
    return marshal.make_app()
class MarshalServer:
    """Reverse proxy placed in front of the actual API server that
    implements the micro-batching feature.

    Requests arriving within a short window (``mb_max_latency``) are
    collected and forwarded to the API server merged into a single
    request.
    """

    # Defaults sourced from the application configuration.
    _DEFAULT_PORT = config("apiserver").getint("default_port")
    _DEFAULT_MAX_LATENCY = config("marshal_server").getint(
        "default_max_latency")
    _DEFAULT_MAX_BATCH_SIZE = config("marshal_server").getint(
        "default_max_batch_size")

    def __init__(self, target_host, target_port, port=_DEFAULT_PORT):
        # `port` is where this proxy listens; the wrapped MarshalService
        # forwards to target_host:target_port.
        self.port = port
        self.marshal_app = MarshalService(target_host, target_port)

    def setup_routes_from_pb(self, bento_service_metadata_pb):
        """Register a batch handler for each API whose handler type
        supports batch mode, using its configured max latency if present.
        """
        for api_config in bento_service_metadata_pb.apis:
            if api_config.handler_type not in HANDLER_TYPES_BATCH_MODE_SUPPORTED:
                continue
            handler_cfg = getattr(api_config, "handler_config", {})
            if "mb_max_latency" in handler_cfg:
                max_latency = handler_cfg["mb_max_latency"]
            else:
                max_latency = self._DEFAULT_MAX_LATENCY
            self.marshal_app.add_batch_handler(
                api_config.name, max_latency, self._DEFAULT_MAX_BATCH_SIZE
            )
            marshal_logger.info("Micro batch enabled for API `%s`",
                                api_config.name)

    def async_start(self):
        """Launch the micro-batch server on ``self.port`` in a background
        daemon process and return immediately.
        """
        track_server('marshal')
        proc = multiprocessing.Process(
            target=self.marshal_app.fork_start_app,
            kwargs={"port": self.port},
            daemon=True,
        )
        # TODO: make sure child process dies when parent process is killed.
        proc.start()
        marshal_logger.info("Running micro batch service on :%d", self.port)
def __init__(self, target_host, target_port, port=_DEFAULT_PORT):
    # Wrap a MarshalService that forwards to target_host:target_port;
    # the proxy itself listens on `port` (class default if omitted).
    self.marshal_app = MarshalService(target_host, target_port)
    self.port = port
def load(self):
    """Create and return the MarshalService WSGI application,
    targeting the configured host and port.
    """
    app = MarshalService(
        self.bento_service_bundle_path,
        self.target_host,
        self.target_port,
    )
    return app.make_app()