def __init__(
    self,
    special_cases: Union[str, Dict[str, SpecialCaseFunction]],
    server_interceptors: Optional[List] = None,
    prometheus_enabled: Optional[bool] = True,
):
    self.special_cases = special_cases
    self.server_interceptors = server_interceptors
    self.prometheus_enabled = prometheus_enabled

    self.mock_server: grpc.Server

    with reserve_free_port() as service_port:
        self.service_port: int = service_port
    with reserve_free_port() as prom_port:
        self.prom_port: int = prom_port
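
# Every snippet in this section leans on `reserve_free_port`. A minimal
# sketch of such a context manager, assuming the usual trick of binding a
# socket to port 0 so the OS assigns a free port (this mirrors, but is not
# necessarily verbatim, bentoml.utils.reserve_free_port):
import contextlib
import socket


@contextlib.contextmanager
def reserve_free_port(host: str = "localhost"):
    """Bind to port 0, yield the kernel-assigned port, close on exit.

    The port is only held while the socket is open; once the `with` block
    exits, another process may grab it, which is why the callers below start
    their servers immediately after the block.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((host, 0))
    port = sock.getsockname()[1]
    try:
        yield port
    finally:
        sock.close()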

def serve(port, bento=None, with_conda=False, enable_microbatch=False):
    track_cli('serve')
    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    bento_service = load(bento_service_bundle_path)

    if with_conda:
        run_with_conda_env(
            bento_service_bundle_path,
            'bentoml serve {bento} --port {port} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )
        return

    if enable_microbatch:
        with reserve_free_port() as api_server_port:
            # start server right after port released
            # to reduce potential race
            marshal_server = MarshalService(
                bento_service_bundle_path,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=1,
            )

            api_server = BentoAPIServer(bento_service, port=api_server_port)
        marshal_server.async_start(port=port)
        api_server.start()
    else:
        api_server = BentoAPIServer(bento_service, port=port)
        api_server.start()

def start_dev_server(
    saved_bundle_path: str, port: int, enable_microbatch: bool, run_with_ngrok: bool
):
    logger.info("Starting BentoML API server in development mode...")

    from bentoml import load
    from bentoml.server.api_server import BentoAPIServer
    from bentoml.marshal.marshal import MarshalService
    from bentoml.utils import reserve_free_port

    bento_service = load(saved_bundle_path)

    if run_with_ngrok:
        from bentoml.utils.flask_ngrok import start_ngrok
        from threading import Timer

        thread = Timer(1, start_ngrok, args=(port,))
        thread.setDaemon(True)
        thread.start()

    if enable_microbatch:
        with reserve_free_port() as api_server_port:
            # start server right after port released
            # to reduce potential race
            marshal_server = MarshalService(
                saved_bundle_path,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=1,
            )
            api_server = BentoAPIServer(bento_service, port=api_server_port)
        marshal_server.async_start(port=port)
        api_server.start()
    else:
        api_server = BentoAPIServer(bento_service, port=port)
        api_server.start()

def start_dev_server(
    saved_bundle_path: str,
    port: int = Provide[BentoMLContainer.config.api_server.port],
    enable_microbatch: bool = Provide[
        BentoMLContainer.config.api_server.enable_microbatch
    ],
    mb_max_batch_size: int = Provide[
        BentoMLContainer.config.marshal_server.max_batch_size
    ],
    mb_max_latency: int = Provide[
        BentoMLContainer.config.marshal_server.max_latency
    ],
    run_with_ngrok: bool = Provide[
        BentoMLContainer.config.api_server.run_with_ngrok
    ],
    enable_swagger: bool = Provide[
        BentoMLContainer.config.api_server.enable_swagger
    ],
):
    logger.info("Starting BentoML API server in development mode...")

    import multiprocessing

    from bentoml.saved_bundle import load_from_dir
    from bentoml.server.api_server import BentoAPIServer
    from bentoml.utils import reserve_free_port

    if run_with_ngrok:
        from threading import Timer

        from bentoml.utils.flask_ngrok import start_ngrok

        thread = Timer(1, start_ngrok, args=(port,))
        thread.setDaemon(True)
        thread.start()

    if enable_microbatch:
        with reserve_free_port() as api_server_port:
            # start server right after port released
            # to reduce potential race
            marshal_proc = multiprocessing.Process(
                target=start_dev_batching_server,
                kwargs=dict(
                    api_server_port=api_server_port,
                    saved_bundle_path=saved_bundle_path,
                    port=port,
                    mb_max_latency=mb_max_latency,
                    mb_max_batch_size=mb_max_batch_size,
                ),
                daemon=True,
            )
        marshal_proc.start()

        bento_service = load_from_dir(saved_bundle_path)
        api_server = BentoAPIServer(
            bento_service, port=api_server_port, enable_swagger=enable_swagger
        )
        api_server.start()
    else:
        bento_service = load_from_dir(saved_bundle_path)
        api_server = BentoAPIServer(
            bento_service, port=port, enable_swagger=enable_swagger
        )
        api_server.start()

def serve_gunicorn(
    port,
    workers,
    timeout,
    bento=None,
    with_conda=False,
    enable_microbatch=False,
    microbatch_workers=1,
):
    track_cli('serve_gunicorn')
    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    if with_conda:
        run_with_conda_env(
            pip_installed_bundle_path,
            'bentoml serve_gunicorn {bento} -p {port} -w {workers} '
            '--timeout {timeout} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                workers=workers,
                timeout=timeout,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )
        return

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid load model before gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=bento_service_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )

            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            bento_service_bundle_path, port, workers, timeout
        )
        gunicorn_app.run()

def start_prod_server(
    saved_bundle_path: str,
    port: int,
    timeout: int,
    workers: int,
    enable_microbatch: bool,
    microbatch_workers: int,
    enable_swagger: bool,
):
    logger.info("Starting BentoML API server in production mode...")

    import psutil
    import multiprocessing

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.server.utils import get_gunicorn_num_of_workers
    from bentoml.utils import reserve_free_port

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid load model before gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=saved_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )

            gunicorn_app = GunicornBentoServer(
                saved_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
                enable_swagger,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            saved_bundle_path, port, workers, timeout, enable_swagger=enable_swagger
        )
        gunicorn_app.run()
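
# `get_gunicorn_num_of_workers` is imported above but not shown. A minimal
# sketch, assuming the common gunicorn heuristic of `2 * cpu_count + 1`
# (an assumption for illustration, not necessarily BentoML's exact formula):
import multiprocessing


def get_gunicorn_num_of_workers() -> int:
    # one worker per core for parallelism, doubled to overlap I/O, plus one
    return multiprocessing.cpu_count() * 2 + 1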

def start_prod_server(
    saved_bundle_path: str,
    port: Optional[int] = None,
    workers: Optional[int] = None,
    timeout: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    microbatch_workers: Optional[int] = None,
    config_file: Optional[str] = None,
):
    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    config = BentoMLConfiguration(override_config_file=config_file)
    config.override(["api_server", "port"], port)
    config.override(["api_server", "workers"], workers)
    config.override(["api_server", "timeout"], timeout)
    config.override(["api_server", "enable_microbatch"], enable_microbatch)
    config.override(["api_server", "enable_swagger"], enable_swagger)
    config.override(["marshal_server", "max_batch_size"], mb_max_batch_size)
    config.override(["marshal_server", "max_latency"], mb_max_latency)
    config.override(["marshal_server", "workers"], microbatch_workers)

    if config.config['api_server'].get('enable_microbatch'):
        prometheus_lock = multiprocessing.Lock()
        with reserve_free_port() as api_server_port:
            pass

        model_server_job = multiprocessing.Process(
            target=_start_prod_server,
            kwargs=dict(
                saved_bundle_path=saved_bundle_path,
                port=api_server_port,
                config=config,
                prometheus_lock=prometheus_lock,
            ),
            daemon=True,
        )
        model_server_job.start()

        try:
            _start_prod_batching_server(
                saved_bundle_path=saved_bundle_path,
                config=config,
                api_server_port=api_server_port,
                prometheus_lock=prometheus_lock,
            )
        finally:
            model_server_job.terminate()
    else:
        _start_prod_server(saved_bundle_path=saved_bundle_path, config=config)

def start_prod_server(
    saved_bundle_path: str,
    port: Optional[int] = None,
    workers: Optional[int] = None,
    timeout: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    microbatch_workers: Optional[int] = None,
):
    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    prometheus_lock = multiprocessing.Lock()
    with reserve_free_port() as api_server_port:
        pass

    model_server_job = multiprocessing.Process(
        target=_start_prod_server,
        kwargs=dict(
            saved_bundle_path=saved_bundle_path,
            port=api_server_port,
            timeout=timeout,
            workers=workers,
            prometheus_lock=prometheus_lock,
            enable_swagger=enable_swagger,
        ),
        daemon=True,
    )
    model_server_job.start()

    try:
        _start_prod_proxy(
            saved_bundle_path=saved_bundle_path,
            port=port,
            api_server_port=api_server_port,
            workers=microbatch_workers,
            timeout=timeout,
            outbound_workers=workers,
            enable_microbatch=enable_microbatch,
            mb_max_batch_size=mb_max_batch_size,
            mb_max_latency=mb_max_latency,
            prometheus_lock=prometheus_lock,
        )
    finally:
        model_server_job.terminate()
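
# The `with reserve_free_port() as api_server_port: pass` idiom above reserves
# a free port and releases it immediately: the port number must be shared by
# two processes (the model server binds it, the proxy dials it), so the
# reserving socket has to be closed before either process starts. A minimal
# sketch of that hand-off, with a hypothetical `_bind_and_serve` worker:
import multiprocessing

from bentoml.utils import reserve_free_port


def _bind_and_serve(port: int) -> None:
    # hypothetical worker; a real one would bind and serve on `port`
    print(f"child process would bind localhost:{port}")


def handoff_sketch() -> None:
    with reserve_free_port() as port:
        pass  # socket closed here; small race window until the child binds

    child = multiprocessing.Process(
        target=_bind_and_serve, args=(port,), daemon=True
    )
    child.start()
    child.join()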

def start_dev_server(
    bundle_path: str,
    port: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    run_with_ngrok: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    config_file: Optional[str] = None,
):
    config = BentoMLConfiguration(override_config_file=config_file)
    config.override(["api_server", "port"], port)
    config.override(["api_server", "enable_microbatch"], enable_microbatch)
    config.override(["api_server", "enable_swagger"], enable_swagger)
    config.override(["marshal_server", "max_batch_size"], mb_max_batch_size)
    config.override(["marshal_server", "max_latency"], mb_max_latency)

    if run_with_ngrok:
        from threading import Timer

        from bentoml.utils.flask_ngrok import start_ngrok

        thread = Timer(1, start_ngrok, args=(port,))
        thread.setDaemon(True)
        thread.start()

    with reserve_free_port() as api_server_port:
        # start server right after port released
        # to reduce potential race
        model_server_proc = multiprocessing.Process(
            target=_start_dev_server,
            kwargs=dict(
                api_server_port=api_server_port,
                saved_bundle_path=bundle_path,
                config=config,
            ),
            daemon=True,
        )
    model_server_proc.start()

    try:
        _start_dev_proxy(
            api_server_port=api_server_port,
            saved_bundle_path=bundle_path,
            config=config,
        )
    finally:
        model_server_proc.terminate()

def start_dev_server(
    bundle_path: str,
    port: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    run_with_ngrok: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
):
    if run_with_ngrok:
        from threading import Timer

        from bentoml.utils.flask_ngrok import start_ngrok

        thread = Timer(1, start_ngrok, args=(port,))
        thread.setDaemon(True)
        thread.start()

    with reserve_free_port() as api_server_port:
        # start server right after port released
        # to reduce potential race
        model_server_proc = multiprocessing.Process(
            target=_start_dev_server,
            kwargs=dict(
                api_server_port=api_server_port,
                saved_bundle_path=bundle_path,
                enable_swagger=enable_swagger,
            ),
            daemon=True,
        )
    model_server_proc.start()

    try:
        _start_dev_proxy(
            port=port,
            api_server_port=api_server_port,
            saved_bundle_path=bundle_path,
            enable_microbatch=enable_microbatch,
            mb_max_batch_size=mb_max_batch_size,
            mb_max_latency=mb_max_latency,
        )
    finally:
        model_server_proc.terminate()

def start_yatai_service_grpc_server(
    db_url,
    grpc_port,
    ui_port,
    with_ui,
    base_url,
    repository_type,
    file_system_directory,
    s3_url,
    s3_endpoint_url,
    gcs_url,
    web_ui_log_path: str = Provide[BentoMLContainer.yatai_logging_path],
):
    # Lazily import grpcio for YataiService gRPC related actions
    import grpc

    from bentoml.yatai.db import DB
    from bentoml.yatai.repository import create_repository
    from bentoml.yatai.yatai_service_impl import get_yatai_service_impl
    from bentoml.yatai.proto.yatai_service_pb2_grpc import add_YataiServicer_to_server
    from bentoml.yatai.proto.yatai_service_pb2_grpc import YataiServicer

    YataiServicerImpl = get_yatai_service_impl(YataiServicer)
    yatai_service = YataiServicerImpl(
        repository=create_repository(
            repository_type, file_system_directory, s3_url, s3_endpoint_url, gcs_url
        ),
        database=DB(db_url),
    )

    # Define interceptors here
    grpc_interceptors = [PromServerInterceptor(), ServiceLatencyInterceptor()]
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=10),
        interceptors=grpc_interceptors,
    )
    add_YataiServicer_to_server(yatai_service, server)
    debug_mode = get_debug_mode()
    if debug_mode:
        try:
            logger.debug("Enabling gRPC server reflection for debugging")
            from bentoml.yatai.proto import yatai_service_pb2
            from grpc_reflection.v1alpha import reflection

            SERVICE_NAMES = (
                yatai_service_pb2.DESCRIPTOR.services_by_name["Yatai"].full_name,
                reflection.SERVICE_NAME,
            )
            reflection.enable_server_reflection(SERVICE_NAMES, server)
        except ImportError:
            logger.debug(
                "Failed to enable gRPC server reflection, missing required package: "
                '"pip install grpcio-reflection"'
            )
    server.add_insecure_port(f"[::]:{grpc_port}")

    # NOTE: the current implementation defaults prometheus_port to 50052 to
    # accommodate the Makefile setup. There is currently no way to call
    # reserve_free_port dynamically from inside the Makefile to find a free
    # port for prometheus_port without the help of a shell script.
    prometheus_port = 50052
    with reserve_free_port() as port:
        prometheus_port = port  # prevents wsgi from seeing prometheus_port as used
    start_http_server(prometheus_port)

    server.start()
    if with_ui:
        ensure_node_available_or_raise()
        yatai_grpc_server_address = f"localhost:{grpc_port}"
        prometheus_address = f"http://localhost:{prometheus_port}"
        async_start_yatai_service_web_ui(
            yatai_grpc_server_address,
            prometheus_address,
            ui_port,
            web_ui_log_path,
            debug_mode,
            base_url,
        )

    # We don't import the _echo function from click_utils because of a circular dep
    if with_ui:
        if debug_mode is True:
            ui_port = 8080
        web_ui_link = f"http://127.0.0.1:{ui_port}"
        if base_url != ".":
            web_ui_link += f"/{base_url}"
        web_ui_message = f"running on {web_ui_link}"
    else:
        web_ui_message = "off"

    if debug_mode:
        prom_ui_message = "off"
    else:
        prom_ui_message = f"running on http://127.0.0.1:{ui_port}/metrics\n"

    click.echo(
        f"* Starting BentoML YataiService gRPC Server\n"
        f'* Debug mode: {"on" if debug_mode else "off"}\n'
        f"* Web UI: {web_ui_message}\n"
        f"* Running on 127.0.0.1:{grpc_port} (Press CTRL+C to quit)\n"
        f"* Prometheus: {prom_ui_message}\n"
        f"* Help and instructions: "
        f"https://docs.bentoml.org/en/latest/guides/yatai_service.html\n"
        f'{f"* Web server log can be found here: {web_ui_log_path}" if with_ui else ""}'
        f"\n-----\n"
        f"* Usage in Python:\n"
        f'* bento_svc.save(yatai_url="127.0.0.1:{grpc_port}")\n'
        f"* from bentoml.yatai.client import get_yatai_client\n"
        f'* get_yatai_client("127.0.0.1:{grpc_port}").repository.list()\n'
        f"* Usage in CLI:\n"
        f"* bentoml list --yatai-url=127.0.0.1:{grpc_port}\n"
        f"* bentoml containerize IrisClassifier:latest --yatai-url=127.0.0.1:"
        f"{grpc_port}\n"
        f"* bentoml push IrisClassifier:20200918001645_CD2886 --yatai-url=127.0.0.1:"
        f"{grpc_port}\n"
        f"* bentoml pull IrisClassifier:20200918001645_CD2886 --yatai-url=127.0.0.1:"
        f"{grpc_port}\n"
        f"* bentoml retrieve IrisClassifier:20200918001645_CD2886 "
        f'--yatai-url=127.0.0.1:{grpc_port} --target_dir="/tmp/foo/bar"\n'
        f"* bentoml delete IrisClassifier:20200918001645_CD2886 "
        f"--yatai-url=127.0.0.1:{grpc_port}\n"
        # TODO: simplify the example usage here once related documentation is ready
    )

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        logger.info("Terminating YataiService gRPC server...")
        server.stop(grace=None)

def serve_gunicorn(
    port,
    workers,
    timeout,
    bento=None,
    with_conda=False,
    enable_microbatch=False,
    microbatch_workers=1,
):
    if not psutil.POSIX:
        _echo(
            "The `bentoml serve-gunicorn` command is only supported on POSIX. "
            "On Windows, use `bentoml serve` for local API testing and "
            "docker for running production API endpoints: "
            "https://docs.docker.com/docker-for-windows/ "
        )
        return

    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    if with_conda:
        return run_with_conda_env(
            pip_installed_bundle_path,
            'bentoml serve_gunicorn {bento} -p {port} -w {workers} '
            '--timeout {timeout} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                workers=workers,
                timeout=timeout,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    # Gunicorn only supports POSIX platforms
    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid load model before gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=bento_service_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )

            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            bento_service_bundle_path, port, workers, timeout
        )
        gunicorn_app.run()

def start_prod_server(
    saved_bundle_path: str,
    port: int = Provide[BentoMLContainer.config.api_server.port],
    timeout: int = Provide[BentoMLContainer.config.api_server.timeout],
    workers: int = Provide[BentoMLContainer.api_server_workers],
    enable_microbatch: bool = Provide[
        BentoMLContainer.config.api_server.enable_microbatch
    ],
    mb_max_batch_size: int = Provide[
        BentoMLContainer.config.marshal_server.max_batch_size
    ],
    mb_max_latency: int = Provide[
        BentoMLContainer.config.marshal_server.max_latency
    ],
    microbatch_workers: int = Provide[
        BentoMLContainer.config.marshal_server.workers
    ],
    enable_swagger: bool = Provide[
        BentoMLContainer.config.api_server.enable_swagger
    ],
):
    logger.info("Starting BentoML API server in production mode...")

    import multiprocessing

    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.utils import reserve_free_port

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid load model before gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=saved_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
                mb_max_batch_size=mb_max_batch_size,
                mb_max_latency=mb_max_latency,
            )

            gunicorn_app = GunicornBentoServer(
                saved_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
                enable_swagger,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            saved_bundle_path, port, workers, timeout, enable_swagger=enable_swagger
        )
        gunicorn_app.run()
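
# A hedged usage sketch: how the production entry point above might be invoked
# directly. The bundle path is hypothetical; in practice these functions are
# driven by the `bentoml serve` / `bentoml serve-gunicorn` CLI commands rather
# than called by hand.
if __name__ == "__main__":
    start_prod_server(
        "/tmp/IrisClassifier",  # hypothetical saved-bundle directory
        port=5000,
        enable_microbatch=True,
    )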