def serve_gunicorn(
    port,
    workers,
    timeout,
    bento=None,
    with_conda=False,
    enable_microbatch=False,
    microbatch_workers=1,
):
    # Serve a saved bundle through gunicorn, optionally behind a marshal server.
    #
    # (Documented with comments rather than a docstring: this looks like a CLI
    # command body, and a docstring might surface as help text -- TODO confirm.)
    #
    #   port               port for the front-most server: the marshal server when
    #                      micro-batching is on, otherwise the API server
    #   workers            API server worker count; None falls back to
    #                      get_gunicorn_num_of_workers()
    #   timeout            forwarded to GunicornBentoServer / `--timeout`
    #   bento              bundle reference handed to resolve_bundle_path()
    #   with_conda         re-run the command via run_with_conda_env() and return
    #   enable_microbatch  start a GunicornMarshalServer in front of the API server
    #   microbatch_workers worker count given to that marshal server
    track_cli('serve_gunicorn')

    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    if with_conda:
        # `workers` has not been defaulted yet and may be None. Formatting None
        # into the command would emit the literal text "-w None", so the option
        # is left off in that case (the re-invoked command presumably applies
        # its own default -- TODO confirm). For any other value the command is
        # unchanged.
        workers_option = '' if workers is None else '-w {} '.format(workers)
        # NOTE(review): microbatch_workers is not forwarded to the re-invoked
        # command; only the --enable-microbatch flag is.
        run_with_conda_env(
            pip_installed_bundle_path,
            'bentoml serve_gunicorn {bento} -p {port} {workers_option}'
            '--timeout {timeout} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                workers_option=workers_option,
                timeout=timeout,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )
        return

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    from bentoml.server.gunicorn_server import GunicornBentoServer

    if enable_microbatch:
        # Imported locally, like GunicornBentoServer above, so this branch does
        # not depend on a module-level import being present.
        from bentoml.server.marshal_server import GunicornMarshalServer

        prometheus_lock = multiprocessing.Lock()
        # avoid load model before gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=bento_service_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )

            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
            )
        # Started after the `with` block, i.e. once the port reservation has
        # been released (assumed from reserve_free_port's name -- TODO confirm).
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            bento_service_bundle_path, port, workers, timeout
        )
        gunicorn_app.run()
def start_prod_server(
    saved_bundle_path: str,
    port: int,
    timeout: int,
    workers: int,
    enable_microbatch: bool,
    microbatch_workers: int,
    enable_swagger: bool,
):
    """Start the gunicorn-based API server for a saved bundle.

    Args:
        saved_bundle_path: Bundle location passed to both server classes.
        port: Port for the front-most server (the marshal server when
            micro-batching is enabled, otherwise the API server itself).
        timeout: Forwarded to ``GunicornBentoServer``.
        workers: API server worker count; ``None`` is replaced by
            ``get_gunicorn_num_of_workers()``.
        enable_microbatch: When true, a ``GunicornMarshalServer`` is started
            in front of the API server, which then listens on a port obtained
            from ``reserve_free_port()``.
        microbatch_workers: Worker count for the marshal server.
        enable_swagger: Forwarded to ``GunicornBentoServer``.

    Raises:
        AssertionError: If ``psutil.POSIX`` is false (and asserts are enabled).
    """
    logger.info("Starting BentoML API server in production mode..")

    import psutil
    import multiprocessing

    # Checked before the gunicorn-related imports below are attempted.
    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.server.utils import get_gunicorn_num_of_workers
    from bentoml.utils import reserve_free_port

    worker_count = get_gunicorn_num_of_workers() if workers is None else workers

    if not enable_microbatch:
        # Plain mode: the API server listens directly on the requested port.
        api_server = GunicornBentoServer(
            saved_bundle_path,
            port,
            worker_count,
            timeout,
            enable_swagger=enable_swagger,
        )
        api_server.run()
        return

    # One lock object is handed to both servers.
    metrics_lock = multiprocessing.Lock()
    # Both servers are only constructed here (not run), so that the model is
    # not loaded before gunicorn forks.
    with reserve_free_port() as internal_port:
        batching_front = GunicornMarshalServer(
            bundle_path=saved_bundle_path,
            port=port,
            workers=microbatch_workers,
            prometheus_lock=metrics_lock,
            outbound_host="localhost",
            outbound_port=internal_port,
            outbound_workers=worker_count,
        )
        api_server = GunicornBentoServer(
            saved_bundle_path,
            internal_port,
            worker_count,
            timeout,
            metrics_lock,
            enable_swagger,
        )
    # Started after the `with` block, i.e. once the port reservation has been
    # released (assumed from reserve_free_port's name -- TODO confirm).
    batching_front.async_run()
    api_server.run()
# This implements the sagemaker serving service shell. It starts nginx and gunicorn.
#
# Parameter           Env Var                            Default Value
# timeout             BENTOML_GUNICORN_TIMEOUT           60s
# number of workers   BENTOML_GUNICORN_NUM_OF_WORKERS    get_gunicorn_num_of_workers()
#                                                        (number of cpu cores / 2 + 1)
# api name            API_NAME                           None

import subprocess
import os
import signal
import sys

from bentoml.server.utils import get_gunicorn_num_of_workers

# A string when taken from the environment, the int 60 when the variable is unset.
bento_server_timeout = os.environ.get('BENTOML_GUNICORN_TIMEOUT', 60)
# Always an int; the helper is evaluated even when the variable is set.
bento_server_workers = int(
    os.environ.get('BENTOML_GUNICORN_NUM_OF_WORKERS', get_gunicorn_num_of_workers())
)


def sigterm_handler(nginx_pid, gunicorn_pid):
    """Signal both child processes, then exit this process with status 0.

    nginx receives SIGQUIT and gunicorn receives SIGTERM, in that order. An
    ``OSError`` from either ``os.kill`` call is ignored, so the other process
    is still signalled and the exit still happens.
    """
    shutdown_plan = (
        (nginx_pid, signal.SIGQUIT),
        (gunicorn_pid, signal.SIGTERM),
    )
    for pid, sig in shutdown_plan:
        try:
            os.kill(pid, sig)
        except OSError:
            # Best effort: a failed kill must not block the rest of shutdown.
            pass
    sys.exit(0)
# This implements the sagemaker serving service shell. It starts nginx and gunicorn.
#
# Parameter           Env Var                  Default Value
# timeout             BENTO_SERVER_TIMEOUT     60s
# number of workers   GUNICORN_WORKER_COUNT    get_gunicorn_num_of_workers()
#                                              (number of cpu cores / 2 + 1)
# api name            API_NAME                 None

import subprocess
import os
import signal
import sys

from bentoml.server.utils import get_gunicorn_num_of_workers

# A string when taken from the environment, the int 60 when the variable is unset.
bento_server_timeout = os.environ.get('BENTO_SERVER_TIMEOUT', 60)
# Always an int; the helper is evaluated even when the variable is set.
bento_server_workers = int(
    os.environ.get('GUNICORN_WORKER_COUNT', get_gunicorn_num_of_workers())
)


def sigterm_handler(nginx_pid, gunicorn_pid):
    """Signal both child processes, then exit this process with status 0.

    nginx receives SIGQUIT and gunicorn receives SIGTERM, in that order. An
    ``OSError`` from either ``os.kill`` call is ignored, so the other process
    is still signalled and the exit still happens.
    """
    shutdown_plan = (
        (nginx_pid, signal.SIGQUIT),
        (gunicorn_pid, signal.SIGTERM),
    )
    for pid, sig in shutdown_plan:
        try:
            os.kill(pid, sig)
        except OSError:
            # Best effort: a failed kill must not block the rest of shutdown.
            pass
    sys.exit(0)
import atexit

from bentoml.server.utils import get_gunicorn_num_of_workers
from bentoml.utils.usage_stats import track_server_stop

# Module-level `workers` value computed once at import time. Presumably read by
# gunicorn as its worker-count setting when this module is used as a gunicorn
# config file -- TODO confirm.
workers = get_gunicorn_num_of_workers()


def worker_exit(server, worker):  # pylint: disable=unused-argument
    """Report the exiting worker's pid to prometheus_client as dead.

    Calls ``prometheus_client.multiprocess.mark_process_dead(worker.pid)``;
    ``server`` is not used.
    """
    from prometheus_client import multiprocess

    multiprocess.mark_process_dead(worker.pid)


def post_fork(server, worker):
    """Log the spawned worker's pid at debug level on the server's logger."""
    server.log.debug("Worker spawned (pid: %s)", worker.pid)


def pre_fork(server, worker):  # pylint: disable=unused-argument
    """No-op: the body is only ``pass``."""
    pass


def pre_exec(server):
    """Log at debug level that a forked child is re-executing."""
    server.log.debug("Forked child, re-executing.")


def when_ready(server):
    """Log at debug level that the server is ready and is spawning workers."""
    server.log.debug("Server is ready. Spawning workers")


def worker_int(worker):
def serve_gunicorn(
    port,
    workers,
    timeout,
    bento=None,
    with_conda=False,
    enable_microbatch=False,
    microbatch_workers=1,
):
    # Serve a saved bundle through gunicorn, optionally behind a marshal server.
    #
    # (Documented with comments rather than a docstring: this looks like a CLI
    # command body, and a docstring might surface as help text -- TODO confirm.)
    #
    #   port               port for the front-most server: the marshal server when
    #                      micro-batching is on, otherwise the API server
    #   workers            API server worker count; None falls back to
    #                      get_gunicorn_num_of_workers()
    #   timeout            forwarded to GunicornBentoServer / `--timeout`
    #   bento              bundle reference handed to resolve_bundle_path()
    #   with_conda         re-run the command via run_with_conda_env() and return
    #                      its result
    #   enable_microbatch  start a GunicornMarshalServer in front of the API server
    #   microbatch_workers worker count given to that marshal server
    #
    # On non-POSIX platforms a notice is echoed and nothing is started.
    if not psutil.POSIX:
        _echo(
            "The `bentoml serve-gunicorn` command is only supported on POSIX. "
            "On windows platform, use `bentoml serve` for local API testing and "
            "docker for running production API endpoint: "
            "https://docs.docker.com/docker-for-windows/ "
        )
        return

    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    if with_conda:
        # `workers` has not been defaulted yet and may be None. Formatting None
        # into the command would emit the literal text "-w None", so the option
        # is left off in that case (the re-invoked command presumably applies
        # its own default -- TODO confirm). For any other value the command is
        # unchanged.
        workers_option = '' if workers is None else '-w {} '.format(workers)
        # NOTE(review): microbatch_workers is not forwarded to the re-invoked
        # command; only the --enable-microbatch flag is.
        return run_with_conda_env(
            pip_installed_bundle_path,
            'bentoml serve_gunicorn {bento} -p {port} {workers_option}'
            '--timeout {timeout} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                workers_option=workers_option,
                timeout=timeout,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    # Gunicorn only supports POSIX platforms
    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid load model before gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=bento_service_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )

            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
            )
        # Started after the `with` block, i.e. once the port reservation has
        # been released (assumed from reserve_free_port's name -- TODO confirm).
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            bento_service_bundle_path, port, workers, timeout
        )
        gunicorn_app.run()