Example #1
 def __init__(
     self,
     special_cases: Union[str, Dict[str, SpecialCaseFunction]],
     server_interceptors: Optional[List] = None,
     prometheus_enabled: Optional[bool] = True,
 ):
     self.special_cases = special_cases
     self.server_interceptors = server_interceptors
     self.prometheus_enabled = prometheus_enabled
     self.mock_server: grpc.Server
     with reserve_free_port() as service_port:
         self.service_port: int = service_port
     with reserve_free_port() as prom_port:
         self.prom_port: int = prom_port
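All of the snippets on this page exercise `bentoml.utils.reserve_free_port`. The utility itself is not shown here; as a minimal sketch, assuming it uses the conventional bind-to-port-0 trick (the `host` parameter is an assumption, not necessarily BentoML's actual signature), it behaves like this:

import socket
from contextlib import contextmanager

@contextmanager
def reserve_free_port(host: str = "localhost"):
    # Sketch only: bind to port 0 so the OS picks an unused port,
    # yield that number, then close the socket to release it.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((host, 0))
    port = sock.getsockname()[1]
    try:
        yield port
    finally:
        sock.close()  # only after this can another process bind the port

The port is guaranteed free only at the instant the socket closes, which is why every example below starts its server as soon as possible after the with-block exits.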
Example #2
    def serve(port, bento=None, with_conda=False, enable_microbatch=False):
        track_cli('serve')
        bento_service_bundle_path = resolve_bundle_path(
            bento, pip_installed_bundle_path)
        bento_service = load(bento_service_bundle_path)

        if with_conda:
            run_with_conda_env(
                bento_service_bundle_path,
                'bentoml serve {bento} --port {port} {flags}'.format(
                    bento=bento_service_bundle_path,
                    port=port,
                    flags="--enable-microbatch" if enable_microbatch else "",
                ),
            )
            return

        if enable_microbatch:
            with reserve_free_port() as api_server_port:
                # start the server right after the port is released
                # to keep the potential race window small
                marshal_server = MarshalService(
                    bento_service_bundle_path,
                    outbound_host="localhost",
                    outbound_port=api_server_port,
                    outbound_workers=1,
                )
                api_server = BentoAPIServer(bento_service,
                                            port=api_server_port)
            marshal_server.async_start(port=port)
            api_server.start()
        else:
            api_server = BentoAPIServer(bento_service, port=port)
            api_server.start()
Example #3
def start_dev_server(saved_bundle_path: str, port: int,
                     enable_microbatch: bool, run_with_ngrok: bool):
    logger.info("Starting BentoML API server in development mode..")

    from bentoml import load
    from bentoml.server.api_server import BentoAPIServer
    from bentoml.marshal.marshal import MarshalService
    from bentoml.utils import reserve_free_port

    bento_service = load(saved_bundle_path)

    if run_with_ngrok:
        from bentoml.utils.flask_ngrok import start_ngrok
        from threading import Timer

        thread = Timer(1, start_ngrok, args=(port,))
        thread.daemon = True
        thread.start()

    if enable_microbatch:
        with reserve_free_port() as api_server_port:
            # start the server right after the port is released
            # to keep the potential race window small
            marshal_server = MarshalService(
                saved_bundle_path,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=1,
            )
            api_server = BentoAPIServer(bento_service, port=api_server_port)
        marshal_server.async_start(port=port)
        api_server.start()
    else:
        api_server = BentoAPIServer(bento_service, port=port)
        api_server.start()
Example #4
def start_dev_server(
    saved_bundle_path: str,
    port: int = Provide[BentoMLContainer.config.api_server.port],
    enable_microbatch: bool = Provide[
        BentoMLContainer.config.api_server.enable_microbatch],
    mb_max_batch_size: int = Provide[
        BentoMLContainer.config.marshal_server.max_batch_size],
    mb_max_latency: int = Provide[
        BentoMLContainer.config.marshal_server.max_latency],
    run_with_ngrok: bool = Provide[
        BentoMLContainer.config.api_server.run_with_ngrok],
    enable_swagger: bool = Provide[
        BentoMLContainer.config.api_server.enable_swagger],
):
    logger.info("Starting BentoML API server in development mode..")

    import multiprocessing

    from bentoml.saved_bundle import load_from_dir
    from bentoml.server.api_server import BentoAPIServer
    from bentoml.utils import reserve_free_port

    if run_with_ngrok:
        from threading import Timer

        from bentoml.utils.flask_ngrok import start_ngrok

        thread = Timer(1, start_ngrok, args=(port,))
        thread.daemon = True
        thread.start()

    if enable_microbatch:
        with reserve_free_port() as api_server_port:
            # start the server right after the port is released
            # to keep the potential race window small

            marshal_proc = multiprocessing.Process(
                target=start_dev_batching_server,
                kwargs=dict(
                    api_server_port=api_server_port,
                    saved_bundle_path=saved_bundle_path,
                    port=port,
                    mb_max_latency=mb_max_latency,
                    mb_max_batch_size=mb_max_batch_size,
                ),
                daemon=True,
            )
        marshal_proc.start()

        bento_service = load_from_dir(saved_bundle_path)
        api_server = BentoAPIServer(bento_service,
                                    port=api_server_port,
                                    enable_swagger=enable_swagger)
        api_server.start()
    else:
        bento_service = load_from_dir(saved_bundle_path)
        api_server = BentoAPIServer(bento_service,
                                    port=port,
                                    enable_swagger=enable_swagger)
        api_server.start()
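Examples #3 and #4 share a pattern worth isolating: reserve a port, let the context manager release it, and immediately hand it to a daemonized child process that binds it, while the parent runs the front-facing server and tears the child down on exit. A hedged distillation (`run_backend` and `run_frontend` are illustrative callables, not BentoML API):

import multiprocessing

from bentoml.utils import reserve_free_port

def serve_with_backend(run_backend, run_frontend):
    with reserve_free_port() as backend_port:
        # The reserved port is still held here, so only *create* the child.
        backend = multiprocessing.Process(
            target=run_backend, args=(backend_port,), daemon=True,
        )
    backend.start()  # port just released; the child should bind it promptly
    try:
        run_frontend(backend_port)  # blocks in the foreground
    finally:
        backend.terminate()  # same try/finally teardown the examples use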
Example #5
    def serve_gunicorn(
        port,
        workers,
        timeout,
        bento=None,
        with_conda=False,
        enable_microbatch=False,
        microbatch_workers=1,
    ):
        track_cli('serve_gunicorn')
        bento_service_bundle_path = resolve_bundle_path(
            bento, pip_installed_bundle_path)

        if with_conda:
            run_with_conda_env(
                pip_installed_bundle_path,
                'bentoml serve_gunicorn {bento} -p {port} -w {workers} '
                '--timeout {timeout} {flags}'.format(
                    bento=bento_service_bundle_path,
                    port=port,
                    workers=workers,
                    timeout=timeout,
                    flags="--enable-microbatch" if enable_microbatch else "",
                ),
            )
            return

        if workers is None:
            workers = get_gunicorn_num_of_workers()

        from bentoml.server.gunicorn_server import GunicornBentoServer

        if enable_microbatch:
            prometheus_lock = multiprocessing.Lock()
            # avoid loading the model before gunicorn forks
            with reserve_free_port() as api_server_port:
                marshal_server = GunicornMarshalServer(
                    bundle_path=bento_service_bundle_path,
                    port=port,
                    workers=microbatch_workers,
                    prometheus_lock=prometheus_lock,
                    outbound_host="localhost",
                    outbound_port=api_server_port,
                    outbound_workers=workers,
                )

                gunicorn_app = GunicornBentoServer(
                    bento_service_bundle_path,
                    api_server_port,
                    workers,
                    timeout,
                    prometheus_lock,
                )
            marshal_server.async_run()
            gunicorn_app.run()
        else:
            gunicorn_app = GunicornBentoServer(bento_service_bundle_path, port,
                                               workers, timeout)
            gunicorn_app.run()
Example #6
def start_prod_server(
    saved_bundle_path: str,
    port: int,
    timeout: int,
    workers: int,
    enable_microbatch: bool,
    microbatch_workers: int,
    enable_swagger: bool,
):
    logger.info("Starting BentoML API server in production mode..")

    import psutil
    import multiprocessing

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.server.utils import get_gunicorn_num_of_workers
    from bentoml.utils import reserve_free_port

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid loading the model before gunicorn forks
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=saved_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )

            gunicorn_app = GunicornBentoServer(
                saved_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
                enable_swagger,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(saved_bundle_path,
                                           port,
                                           workers,
                                           timeout,
                                           enable_swagger=enable_swagger)
        gunicorn_app.run()
Example #7
def start_prod_server(
    saved_bundle_path: str,
    port: Optional[int] = None,
    workers: Optional[int] = None,
    timeout: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    microbatch_workers: Optional[int] = None,
    config_file: Optional[str] = None,
):
    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    config = BentoMLConfiguration(override_config_file=config_file)
    config.override(["api_server", "port"], port)
    config.override(["api_server", "workers"], workers)
    config.override(["api_server", "timeout"], timeout)
    config.override(["api_server", "enable_microbatch"], enable_microbatch)
    config.override(["api_server", "enable_swagger"], enable_swagger)
    config.override(["marshal_server", "max_batch_size"], mb_max_batch_size)
    config.override(["marshal_server", "max_latency"], mb_max_latency)
    config.override(["marshal_server", "workers"], microbatch_workers)

    if config.config['api_server'].get('enable_microbatch'):
        prometheus_lock = multiprocessing.Lock()
        with reserve_free_port() as api_server_port:
            pass  # empty body: we only need the port number; it is released here

        model_server_job = multiprocessing.Process(
            target=_start_prod_server,
            kwargs=dict(
                saved_bundle_path=saved_bundle_path,
                port=api_server_port,
                config=config,
                prometheus_lock=prometheus_lock,
            ),
            daemon=True,
        )
        model_server_job.start()

        try:
            _start_prod_batching_server(
                saved_bundle_path=saved_bundle_path,
                config=config,
                api_server_port=api_server_port,
                prometheus_lock=prometheus_lock,
            )
        finally:
            model_server_job.terminate()
    else:
        _start_prod_server(saved_bundle_path=saved_bundle_path, config=config)
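Example #7 (and #9 below) funnels every CLI option through `BentoMLConfiguration.override` before anything starts. The override semantics are not shown on this page; presumably a `None` value leaves the file or default value in place, which is what lets the `Optional[...] = None` parameters be passed through unconditionally:

from bentoml.configuration.containers import BentoMLConfiguration  # import path assumed

config = BentoMLConfiguration(override_config_file=None)
config.override(["api_server", "port"], 5000)     # explicit value wins
config.override(["api_server", "timeout"], None)  # presumably keeps the default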
Example #8
def start_prod_server(
    saved_bundle_path: str,
    port: Optional[int] = None,
    workers: Optional[int] = None,
    timeout: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    microbatch_workers: Optional[int] = None,
):

    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    prometheus_lock = multiprocessing.Lock()
    with reserve_free_port() as api_server_port:
        pass  # empty body: we only need the port number; it is released here

    model_server_job = multiprocessing.Process(
        target=_start_prod_server,
        kwargs=dict(
            saved_bundle_path=saved_bundle_path,
            port=api_server_port,
            timeout=timeout,
            workers=workers,
            prometheus_lock=prometheus_lock,
            enable_swagger=enable_swagger,
        ),
        daemon=True,
    )
    model_server_job.start()

    try:
        _start_prod_proxy(
            saved_bundle_path=saved_bundle_path,
            port=port,
            api_server_port=api_server_port,
            workers=microbatch_workers,
            timeout=timeout,
            outbound_workers=workers,
            enable_microbatch=enable_microbatch,
            mb_max_batch_size=mb_max_batch_size,
            mb_max_latency=mb_max_latency,
            prometheus_lock=prometheus_lock,
        )
    finally:
        model_server_job.terminate()
Example #9
def start_dev_server(
    bundle_path: str,
    port: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    run_with_ngrok: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    config_file: Optional[str] = None,
):
    config = BentoMLConfiguration(override_config_file=config_file)
    config.override(["api_server", "port"], port)
    config.override(["api_server", "enable_microbatch"], enable_microbatch)
    config.override(["api_server", "enable_swagger"], enable_swagger)
    config.override(["marshal_server", "max_batch_size"], mb_max_batch_size)
    config.override(["marshal_server", "max_latency"], mb_max_latency)

    if run_with_ngrok:
        from threading import Timer

        from bentoml.utils.flask_ngrok import start_ngrok

        thread = Timer(1, start_ngrok, args=(port,))
        thread.daemon = True
        thread.start()

    with reserve_free_port() as api_server_port:
        # start the server right after the port is released
        # to keep the potential race window small

        model_server_proc = multiprocessing.Process(
            target=_start_dev_server,
            kwargs=dict(
                api_server_port=api_server_port,
                saved_bundle_path=bundle_path,
                config=config,
            ),
            daemon=True,
        )
    model_server_proc.start()

    try:
        _start_dev_proxy(
            api_server_port=api_server_port,
            saved_bundle_path=bundle_path,
            config=config,
        )
    finally:
        model_server_proc.terminate()
Example #10
def start_dev_server(
    bundle_path: str,
    port: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    run_with_ngrok: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
):
    if run_with_ngrok:
        from threading import Timer

        from bentoml.utils.flask_ngrok import start_ngrok

        thread = Timer(1, start_ngrok, args=(port,))
        thread.daemon = True
        thread.start()

    with reserve_free_port() as api_server_port:
        # start the server right after the port is released
        # to keep the potential race window small

        model_server_proc = multiprocessing.Process(
            target=_start_dev_server,
            kwargs=dict(
                api_server_port=api_server_port,
                saved_bundle_path=bundle_path,
                enable_swagger=enable_swagger,
            ),
            daemon=True,
        )
    model_server_proc.start()

    try:
        _start_dev_proxy(
            port=port,
            api_server_port=api_server_port,
            saved_bundle_path=bundle_path,
            enable_microbatch=enable_microbatch,
            mb_max_batch_size=mb_max_batch_size,
            mb_max_latency=mb_max_latency,
        )
    finally:
        model_server_proc.terminate()
Example #11
def start_yatai_service_grpc_server(
    db_url,
    grpc_port,
    ui_port,
    with_ui,
    base_url,
    repository_type,
    file_system_directory,
    s3_url,
    s3_endpoint_url,
    gcs_url,
    web_ui_log_path: str = Provide[BentoMLContainer.yatai_logging_path],
):
    # Lazily import grpcio for YataiService gRPC-related actions
    import grpc
    from bentoml.yatai.db import DB
    from bentoml.yatai.repository import create_repository
    from bentoml.yatai.yatai_service_impl import get_yatai_service_impl
    from bentoml.yatai.proto.yatai_service_pb2_grpc import add_YataiServicer_to_server
    from bentoml.yatai.proto.yatai_service_pb2_grpc import YataiServicer

    YataiServicerImpl = get_yatai_service_impl(YataiServicer)
    yatai_service = YataiServicerImpl(
        repository=create_repository(repository_type, file_system_directory,
                                     s3_url, s3_endpoint_url, gcs_url),
        database=DB(db_url),
    )

    # Define interceptors here
    grpc_interceptors = [PromServerInterceptor(), ServiceLatencyInterceptor()]
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=10),
        interceptors=grpc_interceptors,
    )
    add_YataiServicer_to_server(yatai_service, server)
    debug_mode = get_debug_mode()
    if debug_mode:
        try:
            logger.debug("Enabling gRPC server reflection for debugging")
            from bentoml.yatai.proto import yatai_service_pb2
            from grpc_reflection.v1alpha import reflection

            SERVICE_NAMES = (
                yatai_service_pb2.DESCRIPTOR.services_by_name["Yatai"].full_name,
                reflection.SERVICE_NAME,
            )
            reflection.enable_server_reflection(SERVICE_NAMES, server)
        except ImportError:
            logger.debug(
                "Failed to enable gRPC server reflection; missing required "
                'package: "pip install grpcio-reflection"')
    server.add_insecure_port(f"[::]:{grpc_port}")

    # NOTE: the current implementation sets prometheus_port to
    # 50052 to accommodate Makefile setups, as there is currently
    # no way to call reserve_free_port from within a Makefile to
    # find a free port for prometheus_port without the help of a
    # shell script.
    prometheus_port = 50052
    with reserve_free_port() as port:
        prometheus_port = port
    # the port was released above, so the WSGI server won't see prometheus_port as in use
    start_http_server(prometheus_port)
    server.start()
    if with_ui:
        ensure_node_available_or_raise()
        yatai_grpc_server_address = f"localhost:{grpc_port}"
        prometheus_address = f"http://localhost:{prometheus_port}"
        async_start_yatai_service_web_ui(
            yatai_grpc_server_address,
            prometheus_address,
            ui_port,
            web_ui_log_path,
            debug_mode,
            base_url,
        )

    # We don't import the _echo function from click_utils because of a circular dependency
    if with_ui:
        if debug_mode:
            ui_port = 8080
        web_ui_link = f"http://127.0.0.1:{ui_port}"
        if base_url != ".":
            web_ui_link += f"/{base_url}"
        web_ui_message = f"running on {web_ui_link}"
    else:
        web_ui_message = "off"
    if debug_mode:
        prom_ui_message = "off"
    else:
        prom_ui_message = f"running on http://127.0.0.1:{ui_port}/metrics\n"

    click.echo(
        f"* Starting BentoML YataiService gRPC Server\n"
        f'* Debug mode: {"on" if debug_mode else "off"}\n'
        f"* Web UI: {web_ui_message}\n"
        f"* Running on 127.0.0.1:{grpc_port} (Press CTRL+C to quit)\n"
        f"* Prometheus: {prom_ui_message}\n"
        f"* Help and instructions: "
        f"https://docs.bentoml.org/en/latest/guides/yatai_service.html\n"
        f'{f"* Web server log can be found here: {web_ui_log_path}" if with_ui else ""}'
        f"\n-----\n"
        f"* Usage in Python:\n"
        f'*  bento_svc.save(yatai_url="127.0.0.1:{grpc_port}")\n'
        f"*  from bentoml.yatai.client import get_yatai_client\n"
        f'*  get_yatai_client("127.0.0.1:{grpc_port}").repository.list()\n'
        f"* Usage in CLI:\n"
        f"*  bentoml list --yatai-url=127.0.0.1:{grpc_port}\n"
        f"*  bentoml containerize IrisClassifier:latest --yatai-url=127.0.0.1:"
        f"{grpc_port}\n"
        f"*  bentoml push IrisClassifier:20200918001645_CD2886 --yatai-url=127.0.0.1:"
        f"{grpc_port}\n"
        f"*  bentoml pull IrisClassifier:20200918001645_CD2886 --yatai-url=127.0.0.1:"
        f"{grpc_port}\n"
        f"*  bentoml retrieve IrisClassifier:20200918001645_CD2886 "
        f'--yatai-url=127.0.0.1:{grpc_port} --target_dir="/tmp/foo/bar"\n'
        f"*  bentoml delete IrisClassifier:20200918001645_CD2886 "
        f"--yatai-url=127.0.0.1:{grpc_port}\n"
        # TODO: simplify the example usage here once related documentation is ready
    )

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        logger.info("Terminating YataiService gRPC server..")
        server.stop(grace=None)
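Example #11 pairs `reserve_free_port` with `prometheus_client.start_http_server`. Stripped of the Yatai specifics, the same metrics-endpoint idiom looks roughly like this (a sketch assuming the standard `prometheus_client` API):

from prometheus_client import start_http_server

from bentoml.utils import reserve_free_port

with reserve_free_port() as prometheus_port:
    pass  # body intentionally empty: we only need the number; the port frees on exit
start_http_server(prometheus_port)  # bind the just-released port to serve metrics
print(f"Prometheus metrics at http://127.0.0.1:{prometheus_port}/metrics")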
Example #12
    def serve_gunicorn(
        port,
        workers,
        timeout,
        bento=None,
        with_conda=False,
        enable_microbatch=False,
        microbatch_workers=1,
    ):
        if not psutil.POSIX:
            _echo(
                "The `bentoml serve_gunicorn` command is only supported on POSIX. "
                "On Windows, use `bentoml serve` for local API testing and "
                "Docker for running a production API endpoint: "
                "https://docs.docker.com/docker-for-windows/ "
            )
            return
        bento_service_bundle_path = resolve_bundle_path(
            bento, pip_installed_bundle_path
        )

        if with_conda:
            return run_with_conda_env(
                pip_installed_bundle_path,
                'bentoml serve_gunicorn {bento} -p {port} -w {workers} '
                '--timeout {timeout} {flags}'.format(
                    bento=bento_service_bundle_path,
                    port=port,
                    workers=workers,
                    timeout=timeout,
                    flags="--enable-microbatch" if enable_microbatch else "",
                ),
            )

        if workers is None:
            workers = get_gunicorn_num_of_workers()

        # Gunicorn only supports POSIX platforms
        from bentoml.server.gunicorn_server import GunicornBentoServer
        from bentoml.server.marshal_server import GunicornMarshalServer

        if enable_microbatch:
            prometheus_lock = multiprocessing.Lock()
            # avoid loading the model before gunicorn forks
            with reserve_free_port() as api_server_port:
                marshal_server = GunicornMarshalServer(
                    bundle_path=bento_service_bundle_path,
                    port=port,
                    workers=microbatch_workers,
                    prometheus_lock=prometheus_lock,
                    outbound_host="localhost",
                    outbound_port=api_server_port,
                    outbound_workers=workers,
                )

                gunicorn_app = GunicornBentoServer(
                    bento_service_bundle_path,
                    api_server_port,
                    workers,
                    timeout,
                    prometheus_lock,
                )
            marshal_server.async_run()
            gunicorn_app.run()
        else:
            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path, port, workers, timeout
            )
            gunicorn_app.run()
Example #13
def start_prod_server(
    saved_bundle_path: str,
    port: int = Provide[BentoMLContainer.config.api_server.port],
    timeout: int = Provide[BentoMLContainer.config.api_server.timeout],
    workers: int = Provide[BentoMLContainer.api_server_workers],
    enable_microbatch: bool = Provide[
        BentoMLContainer.config.api_server.enable_microbatch],
    mb_max_batch_size: int = Provide[
        BentoMLContainer.config.marshal_server.max_batch_size],
    mb_max_latency: int = Provide[
        BentoMLContainer.config.marshal_server.max_latency],
    microbatch_workers: int = Provide[
        BentoMLContainer.config.marshal_server.workers],
    enable_swagger: bool = Provide[
        BentoMLContainer.config.api_server.enable_swagger],
):
    logger.info("Starting BentoML API server in production mode..")

    import multiprocessing

    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.utils import reserve_free_port

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid loading the model before gunicorn forks
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=saved_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
                mb_max_batch_size=mb_max_batch_size,
                mb_max_latency=mb_max_latency,
            )

            gunicorn_app = GunicornBentoServer(
                saved_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
                enable_swagger,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(saved_bundle_path,
                                           port,
                                           workers,
                                           timeout,
                                           enable_swagger=enable_swagger)
        gunicorn_app.run()