Ejemplo n.º 1
0
def test_server_multiproc_threading(set_timeout, restore_signal):
    """Run three thread-based workers and check each starts and stops once.

    The counters are plain Python objects guarded by a ``threading.Lock``
    because the workers are started with ``use_threading=True``.
    """
    guard = threading.Lock()
    counters = {'started': 0, 'terminated': 0}
    seen_indices = [0] * 3

    def send_sigint():
        # pid 0 addresses the whole process group of the caller.
        os.kill(0, signal.SIGINT)

    @aiotools.actxmgr
    async def myserver(loop, proc_idx, args):
        # Part before the yield: record that this worker came up.
        await asyncio.sleep(0)
        with guard:
            counters['started'] += 1
            seen_indices[proc_idx] = proc_idx

        yield

        # Part after the yield: record that this worker went down.
        await asyncio.sleep(0)
        with guard:
            counters['terminated'] += 1

    set_timeout(0.2, send_sigint)
    aiotools.start_server(myserver, use_threading=True, num_workers=3)

    assert counters['started'] == 3
    assert counters['terminated'] == 3
    assert seen_indices == [0, 1, 2]
Ejemplo n.º 2
0
def test_server_user_main_threading(set_timeout, restore_signal):
    """Check that a user main context wraps thread-based workers.

    The worker asserts that the value yielded by ``mymain`` arrives as its
    first argument, followed by the ``args`` passed to ``start_server``.
    """
    main_state = {'entered': False, 'exited': False}

    def send_sigint():
        os.kill(0, signal.SIGINT)

    @contextlib.contextmanager
    def mymain():
        main_state['entered'] = True
        yield 987
        main_state['exited'] = True

    @aiotools.actxmgr
    async def myworker(loop, proc_idx, args):
        assert args[0] == 987  # first arg from user main
        assert args[1] == 123  # second arg from start_server args
        yield

    set_timeout(0.2, send_sigint)
    aiotools.start_server(
        myworker,
        mymain,
        args=(123, ),
        use_threading=True,
        num_workers=3,
    )

    assert main_state['entered']
    assert main_state['exited']
Ejemplo n.º 3
0
def test_server_multiproc(set_timeout, restore_signal):
    """Run three worker processes and verify the start/stop bookkeeping.

    The counters are ``multiprocessing`` shared objects that are handed to
    the workers through ``args``.
    """
    start_count = mp.Value('i', 0)
    stop_count = mp.Value('i', 0)
    index_slots = mp.Array('i', 3)

    def send_sigint():
        os.kill(0, signal.SIGINT)

    @aiotools.actxmgr
    async def myserver(loop, proc_idx, args):
        n_started, n_terminated, slots = args
        await asyncio.sleep(0)
        with n_started.get_lock():
            n_started.value += 1
        slots[proc_idx] = proc_idx

        yield

        await asyncio.sleep(0)
        with n_terminated.get_lock():
            n_terminated.value += 1

    set_timeout(0.2, send_sigint)
    aiotools.start_server(
        myserver,
        args=(start_count, stop_count, index_slots),
        num_workers=3,
    )

    assert start_count.value == 3
    assert stop_count.value == 3
    assert list(index_slots) == [0, 1, 2]
    # No worker process may be left behind once start_server() returns.
    assert len(mp.active_children()) == 0
Ejemplo n.º 4
0
def main():
    """Configure logging and shared state, then run the gateway workers.

    One worker is started per CPU reported by ``os.cpu_count()`` (falling
    back to a single worker when the count is unknown), plus the
    ``event_router`` extra process.  A ``SyncManager`` provides the lock,
    barrier and dict that are shared with the workers; it is shut down when
    the server returns or fails.
    """
    config = load_config(extra_args_func=gw_args)
    init_logger(config)

    log.info(f'Backend.AI Gateway {__version__}')
    log.info(f'runtime: {env_info()}')

    log_config = logging.getLogger('ai.backend.gateway.config')
    log_config.debug('debug mode enabled.')

    if config.debug:
        aiohttp.log.server_logger.setLevel('DEBUG')
        aiohttp.log.access_logger.setLevel('DEBUG')
    else:
        aiohttp.log.server_logger.setLevel('WARNING')
        aiohttp.log.access_logger.setLevel('WARNING')

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to one worker so that Barrier() and start_server() always
    # receive a valid integer.
    num_workers = os.cpu_count() or 1
    manager = SyncManager()
    # The manager process ignores SIGINT; it is stopped explicitly below.
    manager.start(lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))

    try:
        # Build the shared state inside the try block so that the manager
        # process is shut down even if creating a proxy object fails.
        shared_states = manager.Namespace()
        shared_states.lock = manager.Lock()
        shared_states.barrier = manager.Barrier(num_workers)
        shared_states.agent_last_seen = manager.dict()

        aiotools.start_server(server_main, num_workers=num_workers,
                              extra_procs=[event_router],
                              args=(config, shared_states))
    finally:
        manager.shutdown()
        log.info('terminated.')
Ejemplo n.º 5
0
def test_server_extra_proc_custom_stop_signal(set_timeout, restore_signal):
    """Check that extra processes see the custom stop signal (SIGUSR1).

    Each extra process stores the first argument of the
    ``InterruptedBySignal`` exception it catches into a shared array.
    """
    signal_slots = mp.Array('i', [0, 0])

    def send_sigusr1():
        os.kill(os.getpid(), signal.SIGUSR1)

    def extra_proc(key, _, pidx, args):
        slots = args[0]
        try:
            # Idle until the server interrupts this process.
            while True:
                time.sleep(0.1)
        except aiotools.InterruptedBySignal as exc:
            slots[key] = exc.args[0]

    @aiotools.server
    async def myworker(loop, pidx, args):
        yield

    set_timeout(0.3, send_sigusr1)
    extra_targets = [functools.partial(extra_proc, slot) for slot in (0, 1)]
    aiotools.start_server(
        myworker,
        num_workers=3,
        args=(signal_slots, ),
        stop_signals={signal.SIGUSR1},
        extra_procs=extra_targets,
    )

    assert signal_slots[0] == signal.SIGUSR1
    assert signal_slots[1] == signal.SIGUSR1
Ejemplo n.º 6
0
def test_server_user_main_custom_stop_signals(set_timeout, restore_signal):
    """Check that a custom stop signal reaches the user main and all workers.

    Both ``mymain`` and every worker record the value sent into their
    ``yield``; the test asserts that this value equals ``SIGUSR1``.
    """
    main_enter = False
    main_exit = False
    main_signal = None
    worker_signals = mp.Array('i', 3)

    @aiotools.main
    def mymain():
        nonlocal main_enter, main_exit, main_signal
        main_enter = True
        # Whatever is sent into the generator is kept as the stop signal.
        main_signal = yield
        main_exit = True

    @aiotools.server
    async def myworker(loop, proc_idx, args):
        worker_signals = args[0]
        # Store the value received at the yield in this worker's slot.
        worker_signals[proc_idx] = yield

    def interrupt():
        # Signal only this process (not the whole group) with SIGUSR1.
        os.kill(os.getpid(), signal.SIGUSR1)

    set_timeout(0.2, interrupt)
    aiotools.start_server(myworker,
                          mymain,
                          num_workers=3,
                          stop_signals={signal.SIGUSR1},
                          args=(worker_signals, ))

    assert main_enter
    assert main_exit
    assert main_signal == signal.SIGUSR1
    assert list(worker_signals) == [signal.SIGUSR1] * 3
Ejemplo n.º 7
0
def test_server_user_main_tuple(set_timeout, restore_signal):
    """Check that a tuple yielded by the user main is spread into worker args.

    The worker asserts that both values yielded by ``mymain`` come first,
    followed by the ``args`` passed to ``start_server``.
    """
    main_state = {'entered': False, 'exited': False}

    def send_sigint():
        os.kill(os.getpid(), signal.SIGINT)

    @aiotools.main
    def mymain():
        main_state['entered'] = True
        yield 987, 654
        main_state['exited'] = True

    @aiotools.server
    async def myworker(loop, proc_idx, args):
        assert args[0] == 987  # first arg from user main
        assert args[1] == 654  # second arg from user main
        assert args[2] == 123  # third arg from start_server args
        yield

    set_timeout(0.2, send_sigint)
    aiotools.start_server(
        myworker,
        mymain,
        args=(123, ),
        num_workers=3,
    )

    assert main_state['entered']
    assert main_state['exited']
Ejemplo n.º 8
0
def test_server_user_main(mocker, set_timeout, restore_signal, start_method):
    """Check the user main hook with the given multiprocessing start method.

    ``aiotools.server.mp`` is patched with the context obtained for
    ``start_method`` before the server is started.
    """
    ctx = mp.get_context(start_method)
    mocker.patch('aiotools.server.mp', ctx)

    main_state = {'entered': False, 'exited': False}

    # FIXME: This should work with start_method = "spawn", but to test with it
    #        we need to allow passing arguments to user-provided main functions.

    @aiotools.main
    def mymain_user_main():
        main_state['entered'] = True
        yield 987
        main_state['exited'] = True

    @aiotools.server  # type: ignore
    async def myworker_user_main(loop, proc_idx, args):
        assert args[0] == 987  # first arg from user main
        assert args[1] == 123  # second arg from start_server args
        yield

    set_timeout(0.2, interrupt)
    aiotools.start_server(
        myworker_user_main,
        mymain_user_main,
        args=(123, ),
        num_workers=3,
    )

    assert main_state['entered']
    assert main_state['exited']
Ejemplo n.º 9
0
def main(ctx, config_path, debug):
    # Load the configuration and, unless a subcommand was requested, run the
    # manager server.  The PID file is written before startup and removed
    # again once the server has returned or failed.
    # (Written as comments rather than a docstring: if this function is
    # registered as a Click command -- TODO confirm -- a docstring would be
    # displayed as its help text.)
    cfg = load_config(config_path, debug)

    if ctx.invoked_subcommand is not None:
        # Click is going to invoke a subcommand.
        return

    manager_cfg = cfg['manager']
    manager_cfg['pid-file'].write_text(str(os.getpid()))
    try:
        with Logger(cfg['logging']):
            namespace = cfg['etcd']['namespace']
            setproctitle(f"backend.ai: manager {namespace}")
            log.info('Backend.AI Gateway {0}', __version__)
            log.info('runtime: {0}', env_info())
            logging.getLogger('ai.backend.gateway.config').debug(
                'debug mode enabled.')

            if manager_cfg['event-loop'] == 'uvloop':
                uvloop.install()
                log.info('Using uvloop as the event loop backend')
            try:
                aiotools.start_server(
                    server_main,
                    args=(cfg, ),
                    extra_procs=[event_router],
                    num_workers=manager_cfg['num-proc'],
                )
            finally:
                log.info('terminated.')
    finally:
        pid_file = cfg['manager']['pid-file']
        # check is_file() to prevent deleting /dev/null!
        if pid_file.is_file():
            pid_file.unlink()
Ejemplo n.º 10
0
def main():
    """Load the configuration, set up logging, and run the gateway server.

    aiohttp's server/access loggers are set to DEBUG when ``config.debug``
    is true and to WARNING otherwise; uvloop is installed as the event loop
    policy before the workers start.
    """
    config = load_config(extra_args_funcs=(gw_args, Logger.update_log_args))
    logger = Logger(config)
    for pkg_name in ('aiotools', 'aiopg', 'ai.backend'):
        logger.add_pkg(pkg_name)

    with logger:
        log.info(f'Backend.AI Gateway {__version__}')
        log.info(f'runtime: {env_info()}')
        logging.getLogger('ai.backend.gateway.config').debug(
            'debug mode enabled.')

        aiohttp_level = 'DEBUG' if config.debug else 'WARNING'
        aiohttp.log.server_logger.setLevel(aiohttp_level)
        aiohttp.log.access_logger.setLevel(aiohttp_level)

        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
        try:
            aiotools.start_server(
                server_main,
                args=(config, ),
                extra_procs=[event_router],
                num_workers=config.num_proc,
            )
        finally:
            log.info('terminated.')
Ejemplo n.º 11
0
def test_server_singleproc_threading(restore_signal):
    """Run one thread-based worker and check it starts and stops exactly once.

    The worker itself schedules the SIGINT that ends the server, using
    ``loop.call_later``.
    """
    guard = threading.Lock()
    counters = {'started': 0, 'terminated': 0}

    def send_sigint():
        os.kill(0, signal.SIGINT)

    @aiotools.actxmgr
    async def myserver(loop, proc_idx, args):
        # With default settings the only worker has index 0 and no args.
        assert proc_idx == 0
        assert len(args) == 0
        await asyncio.sleep(0)
        with guard:
            counters['started'] += 1
        loop.call_later(0.2, send_sigint)

        yield

        await asyncio.sleep(0)
        with guard:
            counters['terminated'] += 1

    aiotools.start_server(myserver, use_threading=True)

    assert counters['started'] == 1
    assert counters['terminated'] == 1
Ejemplo n.º 12
0
def test_server_singleproc(restore_signal):
    """Run a single worker and check it starts and stops exactly once.

    The counters are ``multiprocessing.Value`` objects captured by the
    worker closure; the worker schedules its own SIGINT via
    ``loop.call_later``.
    """
    start_count = mp.Value('i', 0)
    stop_count = mp.Value('i', 0)

    def send_sigint():
        os.kill(0, signal.SIGINT)

    @aiotools.actxmgr
    async def myserver(loop, proc_idx, args):
        assert proc_idx == 0
        assert len(args) == 0
        await asyncio.sleep(0)
        with start_count.get_lock():
            start_count.value += 1
        loop.call_later(0.2, send_sigint)

        yield

        await asyncio.sleep(0)
        with stop_count.get_lock():
            stop_count.value += 1

    aiotools.start_server(myserver)

    assert start_count.value == 1
    assert stop_count.value == 1
Ejemplo n.º 13
0
def test_server_singleproc(mocker, set_timeout, restore_signal, start_method):
    """Run a single worker under the given multiprocessing start method.

    The module-level ``myserver_singleproc`` worker receives the two shared
    counters through ``args``; each is expected to end up at 1.
    """
    ctx = mp.get_context(start_method)
    mocker.patch('aiotools.server.mp', ctx)

    start_count = ctx.Value('i', 0)
    stop_count = ctx.Value('i', 0)
    shared = (start_count, stop_count)

    set_timeout(0.2, interrupt)
    aiotools.start_server(myserver_singleproc, args=shared)

    assert start_count.value == 1
    assert stop_count.value == 1
Ejemplo n.º 14
0
def test_server_multiproc(mocker, set_timeout, restore_signal, start_method):
    """Run three workers under the given multiprocessing start method.

    The module-level ``myserver_multiproc`` worker receives the shared
    counters and the index array through ``args``.
    """
    ctx = mp.get_context(start_method)
    mocker.patch('aiotools.server.mp', ctx)

    start_count = ctx.Value('i', 0)
    stop_count = ctx.Value('i', 0)
    index_slots = ctx.Array('i', 3)

    set_timeout(0.2, interrupt)
    aiotools.start_server(
        myserver_multiproc,
        args=(start_count, stop_count, index_slots),
        num_workers=3,
    )

    assert start_count.value == 3
    assert stop_count.value == 3
    assert list(index_slots) == [0, 1, 2]
    # No worker process may be left behind once start_server() returns.
    assert len(mp.active_children()) == 0
Ejemplo n.º 15
0
def test_server_extra_proc_threading(set_timeout, restore_signal):
    """Check that thread-based extra procs stop via their interrupt event."""
    # With extra_procs running as threads, the synchronous loop needs an
    # explicit way to be interrupted.  Here that is the threading.Event
    # object passed to each extra proc as its second positional argument.

    guard = threading.Lock()
    marks = [0, 0]

    def send_sigint():
        os.kill(0, signal.SIGINT)

    def extra_proc(key, intr_event, pidx, args):
        assert isinstance(intr_event, threading.Event)
        with guard:
            marks[key] = 980 + key  # "running" marker
        try:
            while not intr_event.is_set():
                time.sleep(0.1)
        except Exception as exc:
            print(f'extra[{key}] exception', exc)
        finally:
            with guard:
                marks[key] = 990 + key  # "finished" marker

    @aiotools.actxmgr
    async def myworker(loop, pidx, args):
        yield

    set_timeout(0.2, send_sigint)
    extra_targets = [functools.partial(extra_proc, slot) for slot in (0, 1)]
    aiotools.start_server(
        myworker,
        args=(123, ),
        num_workers=3,
        use_threading=True,
        extra_procs=extra_targets,
    )

    assert marks[0] == 990
    assert marks[1] == 991
Ejemplo n.º 16
0
def main(ctx: click.Context, config_path: Path, debug: bool) -> None:
    # Load the configuration and, unless a subcommand was requested, run the
    # manager server with a master logger listening on a per-process IPC
    # socket.  The PID file is written before startup and removed again
    # once the server has returned or failed.
    # (Written as comments rather than a docstring: if this function is
    # registered as a Click command -- TODO confirm -- a docstring would be
    # displayed as its help text.)
    cfg = load_config(config_path, debug)

    if ctx.invoked_subcommand is not None:
        # Click is going to invoke a subcommand.
        return

    manager_cfg = cfg['manager']
    manager_cfg['pid-file'].write_text(str(os.getpid()))

    log_sockpath = Path(
        f'/tmp/backend.ai/ipc/manager-logger-{os.getpid()}.sock')
    log_sockpath.parent.mkdir(parents=True, exist_ok=True)
    log_endpoint = f'ipc://{log_sockpath}'

    try:
        with Logger(cfg['logging'],
                    is_master=True,
                    log_endpoint=log_endpoint):
            namespace = cfg['etcd']['namespace']
            setproctitle(f"backend.ai: manager {namespace}")
            log.info('Backend.AI Gateway {0}', __version__)
            log.info('runtime: {0}', env_info())
            logging.getLogger('ai.backend.gateway.config').debug(
                'debug mode enabled.')

            if manager_cfg['event-loop'] == 'uvloop':
                import uvloop
                uvloop.install()
                log.info('Using uvloop as the event loop backend')
            try:
                aiotools.start_server(
                    server_main_logwrapper,
                    args=(cfg, log_endpoint),
                    num_workers=manager_cfg['num-proc'],
                )
            finally:
                log.info('terminated.')
    finally:
        pid_file = cfg['manager']['pid-file']
        # check is_file() to prevent deleting /dev/null!
        if pid_file.is_file():
            pid_file.unlink()
Ejemplo n.º 17
0
def test_server_multiproc_custom_stop_signals(mocker, set_timeout,
                                              restore_signal, start_method):
    """Run two workers that are stopped with SIGUSR1 instead of the default.

    The module-level worker receives the shared counters and arrays through
    ``args``; both workers are expected to record ``SIGUSR1``.
    """
    ctx = mp.get_context(start_method)
    mocker.patch('aiotools.server.mp', ctx)

    start_count = ctx.Value('i', 0)
    stop_count = ctx.Value('i', 0)
    signal_slots = ctx.Array('i', 2)
    index_slots = ctx.Array('i', 2)

    set_timeout(0.2, interrupt_usr1)
    aiotools.start_server(
        myserver_multiproc_custom_stop_signals,
        args=(start_count, stop_count, signal_slots, index_slots),
        stop_signals={signal.SIGUSR1},
        num_workers=2,
    )

    assert start_count.value == 2
    assert stop_count.value == 2
    assert list(signal_slots) == [signal.SIGUSR1, signal.SIGUSR1]
    assert list(index_slots) == [0, 1]
    assert len(ctx.active_children()) == 0
Ejemplo n.º 18
0
def test_server_worker_init_error(mocker, restore_signal, use_threading,
                                  start_method):
    """Check that the server shuts down when one worker fails to initialize.

    Three workers are started; the assertions expect all three to have
    started, two to have terminated normally, and at least one ERROR record
    mentioning "initialization" on the log queue.
    """
    ctx = mp.get_context(start_method)
    mocker.patch('aiotools.server.mp', ctx)

    start_count = ctx.Value('i', 0)
    stop_count = ctx.Value('i', 0)
    log_queue = ctx.Queue()

    # No interrupt is scheduled: it should automatically shut down!
    aiotools.start_server(
        myserver_worker_init_error,
        args=(start_count, stop_count, log_queue),
        use_threading=use_threading,
        num_workers=3,
    )

    # reset logging
    logging.shutdown()

    assert start_count.value == 3
    # Workers that did not raise had already started, and they should have
    # terminated normally when the erroneous worker interrupted the main
    # loop.
    assert stop_count.value == 2
    assert len(mp.active_children()) == 0
    assert not log_queue.empty()

    error_seen = False
    while not log_queue.empty():
        record = log_queue.get()
        if record.levelname != 'ERROR':
            continue
        error_seen = True
        assert 'initialization' in record.message
        # Exception info is logged to the console, but we cannot access it
        # here because exceptions are not picklable.
        assert record.exc_info is None
    assert error_seen
Ejemplo n.º 19
0
def test_server_worker_init_error_multi(mocker, restore_signal, use_threading,
                                        start_method):
    """Check server shutdown when worker initialization fails (multi case).

    Only lower bounds are asserted on the counters; the test then requires
    that no child process is left and that an ERROR record mentioning
    "initialization" was queued.
    """
    ctx = mp.get_context(start_method)
    mocker.patch('aiotools.server.mp', ctx)

    start_count = ctx.Value('i', 0)
    stop_count = ctx.Value('i', 0)
    log_queue = ctx.Queue()

    # No interrupt is scheduled: it should automatically shut down!
    aiotools.start_server(
        myserver_worker_init_error_multi,
        args=(start_count, stop_count, log_queue),
        use_threading=use_threading,
        num_workers=3,
    )

    # reset logging
    logging.shutdown()

    assert start_count.value >= 1
    # non-errored workers should have been terminated normally.
    assert stop_count.value >= 1
    # there is one worker remaining -- which is "cancelled"!
    # just ensure that all workers have terminated now.
    assert len(ctx.active_children()) == 0
    assert not log_queue.empty()

    error_seen = False
    while not log_queue.empty():
        record = log_queue.get()
        if record.levelname != 'ERROR':
            continue
        error_seen = True
        assert 'initialization' in record.message
        # Exception info is logged to the console, but we cannot access it
        # here because exceptions are not picklable.
        assert record.exc_info is None
    assert error_seen
Ejemplo n.º 20
0
def test_server_extra_proc(set_timeout, restore_signal):
    """Check that process-based extra procs run and are interrupted cleanly.

    Each extra proc writes ``980 + key`` on entry and ``990 + key`` in its
    ``finally`` clause; the test expects the latter values at the end.
    """
    marks = mp.Array('i', [0, 0])

    def send_sigint():
        os.kill(0, signal.SIGINT)

    def extra_proc(key, _, pidx, args):
        # The second positional argument is expected to be None here.
        assert _ is None
        marks[key] = 980 + key
        try:
            while True:
                time.sleep(0.1)
        except KeyboardInterrupt:
            print(f'extra[{key}] interrupted', file=sys.stderr)
        except Exception as exc:
            print(f'extra[{key}] exception', exc, file=sys.stderr)
        finally:
            print(f'extra[{key}] finish', file=sys.stderr)
            marks[key] = 990 + key

    @aiotools.actxmgr
    async def myworker(loop, pidx, args):
        yield

    set_timeout(0.2, send_sigint)
    extra_targets = [functools.partial(extra_proc, slot) for slot in (0, 1)]
    aiotools.start_server(
        myworker,
        args=(123, ),
        num_workers=3,
        extra_procs=extra_targets,
    )

    assert marks[0] == 990
    assert marks[1] == 991
Ejemplo n.º 21
0
def main(cli_ctx, config_path, debug):
    # Entry point of the storage agent: read and validate the configuration,
    # apply environment/debug overrides, refuse unusable RPC addresses and
    # non-root users, and run the server when no subcommand is invoked.
    # Returns 0; validation failures abort with click.Abort.
    # (Written as comments rather than a docstring: if this function is
    # registered as a Click command -- TODO confirm -- a docstring would be
    # displayed as its help text.)
    volume_config_iv = t.Dict({
        t.Key('etcd'):
        t.Dict({
            t.Key('namespace'): t.String,
            t.Key('addr'): tx.HostPortPair(allow_blank_host=False)
        }).allow_extra('*'),
        t.Key('logging'):
        t.Any,  # checked in ai.backend.common.logging
        t.Key('agent'):
        t.Dict({
            t.Key('mode'): t.Enum('scratch', 'vfolder'),
            t.Key('rpc-listen-addr'): tx.HostPortPair(allow_blank_host=True),
            t.Key('user-uid'): t.Int,
            t.Key('user-gid'): t.Int
        }),
        t.Key('storage'):
        t.Dict({
            t.Key('mode'): t.Enum('xfs', 'btrfs'),
            t.Key('path'): t.String
        })
    }).allow_extra('*')

    # Determine where to read configuration.
    raw_cfg, cfg_src_path = config.read_from_file(config_path, 'agent')

    # Environment variables take precedence over values from the file.
    config.override_with_env(raw_cfg, ('etcd', 'namespace'),
                             'BACKEND_NAMESPACE')
    config.override_with_env(raw_cfg, ('etcd', 'addr'), 'BACKEND_ETCD_ADDR')
    config.override_with_env(raw_cfg, ('etcd', 'user'), 'BACKEND_ETCD_USER')
    config.override_with_env(raw_cfg, ('etcd', 'password'),
                             'BACKEND_ETCD_PASSWORD')
    config.override_with_env(raw_cfg, ('agent', 'rpc-listen-addr', 'host'),
                             'BACKEND_AGENT_HOST_OVERRIDE')
    config.override_with_env(raw_cfg, ('agent', 'rpc-listen-addr', 'port'),
                             'BACKEND_AGENT_PORT')

    if debug:
        config.override_key(raw_cfg, ('debug', 'enabled'), True)
        config.override_key(raw_cfg, ('logging', 'level'), 'DEBUG')
        config.override_key(raw_cfg, ('logging', 'pkg-ns', 'ai.backend'),
                            'DEBUG')

    try:
        cfg = config.check(raw_cfg, volume_config_iv)
        cfg['_src'] = cfg_src_path
    except config.ConfigurationError as e:
        print(
            'ConfigurationError: Validation of agent configuration has failed:',
            file=sys.stderr)
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    rpc_host = cfg['agent']['rpc-listen-addr'].host
    if (isinstance(rpc_host, BaseIPAddress)
            and (rpc_host.is_unspecified or rpc_host.is_link_local)):
        print(
            'ConfigurationError: '
            'Cannot use link-local or unspecified IP address as the RPC listening host.',
            file=sys.stderr)
        raise click.Abort()

    if os.getuid() != 0:
        print('Storage agent can only be run as root', file=sys.stderr)
        raise click.Abort()

    if cli_ctx.invoked_subcommand is None:
        setproctitle('Backend.AI: Storage Agent')
        logger = Logger(cfg['logging'])
        with logger:
            # The message needs a placeholder for VERSION; without one the
            # argument is never rendered into the log line.
            # NOTE(review): brace-style formatting is assumed here, matching
            # the other `log.info('... {0}', x)` calls in this code base.
            log.info('Backend.AI Storage Agent {0}', VERSION)

            log_config = logging.getLogger('ai.backend.agent.config')
            if debug:
                log_config.debug('debug mode enabled.')

            if 'debug' in cfg and cfg['debug']['enabled']:
                print('== Agent configuration ==')
                pprint(cfg)

            aiotools.start_server(server_main,
                                  num_workers=1,
                                  use_threading=True,
                                  args=(cfg, ))
            log.info('exit.')
    return 0
Ejemplo n.º 22
0
def main(cli_ctx, config_path, debug):
    # Entry point of the storage proxy: read and validate the configuration,
    # apply environment/debug overrides, and run the server processes when
    # no subcommand is invoked.  Returns 0; validation failures abort with
    # click.Abort.  The PID file is removed once the server has returned or
    # failed.
    # (Written as comments rather than a docstring: if this function is
    # registered as a Click command -- TODO confirm -- a docstring would be
    # displayed as its help text.)

    # Determine where to read configuration.
    raw_cfg, cfg_src_path = config.read_from_file(config_path, 'storage-proxy')

    # Environment variables take precedence over values from the file.
    config.override_with_env(raw_cfg, ('etcd', 'namespace'),
                             'BACKEND_NAMESPACE')
    config.override_with_env(raw_cfg, ('etcd', 'addr'), 'BACKEND_ETCD_ADDR')
    config.override_with_env(raw_cfg, ('etcd', 'user'), 'BACKEND_ETCD_USER')
    config.override_with_env(raw_cfg, ('etcd', 'password'),
                             'BACKEND_ETCD_PASSWORD')
    if debug:
        config.override_key(raw_cfg, ('debug', 'enabled'), True)

    try:
        local_config = config.check(raw_cfg, local_config_iv)
        local_config['_src'] = cfg_src_path
    except config.ConfigurationError as e:
        print(
            'ConfigurationError: Validation of agent configuration has failed:',
            file=sys.stderr)
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    if local_config['debug']['enabled']:
        config.override_key(local_config, ('logging', 'level'), 'DEBUG')
        config.override_key(local_config, ('logging', 'pkg-ns', 'ai.backend'),
                            'DEBUG')

    # if os.getuid() != 0:
    #     print('Storage agent can only be run as root', file=sys.stderr)
    #     raise click.Abort()

    multiprocessing.set_start_method('spawn')

    if cli_ctx.invoked_subcommand is None:
        local_config['storage-proxy']['pid-file'].write_text(str(os.getpid()))
        # The master logger listens on a per-process IPC socket whose
        # endpoint is also stored in the logging config.
        log_sockpath = Path(
            f'/tmp/backend.ai/ipc/storage-proxy-logger-{os.getpid()}.sock')
        log_sockpath.parent.mkdir(parents=True, exist_ok=True)
        log_endpoint = f'ipc://{log_sockpath}'
        local_config['logging']['endpoint'] = log_endpoint
        try:
            logger = Logger(local_config['logging'],
                            is_master=True,
                            log_endpoint=log_endpoint)
            with logger:
                setproctitle('backend.ai: storage-proxy')
                # The message needs a placeholder for VERSION; without one
                # the argument is never rendered into the log line.
                log.info('Backend.AI Storage Proxy {0}', VERSION)
                log.info('Runtime: {0}', env_info())
                log.info('Node ID: {0}',
                         local_config['storage-proxy']['node-id'])
                log_config = logging.getLogger('ai.backend.agent.config')
                if local_config['debug']['enabled']:
                    log_config.debug('debug mode enabled.')
                if 'debug' in local_config and local_config['debug']['enabled']:
                    print('== Storage proxy configuration ==')
                    pprint(local_config)
                if local_config['storage-proxy']['event-loop'] == 'uvloop':
                    import uvloop
                    uvloop.install()
                    log.info('Using uvloop as the event loop backend')
                aiotools.start_server(
                    server_main_logwrapper,
                    use_threading=False,
                    num_workers=local_config['storage-proxy']['num-proc'],
                    args=(local_config, log_endpoint),
                )
                log.info('exit.')
        finally:
            if local_config['storage-proxy']['pid-file'].is_file():
                # check is_file() to prevent deleting /dev/null!
                local_config['storage-proxy']['pid-file'].unlink()
    return 0
Ejemplo n.º 23
0
    router.connect('ipc://example-events')

    async def process_incoming(router):
        while True:
            data = await router.recv()
            if not data:
                return
            log.info(data)

    task = loop.create_task(process_incoming(router))
    log.info('started')

    try:
        yield
    finally:
        await task
        router.close()
        zctx.term()
        log.info('terminated')


if __name__ == '__main__':
    # This example must be run with multiprocessing.
    # Hence use_threading=False and the explicit 'spawn' start method;
    # router_main is passed as an extra process next to the worker_main
    # workers.  The call's result is bound to ``server`` but is not used
    # again in the visible code.
    server = aiotools.start_server(
        worker_main,
        use_threading=False,
        num_workers=num_workers,
        extra_procs=[router_main],
        start_method='spawn',
    )
Ejemplo n.º 24
0
        while True:
            log.value += 1
            await asyncio.sleep(1)


@aiotools.actxmgr
async def worker_main(loop, pidx, args):
    """Serve the aiohttp application on port 8888 for one worker.

    Before the ``yield`` the worker starts the ``display_log`` background
    task and binds a listening server on ``0.0.0.0:8888`` with
    ``reuse_port=True``.  After the ``yield`` it cancels the background
    task and shuts the server and the application down.

    :param loop: event loop passed in by the caller (rebound below to the
        current event loop, as in the original code).
    :param pidx: worker index, forwarded to ``display_log``.
    :param args: extra arguments; unused here.
    """
    app = web.Application()
    loop = asyncio.get_event_loop()
    # Keep a handle on the background task so it can be stopped at shutdown.
    log_task = loop.create_task(display_log(pidx))
    app.add_routes(routes)

    web_handler = app.make_handler()
    server = await loop.create_server(web_handler,
                                      host='0.0.0.0',
                                      port=8888,
                                      reuse_port=True)
    try:
        yield
    finally:
        # The background task is otherwise never cancelled or awaited and
        # would still be pending when the worker exits.  gather() with
        # return_exceptions=True absorbs the resulting CancelledError.
        log_task.cancel()
        await asyncio.gather(log_task, return_exceptions=True)
        server.close()
        await server.wait_closed()
        await app.shutdown()
        await web_handler.finish_connections(60.0)
        await app.cleanup()


if __name__ == '__main__':
    # Run the above server using 4 worker processes.
    # Each worker runs worker_main, which binds port 8888 with
    # reuse_port=True.
    aiotools.start_server(worker_main, num_workers=4)
Ejemplo n.º 25
0
def main(cli_ctx, config_path, debug):
    # Entry point of the storage proxy: read and validate the configuration,
    # apply environment/debug overrides, and run the server processes when
    # no subcommand is invoked.  Returns 0; validation failures abort with
    # click.Abort.  The PID file is removed once the server has returned or
    # failed.
    # (Written as comments rather than a docstring: if this function is
    # registered as a Click command -- TODO confirm -- a docstring would be
    # displayed as its help text.)

    # Determine where to read configuration.
    raw_cfg, cfg_src_path = config.read_from_file(config_path, "storage-proxy")

    # Environment variables take precedence over values from the file.
    config.override_with_env(raw_cfg, ("etcd", "namespace"),
                             "BACKEND_NAMESPACE")
    config.override_with_env(raw_cfg, ("etcd", "addr"), "BACKEND_ETCD_ADDR")
    config.override_with_env(raw_cfg, ("etcd", "user"), "BACKEND_ETCD_USER")
    config.override_with_env(raw_cfg, ("etcd", "password"),
                             "BACKEND_ETCD_PASSWORD")
    if debug:
        config.override_key(raw_cfg, ("debug", "enabled"), True)

    try:
        local_config = config.check(raw_cfg, local_config_iv)
        local_config["_src"] = cfg_src_path
    except config.ConfigurationError as e:
        print(
            "ConfigurationError: Validation of agent configuration has failed:",
            file=sys.stderr,
        )
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    if local_config["debug"]["enabled"]:
        config.override_key(local_config, ("logging", "level"), "DEBUG")
        config.override_key(local_config, ("logging", "pkg-ns", "ai.backend"),
                            "DEBUG")

    # if os.getuid() != 0:
    #     print('Storage agent can only be run as root', file=sys.stderr)
    #     raise click.Abort()

    multiprocessing.set_start_method("spawn")

    if cli_ctx.invoked_subcommand is None:
        local_config["storage-proxy"]["pid-file"].write_text(str(os.getpid()))
        # The master logger listens on a per-process IPC socket whose
        # endpoint is also stored in the logging config.
        log_sockpath = Path(
            f"/tmp/backend.ai/ipc/storage-proxy-logger-{os.getpid()}.sock")
        log_sockpath.parent.mkdir(parents=True, exist_ok=True)
        log_endpoint = f"ipc://{log_sockpath}"
        local_config["logging"]["endpoint"] = log_endpoint
        try:
            logger = Logger(
                local_config["logging"],
                is_master=True,
                log_endpoint=log_endpoint,
            )
            with logger:
                setproctitle("backend.ai: storage-proxy")
                # The message needs a placeholder for VERSION; without one
                # the argument is never rendered into the log line.
                log.info("Backend.AI Storage Proxy {0}", VERSION)
                log.info("Runtime: {0}", env_info())
                log.info("Node ID: {0}",
                         local_config["storage-proxy"]["node-id"])
                log_config = logging.getLogger("ai.backend.agent.config")
                if local_config["debug"]["enabled"]:
                    log_config.debug("debug mode enabled.")
                if "debug" in local_config and local_config["debug"]["enabled"]:
                    print("== Storage proxy configuration ==")
                    pprint(local_config)
                if local_config["storage-proxy"]["event-loop"] == "uvloop":
                    import uvloop

                    uvloop.install()
                    log.info("Using uvloop as the event loop backend")
                aiotools.start_server(
                    server_main_logwrapper,
                    num_workers=local_config["storage-proxy"]["num-proc"],
                    args=(local_config, log_endpoint),
                )
                log.info("exit.")
        finally:
            if local_config["storage-proxy"]["pid-file"].is_file():
                # check is_file() to prevent deleting /dev/null!
                local_config["storage-proxy"]["pid-file"].unlink()
    return 0
Ejemplo n.º 26
0
def test_server_worker_init_error(restore_signal, use_threading):
    """Verify server behaviour when one worker fails while initializing.

    Three workers are started; the one with ``proc_idx == 0`` raises before
    yielding.  The server is expected to stop by itself, the two healthy
    workers must run their teardown part, and the failure must show up as
    ERROR records on the ``aiotools`` logger.
    """
    start_count = mp.Value('i', 0)
    stop_count = mp.Value('i', 0)
    captured_logs = mp.Queue()

    @aiotools.actxmgr
    async def myserver(loop, proc_idx, args):
        n_started, n_terminated = args
        await asyncio.sleep(0)
        with n_started.get_lock():
            n_started.value += 1
        if proc_idx == 0:
            # Simulate a crash in the initialization part of this worker.
            raise ZeroDivisionError('oops')

        yield

        # Teardown part: must not run for the worker that errored above.
        await asyncio.sleep(0)
        with n_terminated.get_lock():
            n_terminated.value += 1

    # Route the "aiotools" logger both into the queue (so the records can be
    # inspected below) and to stderr.
    queue_handler = {
        'class': 'logging.handlers.QueueHandler',
        'queue': captured_logs,
        'level': 'DEBUG',
    }
    console_handler = {
        'class': 'logging.StreamHandler',
        'stream': 'ext://sys.stderr',
        'level': 'DEBUG',
    }
    aiotools_logger = {
        'handlers': ['q', 'console'],
        'level': 'DEBUG',
    }
    logging.config.dictConfig({
        'version': 1,
        'handlers': {
            'q': queue_handler,
            'console': console_handler,
        },
        'loggers': {
            'aiotools': aiotools_logger,
        },
    })

    # No interrupt is scheduled: the server should shut down automatically.
    aiotools.start_server(
        myserver,
        num_workers=3,
        use_threading=use_threading,
        args=(start_count, stop_count),
    )

    # reset logging
    logging.shutdown()

    # Every worker entered its init part ...
    assert start_count.value == 3
    # ... but only the non-errored workers terminated normally.
    assert stop_count.value == 2
    assert not mp.active_children()
    assert not captured_logs.empty()
    while not captured_logs.empty():
        record = captured_logs.get()
        assert record.levelname == 'ERROR'
        assert 'worker initialization' in record.message
        # exception info is logged to the console,
        # but we cannot access it here because exceptions
        # are not picklable.
        assert record.exc_info is None
Ejemplo n.º 27
0
def main(
    cli_ctx: click.Context,
    config_path: Path,
    debug: bool,
) -> int:
    """Load and validate the agent configuration, then run the agent server.

    The configuration is read from ``config_path``, overridden by the legacy
    ``BACKEND_*`` environment variables and by the ``debug`` flag, and then
    validated.  When no subcommand is being invoked, the PID file is written,
    logging is set up and a single-worker server is started; the PID file is
    removed again when the server exits.

    Returns:
        Always ``0``.

    Raises:
        click.Abort: if the configuration fails validation, the registry type
            is unsupported, the RPC listening host is a link-local or
            unspecified IP address, cgroup statistics are requested without
            running as root, or coredump storage is enabled on an unsupported
            setup.
    """
    # Determine where to read configuration.
    raw_cfg, cfg_src_path = config.read_from_file(config_path, 'agent')

    # Override the read config with environment variables (for legacy).
    # The pairs are applied in the order listed.
    legacy_env_overrides = (
        (('etcd', 'namespace'), 'BACKEND_NAMESPACE'),
        (('etcd', 'addr'), 'BACKEND_ETCD_ADDR'),
        (('etcd', 'user'), 'BACKEND_ETCD_USER'),
        (('etcd', 'password'), 'BACKEND_ETCD_PASSWORD'),
        (('agent', 'rpc-listen-addr', 'host'), 'BACKEND_AGENT_HOST_OVERRIDE'),
        (('agent', 'rpc-listen-addr', 'port'), 'BACKEND_AGENT_PORT'),
        (('agent', 'pid-file'), 'BACKEND_PID_FILE'),
        (('container', 'port-range'), 'BACKEND_CONTAINER_PORT_RANGE'),
        (('container', 'kernel-host'), 'BACKEND_KERNEL_HOST_OVERRIDE'),
        (('container', 'sandbox-type'), 'BACKEND_SANDBOX_TYPE'),
        (('container', 'scratch-root'), 'BACKEND_SCRATCH_ROOT'),
    )
    for key_path, env_name in legacy_env_overrides:
        config.override_with_env(raw_cfg, key_path, env_name)

    if debug:
        # The debug flag forces debug mode and DEBUG-level logging.
        debug_overrides = (
            (('debug', 'enabled'), True),
            (('logging', 'level'), 'DEBUG'),
            (('logging', 'pkg-ns', 'ai.backend'), 'DEBUG'),
        )
        for key_path, forced_value in debug_overrides:
            config.override_key(raw_cfg, key_path, forced_value)

    # Validate and fill configurations
    # (allow_extra will make configs to be forward-compatible)
    try:
        cfg = config.check(raw_cfg, agent_local_config_iv)
        if cfg['agent']['backend'] == AgentBackend.KUBERNETES:
            cfg = config.check(raw_cfg, k8s_extra_config_iv)
            registry_type = cfg['registry']['type']
            if registry_type == 'local':
                registry_target_config_iv = registry_local_config_iv
            elif registry_type == 'ecr':
                registry_target_config_iv = registry_ecr_config_iv
            else:
                print('Validation of agent configuration has failed: registry type {} not supported'
                    .format(registry_type), file=sys.stderr)
                raise click.Abort()

            cfg['registry'] = config.check(cfg['registry'], registry_target_config_iv)
        if cfg['agent']['backend'] == AgentBackend.DOCKER:
            config.check(raw_cfg, docker_extra_config_iv)
        if 'debug' in cfg and cfg['debug']['enabled']:
            print('== Agent configuration ==')
            pprint(cfg)
        cfg['_src'] = cfg_src_path
    except config.ConfigurationError as e:
        print('ConfigurationError: Validation of agent configuration has failed:', file=sys.stderr)
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    # Reject RPC listening hosts that are link-local or unspecified IPs.
    rpc_host = cfg['agent']['rpc-listen-addr'].host
    host_is_ip = isinstance(rpc_host, BaseIPAddress)
    if host_is_ip and (rpc_host.is_unspecified or rpc_host.is_link_local):
        print('ConfigurationError: '
              'Cannot use link-local or unspecified IP address as the RPC listening host.',
              file=sys.stderr)
        raise click.Abort()

    # The cgroup statistics mode is refused unless running as uid 0.
    if os.getuid() != 0 and cfg['container']['stats-type'] == 'cgroup':
        print('Cannot use cgroup statistics collection mode unless the agent runs as root.',
              file=sys.stderr)
        raise click.Abort()

    if cli_ctx.invoked_subcommand is not None:
        # Click is going to invoke a subcommand.
        return 0

    if cfg['debug']['coredump']['enabled']:
        if not sys.platform.startswith('linux'):
            print('ConfigurationError: '
                  'Storing container coredumps is only supported in Linux.',
                  file=sys.stderr)
            raise click.Abort()
        core_pattern = Path('/proc/sys/kernel/core_pattern').read_text().strip()
        # Anything other than a plain absolute path is rejected.
        if core_pattern.startswith('|') or not core_pattern.startswith('/'):
            print('ConfigurationError: '
                  '/proc/sys/kernel/core_pattern must be an absolute path '
                  'to enable container coredumps.',
                  file=sys.stderr)
            raise click.Abort()
        cfg['debug']['coredump']['core_path'] = Path(core_pattern).parent

    cfg['agent']['pid-file'].write_text(str(os.getpid()))
    log_sockpath = Path(f'/tmp/backend.ai/ipc/agent-logger-{os.getpid()}.sock')
    log_sockpath.parent.mkdir(parents=True, exist_ok=True)
    log_endpoint = f'ipc://{log_sockpath}'
    cfg['logging']['endpoint'] = log_endpoint
    try:
        with Logger(cfg['logging'], is_master=True, log_endpoint=log_endpoint):
            ns = cfg['etcd']['namespace']
            setproctitle(f"backend.ai: agent {ns}")
            log.info('Backend.AI Agent {0}', VERSION)
            log.info('runtime: {0}', utils.env_info())

            log_config = logging.getLogger('ai.backend.agent.config')
            if debug:
                log_config.debug('debug mode enabled.')

            if cfg['agent']['event-loop'] == 'uvloop':
                # Imported only when the uvloop event loop is configured.
                import uvloop
                uvloop.install()
                log.info('Using uvloop as the event loop backend')
            aiotools.start_server(
                server_main_logwrapper,
                num_workers=1,
                args=(cfg, log_endpoint),
            )
            log.info('exit.')
    finally:
        if cfg['agent']['pid-file'].is_file():
            # check is_file() to prevent deleting /dev/null!
            cfg['agent']['pid-file'].unlink()
    return 0
Ejemplo n.º 28
0
def main(cli_ctx, config_path, debug):
    """Load and validate the watcher configuration, then run the watcher server.

    The configuration is read from ``config_path`` (using the ``'agent'``
    config name), overridden by the ``BACKEND_*`` environment variables and
    by the ``debug`` flag, and validated against the watcher schema defined
    below.  Logging is then set up and a single-worker server is started.

    Args:
        cli_ctx: The CLI context; not used in this function body.
        config_path: Path passed to ``config.read_from_file``.
        debug: When true, forces ``debug.enabled`` to ``True`` in the config.

    Returns:
        Always ``0``.

    Raises:
        click.Abort: if the configuration fails validation.
    """

    watcher_config_iv = t.Dict({
        t.Key('watcher'): t.Dict({
            t.Key('service-addr', default=('0.0.0.0', 6009)): tx.HostPortPair,
            t.Key('ssl-enabled', default=False): t.Bool,
            t.Key('ssl-cert', default=None): t.Null | tx.Path(type='file'),
            t.Key('ssl-key', default=None): t.Null | tx.Path(type='file'),
            t.Key('target-service', default='backendai-agent.service'): t.String,
            t.Key('soft-reset-available', default=False): t.Bool,
        }).allow_extra('*'),
        t.Key('logging'): t.Any,  # checked in ai.backend.common.logging
        t.Key('debug'): t.Dict({
            t.Key('enabled', default=False): t.Bool,
        }).allow_extra('*'),
    }).merge(config.etcd_config_iv).allow_extra('*')

    raw_cfg, cfg_src_path = config.read_from_file(config_path, 'agent')

    # Override the read config with environment variables.
    config.override_with_env(raw_cfg, ('etcd', 'namespace'), 'BACKEND_NAMESPACE')
    config.override_with_env(raw_cfg, ('etcd', 'addr'), 'BACKEND_ETCD_ADDR')
    config.override_with_env(raw_cfg, ('etcd', 'user'), 'BACKEND_ETCD_USER')
    config.override_with_env(raw_cfg, ('etcd', 'password'), 'BACKEND_ETCD_PASSWORD')
    config.override_with_env(raw_cfg, ('watcher', 'service-addr', 'host'),
                             'BACKEND_WATCHER_SERVICE_IP')
    config.override_with_env(raw_cfg, ('watcher', 'service-addr', 'port'),
                             'BACKEND_WATCHER_SERVICE_PORT')
    if debug:
        config.override_key(raw_cfg, ('debug', 'enabled'), True)

    try:
        cfg = config.check(raw_cfg, watcher_config_iv)
        # Effective debug switch: set either by the CLI flag (overridden
        # above) or by the configuration file itself.
        debug_enabled = 'debug' in cfg and cfg['debug']['enabled']
        if debug_enabled:
            print('== Watcher configuration ==')
            pprint(cfg)
        cfg['_src'] = cfg_src_path
    except config.ConfigurationError as e:
        print('Validation of watcher configuration has failed:', file=sys.stderr)
        print(pformat(e.invalid_data), file=sys.stderr)
        raise click.Abort()

    # Point logging at a per-process IPC socket.
    log_sockpath = Path(f'/tmp/backend.ai/ipc/watcher-logger-{os.getpid()}.sock')
    log_sockpath.parent.mkdir(parents=True, exist_ok=True)
    log_endpoint = f'ipc://{log_sockpath}'
    cfg['logging']['endpoint'] = log_endpoint
    logger = Logger(cfg['logging'], is_master=True, log_endpoint=log_endpoint)
    # Change the filename from the logging config's file section.
    # NOTE(review): the Logger above is constructed before this rewrite; the
    # "-watcher" suffix only takes effect if Logger reads the config lazily
    # -- confirm against the Logger implementation.
    if 'file' in cfg['logging']['drivers']:
        fn = Path(cfg['logging']['file']['filename'])
        cfg['logging']['file']['filename'] = f"{fn.stem}-watcher{fn.suffix}"

    setproctitle(f"backend.ai: watcher {cfg['etcd']['namespace']}")
    with logger:
        log.info('Backend.AI Agent Watcher {0}', VERSION)
        log.info('runtime: {0}', utils.env_info())

        log_config = logging.getLogger('ai.backend.agent.config')
        # Only announce debug mode when it is actually enabled; previously
        # this message was emitted unconditionally.
        if debug_enabled:
            log_config.debug('debug mode enabled.')

        aiotools.start_server(
            watcher_server,
            num_workers=1,
            args=(cfg, ),
            # SIGALRM is accepted as a stop signal in addition to
            # SIGINT/SIGTERM.
            stop_signals={signal.SIGINT, signal.SIGTERM, signal.SIGALRM},
        )
        log.info('exit.')
    return 0
Ejemplo n.º 29
0
@aiotools.actxmgr
async def worker_main(loop, pidx, args):
    """Worker context: log every message read from the example PULL stream.

    Before the ``yield`` a ZeroMQ PULL stream is connected to
    ``ipc://example-events`` and a background task is created to read from
    it.  After the ``yield`` the stream is closed and the task is awaited.
    """
    log = get_logger('examples.zmqserver.worker', pidx)
    pull_stream = await aiozmq.create_zmq_stream(
        zmq.PULL,
        connect='ipc://example-events',
    )

    async def consume(stream):
        # Read until the stream is closed, logging each message as it arrives.
        while True:
            try:
                message = await stream.read()
            except aiozmq.ZmqStreamClosed:
                return
            log.info(message)

    consumer = loop.create_task(consume(pull_stream))
    log.info('started')

    yield

    # Closing the stream makes the pending read raise ZmqStreamClosed,
    # which ends the consumer task.
    pull_stream.close()
    await consumer
    log.info('terminated')


if __name__ == '__main__':
    # Script entry point: start the server with four workers running
    # worker_main, and pass router_main as an extra process.
    # NOTE(review): the result is bound to `server` but is not used in the
    # code visible here.
    server = aiotools.start_server(
        worker_main,
        num_workers=4,
        extra_procs=[router_main],
    )
Ejemplo n.º 30
0
 def run(self, *args, workers: int = 1, **kwargs) -> None:
     """Public run interface.

     Starts the aiotools server with ``self.run_worker`` as the worker and
     ``workers`` as the number of workers.

     NOTE(review): ``*args`` and ``**kwargs`` are accepted but not used in
     the lines visible here -- confirm whether they should be forwarded.
     """
     aiotools.start_server(self.run_worker, num_workers=workers)