Beispiel #1
0
        def _watch_version(_data, _stat, event):
            """Run the upgrade command and exit when the watched node is deleted.

            For a missing event, or an event whose type is not 'DELETED',
            nothing happens and True is returned.
            """
            node_deleted = event is not None and event.type == 'DELETED'
            if not node_deleted:
                return True

            # The version info is no longer present: run the upgrade command
            # (when one is configured) and exit so new code gets picked up.
            _LOGGER.info('Upgrade requested, running: %s', cli_cmd)

            if cli_cmd:
                try:
                    subproc.check_call(cli_cmd)
                except subprocess.CalledProcessError:
                    _LOGGER.exception('Upgrade failed.')
                    failure_note = 'Upgrade to {code!r}({digest}) failed'.format(
                        code=codepath,
                        digest=digest,
                    )
                    # Immediately trigger a watchdog timeout.
                    failed_upgrade = watchdogs.create(
                        name='version_monitor',
                        timeout='0s',
                        content=failure_note,
                    )
                    failed_upgrade.heartbeat()
                    del info['digest']

            # Logged on both the success and the failure path.
            _LOGGER.info('Upgrade complete.')
            utils.sys_exit(0)

            return True
Beispiel #2
0
    def on_deleted(path):
        """Handle removal of a network rule file at ``path``.

        The matching iptables rule is deleted. For passthrough rules the
        per-source-IP counter in ``passthrough`` is decremented, and the IP is
        dropped from the passthrough set once its last rule goes away.
        """
        # Edge case: the directory holding the rules itself was removed.
        if path == rulemgr.path:
            _LOGGER.critical('Network rules directory was removed: %r',
                             path)
            utils.sys_exit(1)

        # The file name encodes the rule.
        rule_file = os.path.basename(path)
        _LOGGER.info('Removing %r', rule_file)

        parsed = rulemgr.get_rule(rule_file)
        if parsed is None:
            _LOGGER.warning('Ignoring unparseable file %r', rule_file)
            return

        chain, rule = parsed
        iptables.delete_rule(rule, chain=chain)

        if not isinstance(rule, fw.PassThroughRule):
            return

        if passthrough[rule.src_ip] == 1:
            # Last rule for this source IP: remove it from the passthrough set.
            passthrough.pop(rule.src_ip)
            _LOGGER.info('Removing passthrough %r', rule.src_ip)
            iptables.rm_ip_set(iptables.SET_PASSTHROUGHS, rule.src_ip)
            iptables.flush_pt_conntrack_table(rule.src_ip)
        else:
            passthrough[rule.src_ip] -= 1
Beispiel #3
0
 def _wrap(*args, **kwargs):
     """Call ``func`` and exit the process if it raises any ``Exception``.

     On success the wrapped function's return value is passed through.
     """
     try:
         outcome = func(*args, **kwargs)
     except Exception:  # pylint: disable=W0703
         _LOGGER.exception('Unhandled exception - exiting.')
         utils.sys_exit(-1)
     else:
         return outcome
Beispiel #4
0
def _stop_on_lost(tm_env, state):
    """Stop the node when the ZK connection state becomes LOST.

    Any other state is only logged at debug level.
    """
    _LOGGER.debug('ZK connection state: %s', state)

    connection_lost = state == zkutils.states.KazooState.LOST
    if not connection_lost:
        return

    _LOGGER.info('ZK connection lost, stopping node.')
    _LOGGER.info('Terminating svscan in %s', tm_env.init_dir)
    quit_action = supervisor.SvscanControlAction.quit
    supervisor.control_svscan(tm_env.init_dir, quit_action)

    # server_init should be terminated at this point but exit just in case.
    utils.sys_exit(-1)
Beispiel #5
0
    def run(self, name, image, entrypoint, cmd, **args):
        """Pull ``image``, run it as container ``name`` and mirror its exit.

        Container output is copied to stderr while the container is running.
        Afterwards the current process either kills itself with the signal
        decoded from the container's status code, or exits with the decoded
        exit status.

        :raises exc.ContainerSetupError: when ``_pull_image`` raises
            ``docker.errors.ImageNotFound``.
        """
        docker_client = self._get_client()

        if 'volumes' in args:
            args['volumes'] = _transform_volumes(args['volumes'])
        if 'envdirs' in args:
            args['environment'] = _read_environ(args.pop('envdirs'))

        # Simulate docker pull logic: no tag provided means ``latest``.
        if ':' not in image:
            image += ':latest'

        try:
            image_meta = _pull_image(docker_client, image)
        except docker.errors.ImageNotFound:
            message = 'Fail to pull {}, check image name or disk size'.format(
                image
            )
            raise exc.ContainerSetupError(message,
                                          app_abort.AbortedReason.IMAGE)

        container = _create_container(
            docker_client, name, image, image_meta, entrypoint, cmd, **args
        )

        # TODO: start docker container event
        container.start()
        container.reload()

        log_stream = container.logs(
            stdout=True, stderr=True, stream=True, follow=True
        )

        _LOGGER.info('Container %s is running', name)
        while container.status == 'running':
            try:
                for chunk in log_stream:
                    sys.stderr.write(chunk)
            except socket.error:
                # Socket errors while streaming are ignored; the container
                # status is re-checked below.
                pass

            container.reload()

        # container.wait returns dict with key 'StatusCode'
        status = container.wait()['StatusCode']
        # NOTE(review): the status is decoded with the os.W* helpers as if it
        # were a wait()-style status -- confirm docker reports it that way.
        if not os.WIFSIGNALED(status):
            utils.sys_exit(os.WEXITSTATUS(status))
        else:
            # Process died with a signal in docker; die the same way.
            signum = os.WTERMSIG(status)
            os.kill(os.getpid(), signum)
Beispiel #6
0
    def run(self, name, image, entrypoint, cmd, **args):
        """Fetch ``image``, run it as container ``name`` and mirror its exit.

        Container output is decoded and echoed to stderr while the container
        is running. Afterwards the current process either kills itself with
        the signal decoded from the container's status code, or exits with the
        decoded exit status.

        ``args`` must contain a ``ulimit`` entry; ``volumes`` and ``envdirs``
        are optional.
        """
        docker_client = self._get_client()

        if 'volumes' in args:
            args['volumes'] = _transform_volumes(args['volumes'])
        if 'envdirs' in args:
            args['environment'] = _read_environ(args.pop('envdirs'))

        ulimit = _get_ulimits(args.pop('ulimit'))
        image_meta = _fetch_image(docker_client, image)

        container = _create_container(
            docker_client, name, image_meta, entrypoint, cmd, ulimit, **args
        )

        # TODO: start docker container event
        container.start()
        container.reload()

        log_stream = container.logs(
            stdout=True, stderr=True, stream=True, follow=True
        )

        _LOGGER.info('Container %s is running', name)
        while container.status == 'running':
            try:
                for chunk in log_stream:
                    print(chunk.decode(), file=sys.stderr, end='', flush=True)
            except socket.error:
                # Socket errors while streaming are ignored; the container
                # status is re-checked below.
                pass

            container.reload()

        # container.wait returns dict with key 'StatusCode'
        status = container.wait()['StatusCode']
        # NOTE(review): the status is decoded with the os.W* helpers as if it
        # were a wait()-style status -- confirm docker reports it that way.
        if not os.WIFSIGNALED(status):
            utils.sys_exit(os.WEXITSTATUS(status))
        else:
            # Process died with a signal in docker; die the same way.
            signum = os.WTERMSIG(status)
            os.kill(os.getpid(), signum)
Beispiel #7
0
    def run(self, name, image, entrypoint, cmd, **args):
        """Create and run container ``name`` from ``image``, mirroring its exit.

        Container output is copied to stderr while the container is running.
        Afterwards the current process either kills itself with the signal
        decoded from the container's wait result, or exits with the decoded
        exit status.

        :raises exc.ContainerSetupError: when container setup raises
            ``docker.errors.ImageNotFound``.
        """
        docker_client = self._get_client()
        try:
            if 'volumes' in args:
                args['volumes'] = _transform_volumes(args['volumes'])
            if 'envdirs' in args:
                args['environment'] = _read_environ(args.pop('envdirs'))

            container = _create_container(
                docker_client, name, image, entrypoint, cmd, **args
            )
        except docker.errors.ImageNotFound:
            message = 'Image {0} was not found'.format(image)
            raise exc.ContainerSetupError(message,
                                          app_abort.AbortedReason.IMAGE)

        container.start()
        container.reload()

        log_stream = container.logs(
            stdout=True, stderr=True, stream=True, follow=True
        )

        _LOGGER.info('Container %s is running', name)
        while container.status == 'running':
            try:
                for chunk in log_stream:
                    sys.stderr.write(chunk)
            except socket.error:
                # Socket errors while streaming are ignored; the container
                # status is re-checked below.
                pass

            container.reload()

        status = container.wait()
        # NOTE(review): the wait result is decoded with the os.W* helpers as
        # if it were a wait()-style status -- confirm docker reports it so.
        if not os.WIFSIGNALED(status):
            utils.sys_exit(os.WEXITSTATUS(status))
        else:
            # Process died with a signal in docker; die the same way.
            signum = os.WTERMSIG(status)
            os.kill(os.getpid(), signum)
Beispiel #8
0
    def top(ctx, exit_on_fail, zkid, notification_fd, approot, runtime):
        """Run treadmill init process.

        Connects to Zookeeper, waits until the cell namespace and this
        server's node exist, then either starts the node and runs the main
        loop or, if the server is already blacked out, goes straight to the
        shutdown phase. Shutdown removes the presence node, closes the
        Zookeeper client, cleans up the network and terminates running apps.

        :param ctx: context object; ``ctx.obj['ROOT_CGROUP']`` is read.
        :param exit_on_fail: if true, exit with status -1 after shutdown;
            otherwise sleep forever.
        :param zkid: passed to ``zkutils.connect`` as ``idpath``.
        :param notification_fd: passed to ``utils.report_ready`` once the
            node has been started.
        :param approot: root used to build the ``AppEnvironment``; also
            passed to ``postmortem.run``.
        :param runtime: passed through to ``_node_start``.
        """
        _LOGGER.info('Initializing Treadmill: %s (%s)', approot, runtime)

        tm_env = appenv.AppEnvironment(approot)
        # Bind tm_env so the listener only needs the connection state; the
        # same object is passed to remove_listener() during shutdown.
        stop_on_lost = functools.partial(_stop_on_lost, tm_env)
        zkclient = zkutils.connect(context.GLOBAL.zk.url,
                                   idpath=zkid,
                                   listener=stop_on_lost)

        # Poll every 30 seconds until the server presence namespace exists.
        while not zkclient.exists(z.SERVER_PRESENCE):
            _LOGGER.warning('namespace not ready.')
            time.sleep(30)

        hostname = sysinfo.hostname()

        zk_blackout_path = z.path.blackedout_server(hostname)
        zk_server_path = z.path.server(hostname)
        zk_presence_path = z.path.server_presence(hostname)

        # Poll every 30 seconds until this server's node exists.
        while not zkclient.exists(zk_server_path):
            _LOGGER.warning('server %s not defined in the cell.', hostname)
            time.sleep(30)

        _LOGGER.info('Checking blackout list.')
        blacklisted = bool(zkclient.exists(zk_blackout_path))

        root_cgroup = ctx.obj['ROOT_CGROUP']
        os_args = {}
        # The cgroup prefix is only passed on posix systems.
        if os.name == 'posix':
            os_args['cgroup_prefix'] = root_cgroup

        if not blacklisted:
            # Node startup.
            _node_start(tm_env, runtime, zkclient, hostname, zk_server_path,
                        zk_presence_path, os_args)

            utils.report_ready(notification_fd)

            _init_network()

            _start_init1(tm_env)
            _LOGGER.info('Ready.')

            # Returns once the node is going down; a non-None result is the
            # reason, which is stored as the blackout node's data below.
            down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

            if down_reason is not None:
                _LOGGER.warning('Shutting down: %s', down_reason)
                # Blackout the server.
                zkutils.ensure_exists(
                    zkclient,
                    zk_blackout_path,
                    acl=[zkclient.make_host_acl(hostname, 'rwcda')],
                    data=down_reason)
                trigger_postmortem = True
            else:
                # Blacked out manually
                trigger_postmortem = bool(zkclient.exists(zk_blackout_path))

            if trigger_postmortem:
                postmortem.run(approot, root_cgroup)

        else:
            # Node was already blacked out.
            _LOGGER.warning('Shutting down blacked out node.')

        # This is the shutdown phase.

        # Delete the node
        if zk_presence_path:
            zkutils.ensure_deleted(zkclient, zk_presence_path)
        # Detach the listener before stopping the client.
        zkclient.remove_listener(stop_on_lost)
        zkclient.stop()
        zkclient.close()

        _cleanup_network()

        # to terminate all the running apps
        _blackout_terminate(tm_env)

        if exit_on_fail:
            utils.sys_exit(-1)
        else:
            # Sit forever in a broken state
            while True:
                time.sleep(1000000)
Beispiel #9
0
def exit_on_disconnect(state):
    """Exit the process on any ZK connection state other than CONNECTED."""
    _LOGGER.debug('ZK connection state: %s', state)

    disconnected = state != states.KazooState.CONNECTED
    if not disconnected:
        return

    _LOGGER.info('Exiting on ZK connection lost.')
    utils.sys_exit(-1)
Beispiel #10
0
    def top(exit_on_fail, zkid, approot):
        """Run treadmill init process.

        Connects to Zookeeper, waits until the cell namespace and this
        server's node exist, then either starts the node and runs the main
        loop or, if the server is already blacked out, goes straight to the
        shutdown phase. Shutdown removes the presence node, closes the
        Zookeeper client, cleans up the network and terminates running apps.

        :param exit_on_fail: if true, exit with status -1 after shutdown;
            otherwise sleep forever.
        :param zkid: passed to ``zkutils.connect`` as ``idpath``.
        :param approot: root used to build the ``AppEnvironment``.
        """
        _LOGGER.info('Initializing Treadmill: %s', approot)

        tm_env = appenv.AppEnvironment(approot)
        zkclient = zkutils.connect(context.GLOBAL.zk.url,
                                   idpath=zkid,
                                   listener=_exit_clear_watchdog_on_lost)

        utils.report_ready()

        # Poll every 30 seconds until the server presence namespace exists.
        # ``Logger.warn`` is a deprecated alias, so ``warning`` is used here
        # as in the rest of this function.
        while not zkclient.exists(z.SERVER_PRESENCE):
            _LOGGER.warning('namespace not ready.')
            time.sleep(30)

        hostname = sysinfo.hostname()

        zk_blackout_path = z.path.blackedout_server(hostname)
        zk_presence_path = z.path.server_presence(hostname)
        zk_server_path = z.path.server(hostname)

        # Poll every 30 seconds until this server's node exists.
        while not zkclient.exists(zk_server_path):
            _LOGGER.warning('server %s not defined in the cell.', hostname)
            time.sleep(30)

        _LOGGER.info('Checking blackout list.')
        blacklisted = bool(zkclient.exists(zk_blackout_path))

        if not blacklisted:
            # Node startup.
            _node_start(tm_env, zkclient, hostname, zk_server_path,
                        zk_presence_path)

            # Cleanup the watchdog directory
            tm_env.watchdogs.initialize()

            _init_network()

            _LOGGER.info('Ready.')

            # Returns once the node is going down; a non-None result is the
            # reason, which is stored as the blackout node's data below.
            down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

            if down_reason is not None:
                _LOGGER.warning('Shutting down: %s', down_reason)

                # Blackout the server.
                zkutils.ensure_exists(
                    zkclient,
                    zk_blackout_path,
                    acl=[zkutils.make_host_acl(hostname, 'rwcda')],
                    data=down_reason)

        else:
            # Node was already blacked out.
            _LOGGER.warning('Shutting down blacked out node.')

        # This is the shutdown phase.

        # Delete the node
        zkutils.ensure_deleted(zkclient, zk_presence_path)
        # Detach the listener before stopping the client.
        zkclient.remove_listener(_exit_clear_watchdog_on_lost)
        zkclient.stop()
        zkclient.close()

        _cleanup_network()

        # to terminate all the running apps
        _blackout_terminate(tm_env)

        if exit_on_fail:
            utils.sys_exit(-1)
        else:
            # Sit forever in a broken state
            while True:
                time.sleep(1000000)
Beispiel #11
0
def _exit_clear_watchdog_on_lost(state):
    """Exit the process when the ZK connection state becomes LOST.

    Any other state is only logged at debug level.
    """
    _LOGGER.debug('ZK connection state: %s', state)

    connection_lost = state == zkutils.states.KazooState.LOST
    if not connection_lost:
        return

    _LOGGER.info('Exiting on ZK connection lost.')
    utils.sys_exit(-1)