Example #1
    def boot(self, lifecycle, model=Reactive, local=0):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
        assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - start logging to /var/log/ochopod.log
        #
        logger.info('EC2 marathon bindings started')
        web = Flask(__name__)

        #
        # - default presets in case we run outside of marathon (local vm testing)
        # - any environment variable prefixed with "ochopod_" is of interest for us (e.g. this is what the user
        #   puts in the marathon application configuration)
        # - the other settings come from marathon (namely the port bindings & application/task identifiers)
        # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
        #
        env = \
            {
                'ochopod_application': '',
                'ochopod_cluster': 'default',
                'ochopod_debug': 'true',
                'ochopod_local': 'false',
                'ochopod_namespace': 'marathon',
                'ochopod_port': '8080',
                'ochopod_start': 'true',
                'ochopod_task': '',
                'PORT_8080': '8080'
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables (which are set by the marathon executor)
            # - extract the mesos PORT_* bindings and construct a small remapping dict
            #
            ports = {}
            logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            for key, val in env.items():
                if key.startswith('PORT_'):
                    ports[key[5:]] = int(val)

            #
            # - keep any "ochopod_" environment variable & trim its prefix
            # - default all our settings, especially the mandatory ones
            # - the ip and zookeeper are defaulted to localhost to enable easy testing
            #
            hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info('running in local mode (make sure you run a standalone zookeeper)')
                hints.update(
                    {
                        'fwk': 'marathon-ec2',
                        'ip': '127.0.0.1',
                        'node': 'local',
                        'ports': ports,
                        'public': '127.0.0.1',
                        'zk': '127.0.0.1:2181'
                    })
            else:

                #
                # - we are presumably deployed on EC2
                # - get our underlying metadata using curl
                #
                def _peek(token, strict=True):
                    code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                    assert not strict or code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                    return lines[0]

                #
                # - get our local and public IPV4 addresses
                # - the "node" will show up as the EC2 instance ID
                # - note we allow the public IPv4 lookup to fail (in case we run in a VPC)
                #
                hints.update(
                    {
                        'application': env['MARATHON_APP_ID'][1:],
                        'fwk': 'marathon-ec2',
                        'ip': _peek('local-ipv4'),
                        'node': _peek('instance-id'),
                        'ports': ports,
                        'public': _peek('public-ipv4', strict=False),
                        'task': env['MESOS_TASK_ID'],
                        'zk': ''
                    })

                def _install_from_package():

                    #
                    # - a regular package install will write the slave settings under /etc/mesos/zk
                    # - the snippet in there looks like zk://10.0.0.56:2181/mesos
                    #
                    code, lines = shell("cat /etc/mesos/zk")
                    assert code == 0 and lines[0], 'unable to retrieve the zk connection string'
                    return lines[0][5:].split('/')[0]

                def _dcos_deployment():

                    #
                    # - a DCOS slave is setup slightly differently with the settings being environment
                    #   variables set in /opt/mesosphere/etc/mesos-slave
                    # - the snippet in there is prefixed by MESOS_MASTER= and uses an alias
                    # - it looks like MESOS_MASTER=zk://leader.mesos:2181/mesos
                    #
                    code, lines = shell("grep MASTER /opt/mesosphere/etc/mesos-slave")
                    assert code == 0 and lines[0], 'unable to retrieve the zk connection string'
                    return lines[0][18:].split('/')[0]

                #
                # - depending on how the slave has been installed we might have to look in various places
                #   to find out what our zookeeper connection string is
                # - warning, a URL-like format such as zk://<ip:port>,..,<ip:port>/mesos is used
                # - just keep the ip & port part and discard the rest
                #
                for method in [_install_from_package, _dcos_deployment]:
                    try:
                        hints['zk'] = method()
                        break

                    except Exception:
                        pass

                assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus setup ?)'

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (marathon/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            hints['metrics'] = {}
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(
                hints['zk'].split(','),
                hints['namespace'],
                hints['cluster'],
                int(hints['port']),
                breadcrumbs,
                model,
                hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():
                coordinator.tell({'request': 'reset'})
                return '{}', 200

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            #
            @web.route('/info', methods=['POST'])
            def _info():
                keys = \
                    [
                        'application',
                        'ip',
                        'metrics',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys, hints.iteritems()))
                return json.dumps(subset), 200

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):
                try:

                    ts = time.time()
                    logger.debug('http in -> /control/%s' % task)
                    latch = ThreadingFuture()
                    executor.tell({'request': task, 'latch': latch, 'data': request.data})
                    js, code = latch.get(timeout=int(timeout))
                    ms = time.time() - ts
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code

                except Timeout:

                    #
                    # - we failed to match the specified timeout
                    # - gracefully fail on a HTTP 408
                    #
                    return '{}', 408

                except ActorDeadError:

                    #
                    # - the executor has been shutdown (probably after a /control/kill)
                    # - gracefully fail on a HTTP 410
                    #
                    return '{}', 410

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():
                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200

            class _Runner(threading.Thread):
                """
                Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
                using a dedicated HTTP POST.
                """

                def run(self):
                    web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                while 1:
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block first shutdown our executor (which may already be down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))
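
For context, these bindings are normally driven from a small pod script that picks a life-cycle implementation and hands it to boot(). A minimal sketch is shown below; the import paths and the Piped base class are assumptions based on the ochopod layout these examples come from and may vary between releases:

    from ochopod.bindings.ec2.marathon import Pod
    from ochopod.models.piped import Actor as Piped


    class Strategy(Piped):

        #
        # - hypothetical life-cycle: run a redis server as the pod sub-process
        # - configure() hands back the command line plus an optional environment dict
        #
        def configure(self, cluster):
            return 'redis-server', {}


    if __name__ == '__main__':

        #
        # - hand the life-cycle over to the EC2 marathon bindings shown above
        #
        Pod().boot(Strategy)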
Example #2
    def boot(self, lifecycle, model=Reactive, tools=None, local=False):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
        assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - instantiate our flask endpoint
        # - default to a json handler for all HTTP errors (including an unexpected 500)
        #
        def _handler(error):
            http = error.code if isinstance(error, HTTPException) else 500
            return '{}', http, {
                'Content-Type': 'application/json; charset=utf-8'
            }

        web = Flask(__name__)
        for code in default_exceptions.iterkeys():
            web.error_handler_spec[None][code] = _handler

        #
        # - default presets in case we run outside of marathon (local vm testing)
        # - any environment variable prefixed with "ochopod_" is of interest for us (e.g. this is what the user
        #   puts in the marathon application configuration)
        # - the other settings come from marathon (namely the port bindings & application/task identifiers)
        # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
        #
        env = \
            {
                'ochopod_application':  '',
                'ochopod_cluster':      'default',
                'ochopod_debug':        'true',
                'ochopod_local':        'false',
                'ochopod_namespace':    'marathon',
                'ochopod_port':         '8080',
                'ochopod_start':        'true',
                'ochopod_task':         '',
                'ochopod_zk':           '',
                'PORT_8080':            '8080'
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables (which are set by the marathon executor)
            # - extract the mesos PORT_* bindings and construct a small remapping dict
            #
            ports = {}
            logger.debug(
                'environment ->\n%s' %
                '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            for key, val in env.items():
                if key.startswith('PORT_'):
                    ports[key[5:]] = int(val)

            #
            # - keep any "ochopod_" environment variable & trim its prefix
            # - default all our settings, especially the mandatory ones
            # - the ip and zookeeper are defaulted to localhost to enable easy testing
            #
            hints = {
                k[8:]: v
                for k, v in env.items() if k.startswith('ochopod_')
            }
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info(
                    'running in local mode (make sure you run a standalone zookeeper)'
                )
                hints.update({
                    'fwk': 'marathon (debug)',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
            else:

                #
                # - extend our hints
                # - add the application + task
                #
                hints.update({
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon',
                    'ip': '',
                    'node': '',
                    'ports': ports,
                    'public': '',
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

                #
                # - use whatever subclass is implementing us to infer 'ip', 'node' and 'public'
                #
                hints.update(self.get_node_details())

                #
                # - look up the zookeeper connection string from an environment variable or on disk
                # - we have to look into different places depending on how mesos was installed
                #
                def _1():

                    #
                    # - most recent DCOS release
                    # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave-common
                    # - the snippet in there is prefixed by MESOS_MASTER=zk://<ip:port>/mesos
                    #
                    logger.debug(
                        'checking /opt/mesosphere/etc/mesos-slave-common...')
                    _, lines = shell(
                        "grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave-common"
                    )
                    return lines[0][13:]

                def _2():

                    #
                    # - same as above except for slightly older DCOS releases
                    # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave
                    #
                    logger.debug('checking /opt/mesosphere/etc/mesos-slave...')
                    _, lines = shell(
                        "grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave")
                    return lines[0][13:]

                def _3():

                    #
                    # - a regular package install will write the slave settings under /etc/mesos/zk (the snippet in
                    #   there looks like zk://10.0.0.56:2181/mesos)
                    #
                    logger.debug('checking /etc/mesos/zk...')
                    _, lines = shell("cat /etc/mesos/zk")
                    return lines[0]

                def _4():

                    #
                    # - look for ZK from environment variables
                    # - user can pass down ZK using $ochopod_zk
                    # - this last-resort situation is used mostly for debugging
                    #
                    logger.debug(
                        'checking $ochopod_zk environment variable...')
                    return env['ochopod_zk']

                #
                # - depending on how the slave has been installed we might have to look in various places
                #   to find out what our zookeeper connection string is
                # - use urlparse to keep the host:port part of the URL (possibly including a login+password)
                #
                for method in [_1, _2, _3, _4]:
                    try:
                        hints['zk'] = urlparse(method()).netloc
                        break

                    except Exception:
                        pass

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus mesos setup ?)'
            assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

            #
            # - load the tools
            #
            if tools:
                tools = {
                    tool.tag: tool
                    for tool in
                    [clz() for clz in tools if issubclass(clz, Tool)]
                    if tool.tag
                }
                logger.info('supporting tools %s' % ', '.join(tools.keys()))

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (marathon) @ %s' %
                        (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            hints['metrics'] = {}
            hints['dependencies'] = model.depends_on
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(hints['zk'].split(','),
                                            hints['namespace'],
                                            hints['cluster'],
                                            int(hints['port']), breadcrumbs,
                                            model, hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():

                logger.debug('http in -> /reset')
                coordinator.tell({'request': 'reset'})
                return '{}', 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            # - from @pferro -> the pod's dependencies defined in the model are now added as well
            #
            @web.route('/info', methods=['POST'])
            def _info():

                logger.debug('http in -> /info')
                keys = \
                    [
                        'application',
                        'dependencies',
                        'ip',
                        'metrics',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys,
                                     hints.iteritems()))
                return json.dumps(subset), 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():

                logger.debug('http in -> /log')
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

            #
            # - RPC call to run a custom tool within the pod
            #
            @web.route('/exec', methods=['POST'])
            def _exec():

                logger.debug('http in -> /exec')

                #
                # - make sure the command (first token in the X-Shell header) maps to a tool
                # - if no match abort on a 404
                #
                line = request.headers['X-Shell']
                tokens = line.split(' ')
                cmd = tokens[0]
                if not tools or cmd not in tools:
                    return '{}', 404, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                code = 1
                tool = tools[cmd]

                #
                # - make sure the parser does not sys.exit()
                #
                class _Parser(ArgumentParser):
                    def exit(self, status=0, message=None):
                        raise ValueError(message)

                #
                # - prep a temporary directory
                # - invoke define_cmdline_parsing()
                # - switch off parsing if NotImplementedError is raised
                #
                use_parser = 1
                parser = _Parser(prog=tool.tag)
                try:
                    tool.define_cmdline_parsing(parser)

                except NotImplementedError:
                    use_parser = 0

                tmp = tempfile.mkdtemp()
                try:

                    #
                    # - parse the command line
                    # - upload any attachment
                    #
                    args = parser.parse_args(
                        tokens[1:]) if use_parser else ' '.join(tokens[1:])
                    for tag, upload in request.files.items():
                        where = path.join(tmp, tag)
                        logger.debug('uploading %s @ %s' % (tag, tmp))
                        upload.save(where)

                    #
                    # - run the tool method
                    # - pass the temporary directory as well
                    #
                    logger.info('invoking "%s"' % line)
                    code, lines = tool.body(args, tmp)

                except ValueError as failure:

                    lines = [
                        parser.format_help()
                        if failure.message is None else failure.message
                    ]

                except Exception as failure:

                    lines = ['unexpected failure -> %s' % failure]

                finally:

                    #
                    # - make sure to cleanup our temporary directory
                    #
                    shutil.rmtree(tmp)

                out = \
                    {
                        'code': code,
                        'stdout': lines
                    }

                return json.dumps(out), 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):

                logger.debug('http in -> /control/%s' % task)
                if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                    #
                    # - fail on a HTTP 400 if the request is not supported
                    #
                    return '{}', 400, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                try:

                    ts = time.time()
                    latch = ThreadingFuture()
                    executor.tell({
                        'request': task,
                        'latch': latch,
                        'data': request.data
                    })
                    js, code = latch.get(timeout=int(timeout))
                    ms = time.time() - ts
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                except Timeout:

                    #
                    # - we failed to match the specified timeout
                    # - gracefully fail on a HTTP 408
                    #
                    return '{}', 408, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                except ActorDeadError:

                    #
                    # - the executor has been shutdown (probably after a /control/kill)
                    # - gracefully fail on a HTTP 410
                    #
                    return '{}', 410, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():

                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - run werkzeug from a separate thread to avoid blocking the main one
            # - we'll have to shut it down using a dedicated HTTP POST
            #
            class _Runner(threading.Thread):
                def run(self):
                    web.run(host='0.0.0.0',
                            port=int(hints['port']),
                            threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')
                while 1:

                    #
                    # - simply idle forever (since the framework would restart any container that terminates)
                    # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                    # - any control request will now fail
                    #
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block first shutdown our executor (which may already be down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))
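
Since every endpoint above is a plain HTTP POST handler, a running pod can be exercised directly. Below is a hedged sketch using the requests library (the bindings already rely on it for the /terminate call), assuming the default ochopod_port of 8080 and a pod on localhost:

    import requests

    #
    # - /info returns the json subset of the hints (ip, ports, state and so on)
    #
    reply = requests.post('http://127.0.0.1:8080/info')
    print(reply.json())

    #
    # - issue a control request with an explicit 30 second ack timeout
    # - 'check' is one of the whitelisted tasks, anything else fails on a HTTP 400
    # - a HTTP 408 means the executor did not acknowledge in time, a HTTP 410 means the pod is dead
    #
    reply = requests.post('http://127.0.0.1:8080/control/check/30')
    print(reply.status_code)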
Example #3
    def boot(self, lifecycle, model=Reactive, local=0):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
        assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - start logging to /var/log/ochopod.log
        #
        logger.info('EC2 kubernetes bindings started')
        web = Flask(__name__)

        #
        # - default presets in case we run outside of marathon (local vm testing)
        # - any environment variable prefixed with "ochopod_" is of interest for us (e.g. this is what the user
        #   puts in the pod configuration yaml/json)
        #
        env = \
            {
                'ochopod_application': '',
                'ochopod_cluster': '',
                'ochopod_debug': 'true',
                'ochopod_local': 'false',
                'ochopod_namespace': 'default',
                'ochopod_port': '8080',
                'ochopod_start': 'true',
                'ochopod_task': ''
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables
            # - isolate the ones prefixed with ochopod_
            #
            logger.debug(
                'environment ->\n%s' %
                '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            hints = {
                k[8:]: v
                for k, v in env.items() if k.startswith('ochopod_')
            }
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info(
                    'running in local mode (make sure you run a standalone zookeeper)'
                )
                hints.update({
                    'fwk': 'kubernetes',
                    'ip': '127.0.0.1',
                    'node': 'localhost',
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
            else:

                #
                # - we are presumably deployed on EC2
                # - we'll retrieve the underlying metadata using curl
                #
                def _aws(token):
                    code, lines = shell(
                        'curl -f http://169.254.169.254/latest/meta-data/%s' %
                        token)
                    assert code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                    return lines[0]

                #
                # - lame workaround to fetch the master IP and credentials as there does not seem to be a way to
                #   use 10.0.0.2 from within the pod yet (or I'm too stupid to find out)
                # - curl to the master to retrieve info about our cluster
                # - don't forget to merge the resulting output
                #
                def _k8s(token):
                    code, lines = shell(
                        'curl -f -u %s:%s -k https://%s/api/v1beta3/namespaces/default/%s'
                        % (env['KUBERNETES_USER'], env['KUBERNETES_PWD'],
                           env['KUBERNETES_MASTER'], token))
                    assert code == 0, 'unable to look the RO service up (is the master running ?)'
                    return json.loads(''.join(lines))

                #
                # - look our local k8s pod up
                # - get our container ip
                # - extract the port bindings
                # - keep any "ochopod_" environment variable & trim its prefix
                #
                @retry(timeout=60, pause=1)
                def _spin():

                    #
                    # - wait until the k8s pod is running and publishing its IP
                    #
                    cfg = _k8s('pods/%s' % env['HOSTNAME'])
                    assert 'podIP' in cfg['status'], 'pod not ready yet -> %s' % cfg['status']['phase']
                    return cfg

                this_pod = _spin()
                hints['ip'] = this_pod['status']['podIP']

                #
                # - revert to the k8s pod name if no cluster is specified
                #
                if not hints['cluster']:
                    hints['cluster'] = this_pod['metadata']['name']

                #
                # - consider the 1st pod container
                # - grab the exposed ports (no remapping required)
                #
                ports = {}
                container = this_pod['spec']['containers'][0]
                for binding in container['ports']:
                    port = binding['containerPort']
                    ports[str(port)] = port

                #
                # - set 'task' to $HOSTNAME (the container is named after the k8s pod)
                # - get our public IPV4 address
                # - the "node" will show up as the EC2 instance ID
                #
                hints.update({
                    'fwk': 'k8s-ec2',
                    'node': _aws('instance-id'),
                    'ports': ports,
                    'public': _aws('public-ipv4'),
                    'task': env['HOSTNAME']
                })

                #
                # - look the k8s "ocho-proxy" pod up
                # - by design it should be running our synchronization zookeeper
                #
                proxy = _k8s('pods/ocho-proxy')
                assert 'podIP' in proxy['status'], 'proxy not ready ?'
                hints['zk'] = proxy['status']['podIP']

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['namespace'], 'no namespace defined (user error ?)'

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (kubernetes/ec2) @ %s' %
                        (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(hints['zk'].split(','),
                                            hints['namespace'],
                                            hints['cluster'],
                                            int(hints['port']), breadcrumbs,
                                            model, hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():
                coordinator.tell({'request': 'reset'})
                return '{}', 200

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            #
            @web.route('/info', methods=['POST'])
            def _info():
                keys = \
                    [
                        'application',
                        'ip',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys,
                                     hints.iteritems()))
                return json.dumps(subset), 200

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):
                try:

                    ts = time.time()
                    logger.debug('http in -> /control/%s' % task)
                    latch = ThreadingFuture()
                    executor.tell({
                        'request': task,
                        'latch': latch,
                        'data': request.data
                    })
                    js, code = latch.get(timeout=int(timeout))
                    ms = time.time() - ts
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code

                except Timeout:

                    #
                    # - we failed to match the specified timeout
                    # - gracefully fail on a HTTP 408
                    #
                    return '{}', 408

                except ActorDeadError:

                    #
                    # - the executor has been shutdown (probably after a /control/kill)
                    # - gracefully fail on a HTTP 410
                    #
                    return '{}', 410

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():
                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200

            class _Runner(threading.Thread):
                """
                Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
                using a dedicated HTTP POST.
                """
                def run(self):
                    web.run(host='0.0.0.0',
                            port=int(hints['port']),
                            threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                while 1:
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block first shutdown our executor (which may already be down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))

        exit(1)
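
The _aws() helper above shells out to curl against the standard EC2 instance-metadata service on 169.254.169.254. For reference, the same lookup can be done without forking a sub-process; here is a small sketch in plain Python 2 (urllib2), mirroring the strict/non-strict contract of _peek() from Example #1:

    import urllib2

    def peek(token, strict=True):

        #
        # - fetch one token from the EC2 instance-metadata service
        # - non-strict lookups (e.g the public-ipv4 of a VPC-only instance) return None on failure
        #
        try:
            url = 'http://169.254.169.254/latest/meta-data/%s' % token
            return urllib2.urlopen(url, timeout=2).read().strip()

        except urllib2.URLError:
            assert not strict, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
            return None

    print(peek('instance-id'))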
Example #4
    def boot(self, lifecycle, model=Reactive, tools=None, local=False):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
        assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - instantiate our flask endpoint
        # - default to a json handler for all HTTP errors (including an unexpected 500)
        #
        def _handler(error):
            http = error.code if isinstance(error, HTTPException) else 500
            return '{}', http, {'Content-Type': 'application/json; charset=utf-8'}

        web = Flask(__name__)
        for code in default_exceptions.iterkeys():
            web.error_handler_spec[None][code] = _handler

        #
        # - default presets in case we run outside of marathon (local vm testing)
        # - any environment variable prefixed with "ochopod_" is of interest for us (e.g. this is what the user
        #   puts in the marathon application configuration)
        # - the other settings come from marathon (namely the port bindings & application/task identifiers)
        # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
        #
        env = \
            {
                'ochopod_application':  '',
                'ochopod_cluster':      'default',
                'ochopod_debug':        'true',
                'ochopod_local':        'false',
                'ochopod_namespace':    'marathon',
                'ochopod_port':         '8080',
                'ochopod_start':        'true',
                'ochopod_task':         '',
                'ochopod_zk':           '',
                'PORT_8080':            '8080'
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables (which are set by the marathon executor)
            # - extract the mesos PORT_* bindings and construct a small remapping dict
            #
            ports = {}
            logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            for key, val in env.items():
                if key.startswith('PORT_'):
                    ports[key[5:]] = int(val)

            #
            # - keep any "ochopod_" environment variable & trim its prefix
            # - default all our settings, especially the mandatory ones
            # - the ip and zookeeper are defaulted to localhost to enable easy testing
            #
            hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info('running in local mode (make sure you run a standalone zookeeper)')
                hints.update(
                    {
                        'fwk':          'marathon (debug)',
                        'ip':           '127.0.0.1',
                        'node':         'local',
                        'ports':        ports,
                        'public':       '127.0.0.1',
                        'zk':           '127.0.0.1:2181'
                    })
            else:

                #
                # - extend our hints
                # - add the application + task
                #
                hints.update(
                    {
                        'application':  env['MARATHON_APP_ID'][1:],
                        'fwk':          'marathon',
                        'ip':           '',
                        'node':         '',
                        'ports':        ports,
                        'public':       '',
                        'task':         env['MESOS_TASK_ID'],
                        'zk':           ''
                    })

                #
                # - use whatever subclass is implementing us to infer 'ip', 'node' and 'public'
                #
                hints.update(self.get_node_details())

                #
                # - look up the zookeeper connection string from an environment variable or on disk
                # - we have to look into different places depending on how mesos was installed
                #
                def _1():

                    #
                    # - most recent DCOS release
                    # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave-common
                    # - the snippet in there is prefixed by MESOS_MASTER=zk://<ip:port>/mesos
                    #
                    logger.debug('checking /opt/mesosphere/etc/mesos-slave-common...')
                    _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave-common")
                    return lines[0][18:].split('/')[0]

                def _2():

                    #
                    # - same as above except for slightly older DCOS releases
                    # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave
                    #
                    logger.debug('checking /opt/mesosphere/etc/mesos-slave...')
                    _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave")
                    return lines[0][18:].split('/')[0]

                def _3():

                    #
                    # - a regular package install will write the slave settings under /etc/mesos/zk (the snippet in
                    #   there looks like zk://10.0.0.56:2181/mesos)
                    #
                    logger.debug('checking /etc/mesos/zk...')
                    _, lines = shell("cat /etc/mesos/zk")
                    return lines[0][5:].split('/')[0]

                def _4():

                    #
                    # - look for ZK from environment variables
                    # - user can pass down ZK using $ochopod_zk
                    #
                    logger.debug('checking $ochopod_zk environment variable...')
                    if env['ochopod_zk']:
                        logger.debug('found $ochopod_zk environment variable...')
                        return env['ochopod_zk'][5:].split('/')[0]

                #
                # - depending on how the slave has been installed we might have to look in various places
                #   to find out what our zookeeper connection string is
                # - warning, a URL-like format such as zk://<ip:port>,..,<ip:port>/mesos is used
                # - just keep the ip & port part and discard the rest
                #
                for method in [_1, _2, _3, _4]:
                    try:
                        hints['zk'] = method()
                        break

                    except Exception:
                        pass

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus mesos setup ?)'
            assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

            #
            # - load the tools
            #
            if tools:
                tools = {tool.tag: tool for tool in [clz() for clz in tools if issubclass(clz, Tool)] if tool.tag}
                logger.info('supporting tools %s' % ', '.join(tools.keys()))

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (marathon) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            hints['metrics'] = {}
            hints['dependencies'] = model.depends_on
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(
                hints['zk'].split(','),
                hints['namespace'],
                hints['cluster'],
                int(hints['port']),
                breadcrumbs,
                model,
                hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():

                logger.debug('http in -> /reset')
                coordinator.tell({'request': 'reset'})
                return '{}', 200, {'Content-Type': 'application/json; charset=utf-8'}

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            # - from @pferro -> the pod's dependencies defined in the model are now added as well
            #
            @web.route('/info', methods=['POST'])
            def _info():

                logger.debug('http in -> /info')
                keys = \
                    [
                        'application',
                        'dependencies',
                        'ip',
                        'metrics',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys, hints.iteritems()))
                return json.dumps(subset), 200, {'Content-Type': 'application/json; charset=utf-8'}

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():

                logger.debug('http in -> /log')
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200, {'Content-Type': 'application/json; charset=utf-8'}

            #
            # - RPC call to run a custom tool within the pod
            #
            @web.route('/exec', methods=['POST'])
            def _exec():

                logger.debug('http in -> /exec')

                #
                # - make sure the command (first token in the X-Shell header) maps to a tool
                # - if no match abort on a 404
                #
                line = request.headers['X-Shell']
                tokens = line.split(' ')
                cmd = tokens[0]
                if not tools or cmd not in tools:
                    return '{}', 404, {'Content-Type': 'application/json; charset=utf-8'}

                code = 1
                tool = tools[cmd]

                #
                # - make sure the parser does not sys.exit()
                #
                class _Parser(ArgumentParser):
                    def exit(self, status=0, message=None):
                        raise ValueError(message)
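
                #
                # - e.g a bad command line such as "X-Shell: <tool> --no-such-flag" will now
                #   surface as a ValueError and be reported back instead of killing the process
                #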

                #
                # - prep a temporary directory
                # - invoke define_cmdline_parsing()
                # - switch off parsing if NotImplementedError is raised
                #
                use_parser = 1
                parser = _Parser(prog=tool.tag)
                try:
                    tool.define_cmdline_parsing(parser)

                except NotImplementedError:
                    use_parser = 0

                tmp = tempfile.mkdtemp()
                try:

                    #
                    # - parse the command line
                    # - upload any attachment
                    #
                    args = parser.parse_args(tokens[1:]) if use_parser else ' '.join(tokens[1:])
                    for tag, upload in request.files.items():
                        where = path.join(tmp, tag)
                        logger.debug('uploading %s @ %s' % (tag, tmp))
                        upload.save(where)

                    #
                    # - run the tool method
                    # - pass the temporary directory as well
                    #
                    logger.info('invoking "%s"' % line)
                    code, lines = tool.body(args, tmp)

                except ValueError as failure:

                    lines = [parser.format_help() if failure.message is None else failure.message]

                except Exception as failure:

                    lines = ['unexpected failure -> %s' % failure]

                finally:

                    #
                    # - make sure to cleanup our temporary directory
                    #
                    shutil.rmtree(tmp)

                out = \
                    {
                        'code': code,
                        'stdout': lines
                    }

                return json.dumps(out), 200, {'Content-Type': 'application/json; charset=utf-8'}
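
            #
            # - e.g invoking a hypothetical "sanity-check" tool with one file attachment
            #   (the tool name and file are illustrative only):
            #
            #       curl -X POST -H "X-Shell: sanity-check --verbose" -F cfg=@settings.yml http://10.0.0.12:8080/exec
            #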

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in an HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in an HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):

                logger.debug('http in -> /control/%s' % task)
                if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                    #
                    # - fail on a HTTP 400 if the request is not supported
                    #
                    return '{}', 400, {'Content-Type': 'application/json; charset=utf-8'}

                try:

                    ts = time.time()
                    latch = ThreadingFuture()
                    executor.tell({'request': task, 'latch': latch, 'data': request.data})
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code, {'Content-Type': 'application/json; charset=utf-8'}

                except Timeout:

                    #
                    # - we failed to match the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408, {'Content-Type': 'application/json; charset=utf-8'}

                except ActorDeadError:

                    #
                    # - the executor has been shutdown (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410, {'Content-Type': 'application/json; charset=utf-8'}
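
            #
            # - e.g switching the pod on and allowing up to 30 seconds for the acknowledgement
            #   (hypothetical address):
            #
            #       curl -X POST http://10.0.0.12:8080/control/on/30
            #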

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():

                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200, {'Content-Type': 'application/json; charset=utf-8'}

            #
            # - run werkzeug from a separate thread to avoid blocking the main one
            # - we'll have to shut it down using a dedicated HTTP POST
            #
            class _Runner(threading.Thread):

                def run(self):
                    web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')
                while 1:

                    #
                    # - simply idle forever (since the framework would restart any container that terminates)
                    # - /log and /hints HTTP requests will succeed (and show the pod as being killed)
                    # - any control request will now fail
                    #
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block first shutdown our executor (which is probably already down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))
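
The HTTP endpoints defined above can be exercised from any client. Below is a minimal client-side sketch (not part of the original source); the pod address, the port and the use of the requests package are assumptions:

    import requests

    # hypothetical pod endpoint (all the routes above are POST-only)
    pod = 'http://10.0.0.12:8080'

    # fetch the dynamic subset of hints exposed by /info
    info = requests.post('%s/info' % pod).json()

    # ask the executor to run a sanity check, waiting up to 10 seconds for the
    # acknowledgement (a 408 means a timeout, a 410 a dead pod)
    resp = requests.post('%s/control/check/10' % pod)
    assert resp.status_code == 200, 'control request failed (HTTP %d)' % resp.status_code
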
Example #5
0
    def boot(self, lifecycle, model=Reactive, local=0):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
        assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - start logging to /var/log/ochopod.log
        #
        logger.info('EC2 kubernetes bindings started')
        web = Flask(__name__)

        #
        # - default presets in case we run outside of kubernetes (local vm testing)
        # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
        #   in the pod configuration yaml/json for instance)
        #
        env = \
            {
                'ochopod_application': '',
                'ochopod_cluster': '',
                'ochopod_debug': 'true',
                'ochopod_local': 'false',
                'ochopod_namespace': 'default',
                'ochopod_port': '8080',
                'ochopod_start': 'true',
                'ochopod_task': ''
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables
            # - isolate the ones prefixed with ochopod_
            #
            logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info('running in local mode (make sure you run a standalone zookeeper)')
                hints.update(
                    {
                        'fwk': 'kubernetes',
                        'ip': '127.0.0.1',
                        'node': 'localhost',
                        'public': '127.0.0.1',
                        'zk': '127.0.0.1:2181'
                    })
            else:

                #
                # - we are (assuming to be) deployed on EC2
                # - we'll retrieve the underlying metadata using curl
                #
                def _aws(token):
                    code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                    assert code == 0, 'unable to look up EC2 metadata for %s (are you running on EC2 ?)' % token
                    return lines[0]

                #
                # - lame workaround to fetch the master IP and credentials as there does not seem to be a way to
                #   use 10.0.0.2 from within the pod yet (or I'm too stupid to find out)
                # - curl to the master to retrieve info about our cluster
                # - don't forget to merge the resulting output
                #
                def _k8s(token):
                    code, lines = shell('curl -f -u %s:%s -k https://%s/api/v1beta3/namespaces/default/%s' % (env['KUBERNETES_USER'], env['KUBERNETES_PWD'], env['KUBERNETES_MASTER'], token))
                    assert code == 0, 'unable to look up the RO service (is the master running ?)'
                    return json.loads(''.join(lines))
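
                #
                # - e.g _k8s('pods/ocho-proxy') curls a URL of the following form (the master
                #   address and credentials are pulled from the environment):
                #
                #       https://$KUBERNETES_MASTER/api/v1beta3/namespaces/default/pods/ocho-proxy
                #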

                #
                # - look our local k8s pod up
                # - get our container ip
                # - extract the port bindings
                # - keep any "ochopod_" environment variable & trim its prefix
                #
                @retry(timeout=60, pause=1)
                def _spin():

                    #
                    # - wait until the k8s pod is running and publishing its IP
                    #
                    cfg = _k8s('pods/%s' % env['HOSTNAME'])
                    assert 'podIP' in cfg['status'], 'pod not ready yet -> %s' % cfg['status']['phase']
                    return cfg

                this_pod = _spin()
                hints['ip'] = this_pod['status']['podIP']

                #
                # - revert to the k8s pod name if no cluster is specified
                #
                if not hints['cluster']:
                    hints['cluster'] = this_pod['metadata']['name']

                #
                # - consider the 1st pod container
                # - grab the exposed ports (no remapping required)
                #
                ports = {}
                container = this_pod['spec']['containers'][0]
                for binding in container['ports']:
                    port = binding['containerPort']
                    ports[str(port)] = port
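
                #
                # - e.g a container declaring {"containerPort": 8080} yields ports = {'8080': 8080}
                #   (an identity mapping, since no port remapping occurs in this setup)
                #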

                #
                # - set 'task' to $HOSTNAME (the container is named after the k8s pod)
                # - get our public IPV4 address
                # - the "node" will show up as the EC2 instance ID
                #
                hints.update(
                    {
                        'fwk': 'k8s-ec2',
                        'node': _aws('instance-id'),
                        'ports': ports,
                        'public': _aws('public-ipv4'),
                        'task': env['HOSTNAME']
                    })

                #
                # - look the k8s "ocho-proxy" pod up
                # - by design it should be running our synchronization zookeeper
                #
                proxy = _k8s('pods/ocho-proxy')
                assert 'podIP' in proxy['status'], 'proxy not ready ?'
                hints['zk'] = proxy['status']['podIP']

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['namespace'], 'no namespace defined (user error ?)'

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (kubernetes/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(
                hints['zk'].split(','),
                hints['namespace'],
                hints['cluster'],
                int(hints['port']),
                breadcrumbs,
                model,
                hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():
                coordinator.tell({'request': 'reset'})
                return '{}', 200

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            #
            @web.route('/info', methods=['POST'])
            def _info():
                keys = \
                    [
                        'application',
                        'ip',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys, hints.iteritems()))
                return json.dumps(subset), 200

            #
            # - external hook exposing our circular log
            # - dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():
                with open(ochopod.LOG, 'r') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in an HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in an HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):
                try:

                    ts = time.time()
                    logger.debug('http in -> /control/%s' % task)
                    latch = ThreadingFuture()
                    executor.tell({'request': task, 'latch': latch, 'data': request.data})
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code

                except Timeout:

                    #
                    # - we failed to match the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408

                except ActorDeadError:

                    #
                    # - the executor has been shutdown (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():
                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200

            class _Runner(threading.Thread):
                """
                Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
                using a dedicated HTTP POST.
                """

                def run(self):
                    web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /hints HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                while 1:
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block first shutdown our executor (which is probably already down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))

        exit(1)
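
As a closing note, a hypothetical pod script wiring these bindings up could look like the sketch below; the module paths and the Piped life-cycle model are assumptions based on the usual ochopod layout, not a verbatim excerpt:

    from ochopod.bindings.ec2.kubernetes import Pod
    from ochopod.models.piped import Actor as Piped

    class Strategy(Piped):

        # hypothetical life-cycle implementation: the returned tuple is the command to
        # spawn as the pod sub-process plus any extra environment variables
        def configure(self, cluster):
            return 'python -m SimpleHTTPServer 9000', {}

    if __name__ == '__main__':
        Pod().boot(Strategy)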