Example #1
0
    def run(self):
        """
        Switch the targeted pods on and report the outcome via self.out.

        A 'control/on' request is fired at the pods of self.cluster selected by self.indices. If
        every reply carries a HTTP 200 the matching reply keys are stored under self.out['on'] and
        self.out['ok'] is set to True. Any failure is only logged (debug level), in which case
        self.out['ok'] is not set.
        """
        try:

            #
            # - return both the number of replies and the keys of the ones that came back with a 200
            # - a mismatch between the two means at least one pod did not switch on
            #
            def _query(zk):
                replies = fire(zk,
                               self.cluster,
                               'control/on',
                               subset=self.indices,
                               timeout=self.timeout)
                return len(replies), [
                    seq for seq, (_, _, code) in replies.items() if code == 200
                ]

            total, js = run(self.proxy, _query)

            #
            # - the message used to mention a failure "to stop" (copy/paste from the off tool)
            #
            assert len(js) == total, '1 or more pod failed to switch on'

            self.out['on'] = js
            self.out['ok'] = True

        except AssertionError as failure:

            #
            # - expected failure, the assert message is self-explanatory
            #
            logger.debug('%s : failed to switch on -> %s' %
                         (self.cluster, failure))

        except Exception as failure:

            #
            # - anything else, log the full diagnostic
            #
            logger.debug('%s : failed to switch on -> %s' %
                         (self.cluster, diagnostic(failure)))
Example #2
0
        def _in():
            """
            Reply with a small JSON payload describing this container and the caller.

            The module-level 'threads' counter is incremented while the request is being processed and
            decremented on the way out, whatever the outcome. Any failure is logged and turned into an
            empty HTTP 500 response.
            """
            global threads
            threads += 1

            try:

                #
                # - pause for a random duration (between 1.0 and 4.5 seconds, in 0.5 increments)
                #
                delay = random.randrange(10, 50, 5)/10.0
                time.sleep(delay)

                #
                # - the caller is the first entry of the access route
                # - default to the remote address if the route is empty
                #
                origin = (request.access_route + [request.remote_addr])[0]
                welcome = settings['welcome']
                where = 'container running @ %s (%s)' % (hints['node'], hints['ip'])
                who = 'http request from %s' % origin
                return json.dumps({'out': '<br/>'.join([welcome, where, who])})

            except Exception as failure:

                logger.error('unexpected failure while receiving -> %s' % diagnostic(failure))
                return '', 500

            finally:

                #
                # - always release our slot
                #
                threads -= 1
Example #3
0
    def run(self):
        """
        Switch the targeted pods off and then back on.

        On success self.out['reset'] holds what the 'control/off' pass returned (the first field of
        each reply that came back with a HTTP 200) and self.out['ok'] is set to True. Any failure is
        only logged (debug level).
        """
        try:

            #
            # - fire one control request at a subset of pods
            # - only keep the replies that came back with a HTTP 200
            #
            def _switch(task, targets):

                def _query(zk):
                    replies = fire(zk, self.cluster, task, subset=targets)
                    return [seq for _, (seq, _, code) in replies.items() if code == 200]

                return run(self.proxy, _query)

            #
            # - first turn off the pods and keep track of the indices
            # - then turn those very same pods back on
            #
            pods = _switch('control/off', self.indices)
            assert pods == _switch('control/on', pods), 'one or more pods failed to switch back on'

            self.out['reset'] = pods
            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
Example #4
0
        def _from_web_shell():
            """
            Run the toolset snippet passed as the 'line' URI argument and reply with its outcome as JSON.

            The reply holds 'ok' (True if the sub-process exited with 0), 'ms' (elapsed milliseconds) and
            'out' (the verbatim sub-process stdout). Upon any unexpected failure 'ok' is False and 'out'
            describes what went wrong. The sub-process runs in a temporary directory which is always
            removed before returning.
            """
            tmp = tempfile.mkdtemp()
            try:

                #
                # - get the shell snippet from the uri
                # - use the 'toolset' python package that's installed in the container
                # - open it
                #
                # NOTE(review): the snippet is interpolated verbatim into a shell command line (shell=True),
                # make sure this endpoint is only reachable by trusted clients
                #
                ts = time.time()
                line = request.args.get('line', 0, type=str)
                logger.debug('http -> shell request "%s"' % line)
                pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=PIPE, env=env, cwd=tmp)

                #
                # - wait for completion using communicate() which drains both pipes as the process runs
                # - a plain wait() may block forever if the sub-process fills one of its pipes up
                # - return as json ('out' contains the verbatim dump from the sub-process stdout)
                #
                out, _ = pid.communicate()
                ms = 1000 * (time.time() - ts)
                return json.dumps({'ok': pid.returncode == 0, 'ms': int(ms), 'out': out})

            except Exception as failure:

                why = diagnostic(failure)
                logger.warning('unexpected failure -> %s' % why)
                return json.dumps({'ok': False, 'out': 'unexpected failure -> %s' % why})

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)
Example #5
0
        def _in():
            """
            Build a short JSON greeting mentioning this container and the origin of the request.

            The module-level 'threads' counter is bumped for the duration of the call. Failures are
            logged and mapped to an empty HTTP 500 response.
            """
            global threads
            threads += 1

            try:

                #
                # - random pause, anywhere from 1.0 to 4.5 seconds
                #
                time.sleep(random.randrange(10, 50, 5) / 10.0)

                #
                # - first hop of the access route (or the remote address if there is no route)
                #
                hops = request.access_route + [request.remote_addr]
                lines = [settings['welcome']]
                lines.append('container running @ %s (%s)' % (hints['node'], hints['ip']))
                lines.append('http request from %s' % hops[0])
                body = '<br/>'.join(lines)
                return json.dumps({'out': body})

            except Exception as failure:

                why = diagnostic(failure)
                logger.error('unexpected failure while receiving -> %s' % why)
                return '', 500

            finally:

                threads -= 1
Example #6
0
    def kill(self, data):
        """
        Handle a kill request: finalize the pod, flag it as dead and acknowledge with a HTTP 200.

        :param data: the state-machine payload (its 'forked' and 'latch' attributes are used)
        :return: the ('spin', data, 0) transition tuple
        :raises Aborted: if a sub-process is still attached to the payload
        """

        #
        # - a process is still around, abort first to get it terminated
        #
        forked = data.forked
        if forked:
            raise Aborted('resetting to terminate pid %s' % forked.pid)

        try:

            #
            # - optional user callback
            #
            logger.info('%s : finalizing pod' % self.path)
            self.finalize()

        except Exception as failure:

            #
            # - nothing we can do to recover, just log it
            # - the request is still acknowledged below (no 406)
            #
            why = diagnostic(failure)
            logger.warning('%s : failed to finalize -> %s' % (self.path, why))

        #
        # - whatever happened request a termination and tag the pod as 'dead'
        # - acknowledge the request and drop it from the command queue
        #
        self.terminate = 1
        self.hints['process'] = 'dead'
        data.latch.set(({}, 200))
        self.commands.popleft()
        return 'spin', data, 0
Example #7
0
    def run(self):
        """
        Cycle the targeted pods: switch them off, reset them and switch them back on.

        Each step must yield the same list as the initial 'control/off' pass. On success that list
        is stored under self.out['reset'] and self.out['ok'] is set to True. Any failure is only
        logged (debug level).
        """
        try:

            #
            # - each step fires one control request at the same subset of pods
            # - a step yields the first field of every reply that came back with a HTTP 200
            #
            def _step(task):

                def _query(zk):
                    replies = fire(zk, self.cluster, task, subset=self.subset)
                    return [seq for _, (seq, _, code) in replies.items() if code == 200]

                return run(self.proxy, _query)

            js = _step('control/off')
            for task in ('reset', 'control/on'):
                assert js == _step(task), 'one or more pods did not respond'

            self.out['reset'] = js
            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
Example #8
0
    def run(self):
        """
        Switch the targeted pods off, reset them and finally switch them back on.

        The reset and switch-on passes are expected to return exactly what the switch-off pass
        returned. On success that list ends up under self.out['reset'] and self.out['ok'] is set to
        True. Any failure is only logged (debug level).
        """
        try:

            #
            # - send one control request to the pods selected by self.indices
            # - collect the first field of each reply flagged with a HTTP 200
            #
            def _send(task):

                def _query(zk):
                    replies = fire(zk, self.cluster, task, subset=self.indices)
                    return [
                        seq for _, (seq, _, code) in replies.items()
                        if code == 200
                    ]

                return run(self.proxy, _query)

            #
            # - first turn the pod off
            #
            js = _send('control/off')

            #
            # - reset it (this will force a reconnection to zookeeper)
            #
            assert js == _send('reset'), 'one or more pods did not respond'

            #
            # - then turn the pod back on
            #
            assert js == _send('control/on'), 'one or more pods did not respond'

            self.out['reset'] = js
            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to reset -> %s' %
                         (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to reset -> %s' %
                         (self.cluster, diagnostic(failure)))
Example #9
0
        def _from_curl():
            """
            Run the toolset snippet passed in the X-Shell header and reply with its outcome as JSON.

            Any multi-part file attached to the request is first saved into a temporary directory, which
            is also where the sub-process runs. The reply holds 'ok' (True if the sub-process exited with
            0), 'ms' (elapsed milliseconds) and 'out' (the sub-process stdout minus its final newline).
            Upon any unexpected failure 'ok' is False and 'out' describes what went wrong. The temporary
            directory is always removed before returning.
            """
            tmp = tempfile.mkdtemp()
            try:

                #
                # - download each multi-part file to a temporary folder
                #
                for tag, upload in request.files.items():
                    where = join(tmp, tag)
                    logger.debug('http -> upload @ %s' % where)
                    upload.save(where)

                #
                # - get the shell snippet to run from the X-Shell header
                # - use the 'toolset' python package that's installed in the container
                # - open it
                #
                # NOTE(review): the snippet is interpolated verbatim into a shell command line (shell=True),
                # make sure this endpoint is only reachable by trusted clients
                #
                ts = time.time()
                line = request.headers['X-Shell']
                logger.debug('http -> shell request "%s"' % line)
                pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=PIPE, env=env, cwd=tmp)

                #
                # - let communicate() drain both pipes until the process exits
                # - reading stdout only could block forever once the (unread) stderr pipe fills up
                # - a manual readline() loop would also truncate the output on the first blank line
                #   seen after the process exited
                #
                out, _ = pid.communicate()

                #
                # - drop the final newline (same output as joining the individual lines)
                #
                if out.endswith('\n'):
                    out = out[:-1]

                ms = 1000 * (time.time() - ts)
                return json.dumps({'ok': pid.returncode == 0, 'ms': int(ms), 'out': out})

            except Exception as failure:

                why = diagnostic(failure)
                logger.warning('unexpected failure -> %s' % why)
                return json.dumps({'ok': False, 'out': 'unexpected failure -> %s' % why})

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)
Example #10
0
    def run(self):
        """
        Switch the targeted pods off and then back on, using self.timeout for each request.

        On success self.out['reset'] holds what the 'control/off' pass returned (the first field of
        each reply that came back with a HTTP 200) and self.out['ok'] is set to True. Any failure is
        only logged (debug level).
        """
        try:

            #
            # - build the query running one control request against a subset of pods
            # - the query only reports the replies flagged with a HTTP 200
            #
            def _query_for(task, targets):

                def _query(zk):
                    replies = fire(zk, self.cluster, task, subset=targets, timeout=self.timeout)
                    return [seq for _, (seq, _, code) in replies.items() if code == 200]

                return _query

            #
            # - first turn off the pods
            # - keep track of the indices
            #
            pods = run(self.proxy, _query_for('control/off', self.indices))

            #
            # - then turn those pod back on
            #
            assert pods == run(self.proxy, _query_for('control/on', pods)), \
                'one or more pods failed to switch back on'

            self.out['reset'] = pods
            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
Example #11
0
        def _import(where, funcs):
            """
            Register the tools exposed by the python scripts located directly under a folder.

            Each '.py' file is loaded as a module. If the module exposes a callable go() its return
            value must be a Template with a non-empty tag, in which case it is stored in funcs under
            that tag. A script that fails to load is logged and skipped. An OSError (raised for instance
            while listing the folder) silently ends the import.

            :param where: the folder to scan
            :param funcs: the dict to update, keyed by tool tag
            """
            try:

                #
                # - list the candidates up-front (regular files ending with .py)
                #
                candidates = [name for name in listdir(where) if name.endswith('.py') and isfile(join(where, name))]
                for script in candidates:
                    try:

                        #
                        # - the module is named after the script minus its .py extension
                        # - skip it if it does not expose a callable go()
                        #
                        module = imp.load_source(script[:-3], join(where, script))
                        go = getattr(module, 'go', None)
                        if not callable(go):
                            continue

                        tool = go()
                        assert isinstance(tool, Template), 'boo'
                        assert tool.tag, ''
                        funcs[tool.tag] = tool

                    except Exception as failure:
                        logger.debug('failed to import %s (%s)' % (script, diagnostic(failure)))

            except OSError:
                pass
Example #12
0
        def _static(path):
            """
            Serve a file from the 'static' directory after a random pause.

            The module-level 'threads' counter is incremented for the duration of the call. Any failure
            is logged and turned into an empty HTTP 500 response.

            :param path: the path of the file to serve, relative to 'static'
            """
            global threads
            threads += 1

            try:

                #
                # - pause between 1.0 and 4.5 seconds (0.5 increments)
                #
                delay = random.randrange(10, 50, 5)/10.0
                time.sleep(delay)
                return send_from_directory('static', path)

            except Exception as failure:

                why = diagnostic(failure)
                logger.error('unexpected failure while receiving -> %s' % why)
                return '', 500

            finally:

                #
                # - always release our slot
                #
                threads -= 1
Example #13
0
def servo(strict=True, verbose=False):
    """
    Generator yielding a callable that forwards command lines to the portal found in /opt/servo/.portal.

    The process exits with 0 once the generator is resumed after its yield and runs to completion. Any
    failure (including one raised at the yield point) is printed and the process exits with 1.

    NOTE(review): presumably wrapped by contextlib.contextmanager outside of this block - TODO confirm.

    :param strict: if True a command reported as failed by the portal trips an assert
    :param verbose: if True print a passed/failed line for each forwarded command
    """
    try:

        #
        # - retrieve the portal coordinates from /opt/servo/.portal
        # - this file is rendered by the pod script upon boot
        # - guard against an empty output, otherwise lines[0] raises before the explicit message below
        #
        _, lines = shell('cat .portal', cwd='/opt/servo')
        portal = lines[0] if lines else None
        assert portal, '/opt/servo/.portal not found (pod not yet configured ?)'

        def _proxy(cmdline):

            #
            # - this block is taken from cli.py in ochothon
            # - in debug mode the verbatim response from the portal is dumped on stdout
            # - slight modification : we force the json output (-j)
            # - any token mapping to a local file is uploaded (-F) and replaced by its basename
            #
            tokens = cmdline.split(' ') + ['-j']
            files = ['-F %s=@%s' % (basename(token), expanduser(token)) for token in tokens if isfile(expanduser(token))]
            line = ' '.join([basename(token) if isfile(expanduser(token)) else token for token in tokens])
            snippet = 'curl -X POST -H "X-Shell:%s" %s %s/shell' % (line, ' '.join(files), portal)
            code, lines = shell(snippet)

            #
            # - compare by value ('is' on an int literal relies on an interpreter implementation detail)
            #
            assert code == 0, 'is the portal @ %s down ?' % portal
            js = json.loads(lines[0])
            ok = js['ok']
            if verbose:
                print('[%s] "%s"' % ('passed' if ok else 'failed', cmdline))
            assert not strict or ok, '"%s" failed' % cmdline
            return json.loads(js['out']) if ok else None

        yield _proxy

        #
        # - all clear, return 0 to signal a success
        # - SystemExit does not derive from Exception and is therefore not trapped below
        #
        sys.exit(0)

    except AssertionError as failure:

        print('failure -> %s' % failure)

    except Exception as failure:

        print('unexpected failure -> %s' % diagnostic(failure))

    sys.exit(1)
Example #14
0
        def _static(path):
            """
            Return a file located under the 'static' directory, after sleeping for a random duration.

            The module-level 'threads' counter is bumped while the request is in flight. Failures are
            logged and mapped to an empty HTTP 500 response.

            :param path: the path of the requested file, relative to 'static'
            """
            global threads
            threads += 1

            try:

                #
                # - random pause, anywhere from 1.0 to 4.5 seconds
                #
                tenths = random.randrange(10, 50, 5)
                time.sleep(tenths / 10.0)
                return send_from_directory('static', path)

            except Exception as failure:

                logger.error('unexpected failure while receiving -> %s' % diagnostic(failure))
                return '', 500

            finally:

                threads -= 1
Example #15
0
        def _from_web_shell():
            """
            Run the toolset snippet passed as the 'line' URI argument and reply with its outcome as JSON.

            The reply holds 'ok' (True if the sub-process exited with 0), 'ms' (elapsed milliseconds) and
            'out' (the sub-process stdout minus its final newline). Upon any unexpected failure 'ok' is
            False and 'out' describes what went wrong. The sub-process runs in a temporary directory which
            is always removed before returning.
            """
            tmp = tempfile.mkdtemp()
            try:

                #
                # - get the shell snippet from the uri
                # - use the 'toolset' python package that's installed in the container
                # - open it
                #
                # NOTE(review): the snippet is interpolated verbatim into a shell command line (shell=True),
                # make sure this endpoint is only reachable by trusted clients
                #
                ts = time.time()
                line = request.args.get('line', 0, type=str)
                logger.debug('http -> shell request "%s"' % line)
                pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=PIPE, env=env, cwd=tmp)

                #
                # - wait for completion using communicate() which drains both pipes as the process runs
                # - reading stdout only could block forever once the (unread) stderr pipe fills up
                # - a manual readline() loop would also truncate the output on the first blank line
                #   seen after the process exited
                #
                out, _ = pid.communicate()

                #
                # - drop the final newline (same output as joining the individual lines)
                # - return as json ('out' contains the dump from the sub-process stdout)
                #
                if out.endswith('\n'):
                    out = out[:-1]

                ms = 1000 * (time.time() - ts)
                return json.dumps({'ok': pid.returncode == 0, 'ms': int(ms), 'out': out})

            except Exception as failure:

                why = diagnostic(failure)
                logger.warning('unexpected failure -> %s' % why)
                return json.dumps({'ok': False, 'out': 'unexpected failure -> %s' % why})

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)
Example #16
0
        def _import(where, funcs):
            """
            Register the tools exposed by the python scripts located directly under a folder.

            Each '.py' file is loaded as a module. If the module exposes a callable go() its return
            value must be a Template with a non-empty tag, in which case it is stored in funcs under
            that tag. A script that fails to load is logged (warning) and skipped. An OSError (raised
            for instance while listing the folder) silently ends the import.

            :param where: the folder to scan
            :param funcs: the dict to update, keyed by tool tag
            """
            try:
                for script in [f for f in listdir(where) if isfile(join(where, f)) and f.endswith('.py')]:
                    try:

                        #
                        # - the module is named after the script minus its .py extension
                        #
                        name = script[:-3]
                        module = imp.load_source(name, join(where, script))
                        if hasattr(module, 'go') and callable(module.go):
                            tool = module.go()
                            assert isinstance(tool, Template), '%s is not inheriting from Template' % name

                            #
                            # - report the module name (this used to be script[-3], i.e. one single character)
                            #
                            assert tool.tag, 'missing tool tag (check the %s module)' % name
                            funcs[tool.tag] = tool

                    except Exception as failure:

                        logger.warning('failed to import %s (%s)' % (script, diagnostic(failure)))

            except OSError:

                #
                # - best effort, ignore the folder altogether
                #
                pass
Example #17
0
    def signal(self, data):
        """
        Handle a user signal by invoking signaled() and replying with whatever it returned.

        The reply is a HTTP 200 carrying the value returned by signaled() (or an empty dict if it is
        falsy), or an empty HTTP 500 if anything was raised.

        :param data: the state-machine payload (its 'js', 'forked' and 'latch' attributes are used)
        :return: the ('spin', data, 0) transition tuple
        """

        try:
            logger.debug('%s : user signal received' % self.path)
            payload = self.signaled(data.js, process=data.forked)
            outcome = (payload or {}), 200

        except Exception as failure:

            #
            # - any failure maps to a 500
            #
            outcome = {}, 500
            why = diagnostic(failure)
            logger.warning('%s : failed to signal -> %s' % (self.path, why))

        #
        # - release the caller and drop the request from the command queue
        #
        data.latch.set(outcome)
        self.commands.popleft()
        return 'spin', data, 0
Example #18
0
    def run(self):
        """
        Switch the targeted pods on and report the outcome via self.out.

        A 'control/on' request is fired at the pods of self.cluster selected by self.indices. If
        every reply carries a HTTP 200 the matching reply keys are stored under self.out['on'] and
        self.out['ok'] is set to True. Any failure is only logged (debug level), in which case
        self.out['ok'] is not set.
        """
        try:

            #
            # - return both the number of replies and the keys of the ones that came back with a 200
            # - a mismatch between the two means at least one pod did not switch on
            #
            def _query(zk):
                replies = fire(zk, self.cluster, 'control/on', subset=self.indices, timeout=self.timeout)
                return len(replies), [seq for seq, (_, _, code) in replies.items() if code == 200]

            total, js = run(self.proxy, _query)

            #
            # - the message used to mention a failure "to stop" (copy/paste from the off tool)
            #
            assert len(js) == total, '1 or more pod failed to switch on'

            self.out['on'] = js
            self.out['ok'] = True

        except AssertionError as failure:

            #
            # - expected failure, the assert message is self-explanatory
            #
            logger.debug('%s : failed to switch on -> %s' % (self.cluster, failure))

        except Exception as failure:

            #
            # - anything else, log the full diagnostic
            #
            logger.debug('%s : failed to switch on -> %s' % (self.cluster, diagnostic(failure)))
Example #19
0
        def index():
            """
            Render the index.html template after a random pause.

            The module-level 'threads' counter is incremented for the duration of the call. Any failure
            is logged and turned into an empty HTTP 500 response.
            """
            global threads
            threads += 1

            try:

                #
                # - pause between 1.0 and 4.5 seconds (0.5 increments)
                #
                delay = random.randrange(10, 50, 5)/10.0
                time.sleep(delay)

                #
                # - index.html contains all the jquery magic that will run the shell and
                #   use ajax to I/O with us
                #
                page = render_template('index.html')
                return page

            except Exception as failure:

                why = diagnostic(failure)
                logger.error('unexpected failure while receiving -> %s' % why)
                return '', 500

            finally:

                #
                # - always release our slot
                #
                threads -= 1
Example #20
0
    def run(self):
        """
        Switch the targeted pods off, reset them and finally switch them back on.

        The reset and switch-on passes are expected to return exactly what the switch-off pass
        returned. On success that list ends up under self.out['reset'] and self.out['ok'] is set to
        True. Any failure is only logged (debug level).
        """
        try:

            #
            # - build the query firing one control request at the pods selected by self.indices
            # - the query reports the first field of each reply flagged with a HTTP 200
            #
            def _query_for(task):

                def _query(zk):
                    replies = fire(zk, self.cluster, task, subset=self.indices)
                    return [seq for _, (seq, _, code) in replies.items() if code == 200]

                return _query

            #
            # - first turn the pod off
            #
            js = run(self.proxy, _query_for('control/off'))

            #
            # - reset it
            # - this will force a reconnection to zookeeper
            #
            assert js == run(self.proxy, _query_for('reset')), 'one or more pods did not respond'

            #
            # - then turn the pod back on
            #
            assert js == run(self.proxy, _query_for('control/on')), 'one or more pods did not respond'

            self.out['reset'] = js
            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
Example #21
0
        def index():
            """
            Serve the rendered index.html template, after sleeping for a random duration.

            The module-level 'threads' counter is bumped while the request is in flight. Failures are
            logged and mapped to an empty HTTP 500 response.
            """
            global threads
            threads += 1

            try:

                #
                # - random pause, anywhere from 1.0 to 4.5 seconds
                #
                tenths = random.randrange(10, 50, 5)
                time.sleep(tenths / 10.0)

                #
                # - index.html contains all the jquery magic that will run the shell and
                #   use ajax to I/O with us
                #
                return render_template('index.html')

            except Exception as failure:

                logger.error('unexpected failure while receiving -> %s' % diagnostic(failure))
                return '', 500

            finally:

                threads -= 1
Example #22
0
    def ok(self, data):
        """
        Handle a control/ok request by invoking configured() with the cluster description received.

        The reply is an empty HTTP 200 on success and an empty HTTP 500 if anything was raised
        (including the case where the request carries no payload).

        :param data: the state-machine payload (its 'js' and 'latch' attributes are used)
        :return: the ('spin', data, 0) transition tuple
        """

        try:

            assert data.js, 'control/ok received out of context (leader bug ?)'
            logger.debug('%s : cluster has been formed, invoking configured()' % self.path)
            cluster = _Cluster(data.js)
            self.configured(cluster)
            reply = {}, 200

        except Exception as failure:

            #
            # - abort on a 500 upon any failure
            # - the message used to read 'failed to signal' (copy/paste from the signal handler)
            #
            reply = {}, 500
            logger.warning('%s : control/ok failed -> %s' % (self.path, diagnostic(failure)))

        #
        # - release the caller and drop the request from the command queue
        #
        data.latch.set(reply)
        self.commands.popleft()
        return 'spin', data, 0
Example #23
0
    def check(self, data):
        """
        Handle a pre-check request by invoking can_configure() with the cluster description received.

        The reply is an empty HTTP 200 if can_configure() returns and an empty HTTP 406 if anything
        was raised.

        :param data: the state-machine payload (its 'js' and 'latch' attributes are used)
        :return: the ('spin', data, 0) transition tuple
        """

        try:

            #
            # - run the user-defined readiness check (typically used to make sure all the required
            #   dependencies are available before starting anything)
            # - acknowledge with a 200 if it went through
            #
            passed = {}, 200
            self.can_configure(_Cluster(data.js))
            data.latch.set(passed)

        except Exception as failure:

            #
            # - anything raised above maps to a HTTP 406
            #
            why = diagnostic(failure)
            logger.warning('%s : failed to run pre-check -> %s' % (self.path, why))
            data.latch.set(({}, 406))

        #
        # - drop the request from the command queue
        #
        self.commands.popleft()
        return 'spin', data, 0
Example #24
0
    def config(self, data):
        """
        Run one configuration pass over the pods listed in the local snapshot.

        The pods are first pre-checked ('check') and the ones replying with a HTTP 410 are dropped. The
        remaining ones are asked to configure ('on', preceded by 'off' if self.full_shutdown is set)
        and finally notified ('ok'). The snapshot and hash nodes are then updated in zookeeper. Any
        assertion failure aborts the pass and schedules another attempt self.damper seconds later.

        :param data: the state-machine payload (its last, dirty, next and next_probe attributes are updated)
        :return: the ('spin', data, SAMPLING) transition tuple
        """

        try:

            #
            # - make sure we persist the latest snapshot to zk
            # - order the dict to make sure we always assign the same index to the same pod
            # - unroll our pods into one URL list
            #
            data.last = None
            pods = self.snapshots['local']
            self.hints['state'] = 'leader (configuring)'
            self.hints['status'] = '* configuring %d pods' % len(pods)

            #
            # - map each pod to its full control URL
            # - this will allow us to send requests directly without worrying about remapping the control port
            # - pay attention to order the pod list to guarantee consistent sequencing
            #
            logger.info('%s : configuring (%d pods, i/o port %d)' % (self.path, len(pods), self.port))
            ordered = sorted(pods.items())
            local = str(self.port)
            urls = \
                {key: ('http://%s:%d' % (js['ip'], js['ports'][local])) for key, js in ordered if local in js['ports']}

            #
            # - they should all expose their control port
            #
            assert len(urls) == len(pods), '1+ pods are not exposing TCP %d (user error ?)' % self.port

            #
            # - this is the basic json payload we'll send to all our pods
            # - it contains all the information they need to know to carry their configuration out
            # - we'll also add each pod identifier + index
            # - 'pods' is the snapshot dict itself (not a copy), dropping a dead pod below updates the payload too
            #
            js = \
                {
                    'pods': pods,
                    'dependencies': {k: v for k, v in self.snapshots.items() if k != 'local'}
                }

            def _control(task):

                #
                # - prepare one HTTP POST thread per pod for the specified control task
                # - return whatever each thread yields upon join(), in the same order
                # NOTE(review): join() presumably returns a (key, HTTP code) pair, confirm against _Post
                #
                threads = []
                for key, url in urls.items():

                    #
                    # - add the key for each pod
                    # - this json payload will be sent over and turned into a Cluster instance on the other side
                    # - inflate the receiving timeout a bit
                    #
                    payload = deepcopy(js)
                    payload['key'] = key
                    seconds = self.grace * 1.25
                    thread = _Post(key, '%s/control/%s/%d' % (url, task, self.grace), js=payload, timeout=seconds)
                    threads.append(thread)

                if self.sequential:

                    #
                    # - start each HTTP POST thread and join immediately
                    # - the thread is passed explicitly (the closure used to rely on the python 2 list
                    #   comprehension leaking its loop variable into the enclosing scope)
                    #
                    def _start_join(thread):
                        thread.start()
                        return thread.join()

                    logger.debug('%s : -> /control/%s (%d pods, sequential)' % (self.path, task, len(pods)))
                    return [_start_join(thread) for thread in threads]

                else:

                    #
                    # - start all the HTTP POST threads at once
                    # - join them one by one
                    #
                    for thread in threads:
                        thread.start()

                    logger.debug('%s : -> /control/%s (%d pods)' % (self.path, task, len(pods)))
                    return [thread.join() for thread in threads]

            #
            # - perform a pre-check, typically to make sure all our dependencies are there
            # - if this fails for whatever reason we'll postpone the configuration to later
            # - note that any dead pod will fail this test
            #
            replies = _control('check')
            dead = [key for key, code in replies if code == 410]
            if dead:
                logger.warning('%s : dropping %d dead pods' % (self.path, len(dead)))
                for key in dead:
                    del pods[key]
                    del urls[key]

            assert all(code in [200, 410] for _, code in replies), '1+ pods failing the pre-check or unreachable'
            if pods:

                #
                # - we have at least one pod alive
                # - if a full shutdown has been requested start by sending a /off to each pod in order
                #
                if self.full_shutdown:
                    _control('off')

                #
                # - send a /on to each pod in order to configure and (re-)start them
                # - note we include an extra 'index' integer to the payload passed to the pod (this index
                #   can be used to tag the pod in logs or perform specific setup procedures)
                #
                logger.debug('%s : json payload ->\n%s' % (self.path, json.dumps(js, indent=4, separators=(',', ': '))))
                logger.info('%s : asking %d pods to configure' % (self.path, len(pods)))
                replies = _control('on')
                assert all(code == 200 for _, code in replies), '1+ pods failing to configure or unreachable'

                #
                # - operation successful -> ask each pod to run its configured() callback
                # - just fire & forget
                #
                _control('ok')

            #
            # - in any case update the md5 hash
            # - update also our /snapshot node (which will propagate if this cluster is a dependency for somebody else)
            #
            latest = self._md5()
            local = json.dumps(pods)
            self.zk.set('%s/%s.%s/snapshot' % (ROOT, self.scope, self.tag), local)
            self.zk.set('%s/%s.%s/hash' % (ROOT, self.scope, self.tag), latest)
            logger.debug('%s : new hash -> %s' % (self.path, latest))
            logger.info('%s : configuration complete (%d pods alive)' % (self.path, len(pods)))

            #
            # - all cool, we can now unset our trigger
            # - keep track of the cluster description
            # - go back to spinning & force a call to probe() right away
            #
            data.dirty = 0
            data.last = js
            data.last['key'] = str(self.id)
            data.next_probe = 0

        except AssertionError as failure:

            #
            # - any assert aborts the procedure
            # - leave the trigger on and reset the timestamp to re-attempt
            # - use warning() like the rest of the file (warn() is a deprecated alias)
            #
            logger.warning('%s : configuration failed -> %s' % (self.path, diagnostic(failure)))
            self.hints['state'] = 'leader (configuration pending)'
            data.next = time.time() + self.damper
            data.last = None

        return 'spin', data, SAMPLING
Example #25
0
    def spin(self, data):
        """
        Leader idle loop: track pod updates, probe the cluster and count down to the next configuration.

        :param data: state-machine payload (its dirty, next, last and next_probe attributes are read/written)
        :return: ('config', data, 0) once the countdown expired, ('spin', data, SAMPLING) otherwise
        :raises Aborted: when the termination trigger is set
        """

        #
        # - bail out immediately if we were asked to terminate
        # - aborting frees the lock so that another controller can take the lead
        #
        if self.terminate:
            raise Aborted('terminating')

        now = time.time()
        if self.updated:

            #
            # - consume the update trigger
            # - fetch the last recorded hash and compare it against the one we compute right now
            #
            self.updated = 0
            last, _ = self.zk.get('%s/%s.%s/hash' % (ROOT, self.scope, self.tag))
            latest = self._md5()
            if latest == last:

                #
                # - same hash, typically a pod that lost its zookeeper connection and joined again
                # - cancel any pending countdown (the damper bridges such transient changes)
                # - refresh the snapshot (which could have been reset to {})
                # - set data.last as well otherwise probing stays disabled
                #
                data.dirty = 0
                pods = self.snapshots['local']
                dependencies = {key: value for key, value in self.snapshots.items() if key != 'local'}
                data.last = {'pods': pods, 'dependencies': dependencies}
                data.last['key'] = str(self.id)
                self.zk.set('%s/%s.%s/snapshot' % (ROOT, self.scope, self.tag), json.dumps(pods))
                logger.debug('%s : pod update with no hash impact (did we just reconnect to zk ?)' % self.path)

            elif not data.dirty:

                #
                # - the hash is different and no countdown is running yet
                # - raise the dirty trigger and schedule the configuration after the damper
                # - the countdown is cancelled if we fall back on the same hash in the meantime
                #
                logger.info('%s : hash changed, configuration in %2.1f seconds' % (self.path, self.damper))
                logger.debug('%s : hash -> %s' % (self.path, latest))
                data.next = now + self.damper
                data.dirty = 1

        if data.dirty:

            #
            # - a configuration is pending
            # - switch to the 'config' state as soon as the countdown reaches zero
            # - otherwise just report how long is left
            #
            self.hints['state'] = 'leader (configuration pending)'
            remaining = max(0, data.next - now)
            self.hints['status'] = '* configuration in %2.1f seconds' % remaining
            if not remaining:
                return 'config', data, 0

            logger.debug('%s : configuration in %2.1f seconds' % (self.path, remaining))
            return 'spin', data, SAMPLING

        #
        # - nothing pending, the cluster is configured
        # - run probe() if we have cluster data and if it is due
        #
        self.hints['state'] = 'leader'
        if data.last and now > data.next_probe:
            try:

                #
                # - hand the latest cluster data over to probe()
                # - the status is whatever it returned (or blank if nothing came back)
                #
                snippet = self.probe(_Cluster(data.last))
                self.hints['status'] = str(snippet) if snippet else ''

            except AssertionError as failure:

                #
                # - probe() asserted, use its message as the status
                #
                self.hints['status'] = '* %s' % failure

            except Exception as failure:

                #
                # - probe() blew up for another reason, flag it
                #
                self.hints['status'] = '* probe() failed (check the code)'
                logger.warning('%s : probe() failed -> %s' % (self.path, diagnostic(failure)))

            #
            # - schedule the next probe regardless of the outcome
            #
            data.next_probe = now + self.probe_every
            if self.hints['status']:
                logger.debug('%s : probe() -> "%s"' % (self.path, self.hints['status']))

        return 'spin', data, SAMPLING
Example #26
0
        def _from_curl():
            """
            Run the 'toolset' command line carried by the X-Shell header and reply with its output.

            Uploaded multi-part files are saved in a temporary directory which is also used as the
            working directory of the sub-process. If $token is set the X-Signature header must match
            the SHA1 HMAC of the command line.

            :return: a (json body, 200, headers) tuple, the body holding 'ok', 'ms' and 'out'
            """

            started = time.time()
            out = []
            ok = False
            tmp = tempfile.mkdtemp()
            try:

                #
                # - the command line to run is passed via the X-Shell header
                #
                assert 'X-Shell' in request.headers, 'X-Shell header missing'
                snippet = request.headers['X-Shell']

                #
                # - if we have a pod token use it to compute the HMAC of the command line
                # - the result must match what the caller sent in X-Signature
                #
                token = os.environ.get('token')
                if token:
                    assert 'X-Signature' in request.headers, 'signature missing (make sure you define $OCHOPOD_TOKEN)'
                    digest = 'sha1=' + hmac.new(token, snippet, hashlib.sha1).hexdigest()
                    assert digest == request.headers['X-Signature'], 'SHA1 signature mismatch (check your token)'

                #
                # - save each uploaded file in our temporary folder
                #
                for tag, upload in request.files.items():
                    where = join(tmp, tag)
                    logger.debug('http -> upload @ %s' % where)
                    upload.save(where)

                #
                # - run the snippet through the 'toolset' executable
                # - the temporary folder is the working directory
                #
                logger.debug('http -> shell request "%s"' % snippet)
                child = Popen('toolset %s' % snippet, shell=True, stdout=PIPE, stderr=None, env=env, cwd=tmp)

                #
                # - drain stdout line by line
                # - stop once there is nothing left to read and the process has exited
                #
                while True:
                    code = child.poll()
                    chunk = child.stdout.readline()
                    if chunk:
                        out.append(chunk.rstrip('\n'))
                    elif code is not None:
                        break

                ok = child.returncode == 0

            except AssertionError as failure:

                out = ['failure -> %s' % failure]

            except Exception as failure:

                out = ['unexpected failure -> %s' % diagnostic(failure)]

            finally:

                #
                # - always get rid of the temporary directory
                #
                shutil.rmtree(tmp)

            #
            # - 'out' is the verbatim stdout dump (or the failure message)
            # - 'ms' is the total time spent in this call
            #
            elapsed = 1000 * (time.time() - started)
            payload = json.dumps({'ok': ok, 'ms': elapsed, 'out': '\n'.join(out)})
            return payload, 200, {'Content-Type': 'application/json; charset=utf-8'}
Example #27
0
                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)

            ms = 1000 * (time.time() - ts)
            js = \
                {
                    'ok': ok,
                    'ms': ms,
                    'out': '\n'.join(out)
                }

            return json.dumps(js), 200, \
                {
                    'Content-Type': 'application/json; charset=utf-8'
                }

        #
        # - run our flask endpoint on TCP 9000
        #
        web.run(host='0.0.0.0', port=9000, threaded=True)

    except Exception as failure:

        logger.fatal('unexpected condition -> %s' % diagnostic(failure))

    finally:

        sys.exit(1)
Example #28
0
    def run(self):
        """
        Bump the docker image version of the marathon application backing self.cluster.

        The pods are first killed via POST /control/kill, the application is then updated with
        the new image tag and we finally wait for the pods to come back. Upon success
        self.out['up'] holds the sequence counters of the pods and self.out['ok'] is set to True.
        Any failure is logged and swallowed (self.out['ok'] is then left untouched).
        """
        try:

            #
            # - we need to pass the framework master IPs around (ugly)
            #
            assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
            master = choice(os.environ['MARATHON_MASTER'].split(','))
            headers = \
                {
                    'content-type': 'application/json',
                    'accept': 'application/json'
                }

            #
            # - first peek and see what pods we have
            # - they should all map to one single marathon application (abort if not)
            # - we'll use the application identifier to retrieve the configuration json later on
            #
            def _query(zk):
                replies = fire(zk, self.cluster, 'info')
                return [hints['application'] for (_, hints, _) in replies.values()]

            js = run(self.proxy, _query)
            assert len(set(js)) == 1, '%s is mapping to 2+ marathon applications' % self.cluster
            app = js[0]

            #
            # - fetch the various versions for our app
            # - we want to get hold of the most recent configuration
            #
            url = 'http://%s/v2/apps/%s/versions' % (master, app)
            reply = get(url, headers=headers)
            code = reply.status_code
            logger.debug('-> %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'version lookup failed (HTTP %d)' % code
            js = reply.json()

            #
            # - retrieve the latest one
            # - keep the docker container configuration and the # of tasks around
            #
            last = js['versions'][0]
            url = 'http://%s/v2/apps/%s/versions/%s' % (master, app, last)
            reply = get(url, headers=headers)
            code = reply.status_code
            logger.debug('-> %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'configuration lookup failed (HTTP %d)' % code
            js = reply.json()

            spec = js['container']
            tag = spec['docker']['image']
            capacity = js['instances']

            #
            # - kill all the pods using a POST /control/kill
            # - wait for them to be dead
            #
            @retry(timeout=self.timeout, pause=0)
            def _spin():
                def _query(zk):
                    replies = fire(zk, self.cluster, 'control/kill', timeout=self.timeout)
                    return [(code, seq) for seq, _, code in replies.values()]

                #
                # - fire the request to one or more pods
                # - wait for every pod to report back a HTTP 410 (GONE)
                # - this means the ochopod state-machine is now idling (e.g dead)
                #
                js = run(self.proxy, _query)
                gone = sum(1 for code, _ in js if code == 410)
                assert gone == len(js), 'at least one pod is still running'
                return

            _spin()

            #
            # - grab the docker image
            # - just add a :<version> suffix (or replace it) but don't change the image proper
            # - only the text after the last ':' can be a version and only if it holds no '/'
            #   (otherwise that ':' belongs to a registry host:port prefix, e.g registry:5000/image)
            # - update the image and PUT the new configuration back
            # - marathon will then kill & re-start all the tasks
            #
            head, sep, tail = tag.rpartition(':')
            image = head if sep and '/' not in tail else tag
            spec['docker']['image'] = '%s:%s' % (image, self.version)
            js = \
                {
                    'container': spec
                }

            url = 'http://%s/v2/apps/%s' % (master, app)
            reply = put(url, data=json.dumps(js), headers=headers)
            code = reply.status_code
            logger.debug('-> %s (HTTP %d)' % (url, code))
            logger.debug(reply.text)
            assert code == 200 or code == 201, 'update failed (HTTP %d)' % code

            #
            # - the pods should now be starting
            # - wait for all the pods to be in the 'running' mode (they are 'dead' right now)
            # - in non-strict mode 'stopped' pods are accepted as well
            # - the sequence counters allocated to our new pods are returned as well
            #
            target = ['running'] if self.strict else ['stopped', 'running']

            @retry(timeout=self.timeout, pause=3, default={})
            def _spin():
                def _query(zk):
                    replies = fire(zk, self.cluster, 'info')
                    return [(hints['process'], seq) for seq, hints, _ in replies.values() if hints['process'] in target]

                js = run(self.proxy, _query)
                assert len(js) == capacity, 'not all pods running yet'
                return js

            #
            # - double-check the count (the retry default is an empty container, which would
            #   yield no sequence counter at all)
            #
            js = _spin()
            up = [seq for _, seq in js]
            assert len(up) == capacity, '1+ pods still not up (%d/%d)' % (len(up), capacity)
            self.out['up'] = up
            self.out['ok'] = True

            logger.debug('%s : %d pods updated to version "%s"' % (self.cluster, capacity, self.version))

        except AssertionError as failure:

            logger.debug('%s : failed to bump -> %s' % (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to bump -> %s' % (self.cluster, diagnostic(failure)))
Example #29
0
    def run(self):
        """
        Deploy the YAML template at self.template as a kubernetes replication controller.

        The template must define 'cluster', 'image' and 'ports' ('settings' is optional). Upon
        success self.deployed is set to self.pods and self.ok to 1. Any failure (missing
        environment variable, invalid YAML, HTTP error) is logged and swallowed.
        """
        try:

            #
            # - workaround to fetch the master IP and credentials as there does not seem to
            #   be a way to use 10.0.0.2 from within the pod
            #
            assert 'KUBERNETES_MASTER' in os.environ,   '$KUBERNETES_MASTER not specified (check your portal pod)'
            assert 'KUBERNETES_USER' in os.environ,     '$KUBERNETES_USER not specified (check your portal pod)'
            assert 'KUBERNETES_PWD' in os.environ,      '$KUBERNETES_PWD not specified (check your portal pod)'

            auth = HTTPBasicAuth(os.environ['KUBERNETES_USER'], os.environ['KUBERNETES_PWD'])

            with open(self.template, 'r') as f:

                #
                # - parse the yaml file
                # - NOTE: this used to be yaml.load() which can instantiate arbitrary python objects,
                #   the template is user-supplied so stick to the safe loader (plain YAML only)
                # - add the ochopod control port if not specified
                #
                cfg = yaml.safe_load(f)
                if 8080 not in cfg['ports']:
                    cfg['ports'].append(8080)

                #
                # - qualify the controller name with the namespace, the cluster and a timestamp
                #   (local time, down to the second)
                #
                suffix = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
                qualified = 'ochopod.%s.%s-%s' % (self.namespace, cfg['cluster'], suffix)

                #
                # - environment passed to each container
                # - the optional settings are serialized to json in $pod
                #
                env = \
                    {
                        'KUBERNETES_MASTER': os.environ['KUBERNETES_MASTER'],
                        'KUBERNETES_USER': os.environ['KUBERNETES_USER'],
                        'KUBERNETES_PWD': os.environ['KUBERNETES_PWD'],
                        'ochopod_cluster': cfg['cluster'],
                        'ochopod_namespace': self.namespace,
                        'ochopod_application': qualified,
                        'pod': json.dumps(cfg['settings']) if 'settings' in cfg else '{}'
                    }

                labels = \
                    {
                        'name': qualified
                    }

                container = \
                    {
                        'name': cfg['cluster'],
                        'image': cfg['image'],
                        'env': [{'name': key, 'value': value} for key, value in env.items()],
                        'ports': [{'containerPort': port} for port in cfg['ports']]
                    }

                controller = \
                    {
                        'kind': 'ReplicationController',
                        'apiVersion': 'v1beta3',
                        'metadata': {'name': qualified},
                        'spec':
                            {
                                'replicas': self.pods,
                                'selector': {'name': qualified},
                                'template':
                                    {
                                        'metadata': {'labels': labels},
                                        'spec':
                                            {
                                                'containers': [container]
                                            }
                                    }
                            }

                    }

                #
                # - POST the controller to the master API
                # - NOTE(review): the TLS certificate is not verified (verify=False)
                #
                headers = \
                    {
                        'content-type': 'application/json',
                        'accept': 'application/json'
                    }

                url = 'https://%s/api/v1beta3/namespaces/default/replicationcontrollers' % os.environ['KUBERNETES_MASTER']
                reply = requests.post(url, auth=auth, data=json.dumps(controller), headers=headers, verify=False)
                code = reply.status_code
                logger.debug('-> POST %s (HTTP %d)' % (url, code))
                assert code == 200 or code == 201, 'submission failed (HTTP %d)' % code

            self.deployed = self.pods
            self.ok = 1

        except AssertionError as failure:

            logger.debug('%s : failed to deploy -> %s' % (self.template, failure))

        except YAMLError as failure:

            #
            # - report the location of the problem if the parser gave us one
            # - otherwise still log the failure (it used to be silently dropped)
            #
            mark = getattr(failure, 'problem_mark', None)
            if mark is not None:
                logger.debug('%s : invalid deploy.yml (line %s, column %s)' % (self.template, mark.line+1, mark.column+1))
            else:
                logger.debug('%s : invalid deploy.yml -> %s' % (self.template, failure))

        except Exception as failure:

            logger.debug('%s : failed to deploy -> %s' % (self.template, diagnostic(failure)))
Example #30
0
    def run(self):
        """
        Kill the pods of self.cluster by deleting their replication controllers and the pods themselves.

        Upon success self.killed is set to the number of pods deleted and self.ok to 1. Any
        failure (missing environment variable, unreachable pod, HTTP error) is logged and swallowed.
        """
        try:

            #
            # - workaround to fetch the master IP and credentials as there does not seem to
            #   be a way to use 10.0.0.2 from within the pod
            #
            assert 'KUBERNETES_MASTER' in os.environ, '$KUBERNETES_MASTER not specified (check your portal pod)'
            assert 'KUBERNETES_USER' in os.environ, '$KUBERNETES_USER not specified (check your portal pod)'
            assert 'KUBERNETES_PWD' in os.environ, '$KUBERNETES_PWD not specified (check your portal pod)'

            auth = HTTPBasicAuth(os.environ['KUBERNETES_USER'], os.environ['KUBERNETES_PWD'])

            #
            # - ask each pod for its hints
            # - only keep the replies that came back on a HTTP 200
            #
            def _query(zk):
                replies = fire(zk, self.cluster, 'info')
                return len(replies), {key: hints for key, (_, hints, code) in replies.items() if code == 200}

            #
            # - each pod refers to its controller via the 'application' hint
            # - several pods may share the same controller, delete each one only once
            #
            total, js = run(self.proxy, _query)
            assert total == len(js), 'failure to communicate with one or more pods'
            for key in {hints['application'] for hints in js.values()}:

                #
                # - HTTP DELETE the controller via the master API
                # - NOTE(review): the TLS certificate is not verified (verify=False)
                #
                url = 'https://%s/api/v1beta3/namespaces/default/replicationcontrollers/%s' % (os.environ['KUBERNETES_MASTER'], key)
                reply = requests.delete(url, auth=auth, verify=False)
                code = reply.status_code
                logger.debug('-> DELETE %s (HTTP %d)' % (url, code))
                assert code == 200 or code == 201, 'replication controller deletion failed (HTTP %d)' % code

            #
            # - the 'task' hint is the pod's identifier
            #
            for hints in js.values():

                #
                # - HTTP DELETE the pod via the master API
                #
                url = 'https://%s/api/v1beta3/namespaces/default/pods/%s' % (os.environ['KUBERNETES_MASTER'], hints['task'])
                reply = requests.delete(url, auth=auth, verify=False)
                code = reply.status_code
                logger.debug('-> DELETE %s (HTTP %d)' % (url, code))
                assert code == 200 or code == 201, 'pod deletion failed (HTTP %d)' % code

            self.killed = len(js)
            self.ok = 1

        except AssertionError as failure:

            logger.debug('%s : failed to kill -> %s' % (self.cluster, failure))

        except Exception as failure:

            #
            # - no yaml is parsed in here, everything else goes through the same catch-all
            #
            logger.debug('%s : failed to kill -> %s' % (self.cluster, diagnostic(failure)))
Example #31
0
    def run(self):
        """
        Scale the marathon application backing self.cluster up or down.

        self.factor is an operator followed by a number: '@<n>' sets the number of pods to n and
        'x<n>' multiplies the current number of pods by n (the result is truncated and clipped
        down to 1). If self.group is set the scaling only applies to the marathon application
        running the pod with that index. self.out['delta'] holds the change in pod count and
        self.out['ok'] is set to True upon success. Any failure is logged and swallowed.
        """
        try:

            #
            # - we need to pass the framework master IPs around (ugly)
            #
            assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
            master = choice(os.environ['MARATHON_MASTER'].split(','))
            headers = \
                {
                    'content-type': 'application/json',
                    'accept': 'application/json'
                }

            #
            # - first peek and see what pods we have
            #
            def _query(zk):
                replies = fire(zk, self.cluster, 'info')
                return [(seq, hints['application'], hints['task'])
                        for (seq, hints, _) in replies.values()]

            #
            # - remap a bit differently and get a list of task identifiers ordered by pod index
            # - we'll use that to kill the newest pods
            # - keep track of how many pods the whole cluster has (this may differ from the # of
            #   pods in the application we scale when -g is used)
            #
            js = run(self.proxy, _query)
            everything = len(js)
            ordered = sorted(js, key=(lambda item: item[0]))
            if self.group is not None:

                #
                # - if -g was specified apply the scaling to the underlying marathon application containing that pod
                # - be careful to update the task list and total # of pods
                #
                keys = {seq: key for (seq, key, _) in js}
                assert self.group in keys, '#%d is not a valid pod index' % self.group
                app = keys[self.group]
                tasks = [(seq, task) for (seq, key, task) in ordered if key == app]

            else:

                #
                # - check and make sure all our pods map to one single marathon application
                #
                keys = set([key for (_, key, _) in js])
                assert len(keys) == 1, '%s maps to more than one application, you must specify -g' % self.cluster
                tasks = [(seq, task) for (seq, _, task) in ordered]
                app = keys.pop()

            total = len(tasks)

            #
            # - infer the target # of pods based on the user-defined factor
            # - '@' is an absolute count, 'x' a multiplier applied to the current count
            #
            op = self.factor[0]
            assert op in ['@', 'x'], 'invalid operator'
            n = float(self.factor[1:])
            target = n if op == '@' else total * n

            #
            # - clip the target # of pods down to 1
            #
            target = max(1, int(target))
            self.out['delta'] = target - total
            if target > total:

                #
                # - scale the application capacity up
                #
                js = \
                    {
                        'instances': target
                    }

                url = 'http://%s/v2/apps/%s' % (master, app)
                reply = put(url, data=json.dumps(js), headers=headers)
                code = reply.status_code
                logger.debug('-> %s (HTTP %d)' % (url, code))
                assert code == 200 or code == 201, 'update failed (HTTP %d)' % code

                #
                # - wait for all our new pods to be there
                # - the query below reports every pod in the cluster, not just the ones from the
                #   application we scaled -> offset the cluster-wide count by the # of pods we added
                #   (without -g this is simply the target)
                #
                expected = everything + target - total

                @retry(timeout=self.timeout, pause=3, default={})
                def _spin():
                    def _query(zk):
                        replies = fire(zk, self.cluster, 'info')
                        return [seq for seq, _, _ in replies.values()]

                    js = run(self.proxy, _query)
                    assert len(js) == expected, 'not all pods running yet'
                    return js

                _spin()

            elif target < total:

                #
                # - if the fifo switch is on make sure to pick the oldest pods for deletion
                # - otherwise drop the pods with the highest indices
                #
                tasks = tasks[:total - target] if self.fifo else tasks[target:]

                #
                # - kill all (or part of) the pods using a POST /control/kill
                # - wait for them to be dead
                #
                @retry(timeout=self.timeout, pause=0)
                def _spin():
                    def _query(zk):
                        indices = [seq for (seq, _) in tasks]
                        replies = fire(zk,
                                       self.cluster,
                                       'control/kill',
                                       subset=indices,
                                       timeout=self.timeout)
                        return [(code, seq)
                                for seq, _, code in replies.values()]

                    #
                    # - fire the request to one or more pods
                    # - wait for every pod to report back a HTTP 410 (GONE)
                    # - this means the ochopod state-machine is now idling (e.g dead)
                    #
                    js = run(self.proxy, _query)
                    gone = sum(1 for code, _ in js if code == 410)
                    assert gone == len(js), 'at least one pod is still running'
                    return

                _spin()

                #
                # - delete all the underlying tasks at once using POST v2/tasks/delete
                # - scale=true is passed so that the tasks are not re-spawned
                #   (NOTE(review): as per the marathon REST API, confirm against your version)
                #
                js = \
                    {
                        'ids': [task for (_, task) in tasks]
                    }

                url = 'http://%s/v2/tasks/delete?scale=true' % master
                reply = post(url, data=json.dumps(js), headers=headers)
                code = reply.status_code
                logger.debug('-> %s (HTTP %d)' % (url, code))
                assert code == 200 or code == 201, 'delete failed (HTTP %d)' % code

            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to scale -> %s' %
                         (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to scale -> %s' %
                         (self.cluster, diagnostic(failure)))
Example #32
0
    def on(self, data):
        """
        Handle a /control/on request: (re-)configure the pod and possibly start its sub-process.

        The outcome is reported through data.latch as a (payload, HTTP code) tuple: 200 if the
        process is running/started (or if the request was skipped) and 406 if anything failed
        during the configuration, in which case a 'kill' request is issued as well.

        :param data: state-machine payload (forked, js, command, env, pids, checks, latch, ...)
        :return: always ('spin', data, 0)
        :raises Aborted: if a process is running and must be terminated before re-configuring
        """

        if data.forked and data.js and (self.strict or data.js['dependencies'] != self.last['dependencies']):

            #
            # - if we already have a process, we want to re-configure -> force a reset first
            # - this will go through a graceful termination process
            # - we'll come back here afterwards (with data.forked set to None)
            #
            raise Aborted('resetting to terminate pid %s first' % data.forked.pid)

        elif data.forked:

            #
            # - the process is already running, fail gracefully on a 200
            # - this is the code-path used for instance up a leader request when strict is false
            #
            reply = {}, 200
            logger.debug('%s : skipping /control/on request' % self.path)
            data.latch.set(reply)

        else:

            #
            # - no more process running, go on with the configuration
            #
            try:

                if not self.initialized:

                    #
                    # - if this is the 1st time the pod is running invoke the initialize() callback
                    # - this is typically used to run once-only stuff such as attaching storage volumes, etc.
                    #
                    logger.info('%s : initializing pod' % self.path)
                    self.initialize()
                    self.initialized = 1

                if data.js:

                    #
                    # - run the configuration procedure if we have some json
                    # - we'll use whatever it returns to popen() a new process
                    # - keep track of the shell command line returned by configure() for later
                    #
                    cluster = _Cluster(data.js)
                    logger.info('%s : configuring pod %d/%d' % (self.path, 1 + cluster.index, cluster.size))
                    data.command, data.env = self.configure(cluster)
                    self.last = data.js

                #
                # - without json we rely on the command line kept from a previous configure()
                #
                assert data.command, 'request to start process while not yet configured (user error ?)'

                #
                # - spawn a new sub-process if the auto-start flag is on OR if we already ran at least once
                # - the start flag comes from the $ochopod_start environment variable
                # - a request carrying no json always spawns
                #
                now = time.time()
                if not data.js or self.start or data.pids > 0:

                    #
                    # - combine our environment variables with the overrides from configure()
                    # - popen() the new process
                    # - reset the sanity check counter
                    # - keep track of its pid to kill it later on
                    # - in shell mode the command line is passed verbatim, otherwise it is split on
                    #   single spaces (e.g quoted arguments are not supported)
                    #
                    env = deepcopy(self.env)
                    env.update(data.env)
                    tokens = data.command if self.shell else data.command.split(' ')
                    data.forked = Popen(tokens, cwd=self.cwd, env=env, shell=self.shell)
                    data.checks = self.checks
                    data.pids += 1
                    self.hints['process'] = 'running'
                    logger.info('%s : popen() #%s -> started <%s> as pid %s' % (self.path, data.pids, data.command, data.forked.pid))
                    if data.env:
                        unrolled = '\n'.join(['\t%s -> %s' % (k, v) for k, v in data.env.items()])
                        logger.debug('%s : extra environment for pid %s ->\n%s' % (self.path, data.forked.pid, unrolled))

                #
                # - reply on a 200 (even if nothing was spawned) and schedule the next sanity check
                #
                reply = {}, 200
                data.next_sanity_check = now + self.check_every
                data.latch.set(reply)

            except Exception as failure:

                #
                # - any failure trapped during the configuration -> HTTP 406
                # - the pod will shutdown automatically as well
                #
                reply = {}, 406
                logger.warning('%s : failed to configure -> %s, shutting down' % (self.path, diagnostic(failure)))
                self._request(['kill'])
                data.latch.set(reply)

        #
        # - drop the head of the command queue (presumably the request we just handled, to be
        #   confirmed against the caller) and go back to spinning
        # - this is skipped when Aborted is raised above
        #
        self.commands.popleft()
        return 'spin', data, 0
Example #33
0
        def _from_curl(scripts):

            #
            # - retrieve the X-Signature header
            # - fast-fail on a HTTP 403 if not there or if there is a mismatch
            #
            if not "X-Signature" in request.headers:
                return "", 403

            #
            # - force a json output if the Accept header matches 'application/json'
            # - otherwise default to a text/plain response
            # - create a temporary directory to run from
            #
            ok = 0
            log = []
            alphabet = string.letters + string.digits
            token = "".join(alphabet[ord(c) % len(alphabet)] for c in os.urandom(8))
            raw = request.accept_mimetypes.best_match(["application/json"]) is None
            tmp = tempfile.mkdtemp()
            try:

                #
                # - any request header in the form X-Var-* will be kept around and passed as
                #   an environment variable when executing the script
                # - make sure the variable is spelled in uppercase
                #
                local = {key[6:].upper(): value for key, value in request.headers.items() if key.startswith("X-Var-")}

                #
                # - craft a unique callback URL that points to this pod
                # - this will be passed down to the script to enable transient testing jobs
                #
                cwd = path.join(tmp, "uploaded")
                local["CALLBACK"] = "http://%s/callback/%s" % (env["local"], token)
                blocked[token] = cwd
                for key, value in local.items():
                    log += ["$%s = %s" % (key, value)]

                #
                # - download the archive
                # - compute the HMAC and compare (use our pod token as the key)
                # - fail on a 403 if mismatch
                #
                where = path.join(tmp, "bundle.tgz")
                request.files["tgz"].save(where)
                with open(where, "rb") as f:
                    bytes = f.read()
                    digest = "sha1=" + hmac.new(env["token"], bytes, hashlib.sha1).hexdigest()
                    if digest != request.headers["X-Signature"]:
                        return "", 403

                #
                # - extract it into its own folder
                # - make sure the requested script is there
                #
                code, _ = shell("mkdir uploaded && tar zxf bundle.tgz -C uploaded", cwd=tmp)
                assert code == 0, "unable to open the archive (bogus payload ?)"

                #
                # - decrypt any file whose extension is .aes
                # - just run openssl directly and dump the output in the working directory
                # - note: at this point we just look for .aes file in the top level directory
                #
                for file in os.listdir(cwd):
                    bare, ext = path.splitext(file)
                    if ext != ".aes":
                        continue

                    code, _ = shell(
                        "openssl enc -d -base64 -aes-256-cbc -k %s -in %s -out %s" % (env["token"], file, bare), cwd=cwd
                    )
                    if code == 0:
                        log += ["decrypted %s" % file]

                #
                # - run each script in order
                # - abort immediately if the script exit code is not zero
                # - keep the script output as a json array
                #
                for script in scripts.split("+"):
                    now = time.time()
                    assert path.exists(path.join(cwd, script)), "unable to find %s (check your scripts)" % script
                    code, lines = shell("python %s 2>&1" % script, cwd=cwd, env=local)
                    log += lines + ["%s ran in %d seconds" % (script, int(time.time() - now))]
                    assert code == 0, "%s failed on exit code %d" % (script, code)

                ok = 1

            except AssertionError as failure:

                log += ["failure (%s)" % failure]

            except Exception as failure:

                log += ["unexpected failure (%s)" % diagnostic(failure)]

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                del blocked[token]
                shutil.rmtree(tmp)

            if raw:

                #
                # - if 'application/json' was not requested simply dump the log as is
                # - force the response code to be HTTP 412 upon failure and HTTP 200 otherwise
                #
                code = 200 if ok else 412
                return "\n".join(log), code, {"Content-Type": "text/plain; charset=utf-8"}

            else:

                #
                # - if 'application/json' was requested always respond with a HTTP 200
                # - the response body then contains our serialized JSON output
                #
                js = {"ok": ok, "log": log}

                return json.dumps(js), 200, {"Content-Type": "application/json; charset=utf-8"}
Example #34
0
    def run(self):
        """
        Scale the marathon application backing self.cluster up or down.

        The target number of pods is derived from self.factor: '@<n>' asks for n pods while 'x<n>' multiplies
        the current count by n. The result is truncated to an integer and clipped to a minimum of 1. If
        self.group is set, the scaling applies to the marathon application which contains that pod index,
        otherwise all the pods in the cluster must map to one single application.

        Scaling up is a PUT against /v2/apps/<app> followed by a wait for the new pods to show up. Scaling
        down first sends a control/kill request to each pod to remove and then POSTs to /v2/tasks/delete.

        Nothing is returned and nothing is raised: self.out['delta'] receives the signed difference between
        the target and the current count, self.out['ok'] is set to True on success and any failure is logged
        at debug level.
        """
        try:

            #
            # - we need to pass the framework master IPs around (ugly)
            #
            assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
            master = choice(os.environ['MARATHON_MASTER'].split(','))
            headers = \
                {
                    'content-type': 'application/json',
                    'accept': 'application/json'
                }

            #
            # - first peek and see what pods we have
            #
            def _query(zk):
                replies = fire(zk, self.cluster, 'info')
                return [(seq, hints['application'], hints['task']) for (seq, hints, _) in replies.values()]

            #
            # - remap a bit differently and get an ordered list of task identifiers
            # - we'll use that to kill the newest pods
            #
            js = run(self.proxy, _query)
            total = len(js)
            if self.group is not None:

                #
                # - if -g was specified apply the scaling to the underlying marathon application containing that pod
                # - be careful to update the task list and total # of pods
                #
                keys = {seq: key for (seq, key, _) in js}
                assert self.group in keys, '#%d is not a valid pod index' % self.group
                app = keys[self.group]
                tasks = [(seq, task) for (seq, key, task) in sorted(js, key=(lambda _: _[0])) if key == app]
                total = sum(1 for (_, key, _) in js if key == app)

            else:

                #
                # - check and make sure all our pods map to one single marathon application
                # - an empty cluster gets its own message (it used to be reported as "more than one application")
                #
                keys = set([key for (_, key, _) in js])
                assert keys, 'no pod found for %s (check the cluster identifier)' % self.cluster
                assert len(keys) == 1, '%s maps to more than one application, you must specify -g' % self.cluster
                tasks = [(seq, task) for (seq, _, task) in sorted(js, key=(lambda _: _[0]))]
                app = keys.pop()

            #
            # - infer the target # of pods based on the user-defined factor
            # - '@' is an absolute count, 'x' a multiplier applied to the current count
            #
            operator = self.factor[0]
            assert operator in ['@', 'x'], 'invalid operator'
            n = float(self.factor[1:])
            target = n if operator == '@' else total * n

            #
            # - clip the target # of pods down to 1
            #
            target = max(1, int(target))
            self.out['delta'] = target - total
            if target > total:

                #
                # - scale the application capacity up
                #
                js = \
                    {
                        'instances': target
                    }

                url = 'http://%s/v2/apps/%s' % (master, app)
                reply = put(url, data=json.dumps(js), headers=headers)
                code = reply.status_code
                logger.debug('-> %s (HTTP %d)' % (url, code))
                assert code in (200, 201), 'update failed (HTTP %d)' % code

                #
                # - wait for all our new pods to be there
                # - only count the pods reporting the application we just scaled: the target is relative to that
                #   application and the cluster may span more than one of them when -g is used
                #
                @retry(timeout=self.timeout, pause=3, default={})
                def _spin():
                    def _query(zk):
                        replies = fire(zk, self.cluster, 'info')
                        return [seq for seq, hints, _ in replies.values() if hints['application'] == app]

                    js = run(self.proxy, _query)
                    assert len(js) == target, 'not all pods running yet'
                    return js

                _spin()

            elif target < total:

                #
                # - if the fifo switch is on make sure to pick the oldest pods for deletion
                # - the task list is sorted by pod index, e.g the head of the list holds the lowest indices
                #
                tasks = tasks[:total - target] if self.fifo else tasks[target:]

                #
                # - kill all (or part of) the pods using a POST /control/kill
                # - wait for them to be dead
                #
                @retry(timeout=self.timeout, pause=0)
                def _spin():
                    def _query(zk):
                        indices = [seq for (seq, _) in tasks]
                        replies = fire(zk, self.cluster, 'control/kill', subset=indices, timeout=self.timeout)
                        return [(code, seq) for seq, _, code in replies.values()]

                    #
                    # - fire the request to one or more pods
                    # - wait for every pod to report back a HTTP 410 (GONE)
                    # - this means the ochopod state-machine is now idling (e.g dead)
                    #
                    js = run(self.proxy, _query)
                    gone = sum(1 for code, _ in js if code == 410)
                    assert gone == len(js), 'at least one pod is still running'
                    return

                _spin()

                #
                # - delete all the underlying tasks at once using POST v2/tasks/delete
                #
                js = \
                    {
                        'ids': [task for (_, task) in tasks]
                    }

                url = 'http://%s/v2/tasks/delete?scale=true' % master
                reply = post(url, data=json.dumps(js), headers=headers)
                code = reply.status_code
                logger.debug('-> %s (HTTP %d)' % (url, code))
                assert code in (200, 201), 'delete failed (HTTP %d)' % code

            self.out['ok'] = True

        except AssertionError as failure:

            logger.debug('%s : failed to scale -> %s' % (self.cluster, failure))

        except Exception as failure:

            logger.debug('%s : failed to scale -> %s' % (self.cluster, diagnostic(failure)))
Example #35
0
def go():
    """
    Entry point for the slave tool-set. This script will look for python modules in the /commands sub-directory.

    Each .py file found there is loaded and, if it exposes a go() callable, that callable is invoked to
    obtain a tool. The tool must be a Template instance with a non-empty tag: the tag is the command-line
    token sequence which selects it. The command-line is then matched against the known tags and the one
    tool that matches is run with the remaining arguments.

    This function never returns: the process exits with the code returned by the tool, with 0 after
    printing the usage for 'help' and with 1 in any other case (unknown or ambiguous command, failure).
    """

    #
    # - start by simplifying a bit the console logger to look more CLI-ish
    #
    for handler in logger.handlers:
        handler.setFormatter(logging.Formatter('%(message)s'))

    try:

        def _import(where, funcs):

            #
            # - load every .py file under 'where' and register the tool it provides in 'funcs' (keyed by tag)
            # - a module failing to load or to comply is reported as a warning and skipped
            # - a missing or unreadable directory is silently ignored
            #
            try:
                for script in [f for f in listdir(where) if isfile(join(where, f)) and f.endswith('.py')]:
                    name = script[:-3]
                    try:
                        module = imp.load_source(name, join(where, script))
                        if hasattr(module, 'go') and callable(module.go):
                            tool = module.go()
                            assert isinstance(tool, Template), '%s is not inheriting from Template' % name
                            assert tool.tag, 'missing tool tag (check the %s module)' % name
                            funcs[tool.tag] = tool

                    except Exception as failure:

                        logger.warning('failed to import %s (%s)' % (script, diagnostic(failure)))

            except OSError:
                pass

        #
        # - disable .pyc generation
        # - scan for tools to import
        # - each .py module must have a go() callable returning a Template with a non-empty tag
        # - the tag tells us what the command-line invocation looks like
        #
        tools = {}
        sys.dont_write_bytecode = True
        _import('%s/commands' % dirname(__file__), tools)

        def _usage():
            return 'available commands -> %s' % ', '.join(sorted(tools.keys()))

        parser = ArgumentParser(description='', prefix_chars='+', usage=_usage())
        parser.add_argument('command', type=str, help='command (e.g ls for instance)')
        parser.add_argument('extra', metavar='extra arguments', type=str, nargs='*', help='zero or more arguments')
        args = parser.parse_args()
        total = [args.command] + args.extra
        if args.command == 'help':
            logger.info(_usage())
            exit(0)

        def _sub(sub):

            #
            # - tell whether the token list 'sub' appears as a contiguous run within the command-line tokens
            #
            span = len(sub)
            return any(sub == total[i:i + span] for i in range(len(total) - span + 1))

        matched = [tag for tag in tools if _sub(tag.split(' '))]
        if not matched:

            logger.info('unknown command (%s)' % _usage())

        elif len(matched) > 1:

            logger.info('more than one command were matched (%s)' % _usage())

        else:

            #
            # - simply invoke the tool
            # - remove the command tokens first and pass the rest as arguments
            # - each tool will parse its own commandline
            #
            picked = matched[0]
            tokens = len(picked.split(' ')) - 1
            exit(tools[picked].run(args.extra[tokens:]))

    except AssertionError as failure:

        logger.error('shutting down <- %s' % failure)

    except Exception as failure:

        logger.error('shutting down <- %s' % diagnostic(failure))

    exit(1)
Example #36
0
    def run(self):
        """
        Deploy the container described by the self.template YAML file as a new marathon application.

        The YAML input is merged with a few defaults and must define at least 'cluster' and 'image'. The
        optional self.suffix is appended to the cluster identifier and any block found in self.overrides for
        the qualified cluster name (<namespace>.<cluster>) is merged into the pod settings. The application
        identifier is time-stamped and submitted with a POST against /v2/apps on a marathon master picked
        at random from $MARATHON_MASTER. The method then waits for self.pods pods to report back.

        Nothing is returned and nothing is raised: self.out['up'] receives the sequence indices of the pods
        that reported back, self.out['ok'] is True if none of them is dead and any failure is logged at
        debug level. If no pod reported back at all the marathon application is deleted.
        """
        try:

            #
            # - we need to pass the framework master IPs around (ugly)
            #
            assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
            master = choice(os.environ['MARATHON_MASTER'].split(','))
            headers = \
                {
                    'content-type': 'application/json',
                    'accept': 'application/json'
                }

            with open(self.template, 'r') as f:

                #
                # - parse the template yaml file (e.g container definition)
                # - NOTE(review): yaml.load() without an explicit safe loader can instantiate arbitrary python
                #   objects, consider yaml.safe_load() if templates may come from an untrusted source
                #
                raw = yaml.load(f)
                assert raw, 'empty YAML input (user error ?)'

                #
                # - merge with our defaults
                # - we want at least the cluster & image settings
                # - TCP 8080 is added by default to the port list
                #
                defaults = \
                    {
                        'start': True,
                        'debug': False,
                        'settings': {},
                        'ports': [8080],
                        'verbatim': {}
                    }

                cfg = merge(defaults, raw)
                assert 'cluster' in cfg, 'cluster identifier undefined (user error ?)'
                assert 'image' in cfg, 'docker image undefined (user error ?)'

                #
                # - if a suffix is specified append it to the cluster identifier
                #
                if self.suffix:
                    cfg['cluster'] = '%s-%s' % (cfg['cluster'], self.suffix)

                #
                # - timestamp the application (we really want a new uniquely identified application)
                # - lookup the optional overrides and merge with our pod settings if specified
                # - this is what happens when the -o option is used
                #
                stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
                qualified = '%s.%s' % (self.namespace, cfg['cluster'])
                application = 'ochopod.%s-%s' % (qualified, stamp)
                if qualified in self.overrides:

                    blk = self.overrides[qualified]
                    logger.debug('%s : overriding %d settings (%s)' % (self.template, len(blk), qualified))
                    cfg['settings'] = merge(cfg['settings'], blk)

                def _nullcheck(cfg, prefix):

                    #
                    # - walk through the settings and flag any null value
                    # - nested dicts are visited recursively, each hit is reported as a dotted path
                    #
                    missing = []
                    if cfg is not None:
                        for key, value in cfg.items():
                            if value is None:
                                missing += ['%s.%s' % ('.'.join(prefix), key)]
                            elif isinstance(value, dict):
                                missing += _nullcheck(value, prefix + [key])

                    return missing

                missing = _nullcheck(cfg['settings'], ['pod'])
                assert not missing, '%d setting(s) missing ->\n\t - %s' % (len(missing), '\n\t - '.join(missing))

                #
                # - if we still have no target default it to 1 single pod
                #
                if not self.pods:
                    self.pods = 1

                #
                # - setup our port list
                # - the port binding is specified either by an integer (container port -> dynamic mesos port), by
                #   two integers (container port -> host port) or by an integer followed by a * (container port ->
                #   same port on the host)
                # - the marathon pods must by design map /etc/mesos
                #
                def _parse_port(token):
                    if isinstance(token, int):
                        return {'containerPort': token}
                    elif isinstance(token, str) and token.endswith(' *'):
                        port = int(token[:-2])
                        return {'containerPort': port, 'hostPort': port}
                    elif isinstance(token, str):

                        #
                        # - split on any run of whitespace so that "1+ spaces" is really what we accept
                        #   (splitting on one single space would reject two integers separated by 2+ spaces)
                        #
                        ports = token.split()
                        assert len(ports) == 2, 'invalid port syntax (must be two integers separated by 1+ spaces)'
                        return {'containerPort': int(ports[0]), 'hostPort': int(ports[1])}
                    else:
                        assert 0, 'invalid port syntax ("%s")' % token

                #
                # - note the marathon-ec2 ochopod bindings will set the application hint automatically
                #   via environment variable (e.g no need to specify it here)
                # - make sure to mount /etc/mesos and /opt/mesosphere to account for various mesos installs
                #
                ports = [_parse_port(token) for token in cfg['ports']] if 'ports' in cfg else []
                spec = \
                    {
                        'id': application,
                        'instances': self.pods,
                        'env':
                            {
                                'ochopod_cluster': cfg['cluster'],
                                'ochopod_debug': str(cfg['debug']).lower(),
                                'ochopod_start': str(cfg['start']).lower(),
                                'ochopod_namespace': self.namespace,
                                'pod': json.dumps(cfg['settings'])
                            },
                        'container':
                            {
                                'type': 'DOCKER',
                                'docker':
                                    {
                                        'forcePullImage': True,
                                        'image': cfg['image'],
                                        'network': 'BRIDGE',
                                        'portMappings': ports
                                    },
                                'volumes':
                                    [
                                        {
                                            'containerPath': '/etc/mesos',
                                            'hostPath': '/etc/mesos',
                                            'mode': 'RO'
                                        },
                                        {
                                            'containerPath': '/opt/mesosphere',
                                            'hostPath': '/opt/mesosphere',
                                            'mode': 'RO'
                                        }
                                    ]
                            }
                    }

                #
                # - if we have a 'verbatim' block in our image definition yaml, merge it now
                #
                if 'verbatim' in cfg:
                    spec = merge(cfg['verbatim'], spec)

                #
                # - pick a marathon master at random
                # - fire the POST /v2/apps to create our application
                # - this will indirectly spawn our pods
                #
                url = 'http://%s/v2/apps' % master
                reply = post(url, data=json.dumps(spec), headers=headers)
                code = reply.status_code
                logger.debug('-> %s (HTTP %d)' % (url, code))
                assert code in (200, 201), 'submission failed (HTTP %d)' % code

                #
                # - wait for all the pods to be in the 'running' mode
                # - the 'application' hint is set by design to the marathon application identifier
                # - the sequence counters allocated to our new pods are returned as well
                #
                target = ['dead', 'running'] if self.strict else ['dead', 'stopped', 'running']
                @retry(timeout=self.timeout, pause=3, default={})
                def _spin():
                    def _query(zk):
                        replies = fire(zk, qualified, 'info')
                        return [(hints['process'], seq) for seq, hints, _ in replies.values()
                                if hints['application'] == application and hints['process'] in target]

                    js = run(self.proxy, _query)
                    assert len(js) == self.pods, 'not all pods running yet'
                    return js

                #
                # - compare the process state by value: an identity test against a string literal depends on
                #   interning and would count dead pods as running
                #
                js = _spin()
                running = sum(1 for state, _ in js if state != 'dead')
                up = [seq for _, seq in js]
                self.out['up'] = up
                self.out['ok'] = self.pods == running
                logger.debug('%s : %d/%d pods are running ' % (self.template, running, self.pods))

                if not up:

                    #
                    # - nothing is running (typically because the image has an issue and is not
                    #   booting the ochopod script for instance, which happens often)
                    # - in that case fire a HTTP DELETE against the marathon application to clean it up
                    #
                    url = 'http://%s/v2/apps/%s' % (master, application)
                    reply = delete(url, headers=headers)
                    code = reply.status_code
                    logger.debug('-> %s (HTTP %d)' % (url, code))
                    assert code in (200, 204), 'application deletion failed (HTTP %d)' % code

        except AssertionError as failure:

            logger.debug('%s : failed to deploy -> %s' % (self.template, failure))

        except YAMLError as failure:

            #
            # - report where the parser choked whenever that information is available
            # - otherwise still log the failure instead of silently dropping it
            #
            if hasattr(failure, 'problem_mark'):
                mark = failure.problem_mark
                logger.debug('%s : invalid deploy.yml (line %s, column %s)' % (self.template, mark.line+1, mark.column+1))
            else:
                logger.debug('%s : invalid deploy.yml (%s)' % (self.template, failure))

        except Exception as failure:

            logger.debug('%s : failed to deploy -> %s' % (self.template, diagnostic(failure)))
Example #37
0
                    except AssertionError as failure:

                        log += ['* %s' % str(failure)]

                    except IOError:

                        log += ['* unable to load integration.yml (missing from the repo ?)']

                    except YAMLError as failure:

                        log += ['* invalid YAML syntax']

                    except Exception as failure:

                        log += ['* unexpected condition -> %s' % diagnostic(failure)]

                finally:

                    #
                    # - make sure to cleanup our temporary directory
                    # - update redis with
                    #
                    if not complete:
                        logger.error('build interrupted (%s)' % log[-1])

                    seconds = int(time.time() - started)
                    status = \
                        {
                            'ok': ok and complete,
                            'sha': sha,
Example #38
0
        def _from_curl():
            """
            Run the toolset command-line passed in the X-Shell request header and reply with its output.

            If the 'token' environment variable is set and not empty the request must carry a X-Signature
            header matching the SHA1 HMAC of the command-line. Any uploaded file is saved in a temporary
            directory which is used as the working directory of the sub-process and removed afterwards.

            The reply is always a HTTP 200 with a JSON payload made of 'ok' (true if the sub-process exited
            on 0), 'ms' (elapsed time in milliseconds) and 'out' (the sub-process stdout or a failure message).
            """
            started = time.time()
            lines = []
            success = False
            scratch = tempfile.mkdtemp()
            try:

                #
                # - the command line to run is mandatory
                #
                assert 'X-Shell' in request.headers, 'X-Shell header missing'
                command = request.headers['X-Shell']

                #
                # - if we have a pod token check the signature of the incoming command line
                # - the HMAC is keyed by the token
                #
                if os.environ.get('token'):
                    assert 'X-Signature' in request.headers, 'signature missing (make sure you define $OCHOPOD_TOKEN)'
                    expected = 'sha1=' + hmac.new(os.environ['token'], command, hashlib.sha1).hexdigest()
                    assert expected == request.headers['X-Signature'], 'SHA1 signature mismatch (check your token)'

                #
                # - save each multi-part upload in our temporary folder (the file is named after its tag)
                #
                for tag, upload in request.files.items():
                    target = join(scratch, tag)
                    logger.debug('http -> upload @ %s' % target)
                    upload.save(target)

                #
                # - hand the command line over to the 'toolset' executable
                # - run it from the temporary folder, only stdout is piped back to us
                #
                logger.debug('http -> shell request "%s"' % command)
                child = Popen('toolset %s' % command, shell=True, stdout=PIPE, stderr=None, env=env, cwd=scratch)

                #
                # - drain stdout line by line
                # - stop once there is nothing left to read and the process has exited
                #
                while True:
                    status = child.poll()
                    chunk = child.stdout.readline()
                    if chunk:
                        lines.append(chunk.rstrip('\n'))
                    elif status is not None:
                        break

                success = child.returncode == 0

            except AssertionError as failure:

                lines = ['failure -> %s' % failure]

            except Exception as failure:

                lines = ['unexpected failure -> %s' % diagnostic(failure)]

            finally:

                #
                # - the temporary folder goes away no matter what
                #
                shutil.rmtree(scratch)

            #
            # - always reply with a HTTP 200, the outcome is in the JSON payload
            #
            elapsed = 1000 * (time.time() - started)
            payload = json.dumps({'ok': success, 'ms': elapsed, 'out': '\n'.join(lines)})
            return payload, 200, {'Content-Type': 'application/json; charset=utf-8'}
Example #39
0
    def spin(self, data):
        """
        Idle state: decide what to do next and return a (state, data, delay) tuple.

        In order of priority: handle a termination request, switch to the first pending command (if any),
        otherwise look after the forked process (exit detection and periodic sanity check) and loop back
        to 'spin' after SAMPLING.
        """
        if self.terminate:

            #
            # - with a process still around raise to force a reset (which will kill it)
            # - otherwise call exitcode() right away
            #
            if data.forked:
                raise Aborted('terminating')

            self.exitcode()

        if self.commands:

            #
            # - at least one request is pending
            # - look at the head of the queue (it is not removed here) and switch the state-machine to it
            #
            state, payload, latch = self.commands[0]
            data.js = payload
            data.latch = latch
            return state, data, 0

        if not data.forked:

            #
            # - no request and no process, just loop back
            #
            return 'spin', data, SAMPLING

        now = time.time()
        if data.forked.poll() is None:

            #
            # - the process is still running
            # - run the user-defined sanity check once in a while
            #
            if now >= data.next_sanity_check:
                try:

                    #
                    # - schedule the next check first, then run this one
                    # - a successful check resets the failure counter
                    #
                    data.next_sanity_check = now + self.check_every
                    self.sanity_check(data.forked.pid)
                    data.checks = self.checks

                except Exception as failure:

                    #
                    # - each failure decrements the counter
                    # - once it reaches zero request the process to be switched off
                    #
                    data.checks -= 1
                    if not data.checks:
                        self._request(['off'])

                    logger.warning('%s : sanity check (%d/%d) failed -> %s' %
                                   (self.path, self.checks - data.checks, self.checks, diagnostic(failure)))

        else:

            code = data.forked.returncode
            if code:

                #
                # - the process died on a non zero exit code
                # - request a off/on cycle to re-run it
                #
                logger.error('%s : pid %s died (code %d), re-running' % (self.path, data.forked.pid, code))
                self._request(['off', 'on'])

            else:

                #
                # - the process exited on 0, e.g its task is done
                # - request a kill to take the pod down
                #
                logger.error('%s : pid %s exited, shutting down' % (self.path, data.forked.pid))
                self._request(['kill'])

        return 'spin', data, SAMPLING