def run(self):
    try:

        def _query(zk):
            replies = fire(zk, self.cluster, 'control/on', subset=self.indices, timeout=self.timeout)
            return len(replies), [seq for seq, (_, _, code) in replies.items() if code == 200]

        total, js = run(self.proxy, _query)
        assert len(js) == total, '1 or more pods failed to switch on'
        self.out['on'] = js
        self.out['ok'] = True

    except AssertionError as failure:
        logger.debug('%s : failed to switch on -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to switch on -> %s' % (self.cluster, diagnostic(failure)))
def _in():
    global threads
    threads += 1
    try:
        time.sleep(random.randrange(10, 50, 5) / 10.0)
        chain = request.access_route + [request.remote_addr]
        host = chain[0]
        lines = \
            [
                settings['welcome'],
                'container running @ %s (%s)' % (hints['node'], hints['ip']),
                'http request from %s' % host
            ]

        return json.dumps({'out': '<br/>'.join(lines)})

    except Exception as failure:
        logger.error('unexpected failure while receiving -> %s' % diagnostic(failure))
        return '', 500

    finally:
        threads -= 1
def run(self):
    try:

        #
        # - first turn off the pods
        # - keep track of the indices
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'control/off', subset=self.indices)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        pods = run(self.proxy, _query)

        #
        # - then turn those pods back on
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'control/on', subset=pods)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        assert pods == run(self.proxy, _query), 'one or more pods failed to switch back on'
        self.out['reset'] = pods
        self.out['ok'] = True

    except AssertionError as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
def _from_web_shell():
    tmp = tempfile.mkdtemp()
    try:

        #
        # - get the shell snippet from the uri
        # - use the 'toolset' python package that's installed in the container
        # - open it
        #
        ts = time.time()
        line = request.args.get('line', 0, type=str)
        logger.debug('http -> shell request "%s"' % line)
        pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=PIPE, env=env, cwd=tmp)

        #
        # - wait for completion
        # - return as json ('out' contains the verbatim dump from the sub-process stdout)
        #
        pid.wait()
        ms = 1000 * (time.time() - ts)
        return json.dumps({'ok': pid.returncode == 0, 'ms': int(ms), 'out': pid.stdout.read()})

    except Exception as failure:
        why = diagnostic(failure)
        logger.warning('unexpected failure -> %s' % why)
        return json.dumps({'ok': False, 'out': 'unexpected failure -> %s' % why})

    finally:

        #
        # - make sure to cleanup our temporary directory
        #
        shutil.rmtree(tmp)
def kill(self, data):

    #
    # - the /kill request will first guarantee we terminate the process
    #
    if data.forked:
        raise Aborted('resetting to terminate pid %s' % data.forked.pid)

    try:

        #
        # - invoke the optional finalize() callback
        #
        logger.info('%s : finalizing pod' % self.path)
        self.finalize()

    except Exception as failure:

        #
        # - log something if for some reason finalize() failed as we can't really recover
        # - don't bother responding with a 406
        #
        logger.warning('%s : failed to finalize -> %s' % (self.path, diagnostic(failure)))

    #
    # - in any case request a termination and tag the pod as 'dead'
    #
    reply = {}, 200
    self.terminate = 1
    self.hints['process'] = 'dead'
    data.latch.set(reply)
    self.commands.popleft()
    return 'spin', data, 0
def run(self):
    try:

        def _query(zk):
            replies = fire(zk, self.cluster, 'control/off', subset=self.subset)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        js = run(self.proxy, _query)

        def _query(zk):
            replies = fire(zk, self.cluster, 'reset', subset=self.subset)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        assert js == run(self.proxy, _query), 'one or more pods did not respond'

        def _query(zk):
            replies = fire(zk, self.cluster, 'control/on', subset=self.subset)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        assert js == run(self.proxy, _query), 'one or more pods did not respond'
        self.out['reset'] = js
        self.out['ok'] = True

    except AssertionError as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
def run(self):
    try:

        #
        # - first turn the pod off
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'control/off', subset=self.indices)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        js = run(self.proxy, _query)

        #
        # - reset it
        # - this will force a reconnection to zookeeper
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'reset', subset=self.indices)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        assert js == run(self.proxy, _query), 'one or more pods did not respond'

        #
        # - then turn the pod back on
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'control/on', subset=self.indices)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        assert js == run(self.proxy, _query), 'one or more pods did not respond'
        self.out['reset'] = js
        self.out['ok'] = True

    except AssertionError as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
def _from_curl():
    tmp = tempfile.mkdtemp()
    try:

        #
        # - download each multi-part file to a temporary folder
        #
        for tag, upload in request.files.items():
            where = join(tmp, tag)
            logger.debug('http -> upload @ %s' % where)
            upload.save(where)

        #
        # - get the shell snippet to run from the X-Shell header
        # - use the 'toolset' python package that's installed in the container
        # - open it
        #
        ts = time.time()
        line = request.headers['X-Shell']
        logger.debug('http -> shell request "%s"' % line)

        #
        # - pipe the process stdout
        # - return as json ('out' contains the verbatim dump from the sub-process stdout)
        #
        outs = []
        pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=PIPE, env=env, cwd=tmp)
        while True:
            line = pid.stdout.readline().rstrip('\n')
            code = pid.poll()
            if line == '' and code is not None:
                break

            outs += [line]

        ms = 1000 * (time.time() - ts)
        return json.dumps({'ok': pid.returncode == 0, 'ms': int(ms), 'out': '\n'.join(outs)})

    except Exception as failure:
        why = diagnostic(failure)
        logger.warning('unexpected failure -> %s' % why)
        return json.dumps({'ok': False, 'out': 'unexpected failure -> %s' % why})

    finally:

        #
        # - make sure to cleanup our temporary directory
        #
        shutil.rmtree(tmp)
def run(self):
    try:

        #
        # - first turn off the pods
        # - keep track of the indices
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'control/off', subset=self.indices, timeout=self.timeout)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        pods = run(self.proxy, _query)

        #
        # - then turn those pods back on
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'control/on', subset=pods, timeout=self.timeout)
            return [seq for _, (seq, _, code) in replies.items() if code == 200]

        assert pods == run(self.proxy, _query), 'one or more pods failed to switch back on'
        self.out['reset'] = pods
        self.out['ok'] = True

    except AssertionError as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to reset -> %s' % (self.cluster, diagnostic(failure)))
def _import(where, funcs):
    try:
        for script in [f for f in listdir(where) if isfile(join(where, f)) and f.endswith('.py')]:
            try:
                module = imp.load_source(script[:-3], join(where, script))
                if hasattr(module, 'go') and callable(module.go):
                    tool = module.go()
                    assert isinstance(tool, Template), '%s is not inheriting from Template' % script[:-3]
                    assert tool.tag, 'missing tool tag (check the %s module)' % script[:-3]
                    funcs[tool.tag] = tool

            except Exception as failure:
                logger.debug('failed to import %s (%s)' % (script, diagnostic(failure)))

    except OSError:
        pass
def _static(path):
    global threads
    threads += 1
    try:
        time.sleep(random.randrange(10, 50, 5) / 10.0)
        return send_from_directory('static', path)

    except Exception as failure:
        logger.error('unexpected failure while receiving -> %s' % diagnostic(failure))
        return '', 500

    finally:
        threads -= 1
def servo(strict=True, verbose=False):
    try:

        #
        # - retrieve the portal coordinates from /opt/servo/.portal
        # - this file is rendered by the pod script upon boot
        #
        _, lines = shell('cat .portal', cwd='/opt/servo')
        portal = lines[0]
        assert portal, '/opt/servo/.portal not found (pod not yet configured ?)'

        def _proxy(cmdline):

            #
            # - this block is taken from cli.py in ochothon
            # - in debug mode the verbatim response from the portal is dumped on stdout
            # - slight modification : we force the json output (-j)
            #
            tokens = cmdline.split(' ') + ['-j']
            files = ['-F %s=@%s' % (basename(token), expanduser(token)) for token in tokens if isfile(expanduser(token))]
            line = ' '.join([basename(token) if isfile(expanduser(token)) else token for token in tokens])
            snippet = 'curl -X POST -H "X-Shell:%s" %s %s/shell' % (line, ' '.join(files), portal)
            code, lines = shell(snippet)
            assert code == 0, 'is the portal @ %s down ?' % portal
            js = json.loads(lines[0])
            ok = js['ok']
            if verbose:
                print '[%s] "%s"' % ('passed' if ok else 'failed', cmdline)

            assert not strict or ok, '"%s" failed' % cmdline
            return json.loads(js['out']) if ok else None

        yield _proxy

        #
        # - all clear, return 0 to signal a success
        #
        sys.exit(0)

    except AssertionError as failure:
        print 'failure -> %s' % failure
    except Exception as failure:
        print 'unexpected failure -> %s' % diagnostic(failure)

    sys.exit(1)
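#
# - a hypothetical usage sketch for the generator above : the bare yield strongly suggests
#   it is meant to be wrapped with contextlib's @contextmanager, which is an assumption here
# - each proxy() call shells one 'toolset' command line out to the portal and hands back the
#   parsed json output (or None when strict is off and the command failed)
#
with servo(strict=True, verbose=True) as proxy:
    js = proxy('grep *')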
def _from_web_shell():
    tmp = tempfile.mkdtemp()
    try:

        #
        # - get the shell snippet from the uri
        # - use the 'toolset' python package that's installed in the container
        # - open it
        #
        ts = time.time()
        line = request.args.get('line', 0, type=str)
        logger.debug('http -> shell request "%s"' % line)
        pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=PIPE, env=env, cwd=tmp)

        #
        # - wait for completion
        # - return as json ('out' contains the verbatim dump from the sub-process stdout)
        #
        outs = []

        #
        # - taken from ochopod's subprocess piping; avoids issues with buffering
        #
        while True:
            line = pid.stdout.readline().rstrip('\n')
            code = pid.poll()
            if line == '' and code is not None:
                break

            outs += [line]

        ms = 1000 * (time.time() - ts)
        return json.dumps({'ok': pid.returncode == 0, 'ms': int(ms), 'out': '\n'.join(outs)})

    except Exception as failure:
        why = diagnostic(failure)
        logger.warning('unexpected failure -> %s' % why)
        return json.dumps({'ok': False, 'out': 'unexpected failure -> %s' % why})

    finally:

        #
        # - make sure to cleanup our temporary directory
        #
        shutil.rmtree(tmp)
def _import(where, funcs):
    try:
        for script in [f for f in listdir(where) if isfile(join(where, f)) and f.endswith('.py')]:
            try:
                module = imp.load_source(script[:-3], join(where, script))
                if hasattr(module, 'go') and callable(module.go):
                    tool = module.go()
                    assert isinstance(tool, Template), '%s is not inheriting from Template' % script[:-3]
                    assert tool.tag, 'missing tool tag (check the %s module)' % script[:-3]
                    funcs[tool.tag] = tool

            except Exception as failure:
                logger.warning('failed to import %s (%s)' % (script, diagnostic(failure)))

    except OSError:
        pass
def signal(self, data):
    try:
        logger.debug('%s : user signal received' % self.path)
        js = self.signaled(data.js, process=data.forked)
        reply = js if js else {}, 200

    except Exception as failure:

        #
        # - abort on a 500 upon any failure
        #
        reply = {}, 500
        logger.warning('%s : failed to signal -> %s' % (self.path, diagnostic(failure)))

    data.latch.set(reply)
    self.commands.popleft()
    return 'spin', data, 0
def index():
    global threads
    threads += 1
    try:
        time.sleep(random.randrange(10, 50, 5) / 10.0)

        #
        # - index.html contains all the jquery magic that will run the shell and
        #   use ajax to I/O with us
        #
        return render_template('index.html')

    except Exception as failure:
        logger.error('unexpected failure while receiving -> %s' % diagnostic(failure))
        return '', 500

    finally:
        threads -= 1
def ok(self, data):
    try:
        assert data.js, 'control/ok received out of context (leader bug ?)'
        logger.debug('%s : cluster has been formed, invoking configured()' % self.path)
        cluster = _Cluster(data.js)
        self.configured(cluster)
        reply = {}, 200

    except Exception as failure:

        #
        # - abort on a 500 upon any failure
        #
        reply = {}, 500
        logger.warning('%s : failed to invoke configured() -> %s' % (self.path, diagnostic(failure)))

    data.latch.set(reply)
    self.commands.popleft()
    return 'spin', data, 0
def check(self, data):
    try:

        #
        # - simply invoke the user-defined readiness check (typically to allow making sure all
        #   the required dependencies are available before starting anything)
        #
        reply = {}, 200
        cluster = _Cluster(data.js)
        self.can_configure(cluster)
        data.latch.set(reply)

    except Exception as failure:

        #
        # - any failure trapped during the configuration -> HTTP 406
        #
        reply = {}, 406
        logger.warning('%s : failed to run pre-check -> %s' % (self.path, diagnostic(failure)))
        data.latch.set(reply)

    self.commands.popleft()
    return 'spin', data, 0
def config(self, data):
    try:

        #
        # - make sure we persist the latest snapshot to zk
        # - order the dict to make sure we always assign the same index to the same pod
        # - unroll our pods into one URL list
        #
        data.last = None
        pods = self.snapshots['local']
        self.hints['state'] = 'leader (configuring)'
        self.hints['status'] = '* configuring %d pods' % len(pods)

        #
        # - map each pod to its full control URL
        # - this will allow us to send requests directly without worrying about remapping the control port
        # - pay attention to order the pod list to guarantee consistent sequencing
        #
        logger.info('%s : configuring (%d pods, i/o port %d)' % (self.path, len(pods), self.port))
        ordered = sorted(pods.items())
        local = str(self.port)
        urls = \
            {key: ('http://%s:%d' % (js['ip'], js['ports'][local])) for key, js in ordered if local in js['ports']}

        #
        # - they should all expose their control port
        #
        assert len(urls) == len(pods), '1+ pods are not exposing TCP %d (user error ?)' % self.port

        #
        # - this is the basic json payload we'll send to all our pods
        # - it contains all the information they need to know to carry their configuration out
        # - we'll also add each pod identifier + index
        #
        js = \
            {
                'pods': pods,
                'dependencies': {k: v for k, v in self.snapshots.items() if k != 'local'}
            }

        def _control(task):
            threads = []
            for key, url in urls.items():

                #
                # - add the key for each pod
                # - this json payload will be sent over and turned into a Cluster instance on the other side
                # - inflate the receiving timeout a bit
                #
                payload = deepcopy(js)
                payload['key'] = key
                seconds = self.grace * 1.25
                thread = _Post(key, '%s/control/%s/%d' % (url, task, self.grace), js=payload, timeout=seconds)
                threads.append(thread)

            if self.sequential:

                #
                # - start each HTTP POST thread and join immediately
                #
                def _start_join(thread):
                    thread.start()
                    return thread.join()

                logger.debug('%s : -> /control/%s (%d pods, sequential)' % (self.path, task, len(pods)))
                return [_start_join(thread) for thread in threads]

            else:

                #
                # - start all the HTTP POST threads at once
                # - join them one by one
                #
                for thread in threads:
                    thread.start()

                logger.debug('%s : -> /control/%s (%d pods)' % (self.path, task, len(pods)))
                return [thread.join() for thread in threads]

        #
        # - perform a pre-check, typically to make sure all our dependencies are there
        # - if this fails for whatever reason we'll postpone the configuration to later
        # - note that any dead pod will fail this test
        #
        replies = _control('check')
        dead = [key for key, code in replies if code == 410]
        if dead:
            logger.warning('%s : dropping %d dead pods' % (self.path, len(dead)))
            for key in dead:
                del pods[key]
                del urls[key]

        assert all(code in [200, 410] for _, code in replies), '1+ pods failing the pre-check or unreachable'
        if pods:

            #
            # - we have at least one pod alive
            # - if a full shutdown has been requested start by sending a /off to each pod in order
            #
            if self.full_shutdown:
                _control('off')

            #
            # - send a /on to each pod in order to configure and (re-)start them
            # - note we include an extra 'index' integer to the payload passed to the pod (this index
            #   can be used to tag the pod in logs or perform specific setup procedures)
            #
            logger.debug('%s : json payload ->\n%s' % (self.path, json.dumps(js, indent=4, separators=(',', ': '))))
            logger.info('%s : asking %d pods to configure' % (self.path, len(pods)))
            replies = _control('on')
            assert all(code == 200 for _, code in replies), '1+ pods failing to configure or unreachable'

            #
            # - operation successful -> ask each pod to run its configured() callback
            # - just fire & forget
            #
            _control('ok')

        #
        # - in any case update the md5 hash
        # - update also our /snapshot node (which will propagate if this cluster is a dependency for somebody else)
        #
        latest = self._md5()
        local = json.dumps(pods)
        self.zk.set('%s/%s.%s/snapshot' % (ROOT, self.scope, self.tag), local)
        self.zk.set('%s/%s.%s/hash' % (ROOT, self.scope, self.tag), latest)
        logger.debug('%s : new hash -> %s' % (self.path, latest))
        logger.info('%s : configuration complete (%d pods alive)' % (self.path, len(pods)))

        #
        # - all cool, we can now unset our trigger
        # - keep track of the cluster description
        # - go back to spinning & force a call to probe() right away
        #
        data.dirty = 0
        data.last = js
        data.last['key'] = str(self.id)
        data.next_probe = 0

    except AssertionError as failure:

        #
        # - any assert aborts the procedure
        # - leave the trigger on and reset the timestamp to re-attempt
        #
        logger.warn('%s : configuration failed -> %s' % (self.path, diagnostic(failure)))
        self.hints['state'] = 'leader (configuration pending)'
        data.next = time.time() + self.damper
        data.last = None

    return 'spin', data, SAMPLING
def spin(self, data):

    #
    # - if the termination trigger is set or if we lost our connection, abort immediately
    # - this will free the lock and another controller will take the lead
    #
    if self.terminate:
        raise Aborted('terminating')

    #
    # - if it is time to run the probe callback do it now
    # - schedule the next one
    #
    now = time.time()
    if self.updated:

        #
        # - the update trigger is on
        # - unset it and query the last recorded hash
        # - any difference with what we have means we need to schedule a configuration
        #
        self.updated = 0
        last, stats = self.zk.get('%s/%s.%s/hash' % (ROOT, self.scope, self.tag))
        latest = self._md5()
        bad = latest != last
        if bad and not data.dirty:

            #
            # - the hash changed, switch the dirty trigger on
            # - this will start the countdown to configuration (which can be aborted if we fall back
            #   on the same hash again, typically after a transient zookeeper connection loss)
            #
            logger.info('%s : hash changed, configuration in %2.1f seconds' % (self.path, self.damper))
            logger.debug('%s : hash -> %s' % (self.path, latest))
            data.next = now + self.damper
            data.dirty = 1

        elif not bad:

            #
            # - this case would typically map to a pod losing cnx to zk and joining again later
            # - based on how much damper we allow we can bridge transient idempotent changes
            # - very important -> make sure we set the snapshot (which could have been reset to {})
            # - don't also forget to set data.last to enable probing
            #
            data.dirty = 0
            pods = self.snapshots['local']
            js = \
                {
                    'pods': pods,
                    'dependencies': {k: v for k, v in self.snapshots.items() if k != 'local'}
                }

            data.last = js
            data.last['key'] = str(self.id)
            self.zk.set('%s/%s.%s/snapshot' % (ROOT, self.scope, self.tag), json.dumps(pods))
            logger.debug('%s : pod update with no hash impact (did we just reconnect to zk ?)' % self.path)

    if not data.dirty:

        #
        # - all cool, the cluster is configured
        # - set the state as 'leader'
        # - fire a probe() if it is time to do so
        #
        self.hints['state'] = 'leader'
        if data.last and now > data.next_probe:
            try:

                #
                # - pass the latest cluster data to the probe() call
                # - if successful (e.g did not assert) set the status to whatever the callable returned
                # - unset if nothing was returned
                #
                snippet = self.probe(_Cluster(data.last))
                self.hints['status'] = str(snippet) if snippet else ''

            except AssertionError as failure:

                #
                # - set the status to the assert message
                #
                self.hints['status'] = '* %s' % failure

            except Exception as failure:

                #
                # - something blew up in probe(), set the status accordingly
                #
                self.hints['status'] = '* probe() failed (check the code)'
                logger.warning('%s : probe() failed -> %s' % (self.path, diagnostic(failure)))

            data.next_probe = now + self.probe_every
            if self.hints['status']:
                logger.debug('%s : probe() -> "%s"' % (self.path, self.hints['status']))

    else:

        #
        # - trigger the configuration procedure
        #
        self.hints['state'] = 'leader (configuration pending)'
        remaining = max(0, data.next - now)
        self.hints['status'] = '* configuration in %2.1f seconds' % remaining
        if not remaining:
            return 'config', data, 0

        #
        # - print some cool countdown
        #
        else:
            logger.debug('%s : configuration in %2.1f seconds' % (self.path, remaining))

    return 'spin', data, SAMPLING
def _from_curl():
    out = []
    ok = False
    ts = time.time()
    tmp = tempfile.mkdtemp()
    try:

        #
        # - retrieve the command line
        #
        assert 'X-Shell' in request.headers, 'X-Shell header missing'
        line = request.headers['X-Shell']

        #
        # - compute the incoming command line HMAC and compare (use our pod token as the key)
        #
        if 'token' in os.environ and os.environ['token']:
            assert 'X-Signature' in request.headers, 'signature missing (make sure you define $OCHOPOD_TOKEN)'
            digest = 'sha1=' + hmac.new(os.environ['token'], line, hashlib.sha1).hexdigest()
            assert digest == request.headers['X-Signature'], 'SHA1 signature mismatch (check your token)'

        #
        # - download each multi-part file to a temporary folder
        #
        for tag, upload in request.files.items():
            where = join(tmp, tag)
            logger.debug('http -> upload @ %s' % where)
            upload.save(where)

        #
        # - get the shell snippet to run from the X-Shell header
        # - use the 'toolset' python package that's installed in the container
        # - open it
        #
        logger.debug('http -> shell request "%s"' % line)
        pid = Popen('toolset %s' % line, shell=True, stdout=PIPE, stderr=None, env=env, cwd=tmp)

        #
        # - pipe the process stdout
        # - return as json ('out' contains the verbatim dump from the sub-process stdout)
        #
        while 1:
            code = pid.poll()
            line = pid.stdout.readline()
            if not line and code is not None:
                break
            elif line:
                out += [line.rstrip('\n')]

        ok = pid.returncode == 0

    except AssertionError as failure:
        out = ['failure -> %s' % failure]
    except Exception as failure:
        out = ['unexpected failure -> %s' % diagnostic(failure)]
    finally:

        #
        # - make sure to cleanup our temporary directory
        #
        shutil.rmtree(tmp)

    ms = 1000 * (time.time() - ts)
    js = \
        {
            'ok': ok,
            'ms': ms,
            'out': '\n'.join(out)
        }

    return json.dumps(js), 200, \
        {
            'Content-Type': 'application/json; charset=utf-8'
        }
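#
# - a client-side sketch of the handshake the endpoint above expects, written with the
#   'requests' package (the /shell route is inferred from the curl snippet built by the
#   servo helper elsewhere in this code, so treat the path as an assumption)
# - the X-Signature digest mirrors the server-side check : a sha1 HMAC of the X-Shell
#   command line keyed with the pod token
#
import hashlib
import hmac
import requests

def invoke(portal, token, line, files=None):

    #
    # - sign the command line and set both headers
    # - any attached files go out as multi-part uploads (they land in the endpoint's
    #   temporary working directory)
    #
    digest = 'sha1=' + hmac.new(token, line, hashlib.sha1).hexdigest()
    headers = \
        {
            'X-Shell': line,
            'X-Signature': digest
        }

    reply = requests.post('http://%s/shell' % portal, headers=headers, files=files or {})
    return reply.json()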
        #
        # - make sure to cleanup our temporary directory
        #
        shutil.rmtree(tmp)

    ms = 1000 * (time.time() - ts)
    js = \
        {
            'ok': ok,
            'ms': ms,
            'out': '\n'.join(out)
        }

    return json.dumps(js), 200, \
        {
            'Content-Type': 'application/json; charset=utf-8'
        }

    #
    # - run our flask endpoint on TCP 9000
    #
    web.run(host='0.0.0.0', port=9000, threaded=True)

except Exception as failure:
    logger.fatal('unexpected condition -> %s' % diagnostic(failure))

finally:
    sys.exit(1)
def run(self):
    try:

        #
        # - we need to pass the framework master IPs around (ugly)
        #
        assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
        master = choice(os.environ['MARATHON_MASTER'].split(','))
        headers = \
            {
                'content-type': 'application/json',
                'accept': 'application/json'
            }

        #
        # - first peek and see what pods we have
        # - they should all map to one single marathon application (abort if not)
        # - we'll use the application identifier to retrieve the configuration json later on
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'info')
            return [hints['application'] for (_, hints, _) in replies.values()]

        js = run(self.proxy, _query)
        assert len(set(js)) == 1, '%s is mapping to 2+ marathon applications' % self.cluster
        app = js[0]

        #
        # - fetch the various versions for our app
        # - we want to get hold of the most recent configuration
        #
        url = 'http://%s/v2/apps/%s/versions' % (master, app)
        reply = get(url, headers=headers)
        code = reply.status_code
        logger.debug('-> %s (HTTP %d)' % (url, code))
        assert code == 200 or code == 201, 'version lookup failed (HTTP %d)' % code
        js = reply.json()

        #
        # - retrieve the latest one
        # - keep the docker container configuration and the # of tasks around
        #
        last = js['versions'][0]
        url = 'http://%s/v2/apps/%s/versions/%s' % (master, app, last)
        reply = get(url, headers=headers)
        code = reply.status_code
        logger.debug('-> %s (HTTP %d)' % (url, code))
        assert code == 200 or code == 201, 'version lookup failed (HTTP %d)' % code
        js = reply.json()
        spec = js['container']
        tag = spec['docker']['image']
        capacity = js['instances']

        #
        # - kill all the pods using a POST /control/kill
        # - wait for them to be dead
        #
        @retry(timeout=self.timeout, pause=0)
        def _spin():
            def _query(zk):
                replies = fire(zk, self.cluster, 'control/kill', timeout=self.timeout)
                return [(code, seq) for seq, _, code in replies.values()]

            #
            # - fire the request to one or more pods
            # - wait for every pod to report back a HTTP 410 (GONE)
            # - this means the ochopod state-machine is now idling (e.g dead)
            #
            js = run(self.proxy, _query)
            gone = sum(1 for code, _ in js if code == 410)
            assert gone == len(js), 'at least one pod is still running'
            return

        _spin()

        #
        # - grab the docker image
        # - just add a :<version> suffix (or replace it) but don't change the image proper
        # - update the image and PUT the new configuration back
        # - marathon will then kill & re-start all the tasks
        #
        tokens = tag.split(':')
        spec['docker']['image'] = \
            '%s:%s' % (tag, self.version) if len(tokens) < 2 else '%s:%s' % (tokens[0], self.version)
        js = \
            {
                'container': spec
            }

        url = 'http://%s/v2/apps/%s' % (master, app)
        reply = put(url, data=json.dumps(js), headers=headers)
        code = reply.status_code
        logger.debug('-> %s (HTTP %d)' % (url, code))
        logger.debug(reply.text)
        assert code == 200 or code == 201, 'update failed (HTTP %d)' % code

        #
        # - the pods should now be starting
        # - wait for all the pods to be in the 'running' mode (they are 'dead' right now)
        # - the sequence counters allocated to our new pods are returned as well
        #
        target = ['running'] if self.strict else ['stopped', 'running']

        @retry(timeout=self.timeout, pause=3, default={})
        def _spin():
            def _query(zk):
                replies = fire(zk, self.cluster, 'info')
                return [(hints['process'], seq) for seq, hints, _ in replies.values() if hints['process'] in target]

            js = run(self.proxy, _query)
            assert len(js) == capacity, 'not all pods running yet'
            return js

        js = _spin()
        up = [seq for _, seq in js]
        assert len(up) == capacity, '1+ pods still not up (%d/%d)' % (len(up), capacity)
        self.out['up'] = up
        self.out['ok'] = True
        logger.debug('%s : %d pods updated to version "%s"' % (self.cluster, capacity, self.version))

    except AssertionError as failure:
        logger.debug('%s : failed to bump -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to bump -> %s' % (self.cluster, diagnostic(failure)))
def run(self):
    try:

        #
        # - workaround to fetch the master IP and credentials as there does not seem to
        #   be a way to use 10.0.0.2 from within the pod
        #
        assert 'KUBERNETES_MASTER' in os.environ, '$KUBERNETES_MASTER not specified (check your portal pod)'
        assert 'KUBERNETES_USER' in os.environ, '$KUBERNETES_USER not specified (check your portal pod)'
        assert 'KUBERNETES_PWD' in os.environ, '$KUBERNETES_PWD not specified (check your portal pod)'
        auth = HTTPBasicAuth(os.environ['KUBERNETES_USER'], os.environ['KUBERNETES_PWD'])
        with open(self.template, 'r') as f:

            #
            # - parse the yaml file
            # - add the ochopod control port if not specified
            #
            cfg = yaml.load(f)
            if 8080 not in cfg['ports']:
                cfg['ports'].append(8080)

            #
            # - craft a unique, timestamped application identifier
            #
            suffix = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
            qualified = 'ochopod.%s.%s-%s' % (self.namespace, cfg['cluster'], suffix)
            env = \
                {
                    'KUBERNETES_MASTER': os.environ['KUBERNETES_MASTER'],
                    'KUBERNETES_USER': os.environ['KUBERNETES_USER'],
                    'KUBERNETES_PWD': os.environ['KUBERNETES_PWD'],
                    'ochopod_cluster': cfg['cluster'],
                    'ochopod_namespace': self.namespace,
                    'ochopod_application': qualified,
                    'pod': json.dumps(cfg['settings']) if 'settings' in cfg else '{}'
                }

            labels = \
                {
                    'name': qualified
                }

            container = \
                {
                    'name': cfg['cluster'],
                    'image': cfg['image'],
                    'env': [{'name': key, 'value': value} for key, value in env.items()],
                    'ports': [{'containerPort': port} for port in cfg['ports']]
                }

            controller = \
                {
                    'kind': 'ReplicationController',
                    'apiVersion': 'v1beta3',
                    'metadata': {'name': qualified},
                    'spec':
                        {
                            'replicas': self.pods,
                            'selector': {'name': qualified},
                            'template':
                                {
                                    'metadata': {'labels': labels},
                                    'spec':
                                        {
                                            'containers': [container]
                                        }
                                }
                        }
                }

            #
            # - POST the replication controller to the master API
            #
            headers = \
                {
                    'content-type': 'application/json',
                    'accept': 'application/json'
                }

            url = 'https://%s/api/v1beta3/namespaces/default/replicationcontrollers' % os.environ['KUBERNETES_MASTER']
            reply = requests.post(url, auth=auth, data=json.dumps(controller), headers=headers, verify=False)
            code = reply.status_code
            logger.debug('-> POST %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'submission failed (HTTP %d)' % code
            self.deployed = self.pods
            self.ok = 1

    except AssertionError as failure:
        logger.debug('%s : failed to deploy -> %s' % (self.template, failure))
    except YAMLError as failure:
        if hasattr(failure, 'problem_mark'):
            mark = failure.problem_mark
            logger.debug('%s : invalid deploy.yml (line %s, column %s)' % (self.template, mark.line + 1, mark.column + 1))
    except Exception as failure:
        logger.debug('%s : failed to deploy -> %s' % (self.template, diagnostic(failure)))
def run(self):
    try:

        #
        # - workaround to fetch the master IP and credentials as there does not seem to
        #   be a way to use 10.0.0.2 from within the pod
        #
        assert 'KUBERNETES_MASTER' in os.environ, '$KUBERNETES_MASTER not specified (check your portal pod)'
        assert 'KUBERNETES_USER' in os.environ, '$KUBERNETES_USER not specified (check your portal pod)'
        assert 'KUBERNETES_PWD' in os.environ, '$KUBERNETES_PWD not specified (check your portal pod)'
        auth = HTTPBasicAuth(os.environ['KUBERNETES_USER'], os.environ['KUBERNETES_PWD'])

        def _query(zk):
            replies = fire(zk, self.cluster, 'info')
            return len(replies), {key: hints for key, (_, hints, code) in replies.items() if code == 200}

        #
        # - each pod refers to its controller via the 'application' hint
        #
        total, js = run(self.proxy, _query)
        assert total == len(js), 'failure to communicate with one or more pods'
        for key in set([hints['application'] for hints in js.values()]):

            #
            # - HTTP DELETE the controller via the master API
            #
            url = 'https://%s/api/v1beta3/namespaces/default/replicationcontrollers/%s' % (os.environ['KUBERNETES_MASTER'], key)
            reply = requests.delete(url, auth=auth, verify=False)
            code = reply.status_code
            logger.debug('-> DELETE %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'replication controller deletion failed (HTTP %d)' % code

        #
        # - the 'task' hint is the pod's identifier
        #
        for key, hints in js.items():

            #
            # - HTTP DELETE the pod via the master API
            #
            url = 'https://%s/api/v1beta3/namespaces/default/pods/%s' % (os.environ['KUBERNETES_MASTER'], hints['task'])
            reply = requests.delete(url, auth=auth, verify=False)
            code = reply.status_code
            logger.debug('-> DELETE %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'pod deletion failed (HTTP %d)' % code

        self.killed = len(js)
        self.ok = 1

    except AssertionError as failure:
        logger.debug('%s : failed to kill -> %s' % (self.cluster, failure))
    except YAMLError as failure:
        if hasattr(failure, 'problem_mark'):
            mark = failure.problem_mark
            logger.debug('%s : invalid deploy.yml (line %s, column %s)' % (self.cluster, mark.line + 1, mark.column + 1))
    except Exception as failure:
        logger.debug('%s : failed to kill -> %s' % (self.cluster, diagnostic(failure)))
def run(self):
    try:

        #
        # - we need to pass the framework master IPs around (ugly)
        #
        assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
        master = choice(os.environ['MARATHON_MASTER'].split(','))
        headers = \
            {
                'content-type': 'application/json',
                'accept': 'application/json'
            }

        #
        # - first peek and see what pods we have
        #
        def _query(zk):
            replies = fire(zk, self.cluster, 'info')
            return [(seq, hints['application'], hints['task']) for (seq, hints, _) in replies.values()]

        #
        # - remap a bit differently and get an ordered list of task identifiers
        # - we'll use that to kill the newest pods
        #
        js = run(self.proxy, _query)
        total = len(js)
        if self.group is not None:

            #
            # - if -g was specified apply the scaling to the underlying marathon application containing that pod
            # - be careful to update the task list and total # of pods
            #
            keys = {seq: key for (seq, key, _) in js}
            assert self.group in keys, '#%d is not a valid pod index' % self.group
            app = keys[self.group]
            tasks = [(seq, task) for (seq, key, task) in sorted(js, key=(lambda _: _[0])) if key == app]
            total = sum(1 for (_, key, _) in js if key == app)

        else:

            #
            # - check and make sure all our pods map to one single marathon application
            #
            keys = set([key for (_, key, _) in js])
            assert len(keys) == 1, '%s maps to more than one application, you must specify -g' % self.cluster
            tasks = [(seq, task) for (seq, _, task) in sorted(js, key=(lambda _: _[0]))]
            app = keys.pop()

        #
        # - infer the target # of pods based on the user-defined factor
        #
        operator = self.factor[0]
        assert operator in ['@', 'x'], 'invalid operator'
        n = float(self.factor[1:])
        target = n if operator == '@' else total * n

        #
        # - clip the target # of pods down to 1
        #
        target = max(1, int(target))
        self.out['delta'] = target - total
        if target > total:

            #
            # - scale the application capacity up
            #
            js = \
                {
                    'instances': target
                }

            url = 'http://%s/v2/apps/%s' % (master, app)
            reply = put(url, data=json.dumps(js), headers=headers)
            code = reply.status_code
            logger.debug('-> %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'update failed (HTTP %d)' % code

            #
            # - wait for all our new pods to be there
            #
            @retry(timeout=self.timeout, pause=3, default={})
            def _spin():
                def _query(zk):
                    replies = fire(zk, self.cluster, 'info')
                    return [seq for seq, _, _ in replies.values()]

                js = run(self.proxy, _query)
                assert len(js) == target, 'not all pods running yet'
                return js

            _spin()

        elif target < total:

            #
            # - if the fifo switch is on make sure to pick the oldest pods for deletion
            #
            tasks = tasks[:total - target] if self.fifo else tasks[target:]

            #
            # - kill all (or part of) the pods using a POST /control/kill
            # - wait for them to be dead
            #
            @retry(timeout=self.timeout, pause=0)
            def _spin():
                def _query(zk):
                    indices = [seq for (seq, _) in tasks]
                    replies = fire(zk, self.cluster, 'control/kill', subset=indices, timeout=self.timeout)
                    return [(code, seq) for seq, _, code in replies.values()]

                #
                # - fire the request to one or more pods
                # - wait for every pod to report back a HTTP 410 (GONE)
                # - this means the ochopod state-machine is now idling (e.g dead)
                #
                js = run(self.proxy, _query)
                gone = sum(1 for code, _ in js if code == 410)
                assert gone == len(js), 'at least one pod is still running'
                return

            _spin()

            #
            # - delete all the underlying tasks at once using POST v2/tasks/delete
            #
            js = \
                {
                    'ids': [task for (_, task) in tasks]
                }

            url = 'http://%s/v2/tasks/delete?scale=true' % master
            reply = post(url, data=json.dumps(js), headers=headers)
            code = reply.status_code
            logger.debug('-> %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'delete failed (HTTP %d)' % code

        self.out['ok'] = True

    except AssertionError as failure:
        logger.debug('%s : failed to scale -> %s' % (self.cluster, failure))
    except Exception as failure:
        logger.debug('%s : failed to scale -> %s' % (self.cluster, diagnostic(failure)))
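#
# - the factor parsing above implements a tiny sizing dsl : '@n' pins the cluster to exactly
#   n pods while 'xn' multiplies the current total by n, the result being clipped to at least 1
# - the rule re-stated as a standalone sketch, extracted here purely for illustration
#
def _target(factor, total):
    operator = factor[0]
    assert operator in ['@', 'x'], 'invalid operator'
    n = float(factor[1:])
    return max(1, int(n if operator == '@' else total * n))

#
# - e.g _target('@5', 8) -> 5, _target('x0.5', 8) -> 4, _target('x2', 3) -> 6
#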
def on(self, data):
    if data.forked and data.js and (self.strict or data.js['dependencies'] != self.last['dependencies']):

        #
        # - if we already have a process, we want to re-configure -> force a reset first
        # - this will go through a graceful termination process
        # - we'll come back here afterwards (with data.forked set to None)
        #
        raise Aborted('resetting to terminate pid %s first' % data.forked.pid)

    elif data.forked:

        #
        # - the process is already running, fail gracefully on a 200
        # - this is the code-path used for instance upon a leader request when strict is false
        #
        reply = {}, 200
        logger.debug('%s : skipping /control/on request' % self.path)
        data.latch.set(reply)

    else:

        #
        # - no more process running, go on with the configuration
        #
        try:
            if not self.initialized:

                #
                # - if this is the 1st time the pod is running invoke the initialize() callback
                # - this is typically used to run once-only stuff such as attaching storage volumes, etc.
                #
                logger.info('%s : initializing pod' % self.path)
                self.initialize()
                self.initialized = 1

            if data.js:

                #
                # - run the configuration procedure if we have some json
                # - we'll use whatever it returns to popen() a new process
                # - keep track of the shell command line returned by configure() for later
                #
                cluster = _Cluster(data.js)
                logger.info('%s : configuring pod %d/%d' % (self.path, 1 + cluster.index, cluster.size))
                data.command, data.env = self.configure(cluster)
                self.last = data.js

            assert data.command, 'request to start process while not yet configured (user error ?)'

            #
            # - spawn a new sub-process if the auto-start flag is on OR if we already ran at least once
            # - the start flag comes from the $ochopod_start environment variable
            #
            now = time.time()
            if not data.js or self.start or data.pids > 0:

                #
                # - combine our environment variables with the overrides from configure()
                # - popen() the new process
                # - reset the sanity check counter
                # - keep track of its pid to kill it later on
                #
                env = deepcopy(self.env)
                env.update(data.env)
                tokens = data.command if self.shell else data.command.split(' ')
                data.forked = Popen(tokens, cwd=self.cwd, env=env, shell=self.shell)
                data.checks = self.checks
                data.pids += 1
                self.hints['process'] = 'running'
                logger.info('%s : popen() #%s -> started <%s> as pid %s' % (self.path, data.pids, data.command, data.forked.pid))
                if data.env:
                    unrolled = '\n'.join(['\t%s -> %s' % (k, v) for k, v in data.env.items()])
                    logger.debug('%s : extra environment for pid %s ->\n%s' % (self.path, data.forked.pid, unrolled))

            reply = {}, 200
            data.next_sanity_check = now + self.check_every
            data.latch.set(reply)

        except Exception as failure:

            #
            # - any failure trapped during the configuration -> HTTP 406
            # - the pod will shutdown automatically as well
            #
            reply = {}, 406
            logger.warning('%s : failed to configure -> %s, shutting down' % (self.path, diagnostic(failure)))
            self._request(['kill'])
            data.latch.set(reply)

    self.commands.popleft()
    return 'spin', data, 0
def _from_curl(scripts):

    #
    # - retrieve the X-Signature header
    # - fast-fail on a HTTP 403 if not there or if there is a mismatch
    #
    if "X-Signature" not in request.headers:
        return "", 403

    #
    # - force a json output if the Accept header matches 'application/json'
    # - otherwise default to a text/plain response
    # - create a temporary directory to run from
    #
    ok = 0
    log = []
    alphabet = string.letters + string.digits
    token = "".join(alphabet[ord(c) % len(alphabet)] for c in os.urandom(8))
    raw = request.accept_mimetypes.best_match(["application/json"]) is None
    tmp = tempfile.mkdtemp()
    try:

        #
        # - any request header in the form X-Var-* will be kept around and passed as
        #   an environment variable when executing the script
        # - make sure the variable is spelled in uppercase
        #
        local = {key[6:].upper(): value for key, value in request.headers.items() if key.startswith("X-Var-")}

        #
        # - craft a unique callback URL that points to this pod
        # - this will be passed down to the script to enable transient testing jobs
        #
        cwd = path.join(tmp, "uploaded")
        local["CALLBACK"] = "http://%s/callback/%s" % (env["local"], token)
        blocked[token] = cwd
        for key, value in local.items():
            log += ["$%s = %s" % (key, value)]

        #
        # - download the archive
        # - compute the HMAC and compare (use our pod token as the key)
        # - fail on a 403 if mismatch
        #
        where = path.join(tmp, "bundle.tgz")
        request.files["tgz"].save(where)
        with open(where, "rb") as f:
            bytes = f.read()
            digest = "sha1=" + hmac.new(env["token"], bytes, hashlib.sha1).hexdigest()
            if digest != request.headers["X-Signature"]:
                return "", 403

        #
        # - extract it into its own folder
        # - make sure the requested script is there
        #
        code, _ = shell("mkdir uploaded && tar zxf bundle.tgz -C uploaded", cwd=tmp)
        assert code == 0, "unable to open the archive (bogus payload ?)"

        #
        # - decrypt any file whose extension is .aes
        # - just run openssl directly and dump the output in the working directory
        # - note: at this point we just look for .aes file in the top level directory
        #
        for file in os.listdir(cwd):
            bare, ext = path.splitext(file)
            if ext != ".aes":
                continue

            code, _ = shell("openssl enc -d -base64 -aes-256-cbc -k %s -in %s -out %s" % (env["token"], file, bare), cwd=cwd)
            if code == 0:
                log += ["decrypted %s" % file]

        #
        # - run each script in order
        # - abort immediately if the script exit code is not zero
        # - keep the script output as a json array
        #
        for script in scripts.split("+"):
            now = time.time()
            assert path.exists(path.join(cwd, script)), "unable to find %s (check your scripts)" % script
            code, lines = shell("python %s 2>&1" % script, cwd=cwd, env=local)
            log += lines + ["%s ran in %d seconds" % (script, int(time.time() - now))]
            assert code == 0, "%s failed on exit code %d" % (script, code)

        ok = 1

    except AssertionError as failure:
        log += ["failure (%s)" % failure]
    except Exception as failure:
        log += ["unexpected failure (%s)" % diagnostic(failure)]
    finally:

        #
        # - make sure to cleanup our temporary directory
        #
        del blocked[token]
        shutil.rmtree(tmp)

    if raw:

        #
        # - if 'application/json' was not requested simply dump the log as is
        # - force the response code to be HTTP 412 upon failure and HTTP 200 otherwise
        #
        code = 200 if ok else 412
        return "\n".join(log), code, {"Content-Type": "text/plain; charset=utf-8"}

    else:

        #
        # - if 'application/json' was requested always respond with a HTTP 200
        # - the response body then contains our serialized JSON output
        #
        js = {"ok": ok, "log": log}
        return json.dumps(js), 200, {"Content-Type": "application/json; charset=utf-8"}
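#
# - a hedged sketch of a matching client for the endpoint above (the route binding is not
#   shown in the snippet, so the '/run/<scripts>' url below is hypothetical)
# - the signature is a sha1 HMAC computed over the raw bundle.tgz bytes, keyed with the pod
#   token, which is exactly what the endpoint re-computes before extracting anything
#
import hashlib
import hmac
import requests

def submit(portal, token, bundle, scripts):

    #
    # - read the tgz archive and sign its bytes
    # - any X-Var-* header is surfaced to the scripts as an upper-cased environment variable
    #
    with open(bundle, "rb") as f:
        data = f.read()

    headers = \
        {
            "X-Signature": "sha1=" + hmac.new(token, data, hashlib.sha1).hexdigest(),
            "X-Var-Branch": "master"
        }

    return requests.post("http://%s/run/%s" % (portal, scripts), headers=headers, files={"tgz": data})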
def go():
    """
    Entry point for the slave tool-set. This script will look for python modules in the
    /commands sub-directory.
    """

    #
    # - start by simplifying a bit the console logger to look more CLI-ish
    #
    for handler in logger.handlers:
        handler.setFormatter(logging.Formatter('%(message)s'))

    try:

        def _import(where, funcs):
            try:
                for script in [f for f in listdir(where) if isfile(join(where, f)) and f.endswith('.py')]:
                    try:
                        module = imp.load_source(script[:-3], join(where, script))
                        if hasattr(module, 'go') and callable(module.go):
                            tool = module.go()
                            assert isinstance(tool, Template), '%s is not inheriting from Template' % script[:-3]
                            assert tool.tag, 'missing tool tag (check the %s module)' % script[:-3]
                            funcs[tool.tag] = tool

                    except Exception as failure:
                        logger.warning('failed to import %s (%s)' % (script, diagnostic(failure)))

            except OSError:
                pass

        #
        # - disable .pyc generation
        # - scan for tools to import
        # - each .py module must have a go() callable as well as a COMMAND attribute
        # - the COMMAND attribute tells us what the command-line invocation looks like
        #
        tools = {}
        sys.dont_write_bytecode = True
        _import('%s/commands' % dirname(__file__), tools)

        def _usage():
            return 'available commands -> %s' % ', '.join(sorted(tools.keys()))

        parser = ArgumentParser(description='', prefix_chars='+', usage=_usage())
        parser.add_argument('command', type=str, help='command (e.g ls for instance)')
        parser.add_argument('extra', metavar='extra arguments', type=str, nargs='*', help='zero or more arguments')
        args = parser.parse_args()
        total = [args.command] + args.extra
        if args.command == 'help':
            logger.info(_usage())
            exit(0)

        def _sub(sub):
            for i in range(len(total) - len(sub) + 1):
                if sub == total[i:i + len(sub)]:
                    return 1

            return 0

        matched = [tool for tool in tools.keys() if _sub(tool.split(' '))]
        if not matched:
            logger.info('unknown command (%s)' % _usage())

        elif len(matched) > 1:
            logger.info('more than one command matched (%s)' % _usage())

        else:

            #
            # - simply invoke the tool
            # - remove the command tokens first and pass the rest as arguments
            # - each tool will parse its own commandline
            #
            picked = matched[0]
            tokens = len(picked.split(' ')) - 1
            exit(tools[picked].run(args.extra[tokens:]))

    except AssertionError as failure:
        logger.error('shutting down <- %s' % failure)
    except Exception as failure:
        logger.error('shutting down <- %s' % diagnostic(failure))

    exit(1)
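#
# - the importer above defines a small plugin protocol : each .py module dropped under
#   /commands must expose a go() callable returning a Template subclass instance whose
#   'tag' attribute is the command token matched against the command line
# - a minimal, hypothetical command module written against those assumptions (the Template
#   base class and the logger come from the surrounding tool-set, the 'ping' tool itself is
#   made up for illustration)
#
def go():

    class _Tool(Template):

        #
        # - 'tag' is what the sub-command matching in go() compares against
        #
        tag = 'ping'

        def run(self, args):

            #
            # - each tool parses its own command line and returns a process exit code
            #
            logger.info('pong (%d extra arguments)' % len(args))
            return 0

    return _Tool()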
def run(self):
    try:

        #
        # - we need to pass the framework master IPs around (ugly)
        #
        assert 'MARATHON_MASTER' in os.environ, '$MARATHON_MASTER not specified (check your portal pod)'
        master = choice(os.environ['MARATHON_MASTER'].split(','))
        headers = \
            {
                'content-type': 'application/json',
                'accept': 'application/json'
            }

        with open(self.template, 'r') as f:

            #
            # - parse the template yaml file (e.g container definition)
            #
            raw = yaml.load(f)
            assert raw, 'empty YAML input (user error ?)'

            #
            # - merge with our defaults
            # - we want at least the cluster & image settings
            # - TCP 8080 is added by default to the port list
            #
            defaults = \
                {
                    'start': True,
                    'debug': False,
                    'settings': {},
                    'ports': [8080],
                    'verbatim': {}
                }

            cfg = merge(defaults, raw)
            assert 'cluster' in cfg, 'cluster identifier undefined (user error ?)'
            assert 'image' in cfg, 'docker image undefined (user error ?)'

            #
            # - if a suffix is specified append it to the cluster identifier
            #
            if self.suffix:
                cfg['cluster'] = '%s-%s' % (cfg['cluster'], self.suffix)

            #
            # - timestamp the application (we really want a new uniquely identified application)
            # - lookup the optional overrides and merge with our pod settings if specified
            # - this is what happens when the -o option is used
            #
            stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
            qualified = '%s.%s' % (self.namespace, cfg['cluster'])
            application = 'ochopod.%s-%s' % (qualified, stamp)
            if qualified in self.overrides:
                blk = self.overrides[qualified]
                logger.debug('%s : overriding %d settings (%s)' % (self.template, len(blk), qualified))
                cfg['settings'] = merge(cfg['settings'], blk)

            def _nullcheck(cfg, prefix):

                #
                # - walk through the settings and flag any null value
                #
                missing = []
                if cfg is not None:
                    for key, value in cfg.items():
                        if value is None:
                            missing += ['%s.%s' % ('.'.join(prefix), key)]
                        elif isinstance(value, dict):
                            missing += _nullcheck(value, prefix + [key])

                return missing

            missing = _nullcheck(cfg['settings'], ['pod'])
            assert not missing, '%d setting(s) missing ->\n\t - %s' % (len(missing), '\n\t - '.join(missing))

            #
            # - if we still have no target default it to 1 single pod
            #
            if not self.pods:
                self.pods = 1

            #
            # - setup our port list
            # - the port binding is specified either by an integer (container port -> dynamic mesos port), by
            #   two integers (container port -> host port) or by an integer followed by a * (container port ->
            #   same port on the host)
            # - the marathon pods must by design map /etc/mesos
            #
            def _parse_port(token):
                if isinstance(token, int):
                    return {'containerPort': token}
                elif isinstance(token, str) and token.endswith(' *'):
                    port = int(token[:-2])
                    return {'containerPort': port, 'hostPort': port}
                elif isinstance(token, str):
                    ports = token.split(' ')
                    assert len(ports) == 2, 'invalid port syntax (must be two integers separated by 1+ spaces)'
                    return {'containerPort': int(ports[0]), 'hostPort': int(ports[1])}
                else:
                    assert 0, 'invalid port syntax ("%s")' % token

            #
            # - note the marathon-ec2 ochopod bindings will set the application hint automatically
            #   via environment variable (e.g no need to specify it here)
            # - make sure to mount /etc/mesos and /opt/mesosphere to account for various mesos installs
            #
            ports = [_parse_port(token) for token in cfg['ports']] if 'ports' in cfg else []
            spec = \
                {
                    'id': application,
                    'instances': self.pods,
                    'env':
                        {
                            'ochopod_cluster': cfg['cluster'],
                            'ochopod_debug': str(cfg['debug']).lower(),
                            'ochopod_start': str(cfg['start']).lower(),
                            'ochopod_namespace': self.namespace,
                            'pod': json.dumps(cfg['settings'])
                        },
                    'container':
                        {
                            'type': 'DOCKER',
                            'docker':
                                {
                                    'forcePullImage': True,
                                    'image': cfg['image'],
                                    'network': 'BRIDGE',
                                    'portMappings': ports
                                },
                            'volumes':
                                [
                                    {
                                        'containerPath': '/etc/mesos',
                                        'hostPath': '/etc/mesos',
                                        'mode': 'RO'
                                    },
                                    {
                                        'containerPath': '/opt/mesosphere',
                                        'hostPath': '/opt/mesosphere',
                                        'mode': 'RO'
                                    }
                                ]
                        }
                }

            #
            # - if we have a 'verbatim' block in our image definition yaml, merge it now
            #
            if 'verbatim' in cfg:
                spec = merge(cfg['verbatim'], spec)

            #
            # - pick a marathon master at random
            # - fire the POST /v2/apps to create our application
            # - this will indirectly spawn our pods
            #
            url = 'http://%s/v2/apps' % master
            reply = post(url, data=json.dumps(spec), headers=headers)
            code = reply.status_code
            logger.debug('-> %s (HTTP %d)' % (url, code))
            assert code == 200 or code == 201, 'submission failed (HTTP %d)' % code

            #
            # - wait for all the pods to be in the 'running' mode
            # - the 'application' hint is set by design to the marathon application identifier
            # - the sequence counters allocated to our new pods are returned as well
            #
            target = ['dead', 'running'] if self.strict else ['dead', 'stopped', 'running']

            @retry(timeout=self.timeout, pause=3, default={})
            def _spin():
                def _query(zk):
                    replies = fire(zk, qualified, 'info')
                    return [(hints['process'], seq) for seq, hints, _ in replies.values()
                            if hints['application'] == application and hints['process'] in target]

                js = run(self.proxy, _query)
                assert len(js) == self.pods, 'not all pods running yet'
                return js

            js = _spin()
            running = sum(1 for state, _ in js if state != 'dead')
            up = [seq for _, seq in js]
            self.out['up'] = up
            self.out['ok'] = self.pods == running
            logger.debug('%s : %d/%d pods are running ' % (self.template, running, self.pods))

            if not up:

                #
                # - nothing is running (typically because the image has an issue and is not
                #   booting the ochopod script for instance, which happens often)
                # - in that case fire a HTTP DELETE against the marathon application to clean it up
                #
                url = 'http://%s/v2/apps/%s' % (master, application)
                reply = delete(url, headers=headers)
                code = reply.status_code
                logger.debug('-> %s (HTTP %d)' % (url, code))
                assert code == 200 or code == 204, 'application deletion failed (HTTP %d)' % code

    except AssertionError as failure:
        logger.debug('%s : failed to deploy -> %s' % (self.template, failure))
    except YAMLError as failure:
        if hasattr(failure, 'problem_mark'):
            mark = failure.problem_mark
            logger.debug('%s : invalid deploy.yml (line %s, column %s)' % (self.template, mark.line + 1, mark.column + 1))
    except Exception as failure:
        logger.debug('%s : failed to deploy -> %s' % (self.template, diagnostic(failure)))
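#
# - for reference, the three port syntaxes accepted by _parse_port() above and the
#   marathon mapping each one produces
#
ports = \
    [
        9000,          # container 9000 -> dynamic mesos port ({'containerPort': 9000})
        '9000 *',      # container 9000 -> same port on the host ({'containerPort': 9000, 'hostPort': 9000})
        '9000 8080'    # container 9000 -> host 8080 ({'containerPort': 9000, 'hostPort': 8080})
    ]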
except AssertionError as failure:
    log += ['* %s' % str(failure)]
except IOError:
    log += ['* unable to load integration.yml (missing from the repo ?)']
except YAMLError as failure:
    log += ['* invalid YAML syntax']
except Exception as failure:
    log += ['* unexpected condition -> %s' % diagnostic(failure)]
finally:

    #
    # - make sure to cleanup our temporary directory
    # - update redis with
    #
    if not complete:
        logger.error('build interrupted (%s)' % log[-1])

    seconds = int(time.time() - started)
    status = \
        {
            'ok': ok and complete,
            'sha': sha,
def spin(self, data):
    if self.terminate:
        if not data.forked:

            #
            # - kill the actor (which will release the latch and unlock the main loop)
            #
            self.exitcode()

        else:

            #
            # - this will force a reset and make sure we kill the process
            # - we'll loop back to spin() in any case and exitcode() this time
            #
            raise Aborted('terminating')

    if self.commands:

        #
        # - we have at least one request pending
        # - pop the next command and run it (e.g switch the state-machine to it)
        #
        req, js, latch = self.commands[0]
        data.js = js
        data.latch = latch
        return req, data, 0

    if data.forked:

        #
        # - no request to run
        # - check if the process is still running and run the user-defined sanity check once in a while
        #
        now = time.time()
        if data.forked.poll() is not None:
            code = data.forked.returncode
            if not code:

                #
                # - a successful exit code (0) will automatically force a shutdown
                # - this is a convenient way for pods to go down automatically once their task is done
                #
                logger.error('%s : pid %s exited, shutting down' % (self.path, data.forked.pid))
                self._request(['kill'])

            else:

                #
                # - the process died on a non zero exit code
                # - restart it gracefully
                #
                logger.error('%s : pid %s died (code %d), re-running' % (self.path, data.forked.pid, code))
                self._request(['off', 'on'])

        elif now >= data.next_sanity_check:
            try:

                #
                # - run the sanity check and schedule the next one
                # - reset it each time
                #
                data.next_sanity_check = now + self.check_every
                self.sanity_check(data.forked.pid)
                data.checks = self.checks

            except Exception as failure:

                #
                # - any failure trapped during the sanity check will decrement our counter
                # - eventually the process is stopped (up to the user to decide what to do)
                #
                data.checks -= 1
                if not data.checks:
                    self._request(['off'])

                logger.warning('%s : sanity check (%d/%d) failed -> %s' % (self.path, self.checks - data.checks, self.checks, diagnostic(failure)))

    return 'spin', data, SAMPLING