def cleanup(self):
    """Destroy zombie worker instances and note it in their worker logs.

    A worker read from ``logalyzer.WorkersLog`` is treated as a zombie when
    it has an ``instanceid`` but a falsy ``instancetime``
    (NOTE(review): presumably an instance that was launched but never
    recorded as destroyed -- confirm against logalyzer).

    All zombie instance IDs are passed to ``Hub.destroy`` through a
    ``Retrier``.  For every zombie the Hub reports as destroyed, a
    ``[watchdog]`` line is appended to ``<path_workers>/<worker_id>``.

    Returns None.  Does nothing when there are no zombies.
    """
    workers_log = logalyzer.WorkersLog(self.path_workers,
                                       self.taskconf.command)
    zombie_workers = [
        worker for worker in workers_log.workers
        if worker.instanceid and not worker.instancetime
    ]
    if not zombie_workers:
        return

    zombie_instances = [worker.instanceid for worker in zombie_workers]
    self.log("destroying zombie instances: " +
             " ".join(sorted(zombie_instances)))

    hub = Hub(self.taskconf.hub_apikey)
    retrier = Retrier(self.DESTROY_ERROR_TIMEOUT,
                      self.DESTROY_ERROR_SLEEP,
                      self.logfh)

    # the destroy call yields (ipaddress, instanceid) pairs; only the
    # instance IDs are needed here
    destroyed_instances = [
        instanceid
        for ipaddress, instanceid in retrier(hub.destroy, *zombie_instances)
    ]
    self.log("destroyed zombie instances: " +
             " ".join(sorted(destroyed_instances)))

    # build the lookup set once instead of scanning the list per worker
    destroyed_lookup = set(destroyed_instances)

    # log destruction to the respective worker logs
    for zombie_worker in zombie_workers:
        if zombie_worker.instanceid not in destroyed_lookup:
            continue

        log_path = "%s/%d" % (self.path_workers, zombie_worker.worker_id)

        # 'with' guarantees the log is closed even if the write fails
        with open(log_path, "a") as worker_log:
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            worker_log.write("\n# %s [watchdog] destroyed worker %s\n" % (
                timestamp, zombie_worker.instanceid))
def thread():
    """Launch ``new_workers`` instances via the Hub and queue each one.

    Every instance produced by ``hub.launch`` is put on ``launchq``.  The
    launch is given a callback that returns False once ``self.event_stop``
    is set.

    If anything raises, one ``None`` is queued for each instance that was
    not produced, so the queue still receives ``new_workers`` items in
    total.  The traceback is written to ``session_logs.manager`` unless
    the exception is a ``hub.Stopped``
    (NOTE(review): presumably raised when the callback returns False --
    confirm in Hub).
    """
    def keep_launching():
        return not self.event_stop.is_set()

    hub = Hub(taskconf.hub_apikey)

    # number of instances the Hub has handed back so far
    launched = 0
    try:
        instances = hub.launch(new_workers,
                               VerboseLog(session_logs.manager),
                               keep_launching,
                               **taskconf.ec2_opts)
        for instance in instances:
            # count first: an instance is accounted for as soon as it has
            # been produced, even if queueing it were to fail
            launched += 1
            launchq.put(instance)

    except Exception as e:
        # pad the queue with a None for every instance that never arrived
        for _ in range(new_workers - launched):
            launchq.put(None)

        if isinstance(e, hub.Stopped):
            return

        traceback.print_exc(file=session_logs.manager)
def __init__(self, session_logs, taskconf, sshkey, ipaddress=None, destroy=None, event_stop=None, launchq=None):
    """Set up a worker: use an existing address or launch one, then open SSH.

    Arguments:
        session_logs: object whose ``manager`` and ``worker`` attributes
            are used as log destinations.
        taskconf: task configuration; ``strikes``, ``timeout``, ``post``,
            ``user``, ``hub_apikey`` and ``ec2_opts`` are read from it.
        sshkey: object whose ``path`` is passed to SSH as the identity file.
        ipaddress: address of an existing worker.  When falsy, a new
            instance is obtained through the Hub.
        destroy: stored as ``self.destroy``.  When None it defaults to
            True for launched workers and False for existing ones.
        event_stop: optional event.  When given, SIGINT is ignored in this
            process and a set event aborts construction after launch.
        launchq: optional queue of pre-launched instances.  When given,
            the instance is taken from it instead of launching directly.

    Raises:
        self.Error: no ``ipaddress`` and no Hub API key, or the worker is
            unreachable via SSH.
        self.Terminated: no instance was obtained, or ``event_stop`` is set.
    """
    self.pid = os.getpid()

    if event_stop:
        signal.signal(signal.SIGINT, signal.SIG_IGN)

    self.event_stop = event_stop
    self.logs = session_logs
    self.sshkey = sshkey

    self.strikes = taskconf.strikes
    self.strike = 0
    self.timeout = taskconf.timeout
    self.cleanup_command = taskconf.post
    self.user = taskconf.user

    self.ipaddress = ipaddress
    self.instanceid = None
    self.hub = None
    self.ssh = None

    # unless told otherwise: destroy what we launch, keep what we were given
    self.destroy = (not ipaddress) if destroy is None else destroy

    if ipaddress:
        self.status("using existing worker")
    else:
        if not taskconf.hub_apikey:
            raise self.Error(
                "can't auto launch a worker without a Hub API KEY")

        self.hub = Hub(taskconf.hub_apikey)

        if launchq:
            # instance is launched elsewhere; wait for it on the queue
            # with SIGINT/SIGTERM ignored
            with sighandle.sigignore(signal.SIGINT, signal.SIGTERM):
                instance = launchq.get()
        else:
            # single-item list used as a mutable cell the signal handler
            # can write to
            stop_requested = [False]

            def on_signal(signum, frame):
                stop_requested[0] = True

            def keep_launching():
                return not stop_requested[0]

            with sighandle.sighandle(on_signal, signal.SIGINT, signal.SIGTERM):
                launched = self.hub.launch(1,
                                           VerboseLog(session_logs.manager),
                                           keep_launching,
                                           **taskconf.ec2_opts)
                instance = list(launched)[0]

        if not instance:
            raise self.Terminated
        if event_stop and event_stop.is_set():
            raise self.Terminated

        self.ipaddress, self.instanceid = instance
        self.status("launched worker %s" % self.instanceid)

    self.handle_stop = self._stop_handler(event_stop)

    try:
        self.ssh = SSH(self.ipaddress,
                       identity_file=self.sshkey.path,
                       login_name=taskconf.user,
                       callback=self.handle_stop)
    except SSH.Error as e:
        self.status("unreachable via ssh: " + str(e))
        traceback.print_exc(file=self.logs.worker)
        raise self.Error(e)