def __exit__(self, _type, _value, _traceback):
    """Tear down the cluster on context-manager exit.

    Stops background threads (passing along the in-flight exception, if
    any), stops all live and dead workers, joins the runner threads, and
    then verifies that every worker exited cleanly.

    Raises:
        ClusterError: if any runner is still alive after
            ``worker_join_timeout`` seconds, or if any dead worker has a
            return code other than 0, -9 (SIGKILL) or -15 (SIGTERM).
    """
    # Level-1 trace log: records every __exit__ call and its exc info.
    logging.log(1, "__exit__({}, {}, {})".format(_type, _value, _traceback))
    # Guard against double teardown if __exit__ is entered twice.
    if self._exited:
        return
    # clean up any remaining runner processes
    if _type or _value or _traceback:
        logging.error('An error was raised in the Cluster context',
                      exc_info=(_type, _value, _traceback))
        # Pass the raised value so background threads can record it.
        self.stop_background_threads(_value)
    else:
        self.stop_background_threads()
    try:
        for w in self.workers:
            w.stop()
        for w in self.dead_workers:
            w.stop()
        # Wait on runners to finish waiting on their subprocesses to exit
        for w in self.runners:
            # Check thread ident to avoid error when joining an un-started
            # thread.
            if w.ident:  # ident is set when a thread is started
                w.join(self.worker_join_timeout)
        # Partition workers: anything still alive after the join timeout
        # failed to exit cleanly; everything else moves to dead_workers.
        alive = []
        while self.workers:
            w = self.workers.pop()
            if w.is_alive():
                alive.append(w)
            else:
                self.dead_workers.append(w)
        if alive:
            alive_names = ', '.join((w.name for w in alive))
            raise ClusterError("Runners [{}] failed to exit cleanly after"
                               " {} seconds.".format(
                                   alive_names,
                                   self.worker_join_timeout))
        # check for workers that exited with a non-0 return code
        # note that workers killed in the previous step have code -15
        bad_exit = []
        for w in self.dead_workers:
            c = w.returncode()
            if c not in (0, -9, -15):  # -9: SIGKILL, -15: SIGTERM
                bad_exit.append(w)
        if bad_exit:
            raise ClusterError("The following workers terminated with "
                               "a bad exit code: {}".format(
                                   ["(name: {}, pid: {}, code:{})".format(
                                       w.name, w.pid, w.returncode())
                                    for w in bad_exit]))
    finally:
        # __finally__ runs regardless of errors above; accumulated errors
        # are logged (not raised) so teardown always completes.
        self.__finally__()
        if self.errors:
            logging.error("Errors were encountered when running"
                          " the cluster")
            for e in self.errors:
                logging.exception(e)
        self._exited = True
def run(self):
    """Poll the cluster for crashed workers until stopped.

    Every 0.5 seconds asks the cluster for crashed workers (optionally
    filtered through ``self.func``). On the first crash detected, hands a
    ClusterError describing the crashed workers to the cluster via
    ``raise_from_error`` and stops this checker thread.
    """
    while not self.stopped():
        # Apply the optional filter function only when one was provided.
        filter_args = (self.func,) if self.func else ()
        crashed = self.cluster.get_crashed_workers(*filter_args)
        if not crashed:
            time.sleep(0.5)
            continue
        logging.debug("CrashChecker, results: {}".format(crashed))
        details = ",".join(
            ("{} ({}): {}").format(w.name, w.pid, w.returncode())
            for w in crashed)
        err = ClusterError(
            "A crash was detected in the workers: {}".format(details))
        self.cluster.raise_from_error(err)
        self.stop()
        break
def add_runner(worker_id, runners, command, source_addrs, sink_addrs,
               metrics_addr, control_addr, res_dir, workers,
               my_control_addr, my_data_addr, my_external_addr,
               alt_block=None, alt_func=lambda x: False, spikes={}):
    """Start a new worker process that joins an already-running cluster.

    Builds the worker command line from the module templates, creates a
    Runner, appends it to ``runners``, starts it, and verifies it survived
    its first 50ms.

    Args:
        worker_id: integer id of the joining worker (names it
            ``worker<id>`` and selects its spike config).
        runners: list of existing Runner objects; the new Runner is
            appended to it (mutated in place).
        command / source_addrs / sink_addrs / metrics_addr / res_dir:
            pieces of the base command template.
        control_addr: control address of the cluster being joined.
        workers: optional worker count passed to the join block.
        my_control_addr / my_data_addr / my_external_addr: addresses for
            the new worker itself.
        alt_block / alt_func: optional extra CLI block, included when
            ``alt_func(worker_id)`` is truthy.
        spikes: mapping of worker_id -> spike config (read-only here).

    Returns:
        The started Runner.

    Raises:
        ClusterError: if there is no (live) worker to join.
        CrashedWorkerError: if the new runner dies within 50ms.
    """
    cmd_stub = BASE_COMMAND.format(
        command=command,
        in_block=(IN_BLOCK.format(inputs=','.join(source_addrs))
                  if source_addrs else ''),
        out_block=(OUT_BLOCK.format(outputs=','.join(sink_addrs))
                   if sink_addrs else ''),
        metrics_addr=metrics_addr,
        res_dir=res_dir)

    # Test that the new worker *can* join
    if len(runners) < 1:
        raise ClusterError("There must be at least 1 worker to join!")
    if not any(r.is_alive() for r in runners):
        raise ClusterError("There must be at least 1 live worker to "
                           "join!")

    if worker_id in spikes:
        # BUG FIX: the original logged `x`, an undefined name here
        # (NameError whenever a spike was configured for a joiner).
        logging.info("Enabling spike for joining worker{}".format(worker_id))
        sc = spikes[worker_id]
        # BUG FIX / consistency: the spike templates are filled with
        # keyword arguments, matching how start_runners uses the same
        # SPIKE_PROB/SPIKE_MARGIN/SPIKE_SEED templates (the original
        # passed them positionally, which fails for named placeholders).
        spike_block = SPIKE_CMD.format(
            prob=SPIKE_PROB.format(prob=sc.probability),
            margin=SPIKE_MARGIN.format(margin=sc.margin),
            seed=SPIKE_SEED.format(seed=sc.seed) if sc.seed else '')
    else:
        spike_block = ''

    cmd = cmd_stub.format(
        name='worker{}'.format(worker_id),
        initializer_block='',
        worker_block=WORKER_CMD.format(control_addr=my_control_addr,
                                       data_addr=my_data_addr,
                                       external_addr=my_external_addr),
        join_block=JOIN_CMD.format(
            join_addr=control_addr,
            worker_count=(WORKER_COUNT_CMD.format(worker_count=workers)
                          if workers else '')),
        alt_block=alt_block if alt_func(worker_id) else '',
        spike_block=spike_block)

    runner = Runner(command=cmd,
                    name='worker{}'.format(worker_id),
                    control=my_control_addr,
                    data=my_data_addr,
                    external=my_external_addr)
    runners.append(runner)

    # start the new worker
    runner.start()
    time.sleep(0.05)

    # check the runner hasn't exited with any errors.
    # Explicit checks instead of `assert` (asserts are stripped under -O).
    if not runner.is_alive():
        raise CrashedWorkerError
    if runner.error is not None:
        # Surface the runner's own recorded error rather than a bare
        # AssertionError.
        raise runner.error
    return runner
def _spike_block_for(worker_idx, spikes):
    """Build the spike CLI fragment for worker_idx, or '' if none is configured."""
    if worker_idx not in spikes:
        return ''
    if worker_idx == 0:
        logging.info("Enabling spike for initializer")
    else:
        logging.info("Enabling spike for worker{}".format(worker_idx))
    sc = spikes[worker_idx]
    return SPIKE_CMD.format(
        prob=SPIKE_PROB.format(prob=sc.probability),
        margin=SPIKE_MARGIN.format(margin=sc.margin),
        seed=SPIKE_SEED.format(seed=sc.seed) if sc.seed else '')


def start_runners(runners, command, source_addrs, sink_addrs, metrics_addr,
                  res_dir, workers, worker_addrs=[], alt_block=None,
                  alt_func=lambda x: False, spikes={}):
    """Create and start a full cluster: one initializer plus workers-1 workers.

    Builds a Runner per worker from the module command templates, appends
    them all to ``runners`` (mutated in place), starts them 50ms apart,
    then verifies each one is alive and error-free.

    Args:
        runners: list the new Runner objects are appended to.
        command / source_addrs / sink_addrs / metrics_addr / res_dir:
            pieces of the base command template.
        workers: total number of workers; must be >= 1. Worker 0 is the
            initializer.
        worker_addrs: per-worker (control, data, external) address triples;
            indexed 0..workers-1 (read-only here).
        alt_block / alt_func: optional extra CLI block, included for
            worker x when ``alt_func(x)`` is truthy.
        spikes: mapping of worker index -> spike config (read-only here).

    Raises:
        ClusterError: if workers < 1, or if any runner exits or records an
            error during startup.
        RunnerHasntStartedError: propagated from ``is_alive`` checks.
    """
    cmd_stub = BASE_COMMAND.format(
        command=command,
        in_block=(IN_BLOCK.format(inputs=','.join(source_addrs))
                  if source_addrs else ''),
        out_block=(OUT_BLOCK.format(outputs=','.join(sink_addrs))
                   if sink_addrs else ''),
        metrics_addr=metrics_addr,
        res_dir=res_dir)

    # for each worker, assign `name` and `cluster-initializer` values
    if workers < 1:
        raise ClusterError("workers must be 1 or more")

    # Worker 0 is the cluster initializer.
    cmd = cmd_stub.format(
        name='initializer',
        initializer_block=INITIALIZER_CMD.format(
            worker_count=WORKER_COUNT_CMD.format(worker_count=workers),
            data_addr=worker_addrs[0][1],
            external_addr=worker_addrs[0][2]),
        worker_block='',
        join_block=CONTROL_CMD.format(control_addr=worker_addrs[0][0]),
        alt_block=alt_block if alt_func(0) else '',
        spike_block=_spike_block_for(0, spikes))
    runners.append(
        Runner(command=cmd,
               name='initializer',
               control=worker_addrs[0][0],
               data=worker_addrs[0][1],
               external=worker_addrs[0][2]))

    # Remaining workers join via the initializer's control address.
    for x in range(1, workers):
        cmd = cmd_stub.format(
            name='worker{}'.format(x),
            initializer_block='',
            worker_block=WORKER_CMD.format(control_addr=worker_addrs[x][0],
                                           data_addr=worker_addrs[x][1],
                                           external_addr=worker_addrs[x][2]),
            join_block=CONTROL_CMD.format(control_addr=worker_addrs[0][0]),
            alt_block=alt_block if alt_func(x) else '',
            spike_block=_spike_block_for(x, spikes))
        runners.append(
            Runner(command=cmd,
                   name='worker{}'.format(x),
                   control=worker_addrs[x][0],
                   data=worker_addrs[x][1],
                   external=worker_addrs[x][2]))

    # start the workers, 50ms apart
    for r in runners:
        r.start()
        time.sleep(0.05)

    # check the runners haven't exited with any errors.
    # Explicit checks instead of `assert` (asserts are stripped under -O).
    # NOTE(review): is_alive() may raise RunnerHasntStartedError, which
    # the original deliberately re-raised unchanged; it still propagates.
    for idx, r in enumerate(runners):
        if not r.is_alive():
            stdout = r.get_output()
            raise ClusterError("Runner %d of %d has exited with an error: "
                               "\n---\n%s" % (idx + 1, len(runners), stdout))
        if r.error is not None:
            raise ClusterError("Runner %d of %d has exited with an error: "
                               "\n---\n%s" % (idx + 1, len(runners), r.error))