def check(self, chain_state, step):
    """Poll the htcondor job backing *step* and update the step's state.

    Returns True when the job is done, False otherwise.  Raises StepFailed
    when htcondor reports the job failed.  As a side effect records
    completion time via self._done() and, on success, the step runtime.

    Parameters
    ----------
    chain_state : unused here; kept for interface compatibility with other
        checkers (presumably called polymorphically — TODO confirm).
    step : step object carrying an ``htcondor`` dict with at least
        "cluster" and "log" keys, plus ``start`` set at submission time.
    """
    from acmacs_base import htcondor
    now = datetime.datetime.now()
    job = htcondor.Job(step.htcondor["cluster"], step.htcondor["log"])
    state = job.state()
    if state["FAILED"]:
        step.FAILED = True
        module_logger.error(
            f"""{step.step_id()} htcondor jobs {step.htcondor["cluster"]} FAILED""")
        self._done(step, now)
        raise StepFailed()
    elif state["DONE"]:
        module_logger.info(
            f"""{step.step_id()} htcondor jobs {step.htcondor["cluster"]} done""")
        self._done(step, now)
        # _done is expected to set step.finish — TODO confirm against _done()
        step.runtime = str(step.finish - step.start)
        return True
    # BUG FIX: the original used timedelta.seconds, which is only the
    # seconds *component* (wraps every 24 h), so progress reporting would
    # silently stop for steps idle longer than a day.  total_seconds() is
    # the elapsed wall-clock time.
    elif (now - step.htcondor.get("check_reported", step.start)).total_seconds() > 300:
        # Report progress at most once every 5 minutes.
        step.htcondor["check_reported"] = now
        module_logger.info(f"""{step.step_id()} {state["PERCENT"]}% """)
    # Explicit False instead of the original implicit None — both falsy,
    # so callers testing truthiness are unaffected.
    return False
def wait(self, state):
    """Block until the raxml-ng htcondor job finishes, then record timing.

    Waits on the cluster referenced by ``state["raxmlng"]`` using the
    configured timeout; when the job completes, stores the elapsed time in
    ``state["raxmlng"]["overall_time"]`` and logs it.
    """
    raxmlng = state["raxmlng"]
    condor_job = htcondor.Job(clusters=raxmlng["cluster"],
                              condor_log=raxmlng["condor_log"])
    # NOTE(review): "wait_timout" looks like a typo'd config key — it is a
    # runtime key existing configs rely on, so it must not be renamed here;
    # confirm against the config schema before fixing.
    outcome = condor_job.wait(timeout=self.config["wait_timout"])
    if outcome != "done":
        return
    elapsed = time_m.time() - raxmlng["started"]
    raxmlng["overall_time"] = elapsed
    module_logger.info(
        "RaXML jobs completed in " + maker_base.Result.time_str(elapsed))
def analyse_logs(self, state):
    """Compare running RAxML tasks against the best already-completed one and
    kill the configured fraction of clearly-losing tasks.

    Reads and mutates ``state["raxml"]`` in place: killed run ids are removed
    from ``run_ids`` and ``survived_tasks`` is decremented accordingly.
    """

    def load_log_file(filepath):
        # Parse a RAxML log into a list of {"t": time, "s": score, "f": run
        # suffix} dicts.  Scores are negated ("s": -float(e[1])) so that a
        # SMALLER "s" is a BETTER result throughout this function.
        # Retries up to 10 times because the file may be mid-write.
        for attempt in range(10):
            try:
                r = [{
                    "t": float(e[0]),
                    "s": -float(e[1]),
                    # run identifier taken from the filename's last
                    # dot-separated component
                    "f": str(filepath).split(".")[-1]
                } for e in (line.strip().split() for line in filepath.open())]
                if not r:  # pending
                    # Empty log: task has not produced output yet; placeholder
                    # entry (note: deliberately has no "f" key).
                    r = [{"t": 0, "s": 0}]
                return r
            except ValueError as err:
                pass  # file is being written at the moment, try again later
                module_logger.info(
                    '(ignored) cannot process {}: {}'.format(
                        filepath.name, err))
            time_m.sleep(3)
        raise RuntimeError("Cannot process {}".format(filepath))

    def time_score_from_log(files):
        # Final (last-line) entry of each log, then the one with the lowest
        # "s", i.e. the best final score among *files*.
        return min((load_log_file(filepath)[-1] for filepath in files),
                   key=operator.itemgetter("s"))

    # A run is "completed" when RAxML wrote its best-tree file.
    completed = [
        run_id for run_id in state["raxml"]["run_ids"]
        if Path(state["raxml"]["output_dir"],
                "RAxML_bestTree." + run_id).exists()
    ]
    if completed:
        # Best final {"t", "s", "f"} among all completed runs.
        best_completed = time_score_from_log(
            Path(state["raxml"]["output_dir"], "RAxML_log." + run_id)
            for run_id in completed)
        # module_logger.info('completed: {} best: {}'.format(len(completed), best_completed))
        # Logs of still-running (not completed) tasks that exist on disk.
        running_logs = [
            f for f in (Path(state["raxml"]["output_dir"], "RAxML_log."
                             + run_id)
                        for run_id in state["raxml"]["run_ids"]
                        if run_id not in completed) if f.exists()
        ]
        # Keyed by the numeric run suffix — assumes run ids end in an
        # integer after the last "." (TODO confirm id format at submission).
        data = {
            int(str(f).split(".")[-1]): load_log_file(f)
            for f in running_logs
        }
        # Tasks that have already run LONGER than the best completed run yet
        # still have a WORSE (higher) score — candidates for killing.
        scores_for_longer_worse_than_best_completed = {
            k: v[-1]["s"]
            for k, v in data.items() if v[-1]["t"] > best_completed["t"]
            and v[-1]["s"] > best_completed["s"]
        }
        # Candidate task keys sorted best-to-worst (ascending score).
        by_score = sorted(
            scores_for_longer_worse_than_best_completed,
            key=lambda e: scores_for_longer_worse_than_best_completed[e])
        # Kill the configured fraction of candidates, taking the worst ones
        # (the tail of the ascending sort).
        n_to_kill = int(len(by_score) * self.config["raxml_kill_rate"])
        if n_to_kill > 0:
            to_kill = by_score[-n_to_kill:]
            module_logger.info(
                'completed: {} best: {} worse_than_best_completed: {} to kill: {}'
                .format(len(completed), best_completed, by_score, to_kill))
            job = htcondor.Job(clusters=state["raxml"]["cluster"],
                               condor_log=state["raxml"]["condor_log"])
            job.kill_tasks(to_kill)
            # Remove the killed tasks' run ids from the shared state.
            run_id_to_del = [
                ri for ri in state["raxml"]["run_ids"]
                if int(ri.split(".")[-1]) in to_kill
            ]
            # module_logger.info('run_id_to_del {}'.format(run_id_to_del))
            for ri in run_id_to_del:
                state["raxml"]["run_ids"].remove(ri)
            state["raxml"]["survived_tasks"] -= len(run_id_to_del)