Example #1
0
 def check(self, chain_state, step):
     """Poll the HTCondor job attached to ``step`` and report completion.

     Reads the cluster id and log path from ``step.htcondor`` and queries
     the job state through ``acmacs_base.htcondor.Job``.

     Args:
         chain_state: not used by this method.
         step: step object carrying ``htcondor`` (a dict with "cluster" and
             "log"), ``start``, and ``step_id()``.

     Returns:
         True when the job state reports "DONE", False while it is still
         running.

     Raises:
         StepFailed: when the job state reports "FAILED"; ``step.FAILED`` is
             set and ``self._done(step, now)`` is called before raising.
     """
     from acmacs_base import htcondor
     now = datetime.datetime.now()
     cluster = step.htcondor["cluster"]
     job = htcondor.Job(cluster, step.htcondor["log"])
     state = job.state()
     if state["FAILED"]:
         step.FAILED = True
         module_logger.error(
             f"""{step.step_id()} htcondor jobs {cluster} FAILED"""
         )
         self._done(step, now)
         raise StepFailed()
     if state["DONE"]:
         module_logger.info(
             f"""{step.step_id()} htcondor jobs {cluster} done"""
         )
         self._done(step, now)
         # presumably self._done() sets step.finish -- confirm
         step.runtime = str(step.finish - step.start)
         return True
     # Throttle progress reports to one per 300 seconds.  total_seconds() is
     # required here: timedelta.seconds is only the seconds *component*
     # (0..86399) and silently drops whole days, so a long gap between
     # reports could otherwise look shorter than the threshold.
     last_reported = step.htcondor.get("check_reported", step.start)
     if (now - last_reported).total_seconds() > 300:
         step.htcondor["check_reported"] = now
         module_logger.info(
             f"""{step.step_id()}   {state["PERCENT"]}%          """)
     return False
Example #2
0
 def wait(self, state):
     """Wait for the HTCondor jobs recorded under ``state["raxmlng"]``.

     Builds an ``htcondor.Job`` from the stored "cluster" and "condor_log"
     entries and waits on it with the configured timeout.  When the wait
     reports "done", the elapsed time since ``state["raxmlng"]["started"]``
     is stored in ``state["raxmlng"]["overall_time"]`` and logged.  Any
     other status leaves ``state`` untouched.  Always returns None.
     """
     raxmlng = state["raxmlng"]
     job = htcondor.Job(clusters=raxmlng["cluster"],
                        condor_log=raxmlng["condor_log"])
     # NOTE(review): the key is spelled "wait_timout" -- presumably it
     # matches the configuration file; confirm before renaming.
     if job.wait(timeout=self.config["wait_timout"]) != "done":
         return
     raxmlng["overall_time"] = time_m.time() - raxmlng["started"]
     elapsed = maker_base.Result.time_str(raxmlng["overall_time"])
     module_logger.info("RaXML jobs completed in " + elapsed)
Example #3
0
    def analyse_logs(self, state):
        """Kill still-running RAxML tasks that already trail a finished run.

        A run counts as completed when its ``RAxML_bestTree.<run_id>`` file
        exists in ``state["raxml"]["output_dir"]``.  The last record of each
        completed run's ``RAxML_log.<run_id>`` is read and the one with the
        smallest score becomes the reference.  Running tasks whose last
        record has both a larger time and a larger score than the reference
        are candidates; the ``self.config["raxml_kill_rate"]`` fraction of
        them with the largest scores is killed through ``htcondor.Job``.
        Killed run ids are removed from ``state["raxml"]["run_ids"]`` and
        ``state["raxml"]["survived_tasks"]`` is decreased accordingly.

        Does nothing when no run has completed yet.  Returns None.

        Raises:
            RuntimeError: when a log file cannot be parsed after 10 attempts.
        """

        def load_log_file(filepath):
            """Parse a log into records ``{"t", "s", "f"}``.

            "t" is the first column, "s" the negated second column and "f"
            the last dot-separated component of the path.  An empty file
            yields a single ``{"t": 0, "s": 0}`` placeholder record.
            """
            suffix = str(filepath).split(".")[-1]
            for _ in range(10):
                try:
                    # The context manager closes the handle on every
                    # attempt, including the ones that fail to parse.
                    with filepath.open() as log:
                        records = [{
                            "t": float(fields[0]),
                            "s": -float(fields[1]),
                            "f": suffix
                        } for fields in (line.strip().split()
                                         for line in log)]
                except (ValueError, IndexError) as err:
                    # The file is being written at the moment: a truncated
                    # number raises ValueError, a line with a missing column
                    # raises IndexError.  Try again later.
                    module_logger.info(
                        '(ignored) cannot process {}: {}'.format(
                            filepath.name, err))
                else:
                    if not records:  # pending
                        records = [{"t": 0, "s": 0}]
                    return records
                time_m.sleep(3)
            raise RuntimeError("Cannot process {}".format(filepath))

        def time_score_from_log(files):
            """Return the last record with the smallest score among files."""
            return min((load_log_file(filepath)[-1] for filepath in files),
                       key=operator.itemgetter("s"))

        raxml = state["raxml"]
        output_dir = raxml["output_dir"]

        completed = [
            run_id for run_id in raxml["run_ids"]
            if Path(output_dir, "RAxML_bestTree." + run_id).exists()
        ]
        if not completed:
            return

        best_completed = time_score_from_log(
            Path(output_dir, "RAxML_log." + run_id) for run_id in completed)
        completed_ids = set(completed)
        running_logs = [
            f for f in (Path(output_dir, "RAxML_log." + run_id)
                        for run_id in raxml["run_ids"]
                        if run_id not in completed_ids) if f.exists()
        ]
        # Keyed by the integer task number taken from the log file suffix.
        data = {
            int(str(f).split(".")[-1]): load_log_file(f)
            for f in running_logs
        }
        scores_for_longer_worse_than_best_completed = {
            k: v[-1]["s"]
            for k, v in data.items() if v[-1]["t"] > best_completed["t"]
            and v[-1]["s"] > best_completed["s"]
        }
        # Ascending by score, so the tail holds the largest scores.
        by_score = sorted(
            scores_for_longer_worse_than_best_completed,
            key=scores_for_longer_worse_than_best_completed.get)
        n_to_kill = int(len(by_score) * self.config["raxml_kill_rate"])
        if n_to_kill <= 0:
            return

        to_kill = by_score[-n_to_kill:]
        module_logger.info(
            'completed: {} best: {} worse_than_best_completed: {} to kill: {}'
            .format(len(completed), best_completed, by_score, to_kill))
        job = htcondor.Job(clusters=raxml["cluster"],
                           condor_log=raxml["condor_log"])
        job.kill_tasks(to_kill)
        killed = set(to_kill)
        run_id_to_del = [
            ri for ri in raxml["run_ids"]
            if int(ri.split(".")[-1]) in killed
        ]
        # Remove in place: callers may hold a reference to this list.
        for ri in run_id_to_del:
            raxml["run_ids"].remove(ri)
        raxml["survived_tasks"] -= len(run_id_to_del)