class _Broker(mp.Process):
    """
    _Broker process handling dependencies and submission of jobs

    The broker consumes ``(EJob, payload)`` messages from the broker queue,
    keeps one YAML-encoded entry per command in a ``PersistentDict``, forwards
    runnable commands to the worker queue and answers monitor/cancel requests
    on the master results queue. Commands with unfinished dependencies are
    parked in ``pending_dict`` and re-examined whenever the broker queue has
    been idle for the polling interval.
    """

    def __init__(self, broker_queue, worker_queue, results_queue, results_queue_worker, pending_dict,
                 db_path=None):
        """
        Init function

        Parameters
        ----------
        broker_queue:
            queue of ``(EJob, payload)`` messages consumed by run()
        worker_queue:
            queue onto which runnable commands are put
        results_queue:
            queue used to answer the master (monitor results, cancel
            acknowledgements, ``(exception, traceback)`` tuples)
        results_queue_worker:
            stored on the instance; not read by any method of this class
        pending_dict:
            mapping from a waiting command to the command(s) it depends on
        db_path: str
            path handed to PersistentDict for command persistence
        """
        mp.Process.__init__(self)
        self.__input_queue = broker_queue
        self.__results_queue_master = results_queue
        self.__results_queue_worker = results_queue_worker
        self.__worker_queue = worker_queue
        self.__pending_dict = pending_dict
        self.__db = PersistentDict(db_path)

    def __del__(self):
        """
        Best-effort cleanup: kill the process group of every command that is
        not recorded as EXIT or DONE, then close the database.
        """
        try:
            # kill remaining running commands
            for _, raw in self.__db.RangeIter():
                # entries are written as YAML documents everywhere else in this
                # class, so decode before reading fields
                if isinstance(raw, (str, bytes)):
                    entry = yaml.load(raw, yaml.RoundTripLoader)
                else:
                    entry = raw
                if entry["status"] in [EStatus.EXIT, EStatus.DONE]:
                    continue
                p_id = entry["job_id"]
                if p_id is None:
                    # command was registered but never got a process ID
                    continue
                try:
                    os.killpg(os.getpgid(p_id), signal.SIGKILL)
                except OSError:
                    # process already gone (or not ours); keep cleaning up the
                    # remaining entries and still close the database
                    pass
            self.__db.close()
        except AttributeError:
            # __init__ did not get far enough to create the database
            pass

    def run(self):
        """
        Main loop of the broker process.

        Blocks up to 0.1 s for a message. On a message the matching action is
        executed; on timeout the pending commands are re-examined. Any
        exception raised while handling a message is sent to the master as an
        ``(exception, traceback string)`` tuple and the loop continues.
        """
        while True:
            try:
                ejob, args = self.__input_queue.get(True, 0.1)

                if ejob == EJob.STOP:
                    # terminate broker: Process.terminate() is a parent-side
                    # API (the process handle is unset inside the child), so
                    # simply leave the loop and let the process exit
                    return

                elif ejob == EJob.MONITOR:
                    # monitor request
                    self.__results_queue_master.put(self.__monitor(args))

                elif ejob == EJob.CANCEL:
                    # cancel job
                    status = self.__cancel(args)
                    self.__results_queue_master.put(True)
                    self.__update_status(args.command_id, status)

                elif ejob == EJob.UPDATE:
                    # updating command status (e.g, RUN, EXIT, DONE)
                    # message is coming from the worker process
                    c_id, status = args
                    self.__update_status(c_id, status)

                elif ejob == EJob.PID:
                    # updating process ID of submitted command
                    job_id, p_id = args
                    entry = yaml.load(self.__db[job_id], yaml.RoundTripLoader)
                    entry["job_id"] = p_id
                    self.__store_entry(job_id, entry)

                else:
                    # submitting job to worker
                    job, dependent = args
                    self.__add_command(job)
                    if dependent is not None:
                        self.__pending_dict[job] = dependent
                    else:
                        self.__worker_queue.put(job)

            except queue.Empty:
                self.__release_pending()

            except Exception as e:
                tb = traceback.format_exc()
                self.__results_queue_master.put((e, tb))

    def __release_pending(self):
        """
        Go through the pending jobs and update their status.

        A job whose dependencies all finished is handed to the worker queue;
        a job with a failed dependency is marked EXIT. Both are removed from
        the pending dictionary.
        """
        # iterate over a snapshot because entries are deleted while scanning
        for job, dependent in list(self.__pending_dict.items()):
            status = self.__condition_fulfilled(dependent)
            if job is not None:
                if status == EStatus.EXIT:
                    # job cannot be called due to termination of dependent jobs
                    self.__update_status(job.command_id, EStatus.EXIT)
                    del self.__pending_dict[job]
                elif status == EStatus.RUN:
                    self.__worker_queue.put(job)
                    del self.__pending_dict[job]
            else:
                # makes sure that a submitted job is properly registered and
                # join works as intended
                if status == EStatus.RUN:
                    del self.__pending_dict[job]

    def __condition_fulfilled(self, dependent):
        """
        Evaluate the dependencies of a pending command.

        Parameters
        ----------
        dependent: iterable of Command
            The commands that have to finish first

        Returns
        -------
        EStatus
            EXIT if a dependency terminated with EXIT, PEND if a dependency is
            not DONE yet, RUN if every dependency is DONE
        """
        for d in dependent:
            status = self.__monitor(d)
            if status == EStatus.EXIT:
                return EStatus.EXIT
            if status != EStatus.DONE:
                return EStatus.PEND
        return EStatus.RUN

    def __load_entry(self, c_id):
        """
        Return the decoded database entry of a command.

        Raises
        ------
        ValueError
            If no entry exists for ``c_id``
        """
        try:
            return yaml.load(self.__db[c_id], yaml.RoundTripLoader)
        except KeyError:
            raise ValueError("Command " + repr(c_id) + " has not been submitted yet.")

    def __store_entry(self, c_id, entry):
        """Encode ``entry`` as YAML, write it under ``c_id`` and sync the database."""
        self.__db[c_id] = yaml.dump(entry, Dumper=yaml.RoundTripDumper)
        self.__db.sync()

    def __add_command(self, command):
        """
        Register a command in the database with status PEND.

        An existing entry is refreshed and its ``tries`` counter incremented;
        otherwise a new entry with ``tries == 1`` is created.
        """
        try:
            # update entry if existing:
            entry = yaml.load(self.__db[command.command_id], yaml.RoundTripLoader)
            entry["name"] = command.name
            entry["tries"] += 1
            entry["job_id"] = None
            entry["status"] = EStatus.PEND
            entry["command"] = command.command
            entry["resources"] = command.resources
            entry["workdir"] = command.workdir
            entry["environment"] = command.environment
        except KeyError:
            # add new entry
            entry = {
                "name": command.name,
                "job_id": None,
                "tries": 1,
                "status": EStatus.PEND,
                "command": command.command,
                "resources": command.resources,
                "workdir": command.workdir,
                "environment": command.environment
            }
        self.__store_entry(command.command_id, entry)

    def __cancel(self, command):
        """
        Cancel a command.

        A pending command is removed from the pending dictionary, a running or
        suspended one has its process group killed.

        Returns
        -------
        EStatus
            DONE if the command had already finished, EXIT otherwise

        Raises
        ------
        ValueError
            If the command has not been submitted
        """
        entry = self.__load_entry(command.command_id)
        p_id = entry["job_id"]
        status = self.__monitor(command)
        if status == EStatus.PEND:
            # a command already handed to the worker queue is PEND without
            # being listed as pending, hence pop() instead of del
            self.__pending_dict.pop(command, None)
        elif status in [EStatus.SUSP, EStatus.RUN]:
            if p_id is not None:
                os.killpg(os.getpgid(p_id), signal.SIGKILL)
        elif status == EStatus.DONE:
            return status
        return EStatus.EXIT

    def __monitor(self, command):
        """
        local function to monitor a command via assigned pID

        Parameters
        ----------
        command: Command
            The Command object

        Returns
        -------
        EStatus
            The status of the command

        Raises
        ------
        ValueError
            If the command has not been submitted
        """
        if command in self.__pending_dict:
            return EStatus.PEND

        entry = self.__load_entry(command.command_id)
        p_id = entry["job_id"]
        if p_id is None:
            # No process ID has been reported yet. psutil.Process(pid=None)
            # would inspect the broker process itself and make the command
            # look finished, so report the recorded status instead.
            # NOTE(review): assumes the stored status compares equal to the
            # EStatus members after the YAML round trip - confirm.
            return entry["status"]

        cmd = entry["command"][0]
        try:
            p = psutil.Process(pid=p_id)
            status = p.status()
            p_cmd = " ".join(p.cmdline())

            # The pid can already be in use by another process, so the command
            # line is checked as well: if it differs, the original process has
            # completed.
            if cmd not in p_cmd:
                c_stat = EStatus.DONE
            elif status in [psutil.STATUS_DEAD, psutil.STATUS_ZOMBIE]:
                # zombies are killed and reported as terminated
                p.kill()
                c_stat = EStatus.EXIT
            elif status == psutil.STATUS_RUNNING:
                c_stat = EStatus.RUN
            else:
                c_stat = EStatus.SUSP
        except psutil.NoSuchProcess:
            c_stat = EStatus.DONE
        except psutil.AccessDenied:
            c_stat = EStatus.RUN

        entry["status"] = c_stat
        self.__store_entry(command.command_id, entry)
        return c_stat

    def __update_status(self, c_id, status):
        """
        updates the status of a command

        Parameters
        ----------
        c_id: str
            The Command id
        status: EStatus
            The new status

        Raises
        ------
        ValueError
            If the command has not been submitted
        """
        entry = self.__load_entry(c_id)
        entry["status"] = status
        self.__store_entry(c_id, entry)
class SlurmSubmitter(AClusterSubmitter):
    """
    Implements a Slurm submitter
    """
    __name = "slurm"
    __submit = "sbatch --job-name={name} {dependent} {resources} --wrap 'srun {cmd}'"
    __monitor = "squeue -t all -j {job_id}"
    __cancel = "scancel {job_id}"
    __resources = ""
    __resources_flag = {
        EResource.queue: "-p",
        EResource.time: "-t",
        EResource.mem: "--mem-per-cpu",
        EResource.nodes: "-c",
        EResource.error: "-e",
        EResource.out: "-o"
    }
    __job_id_pattern = re.compile(r"Submitted batch job ([0-9]*)")

    def __init__(self, blocking=False, db_path=None):
        """
        Init function

        Parameters
        ----------
        blocking: bool
            determines whether join() blocks or not
        db_path: str
            the string to a LevelDB for command persistence; when omitted a
            temporary database is created in the current working directory
            and removed again when the submitter is deleted
        """
        self.__blocking = blocking
        if db_path is None:
            # only the unique file name is needed, so close the handle at once
            tmp_db = NamedTemporaryFile(delete=False, dir=os.getcwd(), suffix=".db")
            tmp_db.close()
            self.__is_temp_db = True
            self.__db_path = tmp_db.name
        else:
            self.__is_temp_db = False
            self.__db_path = db_path
        self.__db = PersistentDict(self.__db_path)

    def __del__(self):
        """Close the database and remove it if it was a temporary one."""
        try:
            self.__db.close()
            if self.__is_temp_db:
                os.remove(self.__db_path)
        except (AttributeError, OSError):
            # AttributeError: __init__ did not finish;
            # OSError: the temporary database is already gone
            pass

    @property
    def isBlocking(self):
        return self.__blocking

    @property
    def name(self):
        return self.__name

    @property
    def monitor_command(self):
        return self.__monitor

    @property
    def resource_flags(self):
        return self.__resources_flag

    @property
    def submit_command(self):
        return self.__submit

    @property
    def db(self):
        return self.__db

    @property
    def cancel_command(self):
        return self.__cancel

    @property
    def job_id_pattern(self):
        return self.__job_id_pattern

    def _prepare_dependencies(self, dependent):
        """
        Build the sbatch dependency option.

        Parameters
        ----------
        dependent: Command or iterable of Command or None
            The command(s) that have to finish successfully first

        Returns
        -------
        str
            ``--kill-on-invalid-dep=yes --dependency=afterok:<id>[:<id>...]``
            or an empty string if there is nothing to depend on

        Raises
        ------
        ValueError
            If one of the commands has no database entry
        """
        if dependent is None:
            return ""
        deps = [dependent] if isinstance(dependent, Command) else list(dependent)
        if not deps:
            # an empty list would otherwise yield a dangling "afterok:"
            return ""
        try:
            dep_jobs = [yaml.load(self.__db[d.command_id], yaml.RoundTripLoader)["job_id"]
                        for d in deps]
        except KeyError:
            raise ValueError(
                "Specified depended jobs have not been submitted yet.")
        # sbatch expects bare job IDs joined by ':' after the dependency type
        return "--kill-on-invalid-dep=yes --dependency=afterok:{}".format(
            ":".join(str(j) for j in dep_jobs))

    def _prepare_resources(self, resources):
        """Render the resource mapping as ``<flag> <value>`` pairs for sbatch."""
        return " ".join("{} {}".format(self.resource_flags[k], v) for k, v in resources.items())

    def _get_status(self, stdo):
        """
        Translate squeue output into an EStatus.

        The state code is read from the fifth column of the second output
        line, i.e. the first line after the header.
        NOTE(review): assumes the default squeue column layout - confirm for
        sites that customise SQUEUE_FORMAT.
        """
        def status_map(st):
            if st in ["PD", "CF"]:
                return EStatus.PEND
            elif st in ["R", "CG"]:
                return EStatus.RUN
            elif st == "CD":
                return EStatus.DONE
            elif st in ["BF", "PR", "TO", "NF", "F", "CA"]:
                return EStatus.EXIT
            else:
                return EStatus.SUSP

        return status_map(stdo.split("\n")[1].split()[4].strip())
class SGESubmitter(AClusterSubmitter):
    """
    Implements an SGE (Sun Grid Engine) submitter
    """
    __name = "sge"
    __submit = "echo '{cmd}' | qsub -N {name} {dependent} {resources}"
    __monitor = "qstat"
    __cancel = "qdel {job_id}"
    __resources = ""
    __resources_flag = {
        EResource.queue: "-q",
        EResource.time: '-l h_rt=',
        EResource.mem: '-l h_vmem=',
        EResource.nodes: "-pe smp",
        EResource.error: "-e",
        EResource.out: "-o"
    }
    __job_id_pattern = re.compile(r'Your job ([0-9]+) .*')

    def __init__(self, blocking=False, db_path=None):
        """
        Init function

        Parameters
        ----------
        blocking: bool
            determines whether join() blocks or not
        db_path: str
            the string to a LevelDB for command persistence; when omitted a
            temporary database is created in the current working directory
            and removed again when the submitter is deleted
        """
        self.__blocking = blocking
        if db_path is None:
            # only the unique file name is needed, so close the handle at once
            tmp_db = NamedTemporaryFile(delete=False, dir=os.getcwd(), suffix=".db")
            tmp_db.close()
            self.__is_temp_db = True
            self.__db_path = tmp_db.name
        else:
            self.__is_temp_db = False
            self.__db_path = db_path
        self.__db = PersistentDict(self.__db_path)

    def __del__(self):
        """Close the database and remove it if it was a temporary one."""
        try:
            self.__db.close()
            if self.__is_temp_db:
                os.remove(self.__db_path)
        except (AttributeError, OSError):
            # AttributeError: __init__ did not finish;
            # OSError: the temporary database is already gone
            pass

    @property
    def isBlocking(self):
        return self.__blocking

    @property
    def name(self):
        return self.__name

    @property
    def monitor_command(self):
        return self.__monitor

    @property
    def resource_flags(self):
        return self.__resources_flag

    @property
    def submit_command(self):
        return self.__submit

    @property
    def db(self):
        return self.__db

    @property
    def cancel_command(self):
        return self.__cancel

    @property
    def job_id_pattern(self):
        return self.__job_id_pattern

    def _prepare_resources(self, resources):
        """
        Render the resource mapping for qsub.

        Memory and time flags end in ``=`` and take their value without a
        separating blank; all other flags are followed by a blank.
        """
        special_res = {EResource.mem, EResource.time}
        return " ".join(
            "{} {}".format(self.resource_flags[k], v) if k not in special_res
            else "{}{}".format(self.resource_flags[k], v)
            for k, v in resources.items())

    def _prepare_dependencies(self, dependent):
        """
        Build the qsub ``-hold_jid`` option.

        Parameters
        ----------
        dependent: Command or iterable of Command or None
            The command(s) that have to finish first

        Returns
        -------
        str
            ``-hold_jid <id>[,<id>...]`` or an empty string if there is
            nothing to depend on

        Raises
        ------
        ValueError
            If one of the commands has no database entry
        """
        if dependent is None:
            return ""
        deps = [dependent] if isinstance(dependent, Command) else list(dependent)
        if not deps:
            # an empty list would otherwise yield a bare "-hold_jid "
            return ""
        try:
            dep_jobs = [yaml.load(self.__db[d.command_id], yaml.RoundTripLoader)["job_id"]
                        for d in deps]
        except KeyError:
            raise ValueError(
                "Specified depended jobs have not been submitted yet.")
        # str() keeps the join working if job IDs were stored as integers
        return "-hold_jid " + ",".join(str(j) for j in dep_jobs)

    def _internal_monitor(self, command_id):
        """
        Query qstat for a command and persist the resulting status.

        Parameters
        ----------
        command_id: str
            The Command id

        Returns
        -------
        EStatus
            The status of the command

        Raises
        ------
        ValueError
            If the command has no database entry
        RuntimeError
            If the monitor command fails or exits with a positive return code
        """
        try:
            job_id = yaml.load(self.db[command_id], yaml.RoundTripLoader)["job_id"]
        except KeyError:
            raise ValueError("Command " + repr(command_id) + " has not been submitted yet.")

        submit = self.monitor_command.format(job_id=job_id)

        try:
            p = subprocess.Popen(submit, shell=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
            stdo, stde = p.communicate()
            stdr = p.returncode
            if stdr > 0:
                raise RuntimeError("Unsuccessful monitoring of " + repr(command_id) +
                                   " (EXIT!=0) with error: " + stde)
        except Exception as e:
            raise RuntimeError(e)

        status = self._get_status(stdo, job_id)
        entry = yaml.load(self.db[command_id], yaml.RoundTripLoader)
        entry["status"] = status
        self.db[command_id] = yaml.dump(entry, Dumper=yaml.RoundTripDumper)
        self.db.sync()
        return status

    def _get_status(self, stdo, job_id):
        """
        Translate qstat output into an EStatus.

        Parameters
        ----------
        stdo: str
            Standard output of qstat
        job_id:
            The job ID to look for in the first column

        Returns
        -------
        EStatus
            Mapped state of the matching line (fifth column); DONE if the job
            is not listed
        """
        def status_map(status):
            if status in ["r", "t"]:
                # running or being transferred to an execution host
                return EStatus.RUN
            elif status in ["qw", "hqw"]:
                # queued; "hqw" is what jobs held by -hold_jid show
                return EStatus.PEND
            elif status in ["Eqw", "Ewq", "e", "E", "s", "S", "T"]:
                # error ("Eqw"; legacy spelling "Ewq" kept) and suspended states
                return EStatus.SUSP
            else:
                return EStatus.EXIT

        # search in list for job_id and extract status
        wanted = str(job_id)
        for line in stdo.split("\n"):
            splits = line.split()
            # skip blank lines and lines too short to carry a state column
            if len(splits) < 5:
                continue
            if wanted == splits[0]:
                return status_map(splits[4])
        return EStatus.DONE
class LSFSubmitter(AClusterSubmitter):
    """
    Implements an LSF submitter
    """
    __name = "lsf"
    __submit = "bsub -J {name} {dependent} {resources} '{cmd}'"
    __monitor = "bjobs {job_id}"
    __cancel = "bkill {job_id}"
    __resources = ""
    __resources_flag = {
        EResource.queue: "-q",
        EResource.time: "-W",
        EResource.mem: "-R",
        EResource.nodes: "-n",
        EResource.error: "-e",
        EResource.out: "-o"
    }
    __job_id_pattern = re.compile(r"^Job <([0-9]*)>")

    def __init__(self, blocking=False, db_path=None):
        """
        Init function

        Parameters
        ----------
        blocking: bool
            determines whether join() blocks or not
        db_path: str
            the string to a LevelDB for command persistence; when omitted a
            temporary database is created in the current working directory
            and removed again when the submitter is deleted
        """
        self.__blocking = blocking
        if db_path is None:
            # only the unique file name is needed, so close the handle at once
            tmp_db = NamedTemporaryFile(delete=False, dir=os.getcwd(), suffix=".db")
            tmp_db.close()
            self.__is_temp_db = True
            self.__db_path = tmp_db.name
        else:
            self.__is_temp_db = False
            self.__db_path = db_path
        self.__db = PersistentDict(self.__db_path)

    def __del__(self):
        """Close the database and remove it if it was a temporary one."""
        try:
            self.__db.close()
            if self.__is_temp_db:
                os.remove(self.__db_path)
        except (AttributeError, OSError):
            # AttributeError: __init__ did not finish;
            # OSError: the temporary database is already gone
            pass

    @property
    def isBlocking(self):
        return self.__blocking

    @property
    def name(self):
        return self.__name

    @property
    def monitor_command(self):
        return self.__monitor

    @property
    def resource_flags(self):
        return self.__resources_flag

    @property
    def submit_command(self):
        return self.__submit

    @property
    def db(self):
        return self.__db

    @property
    def cancel_command(self):
        return self.__cancel

    @property
    def job_id_pattern(self):
        return self.__job_id_pattern

    def _get_status(self, stdo):
        """
        Translate bjobs output into an EStatus.

        The state is read from the third column of the second output line,
        i.e. the first line after the header; unknown states map to SUSP.
        """
        def status_map(st):
            if st == "PEND":
                return EStatus.PEND
            elif st == "RUN":
                return EStatus.RUN
            elif st == "DONE":
                return EStatus.DONE
            elif st == "EXIT":
                return EStatus.EXIT
            else:
                return EStatus.SUSP

        return status_map(stdo.split("\n")[1].split()[2].strip())

    def _prepare_dependencies(self, dependent):
        """
        Build the bsub ``-w`` dependency option.

        Parameters
        ----------
        dependent: Command or iterable of Command or None
            The command(s) that have to finish first

        Returns
        -------
        str
            ``-w <id>`` for a single command, ``-w "ended(<id>) && ..."`` for
            several, or an empty string if there is nothing to depend on

        Raises
        ------
        ValueError
            If one of the commands has no database entry
        """
        dep = ""
        if dependent is not None:
            try:
                if isinstance(dependent, Command):
                    d_info = yaml.load(self.__db[dependent.command_id], yaml.RoundTripLoader)
                    dep = "-w {}".format(d_info["job_id"])
                else:
                    dep_jobs = []
                    for d in dependent:
                        d_info = yaml.load(self.__db[d.command_id], yaml.RoundTripLoader)
                        dep_jobs.append(d_info["job_id"])
                    if dep_jobs:
                        # The expression contains '&&' and parentheses, which a
                        # shell would interpret, so it is passed as one quoted
                        # argument. Double quotes are used because the submit
                        # template already wraps the command in single quotes.
                        # NOTE(review): the single-command form uses a bare job
                        # ID while this form uses ended(); confirm that the
                        # differing conditions are intended.
                        dep = '-w "{}"'.format(
                            " && ".join("ended({})".format(d) for d in dep_jobs))
            except KeyError:
                raise ValueError(
                    "Specified depended jobs have not been submitted yet.")
        return dep

    def _prepare_resources(self, resources):
        """
        Render the resource mapping for bsub.

        Memory is wrapped into a ``'rusage[mem=<value>]'`` resource string;
        all other resources are rendered as ``<flag> <value>``.
        """
        return " ".join(
            "{} {}".format(self.resource_flags[k], v) if k != EResource.mem
            else "{} 'rusage[mem={}]'".format(self.resource_flags[k], v)
            for k, v in resources.items())