def _getJobStatusTree(self):
    import xml.etree.ElementTree as ET
    import getpass
    import time

    if self.jobname is None:
        raise ValueError("The jobname needs to be defined.")

    user = getpass.getuser()
    cmd = [self._qstatus, "-u", user, "-xml"]
    if self.queue is not None:
        cmd += ["-q", ",".join(ensurelist(self.queue))]
    logger.debug(cmd)

    # This command fails randomly, so retry a few times instead of crashing
    # the adaptive run
    tries = 0
    while tries < 3:
        try:
            ret = check_output(cmd, stderr=DEVNULL)
        except CalledProcessError:
            if tries == 2:
                raise
            tries += 1
            time.sleep(3)
            continue
        break

    logger.debug(ret.decode("ascii"))
    return ET.fromstring(ret.decode("ascii").strip())
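# `_robust_check_output`, used by the SLURM methods below, is not shown in this
# section. A minimal sketch of its assumed behavior (the same retry logic as
# the inline loop above, factored into a helper; the `maxtries` and `sleeptime`
# parameter names are hypothetical):
def _robust_check_output(self, cmd, maxtries=3, sleeptime=3):
    import time

    tries = 0
    while True:
        try:
            # Suppress stderr exactly as the inline retry loops do
            return check_output(cmd, stderr=DEVNULL)
        except CalledProcessError:
            tries += 1
            if tries >= maxtries:
                raise  # Give up after maxtries consecutive failures
            time.sleep(sleeptime)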
def jobInfo(self):
    from jobqueues.simqueue import QueueJobStatus

    if self.jobname is None:
        raise ValueError("The jobname needs to be defined.")

    cmd = [
        self._qjobinfo,
        "--name",
        self.jobname,
        "-u",
        self.user,
        "-o",
        "JobID,JobName,State,ExitCode,Reason,Timelimit",
        "-P",
        "-X",
    ]
    if self.partition is not None:
        cmd += ["--partition", ",".join(ensurelist(self.partition))]
    logger.debug(cmd)

    ret = self._robust_check_output(cmd).decode("ascii")
    logger.debug(ret)

    # TODO: Is there a specific exit code for this?
    if "Slurm accounting storage is disabled" in ret:
        return None

    lines = ret.splitlines()
    if len(lines) < 2:
        return None

    info = {}
    for line in lines[1:]:
        jobid, _, state, exitcode, reason, timelimit = line.split("|")
        if state in JOB_STATE_CODES:
            state = JOB_STATE_CODES[state]
        else:
            raise RuntimeError(f'Unknown SLURM job state "{state}"')
        info[jobid] = {
            "state": state,
            "exitcode": exitcode,
            "reason": reason,
            "timelimit": timelimit,
        }
    return info
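# `JOB_STATE_CODES` and `_inProgressStatus` (used in `inprogress` below) are
# module-level constants not shown in this section. A minimal sketch of their
# assumed shape; the exact SLURM states covered and the QueueJobStatus member
# names are assumptions:
#
#     JOB_STATE_CODES = {
#         "PENDING": QueueJobStatus.PENDING,
#         "RUNNING": QueueJobStatus.RUNNING,
#         "COMPLETED": QueueJobStatus.COMPLETED,
#         "FAILED": QueueJobStatus.FAILED,
#         "CANCELLED": QueueJobStatus.CANCELLED,
#         "TIMEOUT": QueueJobStatus.TIMEOUT,
#     }
#     _inProgressStatus = (QueueJobStatus.PENDING, QueueJobStatus.RUNNING)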
def inprogress(self):
    """Returns the sum of the number of running and queued workunits of the
    specific group in the engine.

    Returns
    -------
    total : int
        Total running and queued workunits
    """
    if self.jobname is None:
        raise ValueError("The jobname needs to be defined.")

    cmd = [self._qstatus, "-n", self.jobname, "-u", self.user]
    if self.partition is not None:
        cmd += ["--partition", ",".join(ensurelist(self.partition))]
    logger.debug(cmd)

    ret = self._robust_check_output(cmd).decode("ascii")
    logger.debug(ret)

    # Count the lines returned by squeue (minus the header) as "in progress" jobs
    lines = ret.splitlines()
    inprog = max(0, len(lines) - 1)

    # Cross-check with sacct because squeue sometimes fails to report the right number
    try:
        res = self.jobInfo()
        if res is None:
            return inprog
        info = [key for key, val in res.items() if val["state"] in _inProgressStatus]
        if len(info) != inprog:
            logger.warning(
                f"squeue and sacct reported different numbers of running jobs "
                f"({inprog}/{len(info)}) with name {self.jobname}. Using the max of the two."
            )
            inprog = max(inprog, len(info))
    except Exception as e:
        logger.warning(f"Failed to get jobInfo with error: {e}")

    return inprog
def stop(self):
    """Cancels all currently running and queued jobs"""
    if self.jobname is None:
        raise ValueError("The jobname needs to be defined.")

    if self.partition is not None:
        for q in ensurelist(self.partition):
            cmd = [self._qcancel, "-n", self.jobname, "-u", self.user, "-p", q]
            logger.debug(cmd)
            ret = check_output(cmd)
            logger.debug(ret.decode("ascii"))
    else:
        cmd = [self._qcancel, "-n", self.jobname, "-u", self.user]
        logger.debug(cmd)
        ret = check_output(cmd)
        logger.debug(ret.decode("ascii"))
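# Example usage of the SLURM queue methods above (a sketch; the class name
# `SlurmQueue` and the constructor/attribute defaults are assumptions):
#
#     import time
#
#     queue = SlurmQueue()
#     queue.jobname = "myproject"
#     queue.partition = "normalGPU"
#     queue.submit(["./generators/s1", "./generators/s2"])
#     while queue.inprogress() > 0:  # Poll until all jobs have finished
#         time.sleep(60)
#     queue.stop()                   # Cancel anything still queued or running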
def submit(self, folders):
    from jobqueues.util import ensurelist
    import uuid

    if self.group is None:
        self.group = uuid.uuid4()

    for f in ensurelist(folders):
        if self.jobname is None:
            name = os.path.basename(os.path.abspath(f))
        else:
            name = self.jobname

        job = self.session.startApp(self.app)
        job.readArguments(os.path.join(f, self.configname))
        job.name = name
        job.group = self.group
        job.submit(
            childOf=self.parentjob._execid if self.parentjob is not None else None
        )
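# Example usage of `submit` above (a sketch; the class name `PlayQueue`, the
# session setup, and the app/config values are assumptions based on the
# playmolecule API used here):
#
#     pq = PlayQueue()
#     pq.session = session        # An authenticated playmolecule session
#     pq.app = "acemd"
#     pq.configname = "input.yaml"
#     pq.submit(["./sims/run1", "./sims/run2"])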
def inprogress(self):
    """Returns the sum of the number of running and queued workunits of the
    specific group in the engine.

    Returns
    -------
    total : int
        Total running and queued workunits
    """
    import time
    import getpass

    if self.queue is None:
        raise ValueError("The queue needs to be defined.")
    if self.jobname is None:
        raise ValueError("The jobname needs to be defined.")

    user = getpass.getuser()
    total = 0
    for q in ensurelist(self.queue):
        cmd = [self._qstatus, "-J", self.jobname, "-u", user, "-q", q]
        logger.debug(cmd)

        # This command fails randomly, so retry a few times instead of
        # crashing the adaptive run
        tries = 0
        while tries < 3:
            try:
                ret = check_output(cmd, stderr=DEVNULL)
            except CalledProcessError:
                if tries == 2:
                    raise
                tries += 1
                time.sleep(3)
                continue
            break

        logger.debug(ret.decode("ascii"))
        # TODO: check lines and handle errors
        # Subtract the two header lines of the status output; clamp at zero
        # in case something odd happened
        njobs = max(0, len(ret.decode("ascii").split("\n")) - 2)
        total += njobs
    return total
def _createJobScript(self, fname, workdir, runsh):
    from jobqueues.config import template_env

    workdir = os.path.abspath(workdir)
    sentinel = os.path.normpath(os.path.join(workdir, self._sentinel))

    # Create a directory in datadir to which completed trajectories are moved
    odir = None
    if self.datadir is not None:
        datadir = os.path.abspath(self.datadir)
        if not os.path.isdir(datadir):
            os.makedirs(datadir, exist_ok=True)
        simname = os.path.basename(os.path.normpath(workdir))
        odir = os.path.join(datadir, simname)
        os.makedirs(odir, exist_ok=True)

    # Scale the memory value for the template (divided by 1000 and rounded up)
    memory = int(ceil(self.memory / 1000)) if self.memory is not None else None

    template = template_env.get_template("SGE_job.sh.j2")
    job_str = template.render(
        jobname=self.jobname,
        queue=",".join(ensurelist(self.queue)),
        pe=self.pe,
        cores=self.ncpu,
        ngpu=self.ngpu,
        memory=memory,
        workdir=workdir,
        envvars=self.envvars,
        walltime=self.walltime,
        sentinel=sentinel,
        prerun=self.prerun,
        runsh=runsh,
        datadir=self.datadir,
        odir=odir,
        trajext=self.trajext,
    )

    with open(fname, "w") as f:
        f.write(job_str)
    os.chmod(fname, 0o700)
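# `template_env` comes from jobqueues.config and is not shown here. A minimal
# sketch of how such a jinja2 environment is typically constructed (the package
# and template directory names are assumptions):
#
#     from jinja2 import Environment, PackageLoader
#     template_env = Environment(loader=PackageLoader("jobqueues", "templates"))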
def stop(self):
    """Cancels all currently running and queued jobs"""
    import getpass

    if self.jobname is None:
        raise ValueError("The jobname needs to be defined.")

    user = getpass.getuser()
    if self.queue is not None:
        for q in ensurelist(self.queue):
            cmd = [self._qcancel, "-J", self.jobname, "-u", user, "-q", q]
            logger.debug(cmd)
            ret = check_output(cmd, stderr=DEVNULL)
            logger.debug(ret.decode("ascii"))
    else:
        cmd = [self._qcancel, "-J", self.jobname, "-u", user]
        logger.debug(cmd)
        ret = check_output(cmd, stderr=DEVNULL)
        logger.debug(ret.decode("ascii"))
def retrieve(self):
    from glob import glob
    from playmolecule import JobStatus
    from jobqueues.util import ensurelist

    retrievedir = self.retrievedir if self.retrievedir is not None else self.datadir
    jobs = self._getJobs(
        returnDict=False, status=(JobStatus.COMPLETED, JobStatus.ERROR)
    )
    for job in jobs:
        targetdir = os.path.join(self.datadir, job.name)
        retdir = job.retrieve(path=retrievedir, skip=True)
        if job.getStatus() != JobStatus.ERROR:
            for fglob in ensurelist(self.copy):
                currglob = os.path.join(retdir, fglob)
                if "*" in currglob:
                    # Expand wildcard patterns and copy/symlink each match
                    for ff in glob(currglob):
                        _symlinkorcopy(ff, targetdir, self.symlink)
                else:
                    _symlinkorcopy(currglob, targetdir, self.symlink)
def _createJobScript(self, fname, workdir, runsh):
    from jobqueues.util import ensurelist

    workdir = os.path.abspath(workdir)
    with open(fname, "w") as f:
        f.write("#!/bin/bash\n")
        f.write("#\n")
        f.write("#BSUB -J {}\n".format(self.jobname))
        f.write('#BSUB -q "{}"\n'.format(" ".join(ensurelist(self.queue))))
        f.write("#BSUB -n {}\n".format(self.ncpu))
        if self.app is not None:
            f.write("#BSUB -app {}\n".format(self.app))
        if self.ngpu != 0:
            if self.version == 9:
                if self.gpu_options is not None:
                    logger.warning(
                        "gpu_options argument was set while it is not needed for LSF version 9"
                    )
                f.write(
                    '#BSUB -R "select[ngpus>0] rusage[ngpus_excl_p={}]"\n'.format(
                        self.ngpu
                    )
                )
            elif self.version == 10:
                if not self.gpu_options:
                    self.gpu_options = {"mode": "exclusive_process"}
                gpu_requirements = ["num={}".format(self.ngpu)]
                for key, val in self.gpu_options.items():
                    gpu_requirements.append("{}={}".format(key, val))
                f.write('#BSUB -gpu "{}"\n'.format(":".join(gpu_requirements)))
            else:
                raise AttributeError("Version not supported")
        if self.resources is not None:
            for resource in ensurelist(self.resources):
                f.write('#BSUB -R "{}"\n'.format(resource))
        f.write("#BSUB -M {}\n".format(self.memory))
        f.write("#BSUB -cwd {}\n".format(workdir))
        f.write("#BSUB -outdir {}\n".format(workdir))
        f.write("#BSUB -o {}\n".format(self.outputstream))
        f.write("#BSUB -e {}\n".format(self.errorstream))
        if self.envvars is not None:
            f.write("#BSUB --env {}\n".format(self.envvars))
        if self.walltime is not None:
            f.write("#BSUB -W {}\n".format(self.walltime))

        # Trap kill signals to create the sentinel file
        f.write(
            '\ntrap "touch {}" EXIT SIGTERM\n'.format(
                os.path.normpath(os.path.join(workdir, self._sentinel))
            )
        )
        f.write("\n")
        if self.prerun is not None:
            for call in ensurelist(self.prerun):
                f.write("{}\n".format(call))
        f.write("\ncd {}\n".format(workdir))
        f.write("{}".format(runsh))

        # Move completed trajectories to the data directory
        if self.datadir is not None:
            simname = os.path.basename(os.path.normpath(workdir))
            datadir = os.path.abspath(os.path.join(self.datadir, simname))
            os.makedirs(datadir, exist_ok=True)
            f.write(f"\nmv *.{self.trajext} {datadir}")

    os.chmod(fname, 0o700)
def _createJobScript(self, fname, workdir, runsh):
    workdir = os.path.abspath(workdir)
    with open(fname, "w") as f:
        f.write("#!/bin/bash\n")
        f.write("#\n")
        f.write("#SBATCH --job-name={}\n".format(self.jobname))
        f.write(
            "#SBATCH --partition={}\n".format(",".join(ensurelist(self.partition)))
        )
        if self.ngpu != 0:
            f.write("#SBATCH --gres=gpu:{}".format(self.ngpu))
            if self.gpumemory is not None:
                f.write(",gpu_mem:{}".format(self.gpumemory))
            f.write("\n")
        f.write("#SBATCH --cpus-per-task={}\n".format(self.ncpu))
        f.write("#SBATCH --mem={}\n".format(self.memory))
        f.write("#SBATCH --priority={}\n".format(self.priority))
        # Don't use the long flag: depending on the SLURM version it is either
        # --workdir or --chdir
        f.write("#SBATCH -D {}\n".format(workdir))
        f.write("#SBATCH --output={}\n".format(self.outputstream))
        f.write("#SBATCH --error={}\n".format(self.errorstream))
        if self.envvars is not None:
            f.write("#SBATCH --export={}\n".format(self.envvars))
        if self.walltime is not None:
            f.write("#SBATCH --time={}\n".format(self.walltime))
        if self.mailtype is not None and self.mailuser is not None:
            f.write("#SBATCH --mail-type={}\n".format(self.mailtype))
            f.write("#SBATCH --mail-user={}\n".format(self.mailuser))
        if self.nodelist is not None:
            f.write(
                "#SBATCH --nodelist={}\n".format(",".join(ensurelist(self.nodelist)))
            )
        if self.exclude is not None:
            f.write(
                "#SBATCH --exclude={}\n".format(",".join(ensurelist(self.exclude)))
            )
        if self.account is not None:
            f.write("#SBATCH --account={}\n".format(self.account))

        # Trap kill signals to create the sentinel file
        f.write(
            '\ntrap "touch {}" EXIT SIGTERM\n'.format(
                os.path.normpath(os.path.join(workdir, self._sentinel))
            )
        )
        f.write("\n")
        if self.prerun is not None:
            for call in ensurelist(self.prerun):
                f.write("{}\n".format(call))
        f.write("\ncd {}\n".format(workdir))
        f.write("{}".format(runsh))

        # Move completed trajectories to the data directory
        if self.datadir is not None:
            simname = os.path.basename(os.path.normpath(workdir))
            datadir = os.path.abspath(os.path.join(self.datadir, simname))
            os.makedirs(datadir, exist_ok=True)
            f.write(f"\nmv *.{self.trajext} {datadir}")

    os.chmod(fname, 0o700)
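# For reference, a script generated by the method above looks roughly like this
# (all values are illustrative, including the sentinel file name):
#
#     #!/bin/bash
#     #
#     #SBATCH --job-name=myproject
#     #SBATCH --partition=normalGPU
#     #SBATCH --gres=gpu:1
#     #SBATCH --cpus-per-task=4
#     #SBATCH --mem=4000
#     #SBATCH -D /path/to/workdir
#     #SBATCH --output=slurm.%N.%j.out
#     #SBATCH --error=slurm.%N.%j.err
#
#     trap "touch /path/to/workdir/jobqueues.done" EXIT SIGTERM
#
#     cd /path/to/workdir
#     ./run.sh
#     mv *.xtc /path/to/datadir/workdir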