Beispiel #1
0
    def _getJobStatusTree(self):
        """Run the queue status command and parse its XML output.

        The command ``self._qstatus`` is invoked for the current user with the
        ``-xml`` flag, restricted to ``self.queue`` when a queue is set. The
        call is attempted up to three times, sleeping three seconds after each
        failed attempt.

        Returns
        -------
        root : xml.etree.ElementTree.Element
            Root element parsed from the (stripped) command output

        Raises
        ------
        ValueError
            If ``self.jobname`` is None
        CalledProcessError
            If the status command still fails on the third attempt
        """
        import getpass
        import time
        import xml.etree.ElementTree as ET

        if self.jobname is None:
            raise ValueError("The jobname needs to be defined.")

        username = getpass.getuser()
        cmd = [self._qstatus, "-u", username, "-xml"]
        if self.queue is not None:
            cmd.extend(["-q", ",".join(ensurelist(self.queue))])

        logger.debug(cmd)

        # The status command fails sporadically, so give it a few chances
        # before letting the error propagate to the caller.
        max_attempts = 3
        for attempt in range(1, max_attempts + 1):
            try:
                ret = check_output(cmd, stderr=DEVNULL)
            except CalledProcessError:
                if attempt == max_attempts:
                    raise
                time.sleep(3)
            else:
                break

        xml_text = ret.decode("ascii")
        logger.debug(xml_text)
        return ET.fromstring(xml_text.strip())
Beispiel #2
0
    def jobInfo(self):
        """Collect accounting information for all jobs named ``self.jobname``.

        Runs ``self._qjobinfo`` for ``self.user`` (optionally restricted to
        ``self.partition``) and parses its ``|``-separated output, skipping
        the first line.

        Returns
        -------
        info : dict or None
            Dictionary keyed by job ID. Each value is a dictionary with the
            keys "state", "exitcode", "reason" and "timelimit". None is
            returned when the output reports that accounting storage is
            disabled or when it contains fewer than two lines.

        Raises
        ------
        ValueError
            If ``self.jobname`` is None
        RuntimeError
            If a reported job state is not present in ``JOB_STATE_CODES``
        """
        # NOTE(review): this name is not referenced directly in this method
        from jobqueues.simqueue import QueueJobStatus

        if self.jobname is None:
            raise ValueError("The jobname needs to be defined.")

        columns = "JobID,JobName,State,ExitCode,Reason,Timelimit"
        cmd = [self._qjobinfo, "--name", self.jobname, "-u", self.user]
        cmd += ["-o", columns, "-P", "-X"]
        if self.partition is not None:
            cmd.extend(["--partition", ",".join(ensurelist(self.partition))])

        logger.debug(cmd)
        output = self._robust_check_output(cmd).decode("ascii")
        logger.debug(output)

        # TODO: Is there a specific exit code for this?
        if "Slurm accounting storage is disabled" in output:
            return None

        # Everything after the first line is treated as one record per job
        rows = output.splitlines()[1:]
        if not rows:
            return None

        info = {}
        for row in rows:
            jobid, _, state, exitcode, reason, timelimit = row.split("|")

            if state not in JOB_STATE_CODES:
                raise RuntimeError(f'Unknown SLURM job state "{state}"')

            info[jobid] = dict(
                state=JOB_STATE_CODES[state],
                exitcode=exitcode,
                reason=reason,
                timelimit=timelimit,
            )

        return info
Beispiel #3
0
    def inprogress(self):
        """Count the running and queued jobs that carry ``self.jobname``.

        The count is taken from the output of ``self._qstatus`` and then
        cross-checked against ``self.jobInfo()``; when the two disagree a
        warning is logged and the larger number is used. A failure of the
        cross-check is logged and otherwise ignored.

        Returns
        -------
        total : int
            Total running and queued workunits

        Raises
        ------
        ValueError
            If ``self.jobname`` is None
        """
        if self.jobname is None:
            raise ValueError("The jobname needs to be defined.")

        cmd = [self._qstatus, "-n", self.jobname, "-u", self.user]
        if self.partition is not None:
            cmd.extend(["--partition", ",".join(ensurelist(self.partition))])

        logger.debug(cmd)
        output = self._robust_check_output(cmd).decode("ascii")
        logger.debug(output)

        # One job per output line, not counting the first line
        # (presumably the squeue header).
        queued = max(0, len(output.splitlines()) - 1)

        # squeue sometimes under-reports, so compare against sacct as well
        try:
            accounting = self.jobInfo()
            if accounting is None:
                return queued
            active = sum(
                1
                for entry in accounting.values()
                if entry["state"] in _inProgressStatus
            )
            if active != queued:
                logger.warning(
                    f"squeue and sacct gave different number of running jobs ({queued}/{active}) with name {self.jobname}. Using the max of the two."
                )
            queued = max(queued, active)
        except Exception as e:
            # Best effort only: fall back to the squeue-based count
            logger.warning(f"Failed to get jobInfo with error: {e}")

        return queued
Beispiel #4
0
    def stop(self):
        """Cancel all running and queued jobs named ``self.jobname``.

        When ``self.partition`` is set, one cancel command is issued per
        partition; otherwise a single command without a partition filter is
        issued.

        Raises
        ------
        ValueError
            If ``self.jobname`` is None
        """
        if self.jobname is None:
            raise ValueError("The jobname needs to be defined.")

        base_cmd = [self._qcancel, "-n", self.jobname, "-u", self.user]
        if self.partition is None:
            commands = [base_cmd]
        else:
            commands = [base_cmd + ["-p", q] for q in ensurelist(self.partition)]

        for cmd in commands:
            logger.debug(cmd)
            output = check_output(cmd)
            logger.debug(output.decode("ascii"))
Beispiel #5
0
    def submit(self, folders):
        """Submit one job per folder through ``self.session``.

        A random UUID is assigned to ``self.group`` when no group is set. Each
        job reads its arguments from ``self.configname`` inside its folder and
        is submitted as a child of ``self.parentjob`` when one is defined.

        Parameters
        ----------
        folders : str or list of str
            Folder(s) containing the job configuration file
        """
        import uuid
        from jobqueues.util import ensurelist

        if self.group is None:
            self.group = uuid.uuid4()

        for folder in ensurelist(folders):
            # Fall back to the folder name when no explicit job name is set
            if self.jobname is not None:
                name = self.jobname
            else:
                name = os.path.basename(os.path.abspath(folder))

            job = self.session.startApp(self.app)
            job.readArguments(os.path.join(folder, self.configname))
            job.name = name
            job.group = self.group

            if self.parentjob is None:
                parent_id = None
            else:
                parent_id = self.parentjob._execid
            job.submit(childOf=parent_id)
Beispiel #6
0
    def inprogress(self):
        """Count the running and queued jobs named ``self.jobname`` over all queues.

        ``self._qstatus`` is run once per queue in ``self.queue`` for the
        current user. Each call is attempted up to three times, sleeping three
        seconds after each failed attempt.

        Returns
        -------
        total : int
            Total running and queued workunits

        Raises
        ------
        ValueError
            If ``self.queue`` or ``self.jobname`` is None
        CalledProcessError
            If the status command still fails on the third attempt
        """
        import getpass
        import time

        if self.queue is None:
            raise ValueError("The queue needs to be defined.")
        if self.jobname is None:
            raise ValueError("The jobname needs to be defined.")

        username = getpass.getuser()
        max_attempts = 3
        total = 0
        for q in ensurelist(self.queue):
            cmd = [self._qstatus, "-J", self.jobname, "-u", username, "-q", q]
            logger.debug(cmd)

            # The status command fails sporadically, so give it a few chances
            # before letting the error propagate to the caller.
            for attempt in range(1, max_attempts + 1):
                try:
                    ret = check_output(cmd, stderr=DEVNULL)
                except CalledProcessError:
                    if attempt == max_attempts:
                        raise
                    time.sleep(3)
                else:
                    break

            text = ret.decode("ascii")
            logger.debug(text)

            # TODO: check lines and handle errors
            # Two pieces of the split output are not jobs (presumably the
            # header line and the empty string after the final newline);
            # a negative result means something odd happened and counts as 0.
            total += max(0, len(text.split("\n")) - 2)
        return total
Beispiel #7
0
    def _createJobScript(self, fname, workdir, runsh):
        """Render the ``SGE_job.sh.j2`` template into an executable job script.

        When ``self.datadir`` is set, both that directory and a sub-directory
        named after the working directory are created, and the sub-directory
        is passed to the template as ``odir``.

        Parameters
        ----------
        fname : str
            Path of the job script to write
        workdir : str
            Working directory of the job; converted to an absolute path
        runsh : str
            Passed to the template as ``runsh``
        """
        from jobqueues.config import template_env

        workdir = os.path.abspath(workdir)
        sentinel = os.path.normpath(os.path.join(workdir, self._sentinel))

        # Destination for completed trajectories, if a data directory is set
        odir = None
        if self.datadir is not None:
            datadir = os.path.abspath(self.datadir)
            os.makedirs(datadir, exist_ok=True)
            simname = os.path.basename(os.path.normpath(workdir))
            odir = os.path.join(datadir, simname)
            os.makedirs(odir, exist_ok=True)

        # The template receives the memory divided by 1000 and rounded up
        if self.memory is None:
            memory = None
        else:
            memory = int(ceil(self.memory / 1000))

        template = template_env.get_template("SGE_job.sh.j2")
        context = {
            "jobname": self.jobname,
            "queue": ",".join(ensurelist(self.queue)),
            "pe": self.pe,
            "cores": self.ncpu,
            "ngpu": self.ngpu,
            "memory": memory,
            "workdir": workdir,
            "envvars": self.envvars,
            "walltime": self.walltime,
            "sentinel": sentinel,
            "prerun": self.prerun,
            "runsh": runsh,
            "datadir": self.datadir,
            "odir": odir,
            "trajext": self.trajext,
        }
        job_str = template.render(**context)

        with open(fname, "w") as script:
            script.write(job_str)

        # Read/write/execute for the owner only
        os.chmod(fname, 0o700)
0
    def stop(self):
        """Cancel all running and queued jobs named ``self.jobname``.

        The cancel command is issued for the current user, once per queue when
        ``self.queue`` is set and once without a queue filter otherwise. The
        standard error of the command is discarded.

        Raises
        ------
        ValueError
            If ``self.jobname`` is None
        """
        import getpass

        if self.jobname is None:
            raise ValueError("The jobname needs to be defined.")

        username = getpass.getuser()
        base_cmd = [self._qcancel, "-J", self.jobname, "-u", username]

        if self.queue is None:
            commands = [base_cmd]
        else:
            commands = [base_cmd + ["-q", q] for q in ensurelist(self.queue)]

        for cmd in commands:
            logger.debug(cmd)
            output = check_output(cmd, stderr=DEVNULL)
            logger.debug(output.decode("ascii"))
Beispiel #9
0
    def retrieve(self):
        """Retrieve finished jobs and link or copy their results into ``self.datadir``.

        All jobs with status COMPLETED or ERROR are retrieved into
        ``self.retrievedir`` (or ``self.datadir`` when no retrieve directory
        is set). For jobs that did not end in ERROR, every entry of
        ``self.copy`` is passed to ``_symlinkorcopy`` with a target directory
        named after the job inside ``self.datadir``; entries containing ``*``
        are expanded with ``glob`` first.
        """
        from glob import glob
        from playmolecule import JobStatus
        from jobqueues.util import ensurelist

        if self.retrievedir is None:
            retrievedir = self.datadir
        else:
            retrievedir = self.retrievedir

        finished = (JobStatus.COMPLETED, JobStatus.ERROR)
        for job in self._getJobs(returnDict=False, status=finished):
            targetdir = os.path.join(self.datadir, job.name)
            retdir = job.retrieve(path=retrievedir, skip=True)

            if job.getStatus() != JobStatus.ERROR:
                for pattern in ensurelist(self.copy):
                    source = os.path.join(retdir, pattern)
                    # Wildcards are expanded; plain paths are used as they are
                    matches = glob(source) if "*" in source else [source]
                    for path in matches:
                        _symlinkorcopy(path, targetdir, self.symlink)
Beispiel #10
0
    def _createJobScript(self, fname, workdir, runsh):
        """Write a bash job script with ``#BSUB`` directives to ``fname``.

        The script contains the scheduler directives built from the instance
        attributes, a trap that touches the sentinel file, the optional
        ``self.prerun`` commands, a ``cd`` into the working directory and the
        ``runsh`` text. When ``self.datadir`` is set, a sub-directory named
        after the working directory is created there and a final ``mv`` of the
        ``self.trajext`` files into it is appended.

        When GPUs are requested for version 10 and ``self.gpu_options`` is
        empty, ``self.gpu_options`` is set to a default on the instance.

        Parameters
        ----------
        fname : str
            Path of the job script to write
        workdir : str
            Working directory of the job; converted to an absolute path
        runsh : str
            Text written after the ``cd`` line

        Raises
        ------
        AttributeError
            If GPUs are requested and ``self.version`` is neither 9 nor 10
        """
        from jobqueues.util import ensurelist

        workdir = os.path.abspath(workdir)
        with open(fname, "w") as script:
            put = script.write

            put("#!/bin/bash\n")
            put("#\n")
            put(f"#BSUB -J {self.jobname}\n")
            queues = " ".join(ensurelist(self.queue))
            put(f'#BSUB -q "{queues}"\n')
            put(f"#BSUB -n {self.ncpu}\n")
            if self.app is not None:
                put(f"#BSUB -app {self.app}\n")

            # The GPU request syntax depends on the LSF version
            if self.ngpu != 0:
                if self.version == 9:
                    if self.gpu_options is not None:
                        logger.warning(
                            "gpu_options argument was set while it is not needed for LSF version 9"
                        )
                    put(
                        f'#BSUB -R "select[ngpus>0] rusage[ngpus_excl_p={self.ngpu}]"\n'
                    )
                elif self.version == 10:
                    if not self.gpu_options:
                        self.gpu_options = {"mode": "exclusive_process"}
                    gpu_spec = [f"num={self.ngpu}"]
                    gpu_spec.extend(
                        f"{key}={self.gpu_options[key]}" for key in self.gpu_options
                    )
                    gpu_line = ":".join(gpu_spec)
                    put(f'#BSUB -gpu "{gpu_line}"\n')
                else:
                    raise AttributeError("Version not supported")

            if self.resources is not None:
                for resource in ensurelist(self.resources):
                    put(f'#BSUB -R "{resource}"\n')
            put(f"#BSUB -M {self.memory}\n")
            put(f"#BSUB -cwd {workdir}\n")
            put(f"#BSUB -outdir {workdir}\n")
            put(f"#BSUB -o {self.outputstream}\n")
            put(f"#BSUB -e {self.errorstream}\n")
            if self.envvars is not None:
                put(f"#BSUB --env {self.envvars}\n")
            if self.walltime is not None:
                put(f"#BSUB -W {self.walltime}\n")

            # Trap kill signals to create sentinel file
            sentinel = os.path.normpath(os.path.join(workdir, self._sentinel))
            put(f'\ntrap "touch {sentinel}" EXIT SIGTERM\n')
            put("\n")

            if self.prerun is not None:
                for call in ensurelist(self.prerun):
                    put(f"{call}\n")
            put(f"\ncd {workdir}\n")
            put(f"{runsh}")

            # Move completed trajectories
            if self.datadir is not None:
                simname = os.path.basename(os.path.normpath(workdir))
                datadir = os.path.abspath(os.path.join(self.datadir, simname))
                os.makedirs(datadir, exist_ok=True)
                put(f"\nmv *.{self.trajext} {datadir}")

        # Read/write/execute for the owner only
        os.chmod(fname, 0o700)
Beispiel #11
0
    def _createJobScript(self, fname, workdir, runsh):
        """Write a bash job script with ``#SBATCH`` directives to ``fname``.

        The script contains the scheduler directives built from the instance
        attributes, a trap that touches the sentinel file, the optional
        ``self.prerun`` commands, a ``cd`` into the working directory and the
        ``runsh`` text. When ``self.datadir`` is set, a sub-directory named
        after the working directory is created there and a final ``mv`` of the
        ``self.trajext`` files into it is appended.

        Parameters
        ----------
        fname : str
            Path of the job script to write
        workdir : str
            Working directory of the job; converted to an absolute path
        runsh : str
            Text written after the ``cd`` line
        """
        workdir = os.path.abspath(workdir)
        with open(fname, "w") as script:
            put = script.write

            put("#!/bin/bash\n")
            put("#\n")
            put(f"#SBATCH --job-name={self.jobname}\n")
            partitions = ",".join(ensurelist(self.partition))
            put(f"#SBATCH --partition={partitions}\n")

            if self.ngpu != 0:
                # The GPU memory constraint is appended to the same directive
                gres = f"gpu:{self.ngpu}"
                if self.gpumemory is not None:
                    gres += f",gpu_mem:{self.gpumemory}"
                put(f"#SBATCH --gres={gres}\n")

            put(f"#SBATCH --cpus-per-task={self.ncpu}\n")
            put(f"#SBATCH --mem={self.memory}\n")
            put(f"#SBATCH --priority={self.priority}\n")
            # Don't use the long version. Depending on SLURM version it's workdir or chdir
            put(f"#SBATCH -D {workdir}\n")
            put(f"#SBATCH --output={self.outputstream}\n")
            put(f"#SBATCH --error={self.errorstream}\n")
            if self.envvars is not None:
                put(f"#SBATCH --export={self.envvars}\n")
            if self.walltime is not None:
                put(f"#SBATCH --time={self.walltime}\n")

            # Mail directives are only written when both settings are present
            if self.mailtype is not None and self.mailuser is not None:
                put(f"#SBATCH --mail-type={self.mailtype}\n")
                put(f"#SBATCH --mail-user={self.mailuser}\n")
            if self.nodelist is not None:
                nodes = ",".join(ensurelist(self.nodelist))
                put(f"#SBATCH --nodelist={nodes}\n")
            if self.exclude is not None:
                excluded = ",".join(ensurelist(self.exclude))
                put(f"#SBATCH --exclude={excluded}\n")
            if self.account is not None:
                put(f"#SBATCH --account={self.account}\n")

            # Trap kill signals to create sentinel file
            sentinel = os.path.normpath(os.path.join(workdir, self._sentinel))
            put(f'\ntrap "touch {sentinel}" EXIT SIGTERM\n')
            put("\n")

            if self.prerun is not None:
                for call in ensurelist(self.prerun):
                    put(f"{call}\n")
            put(f"\ncd {workdir}\n")
            put(f"{runsh}")

            # Move completed trajectories
            if self.datadir is not None:
                simname = os.path.basename(os.path.normpath(workdir))
                datadir = os.path.abspath(os.path.join(self.datadir, simname))
                os.makedirs(datadir, exist_ok=True)
                put(f"\nmv *.{self.trajext} {datadir}")

        # Read/write/execute for the owner only
        os.chmod(fname, 0o700)