コード例 #1
0
ファイル: batch.py プロジェクト: tetukas/EVcouplings
class _Broker(mp.Process):
    """
    Broker process handling dependencies and submission of jobs.

    Requests are read from the broker queue as ``(EJob, args)`` tuples.
    Jobs without dependencies are forwarded to the worker queue directly;
    jobs with dependencies are parked in the pending dictionary until
    their dependencies are resolved. Command metadata is persisted as
    YAML strings in a ``PersistentDict`` keyed by command id.
    """
    def __init__(self,
                 broker_queue,
                 worker_queue,
                 results_queue,
                 results_queue_worker,
                 pending_dict,
                 db_path=None):
        """
        Store the communication queues and open the persistence database.

        Parameters
        ----------
        broker_queue:
            Queue from which ``(EJob, args)`` requests are read
        worker_queue:
            Queue onto which runnable jobs are put
        results_queue:
            Queue used to send answers (and errors) back to the master
        results_queue_worker:
            Stored on the instance; not used by the methods of this class
        pending_dict:
            Mapping of job -> dependent jobs it is waiting for
        db_path: str
            Path handed to ``PersistentDict``
        """
        mp.Process.__init__(self)
        self.__input_queue = broker_queue
        self.__results_queue_master = results_queue
        self.__results_queue_worker = results_queue_worker
        self.__worker_queue = worker_queue
        self.__pending_dict = pending_dict
        self.__db = PersistentDict(db_path)

    def __del__(self):
        """
        Best-effort cleanup: kill the process groups of commands that are
        not finished and close the database.
        """
        try:
            # kill remaining running commands
            # NOTE(review): every other method decodes DB values with
            # yaml.load before indexing them; here the raw value is indexed
            # directly - confirm what RangeIter() yields.
            for k, v in self.__db.RangeIter():
                if v["status"] not in [EStatus.EXIT, EStatus.DONE]:
                    os.killpg(os.getpgid(v["job_id"]), signal.SIGKILL)

            self.__db.close()
        except AttributeError:
            # raised e.g. when __init__ failed before self.__db was set
            pass

    def run(self):
        """
        Main loop of the broker process.

        Polls the input queue with a 0.1 s timeout and dispatches on the
        request type. Whenever the queue is empty the pending jobs are
        re-evaluated. Any other exception is sent, together with its
        formatted traceback, to the master results queue and the loop
        continues.
        """
        while True:
            try:
                args = self.__input_queue.get(True, 0.1)
                ejob, args = args
                if ejob == EJob.STOP:
                    # terminate broker
                    # NOTE(review): this is called from inside the broker
                    # process itself - confirm that terminate() behaves as
                    # intended here.
                    self.terminate()

                elif ejob == EJob.MONITOR:
                    # monitor request
                    self.__results_queue_master.put(self.__monitor(args))

                elif ejob == EJob.CANCEL:
                    # cancel job; the master is acknowledged before the
                    # new status is written to the database
                    status = self.__cancel(args)
                    self.__results_queue_master.put(True)
                    self.__update_status(args.command_id, status)

                elif ejob == EJob.UPDATE:
                    # updating command status (e.g, RUN, EXIT, DONE)
                    # some command has started message is coming from worker process
                    c_id, status = args
                    self.__update_status(c_id, status)

                elif ejob == EJob.PID:
                    # updating process ID of submitted command
                    job_id, p_id = args
                    entry = yaml.load(self.__db[job_id], yaml.RoundTripLoader)
                    entry["job_id"] = p_id
                    self.__db[job_id] = yaml.dump(entry,
                                                  Dumper=yaml.RoundTripDumper)
                    self.__db.sync()

                else:
                    # submitting job to worker
                    job, dependent = args
                    self.__add_command(job)
                    if dependent is not None:
                        self.__pending_dict[job] = dependent
                    else:
                        self.__worker_queue.put(job)

            except queue.Empty:
                # go through pending jobs and update status
                # if one of the dependent jobs terminated with an error pending job is also terminated with error
                # (iterate over a copy because entries are deleted on the fly)
                for job, dependent in list(self.__pending_dict.items()):
                    status = self.__condition_fulfilled(dependent)
                    if job is not None:
                        if status == EStatus.EXIT:
                            # job cannot be called due to termination of dependent jobs
                            self.__update_status(job.command_id, EStatus.EXIT)
                            del self.__pending_dict[job]
                        elif status == EStatus.RUN:
                            self.__worker_queue.put(job)
                            del self.__pending_dict[job]
                    else:
                        # makes sure that a submitted job is properly registered and join works as intended
                        if status == EStatus.RUN:
                            del self.__pending_dict[job]

            except Exception as e:
                tb = traceback.format_exc()
                self.__results_queue_master.put((e, tb))

    def __condition_fulfilled(self, dependent):
        """
        Check whether all jobs in ``dependent`` have finished.

        Parameters
        ----------
        dependent: iterable of Command
            The jobs that have to be finished

        Returns
        -------
        EStatus
            EXIT as soon as one job is found in state EXIT, PEND as soon
            as one job is found that is not DONE, RUN if all jobs are
            DONE (i.e. the waiting job may be started)
        """
        for d in dependent:
            status = self.__monitor(d)
            if status == EStatus.EXIT:
                return EStatus.EXIT
            if status != EStatus.DONE:
                return EStatus.PEND
        return EStatus.RUN

    def __add_command(self, command):
        """
        Register a command in the database with status PEND.

        An existing entry is updated and its ``tries`` counter increased;
        otherwise a new entry with ``tries = 1`` is created. The database
        is synced in both cases.

        Parameters
        ----------
        command: Command
            The Command object
        """
        try:
            # update entry if existing:
            entry = yaml.load(self.__db[command.command_id],
                              yaml.RoundTripLoader)
            entry["name"] = command.name
            entry["tries"] += 1
            entry["job_id"] = None
            entry["status"] = EStatus.PEND
            entry["command"] = command.command
            entry["resources"] = command.resources
            entry["workdir"] = command.workdir
            entry["environment"] = command.environment
        except KeyError:
            # add new entry
            entry = {
                "name": command.name,
                "job_id": None,
                "tries": 1,
                "status": EStatus.PEND,
                "command": command.command,
                "resources": command.resources,
                "workdir": command.workdir,
                "environment": command.environment
            }

        # write and sync on both paths (previously a re-submitted command
        # was written but never synced)
        self.__db[command.command_id] = yaml.dump(
            entry, Dumper=yaml.RoundTripDumper)
        self.__db.sync()

    def __cancel(self, command):
        """
        Cancel a command.

        Pending commands are removed from the pending dictionary, running
        or suspended commands get their process group killed with SIGKILL.

        Parameters
        ----------
        command: Command
            The Command object

        Returns
        -------
        EStatus
            DONE if the command had already finished, EXIT otherwise

        Raises
        ------
        ValueError
            If the command is not known to the database
        """
        try:
            entry = yaml.load(self.__db[command.command_id],
                              yaml.RoundTripLoader)
        except KeyError:
            raise ValueError("Command " + repr(command.command_id) +
                             " has not been submitted yet.")
        p_id = entry["job_id"]
        status = self.__monitor(command)
        if status == EStatus.PEND:
            del self.__pending_dict[command]
        elif status in [EStatus.SUSP, EStatus.RUN]:
            os.killpg(os.getpgid(p_id), signal.SIGKILL)
        elif status == EStatus.DONE:
            return status
        return EStatus.EXIT

    def __monitor(self, command):
        """
        local function to monitor a command via assigned pID

        The determined status is also written back to the database.

        Parameters
        ----------
        command: Command
            The Command object

        Returns
        -------
        EStatus
            The status of the command

        Raises
        ------
        ValueError
            If the command is not known to the database
        """
        if command in self.__pending_dict:
            return EStatus.PEND

        try:
            entry = yaml.load(self.__db[command.command_id],
                              yaml.RoundTripLoader)
        except KeyError:
            raise ValueError("Command " + repr(command.command_id) +
                             " has not been submitted yet.")

        p_id = entry["job_id"]
        cmd = entry["command"][0]

        try:
            # if the process has already finished this raises NoSuchProcess
            # NOTE(review): job_id is None until a PID message arrived;
            # psutil.Process(pid=None) then inspects the broker process
            # itself - confirm this case is handled as intended.
            p = psutil.Process(pid=p_id)
            status = p.status()
            p_cmd = " ".join(p.cmdline())

            # test status types # if status is ZOMBIE also kill the job
            # pid can be already in use by other process... one has to check the command as well....
            # if p_cmd is different than the original process is probably completed
            if cmd not in p_cmd:
                c_stat = EStatus.DONE

            elif status in [psutil.STATUS_DEAD, psutil.STATUS_ZOMBIE]:
                p.kill()
                c_stat = EStatus.EXIT

            elif status == psutil.STATUS_RUNNING:
                # a running process is reported as RUN (it used to be
                # reported as SUSP like every other remaining state)
                c_stat = EStatus.RUN
            else:
                c_stat = EStatus.SUSP
        except psutil.NoSuchProcess:
            c_stat = EStatus.DONE
        except psutil.AccessDenied:
            c_stat = EStatus.RUN

        entry["status"] = c_stat
        self.__db[command.command_id] = yaml.dump(entry,
                                                  Dumper=yaml.RoundTripDumper)
        self.__db.sync()
        return c_stat

    def __update_status(self, c_id, status):
        """
        updates the status of a command

        Parameters
        ----------
        c_id: str
            The Command id
        status: EStatus
            The  new status

        Raises
        ------
        ValueError
            If the command is not known to the database
        """
        try:
            entry = yaml.load(self.__db[c_id], yaml.RoundTripLoader)
        except KeyError:
            raise ValueError("Command " + repr(c_id) +
                             " has not been submitted yet.")
        entry["status"] = status
        self.__db[c_id] = yaml.dump(entry, Dumper=yaml.RoundTripDumper)
        self.__db.sync()
コード例 #2
0
ファイル: batch.py プロジェクト: tetukas/EVcouplings
class SlurmSubmitter(AClusterSubmitter):
    """
    Implements a Slurm submitter
    """
    __name = "slurm"
    __submit = "sbatch --job-name={name} {dependent} {resources} --wrap 'srun {cmd}'"
    __monitor = "squeue -t all -j {job_id}"
    __cancel = "scancel {job_id}"
    __resources = ""
    # command line flag used for each resource type
    __resources_flag = {
        EResource.queue: "-p",
        EResource.time: "-t",
        EResource.mem: "--mem-per-cpu",
        EResource.nodes: "-c",
        EResource.error: "-e",
        EResource.out: "-o"
    }
    # extracts the job id from the output of the submit command
    __job_id_pattern = re.compile(r"Submitted batch job ([0-9]*)")

    def __init__(self, blocking=False, db_path=None):
        """
        Init function

        If no database path is given, a temporary ".db" file is created
        in the current working directory and removed again in __del__.

        Parameters
        ----------
        blocking: bool
            determines whether join() blocks or not
        db_path: str
            the string to a LevelDB for command persistence
        """
        self.__blocking = blocking
        if db_path is None:
            tmp_db = NamedTemporaryFile(delete=False,
                                        dir=os.getcwd(),
                                        suffix=".db")
            tmp_db.close()
            self.__is_temp_db = True
            self.__db_path = tmp_db.name
        else:
            self.__is_temp_db = False
            self.__db_path = db_path

        self.__db = PersistentDict(self.__db_path)

    def __del__(self):
        """
        Close the database and delete it if it was a temporary one.
        """
        try:
            self.__db.close()
            if self.__is_temp_db:
                os.remove(self.__db_path)
        except AttributeError:
            # raised e.g. when __init__ did not complete
            pass

    @property
    def isBlocking(self):
        """Whether join() blocks (as passed to the constructor)."""
        return self.__blocking

    @property
    def name(self):
        """Name of the submitter ("slurm")."""
        return self.__name

    @property
    def monitor_command(self):
        """Command template used to monitor a job."""
        return self.__monitor

    @property
    def resource_flags(self):
        """Mapping from EResource to the corresponding command line flag."""
        return self.__resources_flag

    @property
    def submit_command(self):
        """Command template used to submit a job."""
        return self.__submit

    @property
    def db(self):
        """The persistent command database."""
        return self.__db

    @property
    def cancel_command(self):
        """Command template used to cancel a job."""
        return self.__cancel

    @property
    def job_id_pattern(self):
        """Compiled pattern extracting the job id from the submit output."""
        return self.__job_id_pattern

    def _prepare_dependencies(self, dependent):
        """
        Build the dependency part of the submit command.

        Parameters
        ----------
        dependent: Command or iterable of Command or None
            The command(s) the new job has to wait for

        Returns
        -------
        str
            "" if there is nothing to wait for, otherwise
            "--kill-on-invalid-dep=yes --dependency=afterok:<id>[:<id>...]"

        Raises
        ------
        ValueError
            If one of the commands is not in the database
        """
        dep = ""
        if dependent is not None:
            try:
                if isinstance(dependent, Command):
                    d_info = yaml.load(self.__db[dependent.command_id],
                                       yaml.RoundTripLoader)
                    dep_jobs = [d_info["job_id"]]
                else:
                    dep_jobs = []
                    for d in dependent:
                        d_info = yaml.load(self.__db[d.command_id],
                                           yaml.RoundTripLoader)
                        dep_jobs.append(d_info["job_id"])
            except KeyError:
                raise ValueError(
                    "Specified depended jobs have not been submitted yet.")

            if dep_jobs:
                # Slurm takes plain job ids separated by colons; the
                # "ended(<id>)" form used previously is LSF syntax
                dep = "--kill-on-invalid-dep=yes --dependency=afterok:{}".format(
                    ":".join(str(d) for d in dep_jobs))
        return dep

    def _prepare_resources(self, resources):
        """
        Build the resource part of the submit command.

        Parameters
        ----------
        resources: dict
            Mapping EResource -> value

        Returns
        -------
        str
            Space-separated "<flag> <value>" pairs
        """
        return " ".join("{} {}".format(self.resource_flags[k], v)
                        for k, v in resources.items())

    def _get_status(self, stdo):
        """
        Translate the output of the monitor command into an EStatus.

        Parameters
        ----------
        stdo: str
            Standard output of the monitor command

        Returns
        -------
        EStatus
            The status of the job
        """
        def status_map(st):
            if st in ["PD", "CF"]:
                return EStatus.PEND
            elif st in ["R", "CG"]:
                return EStatus.RUN
            elif st == "CD":
                return EStatus.DONE
            elif st in ["BF", "PR", "TO", "NF", "F", "CA"]:
                return EStatus.EXIT
            else:
                return EStatus.SUSP

        # second line, fifth column of the output
        # (assumes the default squeue layout with the state in column 5
        # and the job listed on line 2 - TODO confirm)
        return status_map(stdo.split("\n")[1].split()[4].strip())
コード例 #3
0
ファイル: batch.py プロジェクト: tetukas/EVcouplings
class SGESubmitter(AClusterSubmitter):
    """
    Implements an SGE submitter
    """
    __name = "sge"
    __submit = "echo '{cmd}' | qsub -N {name} {dependent} {resources}"
    __monitor = "qstat"
    __cancel = "qdel {job_id}"
    __resources = ""
    # command line flag used for each resource type; the flags ending in
    # "=" are joined to their value without a space (see _prepare_resources)
    __resources_flag = {
        EResource.queue: "-q",
        EResource.time: '-l h_rt=',
        EResource.mem: '-l h_vmem=',
        EResource.nodes: "-pe smp",
        EResource.error: "-e",
        EResource.out: "-o"
    }
    # extracts the job id from the output of the submit command
    __job_id_pattern = re.compile(r'Your job ([0-9]+) .*')

    def __init__(self, blocking=False, db_path=None):
        """
        Init function

        If no database path is given, a temporary ".db" file is created
        in the current working directory and removed again in __del__.

        Parameters
        ----------
        blocking: bool
            determines whether join() blocks or not
        db_path: str
            the string to a LevelDB for command persistence
        """
        self.__blocking = blocking
        if db_path is None:
            tmp_db = NamedTemporaryFile(delete=False,
                                        dir=os.getcwd(),
                                        suffix=".db")
            tmp_db.close()
            self.__is_temp_db = True
            self.__db_path = tmp_db.name
        else:
            self.__is_temp_db = False
            self.__db_path = db_path

        self.__db = PersistentDict(self.__db_path)

    def __del__(self):
        """
        Close the database and delete it if it was a temporary one.
        """
        try:
            self.__db.close()
            if self.__is_temp_db:
                os.remove(self.__db_path)
        except AttributeError:
            # raised e.g. when __init__ did not complete
            pass

    @property
    def isBlocking(self):
        """Whether join() blocks (as passed to the constructor)."""
        return self.__blocking

    @property
    def name(self):
        """Name of the submitter ("sge")."""
        return self.__name

    @property
    def monitor_command(self):
        """Command template used to monitor jobs."""
        return self.__monitor

    @property
    def resource_flags(self):
        """Mapping from EResource to the corresponding command line flag."""
        return self.__resources_flag

    @property
    def submit_command(self):
        """Command template used to submit a job."""
        return self.__submit

    @property
    def db(self):
        """The persistent command database."""
        return self.__db

    @property
    def cancel_command(self):
        """Command template used to cancel a job."""
        return self.__cancel

    @property
    def job_id_pattern(self):
        """Compiled pattern extracting the job id from the submit output."""
        return self.__job_id_pattern

    def _prepare_resources(self, resources):
        """
        Build the resource part of the submit command.

        Parameters
        ----------
        resources: dict
            Mapping EResource -> value

        Returns
        -------
        str
            Space-separated resource options; memory and time are written
            as "<flag><value>", all others as "<flag> <value>"
        """
        special_res = {EResource.mem, EResource.time}
        return " ".join(
            "{} {}".format(self.resource_flags[k], v) if k not in
            special_res else "{}{}".format(self.resource_flags[k], v)
            for k, v in resources.items())

    def _prepare_dependencies(self, dependent):
        """
        Build the dependency part of the submit command.

        Parameters
        ----------
        dependent: Command or iterable of Command or None
            The command(s) the new job has to wait for

        Returns
        -------
        str
            "" if there is nothing to wait for, otherwise
            "-hold_jid <id>[,<id>...]"

        Raises
        ------
        ValueError
            If one of the commands is not in the database
        """
        dep = ""
        if dependent is not None:
            try:
                if isinstance(dependent, Command):
                    d_info = yaml.load(self.__db[dependent.command_id],
                                       yaml.RoundTripLoader)
                    dep_jobs = [d_info["job_id"]]
                else:
                    dep_jobs = []
                    for d in dependent:
                        d_info = yaml.load(self.__db[d.command_id],
                                           yaml.RoundTripLoader)
                        dep_jobs.append(d_info["job_id"])
            except KeyError:
                raise ValueError(
                    "Specified depended jobs have not been submitted yet.")

            if dep_jobs:
                # str() so that job ids that are not strings do not make
                # join() fail
                dep = "-hold_jid " + ",".join(str(d) for d in dep_jobs)
        return dep

    def _internal_monitor(self, command_id):
        """
        Query the status of a command and store it in the database.

        Parameters
        ----------
        command_id: str
            The Command id

        Returns
        -------
        EStatus
            The status of the command

        Raises
        ------
        ValueError
            If the command is not known to the database
        RuntimeError
            If the monitor command cannot be run or exits with a
            non-zero return code
        """
        try:
            job_id = yaml.load(self.db[command_id],
                               yaml.RoundTripLoader)["job_id"]
        except KeyError:
            raise ValueError("Command " + repr(command_id) +
                             " has not been submitted yet.")

        submit = self.monitor_command.format(job_id=job_id)

        try:
            p = subprocess.Popen(submit,
                                 shell=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
            stdo, stde = p.communicate()
            stdr = p.returncode
        except Exception as e:
            raise RuntimeError(e) from e

        # any non-zero return code is a failure, including the negative
        # codes reported when the process was killed by a signal
        if stdr != 0:
            raise RuntimeError("Unsuccessful monitoring of " +
                               repr(command_id) +
                               " (EXIT!=0) with error: " + stde)

        status = self._get_status(stdo, job_id)

        entry = yaml.load(self.db[command_id], yaml.RoundTripLoader)
        entry["status"] = status
        self.db[command_id] = yaml.dump(entry, Dumper=yaml.RoundTripDumper)
        self.db.sync()

        return status

    def _get_status(self, stdo, job_id):
        """
        Extract the status of a job from the output of the monitor command.

        Parameters
        ----------
        stdo: str
            Standard output of the monitor command
        job_id:
            The job id; compared with == to the first column of each
            line (presumably a string - verify against the caller)

        Returns
        -------
        EStatus
            The mapped status of the job, DONE if the job is not listed
        """
        def status_map(status):
            if status == "r":
                return EStatus.RUN
            elif status == "qw":
                return EStatus.PEND
            # NOTE(review): the SGE error state of a queued job is usually
            # spelled "Eqw"; "Ewq" looks like a typo - confirm
            elif status in ["Ewq", "e", "E"]:
                return EStatus.SUSP
            else:
                return EStatus.EXIT

        # search in list for job_id and extract status (fifth column)
        for l in stdo.split("\n"):
            if "" == l.strip():
                continue
            splits = l.split()
            if job_id == splits[0]:
                return status_map(splits[4])
        return EStatus.DONE
コード例 #4
0
ファイル: batch.py プロジェクト: tetukas/EVcouplings
class LSFSubmitter(AClusterSubmitter):
    """
    Implements an LSF submitter
    """
    __name = "lsf"
    __submit = "bsub -J {name} {dependent} {resources} '{cmd}'"
    __monitor = "bjobs {job_id}"
    __cancel = "bkill {job_id}"
    __resources = ""
    # command line flag used for each resource type
    __resources_flag = {
        EResource.queue: "-q",
        EResource.time: "-W",
        EResource.mem: "-R",
        EResource.nodes: "-n",
        EResource.error: "-e",
        EResource.out: "-o"
    }
    # extracts the job id from the output of the submit command
    __job_id_pattern = re.compile(r"^Job <([0-9]*)>")

    def __init__(self, blocking=False, db_path=None):
        """
        Init function

        If no database path is given, a temporary ".db" file is created
        in the current working directory and removed again in __del__.

        Parameters
        ----------
        blocking: bool
            determines whether join() blocks or not
        db_path: str
            the string to a LevelDB for command persistence
        """
        self.__blocking = blocking
        if db_path is None:
            tmp_db = NamedTemporaryFile(delete=False,
                                        dir=os.getcwd(),
                                        suffix=".db")
            tmp_db.close()
            self.__is_temp_db = True
            self.__db_path = tmp_db.name
        else:
            self.__is_temp_db = False
            self.__db_path = db_path

        self.__db = PersistentDict(self.__db_path)

    def __del__(self):
        """
        Close the database and delete it if it was a temporary one.
        """
        try:
            self.__db.close()
            if self.__is_temp_db:
                os.remove(self.__db_path)
        except AttributeError:
            # raised e.g. when __init__ did not complete
            pass

    @property
    def isBlocking(self):
        """Whether join() blocks (as passed to the constructor)."""
        return self.__blocking

    @property
    def name(self):
        """Name of the submitter ("lsf")."""
        return self.__name

    @property
    def monitor_command(self):
        """Command template used to monitor a job."""
        return self.__monitor

    @property
    def resource_flags(self):
        """Mapping from EResource to the corresponding command line flag."""
        return self.__resources_flag

    @property
    def submit_command(self):
        """Command template used to submit a job."""
        return self.__submit

    @property
    def db(self):
        """The persistent command database."""
        return self.__db

    @property
    def cancel_command(self):
        """Command template used to cancel a job."""
        return self.__cancel

    @property
    def job_id_pattern(self):
        """Compiled pattern extracting the job id from the submit output."""
        return self.__job_id_pattern

    def _get_status(self, stdo):
        """
        Translate the output of the monitor command into an EStatus.

        Parameters
        ----------
        stdo: str
            Standard output of the monitor command

        Returns
        -------
        EStatus
            The status of the job; every state other than PEND, RUN,
            DONE and EXIT is reported as SUSP
        """
        def status_map(st):
            if st == "PEND":
                return EStatus.PEND
            elif st == "RUN":
                return EStatus.RUN
            elif st == "DONE":
                return EStatus.DONE
            elif st == "EXIT":
                return EStatus.EXIT
            else:
                return EStatus.SUSP

        # second line, third column of the output
        # (assumes the default bjobs layout with the state in column 3
        # and the job listed on line 2 - TODO confirm)
        return status_map(stdo.split("\n")[1].split()[2].strip())

    def _prepare_dependencies(self, dependent):
        """
        Build the dependency part of the submit command.

        Parameters
        ----------
        dependent: Command or iterable of Command or None
            The command(s) the new job has to wait for

        Returns
        -------
        str
            "" if there is nothing to wait for, "-w <id>" for a single
            Command, otherwise "-w 'ended(<id>) && ended(<id>) ...'"

        Raises
        ------
        ValueError
            If one of the commands is not in the database
        """
        dep = ""
        if dependent is not None:
            try:
                if isinstance(dependent, Command):
                    d_info = yaml.load(self.__db[dependent.command_id],
                                       yaml.RoundTripLoader)
                    dep = "-w {}".format(d_info["job_id"])
                else:
                    dep_jobs = []
                    for d in dependent:
                        d_info = yaml.load(self.__db[d.command_id],
                                           yaml.RoundTripLoader)
                        dep_jobs.append(d_info["job_id"])
                    if dep_jobs:
                        # the expression is quoted because it ends up on
                        # a shell command line, where bare "&&" and
                        # parentheses would be interpreted by the shell
                        # NOTE(review): the single-job branch passes a
                        # bare id while this one uses ended() - confirm
                        # that both are meant to wait for the same
                        # condition
                        dep = "-w '{}'".format(
                            " && ".join("ended({})".format(d)
                                        for d in dep_jobs))
            except KeyError:
                raise ValueError(
                    "Specified depended jobs have not been submitted yet.")
        return dep

    def _prepare_resources(self, resources):
        """
        Build the resource part of the submit command.

        Parameters
        ----------
        resources: dict
            Mapping EResource -> value

        Returns
        -------
        str
            Space-separated "<flag> <value>" pairs; memory is written as
            "<flag> 'rusage[mem=<value>]'"
        """
        return " ".join(
            "{} {}".format(self.resource_flags[k], v) if k != EResource.mem
            else "{} 'rusage[mem={}]'".format(self.resource_flags[k], v)
            for k, v in resources.items())