Ejemplo n.º 1
0
    def release(arg):
        """Release a job that was previously on hold so it will be submitted
        to a compute node.

        Parameters
        ----------
        arg : int | list | tuple | str
            SLURM job id(s) to release. A list/tuple is processed one element
            at a time. The string 'all' (case-insensitive) releases every job
            in the SLURM.squeue() printout whose status column is 'PD' and
            whose last column contains 'jobheld'.

        Raises
        ------
        ExecutionError
            If arg is not an int, a list/tuple, or the string 'all'.
        """

        if isinstance(arg, (list, tuple)):
            for jid in arg:
                SLURM.release(jid)

        elif isinstance(arg, int):
            SLURM.scontrol('release {}'.format(arg))

        elif str(arg).lower() == 'all':
            # NOTE(review): squeue() is assumed to return a list of text rows
            # with the header first; "or []" tolerates an empty/None result
            # instead of failing on the slice below -- confirm return type.
            sq = SLURM.squeue() or []
            for row in sq[1:]:
                # split() without a separator collapses runs of whitespace
                cols = row.split()
                if len(cols) < 5:
                    # blank or truncated row: nothing to parse, so skip it
                    # rather than raising IndexError
                    continue

                job_id = int(cols[0])
                status = cols[4]
                reason = cols[-1]
                if status == 'PD' and 'jobheld' in reason.lower():
                    SLURM.release(job_id)

        else:
            e = ('Could not release: {} with type {}'.format(arg, type(arg)))
            logger.error(e)
            raise ExecutionError(e)
Ejemplo n.º 2
0
    def __init__(self, obj, execution_iter, n_workers=None, mem_util_lim=0.7):
        """Single node parallel compute manager with smart data flushing.

        Parameters
        ----------
        obj : object
            Python object that will be submitted to futures. It must provide
            run(arg) and flush(); this is checked here with hasattr().
            run(arg) must accept the iteration result of execution_iter as
            its single positional argument. The results of obj.run(arg) will
            be passed to obj.out, and obj.out will be passed None when the
            memory is to be cleared. Making obj.run() a @staticmethod is
            advisable for dramatically faster submission in parallel.
        execution_iter : iter
            Python iterator that controls the futures submitted in parallel.
        n_workers : int
            Number of workers to use in parallel. None will use all
            available workers.
        mem_util_lim : float
            Memory utilization limit (fractional). If the used memory divided
            by the total memory is greater than this value, the obj.out will
            be flushed and the local node memory will be cleared.

        Raises
        ------
        ExecutionError
            If obj is missing a "run" or a "flush" attribute.
        """

        has_interface = hasattr(obj, 'run') and hasattr(obj, 'flush')
        if not has_interface:
            msg = ('Parallel execution with object: "{}" '
                   'failed. The target object must have methods '
                   'run() and flush()')
            raise ExecutionError(msg.format(obj))

        # Inputs are stored as given; nothing is executed at construction.
        self._obj, self._execution_iter = obj, execution_iter
        self._n_workers, self._mem_util_lim = n_workers, mem_util_lim
Ejemplo n.º 3
0
    def change_qos(arg, qos):
        """Change the priority (quality of service) for a job.

        Parameters
        ----------
        arg : int | list | tuple | str
            SLURM job id(s) to change qos for. A list/tuple is processed one
            element at a time. The string 'all' (case-insensitive) updates
            every job in the SLURM.squeue() printout whose status column is
            'PD'.
        qos : str
            New qos value

        Raises
        ------
        ExecutionError
            If arg is not an int, a list/tuple, or the string 'all'.
        """

        if isinstance(arg, (list, tuple)):
            for jid in arg:
                SLURM.change_qos(jid, qos)

        elif isinstance(arg, int):
            SLURM.scontrol('update job {} QOS={}'.format(arg, qos))

        elif str(arg).lower() == 'all':
            # NOTE(review): squeue() is assumed to return a list of text rows
            # with the header first; "or []" tolerates an empty/None result
            # instead of failing on the slice below -- confirm return type.
            sq = SLURM.squeue() or []
            for row in sq[1:]:
                # split() without a separator collapses runs of whitespace
                cols = row.split()
                if len(cols) < 5:
                    # blank or truncated row: nothing to parse, so skip it
                    # rather than raising IndexError
                    continue

                job_id = int(cols[0])
                status = cols[4]
                if status == 'PD':
                    SLURM.change_qos(job_id, qos)

        else:
            e = ('Could not change qos of: {} with type {}'.format(
                arg, type(arg)))
            logger.error(e)
            raise ExecutionError(e)
Ejemplo n.º 4
0
    def hold(arg):
        """Temporarily hold a job from submitting. Held jobs will stay in queue
        but will not get nodes until released.

        Parameters
        ----------
        arg : int | list | tuple | str
            SLURM job id(s) to hold. A list/tuple is processed one element at
            a time. The string 'all' (case-insensitive) holds every job in
            the SLURM.squeue() printout whose status column is 'PD'.

        Raises
        ------
        ExecutionError
            If arg is not an int, a list/tuple, or the string 'all'.
        """

        if isinstance(arg, (list, tuple)):
            for jid in arg:
                SLURM.hold(jid)

        elif isinstance(arg, int):
            SLURM.scontrol('hold {}'.format(arg))

        elif str(arg).lower() == 'all':
            # NOTE(review): squeue() is assumed to return a list of text rows
            # with the header first; "or []" tolerates an empty/None result
            # instead of failing on the slice below -- confirm return type.
            sq = SLURM.squeue() or []
            for row in sq[1:]:
                # split() without a separator collapses runs of whitespace
                cols = row.split()
                if len(cols) < 5:
                    # blank or truncated row: nothing to parse, so skip it
                    # rather than raising IndexError
                    continue

                job_id = int(cols[0])
                status = cols[4]
                if status == 'PD':
                    SLURM.hold(job_id)

        else:
            e = ('Could not hold: {} with type {}'.format(arg, type(arg)))
            logger.error(e)
            raise ExecutionError(e)
Ejemplo n.º 5
0
    def scancel(arg):
        """Cancel a slurm job.

        Parameters
        ----------
        arg : int | list | tuple | str
            SLURM job id(s) to cancel. Can be a list/tuple of job ids
            (processed one element at a time), 'all' (case-insensitive) to
            cancel every job in the SLURM.squeue() printout, or any other
            int/string, which is appended to the "scancel" command line
            (e.g. a feature such as "-p short").

        Raises
        ------
        ExecutionError
            If arg is not a list/tuple, an int or a str.
        """

        if isinstance(arg, (list, tuple)):
            for jid in arg:
                SLURM.scancel(jid)

        elif str(arg).lower() == 'all':
            # NOTE(review): squeue() is assumed to return a list of text rows
            # with the header first; "or []" tolerates an empty/None result
            # instead of failing on the slice below -- confirm return type.
            sq = SLURM.squeue() or []
            for row in sq[1:]:
                # split() without a separator collapses runs of whitespace
                cols = row.split()
                if not cols:
                    # blank row: int('') would raise ValueError, so skip it
                    continue

                SLURM.scancel(int(cols[0]))

        elif isinstance(arg, (int, str)):
            # The command is tokenized and run as an argument list.
            cmd = shlex.split('scancel {}'.format(arg))
            call(cmd)

        else:
            e = ('Could not cancel: {} with type {}'.format(arg, type(arg)))
            logger.error(e)
            raise ExecutionError(e)
Ejemplo n.º 6
0
    def release(self, arg):
        """Release held job(s) so they can be submitted to a compute node.

        Parameters
        ----------
        arg : int | list | str
            SLURM integer job id(s) to release. A list/tuple is handled one
            element at a time. 'all' (case-insensitive) releases every job
            in self.queue that is pending ('pd') with 'jobheld' in its
            'NODELIST(REASON)' entry.

        Raises
        ------
        ExecutionError
            If arg is not an int, a list/tuple, or the string 'all'.
        """

        if isinstance(arg, (list, tuple)):
            for job_id in arg:
                self.release(job_id)
            return

        if isinstance(arg, int):
            self.scontrol('release {}'.format(arg))
            return

        if str(arg).lower() != 'all':
            msg = 'Could not release: {} with type {}'.format(arg, type(arg))
            logger.error(msg)
            raise ExecutionError(msg)

        # Reset the stored queue before reading self.queue
        # (presumably so that the property re-queries SLURM).
        self._queue = None
        for job_id, attrs in self.queue.items():
            is_pending = attrs[self.QCOL_STATUS].lower() == 'pd'
            is_held = 'jobheld' in attrs['NODELIST(REASON)'].lower()
            if is_pending and is_held:
                self.release(job_id)
Ejemplo n.º 7
0
    def hold(self, arg):
        """Put job(s) on hold: they stay in the queue but will not get nodes
        until released.

        Parameters
        ----------
        arg : int | list | str
            SLURM integer job id(s) to hold. A list/tuple is handled one
            element at a time. 'all' (case-insensitive) holds every job in
            self.queue whose status is pending ('pd').

        Raises
        ------
        ExecutionError
            If arg is not an int, a list/tuple, or the string 'all'.
        """

        if isinstance(arg, (list, tuple)):
            for job_id in arg:
                self.hold(job_id)
            return

        if isinstance(arg, int):
            self.scontrol('hold {}'.format(arg))
            return

        if str(arg).lower() != 'all':
            msg = 'Could not hold: {} with type {}'.format(arg, type(arg))
            logger.error(msg)
            raise ExecutionError(msg)

        # Reset the stored queue before reading self.queue
        # (presumably so that the property re-queries SLURM).
        self._queue = None
        for job_id, attrs in self.queue.items():
            if attrs[self.QCOL_STATUS].lower() == 'pd':
                self.hold(job_id)
Ejemplo n.º 8
0
    def change_qos(self, arg, qos):
        """Update the quality of service (priority) of job(s).

        Parameters
        ----------
        arg : int | list | str
            SLURM integer job id(s) to change qos for. A list/tuple is
            handled one element at a time. 'all' (case-insensitive) updates
            every job in self.queue whose status is pending ('pd').
        qos : str
            New qos value

        Raises
        ------
        ExecutionError
            If arg is not an int, a list/tuple, or the string 'all'.
        """

        if isinstance(arg, (list, tuple)):
            for job_id in arg:
                self.change_qos(job_id, qos)
            return

        if isinstance(arg, int):
            self.scontrol('update job {} QOS={}'.format(arg, qos))
            return

        if str(arg).lower() != 'all':
            msg = ('Could not change qos of: {} with type {}'
                   .format(arg, type(arg)))
            logger.error(msg)
            raise ExecutionError(msg)

        # Reset the stored queue before reading self.queue
        # (presumably so that the property re-queries SLURM).
        self._queue = None
        for job_id, attrs in self.queue.items():
            if attrs[self.QCOL_STATUS].lower() == 'pd':
                self.change_qos(job_id, qos)
Ejemplo n.º 9
0
    def scancel(self, arg):
        """Cancel slurm job(s).

        Parameters
        ----------
        arg : int | list | str
            SLURM integer job id(s) to cancel. A list/tuple is handled one
            element at a time. 'all' (case-insensitive) cancels every id in
            self.queue_job_ids. Any other int/string is appended to the
            "scancel" command line (e.g. a feature such as "-p short").

        Raises
        ------
        ExecutionError
            If arg is not a list/tuple, an int or a str.
        """

        if isinstance(arg, (list, tuple)):
            for job_id in arg:
                self.scancel(job_id)
            return

        if str(arg).lower() == 'all':
            # Reset the stored queue before reading the job ids
            # (presumably so that the queue is re-queried from SLURM).
            self._queue = None
            for job_id in self.queue_job_ids:
                self.scancel(job_id)
            return

        if not isinstance(arg, (int, str)):
            msg = 'Could not cancel: {} with type {}'.format(arg, type(arg))
            logger.error(msg)
            raise ExecutionError(msg)

        # The command is tokenized and run as an argument list.
        tokens = shlex.split('scancel {}'.format(arg))
        subprocess.call(tokens)