def release(arg):
    """Release held SLURM job(s) so they become eligible to run again.

    Parameters
    ----------
    arg : int | list | str
        SLURM job id(s) to release. A list or tuple is released one entry
        at a time. The string 'all' (case-insensitive) releases every job
        in the squeue listing that is pending ('PD') and whose last column
        mentions 'jobheld'.

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or the string 'all'.
    """
    # Collections: hand each entry back to SLURM.release individually.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            SLURM.release(entry)
        return

    # Single integer job id: one scontrol call.
    if isinstance(arg, int):
        SLURM.scontrol('release {}'.format(arg))
        return

    # Anything left must be the 'all' keyword.
    if str(arg).lower() != 'all':
        msg = ('Could not release: {} with type {}'.format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # First squeue row is skipped (header), remaining rows are
    # space-delimited job records.
    for line in SLURM.squeue()[1:]:
        fields = [tok for tok in line.strip().split(' ') if tok]
        queued_id = int(fields[0])
        state, why = fields[4], fields[-1]
        if state == 'PD' and 'jobheld' in why.lower():
            SLURM.release(queued_id)
def __init__(self, obj, execution_iter, n_workers=None, mem_util_lim=0.7):
    """Single node parallel compute manager with smart data flushing.

    Parameters
    ----------
    obj : object
        Python object that will be submitted to futures. It must expose
        the methods run(arg) and flush(). run(arg) must accept the
        iteration result of execution_iter as its single positional
        argument. The results of obj.run(arg) will be passed to obj.out,
        and obj.out will be passed None when the memory is to be cleared.
        Making obj.run() a @staticmethod is advisable for dramatically
        faster submission in parallel.
    execution_iter : iter
        Python iterator that controls the futures submitted in parallel.
    n_workers : int
        Number of workers to use in parallel. None will use all available
        workers.
    mem_util_lim : float
        Memory utilization limit (fractional). If the used memory divided
        by the total memory exceeds this value, obj.out will be flushed
        and the local node memory will be cleared.

    Raises
    ------
    ExecutionError
        If obj is missing either a run or a flush attribute.
    """
    # Validate the target interface up front, before storing anything.
    required = ('run', 'flush')
    if not all(hasattr(obj, name) for name in required):
        template = ('Parallel execution with object: "{}" '
                    'failed. The target object must have methods '
                    'run() and flush()')
        raise ExecutionError(template.format(obj))

    self._obj = obj
    self._execution_iter = execution_iter
    self._n_workers = n_workers
    self._mem_util_lim = mem_util_lim
def change_qos(arg, qos):
    """Set a new quality of service (priority) on SLURM job(s).

    Parameters
    ----------
    arg : int | list | str
        SLURM job id(s) to update. A list or tuple is processed one entry
        at a time. The string 'all' (case-insensitive) updates every job
        in the squeue listing whose status is pending ('PD').
    qos : str
        New qos value.

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or the string 'all'.
    """
    # Collections: hand each entry back to SLURM.change_qos individually.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            SLURM.change_qos(entry, qos)
        return

    # Single integer job id: one scontrol update call.
    if isinstance(arg, int):
        SLURM.scontrol('update job {} QOS={}'.format(arg, qos))
        return

    # Anything left must be the 'all' keyword.
    if str(arg).lower() != 'all':
        msg = ('Could not change qos of: {} with type {}'.format(
            arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # First squeue row is skipped (header), remaining rows are
    # space-delimited job records.
    for line in SLURM.squeue()[1:]:
        fields = [tok for tok in line.strip().split(' ') if tok]
        queued_id = int(fields[0])
        state = fields[4]
        if state == 'PD':
            SLURM.change_qos(queued_id, qos)
def hold(arg):
    """Place SLURM job(s) on hold.

    Held jobs will stay in queue but will not get nodes until released.

    Parameters
    ----------
    arg : int | list | str
        SLURM job id(s) to hold. A list or tuple is processed one entry at
        a time. The string 'all' (case-insensitive) holds every job in the
        squeue listing whose status is pending ('PD').

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or the string 'all'.
    """
    # Collections: hand each entry back to SLURM.hold individually.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            SLURM.hold(entry)
        return

    # Single integer job id: one scontrol call.
    if isinstance(arg, int):
        SLURM.scontrol('hold {}'.format(arg))
        return

    # Anything left must be the 'all' keyword.
    if str(arg).lower() != 'all':
        msg = ('Could not hold: {} with type {}'.format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # First squeue row is skipped (header), remaining rows are
    # space-delimited job records.
    for line in SLURM.squeue()[1:]:
        fields = [tok for tok in line.strip().split(' ') if tok]
        queued_id = int(fields[0])
        state = fields[4]
        if state == 'PD':
            SLURM.hold(queued_id)
def scancel(arg):
    """Cancel SLURM job(s).

    Parameters
    ----------
    arg : int | list | str
        SLURM job id(s) to cancel. Can be a list of integer job ids,
        'all' (case-insensitive) to cancel every job in the squeue
        listing, or any other int/str, which is appended to the
        ``scancel`` command line (e.g. a feature such as "-p short").

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or str.
    """
    # Collections: cancel each entry through SLURM.scancel.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            SLURM.scancel(entry)
        return

    # The 'all' keyword is checked before the generic int/str case so it
    # is never forwarded to the command line verbatim.
    if str(arg).lower() == 'all':
        for line in SLURM.squeue()[1:]:
            # Leading token of each (stripped) row is the job id.
            leading = line.strip().partition(' ')[0]
            SLURM.scancel(int(leading))
        return

    if not isinstance(arg, (int, str)):
        msg = ('Could not cancel: {} with type {}'.format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # Tokenize so multi-word arguments become separate argv entries.
    argv = shlex.split('scancel {}'.format(arg))
    call(argv)
def release(self, arg):
    """Release held SLURM job(s) so they become eligible to run again.

    Parameters
    ----------
    arg : int | list | str
        SLURM integer job id(s) to release. A list or tuple is released
        one entry at a time. The string 'all' (case-insensitive) releases
        every queued job whose status is 'pd' and whose
        'NODELIST(REASON)' entry mentions 'jobheld'.

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or the string 'all'.
    """
    # Collections: recurse on each entry.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            self.release(entry)
        return

    # Single integer job id: one scontrol call.
    if isinstance(arg, int):
        self.scontrol('release {}'.format(arg))
        return

    # Anything left must be the 'all' keyword.
    if str(arg).lower() != 'all':
        msg = ('Could not release: {} with type {}'
               .format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # Clear the stored queue before reading self.queue.
    self._queue = None
    for queued_id, info in self.queue.items():
        state = info[self.QCOL_STATUS].lower()
        why = info['NODELIST(REASON)'].lower()
        if state == 'pd' and 'jobheld' in why:
            self.release(queued_id)
def hold(self, arg):
    """Place SLURM job(s) on hold.

    Held jobs will stay in queue but will not get nodes until released.

    Parameters
    ----------
    arg : int | list | str
        SLURM integer job id(s) to hold. A list or tuple is processed one
        entry at a time. The string 'all' (case-insensitive) holds every
        queued job whose status is 'pd'.

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or the string 'all'.
    """
    # Collections: recurse on each entry.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            self.hold(entry)
        return

    # Single integer job id: one scontrol call.
    if isinstance(arg, int):
        self.scontrol('hold {}'.format(arg))
        return

    # Anything left must be the 'all' keyword.
    if str(arg).lower() != 'all':
        msg = ('Could not hold: {} with type {}'
               .format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # Clear the stored queue before reading self.queue.
    self._queue = None
    for queued_id, info in self.queue.items():
        state = info[self.QCOL_STATUS].lower()
        if state == 'pd':
            self.hold(queued_id)
def change_qos(self, arg, qos):
    """Set a new quality of service (priority) on SLURM job(s).

    Parameters
    ----------
    arg : int | list | str
        SLURM integer job id(s) to update. A list or tuple is processed
        one entry at a time. The string 'all' (case-insensitive) updates
        every queued job whose status is 'pd'.
    qos : str
        New qos value.

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or the string 'all'.
    """
    # Collections: recurse on each entry.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            self.change_qos(entry, qos)
        return

    # Single integer job id: one scontrol update call.
    if isinstance(arg, int):
        self.scontrol('update job {} QOS={}'.format(arg, qos))
        return

    # Anything left must be the 'all' keyword.
    if str(arg).lower() != 'all':
        msg = ('Could not change qos of: {} with type {}'
               .format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # Clear the stored queue before reading self.queue.
    self._queue = None
    for queued_id, info in self.queue.items():
        state = info[self.QCOL_STATUS].lower()
        if state == 'pd':
            self.change_qos(queued_id, qos)
def scancel(self, arg):
    """Cancel SLURM job(s).

    Parameters
    ----------
    arg : int | list | str
        SLURM integer job id(s) to cancel. Can be a list of integer job
        ids, 'all' (case-insensitive) to cancel every id in
        self.queue_job_ids, or any other int/str, which is appended to
        the ``scancel`` command line (e.g. a feature such as "-p short").

    Raises
    ------
    ExecutionError
        If arg is not a list, tuple, int, or str.
    """
    # Collections: recurse on each entry.
    if isinstance(arg, (list, tuple)):
        for entry in arg:
            self.scancel(entry)
        return

    # The 'all' keyword is checked before the generic int/str case so it
    # is never forwarded to the command line verbatim.
    if str(arg).lower() == 'all':
        # Clear the stored queue before reading the job ids.
        self._queue = None
        for queued_id in self.queue_job_ids:
            self.scancel(queued_id)
        return

    if not isinstance(arg, (int, str)):
        msg = ('Could not cancel: {} with type {}'
               .format(arg, type(arg)))
        logger.error(msg)
        raise ExecutionError(msg)

    # Tokenize so multi-word arguments become separate argv entries.
    argv = shlex.split('scancel {}'.format(arg))
    subprocess.call(argv)