def get_usage():
    """Collect cluster-wide resource usage from SLURM.

    Queries node information (``CMD_INFO``) to accumulate total and
    allocated cores/memory, then the queue (``CMD_SQUEUE``) for the
    current user's consumption.

    Returns:
        dict: keys ``nodes`` (count of working nodes), ``usage``
        (cluster-wide usage percentages) and ``user`` (this user's
        usage percentages).

    Raises:
        ExecutorError: if either underlying command fails.
    """
    data = {}
    cores_total, mem_total = 0, 0
    cores_alloc, mem_alloc = 0, 0
    cores_user, mem_user = 0, 0
    nodes = 0

    # States in which a node is considered to be working
    working_states = ('mix', 'idle', 'alloc')

    try:
        out = execute_command(CMD_INFO)
    except QMapError as e:
        raise ExecutorError(e)
    else:
        for line in out.splitlines():
            values = line.strip().split()
            # values[0] is the node id; it is not needed here (was an
            # unused local in the original code).
            all_cores = values[1].split('/')  # A/I/O/T: [0]=alloc, [3]=total
            cores_total += int(all_cores[3])
            cores_alloc += int(all_cores[0])
            mem_total += int(values[2]) // 1024  # MB -> GB
            mem_alloc += int(values[3]) // 1024  # MB -> GB
            node_state = values[4]
            # NOTE(review): nodes in a non-working state are excluded from
            # the node count but still contribute to the core/memory totals
            # accumulated above -- confirm this is intended.
            if node_state not in working_states:
                continue
            nodes += 1
        data['nodes'] = nodes
        data['usage'] = get_usage_percentage(cores_alloc, mem_alloc,
                                             cores_total, mem_total)
    try:
        out = execute_command(CMD_SQUEUE)
    except QMapError as e:
        raise ExecutorError(e)
    else:
        for line in out.splitlines():
            values = line.strip().split()
            cores_user += int(values[0])
            mem = values[1]
            mem_units = mem[-1]  # last char is the unit suffix (e.g. 'G')
            mem_value = int(float(mem[:-1]))  # truncate fractional part
            mem_user += memory_convert(mem_value, mem_units, 'G')
        data['user'] = get_usage_percentage(cores_user, mem_user,
                                            cores_total, mem_total)
    return data
def generate_job_status_running(job_ids, retries=3):
    """Yield ``(id, (status, error, info))`` for running LSF jobs.

    Parses ``bjobs`` output; jobs found in the output are removed from
    *job_ids*. On command failure the query is retried up to *retries*
    times (0.5 s apart) before raising ExecutorError.
    """
    done = set()
    cmd = 'bjobs -noheader -o "jobid stat start_time"'
    try:
        out = execute_command(cmd)
    except QMapError as e:
        if retries > 0:
            time.sleep(0.5)
            yield from Executor.generate_job_status_running(
                job_ids=job_ids, retries=retries - 1)
        else:
            raise ExecutorError(e) from None
    else:
        info = {'usage': {'cluster': 'LSF'}}
        for line in out.splitlines():
            l = line.strip().split(maxsplit=2)
            if not l:
                # BUGFIX: skip blank lines instead of raising IndexError
                continue
            id_ = l[0]
            if id_ in job_ids:
                status = LSF_STATUS.get(l[1], Status.OTHER)
                error = ExecutorErrorCodes.UNKNOWN \
                    if status == Status.FAILED else ExecutorErrorCodes.NOERROR
                done.add(id_)
                # BUGFIX: the start_time column may be missing entirely;
                # check the field count before indexing l[2].
                if len(l) > 2 and l[2].strip():
                    # Copy so jobs without a start time keep the shared base
                    # dict untouched (NOTE(review): shallow copy -- the
                    # nested 'usage' dict stays shared; confirm acceptable).
                    info = info.copy()
                    info['start_time'] = l[2].strip()
                yield id_, (status, error, info)
        for id_ in done:
            job_ids.remove(id_)
def generate_job_status_running(job_ids, retries=3):
    """Yield ``(id, (status, error, info))`` for jobs still known to SGE.

    Parses ``qstat`` output; jobs seen in the output are removed from
    *job_ids*. Failed queries are retried up to *retries* times
    (0.5 s apart) before raising ExecutorError.
    """
    done = set()
    cmd = "qstat"
    try:
        out = execute_command(cmd)
    except QMapError as e:
        if retries <= 0:
            raise ExecutorError(e) from None
        time.sleep(0.5)
        yield from Executor.generate_job_status_running(
            job_ids=job_ids, retries=retries - 1)
    else:
        info = {'usage': {'cluster': 'SGE'}}
        # The first two lines of qstat output are headers; skip them.
        for line in out.splitlines()[2:]:
            fields = line.strip().split()
            id_ = fields[0]
            if id_ not in job_ids:
                continue
            status = QSTAT_STATUS.get(fields[4], Status.OTHER)
            if status == Status.FAILED:
                error = ExecutorErrorCodes.UNKNOWN
            else:
                error = ExecutorErrorCodes.NOERROR
            done.add(id_)
            yield id_, (status, error, info)
        for finished in done:
            job_ids.remove(finished)
def terminate_jobs(job_ids):
    """Kill the given LSF jobs via ``bkill``.

    Returns a tuple ``(command output, command string)``. With an empty
    id list nothing is executed and the output is an empty string.

    Raises:
        ExecutorError: if the bkill command fails.
    """
    cmd = "bkill {}".format(" ".join(job_ids))
    if not job_ids:
        return '', cmd
    try:
        out = execute_command(cmd)
    except QMapError as e:
        raise ExecutorError(e)
    return out.strip(), cmd
def run_job(f_script, parameters, out=None, err=None):
    """Submit a script to SGE with ``qsub`` and return ``(job id, command)``.

    *out* / *err* optionally redirect the job's STDOUT / STDERR to files.

    Raises:
        ExecutorError: if the submission command fails.
    """
    options = parse_parameters(parameters)
    # Redirect the job's standard streams when requested
    if out is not None:
        options.append('-o {}'.format(out))
    if err is not None:
        options.append('-e {}'.format(err))
    # -terse: print only the job id; -r no: do not rerun the job on failure
    cmd = "qsub -terse -r no {} {}.{}".format(
        ' '.join(options), f_script, SCRIPT_FILE_EXTENSION)
    try:
        out = execute_command(cmd)
    except QMapError:
        raise ExecutorError(
            'Job cannot be submitted to SGE. Command: {}'.format(cmd))
    return out.strip(), cmd
def run_job(f_script, parameters, out=None, err=None):
    """Submit a script to LSF with ``bsub`` and return ``(job id, command)``.

    *out* / *err* optionally redirect the job's STDOUT / STDERR to files.

    Raises:
        ExecutorError: if the submission command fails.
    """
    options = parse_parameters(parameters)
    if out is not None:
        options.append(
            '-o {}'.format(out))  # File to which STDOUT will be written
    if err is not None:
        options.append(
            '-e {}'.format(err))  # File to which STDERR will be written
    cmd = "bsub {} {}.{}".format(' '.join(options), f_script,
                                 SCRIPT_FILE_EXTENSION)  # -rn
    try:
        out = execute_command(cmd)
    except QMapError:
        # BUGFIX: the message previously said 'slurm' although this is
        # the LSF (bsub) executor.
        raise ExecutorError(
            'Job cannot be submitted to LSF. Command: {}'.format(cmd))
    # bsub prints e.g. 'Job <1234> is submitted ...'; extract the bare id
    job_id = out.strip().split()[1].replace('<', '').replace('>', '')
    return job_id, cmd
def generate_job_status_finished(job_ids, retries=3):
    """Yield ``(id, (status, error, info))`` for finished SGE jobs.

    Parses ``qacct -j`` output: records separated by lines of ``=``
    characters, each record being ``key value`` pairs. Failed queries
    are retried up to *retries* times before raising ExecutorError.
    """
    cmd = "qacct -j {}".format(','.join(job_ids))
    try:
        out = execute_command(cmd)
    except QMapError as e:
        if retries > 0:
            time.sleep(0.5)
            yield from Executor.generate_job_status_finished(
                job_ids=job_ids, retries=retries - 1)
        else:
            raise ExecutorError(e) from None
    else:
        id_ = None
        status = Status.OTHER
        error = None
        # BUGFIX: start with the nested 'usage' structure so 'maxvmem' /
        # 'hostname' keys of the FIRST record do not raise KeyError
        # (info previously started as a bare {}).
        info = {'usage': {'cluster': {'type': 'SGE'}}}
        for line in out.splitlines():
            if not line:
                continue
            if line.startswith('=='):
                # BUGFIX: separator lines are handled even before the first
                # record; previously a leading '====...' line (id_ still
                # None) fell through to the split below and raised
                # ValueError on unpacking.
                if id_ is not None:
                    yield id_, (status, error, info)
                    id_ = None  # avoid re-yielding on a record w/o jobnumber
                info = {'usage': {'cluster': {'type': 'SGE'}}}
                error = None
                status = Status.OTHER
                continue
            k, v = line.strip().split(maxsplit=1)
            if k == 'jobnumber':
                id_ = v
            elif k == 'exit_status':
                if v == '0':
                    error = ExecutorErrorCodes.NOERROR
                    status = Status.DONE
                else:
                    error = ExecutorErrorCodes.UNKNOWN
                    status = Status.FAILED
                info['exit_code'] = v
            elif k == 'maxvmem':
                info['usage']['memory'] = v
            elif k == 'hostname':
                info['usage']['cluster']['nodes'] = v
        # Flush the last record (no trailing separator in the output)
        if id_ is not None:
            yield id_, (status, error, info)
def generate_jobs_status(job_ids, retries=3):
    """
    For each job ID, we assume we have a single step (.0 for run and
    .batch for batch submissions).
    """
    cmd = "sacct --parsable2 --format {} --jobs {}".format(
        STATUS_FORMAT, ",".join(job_ids))
    try:
        out = execute_command(cmd)
    except QMapError as e:
        if retries <= 0:
            raise ExecutorError(e) from None
        time.sleep(0.5)
        yield from Executor.generate_jobs_status(job_ids=job_ids,
                                                 retries=retries - 1)
    else:
        current_id = None
        steps = []
        reader = csv.DictReader(out.splitlines(), delimiter='|')
        for row in reader:
            # JobID looks like '1234', '1234.0' or '1234.batch';
            # drop the step suffix to group rows by job.
            job_id = row.pop('JobID').split('.')[0]
            if current_id is None:
                current_id = job_id
            if job_id == current_id:
                steps.append(row)
            else:
                # Report the latest step line of the previous job
                yield current_id, parse_status(steps[-1])
                current_id = job_id
                steps = [row]
        # Flush the final job after the loop ends
        if current_id is not None:
            yield current_id, parse_status(steps[-1])
def generate_job_status_finished(job_ids, retries=3):
    """Yield ``(id, (status, error, info))`` for finished LSF jobs.

    Parses ``bacct -l`` output: records separated by dashed lines, with
    a per-job header line starting with ``Job <``. Failed queries are
    retried up to *retries* times before raising ExecutorError.
    """
    cmd = "bacct -l {}".format(' '.join(job_ids))
    try:
        out = execute_command(cmd)
    except QMapError as e:
        if retries > 0:
            time.sleep(0.5)
            yield from Executor.generate_job_status_finished(
                job_ids=job_ids, retries=retries - 1)
        else:
            raise ExecutorError(e) from None
    else:
        id_ = None
        status = Status.OTHER
        error = None
        # BUGFIX: start with the same nested structure installed after each
        # separator, so the FIRST job is not yielded with a bare {} info.
        info = {'usage': {'cluster': {'type': 'LSF'}}}
        for line in out.splitlines():
            if not line:
                continue
            elif line.startswith('----') and id_ is not None:
                # NOTE(review): if bacct emits several '----' lines inside
                # one record, the same id is yielded more than once --
                # confirm against real bacct -l output.
                yield id_, (status, error, info)
                info = {'usage': {'cluster': {'type': 'LSF'}}}
                error = None
                status = Status.OTHER
            elif line.startswith('Job <'):
                # 'regex' is presumably a module-level compiled pattern
                # extracting (name, value) pairs from the header line --
                # TODO confirm its definition.
                for match in regex.finditer(line):
                    name, value = match.groups()
                    if name == 'Job':
                        id_ = value
                    elif name == 'Status':
                        status = LSF_STATUS[value]
                        # TODO more info??
        # Flush the last record
        if id_ is not None:
            yield id_, (status, error, info)