def do_sacct(job_ids):
    # there's a lag between when a job finishes and when sacct is available :(
    cmd = (
        "sacct --format="
        '"State,JobID,CPUTime,MaxRSS,AveRSS,AveCPU,CPUTimeRAW,AveVMSize,MaxVMSize,Elapsed,ExitCode,Start,End" '
        "-j %s -P" % ",".join(job_ids)
    )
    out, err, _ = run_cli_cmd(cmd, shell=True)

    parts = out.strip().split("\n")

    # job_id_to_job_info_dict
    all_jobs = dict()

    # first line is the header
    keys = parts[0].split("|")

    # second line is all dashes, ignore it
    for line in parts[2:]:
        values = line.split("|")
        job_dict = dict(list(zip(keys, values)))

        if "batch" in job_dict["JobID"]:
            # slurm prints these .batch versions of jobids which have better
            # information, overwrite
            job_dict["JobID"] = job_dict["JobID"].replace(".batch", "")

        all_jobs[job_dict["JobID"]] = job_dict

    return all_jobs
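
# Rough usage sketch. The job id and field values below are made up, and the exact
# fields available depend on the local Slurm version and the --format list above:
#
#   info = do_sacct(["12345"])
#   info["12345"]["State"]      # e.g. "COMPLETED"
#   info["12345"]["ExitCode"]   # e.g. "0:0"
#   info["12345"]["MaxRSS"]     # e.g. "1024K"
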
def qstat(logger=None):
    """
    Return a mapping of job ids to a dict of GE information about each job.

    If qstat hangs or returns an error, wait 30 sec and call it again. Do this
    three times. If the final attempt returns an error, log it, and return an
    empty dictionary, which is the same behavior you'd get if all known jobs
    had exited. (If qstat is down for 90+ sec, any running job is likely to be
    functionally dead.)

    The exact contents of the sub-dictionaries in the returned dictionary's
    values() depend on the installed GE version.
    """
    if logger is None:
        logger = _get_null_logger()

    stdout, _, returncode = run_cli_cmd(
        ["qstat"], attempts=3, interval=30, logger=logger, timeout=30
    )
    if returncode != 0:
        logger.warning(
            "qstat returned %s: If GE is offline, all jobs are dead or done", returncode
        )
        return {}

    lines = stdout.strip().split("\n")
    if not lines:
        logger.info(
            "qstat returned 0 and no output: all jobs are probably done, "
            "but in rare cases this may be a sign that GE is not working properly"
        )
        return {}

    keys = re.split(r"\s+", lines[0])
    bjobs = {}
    for l in lines[2:]:
        items = re.split(r"\s+", l.strip())
        bjobs[items[0]] = dict(list(zip(keys, items)))

    return bjobs
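
# Illustrative shape of the return value. The keys are taken verbatim from the
# local qstat header row, so they vary by GE version; the ids and values here are
# made up:
#
#   {
#       "1000001": {"job-ID": "1000001", "state": "r", ...},
#       "1000002": {"job-ID": "1000002", "state": "qw", ...},
#   }
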
def sbatch(task):
    ns = task.drm_native_specification if task.drm_native_specification else ""

    cmd = (
        [
            "sbatch",
            "-o",
            os.path.abspath(task.output_stdout_path),
            "-e",
            os.path.abspath(task.output_stderr_path),
        ]
        + ns.split()
        + [task.output_command_script_path]
    )

    out, err, _ = run_cli_cmd(cmd, env=os.environ)

    return str(re.search(r"job (\d+)", out).group(1))
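
# sbatch normally acknowledges a submission with a line like
# "Submitted batch job 12345" on stdout; the re.search above extracts the numeric
# id, so sbatch(task) would return "12345" in that (hypothetical) case.
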
def qsub(
    cmd_fn,
    stdout_fn,
    stderr_fn,
    addl_args=None,
    drm_name="GE",
    logger=None,
    log_prefix="",
):
    """
    Submit the requested (bash-parseable) script stored in cmd_fn to GE.

    The command is submitted relative to the current CWD. Callers should
    change this before calling if they need to run in a particular directory.

    Output will be written to two filenames, specified in stdout_fn and
    stderr_fn. Additional arguments to SGE may be specified as a single string
    in addl_args. Callers can optionally supply a logger object and a prefix
    to prepend to log messages.
    """
    for p in [stdout_fn, stderr_fn]:
        if os.path.exists(p):
            os.unlink(p)

    qsub_cli = "qsub -terse -o {stdout_fn} -e {stderr_fn} -b y -w e -cwd -S /bin/bash -V".format(
        stdout_fn=stdout_fn, stderr_fn=stderr_fn
    )
    if addl_args:
        qsub_cli += " %s" % addl_args

    job_id = None
    stdout, stderr, returncode = run_cli_cmd(
        '{qsub_cli} "{cmd_fn}"'.format(cmd_fn=cmd_fn, qsub_cli=qsub_cli),
        attempts=1,  # make just one attempt: running a task 2x could be disastrous
        env=os.environ,
        logger=logger,
        shell=True,
    )
    if returncode != 0:
        logger.error(
            "%s submission to %s (%s) failed with error %s",
            log_prefix,
            drm_name,
            qsub_cli,
            returncode,
        )
        status = TaskStatus.failed
    else:
        try:
            job_id = str(int(stdout))
        except ValueError:
            logger.error(
                "%s submission to %s returned unexpected text: %s",
                log_prefix,
                drm_name,
                stdout,
            )
            status = TaskStatus.failed
        else:
            status = TaskStatus.submitted

    return (job_id, status)
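
# Minimal usage sketch; all paths, the -l resource request, and the logger setup
# below are hypothetical:
#
#   job_id, status = qsub(
#       "/tmp/run_me.sh", "/tmp/run_me.out", "/tmp/run_me.err",
#       addl_args="-l h_vmem=2G", logger=logging.getLogger(__name__),
#   )
#   if status == TaskStatus.submitted:
#       ...  # poll qstat()/qacct() until job_id completes
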
def qdel(job_ids, logger):
    """
    Call qdel on all the supplied job_ids: if that fails, qdel each job_id individually.

    Unlike other SGE cli commands, each qdel call is attempted only once, with a
    fairly harsh 20-second timeout, because this function is often called in an
    exit handler that does not have arbitrary amounts of time in which to run.
    """
    stdout, stderr, returncode = run_cli_cmd(
        ["qdel", "-f", ",".join(job_ids)],
        logger=logger,
        attempts=1,
        timeout=20,
    )
    if returncode == 0:
        logger.info("qdel reported success against %d job_ids", len(job_ids))
        return len(job_ids)

    successful_qdels = 0
    undead_job_ids = []
    for job_id in job_ids:
        if "has deleted job %s" % job_id in stdout:
            successful_qdels += 1
        elif "has registered the job %s for deletion" % job_id in stdout:
            successful_qdels += 1
        else:
            undead_job_ids.append(job_id)

    if undead_job_ids:
        #
        # If the original qdel didn't catch everything, kick off a qdel for each
        # remaining job id. Don't set a timeout and don't check the return code.
        #
        logger.warning(
            "qdel returned exit code %s, calling on one job_id at a time", returncode
        )
        for i, job_id in enumerate(undead_job_ids):
            logger.warning("will qdel %s in %d sec and ignore exit code", job_id, i)
            subprocess.Popen("sleep %d; qdel -f %s" % (i, job_id), shell=True)

    logger.info(
        "qdel reported success against %d of %d job_ids, see above for details",
        successful_qdels,
        len(job_ids),
    )
    return successful_qdels
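
# Usage sketch (hypothetical ids): qdel(["1000001", "1000002"], logger) returns the
# number of jobs that qdel confirmed it deleted or registered for deletion, which
# may be less than len(job_ids) if some deletions had to be retried in the background.
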
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Parse qacct output into key/value pairs.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    for i in range(num_retries):
        qacct_stdout_str, qacct_stderr_str, qacct_returncode = run_cli_cmd(
            ["qacct", "-j", str(job_id)], logger=logger
        )
        if qacct_returncode == 0 and qacct_stdout_str.strip():
            # qacct returned actual output w/no error code. we're good
            break

        if qacct_stderr_str and re.match(
            r"error: job id \d+ not found", qacct_stderr_str
        ):
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    "qacct is merely slow, or %s died in the 'qw' state",
                    log_prefix,
                    job_id,
                    job_id,
                )
        else:
            logger.error(
                "%s SGE (qacct -j %s) returned error code %d",
                log_prefix,
                job_id,
                qacct_returncode,
            )
            if qacct_stdout_str or qacct_stderr_str:
                logger.error(
                    "%s SGE (qacct -j %s) printed the following", log_prefix, job_id
                )
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            logger.info(
                "%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s",
                log_prefix,
                job_id,
                i + 1,
                time.time() - start,
                ". Will recheck job status after %d sec" % quantum
                if i + 1 < num_retries
                else "",
            )
        if i + 1 < num_retries:
            logger.info(
                "%s Will wait %d sec before calling qacct on %s again",
                log_prefix,
                quantum,
                job_id,
            )
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise QacctJobNotFoundError(
            "%s No valid SGE (qacct -j %s) output after %d tries over %d sec"
            % (log_prefix, job_id, i, time.time() - start)
        )

    for line in qacct_stdout_str.strip().split("\n"):
        if line.startswith("="):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        try:
            k, v = re.split(r"\s+", line, maxsplit=1)
        except ValueError:
            raise EnvironmentError(
                "%s SGE (qacct -j %s) output is unparseable:\n%s"
                % (log_prefix, job_id, qacct_stdout_str)
            )

        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
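
# For reference, qacct -j output is a series of whitespace-separated key/value
# lines under a row of ='s, roughly like this abridged, made-up example:
#
#   ==============================================================
#   qname        all.q
#   jobnumber    12345
#   exit_status  0
#
# which qacct(12345) would parse into something like
# OrderedDict([("qname", "all.q"), ("jobnumber", "12345"), ("exit_status", "0")]).
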