Esempio n. 1
0
def do_sacct(job_ids):
    """
    Run ``sacct`` for the given job ids and index the reported rows by job id.

    A single ``sacct ... -P`` command is built as a string and executed through
    the shell. Its output is parsed as pipe-delimited text whose first line
    holds the column names.

    :param job_ids: iterable of job id strings; they are comma-joined for ``-j``.
    :returns: dict mapping each reported ``JobID`` (with ``.batch`` removed)
        to a dict of column name -> value for that row. When both a plain row
        and a ``.batch`` row exist for a job, whichever comes later in the
        output wins.
    """
    # Per the original author: sacct can lag behind job completion, so a job
    # that has only just finished may not be reported yet.
    sacct_fields = (
        "State,JobID,CPUTime,MaxRSS,AveRSS,AveCPU,CPUTimeRAW,"
        "AveVMSize,MaxVMSize,Elapsed,ExitCode,Start,End"
    )
    joined_ids = ",".join(job_ids)
    command = 'sacct --format="' + sacct_fields + '" -j ' + joined_ids + " -P"

    stdout, _stderr, _returncode = run_cli_cmd(command, shell=True)

    rows = stdout.strip().split("\n")
    column_names = rows[0].split("|")

    jobs_by_id = {}
    # rows[1] is skipped: the original author describes it as a row of dashes.
    # NOTE(review): confirm that `sacct -P` really emits such a separator row;
    # if it does not, the first data row is being dropped here.
    for row in rows[2:]:
        record = dict(zip(column_names, row.split("|")))
        job_id = record["JobID"]

        if "batch" in job_id:
            # Slurm also reports "<id>.batch" steps, which (per the original
            # author) carry better information: file them under the bare id.
            job_id = job_id.replace(".batch", "")
            record["JobID"] = job_id

        jobs_by_id[job_id] = record

    return jobs_by_id
Esempio n. 2
0
def qstat(logger=None):
    """
    Return a mapping of job ids to a dict of GE information about each job.

    If qstat hangs or returns an error, wait 30 sec and call it again. Do this
    three times. If the final attempt returns an error, log it, and return an
    empty dictionary, which is the same behavior you'd get if all known jobs
    had exited. (If qstat is down for 90+ sec, any running job is likely to be
    functionally dead.)

    The exact contents of the sub-dictionaries in the returned dictionary's
    values() depend on the installed GE version.

    :param logger: optional logger; when None, ``_get_null_logger()`` is used.
    :returns: dict keyed by the first whitespace-separated column of each data
        row, whose values map header column names to that row's fields. Empty
        if qstat failed or printed nothing.
    """
    if logger is None:
        logger = _get_null_logger()

    stdout, _, returncode = run_cli_cmd(["qstat"], attempts=3, interval=30, logger=logger, timeout=30)
    if returncode != 0:
        # The return code must be passed as an argument, otherwise the log
        # record contains a literal "%s".
        logger.warning(
            "qstat returned %s: If GE is offline, all jobs are dead or done", returncode
        )
        return {}

    # Test the stripped text, not the split result: "".split("\n") is [""],
    # which is truthy and would hide the "no output" case.
    output = stdout.strip()
    if not output:
        logger.info(
            "qstat returned 0 and no output: all jobs are probably done, "
            "but in rare cases this may be a sign that GE is not working properly"
        )
        return {}

    lines = output.split("\n")
    # First line is the header; the second line is skipped (separator row).
    keys = re.split(r"\s+", lines[0])
    bjobs = {}
    for line in lines[2:]:
        row = line.strip()
        if not row:
            # A blank line would otherwise register a bogus job with id "".
            continue
        items = re.split(r"\s+", row)
        bjobs[items[0]] = dict(zip(keys, items))
    return bjobs
Esempio n. 3
0
def sbatch(task):
    """
    Submit ``task``'s command script via ``sbatch`` and return the reported job id.

    The job's stdout/stderr are directed to the absolute forms of
    ``task.output_stdout_path`` and ``task.output_stderr_path``. Any
    ``task.drm_native_specification`` is split on whitespace and passed as
    extra arguments ahead of the script path.

    :param task: object providing ``drm_native_specification``,
        ``output_stdout_path``, ``output_stderr_path`` and
        ``output_command_script_path``.
    :returns: the digits following ``"job "`` in sbatch's stdout, as a string.
    :raises AttributeError: if sbatch's stdout contains no ``job <digits>``
        text (``re.search`` returns None).
    """
    native_spec = task.drm_native_specification or ""

    argv = [
        "sbatch",
        "-o",
        os.path.abspath(task.output_stdout_path),
        "-e",
        os.path.abspath(task.output_stderr_path),
    ]
    argv.extend(native_spec.split())
    argv.append(task.output_command_script_path)

    stdout, _stderr, _returncode = run_cli_cmd(argv, env=os.environ)

    job_id_match = re.search(r"job (\d+)", stdout)
    return str(job_id_match.group(1))
Esempio n. 4
0
def qsub(
    cmd_fn, stdout_fn, stderr_fn, addl_args=None, drm_name="GE", logger=None, log_prefix="",
):
    """
    Submit the requested (bash-parseable) script stored in cmd_fn to GE.

    The command is submitted relative to the current CWD. Callers should change
    this before calling if they need to run in a particular directory.

    Output will be written to two filenames, specified in stdout_fn and stderr_fn.
    Any existing files at those paths are deleted before submission.
    Additional arguments to SGE may be specified as a single string in addl_args.
    Callers can optionally supply a logger object and a prefix to prepend to log messages.

    :returns: a ``(job_id, status)`` tuple. ``job_id`` is the submitted job's id
        as a string, or None on failure. ``status`` is ``TaskStatus.submitted``
        on success and ``TaskStatus.failed`` if qsub exited non-zero or printed
        something that is not an integer.
    """
    # ``logger`` defaults to None; use a null logger for this function's own
    # messages so a failed submission does not raise AttributeError.
    # run_cli_cmd still receives the caller's original ``logger`` value.
    log = logger if logger is not None else _get_null_logger()

    for path in (stdout_fn, stderr_fn):
        if os.path.exists(path):
            os.unlink(path)

    qsub_cli = "qsub -terse -o {stdout_fn} -e {stderr_fn} -b y -w e -cwd -S /bin/bash -V".format(
        stdout_fn=stdout_fn, stderr_fn=stderr_fn
    )

    if addl_args:
        qsub_cli += " %s" % addl_args

    job_id = None

    stdout, stderr, returncode = run_cli_cmd(
        '{qsub_cli} "{cmd_fn}"'.format(cmd_fn=cmd_fn, qsub_cli=qsub_cli),
        attempts=1,  # make just one attempt: running a task 2x could be disastrous
        env=os.environ,
        logger=logger,
        shell=True,
    )

    if returncode != 0:
        # Log the command line that was run (not the ``qsub`` function object).
        log.error(
            "%s submission to %s (%s) failed with error %s", log_prefix, drm_name, qsub_cli, returncode,
        )
        status = TaskStatus.failed
    else:
        try:
            # With -terse, qsub is expected to print only the job id.
            job_id = str(int(stdout))
        except ValueError:
            log.error(
                "%s submission to %s returned unexpected text: %s", log_prefix, drm_name, stdout,
            )
            status = TaskStatus.failed
        else:
            status = TaskStatus.submitted

    return (job_id, status)
Esempio n. 5
0
def qdel(job_ids, logger):
    """
    Call qdel on all the supplied job_ids: if that fails, qdel each job_id individually.

    Unlike other SGE cli commands, each qdel call is attempted only once, with a
    fairly harsh 20-second timeout, because this function is often called in an
    exit handler that does not have arbitrary amounts of time in which to run.

    :param job_ids: sized collection of job id strings.
    :param logger: logger used for progress and warning messages.
    :returns: the number of job ids that qdel reported as deleted (or registered
        for deletion) by the initial bulk call. Jobs handed to the per-job
        fallback are fire-and-forget and are not counted.
    """
    # Function-scope import keeps this block self-contained.
    import re

    if not job_ids:
        # Nothing to delete: avoid invoking qdel with an empty argument.
        logger.info("qdel called with no job_ids, nothing to do")
        return 0

    stdout, _stderr, returncode = run_cli_cmd(
        ["qdel", "-f", ",".join(job_ids)],
        logger=logger,
        attempts=1,
        timeout=20,
    )
    if returncode == 0:
        logger.info("qdel reported success against %d job_ids", len(job_ids))
        return len(job_ids)

    successful_qdels = 0
    undead_job_ids = []

    for job_id in job_ids:
        escaped_id = re.escape(str(job_id))
        # (?!\d) stops job id "12" from matching a message about job "123";
        # a plain substring test would count it as deleted.
        confirmation = (
            r"has deleted job %s(?!\d)|has registered the job %s for deletion"
            % (escaped_id, escaped_id)
        )
        if re.search(confirmation, stdout):
            successful_qdels += 1
        else:
            undead_job_ids.append(job_id)

    if undead_job_ids:
        #
        # If the original qdel didn't catch everything, kick off a qdel for each
        # remaining job id. Don't set a timeout and don't check the return code.
        #
        logger.warning(
            "qdel returned exit code %s, calling on one job_id at a time",
            returncode)

        for i, job_id in enumerate(undead_job_ids):
            logger.warning("will qdel %s in %d sec and ignore exit code",
                           job_id, i)
            # Stagger the background qdel calls by one second each; the child
            # processes are deliberately not waited on.
            subprocess.Popen("sleep %d; qdel -f %s" % (i, job_id), shell=True)

    logger.info(
        "qdel reported success against %d of %d job_ids, see above for details",
        successful_qdels,
        len(job_ids),
    )
    return successful_qdels
Esempio n. 6
0
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Parse qacct output into key/value pairs.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.

    :param job_id: id of the job to look up (passed to ``qacct -j``).
    :param num_retries: maximum number of qacct invocations.
    :param quantum: seconds to wait between invocations.
    :param logger: optional logger; a null logger is used when not supplied.
    :param log_prefix: text prepended to log and error messages.
    :returns: an ``OrderedDict`` of the chosen block's key/value pairs.
    :raises QacctJobNotFoundError: if no attempt exits 0 with non-empty output.
    :raises EnvironmentError: if an output line cannot be split into a key
        and a value.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    for i in range(num_retries):

        qacct_stdout_str, qacct_stderr_str, qacct_returncode = run_cli_cmd(
            ["qacct", "-j", str(job_id)], logger=logger)
        if qacct_returncode == 0 and qacct_stdout_str.strip():
            # qacct returned actual output w/no error code. we're good
            break

        if qacct_stderr_str and re.match(r"error: job id \d+ not found",
                                         qacct_stderr_str):
            # "not found" on the first attempt is not logged; only repeats are.
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    "qacct is merely slow, or %s died in the 'qw' state",
                    log_prefix,
                    job_id,
                    job_id,
                )
        else:
            logger.error(
                "%s SGE (qacct -j %s) returned error code %d",
                log_prefix,
                job_id,
                qacct_returncode,
            )
            if qacct_stdout_str or qacct_stderr_str:
                logger.error("%s SGE (qacct -j %s) printed the following",
                             log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            logger.info(
                "%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s",
                log_prefix,
                job_id,
                i + 1,
                time.time() - start,
                ". Will recheck job status after %d sec" %
                quantum if i + 1 < num_retries else "",
            )
        if i + 1 < num_retries:
            logger.info(
                "%s Will wait %d sec before calling qacct on %s again",
                log_prefix,
                quantum,
                job_id,
            )
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed (or num_retries was 0).
        # Report num_retries rather than the loop index: the index is one less
        # than the attempts made and is unbound when the loop never runs.
        raise QacctJobNotFoundError(
            "%s No valid SGE (qacct -j %s) output after %d tries over %d sec" %
            (log_prefix, job_id, num_retries, time.time() - start))

    for line in qacct_stdout_str.strip().split("\n"):
        if line.startswith("="):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        try:
            k, v = re.split(r"\s+", line, maxsplit=1)
        except ValueError:
            raise EnvironmentError(
                "%s SGE (qacct -j %s) output is unparseable:\n%s" %
                (log_prefix, job_id, qacct_stdout_str))

        if curr_qacct_dict is None:
            # Data appeared before any ==== separator: start a block here
            # instead of failing with a TypeError on None.
            curr_qacct_dict = OrderedDict()

        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict