Beispiel #1
0
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Run ``qacct -j <job_id>`` and parse its output into key/value pairs.

    The command is invoked up to ``num_retries`` times, waiting ``quantum``
    seconds between attempts, until it exits with code 0 and prints something.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.

    :param job_id: id of the job to look up (converted with ``str()``).
    :param num_retries: maximum number of times qacct is invoked.
    :param quantum: seconds to wait between two attempts.
    :param logger: receives progress and error messages; when falsy, the
        logger returned by ``_get_null_logger()`` is used instead.
    :param log_prefix: text placed at the start of log and error messages.
    :returns: an ``OrderedDict`` mapping each field name of the selected
        block to its whitespace-stripped value.
    :raises QacctJobNotFoundError: if no attempt produced usable output.
    :raises EnvironmentError: if the output cannot be parsed, i.e. a line
        has no whitespace-separated value or data precedes the first
        separator row.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    for i in range(num_retries):

        qacct_stdout_str, qacct_stderr_str, qacct_returncode = run_cli_cmd(
            ["qacct", "-j", str(job_id)], logger=logger)
        if qacct_returncode == 0 and qacct_stdout_str.strip():
            # qacct returned actual output w/no error code. we're good
            break

        if qacct_stderr_str and re.match(r"error: job id \d+ not found",
                                         qacct_stderr_str):
            # "not found" on the very first attempt is not worth reporting
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    "qacct is merely slow, or %s died in the 'qw' state",
                    log_prefix,
                    job_id,
                    job_id,
                )
        else:
            # run_cli_cmd() reports a timed-out command with the string
            # "TIMEOUT" rather than an int, so format the code with %s.
            logger.error(
                "%s SGE (qacct -j %s) returned error code %s",
                log_prefix,
                job_id,
                qacct_returncode,
            )
            if qacct_stdout_str or qacct_stderr_str:
                logger.error("%s SGE (qacct -j %s) printed the following",
                             log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            recheck_note = (". Will recheck job status after %d sec" % quantum
                            if will_retry else "")
            logger.info(
                "%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s",
                log_prefix,
                job_id,
                i + 1,
                time.time() - start,
                recheck_note,
            )
        if will_retry:
            logger.info(
                "%s Will wait %d sec before calling qacct on %s again",
                log_prefix,
                quantum,
                job_id,
            )
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed. Report num_retries (the number of
        # attempts actually made) rather than the last loop index, which is
        # one too low and does not exist at all when num_retries <= 0.
        raise QacctJobNotFoundError(
            "%s No valid SGE (qacct -j %s) output after %d tries over %d sec" %
            (log_prefix, job_id, num_retries, time.time() - start))

    for line in qacct_stdout_str.strip().split("\n"):
        if line.startswith("="):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        fields = re.split(r"\s+", line, maxsplit=1)
        # A data line must belong to a block (i.e. follow a separator row)
        # and must consist of a key plus a whitespace-separated value.
        if curr_qacct_dict is None or len(fields) != 2:
            raise EnvironmentError(
                "%s SGE (qacct -j %s) output is unparseable:\n%s" %
                (log_prefix, job_id, qacct_stdout_str))

        k, v = fields
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
Beispiel #2
0
def _qacct_raw(task, timeout=600, quantum=15):
    """
    Run ``qacct -j <task.drm_jobID>`` and parse its output into key/value pairs.

    The command is retried every ``quantum`` seconds, for roughly ``timeout``
    seconds in total, until it prints something without raising.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no such
    block exists, then return the most recently-generated block of corrupt data.

    :param task: object providing ``drm_jobID`` and ``workflow.log``.
    :param timeout: approximate total number of seconds to keep retrying.
    :param quantum: seconds to wait between two attempts.
    :returns: an ``OrderedDict`` mapping each field name of the selected
        block to its whitespace-stripped value.
    :raises ValueError: if no attempt produced any output.
    :raises EnvironmentError: if the output cannot be parsed, i.e. a line
        has no whitespace-separated value or data precedes the first
        separator row.
    """
    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None
    # Always make at least one attempt, even when timeout < quantum.
    num_retries = max(1, int(timeout / quantum))

    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', unicode(task.drm_jobID)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                break
        except CosmosCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        if qacct_stderr_str and re.match(r'error: job id \d+ not found',
                                         qacct_stderr_str):
            # "not found" on the very first attempt is not worth reporting
            if i > 0:
                task.workflow.log.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    task, task.drm_jobID, task.drm_jobID)
        else:
            task.workflow.log.error(
                '%s SGE (qacct -j %s) returned error code %d', task,
                task.drm_jobID, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                task.workflow.log.error(
                    '%s SGE (qacct -j %s) printed the following', task,
                    task.drm_jobID)
                if qacct_stdout_str:
                    task.workflow.log.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    task.workflow.log.error('stderr: "%s"', qacct_stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            recheck_note = ('. Will recheck job status after %d sec' % quantum
                            if will_retry else '')
            task.workflow.log.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                task, task.drm_jobID, i + 1,
                time.time() - start, recheck_note)
        if will_retry:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed. Report the number of attempts
        # made (num_retries), not the last loop index, which is one too low.
        raise ValueError(
            'No valid `qacct -j %s` output after %d tries and %d sec' %
            (task.drm_jobID, num_retries, time.time() - start))

    for line in qacct_stdout_str.strip().split('\n'):
        if line.startswith('='):
            if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        fields = re.split(r'\s+', line, maxsplit=1)
        # A data line must belong to a block (i.e. follow a separator row)
        # and must consist of a key plus a whitespace-separated value.
        if curr_qacct_dict is None or len(fields) != 2:
            raise EnvironmentError(
                '%s with drm_jobID=%s has unparseable qacct output:\n%s' %
                (task, task.drm_jobID, qacct_stdout_str))

        k, v = fields
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
Beispiel #3
0
def run_cli_cmd(args,
                attempts=1,
                interval=15,
                logger=None,
                preexec_fn=exit_process_group,
                timeout=30,
                trust_exit_code=True,
                **kwargs):
    """
    Run the supplied cmd, optionally retrying some number of times if it fails or times out.

    You can pass through arbitrary arguments to this command. They eventually
    wind up as constructor arguments to subprocess.Popen().

    :param args: the command, as a string or a sequence of strings.
    :param attempts: maximum number of times to run the command (>= 1).
    :param interval: seconds to wait between two attempts.
    :param logger: if not None, receives one message per attempt.
    :param preexec_fn: forwarded to ``subprocess.run()``.
    :param timeout: seconds after which a single attempt is abandoned.
    :param trust_exit_code: when True, a zero exit code ends the retries;
        when False, the command must also have printed something to stdout.
    :param kwargs: extra keyword arguments forwarded to ``subprocess.run()``.
    :returns: ``(stdout, stderr, returncode)`` of the last attempt, where
        ``returncode`` is the string ``"TIMEOUT"`` if that attempt timed out.
    :raises ValueError: if ``attempts`` is less than 1.
    """
    if attempts < 1:
        # Without at least one attempt there would be no result to return.
        raise ValueError("attempts must be at least 1, got %r" % (attempts,))

    while attempts > 0:
        attempts -= 1
        try:
            result = subprocess.run(args,
                                    check=True,
                                    preexec_fn=preexec_fn,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    timeout=timeout,
                                    universal_newlines=True,
                                    **kwargs)
            if result.returncode == 0:
                if trust_exit_code:
                    attempts = 0
                elif result.stdout:
                    attempts = 0
        except (subprocess.CalledProcessError,
                subprocess.TimeoutExpired) as exc:
            # Both exception types carry stdout/stderr attributes, so the
            # exception stands in for the result of this attempt.
            result = exc

        if logger is not None:
            log_func = logger.error
            # A timed-out command may carry None instead of captured output.
            details = ": stdout='%s', stderr='%s'" % (
                (result.stdout or "").strip(),
                (result.stderr or "").strip(),
            )
            if isinstance(result, subprocess.TimeoutExpired):
                effect = "exceeded %s-sec timeout" % result.timeout
            else:
                effect = "had exit code %s" % result.returncode
                if result.returncode == 0 and attempts == 0:
                    # Success: log quietly and omit the output details.
                    log_func = logger.debug
                    details = ""

            plan = "will retry in %s sec" % interval if attempts else "final attempt"
            log_func(
                "Call to %s %s (%s)%s",
                args.split()[0] if isinstance(args, str) else args[0],
                effect,
                plan,
                details,
            )

        if attempts:
            sleep_through_signals(timeout=interval)

    # TimeoutExpired has no returncode attribute; report a marker instead.
    returncode = result.returncode if hasattr(result,
                                              "returncode") else "TIMEOUT"
    return result.stdout, result.stderr, returncode
Beispiel #4
0
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Run ``qacct -j <job_id>`` and parse its output into key/value pairs.

    The command is invoked up to ``num_retries`` times, waiting ``quantum``
    seconds between attempts, until it prints something without raising.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.

    :param job_id: id of the job to look up (converted with ``unicode()``).
    :param num_retries: maximum number of times qacct is invoked.
    :param quantum: seconds to wait between two attempts.
    :param logger: receives progress and error messages; when falsy, the
        logger returned by ``_get_null_logger()`` is used instead.
    :param log_prefix: text placed at the start of log and error messages.
    :returns: an ``OrderedDict`` mapping each field name of the selected
        block to its whitespace-stripped value.
    :raises QacctJobNotFoundError: if no attempt produced any output.
    :raises EnvironmentError: if the output cannot be parsed, i.e. a line
        has no whitespace-separated value or data precedes the first
        separator row.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', unicode(job_id)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                break
        except DetailedCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        if qacct_stderr_str and re.match(r'error: job id \d+ not found',
                                         qacct_stderr_str):
            # "not found" on the very first attempt is not worth reporting
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    log_prefix, job_id, job_id)
        else:
            logger.error('%s SGE (qacct -j %s) returned error code %d',
                         log_prefix, job_id, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                logger.error('%s SGE (qacct -j %s) printed the following',
                             log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            recheck_note = ('. Will recheck job status after %d sec' % quantum
                            if will_retry else '')
            logger.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                log_prefix, job_id, i + 1,
                time.time() - start, recheck_note)
        if will_retry:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed. Report num_retries (the number of
        # attempts actually made) rather than the last loop index, which is
        # one too low and does not exist at all when num_retries <= 0.
        raise QacctJobNotFoundError(
            '%s No valid SGE (qacct -j %s) output after %d tries over %d sec' %
            (log_prefix, job_id, num_retries, time.time() - start))

    for line in qacct_stdout_str.strip().split('\n'):
        if line.startswith('='):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        fields = re.split(r'\s+', line, maxsplit=1)
        # A data line must belong to a block (i.e. follow a separator row)
        # and must consist of a key plus a whitespace-separated value.
        if curr_qacct_dict is None or len(fields) != 2:
            raise EnvironmentError(
                '%s SGE (qacct -j %s) output is unparseable:\n%s' %
                (log_prefix, job_id, qacct_stdout_str))

        k, v = fields
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
Beispiel #5
0
def _scontrol_raw(task, timeout=600, quantum=15):
    """
    Parse "scontrol show jobid" output into key/value pairs.

    ``scontrol show jobid -d -o <task.drm_jobID>`` is retried every
    ``quantum`` seconds, for roughly ``timeout`` seconds in total, until it
    prints something without raising.

    :param task: object providing ``drm_jobID`` and ``workflow.log``.
    :param timeout: approximate total number of seconds to keep retrying.
    :param quantum: seconds to wait between two attempts.
    :returns: a dict built from the whitespace-separated ``key=value``
        tokens of the output; a token without "=" is appended (after a
        space) to the value of the preceding key. If scontrol reports an
        invalid job id, a dict holding only ``JobId`` is returned.
    :raises ValueError: if no attempt produced any output.
    :raises EnvironmentError: if the first token of the output has no "=".
    """
    start = time.time()
    # Always make at least one attempt, even when timeout < quantum.
    num_retries = max(1, int(timeout / quantum))

    for i in xrange(num_retries):
        try:
            stdout_str, stderr_str = check_output_and_stderr(
                [
                    'scontrol', 'show', 'jobid', '-d', '-o',
                    unicode(task.drm_jobID)
                ],
                preexec_fn=exit_process_group)
            if stdout_str.strip():
                break
        except CosmosCalledProcessError as err:
            stdout_str = err.output.strip()
            stderr_str = err.stderr.strip()

            if stderr_str == 'slurm_load_jobs error: Invalid job id specified':
                # too many jobs were scheduled since it finished and the job id was forgotten
                return dict(JobId=task.drm_jobID)

            task.workflow.log.error(
                '%s Slurm (scontrol show jobid -d -o %s) returned error code %d',
                task, task.drm_jobID, err.returncode)
            if stdout_str or stderr_str:
                task.workflow.log.error(
                    '%s Slurm (scontrol show jobid -d -o %s) printed the following',
                    task, task.drm_jobID)
                if stdout_str:
                    task.workflow.log.error('stdout: "%s"', stdout_str)
                if stderr_str:
                    task.workflow.log.error('stderr: "%s"', stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            recheck_note = ('. Will recheck job status after %d sec' % quantum
                            if will_retry else '')
            task.workflow.log.info(
                '%s Slurm (scontrol show jobid -d -o %s) attempt %d failed %d sec after first attempt%s',
                task, task.drm_jobID, i + 1,
                time.time() - start, recheck_note)
        if will_retry:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed. Report the number of attempts
        # made (num_retries), not the last loop index, which is one too low.
        raise ValueError(
            'No valid `scontrol show jobid -d -o %s` output after %d tries and %d sec'
            % (task.drm_jobID, num_retries, time.time() - start))

    acct_dict = {}
    k = None
    for kv in stdout_str.strip().split():
        key, sep, value = kv.partition('=')
        if not sep:
            if k is None:
                raise EnvironmentError(
                    '%s with drm_jobID=%s has unparseable "scontrol show jobid -d -o" output:\n%s\n'
                    'Could not find "=" in "%s"' %
                    (task, task.drm_jobID, stdout_str, kv))
            # add the string to previous value - most likely the previous value contained a white space
            acct_dict[k] += (" " + kv)
            continue
        k = key
        acct_dict[k] = value

    return acct_dict