Example #1
def perform_initial_payload_error_analysis(job, exit_code):
    """
    Perform an initial analysis of the payload.
    Singularity errors are caught here.

    :param job: job object.
    :param exit_code: exit code from payload execution.
    :return:
    """

    log = get_logger(job.jobid, logger)

    if exit_code != 0:
        log.warning('main payload execution returned non-zero exit code: %d' %
                    exit_code)
        stderr = read_file(
            os.path.join(job.workdir, config.Payload.payloadstderr))
        if stderr != "":
            msg = errors.extract_stderr_msg(stderr)
            if msg != "":
                log.warning("extracted message from stderr:\n%s" % msg)
        ec = errors.resolve_transform_error(exit_code, stderr)
        if ec != 0:
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                ec)
        else:
            if job.piloterrorcodes:
                log.warning('error code(s) already set: %s' %
                            str(job.piloterrorcodes))
            else:
                log.warning('initial error analysis did not resolve the issue')
    else:
        log.info(
            'main payload execution returned zero exit code, but will check it more carefully'
        )
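
All of the examples on this page depend on the helper read_file (and, in places, write_file) from the pilot's file-handling utilities, whose implementation is not shown here. As a minimal sketch only, a stand-in could look like the following; returning an empty string when the file cannot be read is an assumption made for illustration, not the documented contract:

import logging

logger = logging.getLogger(__name__)


def read_file(filename, mode='r'):
    """Hypothetical stand-in for the pilot's read_file helper."""
    try:
        with open(filename, mode) as _file:
            # return the full contents of the file as a string
            return _file.read()
    except IOError as error:
        logger.warning('failed to read %s: %s', filename, error)
        return ""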
Example #2
def get_panda_tracer_log(job):
    """
    Return the contents of the PanDA tracer log if it exists.
    This file will contain information about outbound connections.

    :param job: job object.
    :return: log extracts from pandatracerlog.txt (string).
    """

    extracts = ""

    tracerlog = os.path.join(job.workdir, "pandatracerlog.txt")
    if os.path.exists(tracerlog):
        # only add if file is not empty
        if os.path.getsize(tracerlog) > 0:
            message = "PandaID=%s had outbound connections: " % (job.jobid)
            extracts += message
            message = read_file(tracerlog)
            extracts += message
            logger.warning(message)
        else:
            logger.info(
                "PanDA tracer log (%s) has zero size (no outbound connections detected)"
                % tracerlog)
    else:
        logger.debug("PanDA tracer log does not exist: %s (ignoring)" %
                     tracerlog)

    return extracts
Example #3
def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""):
    """
    Replace all LFNs with full TURLs in the payload execution command.

    This function is used with direct access in production jobs. Athena requires a full TURL instead of an LFN.

    :param cmd: payload execution command (string).
    :param workdir: location of metadata file (string).
    :param filename: metadata file name (string).
    :param infiles: list of input files.
    :param writetofile:
    :return: updated cmd (string).
    """

    turl_dictionary = {}  # { LFN: TURL, ..}
    path = os.path.join(workdir, filename)
    if os.path.exists(path):
        file_info_dictionary = get_file_info_from_xml(workdir,
                                                      filename=filename)
        for inputfile in infiles:
            if inputfile in cmd:
                turl = file_info_dictionary[inputfile][0]
                turl_dictionary[inputfile] = turl
                # if turl.startswith('root://') and turl not in cmd:
                if turl not in cmd:
                    cmd = cmd.replace(inputfile, turl)
                    logger.info("replaced '%s' with '%s' in the run command" %
                                (inputfile, turl))

        # replace the LFNs with TURLs in the writetofile input file list (if it exists)
        if writetofile and turl_dictionary:
            filenames = get_writetoinput_filenames(writetofile)
            logger.info("filenames=%s" % filenames)
            for fname in filenames:
                new_lines = []
                path = os.path.join(workdir, fname)
                if os.path.exists(path):
                    f = read_file(path)
                    for line in f.split('\n'):
                        # use a separate name so the outer loop variable 'fname' is not shadowed
                        lfn = os.path.basename(line)
                        if lfn in turl_dictionary:
                            turl = turl_dictionary[lfn]
                            new_lines.append(turl)
                        elif line:
                            new_lines.append(line)

                    lines = '\n'.join(new_lines)
                    if lines:
                        write_file(path, lines)
                        logger.info("lines=%s" % lines)
                else:
                    logger.warning("file does not exist: %s" % path)
    else:
        logger.warning(
            "could not find file: %s (cannot locate TURLs for direct access)" %
            filename)

    return cmd
Example #4
def interpret(job):
    """
    Interpret the payload, look for specific errors in the stdout.

    :param job: job object
    :return: exit code (payload) (int).
    """

    stdout = os.path.join(job.workdir, config.Payload.payloadstdout)
    message = 'payload stdout dump\n'
    message += read_file(stdout)
    logger.debug(message)
    stderr = os.path.join(job.workdir, config.Payload.payloadstderr)
    message = 'payload stderr dump\n'
    message += read_file(stderr)
    logger.debug(message)

    return 0
Example #5
def perform_initial_payload_error_analysis(job, exit_code):
    """
    Perform an initial analysis of the payload.
    Singularity errors are caught here.

    :param job: job object.
    :param exit_code: exit code from payload execution.
    :return:
    """

    log = get_logger(job.jobid, logger)

    if exit_code != 0:
        msg = ""
        ec = 0
        log.warning('main payload execution returned non-zero exit code: %d' %
                    exit_code)
        stderr = read_file(
            os.path.join(job.workdir, config.Payload.payloadstderr))
        if stderr != "":
            msg = errors.extract_stderr_error(stderr)
            if msg == "":
                # look for warning messages instead (might not be fatal so do not set UNRECOGNIZEDTRFSTDERR)
                msg = errors.extract_stderr_warning(stderr)
                fatal = False
            else:
                fatal = True
            if msg != "":
                log.warning("extracted message from stderr:\n%s" % msg)
                ec = set_error_code_from_stderr(msg, fatal)

        if not ec:
            ec = errors.resolve_transform_error(exit_code, stderr)
        if ec != 0:
            if msg:
                msg = errors.format_diagnostics(ec, msg)
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                ec, msg=msg)
        else:
            if job.piloterrorcodes:
                log.warning('error code(s) already set: %s' %
                            str(job.piloterrorcodes))
            else:
                if os.path.exists(os.path.join(job.workdir, "core")):
                    log.warning("detected a core dump file (will be removed)")
                    remove(os.path.join(job.workdir, "core"))
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                        errors.COREDUMP)
                else:
                    log.warning(
                        'initial error analysis did not resolve the issue')
    else:
        log.info(
            'main payload execution returned zero exit code, but will check it more carefully'
        )
Example #6
def get_metadata(workdir):
    """
    Return the metadata from file.

    :param workdir: work directory (string)
    :return:
    """

    path = os.path.join(workdir, config.Payload.jobreport)
    metadata = read_file(path) if os.path.exists(path) else None

    return metadata
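
Note that get_metadata() returns the raw contents of the job report file, so a caller still has to parse it. A brief usage sketch, assuming the report is JSON as in Example #11; the workdir path below is only a placeholder:

import json
import logging

logger = logging.getLogger(__name__)

workdir = '/path/to/job/workdir'  # placeholder for job.workdir
metadata = get_metadata(workdir)
if metadata:
    try:
        job_report = json.loads(metadata)
    except ValueError as error:
        logger.warning('failed to parse job report: %s', error)
    else:
        logger.info('job report exit code: %s', job_report.get('exitCode'))
else:
    logger.info('no job report found in %s', workdir)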
Example #7
def get_score(pid):
    """
    Get the OOM process score.

    :param pid: process id (int).
    :return: score (string).
    """

    try:
        score = '%s' % read_file('/proc/%d/oom_score' % pid)
    except Exception as error:
        logger.warning('caught exception reading oom_score: %s', error)
        score = 'UNKNOWN'
    else:
        if score.endswith('\n'):
            score = score[:-1]

    return score
Example #8
def perform_initial_payload_error_analysis(job, exit_code):
    """
    Perform an initial analysis of the payload.
    Singularity errors are caught here.

    :param job: job object.
    :param exit_code: exit code from payload execution.
    :return:
    """

    log = get_logger(job.jobid, logger)

    if exit_code != 0:
        ec = 0
        log.warning('main payload execution returned non-zero exit code: %d' % exit_code)
        stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr))
        if stderr != "":
            msg = errors.extract_stderr_msg(stderr)
            if msg != "":
                log.warning("extracted message from stderr:\n%s" % msg)
                if "Failed invoking the NEWUSER namespace runtime" in msg:
                    ec = errors.SINGULARITYNEWUSERNAMESPACE
                elif "Failed to create user namespace" in msg:
                    ec = errors.SINGULARITYFAILEDUSERNAMESPACE
                elif "command not found" in msg:
                    ec = errors.TRANSFORMNOTFOUND
                elif "SL5 is unsupported" in msg:
                    ec = errors.UNSUPPORTEDSL5OS
                elif "resource temporarily unavailable" in msg:
                    ec = errors.SINGULARITYRESOURCEUNAVAILABLE
                elif "unrecognized arguments" in msg:
                    ec = errors.UNRECOGNIZEDTRFARGUMENTS

        if not ec:
            ec = errors.resolve_transform_error(exit_code, stderr)
        if ec != 0:
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec)
        else:
            if job.piloterrorcodes:
                log.warning('error code(s) already set: %s' % str(job.piloterrorcodes))
            else:
                log.warning('initial error analysis did not resolve the issue')
    else:
        log.info('main payload execution returned zero exit code, but will check it more carefully')
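
The chain of substring checks above is essentially a lookup from known stderr patterns to pilot error codes. As a sketch only (not how the pilot itself factors this logic), the same mapping could be written as a table:

# hypothetical, data-driven equivalent of the if/elif chain in Example #8
STDERR_PATTERNS = [
    ("Failed invoking the NEWUSER namespace runtime", "SINGULARITYNEWUSERNAMESPACE"),
    ("Failed to create user namespace", "SINGULARITYFAILEDUSERNAMESPACE"),
    ("command not found", "TRANSFORMNOTFOUND"),
    ("SL5 is unsupported", "UNSUPPORTEDSL5OS"),
    ("resource temporarily unavailable", "SINGULARITYRESOURCEUNAVAILABLE"),
    ("unrecognized arguments", "UNRECOGNIZEDTRFARGUMENTS"),
]


def match_stderr_pattern(msg, errors):
    """Return the first matching pilot error code, or 0 if no pattern matches."""
    for pattern, name in STDERR_PATTERNS:
        if pattern in msg:
            return getattr(errors, name)
    return 0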
Example #9
def get_max_memory_usage_from_cgroups():
    """
    Read the max_memory from CGROUPS file memory.max_usage_in_bytes.

    :return: max_memory (int).
    """

    max_memory = None

    # Get the CGroups max memory using the pilot pid
    pid = os.getpid()
    path = "/proc/%d/cgroup" % pid
    if os.path.exists(path):
        cmd = "grep memory %s" % path
        exit_code, out, stderr = execute(cmd)
        if out == "":
            logger.info("(command did not return anything)")
        else:
            logger.info(out)
            if ":memory:" in out:
                pos = out.find('/')
                path = out[pos:]
                logger.info("extracted path = %s" % path)

                pre = get_cgroups_base_path()
                if pre != "":
                    path = pre + os.path.join(path,
                                              "memory.max_usage_in_bytes")
                    logger.info("path to CGROUPS memory info: %s" % path)
                    max_memory = read_file(path)
                else:
                    logger.info(
                        "CGROUPS base path could not be extracted - not a CGROUPS site"
                    )
            else:
                logger.warning(
                    "invalid format: %s (expected ..:memory:[path])" % out)
    else:
        logger.info("path %s does not exist (not a CGROUPS site)" % path)

    return max_memory
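
The helper get_cgroups_base_path() used above is not shown. As an assumption-labelled sketch only, it presumably locates the mount point of the cgroup memory controller, for instance by scanning /proc/mounts:

def get_cgroups_base_path():
    """Hypothetical sketch: return the mount point of the cgroup memory controller, or ""."""
    base_path = ""
    try:
        with open('/proc/mounts', 'r') as mounts:
            for line in mounts:
                # typical entry: "cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,...,memory 0 0"
                fields = line.split()
                if len(fields) > 3 and fields[2] == 'cgroup' and 'memory' in fields[3].split(','):
                    base_path = fields[1]
                    break
    except IOError:
        pass
    return base_path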
Example #10
def process_metadata_from_xml(job):
    """
    Extract necessary metadata from XML when job report is not available.

    :param job: job object.
    :return: [updated job object - return not needed].
    """

    # get the metadata from the xml file instead, which must exist for most production transforms
    path = os.path.join(job.workdir, config.Payload.metadata)
    if os.path.exists(path):
        job.metadata = read_file(path)
    else:
        if not job.is_analysis() and job.transformation != 'Archive_tf.py':
            diagnostics = 'metadata does not exist: %s' % path
            logger.warning(diagnostics)
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                errors.NOPAYLOADMETADATA)
            job.piloterrorcode = errors.NOPAYLOADMETADATA
            job.piloterrordiag = diagnostics

    # add missing guids
    for dat in job.outdata:
        if not dat.guid:
            # try to read it from the metadata before the last resort of generating it
            metadata = None
            try:
                metadata = get_metadata_from_xml(job.workdir)
            except Exception as e:
                msg = "Exception caught while interpreting XML: %s (ignoring it, but guids must now be generated)" % e
                logger.warning(msg)
            if metadata:
                dat.guid = get_guid_from_xml(metadata, dat.lfn)
                logger.info('read guid for lfn=%s from xml: %s' %
                            (dat.lfn, dat.guid))
            else:
                dat.guid = get_guid()
                logger.info('generated guid for lfn=%s: %s' %
                            (dat.lfn, dat.guid))
Example #11
def process_job_report(job):
    """
    Process the job report produced by the payload/transform if it exists.
    Payload error codes and diagnostics, as well as payload metadata (for output files) and stageout type will be
    extracted. The stageout type is either "all" (i.e. stage-out both output and log files) or "log" (i.e. only log file
    will be staged out).
    Note: some fields might be experiment specific. A call to a user function is therefore also done.

    :param job: job dictionary will be updated by the function and several fields set.
    :return:
    """

    log = get_logger(job.jobid)

    # get the job report
    path = os.path.join(job.workdir, config.Payload.jobreport)
    if not os.path.exists(path):
        log.warning(
            'job report does not exist: %s (any missing output file guids must be generated)'
            % path)

        # get the metadata from the xml file instead, which must exist for most production transforms
        path = os.path.join(job.workdir, config.Payload.metadata)
        if os.path.exists(path):
            job.metadata = read_file(path)
        else:
            if not job.is_analysis() and job.transformation != 'Archive_tf.py':
                diagnostics = 'metadata does not exist: %s' % path
                log.warning(diagnostics)
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.NOPAYLOADMETADATA)
                job.piloterrorcode = errors.NOPAYLOADMETADATA
                job.piloterrordiag = diagnostics

        # add missing guids
        for dat in job.outdata:
            if not dat.guid:
                dat.guid = get_guid()
                log.warning('guid not set: generated guid=%s for lfn=%s' %
                            (dat.guid, dat.lfn))

    else:
        with open(path) as data_file:
            # compulsory field; the payload must produce a job report (see config file for file name), attach it to the
            # job object
            job.metadata = json.load(data_file)

            #
            update_job_data(job)

            # compulsory fields
            try:
                job.exitcode = job.metadata['exitCode']
            except Exception as e:
                log.warning(
                    'could not find compulsory payload exitCode in job report: %s (will be set to 0)'
                    % e)
                job.exitcode = 0
            else:
                log.info('extracted exit code from job report: %d' %
                         job.exitcode)
            try:
                job.exitmsg = job.metadata['exitMsg']
            except Exception as e:
                log.warning(
                    'could not find compulsory payload exitMsg in job report: %s '
                    '(will be set to empty string)' % e)
                job.exitmsg = ""
            else:
                # assign special payload error code
                if "got a SIGSEGV signal" in job.exitmsg:
                    diagnostics = 'Invalid memory reference or a segmentation fault in payload: %s (job report)' % \
                                  job.exitmsg
                    log.warning(diagnostics)
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                        errors.PAYLOADSIGSEGV)
                    job.piloterrorcode = errors.PAYLOADSIGSEGV
                    job.piloterrordiag = diagnostics
                else:
                    log.info('extracted exit message from job report: %s' %
                             job.exitmsg)
                    if job.exitmsg != 'OK':
                        job.exeerrordiag = job.exitmsg
                        job.exeerrorcode = job.exitcode

            if job.exitcode != 0:
                # get list with identified errors in job report
                job_report_errors = get_job_report_errors(job.metadata, log)

                # is it a bad_alloc failure?
                bad_alloc, diagnostics = is_bad_alloc(job_report_errors, log)
                if bad_alloc:
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                        errors.BADALLOC)
                    job.piloterrorcode = errors.BADALLOC
                    job.piloterrordiag = diagnostics
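
The helpers get_job_report_errors() and is_bad_alloc() used at the end of Example #11 are not shown. As a rough sketch under the assumption that the job report errors arrive as a list of strings, is_bad_alloc() would only need to scan them for a bad_alloc signature:

def is_bad_alloc(job_report_errors, log):
    """Hypothetical sketch: report whether any job report error mentions a bad_alloc failure."""
    for error in job_report_errors:
        if "bad_alloc" in error:
            log.warning("identified a bad_alloc failure: %s" % error)
            return True, error
    return False, ""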
Example #12
def perform_initial_payload_error_analysis(job, exit_code):
    """
    Perform an initial analysis of the payload.
    Singularity errors are caught here.

    :param job: job object.
    :param exit_code: exit code from payload execution.
    :return:
    """

    if exit_code != 0:
        logger.warning(
            'main payload execution returned non-zero exit code: %d',
            exit_code)

    # look for singularity errors (the exit code can be zero in this case)
    stderr = read_file(os.path.join(job.workdir, config.Payload.payloadstderr))
    exit_code = errors.resolve_transform_error(exit_code, stderr)

    if exit_code != 0:
        msg = ""
        if stderr != "":
            msg = errors.extract_stderr_error(stderr)
            if msg == "":
                # look for warning messages instead (might not be fatal so do not set UNRECOGNIZEDTRFSTDERR)
                msg = errors.extract_stderr_warning(stderr)
            #    fatal = False
            #else:
            #    fatal = True
            #if msg != "":  # redundant since resolve_transform_error is used above
            #    logger.warning("extracted message from stderr:\n%s", msg)
            #    exit_code = set_error_code_from_stderr(msg, fatal)

        if msg:
            msg = errors.format_diagnostics(exit_code, msg)

        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            exit_code, msg=msg)
        '''
        if exit_code != 0:
            if msg:
                msg = errors.format_diagnostics(exit_code, msg)
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg)
        else:
            if job.piloterrorcodes:
                logger.warning('error code(s) already set: %s', str(job.piloterrorcodes))
            else:
                # check if core dumps exist, if so remove them and return True
                if remove_core_dumps(job.workdir) and not job.debug:
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP)
                else:
                    logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)')
        '''
    else:
        logger.info('main payload execution returned zero exit code')

    # check if core dumps exist, if so remove them and return True
    if not job.debug:  # do not shorten these if-statements
        if remove_core_dumps(job.workdir):
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                errors.COREDUMP)
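
Example #12 relies on remove_core_dumps(), which is not shown on this page. A minimal sketch, assuming it simply deletes any core dump files found in the work directory and reports whether at least one was present; the real pilot helper may differ:

import logging
import os
import re

logger = logging.getLogger(__name__)


def remove_core_dumps(workdir):
    """Hypothetical sketch: remove core dump files ("core" or "core.<pid>") from workdir."""
    found = False
    try:
        for name in os.listdir(workdir):
            if name == 'core' or re.match(r'^core\.\d+$', name):
                os.remove(os.path.join(workdir, name))
                found = True
    except OSError as error:
        logger.warning('failed to clean up core dumps: %s', error)
    return found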