Example #1
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)
        workdirsize = get_directory_size(directory=job.workdir)

        # is the user dir within the allowed size limit?
        if workdirsize > maxwdirsize:
            exit_code = errors.USERDIRTOOLARGE
            diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                          (job.workdir, workdirsize, maxwdirsize)
            log.fatal(diagnostics)

            # dump a recursive listing of the work dir for debugging; use a
            # throwaway variable so the error code above is not clobbered
            cmd = 'ls -altrR %s' % job.workdir
            _, stdout, _ = execute(cmd, mute=True)
            log.info("%s:\n%s", cmd, stdout)

            # kill the job
            kill_processes(job.pid)

            # remove any lingering input files from the work dir
            lfns, guids = job.get_lfns_and_guids()
            if lfns:
                remove_files(job.workdir, lfns)

                # re-measure the size of the workdir since the value is stored below
                workdirsize = get_directory_size(directory=job.workdir)
        else:
            log.info(
                "size of work directory %s: %d B (within %d B limit)",
                job.workdir, workdirsize, maxwdirsize)

        # store the measured disk space (the max value will later be sent with the job metrics)
        if workdirsize > 0:
            job.add_workdir_size(workdirsize)
    else:
        log.warning(
            'skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics
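
A minimal sketch of how check_work_dir() might be driven from a monitoring loop. Only check_work_dir(), set_pilot_state() and errors.add_error_code() come from the examples on this page; the monitor function itself, the polling interval and the terminal-state test are illustrative assumptions.

import time

def monitor_work_dir_size(job, interval=60):
    # hypothetical driver: poll the work directory size until the job ends
    # (the interval and the terminal-state test are assumptions, not pilot code)
    while job.state not in ('finished', 'failed'):
        exit_code, diagnostics = check_work_dir(job)
        if exit_code != 0:
            # record the error on the job object and stop monitoring
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
            set_pilot_state(job=job, state='failed')
            return exit_code, diagnostics
        time.sleep(interval)
    return 0, ""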
Example #2
def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: None (the job object is updated in place).
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (
        job.jobid, time_stamp())
    logger.fatal(diagnostics)

    # dump process and work directory state for post-mortem debugging
    cmd = 'ps -fwu %s' % whoami()
    _, stdout, _ = execute(cmd, mute=True)
    logger.info("%s:\n%s", cmd, stdout)

    cmd = 'ls -ltr %s' % job.workdir
    _, stdout, _ = execute(cmd, mute=True)
    logger.info("%s:\n%s", cmd, stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    _, stdout, _ = execute(cmd, mute=True)
    logger.info("%s:\n%s", cmd, stdout)

    cmd = 'pstree -g -a'
    _, stdout, _ = execute(cmd, mute=True)
    logger.info("%s:\n%s", cmd, stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
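
For context, a sketch of the detection step that would precede kill_looping_job(): treat the job as looping when nothing under its work directory has been modified for longer than a given limit. The two-hour default and the os.walk scan are assumptions for illustration, not the pilot's exact algorithm.

import os
import time

def is_job_looping(job, limit=2 * 3600):
    # assumption: a job is "looping" if no file under job.workdir has been
    # modified within the last `limit` seconds
    latest = 0
    for root, _dirs, files in os.walk(job.workdir):
        for name in files:
            try:
                latest = max(latest, os.path.getmtime(os.path.join(root, name)))
            except OSError:
                # file may have been removed while scanning; ignore it
                continue
    return latest > 0 and (time.time() - latest) > limit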
Example #3
def check_payload_stdout(job):
    """
    Check the size of the payload stdout.

    :param job: job object.
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # get list of log files
    file_list = glob(os.path.join(job.workdir, 'log.*'))

    # is this a multi-trf job?
    n_jobs = job.jobparams.count("\n") + 1
    for _i in range(n_jobs):
        # get name of payload stdout file created by the pilot
        _stdout = config.Payload.payloadstdout
        if n_jobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

        # add the primary stdout file to the file list
        file_list.append(os.path.join(job.workdir, _stdout))

    tmp_list = glob(os.path.join(job.workdir, 'workDir/tmp.stdout.*'))
    if tmp_list:
        file_list += tmp_list
    logger.debug('file list=%s', file_list)

    # now loop over all files and check each individually (any large enough file will fail the job)
    for filename in file_list:

        logger.debug('check_payload_stdout: filename=%s', filename)
        if "job.log.tgz" in filename:
            logger.info("skipping file size check of file (%s) since it is a special log file", filename)
            continue

        if os.path.exists(filename):
            try:
                # get file size in bytes
                fsize = os.path.getsize(filename)
            except Exception as error:
                logger.warning("could not read file size of %s: %s", filename, error)
            else:
                # is the file too big?
                localsizelimit_stdout = get_local_size_limit_stdout()
                if fsize > localsizelimit_stdout:
                    exit_code = errors.STDOUTTOOBIG
                    diagnostics = "Payload stdout file too big: %d B (larger than limit %d B)" % \
                                  (fsize, localsizelimit_stdout)
                    logger.warning(diagnostics)

                    # kill the job
                    set_pilot_state(job=job, state="failed")
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                    kill_processes(job.pid)

                    # remove the payload stdout file after the log extracts have been created

                    # remove any lingering input files from the work dir; use a
                    # separate variable so exit_code still reports STDOUTTOOBIG
                    lfns, guids = job.get_lfns_and_guids()
                    if lfns:
                        ec = remove_files(job.workdir, lfns)
                        if ec != 0:
                            logger.warning('failed to remove all files')
                else:
                    logger.info("payload log (%s) within allowed size limit (%d B): %d B", os.path.basename(filename), localsizelimit_stdout, fsize)
        else:
            logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename)

    return exit_code, diagnostics
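
The checks on this page could be combined into a single monitoring pass along these lines; the ordering and the stop-at-first-failure policy are assumptions for illustration, not the pilot's actual monitor loop.

def run_size_checks(job):
    # sketch: run each size check in turn and stop at the first failure
    # (check_work_dir and check_payload_stdout are defined above)
    for check in (check_work_dir, check_payload_stdout):
        exit_code, diagnostics = check(job)
        if exit_code != 0:
            return exit_code, diagnostics
    return 0, ""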