コード例 #1
0
ファイル: data.py プロジェクト: ptrlv/pilot2
    def check_availablespace(self, files):
        """
        Verify that enough local space is available to stage in and run the job.

        Two checks are made on the summed ``filesize`` of all given files:
        first against the maximum allowed input size (a zero/empty limit
        disables this check), then against the free space reported for the
        current working directory.

        :param files: list of FileSpec objects (``lfn``, ``filesize`` and ``accessmode`` are read).
        :raise: SizeTooLarge if the total input size exceeds the allowed maximum,
                NoLocalSpace if the total input size exceeds the locally available space.
        """

        for f in files:
            self.logger.debug('lfn=%s filesize=%d accessmode=%s' %
                              (f.lfn, f.filesize, f.accessmode))

        maxinputsize = convert_mb_to_b(get_maximum_input_sizes())
        # sum() is a builtin on every Python version, whereas reduce() must be
        # imported from functools on Python 3
        totalsize = sum(f.filesize for f in files)

        # verify total filesize (a falsy maxinputsize means "no limit")
        if maxinputsize and totalsize > maxinputsize:
            error = "too many/too large input files (%s). total file size=%s B > maxinputsize=%s B" % \
                    (len(files), totalsize, maxinputsize)
            raise SizeTooLarge(error)

        self.logger.info(
            "total input file size=%s B within allowed limit=%s B (zero value means unlimited)"
            % (totalsize, maxinputsize))

        # get available space in the current working directory
        available_space = convert_mb_to_b(get_local_disk_space(os.getcwd()))
        self.logger.info("locally available space: %d B" % available_space)

        # are we within the limit?
        if totalsize > available_space:
            error = "not enough local space for staging input files and run the job (need %d B, but only have %d B)" % \
                    (totalsize, available_space)
            raise NoLocalSpace(error)
コード例 #2
0
ファイル: monitoring.py プロジェクト: ptrlv/pilot2
def get_max_allowed_work_dir_size(queuedata):
    """
    Work out the upper limit for the size of the work directory.

    Normally the limit is the maximum input size (converted from MB to B) with a
    10% grace margin on top. If obtaining/converting that value raises, the limit
    falls back to get_max_input_size() plus the configured local stdout size limit
    (config.Pilot.local_size_limit_stdout * 1024); no grace margin is added then.

    :param queuedata: job.infosys.queuedata object (not referenced in the function body).
    :return: max allowed work dir size in Bytes (int in the normal case).
    """

    try:
        # from MB to B, e.g. 16336 MB -> 17,129,537,536 B
        base_limit = convert_mb_to_b(get_maximum_input_sizes())
    except Exception as exc:
        input_limit = get_max_input_size()
        stdout_limit = config.Pilot.local_size_limit_stdout * 1024
        fallback_limit = input_limit + stdout_limit
        logger.info(
            "work directory size check will use %d B as a max limit (maxinputsize [%d B] + local size limit for"
            " stdout [%d B])" % (fallback_limit, input_limit, stdout_limit))
        logger.warning('conversion caught exception: %s' % exc)
        return fallback_limit

    # grace margin, as discussed in https://its.cern.ch/jira/browse/ATLASPANDA-482
    grace_percent = 10.0  # percent, read later from somewhere
    padded_limit = int(base_limit * (1 + grace_percent / 100.0))
    logger.info(
        "work directory size check will use %d B as a max limit (10%% grace limit added)"
        % padded_limit)

    return padded_limit
コード例 #3
0
ファイル: monitoring.py プロジェクト: mlassnig/pilot2
def check_local_space(initial=True):
    """
    Check whether the current working directory has enough free disk space for the job.

    The free space (converted from MB to B) must be strictly greater than the configured
    threshold: config.Pilot.free_space_limit for the initial check, or
    config.Pilot.free_space_limit_running for checks made while the job is running.

    :param initial: True selects the initial limit, False the running limit (optional Boolean).
    :return: tuple (pilot error code, diagnostics); (0, "") on success,
             (NOLOCALSPACE, message) when there is too little space.
    """

    workdir = os.getcwd()
    logger.debug('checking local space on %s', workdir)

    # get_local_disk_space() reports MB; compare in bytes
    free_bytes = convert_mb_to_b(get_local_disk_space(workdir))

    if initial:
        threshold = human2bytes(config.Pilot.free_space_limit)
    else:
        threshold = human2bytes(config.Pilot.free_space_limit_running)

    if free_bytes <= threshold:
        message = 'too little space left on local disk to run job: %d B (need > %d B)' %\
                  (free_bytes, threshold)
        code = errors.NOLOCALSPACE
        logger.warning(message)
        return code, message

    logger.info('sufficient remaining disk space (%d B)', free_bytes)
    return 0, ""
コード例 #4
0
ファイル: monitoring.py プロジェクト: ptrlv/pilot2
def check_local_space():
    """
    Check whether the current working directory has enough free disk space for the job.

    The free space (converted from MB to B) must be strictly greater than the
    threshold configured in config.Pilot.free_space_limit.

    :return: tuple (pilot error code, diagnostics); (0, "") on success,
             (NOLOCALSPACE, message) when there is too little space.
    """

    workdir = os.getcwd()
    logger.debug('checking local space on %s' % workdir)

    # get_local_disk_space() reports MB; compare in bytes
    free_mb = get_local_disk_space(workdir)
    free_bytes = convert_mb_to_b(free_mb)
    threshold = human2bytes(config.Pilot.free_space_limit)

    if free_bytes <= threshold:
        message = 'too little space left on local disk to run job: %d B (need > %d B)' %\
                  (free_bytes, threshold)
        code = errors.NOLOCALSPACE
        logger.warning(message)
        return code, message

    logger.info('sufficient remaining disk space (%d B)' % free_bytes)
    return 0, ""
コード例 #5
0
def interpret_payload_exit_info(job):
    """
    Interpret the exit info from the payload.

    A fixed sequence of checks is run against the job; the first one that matches
    decides the pilot error code, and no later check is evaluated. The result of
    errors.add_error_code() is stored in job.piloterrorcodes / job.piloterrordiags.
    If nothing matches and the exit codes disagree (transexitcode == 0 while
    exitcode != 0), a general unknown-payload-failure code is set instead.

    :param job: job object.
    :return:
    """

    def _register(code):
        # store the (codes, diags) pair returned for the given error code on the job
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            code, priority=True)

    if is_out_of_memory(job):
        # out of memory errors identified in the stderr
        _register(errors.PAYLOADOUTOFMEMORY)
    elif is_installation_error(job):
        # specific errors in the stdout (tail)
        _register(errors.MISSINGINSTALLATION)
    elif is_atlassetup_error(job):
        # AtlasSetup failed
        _register(errors.SETUPFATAL)
    elif is_out_of_space(job):
        # the payload ran out of space
        _register(errors.NOLOCALSPACE)

        # double check local space; only logged, the error code is kept regardless
        free_bytes = convert_mb_to_b(get_local_disk_space(
            os.getcwd()))  # B (diskspace is in MB)
        logger.info('verifying local space: %d B' % free_bytes)
    elif is_nfssqlite_locking_problem(job):
        # specific errors in the stdout (full)
        _register(errors.NFSSQLITE)
    elif is_user_code_missing(job):
        # the user tarball is missing on the server
        _register(errors.MISSINGUSERCODE)
    elif job.transexitcode == 0 and job.exitcode != 0:
        # general Pilot error code, since the payload error could not be identified
        _register(errors.UNKNOWNPAYLOADFAILURE)