Example #1
0
def get_looping_job_limit(job):
    """
    Get the time limit for looping job detection.

    The default limit is read from the pilot configuration, with separate defaults for
    production and user analysis jobs. The maxCpuCount value of the job may replace it:
    a maxCpuCount at or above the configured minimum limit is used as is, otherwise the
    larger of the default limit and maxCpuCount is used.

    :param job: job object.
    :return: looping job time limit in seconds (int).
    """

    log = get_logger(job.jobid)

    # select the default limit that corresponds to the job type
    if job.is_analysis():
        looping_limit = convert_to_int(config.Pilot.looping_limit_default_user,
                                       default=3 * 3600)
    else:
        looping_limit = convert_to_int(config.Pilot.looping_limit_default_prod,
                                       default=12 * 3600)

    # convert the configured minimum like the other config values, so that it is
    # never compared with an integer in its raw form
    looping_limit_min = convert_to_int(config.Pilot.looping_limit_min_default,
                                       default=2 * 3600)

    # an unset maxCpuCount (None or 0) must not take part in the comparisons below
    # (comparing None with an int raises TypeError on Python 3)
    maxcpucount = job.maxcpucount or 0

    if maxcpucount and maxcpucount >= looping_limit_min:
        # maxCpuCount is at least the minimum limit, so it is the larger of the two
        _looping_limit = maxcpucount
    else:
        _looping_limit = max(looping_limit, maxcpucount)

    if _looping_limit != looping_limit:
        log.info("task request has updated looping job limit from %d s to %d s using maxCpuCount",
                 looping_limit, _looping_limit)
        looping_limit = _looping_limit
    else:
        log.info("using standard looping job limit: %d s", looping_limit)

    return looping_limit
Example #2
0
def get_looping_job_limit():
    """
    Return the time limit used by the looping job detection.

    The configured default limit is used unless it is smaller than the configured
    minimum limit, in which case the minimum limit is returned instead.

    :return: looping job time limit in seconds (int).
    """

    default_limit = convert_to_int(config.Pilot.looping_limit_default, default=2 * 3600)
    minimum_limit = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600)

    # never go below the configured minimum
    limit = default_limit if default_limit >= minimum_limit else minimum_limit
    logger.info("using looping job limit: %d s", limit)

    return limit
Example #3
0
def verify_running_processes(current_time, mt, pid):
    """
    Verify the number of running processes.
    The function sets the environmental variable PILOT_MAXNPROC to the maximum number of found (child) processes
    corresponding to the main payload process id.
    The function does not return an error code (always returns exit code 0).

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param pid: payload process id (int).
    :return: exit code (int), error diagnostics (string).
    """

    process_verification_time = convert_to_int(config.Pilot.process_verification_time, default=300)
    # NOTE(review): ct_process is not updated in this function, so once the interval has
    # elapsed the check is performed on every call - confirm that this is intended
    if current_time - mt.get('ct_process') <= process_verification_time:
        return 0, ""

    # time to check the number of processes
    nproc = get_number_of_child_processes(pid)
    try:
        nproc_env = int(os.environ.get('PILOT_MAXNPROC', 0))
    except ValueError as error:
        # leave the unreadable value untouched and skip the bookkeeping for this round
        logger.warning('failed to convert PILOT_MAXNPROC to int: %s', error)
        return 0, ""

    maximum = nproc_env
    if nproc > nproc_env:
        # set the maximum number of found processes
        os.environ['PILOT_MAXNPROC'] = str(nproc)
        maximum = nproc

    # report the maximum that is now stored, not the value from before the update
    if maximum > 0:
        logger.info('maximum number of monitored processes: %d', maximum)

    return 0, ""
Example #4
0
def verify_user_proxy(current_time, mt):
    """
    Check that the user proxy is still valid.
    The check is delegated to the proxy module of the current pilot user (PILOT_USER) and is only
    performed when more than the configured proxy verification time has passed since ct_proxy.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :return: exit code (int), error diagnostics (string).
    """

    user = os.environ.get('PILOT_USER', 'generic').lower()
    userproxy = __import__('pilot.user.%s.proxy' % user, globals(), locals(), [user], 0)  # Python 2/3

    interval = convert_to_int(config.Pilot.proxy_verification_time, default=600)
    is_due = current_time - mt.get('ct_proxy') > interval
    if not is_due:
        # not yet time to verify the proxy
        return 0, ""

    # use test=True to test expired proxy
    exit_code, diagnostics = userproxy.verify_proxy(test=False)
    if exit_code != 0:
        return exit_code, diagnostics

    # the proxy is valid; remember when it was last verified
    mt.update('ct_proxy')

    return 0, ""
Example #5
0
def verify_looping_job(current_time, mt, job):
    """
    Check whether the job is looping.
    The looping job algorithm is only run when more than the configured looping verification
    time has passed since ct_looping. Exceptions raised by the algorithm are converted to
    an exit code.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    interval = convert_to_int(config.Pilot.looping_verification_time, default=600)
    is_due = current_time - mt.get('ct_looping') > interval
    if not is_due:
        return 0, ""

    try:
        exit_code, diagnostics = looping_job(job, mt)
    except Exception as error:
        diagnostics = 'exception caught in looping job algorithm: %s' % error
        logger.warning(diagnostics)
        # a missing module gets its own error code
        exit_code = errors.BLACKHOLE if "No module named" in diagnostics else errors.UNKNOWNEXCEPTION
        return exit_code, diagnostics

    if exit_code != 0:
        return exit_code, diagnostics

    # update the ct_looping with the current time
    mt.update('ct_looping')

    return 0, ""
Example #6
0
def verify_memory_usage(current_time, mt, job):
    """
    Check the memory usage of the payload (optional).
    Note: this function relies on a stand-alone memory monitor tool that may be executed by the Pilot.
    Failures of the memory check are only logged; the function always returns exit code 0.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    user = os.environ.get('PILOT_USER', 'generic').lower()
    memory = __import__('pilot.user.%s.memory' % user, globals(), locals(), [user], 0)  # Python 2/3

    # the user module may switch off the verification altogether
    if not memory.allow_memory_usage_verifications():
        return 0, ""

    interval = convert_to_int(config.Pilot.memory_usage_verification_time, default=60)
    is_due = current_time - mt.get('ct_memory') > interval
    if not is_due:
        return 0, ""

    # is the used memory within the allowed limit?
    try:
        exit_code, _ = memory.memory_usage(job)
    except Exception as error:
        logger.warning('caught exception: %s' % error)
        exit_code = -1

    if exit_code == 0:
        # update the ct_memory with the current time
        mt.update('ct_memory')
    else:
        # the failure is deliberately not propagated to the caller
        logger.warning('ignoring failure to parse memory monitor output')

    return 0, ""
Example #7
0
def convert_text_file_to_dictionary(path):
    """
    Convert row-column text file to dictionary.
    Use the first row identifiers as dictionary keys. Each key is mapped to the list of values
    found in the corresponding column (converted with convert_to_int()).
    Note: file must follow the convention (tab separated columns):
        NAME1   NAME2   ..
        value1  value2  ..
        ..      ..      ..

    Lines without any fields (e.g. blank lines) are skipped. A line that cannot be processed
    is reported with a warning and does not abort the conversion.

    :param path: path to file (string).
    :return: dictionary.
    """

    summary_keys = []  # column names, in the order they appear in the header
    header_locked = False
    dictionary = {}

    with open(path) as f:
        for line in f:
            line = convert_unicode_string(line)
            try:
                # remove empty entries from list (caused by multiple \t); the list
                # comprehension gives the same result on Python 2 and 3
                fields = [_f for _f in line.replace('\n', '').split('\t') if _f]
                if not fields:
                    # nothing on this line; not a format error
                    continue

                if not header_locked:
                    # the first non-empty line defines the dictionary keys
                    summary_keys = fields
                    for key in fields:
                        dictionary[key] = []
                    header_locked = True
                    continue

                # sort the values into the correct columns
                for key_entry, field in zip(summary_keys, fields):
                    dictionary[key_entry].append(convert_to_int(field))

                # a row that does not match the header is kept as far as possible but reported
                if len(fields) != len(summary_keys):
                    logger.warning("unexpected format of utility output: %s" %
                                   line)
            except Exception:
                logger.warning("unexpected format of utility output: %s" %
                               line)

    return dictionary
Example #8
0
def verify_disk_usage(current_time, mt, job):
    """
    Check the disk usage.
    The following checks are run in order and the first failure is returned to the caller:
    1) payload stdout size, 2) local space, 3) work directory size, 4) output file sizes.
    The checks are only run when more than the configured disk space verification time has
    passed since ct_diskspace.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    interval = convert_to_int(config.Pilot.disk_space_verification_time, default=300)
    is_due = current_time - mt.get('ct_diskspace') > interval
    if not is_due:
        return 0, ""

    checks = (
        lambda: check_payload_stdout(job),     # size of the payload stdout
        lambda: check_local_space(),           # enough local space left to keep running the job?
        lambda: check_work_dir(job),           # size of the workdir
        lambda: check_output_file_sizes(job),  # output file sizes
    )
    for check in checks:
        exit_code, diagnostics = check()
        if exit_code != 0:
            return exit_code, diagnostics

    # all checks passed; update the ct_diskspace with the current time
    mt.update('ct_diskspace')

    return 0, ""
Example #9
0
def should_abort_payload(current_time, mt):
    """
    Should the pilot abort the payload?
    In the case of Raythena, the Driver is monitoring the time to end jobs and may decide
    that the pilot should abort the payload. Internally, this is achieved by letting the Actors
    know it's time to end, and they in turn contact the pilot by placing a 'pilot_kill_payload' file
    in the run directory.

    The kill instruction file is looked for in the directory given by the PILOT_HOME environment
    variable. If PILOT_HOME is not set, no file can be located and the payload is not aborted.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :return: exit code (int), error diagnostics (string).
    """

    # is it time to look for the kill instruction file?
    killing_time = convert_to_int(config.Pilot.kill_instruction_time, default=600)
    if current_time - mt.get('ct_kill') <= killing_time:
        return 0, ""

    pilot_home = os.environ.get('PILOT_HOME')
    if pilot_home is None:
        # os.path.join() would raise TypeError for a missing directory
        logger.debug('PILOT_HOME is not set - cannot look for payload kill instruction file')
        return 0, ""

    path = os.path.join(pilot_home, config.Pilot.kill_instruction_filename)
    if os.path.exists(path):
        logger.info('pilot encountered payload kill instruction file - will abort payload')
        return errors.KILLPAYLOAD, ""  # note, this is not an error

    return 0, ""
Example #10
0
def get_average_summary_dictionary_prmon(path):
    """
    Loop over the memory monitor output file and create the averaged summary dictionary.

    prmon keys:
    'Time', 'nprocs', 'nthreads', 'pss', 'rchar', 'read_bytes', 'rss', 'rx_bytes',
    'rx_packets', 'stime', 'swap', 'tx_bytes', 'tx_packets', 'utime', 'vmem', 'wchar',
    'write_bytes', 'wtime'

    The function uses the first line in the output file to define the dictionary keys used
    later in the function. This means that any change in the format such as new columns
    will be handled automatically.

    The returned dictionary is empty if the file contains no header. Otherwise it has the
    keys "Max" (maxVMEM, maxPSS, maxRSS, maxSwap), "Avg" (avgVMEM, avgPSS, avgRSS, avgSwap)
    and "Other" (last non-zero values of rchar, wchar, read_bytes, write_bytes). A column that
    is missing or has no numerical values is reported with average and maximum 0.

    :param path: path to memory monitor txt output file (string).
    :return: summary dictionary.
    """

    dictionary = {}
    summary_keys = []  # column names, in the order they appear in the header
    header_locked = False
    with open(path) as f:
        for line in f:
            line = convert_unicode_string(line)
            try:
                # remove empty entries from list (caused by multiple \t); the list
                # comprehension gives the same result on Python 2 and 3
                fields = [_f for _f in line.replace('\n', '').split('\t') if _f]
                if not fields:
                    # nothing on this line; not a format error
                    continue

                if not header_locked:
                    # the first non-empty line defines the dictionary keys
                    summary_keys = fields
                    for key in fields:
                        dictionary[key] = []
                    header_locked = True
                    continue

                # sort the memory measurements into the correct columns
                for key_entry, field in zip(summary_keys, fields):
                    dictionary[key_entry].append(convert_to_int(field))

                # a row that does not match the header is kept as far as possible but reported
                if len(fields) != len(summary_keys):
                    logger.warning("unexpected format of utility output: %s" % line)
            except Exception:
                logger.warning("unexpected format of utility output: %s" % line)

    if not dictionary:
        return {}

    def is_number(value):
        """ Inline function used to remove any string or None values from data. """
        return value is not None and not isinstance(value, str)

    # calculate averages and maxima; a missing column is treated as an empty column,
    # and an empty column yields 0 for both numbers instead of raising an exception
    values = {}
    for key in ('vmem', 'pss', 'rss', 'swap'):
        value_list = [value for value in dictionary.get(key, []) if is_number(value)]
        if value_list:
            average = int(float(sum(value_list)) / float(len(value_list)))
            maximum = max(value_list)
        else:
            average = 0
            maximum = 0
        values[key] = {'avg': average, 'max': maximum}

    summary_dictionary = {
        "Max": {"maxVMEM": values['vmem']['max'], "maxPSS": values['pss']['max'],
                "maxRSS": values['rss']['max'], "maxSwap": values['swap']['max']},
        "Avg": {"avgVMEM": values['vmem']['avg'], "avgPSS": values['pss']['avg'],
                "avgRSS": values['rss']['avg'], "avgSwap": values['swap']['avg']},
        "Other": {},
    }

    # add the last of the rchar, .., values
    # warning: should read_bytes/write_bytes be reported as rbytes/wbytes?
    for key in ('rchar', 'wchar', 'read_bytes', 'write_bytes'):
        value = get_last_value(dictionary.get(key, None))
        if value:
            summary_dictionary["Other"][key] = value

    return summary_dictionary