Example #1
def get_postgetjob_time(job_id, args):
    """
    Return the post getjob time.

    :param job_id: job object.
    :param args: pilot arguments.
    :return: post getjob time measurement (int). In case of failure, return None.
    """

    time_measurement = None
    timing_constant = PILOT_POST_GETJOB

    if job_id in args.timing:
        # extract time measurements
        time_measurement_dictionary = args.timing.get(job_id, None)
        if time_measurement_dictionary:
            time_measurement = time_measurement_dictionary.get(
                timing_constant, None)

        if not time_measurement:
            log = get_logger(job_id)
            log.warning(
                'failed to extract time measurement %s from %s (no such key)' %
                (timing_constant, time_measurement_dictionary))

    return time_measurement
Example #2
def get_time_measurement(timing_constant, time_measurement_dictionary,
                         timing_dictionary, job_id):
    """
    Return a requested time measurement from the time measurement dictionary, read from the pilot timing file.

    :param timing_constant: timing constant (e.g. PILOT_MULTIJOB_START_TIME)
    :param time_measurement_dictionary: time measurement dictionary, extracted from pilot timing dictionary.
    :param timing_dictionary: full timing dictionary from pilot timing file.
    :param job_id: PanDA job id (string).
    :return: time measurement (float).
    """

    time_measurement = time_measurement_dictionary.get(timing_constant, None)
    if not time_measurement:
        # fall back to the wrapper ('0') or multi-job ('1') timing dictionary
        i = '0' if timing_constant == PILOT_START_TIME else '1'
        time_measurement_dictionary_0 = timing_dictionary.get(i, None)
        if time_measurement_dictionary_0:
            time_measurement = time_measurement_dictionary_0.get(
                timing_constant, None)
        else:
            log = get_logger(job_id)
            log.warning(
                'failed to extract time measurement %s from %s (no such key)' %
                (timing_constant, time_measurement_dictionary))

    return time_measurement
Example #3
    def get_events(self, req):
        """
        Get events
        """
        self.get_events_lock.acquire()

        resp = None
        try:
            log = get_logger(str(req.jobid), logger)

            if not req.num_ranges:
                # TODO: set num_ranges based on the core count
                req.num_ranges = 1

            data = {
                'pandaID': req.jobid,
                'jobsetID': req.jobsetid,
                'taskID': req.taskid,
                'nRanges': req.num_ranges
            }

            log.info("Downloading new event ranges: %s" % data)
            res = https.request(
                '{pandaserver}/server/panda/getEventRanges'.format(
                    pandaserver=config.Pilot.pandaserver),
                data=data)
            log.info("Downloaded event ranges: %s" % res)

            if str(res['StatusCode']) == '0':  # accept both int and string zero
                resp_attrs = {
                    'status': 0,
                    'content': res['eventRanges'],
                    'exception': None
                }
            else:
                resp_attrs = {
                    'status': res['StatusCode'],
                    'content': None,
                    'exception': exception.CommunicationFailure(
                        "Get events from panda returns non-zero value: %s" %
                        res['StatusCode'])
                }

            resp = CommunicationResponse(resp_attrs)
        except Exception as e:
            log.error("Failed to download event ranges: %s, %s" %
                      (e, traceback.format_exc()))
            resp_attrs = {
                'status': -1,
                'content': None,
                'exception': exception.UnknownException(
                    "Failed to get events: %s" % traceback.format_exc())
            }
            resp = CommunicationResponse(resp_attrs)
        finally:
            # always release the lock taken at the top of this method
            self.get_events_lock.release()

        return resp
Example #4
def timing_report(job_id, args):
    """
    Write a timing report to the job log and return relevant timing measurements.

    :param job_id: job id (string).
    :param args: pilot arguments.
    :return: time_getjob, time_stagein, time_payload, time_stageout, time_total_setup (all int, in seconds).
    """

    log = get_logger(job_id)

    # collect pilot timing data
    time_getjob = get_getjob_time(job_id, args)
    time_initial_setup = get_initial_setup_time(job_id, args)
    time_setup = get_setup_time(job_id, args)
    time_total_setup = time_initial_setup + time_setup
    time_stagein = get_stagein_time(job_id, args)
    time_payload = get_payload_execution_time(job_id, args)
    time_stageout = get_stageout_time(job_id, args)
    log.info('.' * 30)
    log.info('. Timing measurements:')
    log.info('. get job = %d s' % time_getjob)
    log.info('. initial setup = %d s' % time_initial_setup)
    log.info('. payload setup = %d s' % time_setup)
    log.info('. total setup = %d s' % time_total_setup)
    log.info('. stage-in = %d s' % time_stagein)
    log.info('. payload execution = %d s' % time_payload)
    log.info('. stage-out = %d s' % time_stageout)
    log.info('.' * 30)

    return time_getjob, time_stagein, time_payload, time_stageout, time_total_setup
Example #5
    def clean(self):
        """
        Clean temp produced files
        """
        job = self.get_job()
        log = get_logger(job.jobid, logger)

        for msg in self.__all_out_messages:
            if msg['status'] in ['failed', 'fatal']:
                pass  # do not remove the output of failed/fatal messages
            elif 'output' in msg:
                try:
                    log.info("Removing es premerge file: %s" % msg['output'])
                    os.remove(msg['output'])
                except Exception as e:
                    log.error("Failed to remove file(%s): %s" %
                              (msg['output'], str(e)))
        self.__queued_out_messages = []
        self.__last_stageout_time = None
        self.__all_out_messages = []

        if self.proc:
            self.proc.stop()
            while self.proc.is_alive():
                time.sleep(0.1)

        self.stop_communicator()
Example #6
def get_log_extracts(job, state):
    """
    Extract special warnings and other info from special logs.
    This function also discovers if the payload had any outbound connections.

    :param job: job object.
    :param state: job state (string).
    :return: log extracts (string).
    """

    log = get_logger(job.jobid)
    log.info("building log extracts (sent to the server as \'pilotLog\')")

    # did the job have any outbound connections?
    # look for the pandatracerlog.txt file, produced if the user payload attempted any outgoing connections
    extracts = get_panda_tracer_log(job)

    # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself
    _extracts = get_pilot_log_extracts(job)
    if _extracts != "":
        log.warning(
            'detected the following tail of warning/fatal messages in the pilot log:\n%s'
            % _extracts)
        if state == 'failed' or state == 'holding':
            extracts += _extracts

    # add extracts from payload logs
    # (see buildLogExtracts in Pilot 1)

    return extracts
Example #7
def get_panda_tracer_log(job):
    """
    Return the contents of the PanDA tracer log if it exists.
    This file will contain information about outbound connections.

    :param job: job object.
    :return: log extracts from pandatracerlog.txt (string).
    """

    extracts = ""
    log = get_logger(job.jobid)

    tracerlog = os.path.join(job.workdir, "pandatracerlog.txt")
    if os.path.exists(tracerlog):
        # only add if file is not empty
        if os.path.getsize(tracerlog) > 0:
            message = "PandaID=%s had outbound connections: " % (job.jobid)
            extracts += message
            message = read_file(tracerlog)
            extracts += message
            log.warning(message)
        else:
            log.info(
                "PanDA tracer log (%s) has zero size (no outbound connections detected)"
                % tracerlog)
    else:
        log.debug("PanDA tracer log does not exist: %s (ignoring)" % tracerlog)

    return extracts
Example #8
def verify_looping_job(current_time, mt, job):
    """
    Verify that the job is not looping.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    log = get_logger(job.jobid)

    looping_verification_time = convert_to_int(config.Pilot.looping_verification_time, default=600)
    if current_time - mt.get('ct_looping') > looping_verification_time:
        # is the job looping?
        try:
            exit_code, diagnostics = looping_job(job, mt)
        except Exception as e:
            diagnostics = 'exception caught in looping job algorithm: %s' % e
            log.warning(diagnostics)
            if "No module named" in diagnostics:
                exit_code = errors.BLACKHOLE
            else:
                exit_code = errors.UNKNOWNEXCEPTION
            return exit_code, diagnostics
        else:
            if exit_code != 0:
                return exit_code, diagnostics

        # update ct_looping with the current time
        mt.update('ct_looping')

    return 0, ""
Example #9
def check_output_file_sizes(job):
    """
    Are the output files within the allowed size limits?

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    # loop over all known output files
    for fspec in job.outdata:
        path = os.path.join(job.workdir, fspec.lfn)
        if os.path.exists(path):
            # get the current file size
            fsize = get_local_file_size(path)
            max_fsize = human2bytes(config.Pilot.maximum_output_file_size)
            if fsize and fsize < max_fsize:
                log.info('output file %s is within allowed size limit (%d B < %d B)' % (path, fsize, max_fsize))
            elif fsize:
                exit_code = errors.OUTPUTFILETOOLARGE
                diagnostics = 'output file %s is not within allowed size limit (%d B > %d B)' % (path, fsize, max_fsize)
                log.warning(diagnostics)
            else:
                log.warning('could not determine the size of output file %s' % path)
        else:
            log.info('output file size check: skipping output file %s since it does not exist' % path)

    return exit_code, diagnostics
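
human2bytes() is assumed to translate a human-readable size from the config (e.g. '15 GB') into a byte count; a rough sketch under that assumption (unit handling is illustrative only):

def human2bytes(size):
    # illustrative: parse strings like '500 MB' or '15 GB' into bytes
    units = {'B': 1, 'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'TB': 1024 ** 4}
    number, unit = size.split()
    return int(float(number) * units[unit.upper()])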
Example #10
def get_job_metrics(job):
    """
    Return a properly formatted job metrics string.
    Job metrics are highly user specific, so this function merely calls a corresponding get_job_metrics() in the
    user code. The format of the job metrics string is defined by the server. It will be reported to the server during
    updateJob.

    Example of job metrics:
    Number of events read | Number of events written | vmPeak maximum | vmPeak average | RSS average | ..
    Format: nEvents=<int> nEventsW=<int> vmPeakMax=<int> vmPeakMean=<int> RSSMean=<int> hs06=<float> shutdownTime=<int>
            cpuFactor=<float> cpuLimit=<float> diskLimit=<float> jobStart=<int> memLimit=<int> runLimit=<float>

    :param job: job object.
    :return: job metrics (string).
    """

    log = get_logger(job.jobid)

    user = environ.get('PILOT_USER', 'generic').lower()  # TODO: replace with singleton
    try:
        job_metrics_module = __import__('pilot.user.%s.jobmetrics' % user,
                                        globals(), locals(), [user],
                                        0)  # Python 2/3
    except AttributeError as e:
        job_metrics = None
        log.warning('function not implemented in jobmetrics module: %s' % e)
    else:
        job_metrics = job_metrics_module.get_job_metrics(job)

    return job_metrics
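
To make the documented format concrete, a user-level get_job_metrics() might assemble the string roughly as below (a sketch: which fields are available depends on the user module, and build_job_metrics is a hypothetical name):

def build_job_metrics(job):
    # illustrative: space-separated key=value pairs in the server-defined format
    metrics = ''
    if job.nevents:
        metrics += 'nEvents=%d ' % job.nevents
    if job.neventsw:
        metrics += 'nEventsW=%d ' % job.neventsw
    return metrics.strip()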
Example #11
    def utility_after_payload_started(self, job):
        """
        Functions to run after payload started
        :param job: job object
        """
        log = get_logger(job.jobid, logger)

        # get the payload command from the user specific code
        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], -1)

        # should any additional commands be executed after the payload?
        cmds = user.get_utility_commands_list(order=UTILITY_AFTER_PAYLOAD_STARTED)
        if cmds != []:
            for utcmd in cmds:
                log.info('utility command to be executed after the payload: %s' % utcmd)

                # how should this command be executed?
                utilitycommand = user.get_utility_command_setup(utcmd, job)
                if not utilitycommand:
                    log.warning('empty utility command - nothing to run')
                    return
                try:
                    proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True,
                                    usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir,
                                    job=job)
                except Exception as e:
                    log.error('could not execute: %s' % e)
                else:
                    # store process handle in job object, and keep track on how many times the command has been launched
                    # also store the full command in case it needs to be restarted later (by the job_monitor() thread)
                    job.utilities[utcmd] = [proc1, 1, utilitycommand]
Example #12
def find_number_of_events(job):
    """
    Locate the number of events.

    :param job: job object.
    :return:
    """

    log = get_logger(job.jobid)

    if job.nevents:
        log.info('number of events already known: %d' % job.nevents)
        return

    log.info('looking for number of processed events (source #1: jobReport.json)')
    find_number_of_events_in_jobreport(job)
    if job.nevents > 0:
        log.info('found %d processed events' % job.nevents)
        return

    log.info('looking for number of processed events (source #2: metadata.xml)')
    find_number_of_events_in_xml(job)
    if job.nevents > 0:
        log.info('found %d processed events' % job.nevents)
        return

    log.info('looking for number of processed events (source #3: athena summary file(s))')
    n1, n2 = process_athena_summary(job)
    if n1 > 0:
        job.nevents = n1
        log.info('found %d processed (read) events' % job.nevents)
    if n2 > 0:
        job.neventsw = n2
        log.info('found %d processed (written) events' % job.neventsw)
Example #13
def get_time_since(job_id, timing_constant, args):
    """
    Return the amount of time that has passed since the time measurement of timing_constant.

    :param job_id: PanDA job id (string).
    :param timing_constant: timing constant (e.g. PILOT_MULTIJOB_START_TIME).
    :param args: pilot arguments.
    :return: time in seconds (int).
    """

    diff = 0
    log = get_logger(job_id)

    if job_id in args.timing:

        # extract time measurements
        time_measurement_dictionary = args.timing.get(job_id, None)
        if time_measurement_dictionary:
            time_measurement = get_time_measurement(timing_constant, time_measurement_dictionary,
                                                    args.timing, job_id)
            if time_measurement:
                diff = time.time() - time_measurement
        else:
            log.warning('failed to extract time measurement dictionary from %s' % str(args.timing))
    else:
        log.warning('job id %s not found in timing dictionary' % job_id)

    return diff
Example #14
def perform_initial_payload_error_analysis(job, exit_code):
    """
    Perform an initial error analysis of the payload exit code and stderr.
    Singularity errors are caught here.

    :param job: job object.
    :param exit_code: exit code from payload execution.
    :return:
    """

    log = get_logger(job.jobid, logger)

    if exit_code != 0:
        log.warning('main payload execution returned non-zero exit code: %d' %
                    exit_code)
        stderr = read_file(
            os.path.join(job.workdir, config.Payload.payloadstderr))
        if stderr != "":
            msg = errors.extract_stderr_msg(stderr)
            if msg != "":
                log.warning("extracted message from stderr:\n%s" % msg)
        ec = errors.resolve_transform_error(exit_code, stderr)
        if ec != 0:
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                ec)
        else:
            if job.piloterrorcodes:
                log.warning('error code(s) already set: %s' %
                            str(job.piloterrorcodes))
            else:
                log.warning('initial error analysis did not resolve the issue')
    else:
        log.info(
            'main payload execution returned zero exit code, but will check it more carefully'
        )
Example #15
def _do_stageout(job, xdata, activity, title):
    """
    Use the `StageOutClient` in the Data API to perform stage-out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param activity: copytool activity or preferred list of activities to resolve copytools.
    :param title: type of stage-out (output, log) (string).
    :return: True if all transfers succeeded, False otherwise (Boolean).
    """

    log = get_logger(job.jobid)
    log.info('prepare to stage-out %d %s file(s)' % (len(xdata), title))

    event_type = "put_sm"
    #if log_transfer:
    #    eventType += '_logs'
    #if special_log_transfer:
    #    eventType += '_logs_os'
    if job.is_analysis():
        event_type += "_a"
    rse = get_rse(xdata)
    localsite = remotesite = rse
    trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), localSite=localsite, remoteSite=remotesite, dataset="", eventType=event_type)
    trace_report.init(job)

    try:
        client = StageOutClient(job.infosys, logger=log, trace_report=trace_report)
        kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job)  #, mode='stage-out')
        # prod analy unification: use destination preferences from PanDA server for unified queues
        if job.infosys.queuedata.type != 'unified':
            client.prepare_destinations(xdata, activity)  ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow)
        client.transfer(xdata, activity, **kwargs)
    except PilotException as error:
        import traceback
        error_msg = traceback.format_exc()
        log.error(error_msg)
        msg = errors.format_diagnostics(error.get_error_code(), error_msg)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg)
    except Exception:
        import traceback
        log.error(traceback.format_exc())
        # do not raise the exception since that will prevent also the log from being staged out
        # error = PilotException("stageOut failed with error=%s" % e, code=ErrorCodes.STAGEOUTFAILED)
    else:
        log.debug('stage-out client completed')

    log.info('summary of transferred files:')
    for e in xdata:
        if not e.status:
            status = "(not transferred)"
        else:
            status = e.status
        log.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status))

    remain_files = [e for e in xdata if e.status not in ['transferred']]
    log.debug('remain_files=%s' % str(remain_files))
    log.debug('xdata=%s' % str(xdata))

    return not remain_files
Example #16
    def stageout_es_real(self, output_file):
        """
        Stage out event service output file.

        :param output_file: output file name.
        """
        job = self.get_job()
        log = get_logger(job.jobid, logger)
        log.info('prepare to stage-out eventservice files')

        error = None
        try:
            file_data = {
                'scope': 'transient',
                'lfn': os.path.basename(output_file),
            }
            file_spec = FileSpec(filetype='output', **file_data)
            xdata = [file_spec]
            client = StageOutESClient(job.infosys, logger=log)
            kwargs = dict(workdir=job.workdir,
                          cwd=job.workdir,
                          usecontainer=False,
                          job=job)
            client.transfer(xdata, activity=['es_events', 'pw'], **kwargs)
        except exception.PilotException as error:
            log.error(error.get_detail())
Example #17
def validate_post(queues, traces, args):
    """
    Validate finished payloads.
    If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as
    well but only for log stage-out (in failed_post() below).

    :param queues: queues object.
    :param traces: traces object.
    :param args: pilot arguments.
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # finished payloads
        try:
            job = queues.finished_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        log = get_logger(job.jobid, logger)

        # by default, both output and log should be staged out
        job.stageout = 'all'
        log.debug('adding job to data_out queue')
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    logger.info('[payload] validate_post thread has finished')
Example #18
def failed_post(queues, traces, args):
    """
    Stage out only the log file for failed payloads.
    Takes failed payloads from the failed_payloads queue and adds the job to the data_out queue with stageout='log'.

    :param queues: queues object.
    :param traces: traces object.
    :param args: pilot arguments.
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # failed payloads
        try:
            job = queues.failed_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        log = get_logger(job.jobid, logger)

        log.debug('adding log for log stageout')

        job.stageout = 'log'  # only stage-out log file
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    logger.info('[payload] failed_post thread has finished')
Example #19
def post_memory_monitor_action(job):
    """
    Perform post action items for memory monitor.

    :param job: job object.
    :return:
    """

    log = get_logger(job.jobid)

    nap = 3
    path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename())
    path2 = os.environ.get('PILOT_HOME')
    i = 0
    maxretry = 20
    while i <= maxretry:
        if os.path.exists(path1):
            break
        log.info("taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)"
                 % (nap, i, maxretry))
        time.sleep(nap)
        i += 1

    try:
        copy(path1, path2)
    except Exception as e:
        log.warning('failed to copy memory monitor output: %s' % e)
Example #20
def _validate_payload(job):
    """
    Perform validation tests for the payload.

    :param job: job object.
    :return: boolean.
    """

    status = True

    log = get_logger(job.jobid, logger)
    log.debug('PandaID = %s' % os.environ.get('PandaID', 'unknown'))
    log.debug('PanDA_TaskID = %s' % os.environ.get('PanDA_TaskID', 'unknown'))

    # perform user specific validation
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(),
                      [pilot_user], 0)  # Python 2/3
    try:
        status = user.validate(job)
    except Exception as e:
        log.fatal('failed to execute user validate() function: %s' % e)
        status = False

    return status
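
The __import__ call above implements a per-experiment plugin lookup; the same lookup can be written with importlib, which is equivalent for absolute imports and arguably clearer:

import importlib
import os

pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
user = importlib.import_module('pilot.user.%s.common' % pilot_user)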
Example #21
def get_looping_job_limit(job):
    """
    Get the time limit for looping job detection.

    :param job: job object.
    :return: looping job time limit (int).
    """

    log = get_logger(job.jobid)

    is_analysis = job.is_analysis()
    looping_limit = convert_to_int(config.Pilot.looping_limit_default_prod,
                                   default=12 * 3600)
    if is_analysis:
        looping_limit = convert_to_int(config.Pilot.looping_limit_default_user,
                                       default=3 * 3600)

    # the config value is a string; convert it before comparing (the 2 h default is an assumption)
    looping_limit_min = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600)
    if job.maxcpucount and job.maxcpucount >= looping_limit_min:
        _looping_limit = max(looping_limit_min, job.maxcpucount)
    else:
        _looping_limit = max(looping_limit, job.maxcpucount)

    if _looping_limit != looping_limit:
        log.info(
            "task request has updated looping job limit from %d s to %d s using maxCpuCount"
            % (looping_limit, _looping_limit))
        looping_limit = _looping_limit
    else:
        log.info("using standard looping job limit: %d s" % looping_limit)

    return looping_limit
Example #22
    def wait_graceful(self, args, proc, job):
        """
        Wait for the work executor process to finish, stopping it if graceful_stop is set.

        :param args: pilot arguments.
        :param proc: work executor process.
        :param job: job object.
        :return: exit code (int).
        """

        log = get_logger(job.jobid, logger)

        t1 = time.time()
        while proc.is_alive():
            if args.graceful_stop.is_set():
                log.debug("Graceful stop is set, stopping work executor")
                proc.stop()
                break
            if time.time() > t1 + 300:  # 5 minutes
                log.info("Process is still running")
                t1 = time.time()
            time.sleep(2)

        while proc.is_alive():
            time.sleep(2)
        exit_code = proc.get_exit_code()
        return exit_code
Example #23
def get_core_count(job):
    """
    Return the core count from ATHENA_PROC_NUMBER.

    :param job: job object.
    :return: core count (int).
    """

    log = get_logger(job.jobid)

    if "HPC_HPC" in job.infosys.queuedata.catchall:
        if job.corecount is None:
            job.corecount = 0
    else:
        if job.corecount:
            # Always use the ATHENA_PROC_NUMBER first, if set
            if 'ATHENA_PROC_NUMBER' in os.environ:
                try:
                    job.corecount = int(os.environ.get('ATHENA_PROC_NUMBER'))
                except Exception as e:
                    log.warning(
                        "ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.corecount value)"
                        % e)
        else:
            try:
                job.corecount = int(os.environ.get('ATHENA_PROC_NUMBER'))
            except Exception:
                log.warning(
                    "environment variable ATHENA_PROC_NUMBER is not set. corecount is not set"
                )

    return job.corecount
Example #24
def is_out_of_memory(job):
    """
    Did the payload run out of memory?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    out_of_memory = False
    log = get_logger(job.jobid)

    stdout = os.path.join(job.workdir, config.Payload.payloadstdout)
    stderr = os.path.join(job.workdir, config.Payload.payloadstderr)

    files = {stderr: ["FATAL out of memory: taking the application down"], stdout: ["St9bad_alloc", "std::bad_alloc"]}
    for path in files:
        if os.path.exists(path):
            log.info('looking for out-of-memory errors in %s' % os.path.basename(path))
            if os.path.getsize(path) > 0:
                matched_lines = grep(files[path], path)
                if len(matched_lines) > 0:
                    log.warning("identified an out of memory error in %s %s:" % (job.payload, os.path.basename(path)))
                    for line in matched_lines:
                        log.info(line)
                    out_of_memory = True
        else:
            log.warning('file does not exist: %s (cannot look for out-of-memory error in it)' % path)

    return out_of_memory
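
The grep() helper is assumed to return the lines of a file that match any of the given patterns; a simple sketch under that assumption:

import re

def grep(patterns, path):
    # return every line in the file at path that matches any of the patterns
    compiled = [re.compile(pattern) for pattern in patterns]
    matched_lines = []
    with open(path) as _file:
        for line in _file:
            if any(regex.search(line) for regex in compiled):
                matched_lines.append(line.rstrip('\n'))
    return matched_lines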
Example #25
def abort_jobs_in_queues(queues, sig):
    """
    Find all jobs in the queues and abort them.

    :param queues: queues object.
    :param sig: detected kill signal.
    :return:
    """

    jobs_list = []

    # loop over all queues and find all jobs
    for q in queues._fields:
        _q = getattr(queues, q)
        jobs = list(_q.queue)
        for job in jobs:
            if job not in jobs_list:
                jobs_list.append(job)

    logger.info('found %d job(s) in %d queues' %
                (len(jobs_list), len(queues._fields)))
    for job in jobs_list:
        log = get_logger(job.jobid)
        log.info('aborting job %s' % job.jobid)
        declare_failed_by_kill(job, queues.failed_jobs, sig)
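
The queues._fields loop works because queues is presumably a collections.namedtuple whose fields are queue.Queue objects; a minimal sketch of that assumed structure:

import queue
from collections import namedtuple

# assumed layout: every field holds a queue.Queue of job objects
PilotQueues = namedtuple('PilotQueues', ['jobs', 'data_out', 'failed_jobs'])
queues = PilotQueues(jobs=queue.Queue(), data_out=queue.Queue(), failed_jobs=queue.Queue())

for q in queues._fields:
    print(q, list(getattr(queues, q).queue))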
Example #26
    def run_payload(self, job, out, err):
        """
        Set up and start the event service payload in a WorkExecutor.

        :param job: job object.
        :param out: stdout file object.
        :param err: stderr file object.
        :return: WorkExecutor instance, or None in case of failure.
        """
        log = get_logger(job.jobid, logger)

        self.pre_setup(job)

        # get the payload command from the user specific code
        pilot_user = os.environ.get('PILOT_USER', 'atlas').lower()
        user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                          locals(), [pilot_user], -1)

        self.post_setup(job)

        self.utility_before_payload(job)

        self.utility_with_payload(job)

        try:
            executable = user.get_payload_command(job)
        except exception.PilotException as e:
            log.fatal('could not define payload command: %s' % e)
            return None

        log.info("payload execution command: %s" % executable)

        try:
            payload = {
                'executable': executable,
                'workdir': job.workdir,
                'output_file': out,
                'error_file': err,
                'job': job
            }
            log.debug("payload: %s" % payload)

            log.info("Starting EventService WorkExecutor")
            executor = WorkExecutor()
            executor.set_payload(payload)
            executor.start()
            log.info("EventService WorkExecutor started")

            log.info("ESProcess started with pid: %s" % executor.get_pid())
            job.pid = executor.get_pid()
            if job.pid:
                job.pgrp = os.getpgid(job.pid)

            self.utility_after_payload_started(job)
        except Exception as e:
            log.error('could not execute: %s' % str(e))
            return None

        return executor
Example #27
def create_log(job, logfile, tarball_name):
    """

    :param job:
    :param logfile:
    :param tarball_name:
    :raises LogFileCreationFailure: in case of log file creation problem
    :return:
    """

    log = get_logger(job.jobid)
    log.debug('preparing to create log file')

    # perform special cleanup (user specific) prior to log file creation
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(),
                      [pilot_user], -1)
    user.remove_redundant_files(job.workdir)

    input_files = [e.lfn for e in job.indata]
    output_files = [e.lfn for e in job.outdata]

    # remove any present input/output files before tarring up workdir
    for f in input_files + output_files:
        path = os.path.join(job.workdir, f)
        if os.path.exists(path):
            log.info('removing file: %s' % path)
            remove(path)

    # rename the workdir for the tarball creation
    newworkdir = os.path.join(os.path.dirname(job.workdir), tarball_name)
    orgworkdir = job.workdir
    log.debug('renaming %s to %s' % (job.workdir, newworkdir))
    os.rename(job.workdir, newworkdir)
    job.workdir = newworkdir

    fullpath = os.path.join(job.workdir,
                            logfile.lfn)  # /some/path/to/dirname/log.tgz

    log.info('will create archive %s' % fullpath)
    try:
        #newdirnm = "tarball_PandaJob_%s" % job.jobid
        #tarballnm = "%s.tar.gz" % newdirnm
        #os.rename(job.workdir, newdirnm)
        cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" % (
            fullpath, tarball_name)
        exit_code, stdout, stderr = execute(cmd)
        #with closing(tarfile.open(name=fullpath, mode='w:gz', dereference=True)) as archive:
        #    archive.add(os.path.basename(job.workdir), recursive=True)
    except Exception as e:
        raise LogFileCreationFailure(e)
    else:
        log.debug('stdout = %s' % stdout)
    log.debug('renaming %s back to %s' % (job.workdir, orgworkdir))
    try:
        os.rename(job.workdir, orgworkdir)
    except Exception as e:
        log.debug('exception caught: %s' % e)
    job.workdir = orgworkdir
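
The commented-out lines in create_log() hint at a pure-Python alternative to shelling out to tar; a minimal sketch of that approach (assuming, as the tar flags suggest, that symlinks should be dereferenced):

import os
import tarfile
from contextlib import closing

def create_tarball(fullpath, workdir):
    # gzipped tarball of workdir, following symlinks (dereference=True);
    # note: adding by basename assumes the cwd is the parent of workdir
    with closing(tarfile.open(name=fullpath, mode='w:gz', dereference=True)) as archive:
        archive.add(os.path.basename(workdir), recursive=True)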
Example #28
def get_time_for_last_touch(job, mt, looping_limit):
    """
    Return the time when the files in the workdir were last touched.
    In case no file was touched since the last check, the returned value will be the same as the previous time.

    :param job: job object.
    :param mt: `MonitoringTime` object.
    :param looping_limit: looping limit in seconds.
    :return: time in seconds since epoch (int) (or None in case of failure).
    """

    log = get_logger(job.jobid)

    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    loopingjob_definitions = __import__(
        'pilot.user.%s.loopingjob_definitions' % pilot_user, globals(),
        locals(), [pilot_user], -1)

    # locate all files that were modified the last N minutes
    cmd = "find %s -mmin -%d" % (job.workdir, int(looping_limit / 60))
    exit_code, stdout, stderr = execute(cmd)
    if exit_code == 0:
        if stdout != "":
            files = stdout.split(
                "\n")  # find might add a \n even for single entries

            # remove unwanted list items (*.py, *.pyc, workdir, ...)
            files = loopingjob_definitions.remove_unwanted_files(
                job.workdir, files)
            if files:
                log.info('found %d files that were recently updated' %
                         len(files))

                updated_files = verify_file_list(files)

                # now get the mod times for these files and identify the most recently updated one
                latest_modified_file, mtime = find_latest_modified_file(
                    updated_files)
                if latest_modified_file:
                    log.info(
                        "file %s is the most recently updated file (at time=%d)"
                        % (latest_modified_file, mtime))
                else:
                    log.warning(
                        'looping job algorithm failed to identify latest updated file'
                    )
                    return mt.ct_looping_last_touched

                # store the time of the last file modification
                mt.update('ct_looping_last_touched', modtime=mtime)
            else:
                log.warning("found no recently updated files!")
        else:
            log.warning('found no recently updated files')
    else:
        log.warning('find command failed: %d, %s, %s' %
                    (exit_code, stdout, stderr))

    return mt.ct_looping_last_touched
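
For reference, a rough pure-Python equivalent of the `find %s -mmin -%d` shell-out used above (a sketch, not the pilot's implementation):

import os
import time

def find_recently_modified(workdir, looping_limit):
    # return all files under workdir modified within the last looping_limit seconds
    cutoff = time.time() - looping_limit
    recent = []
    for root, _, names in os.walk(workdir):
        for name in names:
            path = os.path.join(root, name)
            try:
                if os.path.getmtime(path) > cutoff:
                    recent.append(path)
            except OSError:
                pass  # the file disappeared between listing and stat
    return recent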
Example #29
def get_time_difference(job_id, timing_constant_1, timing_constant_2, args):
    """
    Return the positive time difference between the given constants.
    The order is not important; a positive difference is always returned. The function collects the time measurements
    corresponding to the given timing constants from the pilot timing file.
    The job_id is used internally as a dictionary key. The given timing constants, and their time measurements, belong
    to the given job_id.
    Structure of the pilot timing dictionary:
        { job_id: { <timing_constant_1>: <time measurement in seconds since epoch>, .. }, .. }
    job_id = 0 means timing information from the wrapper. Timing constants are defined in pilot.util.constants.
    Time measurements are time.time() values. The float value will be converted to an int as a last step.

    :param job_id: PanDA job id (string).
    :param timing_constant_1: first timing constant (e.g. PILOT_PRE_GETJOB).
    :param timing_constant_2: second timing constant (e.g. PILOT_POST_GETJOB).
    :param args: pilot arguments.
    :return: time difference in seconds (int).
    """

    diff = 0

    log = get_logger(job_id)

    if job_id in args.timing:

        # extract time measurements
        time_measurement_dictionary = args.timing.get(job_id, None)
        if time_measurement_dictionary:

            time_measurement_1 = get_time_measurement(
                timing_constant_1, time_measurement_dictionary, args.timing,
                job_id)
            time_measurement_2 = get_time_measurement(
                timing_constant_2, time_measurement_dictionary, args.timing,
                job_id)

            if time_measurement_1 and time_measurement_2:
                diff = time_measurement_2 - time_measurement_1
        else:
            log.warning(
                'failed to extract time measurement dictionary from %s' %
                str(args.timing))
    else:
        log.warning('job id %s not found in timing dictionary' % job_id)

    # always return a positive number
    if diff < 0:
        diff = -diff

    # convert to int as a last step
    try:
        diff = int(diff)
    except Exception as e:
        log.warning('failed to convert %s to int: %s (will reset to 0)' %
                    (diff, e))
        diff = 0

    return diff
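
A small illustration of the timing dictionary structure described in the docstring (the keys mirror the pilot's timing constants, but all values here are made up):

import time

# hypothetical pilot timing dictionary; job id '0' holds wrapper timing
timing = {
    '0': {'PILOT_START_TIME': 1550000000.0},
    '1234567': {'PILOT_PRE_GETJOB': 1550000010.5, 'PILOT_POST_GETJOB': 1550000012.0},
}

t1 = timing['1234567']['PILOT_PRE_GETJOB']
t2 = timing['1234567']['PILOT_POST_GETJOB']
print(int(abs(t2 - t1)))  # -> 1, i.e. the getjob step took about a second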
Example #30
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)

        if os.path.exists(job.workdir):
            workdirsize = get_directory_size(directory=job.workdir)

            # is user dir within allowed size limit?
            if workdirsize > maxwdirsize:
                exit_code = errors.USERDIRTOOLARGE
                diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                              (job.workdir, workdirsize, maxwdirsize)
                log.fatal("%s" % diagnostics)

                cmd = 'ls -altrR %s' % job.workdir
                _ec, stdout, stderr = execute(cmd, mute=True)  # do not clobber the USERDIRTOOLARGE exit code
                log.info("%s: %s" % (cmd + '\n', stdout))

                # kill the job
                # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED")
                kill_processes(job.pid)

                # remove any lingering input files from the work dir
                lfns, guids = job.get_lfns_and_guids()
                if lfns:
                    remove_files(job.workdir, lfns)

                    # remeasure the size of the workdir at this point since the value is stored below
                    workdirsize = get_directory_size(directory=job.workdir)
            else:
                log.info(
                    "size of work directory %s: %d B (within %d B limit)" %
                    (job.workdir, workdirsize, maxwdirsize))

            # Store the measured disk space (the max value will later be sent with the job metrics)
            if workdirsize > 0:
                job.add_workdir_size(workdirsize)
        else:
            log.warning('job work dir does not exist: %s' % job.workdir)
    else:
        log.warning(
            'skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics
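
get_directory_size() is assumed to return the workdir size in bytes; a rough pure-Python sketch of such a helper (the real pilot may instead shell out to du):

import os

def get_directory_size(directory):
    # sum the sizes of all regular files below directory (in bytes)
    total = 0
    for root, _, names in os.walk(directory):
        for name in names:
            path = os.path.join(root, name)
            if os.path.isfile(path):
                total += os.path.getsize(path)
    return total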