Example #1
def declare_output(job, work_report, worker_stageout_declaration):
    out_file_report = {}
    out_file_report[job.jobid] = []
    for outfile in job.output_files.keys():
        logger.debug(
            "File {} will be checked and declared for stage out".format(
                outfile))
        if os.path.exists(outfile):
            file_desc = {}
            if outfile == job.log_file:
                file_desc['filetype'] = 'log'
            else:
                file_desc['filetype'] = 'output'
            file_desc['path'] = os.path.abspath(outfile)
            file_desc['fsize'] = os.path.getsize(outfile)
            if 'guid' in job.output_files[outfile].keys():
                file_desc['guid'] = job.output_files[outfile]['guid']
            elif (work_report.get('outputfiles') or {}).get(outfile):
                file_desc['guid'] = work_report['outputfiles'][outfile]['guid']
            out_file_report[job.jobid].append(file_desc)
        else:
            logger.info(
                "Expected output file {0} is missing. Job {1} will be failed".
                format(outfile, job.jobid))
            set_pilot_state(job=job, state='failed')

    if out_file_report[job.jobid]:
        write_json(worker_stageout_declaration, out_file_report)
        logger.debug(
            'Stageout declared in: {0}'.format(worker_stageout_declaration))
        logger.debug('Report for stageout: {}'.format(out_file_report))
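
For reference, write_json() is not shown in these examples; a minimal sketch, assuming it simply serializes the report dictionary to the declaration file (the real Pilot helper may differ), could look like this:

import json
import logging

logger = logging.getLogger(__name__)


def write_json(filename, dictionary):
    # serialize the stage-out report dictionary to the given file;
    # return True on success, False otherwise
    try:
        with open(filename, 'w') as fh:
            json.dump(dictionary, fh, indent=2)
    except (IOError, TypeError) as exc:
        logger.warning('failed to write JSON to %s: %s', filename, exc)
        return False
    return True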
Example #2
def validate_post(queues, traces, args):
    """
    Validate finished payloads.
    If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as
    well but only for log stage-out (in failed_post() below).

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # finished payloads
        try:
            job = queues.finished_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        log = get_logger(job.jobid, logger)

        # by default, both output and log should be staged out
        job.stageout = 'all'
        log.debug('adding job to data_out queue')
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    logger.info('[payload] validate_post thread has finished')
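
put_in_queue() is used in place of a bare Queue.put() throughout these examples; a minimal sketch, assuming its only extra job is to guard against inserting the same object twice (the real helper may do more, e.g. update the job object size), might be:

def put_in_queue(obj, _queue):
    # only put the object in the queue if it is not already there
    # (list(_queue.queue) snapshots the underlying deque of a queue.Queue)
    if obj not in list(_queue.queue):
        _queue.put(obj)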
Example #3
def failed_post(queues, traces, args):
    """
    Get a Job object from the "failed_payloads" queue. Set the pilot state to "stageout" and the stageout field to
    "log", and add the Job object to the "data_out" queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # failed payloads
        try:
            job = queues.failed_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue

        logger.debug('adding log for log stageout')

        job.stageout = 'log'  # only stage-out log file
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] failed_post thread has finished')
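
threads_aborted() gates the setting of the job_aborted flag; a minimal sketch, assuming it simply checks whether only the main thread is still alive (the real implementation may inspect thread names instead), could be:

import threading


def threads_aborted():
    # consider all worker threads aborted when only the main thread remains
    return threading.active_count() == 1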
Example #4
def failed_post(queues, traces, args):
    """
    Get a Job object from the "failed_payloads" queue. Set the pilot state to "stageout" and the stageout field to
    "log", and add the Job object to the "data_out" queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # failed payloads
        try:
            job = queues.failed_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        log = get_logger(job.jobid, logger)

        log.debug('adding log for log stageout')

        job.stageout = 'log'  # only stage-out log file
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    logger.info('[payload] failed_post thread has finished')
Example #5
def memory_usage(job):
    """
    Perform memory usage verification.

    :param job: job object
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # Get the maxPSS value from the memory monitor
    summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor)

    if not summary_dictionary:
        exit_code = errors.BADMEMORYMONITORJSON
        diagnostics = "Memory monitor output could not be read"
        return exit_code, diagnostics

    maxdict = summary_dictionary.get('Max', {})
    maxpss_int = maxdict.get('maxPSS', -1)

    # Only proceed if values are set
    if maxpss_int != -1:
        maxrss = job.infosys.queuedata.maxrss

        if maxrss:
            # correction for SCORE/4CORE/nCORE jobs on UCORE queues
            scale = get_ucore_scale_factor(job)
            try:
                maxrss_int = 2 * int(maxrss * scale) * 1024  # Convert to int and kB
            except Exception as e:
                logger.warning("unexpected value for maxRSS: %s" % e)
            else:
                # Compare the maxRSS with the maxPSS from memory monitor
                if maxrss_int > 0 and maxpss_int > 0:
                    if maxpss_int > maxrss_int:
                        diagnostics = "job has exceeded the memory limit %d kB > %d kB (2 * queuedata.maxrss)" % \
                                      (maxpss_int, maxrss_int)
                        logger.warning(diagnostics)

                        # Create a lockfile to let RunJob know that it should not restart the memory monitor after it has been killed
                        #pUtil.createLockFile(False, self.__env['jobDic'][k][1].workdir, lockfile="MEMORYEXCEEDED")

                        # Kill the job
                        set_pilot_state(job=job, state="failed")
                        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXCEEDMAXMEM)
                        kill_processes(job.pid)
                    else:
                        logger.info("max memory (maxPSS) used by the payload is within the allowed limit: "
                                    "%d B (2 * maxRSS = %d B)" % (maxpss_int, maxrss_int))
        else:
            if maxrss == 0 or maxrss == "0":
                logger.info("queuedata.maxrss set to 0 (no memory checks will be done)")
            else:
                logger.warning("queuedata.maxrss is not set")

    return exit_code, diagnostics
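
get_memory_values() is assumed to parse the memory monitor summary JSON from the job work directory; a minimal sketch (the summary file name below is an assumption) might be:

import json
import os


def get_memory_values(workdir, name=''):
    # read the memory monitor summary JSON; the file name is an assumption
    path = os.path.join(workdir, 'memory_monitor_summary.json')
    if not os.path.exists(path):
        return {}
    try:
        with open(path) as fh:
            return json.load(fh)
    except ValueError:
        return {}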
Example #6
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)

        if os.path.exists(job.workdir):
            workdirsize = get_disk_usage(job.workdir)

            # is user dir within allowed size limit?
            if workdirsize > maxwdirsize:
                exit_code = errors.USERDIRTOOLARGE
                diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                              (job.workdir, workdirsize, maxwdirsize)
                logger.fatal("%s", diagnostics)

                cmd = 'ls -altrR %s' % job.workdir
                _ec, stdout, stderr = execute(cmd, mute=True)
                logger.info("%s: %s", cmd + '\n', stdout)

                # kill the job
                # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED")
                set_pilot_state(job=job, state="failed")
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                kill_processes(job.pid)

                # remove any lingering input files from the work dir
                lfns, guids = job.get_lfns_and_guids()
                if lfns:
                    remove_files(job.workdir, lfns)

                    # re-measure the size of the workdir at this point since the value is stored below
                    workdirsize = get_disk_usage(job.workdir)
            else:
                logger.info("size of work directory %s: %d B (within %d B limit)", job.workdir, workdirsize, maxwdirsize)

            # Store the measured disk space (the max value will later be sent with the job metrics)
            if workdirsize > 0:
                job.add_workdir_size(workdirsize)
        else:
            logger.warning('job work dir does not exist: %s', job.workdir)
    else:
        logger.warning('skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics
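
get_disk_usage() is assumed to return the total size of the directory tree in bytes; a minimal pure-Python sketch (the real helper might shell out to du instead) could be:

import os


def get_disk_usage(start_path):
    # sum the sizes of all regular files below start_path, in bytes
    total = 0
    for dirpath, _, filenames in os.walk(start_path):
        for name in filenames:
            filepath = os.path.join(dirpath, name)
            if not os.path.islink(filepath):
                try:
                    total += os.path.getsize(filepath)
                except OSError:
                    pass  # file may have disappeared while walking
    return total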
Example #7
    def run(self):
        """
        Set up and execute the payload, wait for it to finish gracefully, then run any post-payload utilities.

        :return:
        """
        log = get_logger(str(self.__job.jobid), logger)

        exit_code = 1
        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()

        if self.setup_payload(self.__job, self.__out, self.__err):
            log.debug('running payload')
            proc = self.run_payload(self.__job, self.__out, self.__err)
            if proc is not None:
                # the process is now running, update the server
                set_pilot_state(job=self.__job, state="running")
                send_state(self.__job, self.__args, self.__job.state)

                log.info('will wait for graceful exit')
                exit_code = self.wait_graceful(self.__args, proc, self.__job)
                state = 'finished' if exit_code == 0 else 'failed'
                set_pilot_state(job=self.__job, state=state)
                log.info('finished pid=%s exit_code=%s state=%s' %
                         (proc.pid, exit_code, self.__job.state))

                if exit_code is None:
                    log.warning(
                        'detected unset exit_code from wait_graceful - reset to -1'
                    )
                    exit_code = -1

                self.utility_after_payload_finished(self.__job)
                self.post_payload(self.__job)

                # stop any running utilities
                if self.__job.utilities != {}:
                    for utcmd in self.__job.utilities.keys():
                        utproc = self.__job.utilities[utcmd][0]
                        if utproc:
                            user = __import__(
                                'pilot.user.%s.common' % pilot_user, globals(),
                                locals(), [pilot_user], 0)  # Python 2/3
                            sig = user.get_utility_command_kill_signal(utcmd)
                            log.info("stopping process \'%s\' with signal %d" %
                                     (utcmd, sig))
                            try:
                                os.killpg(os.getpgid(utproc.pid), sig)
                            except Exception as e:
                                log.warning('exception caught: %s (ignoring)' %
                                            e)

                            user.post_utility_command_action(utcmd, self.__job)

        return exit_code
Example #8
def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: (updated job object.)
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (
        job.jobid, time_stamp())
    logger.fatal(diagnostics)

    cmd = 'ps -fwu %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ls -ltr %s' % (job.workdir)
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'pstree -g -a'
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
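
kill_processes() is expected to take down the whole payload process group; a minimal sketch, assuming a graceful SIGTERM followed by a SIGKILL on the process group (the real helper is likely more careful and configurable), might be:

import os
import signal
import time


def kill_processes(pid):
    # try a graceful SIGTERM on the process group first, then force SIGKILL
    try:
        pgrp = os.getpgid(pid)
        os.killpg(pgrp, signal.SIGTERM)
        time.sleep(10)
        os.killpg(pgrp, signal.SIGKILL)
    except OSError:
        pass  # the process group may already be gone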
Example #9
def declare_failed_by_kill(job, queue, sig):
    """
    Declare the job failed by a kill signal and put it in a suitable failed queue.
    E.g. queue=queues.failed_data_in, if the kill signal was received during stage-in.

    :param job: job object.
    :param queue: queue object.
    :param sig: signal.
    :return:
    """

    set_pilot_state(job=job, state="failed")
    error_code = errors.get_kill_signal_error_code(sig)
    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code)

    #queue.put(job)
    put_in_queue(job, queue)
Example #10
    def run_payload(self, job, cmd, out, err):
        """
        Setup and execute the main payload process.

        REFACTOR using run_command()

        :param job: job object.
        :param out: (currently not used; deprecated)
        :param err: (currently not used; deprecated)
        :return: proc (subprocess returned by Popen())
        """

        # main payload process steps

        # add time for PILOT_PRE_PAYLOAD
        self.pre_payload(job)

        logger.info("\n\npayload execution command:\n\n%s\n" % cmd)
        try:
            proc = execute(cmd,
                           workdir=job.workdir,
                           returnproc=True,
                           usecontainer=True,
                           stdout=out,
                           stderr=err,
                           cwd=job.workdir,
                           job=job)
        except Exception as e:
            logger.error('could not execute: %s' % str(e))
            return None
        if isinstance(proc, tuple) and not proc[0]:
            logger.error('failed to execute payload')
            return None

        logger.info('started -- pid=%s executable=%s' % (proc.pid, cmd))
        job.pid = proc.pid
        job.pgrp = os.getpgid(job.pid)
        set_pilot_state(job=job, state="running")

        #_cmd = self.utility_with_payload(job)
        #if _cmd:
        #    logger.info('could have executed: %s' % _cmd)

        self.utility_after_payload_started(job)

        return proc
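
execute(..., returnproc=True) is expected to hand back the subprocess.Popen object for the payload; a heavily simplified sketch, assuming plain subprocess semantics and ignoring the container and job-specific options, could be:

import os
import subprocess


def execute(cmd, workdir='.', returnproc=False, stdout=None, stderr=None, **kwargs):
    # simplified sketch: start the command in its own process group so the
    # whole payload tree can be killed later via os.killpg()
    process = subprocess.Popen(cmd, shell=True, cwd=workdir,
                               stdout=stdout, stderr=stderr,
                               preexec_fn=os.setsid)
    if returnproc:
        return process
    out, err = process.communicate()
    return process.returncode, out, err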
Example #11
def validate_post(queues, traces, args):
    """
    Validate finished payloads.
    If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as
    well but only for log stage-out (in failed_post() below).

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # finished payloads
        try:
            job = queues.finished_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue

        # by default, both output and log should be staged out
        job.stageout = 'all'
        logger.debug('adding job to data_out queue')
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] validate_post thread has finished')
Example #12
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long after the first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # does the user want to execute any special commands before stage-in?
            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_utility_commands(job=job,
                                            order=UTILITY_BEFORE_STAGEIN)
            if cmd:
                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before xcache start] stdout=%s', _stdout)
                #logger.debug('[before xcache start] stderr=%s', _stderr)

                _, stdout, stderr = execute(cmd.get('command'))
                logger.debug('stdout=%s', stdout)
                logger.debug('stderr=%s', stderr)

                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after xcache start] stdout=%s', _stdout)
                #logger.debug('[after xcache start] stderr=%s', _stderr)

                # perform any action necessary after command execution (e.g. stdout processing)
                kwargs = {
                    'label': cmd.get('label', 'utility'),
                    'output': stdout
                }
                user.post_prestagein_utility_command(**kwargs)

                # write output to log files
                write_utility_output(job.workdir, cmd.get('label', 'utility'),
                                     stdout, stderr)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)

                # now create input file metadata if required by the payload
                if os.environ.get('PILOT_ES_EXECUTOR_TYPE',
                                  'generic') == 'generic':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s', xml)
            else:
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
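
Functions like copytool_in() are designed to run as long-lived worker threads; an illustrative launch pattern (queues, traces and args are assumed to be provided by the pilot's main workflow) might be:

import threading

# illustrative: start copytool_in as a daemon worker thread
thread = threading.Thread(target=copytool_in, name='copytool_in',
                          args=(queues, traces, args))
thread.daemon = True
thread.start()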
Example #13
def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        time.sleep(0.5)
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')
            args.graceful_stop.set()

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"

            # TODO: put in data_out queue instead?

            if not _stage_out_new(job, args):
                logger.info(
                    "job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue",
                    job.jobid)
                put_in_queue(job, queues.failed_data_out)
            else:
                logger.info(
                    "job %s failed during stage-in, adding job object to failed_jobs queue",
                    job.jobid)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                logger.info(
                    'finished stage-out for finished payload, adding job to finished_jobs queue'
                )
                #queues.finished_jobs.put(job)
                put_in_queue(job, queues.finished_jobs)
            else:
                logger.info('finished stage-out (of log) for failed payload')
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                logger.info("job %s failed during stage-out", job.jobid)

            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] queue_monitor thread has finished')
Example #14
def _stage_out_new(job, args):
    """
    Stage-out of all output files.
    If job.stageout=log then only log files will be transferred.

    :param job: job object.
    :param args: pilot args object.
    :return: True in case of success, False otherwise.
    """

    #logger.info('testing sending SIGUSR1')
    #import signal
    #os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    is_success = True

    if not job.outdata or job.is_eventservice:
        logger.info(
            'this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    if job.stageout != 'log':  ## do stage-out output files
        if not _do_stageout(job,
                            job.outdata, ['pw', 'w'],
                            args.queue,
                            title='output',
                            output_dir=args.output_dir):
            is_success = False
            logger.warning('transfer of output file(s) failed')

    if job.stageout in ['log', 'all'] and job.logdata:  ## do stage-out log files
        # prepare log file, consider only 1st available log file
        status = job.get_status('LOG_TRANSFER')
        if status != LOG_TRANSFER_NOT_DONE:
            logger.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS
        logfile = job.logdata[0]

        try:
            tarball_name = 'tarball_PandaJob_%s_%s' % (job.jobid,
                                                       job.infosys.pandaqueue)
            input_files = [fspec.lfn for fspec in job.indata]
            output_files = [fspec.lfn for fspec in job.outdata]
            create_log(job.workdir,
                       logfile.lfn,
                       tarball_name,
                       args.cleanup,
                       input_files=input_files,
                       output_files=output_files,
                       is_looping=errors.LOOPINGJOB in job.piloterrorcodes,
                       debugmode=job.debug)
        except LogFileCreationFailure as error:
            logger.warning('failed to create tar file: %s', error)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                errors.LOGFILECREATIONFAILURE)
            return False

        if not _do_stageout(job, [logfile], ['pl', 'pw', 'w'],
                            args.queue,
                            title='log',
                            output_dir=args.output_dir):
            is_success = False
            logger.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED
        else:
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE
    elif not job.logdata:
        logger.info('no log was defined - will not create log file')
        job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be sent to PanDA
    fileinfo = {}
    for iofile in job.outdata + job.logdata:
        if iofile.status in ['transferred']:
            fileinfo[iofile.lfn] = {
                'guid': iofile.guid,
                'fsize': iofile.filesize,
                'adler32': iofile.checksum.get('adler32'),
                'surl': iofile.turl
            }

    job.fileinfo = fileinfo

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not is_success:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        logger.warning('stage-out failed')
        return False

    logger.info('stage-out finished correctly')

    # is the job state already set? if so, don't change it (unless it's the stageout state)
    if not job.state or job.state == 'stageout':
        logger.debug('changing job state from %s to finished', job.state)
        set_pilot_state(job=job, state="finished")

    # send final server update since all transfers have finished correctly
    # send_state(job, args, 'finished', xml=dumps(fileinfodict))

    return is_success
Example #15
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long after the first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue'
                        % _job.jobid)

                # now create input file metadata if required by the payload
                if config.Payload.executor_type.lower() != 'raythena':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s' % xml)
            else:
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
Example #16
def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        time.sleep(0.5)
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')
            args.graceful_stop.set()

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"

            # TODO: put in data_out queue instead?

            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs "
                         "queue" % job.jobid)
                #queues.failed_data_out.put(job)
                put_in_queue(job, queues.failed_data_out)
            else:
                log.info("job %s failed during stage-in, adding job object to failed_jobs queue" % job.jobid)
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                log.info('finished stage-out for finished payload, adding job to finished_jobs queue')
                #queues.finished_jobs.put(job)
                put_in_queue(job, queues.finished_jobs)
            else:
                log.info('finished stage-out (of log) for failed payload')
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, "
                         "adding job object to failed_jobs queue" % job.jobid)
            else:
                log.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job "
                         "object to failed_jobs queue" % job.jobid)

            #queues.failed_jobs.put(job)
            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    logger.debug('[data] queue_monitor thread has finished')
Example #17
def _stage_out_new(job, args):
    """
    Stage-out of all output files.
    If job.stageout=log then only log files will be transferred.

    :param job: job object.
    :param args: pilot args object.
    :return: True in case of success, False otherwise.
    """

    log = get_logger(job.jobid)

    #log.info('testing sending SIGUSR1')
    #import signal
    #os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    is_success = True

    if not job.outdata or job.is_eventservice:
        log.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    if job.stageout != 'log':  ## do stage-out output files
        if not _do_stageout(job, job.outdata, ['pw', 'w'], title='output'):
            is_success = False
            log.warning('transfer of output file(s) failed')

    if job.stageout in ['log', 'all'] and job.logdata:  ## do stage-out log files
        # prepare log file, consider only 1st available log file
        status = job.get_status('LOG_TRANSFER')
        if status != LOG_TRANSFER_NOT_DONE:
            log.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS
        logfile = job.logdata[0]

        try:
            create_log(job, logfile, 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue))
        except LogFileCreationFailure as e:
            log.warning('failed to create tar file: %s' % e)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        if not _do_stageout(job, [logfile], ['pl', 'pw', 'w'], title='log'):
            is_success = False
            log.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED
        else:
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be sent to PanDA
    fileinfo = {}
    for e in job.outdata + job.logdata:
        if e.status in ['transferred']:
            fileinfo[e.lfn] = {'guid': e.guid, 'fsize': e.filesize,
                               'adler32': e.checksum.get('adler32'),
                               'surl': e.turl}

    job.fileinfo = fileinfo
    log.info('prepared job.fileinfo=%s' % job.fileinfo)

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not is_success:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        log.warning('stage-out failed')
        # send_state(job, args, 'failed')
        return False

    log.info('stage-out finished correctly')

    if not job.state:  # is the job state already set? if so, don't change the state
        set_pilot_state(job=job, state="finished")

    # send final server update since all transfers have finished correctly
    # send_state(job, args, 'finished', xml=dumps(fileinfodict))

    return is_success
Example #18
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        try:
            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')
            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING
            log = get_logger(job.jobid)

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                log.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                #queues.finished_data_in.put(job)
                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    log.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                # now create input file metadata if required by the payload
                try:
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
                    _dir = '/srv' if job.usecontainer else job.workdir
                    file_dictionary = get_input_file_dictionary(job.indata, _dir)
                    #file_dictionary = get_input_file_dictionary(job.indata, job.workdir)
                    log.debug('file_dictionary=%s' % str(file_dictionary))
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    log.info('created input file metadata:\n%s' % xml)
                except Exception as e:
                    log.warning('exception caught while creating input file metadata: %s (ignoring)' % e)
            else:
                log.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                #queues.failed_data_in.put(job)
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                # send_state(job, args, 'failed')

        except queue.Empty:
            continue

    logger.debug('[data] copytool_in thread has finished')
Example #19
    def run(self):  # noqa: C901
        """
        Run all payload processes (including pre- and post-processes, and utilities).
        In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special
        exit code.
        :return:
        """

        # get the payload command from the user specific code
        self.pre_setup(self.__job)

        cmd = self.get_payload_command(self.__job)
        # extract the setup in case the preprocess command needs it
        self.__job.setup = self.extract_setup(cmd)
        self.post_setup(self.__job)

        # a loop is needed for HPO jobs
        # abort when nothing more to run, or when the preprocess returns a special exit code
        iteration = 0
        while True:

            logger.info('payload iteration loop #%d', iteration + 1)
            os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration
            show_memory_usage()

            # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
            jobparams_pre = self.__job.jobparams
            exit_code = self.run_preprocess(self.__job)
            jobparams_post = self.__job.jobparams
            if exit_code:
                if 160 <= exit_code <= 162:
                    exit_code = 0
                    # wipe the output file list since there won't be any new files
                    # any output files from previous iterations, should have been transferred already
                    logger.debug(
                        'reset outdata since further output should not be expected after preprocess exit'
                    )
                    self.__job.outdata = []
                break
            if jobparams_pre != jobparams_post:
                logger.debug(
                    'jobparams were updated by utility_before_payload()')
                # must update cmd
                cmd = cmd.replace(jobparams_pre, jobparams_post)

            # now run the main payload, when it finishes, run the postprocess (if necessary)
            # note: no need to run any main payload in HPO Horovod jobs on Kubernetes
            if os.environ.get('HARVESTER_HOROVOD', '') == '':

                #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before payload start] stdout=%s', _stdout)
                #logger.debug('[before payload start] stderr=%s', _stderr)

                proc = self.run_payload(self.__job, cmd, self.__out,
                                        self.__err)
            else:
                proc = None

            proc_co = None
            if proc is None:
                # run the post-process command even if there was no main payload
                if os.environ.get('HARVESTER_HOROVOD', '') != '':
                    logger.info('No need to execute any main payload')
                    exit_code = self.run_utility_after_payload_finished(
                        exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2)
                    self.post_payload(self.__job)
                else:
                    break
            else:
                # the process is now running, update the server
                # test 'tobekilled' from here to try payload kill
                send_state(self.__job, self.__args, self.__job.state)

                # note: when sending a state change to the server, the server might respond with 'tobekilled'
                if self.__job.state == 'failed':
                    logger.warning(
                        'job state is \'failed\' - abort payload and run()')
                    kill_processes(proc.pid)
                    break

                # allow for a secondary command to be started after the payload (e.g. a coprocess)
                utility_cmd = self.get_utility_command(
                    order=UTILITY_AFTER_PAYLOAD_STARTED2)
                if utility_cmd:
                    logger.debug('starting utility command: %s', utility_cmd)
                    label = 'coprocess' if 'coprocess' in utility_cmd else None
                    proc_co = self.run_command(utility_cmd, label=label)

                logger.info('will wait for graceful exit')
                exit_code = self.wait_graceful(self.__args, proc)
                # reset error if Raythena decided to kill payload (no error)
                if errors.KILLPAYLOAD in self.__job.piloterrorcodes:
                    logger.debug('ignoring KILLPAYLOAD error')
                    self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(
                        errors.KILLPAYLOAD,
                        pilot_error_codes=self.__job.piloterrorcodes,
                        pilot_error_diags=self.__job.piloterrordiags)
                    exit_code = 0
                    state = 'finished'
                else:
                    state = 'finished' if exit_code == 0 else 'failed'
                set_pilot_state(job=self.__job, state=state)
                logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n',
                            proc.pid, exit_code, self.__job.state)

                #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after payload finish] stdout=%s', _stdout)
                #logger.debug('[after payload finish] stderr=%s', _stderr)

                # stop the utility command (e.g. a coprocess) if necessary
                if proc_co:
                    logger.debug('stopping utility command: %s', utility_cmd)
                    kill_processes(proc_co.pid)

                if exit_code is None:
                    logger.warning(
                        'detected unset exit_code from wait_graceful - reset to -1'
                    )
                    exit_code = -1

                for order in [
                        UTILITY_AFTER_PAYLOAD_FINISHED,
                        UTILITY_AFTER_PAYLOAD_FINISHED2
                ]:
                    exit_code = self.run_utility_after_payload_finished(
                        exit_code, state, order)

                self.post_payload(self.__job)

                # stop any running utilities
                if self.__job.utilities != {}:
                    self.stop_utilities()

            if self.__job.is_hpo and state != 'failed':
                # in case there are more hyper-parameter points, move away the previous log files
                #self.rename_log_files(iteration)
                iteration += 1
            else:
                break

        return exit_code
Example #20
def check_payload_stdout(job):
    """
    Check the size of the payload stdout.

    :param job: job object.
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # get list of log files
    file_list = glob(os.path.join(job.workdir, 'log.*'))

    # is this a multi-trf job?
    n_jobs = job.jobparams.count("\n") + 1
    for _i in range(n_jobs):
        # get name of payload stdout file created by the pilot
        _stdout = config.Payload.payloadstdout
        if n_jobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

        # add the primary stdout file to the fileList
        file_list.append(os.path.join(job.workdir, _stdout))

    tmp_list = glob(os.path.join(job.workdir, 'workDir/tmp.stdout.*'))
    if tmp_list:
        file_list += tmp_list
    logger.debug('file list=%s', str(file_list))

    # now loop over all files and check each individually (any large enough file will fail the job)
    for filename in file_list:

        logger.debug('check_payload_stdout: filename=%s', filename)
        if "job.log.tgz" in filename:
            logger.info("skipping file size check of file (%s) since it is a special log file", filename)
            continue

        if os.path.exists(filename):
            try:
                # get file size in bytes
                fsize = os.path.getsize(filename)
            except Exception as error:
                logger.warning("could not read file size of %s: %s", filename, error)
            else:
                # is the file too big?
                localsizelimit_stdout = get_local_size_limit_stdout()
                if fsize > localsizelimit_stdout:
                    exit_code = errors.STDOUTTOOBIG
                    diagnostics = "Payload stdout file too big: %d B (larger than limit %d B)" % \
                                  (fsize, localsizelimit_stdout)
                    logger.warning(diagnostics)

                    # kill the job
                    set_pilot_state(job=job, state="failed")
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                    kill_processes(job.pid)

                    # remove the payload stdout file after the log extracts have been created

                    # remove any lingering input files from the work dir
                    lfns, guids = job.get_lfns_and_guids()
                    if lfns:
                        # remove any lingering input files from the work dir
                        exit_code = remove_files(job.workdir, lfns)
                else:
                    logger.info("payload log (%s) within allowed size limit (%d B): %d B", os.path.basename(filename), localsizelimit_stdout, fsize)
        else:
            logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename)

    return exit_code, diagnostics
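
get_local_size_limit_stdout() is assumed to read the stdout size limit from the pilot config and return it in bytes; a minimal sketch (the config attribute name and the 2 GB fallback are assumptions) might be:

def get_local_size_limit_stdout(as_bytes=True):
    # read the local stdout size limit (in kB) from the pilot config;
    # fall back to 2 GB if it is missing or malformed (fallback is an assumption)
    try:
        limit = int(config.Pilot.local_size_limit_stdout)
    except (AttributeError, ValueError):
        limit = 2 * 1024 ** 2  # kB
    return limit * 1024 if as_bytes else limit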
Example #21
def run(args):
    """
    Main execution function for the generic HPC workflow.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set the communication point; the worker report should be placed there, matched with the Harvester working directory
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        logger.info('setting up tracing')
        traces = namedtuple('traces', ['pilot'])
        traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference (dynamic import of pilot.resource.<hpc_resource>;
        # level 0 requests an absolute import, the Python 2 level -1 is invalid in Python 3)
        resource = __import__('pilot.resource.%s' % args.hpc_resource,
                              globals(), locals(), [args.hpc_resource], 0)

        # get the user reference (dynamic import of pilot.user.<pilot_user>.common)
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(),
                          globals(), locals(), [args.pilot_user.lower()], 0)

        # get job (and rank)
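        # the job id is not known before get_job(), so '0' serves as a
        # placeholder key for this first timing point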
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)
        # cd to job working directory

        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be publeshied in: {0}".format(
            worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s' %
                    (args.hpc_resource, str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())

        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))
        payloadstdout = open(payload_stdout_file, "w")
        payloadstderr = open(payload_stderr_file, "w")

        add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
        set_pilot_state(job=job, state="running")
        work_report["jobStatus"] = job.state
        work_report["startTime"] = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")
        start_time = time.asctime(time.localtime(time.time()))
        job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        publish_work_report(work_report, worker_attributes_file)

        stime = time.time()
        t0 = os.times()
        exit_code, stdout, stderr = execute(my_command,
                                            stdout=payloadstdout,
                                            stderr=payloadstderr,
                                            shell=True)
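        # stdout/stderr were redirected to the file objects opened above, so the
        # returned stdout/stderr strings are not used here; only exit_code is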
        logger.debug("Payload exit code: {0}".format(exit_code))
        t1 = os.times()
        exetime = time.time() - stime
        end_time = time.asctime(time.localtime(time.time()))
        t = [x - y for x, y in zip(t1, t0)]  # per-field os.times() deltas
        t_tot = sum(t[2:3])  # children's user CPU time of the payload process
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        payloadstdout.close()
        payloadstderr.close()
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(
            exit_code, job.jobid)
        log_jobreport += "CPU comsumption time: {0}  JobID: {1} \n".format(
            t_tot, job.jobid)
        log_jobreport += "Start time: {0}  JobID: {1} \n".format(
            start_time, job.jobid)
        log_jobreport += "End time: {0}  JobID: {1} \n".format(
            end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec.  JobID: {1} \n".format(
            exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(
            job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # Parse the job report file and update the work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(
                read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir,
                                       work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with the logs
        protectedfiles = list(job.output_files.keys())

        # the log file has not been produced yet, so it should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("log file was not declared")

        logger.info("Cleanup of working directory")

        protectedfiles.extend(
            [worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
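        # assumed semantics: tar everything left in the scratch dir into the
        # log file, skipping the protected files listed above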
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # Copy output to the shared FS for stage-out
        if not job_scratch_dir == work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(),
                            args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(),
                            args)

    except Exception as e:
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logger.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces
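A hedged usage sketch for the workflow above (the plug-in names are assumptions; the real pilot builds args with its own option parser):

import argparse

# Illustrative invocation only: the pilot entry point normally constructs args
# from the command line and site configuration.
args = argparse.Namespace(
    harvester_workdir=None,  # fall back to os.getcwd() as the communication point
    hpc_resource='titan',    # assumed plug-in name: imports pilot.resource.titan
    pilot_user='atlas',      # assumed plug-in name: imports pilot.user.atlas.common
)
traces = run(args)
print(traces.pilot['state'])  # SUCCESS or FAILURE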