Example #1
def wait_graceful(args, proc, job):
    log = logger.getChild(str(job['PandaID']))

    breaker = False
    exit_code = None
    while True:
        for i in xrange(100):  # Python 2 idiom; use range() in Python 3
            if args.graceful_stop.is_set():
                breaker = True
                log.debug('breaking -- sending SIGTERM pid=%s' % proc.pid)
                proc.terminate()
                break
            time.sleep(0.1)
        if breaker:
            log.debug('breaking -- sleep 3s before sending SIGKILL pid=%s' %
                      proc.pid)
            time.sleep(3)
            proc.kill()
            break

        exit_code = proc.poll()
        log.info('running: pid=%s exit_code=%s' % (proc.pid, exit_code))
        if exit_code is not None:
            break
        else:
            send_state(job, 'running')
            continue

    return exit_code
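
wait_graceful() above shows the pilot's shutdown escalation: poll the child in short sleeps, send SIGTERM as soon as the stop flag is set, then SIGKILL after a short grace period. A minimal self-contained sketch of the same pattern, assuming a POSIX system and using an illustrative 'sleep' child rather than a real payload:

import subprocess
import threading
import time

def wait_with_escalation(proc, stop_event, grace=3.0):
    """Poll proc until it exits; on stop_event, send SIGTERM, then SIGKILL."""
    while proc.poll() is None:
        if stop_event.is_set():
            proc.terminate()              # polite SIGTERM first
            try:
                proc.wait(timeout=grace)  # grace period to clean up
            except subprocess.TimeoutExpired:
                proc.kill()               # escalate to SIGKILL
            break
        time.sleep(0.1)
    return proc.wait()                    # reap and return the exit code

# usage: ask a long-running child to stop after ~1 second
stop_event = threading.Event()
child = subprocess.Popen(['sleep', '60'])
threading.Timer(1.0, stop_event.set).start()
print(wait_with_escalation(child, stop_event))  # negative value = killed by a signal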
Example #2
def _stage_out_all(job, args):

    outputs = {}

    for f in job['job_report']['files']['output']:
        outputs[f['subFiles'][0]['name']] = {
            'scope': job['scopeOut'],
            'name': f['subFiles'][0]['name'],
            'guid': f['subFiles'][0]['file_guid'],
            'bytes': f['subFiles'][0]['file_size']
        }

    outputs['%s:%s' % (job['scopeLog'], job['logFile'])] = prepare_log(
        job, 'tarball_PandaJob_%s_%s' % (job['PandaID'], args.queue))

    pfc = '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>'''

    pfc_file = '''
 <File ID="{guid}">
  <logical>
   <lfn name="{name}"/>
  </logical>
  <metadata att_name="surl" att_value="{pfn}"/>
  <metadata att_name="fsize" att_value="{bytes}"/>
  <metadata att_name="adler32" att_value="{adler32}"/>
 </File>
'''

    failed = False

    for outfile in outputs:
        summary = _stage_out(args, outputs[outfile], job)

        if summary is not None:
            outputs[outfile]['pfn'] = summary[
                '%s:%s' %
                (outputs[outfile]['scope'], outputs[outfile]['name'])]['pfn']
            outputs[outfile]['adler32'] = summary[
                '%s:%s' % (outputs[outfile]['scope'],
                           outputs[outfile]['name'])]['adler32']

            pfc += pfc_file.format(**outputs[outfile])

        else:
            failed = True

    pfc += '</POOLFILECATALOG>'

    if failed:
        send_state(job, 'failed')
        return False
    else:
        send_state(job, 'finished', xml=pfc)
        return True
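
The catalog assembly in _stage_out_all() is plain string templating: a fixed header, one formatted <File> block per output file, and a closing tag. A tiny sketch of the same step with one made-up file entry (all metadata values below are dummies; in the pilot they come from the stage-out summary):

PFC_HEADER = '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>'''

PFC_FILE = '''
 <File ID="{guid}">
  <logical>
   <lfn name="{name}"/>
  </logical>
  <metadata att_name="surl" att_value="{pfn}"/>
  <metadata att_name="fsize" att_value="{bytes}"/>
  <metadata att_name="adler32" att_value="{adler32}"/>
 </File>
'''

# one dummy output file entry
entry = {'guid': 'A1B2-C3D4', 'name': 'out.root',
         'pfn': 'srm://example.org/out.root', 'bytes': 1024, 'adler32': 'deadbeef'}
pfc = PFC_HEADER + PFC_FILE.format(**entry) + '</POOLFILECATALOG>'
print(pfc)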
Example #3
    def run(self):
        """
        Run the payload, wait for it to finish, then stop any remaining utility processes.
        :return:
        """
        log = get_logger(str(self.__job.jobid), logger)

        exit_code = 1
        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()

        if self.setup_payload(self.__job, self.__out, self.__err):
            log.debug('running payload')
            proc = self.run_payload(self.__job, self.__out, self.__err)
            if proc is not None:
                # the process is now running, update the server
                set_pilot_state(job=self.__job, state="running")
                send_state(self.__job, self.__args, self.__job.state)

                log.info('will wait for graceful exit')
                exit_code = self.wait_graceful(self.__args, proc, self.__job)
                state = 'finished' if exit_code == 0 else 'failed'
                set_pilot_state(job=self.__job, state=state)
                log.info('finished pid=%s exit_code=%s state=%s' %
                         (proc.pid, exit_code, self.__job.state))

                if exit_code is None:
                    log.warning(
                        'detected unset exit_code from wait_graceful - reset to -1'
                    )
                    exit_code = -1

                self.utility_after_payload_finished(self.__job)
                self.post_payload(self.__job)

                # stop any running utilities
                if self.__job.utilities != {}:
                    for utcmd in self.__job.utilities.keys():
                        utproc = self.__job.utilities[utcmd][0]
                        if utproc:
                            user = __import__(
                                'pilot.user.%s.common' % pilot_user, globals(),
                                locals(), [pilot_user], -1)
                            sig = user.get_utility_command_kill_signal(utcmd)
                            log.info("stopping process \'%s\' with signal %d" %
                                     (utcmd, sig))
                            try:
                                os.killpg(os.getpgid(utproc.pid), sig)
                            except Exception as e:
                                log.warning('exception caught: %s (ignoring)' %
                                            e)

                            user.post_utility_command_action(utcmd, self.__job)

        return exit_code
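
The __import__('pilot.user.%s.common' % pilot_user, ...) call above is the pilot's plugin mechanism: user-specific behaviour lives in a module chosen at runtime from the PILOT_USER environment variable. The same lookup, sketched with importlib (this assumes the pilot package layout is importable; the helper name is made up):

import importlib
import os

def load_user_plugin(subpackage='common'):
    """Import pilot.user.<PILOT_USER>.<subpackage>, selected at runtime."""
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    return importlib.import_module('pilot.user.%s.%s' % (pilot_user, subpackage))

# usage, mirroring the utility shutdown in Example #3:
# user = load_user_plugin('common')
# sig = user.get_utility_command_kill_signal(utcmd)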
Example #4
def execute(queues, traces, args):

    while not args.graceful_stop.is_set():
        try:
            job = queues.validated_payloads.get(block=True, timeout=1)
            log = logger.getChild(str(job['PandaID']))

            q_snapshot = list(queues.finished_data_in.queue)
            peek = [
                s_job for s_job in q_snapshot
                if job['PandaID'] == s_job['PandaID']
            ]
            if len(peek) == 0:
                queues.validated_payloads.put(job)
                for i in xrange(10):  # Python 2 idiom; use range() in Python 3
                    if args.graceful_stop.is_set():
                        break
                    time.sleep(0.1)
                continue

            log.debug('opening payload stdout/err logs')
            out = open(os.path.join(job['working_dir'], 'payload.stdout'),
                       'wb')
            err = open(os.path.join(job['working_dir'], 'payload.stderr'),
                       'wb')

            log.debug('setting up payload environment')
            send_state(job, 'starting')

            exit_code = 1
            if setup_payload(job, out, err):
                log.debug('running payload')
                send_state(job, 'running')
                proc = run_payload(job, out, err)
                if proc is not None:
                    exit_code = wait_graceful(args, proc, job)
                    log.info('finished pid=%s exit_code=%s' %
                             (proc.pid, exit_code))

            log.debug('closing payload stdout/err logs')
            out.close()
            err.close()

            if exit_code == 0:
                queues.finished_payloads.put(job)
            else:
                queues.failed_payloads.put(job)

        except Queue.Empty:  # Python 2; 'queue.Empty' in Python 3
            continue
Example #5
def copytool_in(queues, traces, args):

    while not args.graceful_stop.is_set():
        try:
            job = queues.data_in.get(block=True, timeout=1)

            send_state(job, 'transferring')

            if _stage_in(args, job):
                queues.finished_data_in.put(job)
            else:
                queues.failed_data_in.put(job)

        except Queue.Empty:  # Python 2; 'queue.Empty' in Python 3
            continue
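
copytool_in() in Example #5 is the pilot's worker-thread pattern in its minimal form: block on a queue with a short timeout, treat Empty as "nothing yet, re-check the stop flag", and route each item to a success or failure queue. A generic, runnable sketch of the same loop (the handler and queue names are illustrative):

import queue  # 'Queue' in Python 2
import threading
import time

def worker(in_queue, ok_queue, fail_queue, stop_event, handle):
    """Drain in_queue until stop_event is set, routing items by outcome."""
    while not stop_event.is_set():
        try:
            item = in_queue.get(block=True, timeout=1)
        except queue.Empty:
            continue  # nothing to do; loop around and re-check stop_event
        (ok_queue if handle(item) else fail_queue).put(item)

# usage: classify a few numbers, then stop the thread
jobs, done, failed = queue.Queue(), queue.Queue(), queue.Queue()
stop = threading.Event()
t = threading.Thread(target=worker, args=(jobs, done, failed, stop, lambda n: n % 2 == 0))
t.start()
for n in range(4):
    jobs.put(n)
time.sleep(2)
stop.set()
t.join()
print(list(done.queue), list(failed.queue))  # same snapshot trick as Example #4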
Example #6
def copytool_out(queues, traces, args):

    while not args.graceful_stop.is_set():
        try:
            job = queues.data_out.get(block=True, timeout=1)

            logger.info('dataset=%s rse=%s' %
                        (job['destinationDblock'],
                         job['ddmEndPointOut'].split(',')[0]))

            send_state(job, 'transferring')

            if _stage_out_all(job, args):
                queues.finished_data_out.put(job)
            else:
                queues.failed_data_out.put(job)

        except Queue.Empty:  # Python 2; 'queue.Empty' in Python 3
            continue
Example #7
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long after the first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # does the user want to execute any special commands before stage-in?
            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_utility_commands(job=job,
                                            order=UTILITY_BEFORE_STAGEIN)
            if cmd:
                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before xcache start] stdout=%s', _stdout)
                #logger.debug('[before xcache start] stderr=%s', _stderr)

                _, stdout, stderr = execute(cmd.get('command'))
                logger.debug('stdout=%s', stdout)
                logger.debug('stderr=%s', stderr)

                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after xcache start] stdout=%s', _stdout)
                #logger.debug('[after xcache start] stderr=%s', _stderr)

                # perform any action necessary after command execution (e.g. stdout processing)
                kwargs = {
                    'label': cmd.get('label', 'utility'),
                    'output': stdout
                }
                user.post_prestagein_utility_command(**kwargs)

                # write output to log files
                write_utility_output(job.workdir, cmd.get('label', 'utility'),
                                     stdout, stderr)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)

                # now create input file metadata if required by the payload
                if os.environ.get('PILOT_ES_EXECUTOR_TYPE',
                                  'generic') == 'generic':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s', xml)
            else:
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
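
Example #7 layers a watchdog over the worker loop: if the first kill signal (recorded in args.kill_time) arrived more than MAX_KILL_WAIT_TIME seconds ago and the loop is still spinning, the loop is assumed stuck and the thread breaks out. That guard in isolation, with a stand-in args object (the constant's value here is a guess, not the pilot's):

import time

MAX_KILL_WAIT_TIME = 120  # seconds; illustrative value only

class Args:
    """Stand-in for the pilot args object used throughout these examples."""
    kill_time = None  # epoch seconds of the first kill signal, or None/0

def loop_is_stuck(args):
    """True if a kill signal arrived but the loop kept running too long."""
    return bool(args.kill_time) and int(time.time()) - args.kill_time > MAX_KILL_WAIT_TIME

# usage: pretend the first kill signal arrived 200 seconds ago
args = Args()
args.kill_time = int(time.time()) - 200
print(loop_is_stuck(args))  # True -> the worker loop should break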
Example #8
def execute_payloads(queues, traces, args):  # noqa: C901
    """
    Execute queued payloads.

    Extract a Job object from the "validated_payloads" queue and put it in the "monitored_jobs" queue. The payload
    stdout/err streams are opened and the pilot state is changed to "starting". A payload executor is selected (for
    executing a normal job, an event service job or event service merge job). After the payload (or rather its executor)
    is started, the thread will wait for it to finish and then check for any failures. A successfully completed job is
    placed in the "finished_payloads" queue, and a failed job will be placed in the "failed_payloads" queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    job = None
    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            job = queues.validated_payloads.get(block=True, timeout=1)

            q_snapshot = list(queues.finished_data_in.queue)
            peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid]
            if len(peek) == 0:
                #queues.validated_payloads.put(job)
                put_in_queue(job, queues.validated_payloads)
                for i in range(10):  # Python 3
                    if args.graceful_stop.is_set():
                        break
                    time.sleep(1)
                continue

            # this job is now to be monitored, so add it to the monitored_payloads queue
            #queues.monitored_payloads.put(job)
            put_in_queue(job, queues.monitored_payloads)

            logger.info('job %s added to monitored payloads queue' % job.jobid)

            try:
                out = open(
                    os.path.join(job.workdir, config.Payload.payloadstdout),
                    'wb')
                err = open(
                    os.path.join(job.workdir, config.Payload.payloadstderr),
                    'wb')
            except Exception as e:
                logger.warning('failed to open payload stdout/err: %s' % e)
                out = None
                err = None
            send_state(job, args, 'starting')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - abort execute_payloads()')
                break

            payload_executor = get_payload_executor(args, job, out, err,
                                                    traces)
            logger.info("Got payload executor: %s" % payload_executor)

            show_memory_usage()

            # run the payload and measure the execution time
            job.t0 = os.times()
            exit_code = payload_executor.run()

            set_cpu_consumption_time(job)
            job.transexitcode = exit_code % 255

            out.close()
            err.close()

            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()

            # some HPO jobs will produce new output files (following lfn name pattern), discover those and replace the job.outdata list
            if job.is_hpo:
                user = __import__('pilot.user.%s.common' % pilot_user,
                                  globals(), locals(), [pilot_user],
                                  0)  # Python 2/3
                try:
                    user.update_output_for_hpo(job)
                except Exception as e:
                    logger.warning(
                        'exception caught by update_output_for_hpo(): %s' % e)
                else:
                    for dat in job.outdata:
                        if not dat.guid:
                            dat.guid = get_guid()
                            logger.warning(
                                'guid not set: generated guid=%s for lfn=%s' %
                                (dat.guid, dat.lfn))

            #if traces.pilot['nr_jobs'] == 1:
            #    logger.debug('faking job failure in first multi-job')
            #    job.transexitcode = 1
            #    exit_code = 1

            # analyze and interpret the payload execution output
            perform_initial_payload_error_analysis(job, exit_code)

            # was an error already found?
            #if job.piloterrorcodes:
            #    exit_code_interpret = 1
            #else:
            user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            try:
                exit_code_interpret = user.interpret(job)
            except Exception as e:
                logger.warning('exception caught: %s' % e)
                #exit_code_interpret = -1
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.INTERNALPILOTPROBLEM)

            if job.piloterrorcodes:
                exit_code_interpret = 1

            if exit_code_interpret == 0 and exit_code == 0:
                logger.info(
                    'main payload error analysis completed - did not find any errors'
                )

                # update output lists if zipmaps were used
                #job.add_archives_to_output_lists()

                # queues.finished_payloads.put(job)
                put_in_queue(job, queues.finished_payloads)
            else:
                logger.debug(
                    'main payload error analysis completed - adding job to failed_payloads queue'
                )
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)

        except queue.Empty:
            continue
        except Exception as e:
            logger.fatal(
                'execute payloads caught an exception (cannot recover): %s, %s'
                % (e, traceback.format_exc()))
            if job:
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.PAYLOADEXECUTIONEXCEPTION)
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)
            while not args.graceful_stop.is_set():
                # let the log stage-out finish, but stop running payloads since there is likely a problem with the pilot
                time.sleep(5)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] execute_payloads thread has finished')
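
execute_payloads() snapshots job.t0 = os.times() before starting the payload so that set_cpu_consumption_time() can later derive the CPU time the payload used. Assuming that helper works from the delta of the children's user+system times (a guess; the real pilot helper may compute it differently), the measurement looks like this:

import os
import subprocess

t0 = os.times()  # snapshot before the payload starts
subprocess.call(['python3', '-c', 'sum(i * i for i in range(10**6))'])
t1 = os.times()  # snapshot after it finishes

# children_user/children_system accumulate the CPU time of waited-for children
cpu = (t1.children_user - t0.children_user) + (t1.children_system - t0.children_system)
print('payload CPU time: %.2f s' % cpu)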
Example #9
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        try:
            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')
            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING
            log = get_logger(job.jobid)

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                log.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                #queues.finished_data_in.put(job)
                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    log.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                # now create input file metadata if required by the payload
                try:
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], -1)  # level=-1 is Python 2 only; use 0 in Python 3
                    _dir = '/srv' if job.usecontainer else job.workdir
                    file_dictionary = get_input_file_dictionary(job.indata, _dir)
                    #file_dictionary = get_input_file_dictionary(job.indata, job.workdir)
                    log.debug('file_dictionary=%s' % str(file_dictionary))
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    log.info('created input file metadata:\n%s' % xml)
                except Exception as e:
                    log.warning('failed to create input file metadata: %s (ignoring)' % e)
            else:
                log.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                #queues.failed_data_in.put(job)
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                # send_state(job, args, 'failed')

        except queue.Empty:
            continue

    logger.debug('[data] copytool_in thread has finished')
Example #10
    def run(self):  # noqa: C901
        """
        Run all payload processes (including pre- and post-processes, and utilities).
        In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special
        exit code.
        :return:
        """

        # get the payload command from the user specific code
        self.pre_setup(self.__job)

        cmd = self.get_payload_command(self.__job)
        # extract the setup in case the preprocess command needs it
        self.__job.setup = self.extract_setup(cmd)
        self.post_setup(self.__job)

        # a loop is needed for HPO jobs
        # abort when nothing more to run, or when the preprocess returns a special exit code
        iteration = 0
        while True:

            logger.info('payload iteration loop #%d', iteration + 1)
            os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration
            show_memory_usage()

            # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
            jobparams_pre = self.__job.jobparams
            exit_code = self.run_preprocess(self.__job)
            jobparams_post = self.__job.jobparams
            if exit_code:
                if 160 <= exit_code <= 162:
                    exit_code = 0
                    # wipe the output file list since there won't be any new files
                    # any output files from previous iterations, should have been transferred already
                    logger.debug(
                        'reset outdata since further output should not be expected after preprocess exit'
                    )
                    self.__job.outdata = []
                break
            if jobparams_pre != jobparams_post:
                logger.debug(
                    'jobparams were updated by utility_before_payload()')
                # must update cmd
                cmd = cmd.replace(jobparams_pre, jobparams_post)

            # now run the main payload, when it finishes, run the postprocess (if necessary)
            # note: no need to run any main payload in HPO Horovod jobs on Kubernetes
            if os.environ.get('HARVESTER_HOROVOD', '') == '':

                #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before payload start] stdout=%s', _stdout)
                #logger.debug('[before payload start] stderr=%s', _stderr)

                proc = self.run_payload(self.__job, cmd, self.__out,
                                        self.__err)
            else:
                proc = None

            proc_co = None
            if proc is None:
                # run the post-process command even if there was no main payload
                if os.environ.get('HARVESTER_HOROVOD', '') != '':
                    logger.info('No need to execute any main payload')
                    exit_code = self.run_utility_after_payload_finished(
                        exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2)
                    self.post_payload(self.__job)
                else:
                    break
            else:
                # the process is now running, update the server
                # test 'tobekilled' from here to try payload kill
                send_state(self.__job, self.__args, self.__job.state)

                # note: when sending a state change to the server, the server might respond with 'tobekilled'
                if self.__job.state == 'failed':
                    logger.warning(
                        'job state is \'failed\' - abort payload and run()')
                    kill_processes(proc.pid)
                    break

                # allow for a secondary command to be started after the payload (e.g. a coprocess)
                utility_cmd = self.get_utility_command(
                    order=UTILITY_AFTER_PAYLOAD_STARTED2)
                if utility_cmd:
                    logger.debug('starting utility command: %s', utility_cmd)
                    label = 'coprocess' if 'coprocess' in utility_cmd else None
                    proc_co = self.run_command(utility_cmd, label=label)

                logger.info('will wait for graceful exit')
                exit_code = self.wait_graceful(self.__args, proc)
                # reset error if Raythena decided to kill payload (no error)
                if errors.KILLPAYLOAD in self.__job.piloterrorcodes:
                    logger.debug('ignoring KILLPAYLOAD error')
                    self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(
                        errors.KILLPAYLOAD,
                        pilot_error_codes=self.__job.piloterrorcodes,
                        pilot_error_diags=self.__job.piloterrordiags)
                    exit_code = 0
                    state = 'finished'
                else:
                    state = 'finished' if exit_code == 0 else 'failed'
                set_pilot_state(job=self.__job, state=state)
                logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n',
                            proc.pid, exit_code, self.__job.state)

                #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after payload finish] stdout=%s', _stdout)
                #logger.debug('[after payload finish] stderr=%s', _stderr)

                # stop the utility command (e.g. a coprocess) if necessary
                if proc_co:
                    logger.debug('stopping utility command: %s', utility_cmd)
                    kill_processes(proc_co.pid)

                if exit_code is None:
                    logger.warning(
                        'detected unset exit_code from wait_graceful - reset to -1'
                    )
                    exit_code = -1

                for order in [
                        UTILITY_AFTER_PAYLOAD_FINISHED,
                        UTILITY_AFTER_PAYLOAD_FINISHED2
                ]:
                    exit_code = self.run_utility_after_payload_finished(
                        exit_code, state, order)

                self.post_payload(self.__job)

                # stop any running utilities
                if self.__job.utilities != {}:
                    self.stop_utilities()

            if self.__job.is_hpo and state != 'failed':
                # in case there are more hyper-parameter points, move away the previous log files
                #self.rename_log_files(iteration)
                iteration += 1
            else:
                break

        return exit_code
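
The run() loop in Example #10 keeps iterating for HPO jobs until the preprocess reports, via an exit code in the 160-162 range, that there are no more hyper-parameter points; that sentinel is then translated to success. The control flow reduces to a sketch like this (run_preprocess_stub stands in for the real method):

def run_preprocess_stub(iteration):
    """Stub: two hyper-parameter points, then the 'nothing more to run' code."""
    return 0 if iteration < 2 else 160

def hpo_loop(is_hpo=True):
    iteration = 0
    while True:
        exit_code = run_preprocess_stub(iteration)
        if exit_code:
            if 160 <= exit_code <= 162:
                exit_code = 0  # sentinel: no more points, not an error
            break
        # ... run the payload and postprocess for this iteration ...
        if is_hpo:
            iteration += 1  # move on to the next hyper-parameter point
        else:
            break
    return exit_code

print(hpo_loop())  # 0: the sentinel ended the loop cleanly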
Example #11
def execute_payloads(queues, traces, args):
    """
    Execute queued payloads.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    job = None
    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            job = queues.validated_payloads.get(block=True, timeout=1)
            log = get_logger(job.jobid, logger)

            q_snapshot = list(queues.finished_data_in.queue)
            peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid]
            if len(peek) == 0:
                #queues.validated_payloads.put(job)
                put_in_queue(job, queues.validated_payloads)
                for i in range(10):  # Python 3
                    if args.graceful_stop.is_set():
                        break
                    time.sleep(1)
                continue

            # this job is now to be monitored, so add it to the monitored_payloads queue
            #queues.monitored_payloads.put(job)
            put_in_queue(job, queues.monitored_payloads)

            log.info('job %s added to monitored payloads queue' % job.jobid)

            out = open(os.path.join(job.workdir, config.Payload.payloadstdout),
                       'wb')
            err = open(os.path.join(job.workdir, config.Payload.payloadstderr),
                       'wb')

            send_state(job, args, 'starting')

            payload_executor = get_payload_executor(args, job, out, err,
                                                    traces)
            log.info("Got payload executor: %s" % payload_executor)

            # run the payload and measure the execution time
            job.t0 = os.times()
            exit_code = payload_executor.run()

            set_cpu_consumption_time(job)
            job.transexitcode = exit_code % 255

            out.close()
            err.close()

            # analyze and interpret the payload execution output
            perform_initial_payload_error_analysis(job, exit_code)

            # was an error already found?
            if job.piloterrorcodes:
                exit_code_interpret = 1
            else:
                pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                user = __import__('pilot.user.%s.diagnose' % pilot_user,
                                  globals(), locals(), [pilot_user],
                                  0)  # Python 2/3
                try:
                    exit_code_interpret = user.interpret(job)
                except Exception as e:
                    log.warning('exception caught: %s' % e)
                    exit_code_interpret = -1
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                        errors.INTERNALPILOTPROBLEM)

            if exit_code_interpret == 0 and exit_code == 0:
                log.info(
                    'main payload error analysis completed - did not find any errors'
                )

                # update output lists if zipmaps were used
                #job.add_archives_to_output_lists()

                # queues.finished_payloads.put(job)
                put_in_queue(job, queues.finished_payloads)
            else:
                log.debug(
                    'main payload error analysis completed - adding job to failed_payloads queue'
                )
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)

        except queue.Empty:
            continue
        except Exception as e:
            logger.fatal(
                'execute payloads caught an exception (cannot recover): %s, %s'
                % (e, traceback.format_exc()))
            if job:
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.PAYLOADEXECUTIONEXCEPTION)
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)
            while not args.graceful_stop.is_set():
                # let the log stage-out finish, but stop running payloads since there is likely a problem with the pilot
                time.sleep(5)

    logger.info('[payload] execute_payloads thread has finished')
Example #12
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long after the first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue'
                        % _job.jobid)

                # now create input file metadata if required by the payload
                if config.Payload.executor_type.lower() != 'raythena':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s' % xml)
            else:
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')