def declare_output(job, work_report, worker_stageout_declaration):
    out_file_report = {}
    out_file_report[job.jobid] = []
    for outfile in job.output_files.keys():
        logger.debug("File {} will be checked and declared for stage-out".format(outfile))
        if os.path.exists(outfile):
            file_desc = {}
            if outfile == job.log_file:
                file_desc['filetype'] = 'log'
            else:
                file_desc['filetype'] = 'output'
            file_desc['path'] = os.path.abspath(outfile)
            file_desc['fsize'] = os.path.getsize(outfile)
            if 'guid' in job.output_files[outfile].keys():
                file_desc['guid'] = job.output_files[outfile]['guid']
            elif work_report['outputfiles'] and work_report['outputfiles'][outfile]:
                file_desc['guid'] = work_report['outputfiles'][outfile]['guid']
            out_file_report[job.jobid].append(file_desc)
        else:
            logger.info("Expected output file {0} is missing. Job {1} will be failed".format(outfile, job.jobid))
            set_pilot_state(job=job, state='failed')

    if out_file_report[job.jobid]:
        write_json(worker_stageout_declaration, out_file_report)
        logger.debug('Stage-out declared in: {0}'.format(worker_stageout_declaration))
        logger.debug('Report for stage-out: {}'.format(out_file_report))

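# Illustrative only: the stage-out declaration written by declare_output() above is a
# JSON mapping of the job id to a list of file descriptors, roughly of this shape
# (all ids, paths, sizes and GUIDs are hypothetical):
#
# {
#     "4242424242": [
#         {"filetype": "output", "path": "/work/dir/out.root", "fsize": 1048576, "guid": "..."},
#         {"filetype": "log", "path": "/work/dir/job.log", "fsize": 2048, "guid": "..."}
#     ]
# }
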
def validate_post(queues, traces, args):
    """
    Validate finished payloads.
    If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as
    well but only for log stage-out (in failed_post() below).

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # finished payloads
        try:
            job = queues.finished_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        log = get_logger(job.jobid, logger)

        # by default, both output and log should be staged out
        job.stageout = 'all'
        log.debug('adding job to data_out queue')
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    logger.info('[payload] validate_post thread has finished')

def failed_post(queues, traces, args):
    """
    Get a Job object from the "failed_payloads" queue.
    Set the pilot state to "stageout" and the stageout field to "log", and add the Job object to the "data_out" queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # failed payloads
        try:
            job = queues.failed_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue

        logger.debug('adding log for log stageout')

        job.stageout = 'log'  # only stage-out log file
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] failed_post thread has finished')

def failed_post(queues, traces, args):
    """
    Get a Job object from the "failed_payloads" queue and add it to the "data_out" queue for log stage-out.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments.
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # failed payloads
        try:
            job = queues.failed_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue
        log = get_logger(job.jobid, logger)

        log.debug('adding log for log stageout')

        job.stageout = 'log'  # only stage-out log file
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    logger.info('[payload] failed_post thread has finished')

def memory_usage(job):
    """
    Perform memory usage verification.

    :param job: job object
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # Get the maxPSS value from the memory monitor
    summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor)

    if not summary_dictionary:
        exit_code = errors.BADMEMORYMONITORJSON
        diagnostics = "Memory monitor output could not be read"
        return exit_code, diagnostics

    maxdict = summary_dictionary.get('Max', {})
    maxpss_int = maxdict.get('maxPSS', -1)

    # Only proceed if values are set
    if maxpss_int != -1:
        maxrss = job.infosys.queuedata.maxrss

        if maxrss:
            # correction for SCORE/4CORE/nCORE jobs on UCORE queues
            scale = get_ucore_scale_factor(job)
            try:
                maxrss_int = 2 * int(maxrss * scale) * 1024  # Convert to int and kB
            except Exception as e:
                logger.warning("unexpected value for maxRSS: %s" % e)
            else:
                # Compare the maxRSS with the maxPSS from memory monitor
                if maxrss_int > 0 and maxpss_int > 0:
                    if maxpss_int > maxrss_int:
                        diagnostics = "job has exceeded the memory limit %d kB > %d kB (2 * queuedata.maxrss)" % \
                                      (maxpss_int, maxrss_int)
                        logger.warning(diagnostics)

                        # Create a lockfile to let RunJob know that it should not restart the memory monitor after it has been killed
                        #pUtil.createLockFile(False, self.__env['jobDic'][k][1].workdir, lockfile="MEMORYEXCEEDED")

                        # Kill the job
                        set_pilot_state(job=job, state="failed")
                        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXCEEDMAXMEM)
                        kill_processes(job.pid)
                    else:
                        logger.info("max memory (maxPSS) used by the payload is within the allowed limit: "
                                    "%d kB (2 * maxRSS = %d kB)" % (maxpss_int, maxrss_int))
        else:
            if maxrss == 0 or maxrss == "0":
                logger.info("queuedata.maxrss set to 0 (no memory checks will be done)")
            else:
                logger.warning("queuedata.maxrss is not set")

    return exit_code, diagnostics

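# A minimal numeric sketch of the limit computed above, assuming queuedata.maxrss is
# given in MB (as the conversion to kB suggests) and a scale factor of 1.0
# (hypothetical values):
#
#     maxrss = 4000                              # MB from queuedata
#     maxrss_int = 2 * int(maxrss * 1.0) * 1024  # = 8192000 kB allowed
#     # a reported maxPSS of e.g. 9000000 kB would exceed the limit and the job
#     # would be failed with errors.PAYLOADEXCEEDMAXMEM
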
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)

        if os.path.exists(job.workdir):
            workdirsize = get_disk_usage(job.workdir)

            # is user dir within allowed size limit?
            if workdirsize > maxwdirsize:
                exit_code = errors.USERDIRTOOLARGE
                diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                              (job.workdir, workdirsize, maxwdirsize)
                logger.fatal("%s", diagnostics)

                cmd = 'ls -altrR %s' % job.workdir
                _ec, stdout, stderr = execute(cmd, mute=True)
                logger.info("%s: %s", cmd + '\n', stdout)

                # kill the job
                # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED")
                set_pilot_state(job=job, state="failed")
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                kill_processes(job.pid)

                # remove any lingering input files from the work dir
                lfns, guids = job.get_lfns_and_guids()
                if lfns:
                    remove_files(job.workdir, lfns)

                    # remeasure the size of the workdir at this point since the value is stored below
                    workdirsize = get_disk_usage(job.workdir)
            else:
                logger.info("size of work directory %s: %d B (within %d B limit)", job.workdir, workdirsize, maxwdirsize)

            # Store the measured disk space (the max value will later be sent with the job metrics)
            if workdirsize > 0:
                job.add_workdir_size(workdirsize)
        else:
            logger.warning('job work dir does not exist: %s', job.workdir)
    else:
        logger.warning('skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics

def run(self):
    """
    Set up and run the payload, wait for it to finish, then stop any running utilities.

    :return: exit code (int).
    """

    log = get_logger(str(self.__job.jobid), logger)
    exit_code = 1
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()

    if self.setup_payload(self.__job, self.__out, self.__err):
        log.debug('running payload')
        proc = self.run_payload(self.__job, self.__out, self.__err)
        if proc is not None:
            # the process is now running, update the server
            set_pilot_state(job=self.__job, state="running")
            send_state(self.__job, self.__args, self.__job.state)

            log.info('will wait for graceful exit')
            exit_code = self.wait_graceful(self.__args, proc, self.__job)
            state = 'finished' if exit_code == 0 else 'failed'
            set_pilot_state(job=self.__job, state=state)
            log.info('finished pid=%s exit_code=%s state=%s' % (proc.pid, exit_code, self.__job.state))

            if exit_code is None:
                log.warning('detected unset exit_code from wait_graceful - reset to -1')
                exit_code = -1

            self.utility_after_payload_finished(self.__job)

            self.post_payload(self.__job)

            # stop any running utilities
            if self.__job.utilities != {}:
                for utcmd in self.__job.utilities.keys():
                    utproc = self.__job.utilities[utcmd][0]
                    if utproc:
                        user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], -1)
                        sig = user.get_utility_command_kill_signal(utcmd)
                        log.info("stopping process \'%s\' with signal %d" % (utcmd, sig))
                        try:
                            os.killpg(os.getpgid(utproc.pid), sig)
                        except Exception as e:
                            log.warning('exception caught: %s (ignoring)' % e)

                        user.post_utility_command_action(utcmd, self.__job)

    return exit_code

def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: (updated job object.)
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (job.jobid, time_stamp())
    logger.fatal(diagnostics)

    cmd = 'ps -fwu %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ls -ltr %s' % (job.workdir)
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'pstree -g -a'
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)

def declare_failed_by_kill(job, queue, sig):
    """
    Declare the job failed by a kill signal and put it in a suitable failed queue.
    E.g. queue=queues.failed_data_in, if the kill signal was received during stage-in.

    :param job: job object.
    :param queue: queue object.
    :param sig: signal.
    :return:
    """

    set_pilot_state(job=job, state="failed")
    error_code = errors.get_kill_signal_error_code(sig)
    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code)

    #queue.put(job)
    put_in_queue(job, queue)

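# Minimal usage sketch (mirrors the call sites in copytool_in() further down, where a
# kill signal is detected around stage-in); assumes the usual job/queues/args/traces
# objects are in scope:
#
#     if args.abort_job.is_set():
#         traces.pilot['command'] = 'abort'
#         declare_failed_by_kill(job, queues.failed_data_in, args.signal)
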
def run_payload(self, job, cmd, out, err):
    """
    Setup and execute the main payload process.

    REFACTOR using run_command()

    :param job: job object.
    :param cmd: payload command (string).
    :param out: (currently not used; deprecated)
    :param err: (currently not used; deprecated)
    :return: proc (subprocess returned by Popen())
    """

    # main payload process steps

    # add time for PILOT_PRE_PAYLOAD
    self.pre_payload(job)

    logger.info("\n\npayload execution command:\n\n%s\n" % cmd)
    try:
        proc = execute(cmd, workdir=job.workdir, returnproc=True, usecontainer=True,
                       stdout=out, stderr=err, cwd=job.workdir, job=job)
    except Exception as e:
        logger.error('could not execute: %s' % str(e))
        return None
    if type(proc) == tuple and not proc[0]:
        logger.error('failed to execute payload')
        return None

    logger.info('started -- pid=%s executable=%s' % (proc.pid, cmd))
    job.pid = proc.pid
    job.pgrp = os.getpgid(job.pid)
    set_pilot_state(job=job, state="running")

    #_cmd = self.utility_with_payload(job)
    #if _cmd:
    #    logger.info('could have executed: %s' % _cmd)

    self.utility_after_payload_started(job)

    return proc

def validate_post(queues, traces, args):
    """
    Validate finished payloads.
    If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as
    well but only for log stage-out (in failed_post() below).

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        # finished payloads
        try:
            job = queues.finished_payloads.get(block=True, timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            continue

        # by default, both output and log should be staged out
        job.stageout = 'all'
        logger.debug('adding job to data_out queue')
        #queues.data_out.put(job)
        set_pilot_state(job=job, state='stageout')
        put_in_queue(job, queues.data_out)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] validate_post thread has finished')

def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if kill signal arrived too long ago, ie loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning('loop has run for too long after first kill signal - will abort')
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # does the user want to execute any special commands before stage-in?
            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN)
            if cmd:
                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before xcache start] stdout=%s', _stdout)
                #logger.debug('[before xcache start] stderr=%s', _stderr)

                _, stdout, stderr = execute(cmd.get('command'))
                logger.debug('stdout=%s', stdout)
                logger.debug('stderr=%s', stderr)

                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after xcache start] stdout=%s', _stdout)
                #logger.debug('[after xcache start] stderr=%s', _stderr)

                # perform any action necessary after command execution (e.g. stdout processing)
                kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout}
                user.post_prestagein_utility_command(**kwargs)

                # write output to log files
                write_utility_output(job.workdir, cmd.get('label', 'utility'), stdout, stderr)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning('job state is \'failed\' - order log transfer and abort copytool_in()')
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug('job %s has been removed from the current_data_in queue', _job.jobid)

                # now create input file metadata if required by the payload
                if os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'generic':
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s', xml)
            else:
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug('job %s has been removed from the current_data_in queue', _job.jobid)

                logger.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)

                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')

def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        time.sleep(0.5)
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')
            args.graceful_stop.set()

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"

            # TODO: put in data_out queue instead?
            if not _stage_out_new(job, args):
                logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue", job.jobid)
                put_in_queue(job, queues.failed_data_out)
            else:
                logger.info("job %s failed during stage-in, adding job object to failed_jobs queue", job.jobid)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                logger.info('finished stage-out for finished payload, adding job to finished_jobs queue')
                #queues.finished_jobs.put(job)
                put_in_queue(job, queues.finished_jobs)
            else:
                logger.info('finished stage-out (of log) for failed payload')
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                logger.info("job %s failed during stage-out", job.jobid)

            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] queue_monitor thread has finished')

def _stage_out_new(job, args):
    """
    Stage-out of all output files.
    If job.stageout=log then only log files will be transferred.

    :param job: job object.
    :param args: pilot args object.
    :return: True in case of success, False otherwise.
    """

    #logger.info('testing sending SIGUSR1')
    #import signal
    #os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    is_success = True

    if not job.outdata or job.is_eventservice:
        logger.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    if job.stageout != 'log':  ## do stage-out output files
        if not _do_stageout(job, job.outdata, ['pw', 'w'], args.queue, title='output', output_dir=args.output_dir):
            is_success = False
            logger.warning('transfer of output file(s) failed')

    if job.stageout in ['log', 'all'] and job.logdata:  ## do stage-out log files
        # prepare log file, consider only 1st available log file
        status = job.get_status('LOG_TRANSFER')
        if status != LOG_TRANSFER_NOT_DONE:
            logger.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS
        logfile = job.logdata[0]

        try:
            tarball_name = 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue)
            input_files = [fspec.lfn for fspec in job.indata]
            output_files = [fspec.lfn for fspec in job.outdata]
            create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup,
                       input_files=input_files, output_files=output_files,
                       is_looping=errors.LOOPINGJOB in job.piloterrorcodes, debugmode=job.debug)
        except LogFileCreationFailure as error:
            logger.warning('failed to create tar file: %s', error)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        if not _do_stageout(job, [logfile], ['pl', 'pw', 'w'], args.queue, title='log', output_dir=args.output_dir):
            is_success = False
            logger.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED
        else:
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE
    elif not job.logdata:
        logger.info('no log was defined - will not create log file')
        job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be sent to Panda
    fileinfo = {}
    for iofile in job.outdata + job.logdata:
        if iofile.status in ['transferred']:
            fileinfo[iofile.lfn] = {'guid': iofile.guid,
                                    'fsize': iofile.filesize,
                                    'adler32': iofile.checksum.get('adler32'),
                                    'surl': iofile.turl}

    job.fileinfo = fileinfo

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not is_success:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        logger.warning('stage-out failed')
        return False

    logger.info('stage-out finished correctly')

    if not job.state or (job.state and job.state == 'stageout'):
        # is the job state already set? if so, don't change the state (unless it's the stageout state)
        logger.debug('changing job state from %s to finished', job.state)
        set_pilot_state(job=job, state="finished")

    # send final server update since all transfers have finished correctly
    # send_state(job, args, 'finished', xml=dumps(fileinfodict))

    return is_success

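# Illustrative only: the job.fileinfo dictionary assembled above maps each transferred
# LFN to the details reported to PanDA, e.g. (all values hypothetical):
#
# {
#     "EVNT.12345._000001.pool.root.1": {
#         "guid": "0A1B2C3D-4E5F-6071-8293-A4B5C6D7E8F9",
#         "fsize": 123456789,
#         "adler32": "3a2b1c0d",
#         "surl": "davs://storage.example.org/rucio/.../EVNT.12345._000001.pool.root.1"
#     }
# }
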
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if kill signal arrived too long ago, ie loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning('loop has run for too long after first kill signal - will abort')
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning('job state is \'failed\' - order log transfer and abort copytool_in()')
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                # now create input file metadata if required by the payload
                if config.Payload.executor_type.lower() != 'raythena':
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s' % xml)
            else:
                logger.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)

                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')

def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments.
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"

            # TODO: put in data_out queue instead?
            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs "
                         "queue" % job.jobid)
                #queues.failed_data_out.put(job)
                put_in_queue(job, queues.failed_data_out)
            else:
                log.info("job %s failed during stage-in, adding job object to failed_jobs queue" % job.jobid)
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                log.info('finished stage-out for finished payload, adding job to finished_jobs queue')
                #queues.finished_jobs.put(job)
                put_in_queue(job, queues.finished_jobs)
            else:
                log.info('finished stage-out (of log) for failed payload')
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, "
                         "adding job object to failed_jobs queue" % job.jobid)
            else:
                log.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job "
                         "object to failed_jobs queue" % job.jobid)

            #queues.failed_jobs.put(job)
            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    logger.debug('[data] queue_monitor thread has finished')

def _stage_out_new(job, args):
    """
    Stage-out of all output files.
    If job.stageout=log then only log files will be transferred.

    :param job: job object.
    :param args: pilot args object.
    :return: True in case of success, False otherwise.
    """

    log = get_logger(job.jobid)

    #log.info('testing sending SIGUSR1')
    #import signal
    #os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    is_success = True

    if not job.outdata or job.is_eventservice:
        log.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    if job.stageout != 'log':  ## do stage-out output files
        if not _do_stageout(job, job.outdata, ['pw', 'w'], title='output'):
            is_success = False
            log.warning('transfer of output file(s) failed')

    if job.stageout in ['log', 'all'] and job.logdata:  ## do stage-out log files
        # prepare log file, consider only 1st available log file
        status = job.get_status('LOG_TRANSFER')
        if status != LOG_TRANSFER_NOT_DONE:
            log.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS
        logfile = job.logdata[0]

        try:
            create_log(job, logfile, 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue))
        except LogFileCreationFailure as e:
            log.warning('failed to create tar file: %s' % e)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        if not _do_stageout(job, [logfile], ['pl', 'pw', 'w'], title='log'):
            is_success = False
            log.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED
        else:
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be sent to Panda
    fileinfo = {}
    for e in job.outdata + job.logdata:
        if e.status in ['transferred']:
            fileinfo[e.lfn] = {'guid': e.guid, 'fsize': e.filesize,
                               'adler32': e.checksum.get('adler32'), 'surl': e.turl}

    job.fileinfo = fileinfo
    log.info('prepared job.fileinfo=%s' % job.fileinfo)

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not is_success:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        log.warning('stage-out failed')
        # log.warning('stage-out failed with error: %d, %s (setting job state to failed)' %
        #             (job['pilotErrorCode'], job['pilotErrorDiag']))
        # send_state(job, args, 'failed')
        return False

    log.info('stage-out finished correctly')

    if not job.state:  # is the job state already set? if so, don't change the state
        set_pilot_state(job=job, state="finished")

    # send final server update since all transfers have finished correctly
    # send_state(job, args, 'finished', xml=dumps(fileinfodict))

    return is_success

def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments.
    :return:
    """

    while not args.graceful_stop.is_set():
        try:
            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')
            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            log = get_logger(job.jobid)

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                log.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                #queues.finished_data_in.put(job)
                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    log.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                # now create input file metadata if required by the payload
                try:
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], -1)
                    _dir = '/srv' if job.usecontainer else job.workdir
                    file_dictionary = get_input_file_dictionary(job.indata, _dir)
                    #file_dictionary = get_input_file_dictionary(job.indata, job.workdir)
                    log.debug('file_dictionary=%s' % str(file_dictionary))
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    log.info('created input file metadata:\n%s' % xml)
                except Exception as e:
                    pass
            else:
                log.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                #queues.failed_data_in.put(job)
                put_in_queue(job, queues.failed_data_in)

                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                # send_state(job, args, 'failed')

        except queue.Empty:
            continue

    logger.debug('[data] copytool_in thread has finished')

def run(self):  # noqa: C901
    """
    Run all payload processes (including pre- and post-processes, and utilities).
    In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special exit code.

    :return:
    """

    # get the payload command from the user specific code
    self.pre_setup(self.__job)

    cmd = self.get_payload_command(self.__job)
    # extract the setup in case the preprocess command needs it
    self.__job.setup = self.extract_setup(cmd)

    self.post_setup(self.__job)

    # a loop is needed for HPO jobs
    # abort when nothing more to run, or when the preprocess returns a special exit code
    iteration = 0
    while True:
        logger.info('payload iteration loop #%d', iteration + 1)
        os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration
        show_memory_usage()

        # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
        jobparams_pre = self.__job.jobparams
        exit_code = self.run_preprocess(self.__job)
        jobparams_post = self.__job.jobparams
        if exit_code:
            if exit_code >= 160 and exit_code <= 162:
                exit_code = 0
                # wipe the output file list since there won't be any new files
                # any output files from previous iterations should have been transferred already
                logger.debug('reset outdata since further output should not be expected after preprocess exit')
                self.__job.outdata = []
            break
        if jobparams_pre != jobparams_post:
            logger.debug('jobparams were updated by utility_before_payload()')
            # must update cmd
            cmd = cmd.replace(jobparams_pre, jobparams_post)

        # now run the main payload, when it finishes, run the postprocess (if necessary)
        # note: no need to run any main payload in HPO Horovod jobs on Kubernetes
        if os.environ.get('HARVESTER_HOROVOD', '') == '':
            #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
            #logger.debug('[before payload start] stdout=%s', _stdout)
            #logger.debug('[before payload start] stderr=%s', _stderr)
            proc = self.run_payload(self.__job, cmd, self.__out, self.__err)
        else:
            proc = None

        proc_co = None
        if proc is None:
            # run the post-process command even if there was no main payload
            if os.environ.get('HARVESTER_HOROVOD', '') != '':
                logger.info('No need to execute any main payload')
                exit_code = self.run_utility_after_payload_finished(exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2)
                self.post_payload(self.__job)
            else:
                break
        else:
            # the process is now running, update the server
            # test 'tobekilled' from here to try payload kill
            send_state(self.__job, self.__args, self.__job.state)

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if self.__job.state == 'failed':
                logger.warning('job state is \'failed\' - abort payload and run()')
                kill_processes(proc.pid)
                break

            # allow for a secondary command to be started after the payload (e.g. a coprocess)
            utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2)
            if utility_cmd:
                logger.debug('starting utility command: %s', utility_cmd)
                label = 'coprocess' if 'coprocess' in utility_cmd else None
                proc_co = self.run_command(utility_cmd, label=label)

            logger.info('will wait for graceful exit')
            exit_code = self.wait_graceful(self.__args, proc)

            # reset error if Raythena decided to kill payload (no error)
            if errors.KILLPAYLOAD in self.__job.piloterrorcodes:
                logger.debug('ignoring KILLPAYLOAD error')
                self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(
                    errors.KILLPAYLOAD,
                    pilot_error_codes=self.__job.piloterrorcodes,
                    pilot_error_diags=self.__job.piloterrordiags)
                exit_code = 0
                state = 'finished'
            else:
                state = 'finished' if exit_code == 0 else 'failed'
            set_pilot_state(job=self.__job, state=state)
            logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state)

            #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
            #logger.debug('[after payload finish] stdout=%s', _stdout)
            #logger.debug('[after payload finish] stderr=%s', _stderr)

            # stop the utility command (e.g. a coprocess) if necessary
            if proc_co:
                logger.debug('stopping utility command: %s', utility_cmd)
                kill_processes(proc_co.pid)

            if exit_code is None:
                logger.warning('detected unset exit_code from wait_graceful - reset to -1')
                exit_code = -1

            for order in [UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2]:
                exit_code = self.run_utility_after_payload_finished(exit_code, state, order)

            self.post_payload(self.__job)

            # stop any running utilities
            if self.__job.utilities != {}:
                self.stop_utilities()

        if self.__job.is_hpo and state != 'failed':
            # in case there are more hyper-parameter points, move away the previous log files
            #self.rename_log_files(iteration)
            iteration += 1
        else:
            break

    return exit_code

def check_payload_stdout(job):
    """
    Check the size of the payload stdout.

    :param job: job object.
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # get list of log files
    file_list = glob(os.path.join(job.workdir, 'log.*'))

    # is this a multi-trf job?
    n_jobs = job.jobparams.count("\n") + 1
    for _i in range(n_jobs):
        # get name of payload stdout file created by the pilot
        _stdout = config.Payload.payloadstdout
        if n_jobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

        # add the primary stdout file to the fileList
        file_list.append(os.path.join(job.workdir, _stdout))

    tmp_list = glob(os.path.join(job.workdir, 'workDir/tmp.stdout.*'))
    if tmp_list:
        file_list += tmp_list
    logger.debug('file list=%s', str(file_list))

    # now loop over all files and check each individually (any large enough file will fail the job)
    for filename in file_list:
        logger.debug('check_payload_stdout: filename=%s', filename)
        if "job.log.tgz" in filename:
            logger.info("skipping file size check of file (%s) since it is a special log file", filename)
            continue

        if os.path.exists(filename):
            try:
                # get file size in bytes
                fsize = os.path.getsize(filename)
            except Exception as error:
                logger.warning("could not read file size of %s: %s", filename, error)
            else:
                # is the file too big?
                localsizelimit_stdout = get_local_size_limit_stdout()
                if fsize > localsizelimit_stdout:
                    exit_code = errors.STDOUTTOOBIG
                    diagnostics = "Payload stdout file too big: %d B (larger than limit %d B)" % \
                                  (fsize, localsizelimit_stdout)
                    logger.warning(diagnostics)

                    # kill the job
                    set_pilot_state(job=job, state="failed")
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                    kill_processes(job.pid)

                    # remove the payload stdout file after the log extracts have been created

                    # remove any lingering input files from the work dir
                    lfns, guids = job.get_lfns_and_guids()
                    if lfns:
                        exit_code = remove_files(job.workdir, lfns)
                else:
                    logger.info("payload log (%s) within allowed size limit (%d B): %d B",
                                os.path.basename(filename), localsizelimit_stdout, fsize)
        else:
            logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename)

    return exit_code, diagnostics

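# Illustrative only: if the configured stdout name were, say, 'payload.stdout.txt'
# (hypothetical value of config.Payload.payloadstdout), a 3-step multi-trf job would
# add payload.stdout_1.txt, payload.stdout_2.txt and payload.stdout_3.txt to the list
# checked above, alongside any log.* and workDir/tmp.stdout.* files.
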
def run(args):
    """
    Main execution function for the generic HPC workflow.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set communication point. Worker report should be placed there, matched with working directory of Harvester
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        logger.info('setting up tracing')
        traces = namedtuple('traces', ['pilot'])
        traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference
        resource = __import__('pilot.resource.%s' % args.hpc_resource, globals(), locals(), [args.hpc_resource], -1)

        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(), globals(), locals(),
                          [args.pilot_user.lower()], -1)

        # get job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)

        # cd to job working directory
        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be published in: {0}".format(worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s' % (args.hpc_resource, str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())

        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))
        payloadstdout = open(payload_stdout_file, "w")
        payloadstderr = open(payload_stderr_file, "w")

        add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
        set_pilot_state(job=job, state="running")
        work_report["jobStatus"] = job.state
        work_report["startTime"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        start_time = time.asctime(time.localtime(time.time()))
        job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        publish_work_report(work_report, worker_attributes_file)

        stime = time.time()
        t0 = os.times()
        exit_code, stdout, stderr = execute(my_command, stdout=payloadstdout, stderr=payloadstderr, shell=True)
        logger.debug("Payload exit code: {0}".format(exit_code))
        t1 = os.times()
        exetime = time.time() - stime
        end_time = time.asctime(time.localtime(time.time()))
        t = map(lambda x, y: x - y, t1, t0)
        t_tot = reduce(lambda x, y: x + y, t[2:3])
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        payloadstdout.close()
        payloadstderr.close()
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(exit_code, job.jobid)
        log_jobreport += "CPU consumption time: {0} JobID: {1} \n".format(t_tot, job.jobid)
        log_jobreport += "Start time: {0} JobID: {1} \n".format(start_time, job.jobid)
        log_jobreport += "End time: {0} JobID: {1} \n".format(end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec. JobID: {1} \n".format(exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # Parse job report file and update of work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir, work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with logs
        protectedfiles = job.output_files.keys()

        # log file not produced (yet), so should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log file was not declared")

        logger.info("Cleanup of working directory")
        protectedfiles.extend([worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # Copy of output to shared FS for stage-out
        if not job_scratch_dir == work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(), args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args)

    except Exception as e:
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logging.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces