def copytool_out(queues, traces, args):
    """
    Main stage-out thread.

    Perform stage-out as soon as a job object can be extracted from the data_out queue.
    Successfully staged-out jobs are moved to queues.finished_data_out, failures to
    queues.failed_data_out.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')

    # job ids already staged out by this thread - used to guard against duplicate stage-out (see below)
    processed_jobs = []
    while cont:
        time.sleep(0.5)

        # abort if kill signal arrived too long time ago, ie loop is stuck
        current_time = int(time.time())
        if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
            logger.warning('loop has run for too long time after first kill signal - will abort')
            break

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')

        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                # hack to prevent stage-out to be called more than once for same job object (can apparently happen
                # in multi-output jobs)
                # should not be necessary unless job object is added to queues.data_out more than once - check this
                # for multiple output files
                if processed_jobs:
                    if is_already_processed(queues, processed_jobs):
                        continue

                logger.info('will perform stage-out for job id=%s', job.jobid)

                # a kill signal arrived before stage-out started: declare the job failed and leave the loop
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                    break

                if _stage_out_new(job, args):
                    # a kill signal arrived while stage-out was running; the transfer itself succeeded,
                    # so the job is deliberately NOT declared failed (declare_failed_by_kill left commented out)
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        logger.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)')
                        #declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                        break

                    #queues.finished_data_out.put(job)
                    # remember the job id so the duplicate-stage-out guard above can skip it next time
                    processed_jobs.append(job.jobid)
                    put_in_queue(job, queues.finished_data_out)
                    logger.debug('job object added to finished_data_out queue')
                else:
                    #queues.failed_data_out.put(job)
                    put_in_queue(job, queues.failed_data_out)
                    logger.debug('job object added to failed_data_out queue')
            else:
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            # nothing to stage out right now; leave the loop if an abort was requested
            if abort:
                cont = False
                break
            continue

        if abort:
            cont = False
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_out thread has finished')
def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    Watch the failed_data_in, finished_data_out and failed_data_out queues and route the
    extracted job objects onwards: failed jobs get their log staged out first, then all
    jobs end up in either the finished_jobs or failed_jobs queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"  # TODO: put in data_out queue instead?
            # NOTE: logging uses lazy %-args (not eager interpolation) so the message is only
            # formatted when the record is actually emitted - consistent with the rest of the module
            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs "
                         "queue", job.jobid)
                put_in_queue(job, queues.failed_data_out)
            else:
                log.info("job %s failed during stage-in, adding job object to failed_jobs queue", job.jobid)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                log.info('finished stage-out for finished payload, adding job to finished_jobs queue')
                put_in_queue(job, queues.finished_jobs)
            else:
                log.info('finished stage-out (of log) for failed payload')
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, "
                         "adding job object to failed_jobs queue", job.jobid)
            else:
                log.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job "
                         "object to failed_jobs queue", job.jobid)
            # either way the job itself has failed
            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    logger.debug('[data] queue_monitor thread has finished')
def _monitor_failed_data_in(queues, args):
    """Pop one job (if any) from failed_data_in, stage out its log and fail it."""
    try:
        failed_job = queues.failed_data_in.get(block=True, timeout=1)
    except queue.Empty:
        return

    # stage-in failed: try to ship the log before declaring the job failed
    failed_job.stageout = "log"  # TODO: put in data_out queue instead?
    if _stage_out_new(failed_job, args):
        logger.info("job %s failed during stage-in, adding job object to failed_jobs queue", failed_job.jobid)
        put_in_queue(failed_job, queues.failed_jobs)
    else:
        logger.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue", failed_job.jobid)
        put_in_queue(failed_job, queues.failed_data_out)


def _monitor_finished_data_out(queues):
    """Pop one job (if any) from finished_data_out and finish or fail it."""
    try:
        done_job = queues.finished_data_out.get(block=True, timeout=1)
    except queue.Empty:
        return

    # use the payload/transform exitCode from the job report if it exists
    payload_ok = done_job.transexitcode == 0 and done_job.exitcode == 0 and done_job.piloterrorcodes == []
    if payload_ok:
        logger.info('finished stage-out for finished payload, adding job to finished_jobs queue')
        put_in_queue(done_job, queues.finished_jobs)
    else:
        logger.info('finished stage-out (of log) for failed payload')
        put_in_queue(done_job, queues.failed_jobs)


def _monitor_failed_data_out(queues, args):
    """Pop one job (if any) from failed_data_out, retry the log upload and fail it."""
    try:
        bad_job = queues.failed_data_out.get(block=True, timeout=1)
    except queue.Empty:
        return

    # attempt to upload the log in case the previous stage-out failure was not an SE error
    bad_job.stageout = "log"
    set_pilot_state(job=bad_job, state="failed")
    if not _stage_out_new(bad_job, args):
        logger.info("job %s failed during stage-out", bad_job.jobid)
    put_in_queue(bad_job, queues.failed_jobs)


def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    Poll the failed_data_in, finished_data_out and failed_data_out queues and route the
    extracted jobs to finished_jobs/failed_jobs (staging out the log first for failed ones).

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    # loop until graceful_stop forces an abort
    while True:
        time.sleep(0.5)

        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')
            args.graceful_stop.set()

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        _monitor_failed_data_in(queues, args)
        _monitor_finished_data_out(queues)
        _monitor_failed_data_out(queues, args)

        if abort:
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] queue_monitor thread has finished')
def copytool_out(queues, traces, args):
    """
    Main stage-out thread.

    Perform stage-out as soon as a job object can be extracted from the data_out queue.
    Successfully staged-out jobs go to queues.finished_data_out, failures to
    queues.failed_data_out.

    :param queues: pilot queues object.
    :param traces: pilot traces object.
    :param args: pilot args object.
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')

    # only log the 'inside loop' message once
    first = True
    while cont:
        if first:
            first = False
            logger.debug('inside copytool_out() loop')

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')
        if abort:
            logger.debug('will abort ')
        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                log = get_logger(job.jobid)
                log.info('will perform stage-out')

                # a kill signal arrived before stage-out started: declare the job failed and leave the loop
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                    break

                if _stage_out_new(job, args):
                    # kill signal arrived during stage-out; the transfer succeeded, so just abort the loop
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        log.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)')
                        break

                    put_in_queue(job, queues.finished_data_out)
                    log.debug('job object added to finished_data_out queue')
                else:
                    put_in_queue(job, queues.failed_data_out)
                    log.debug('job object added to failed_data_out queue')
            else:
                # BUG FIX: 'log' is only bound in the if-branch above, so referencing it here
                # raised UnboundLocalError - use the module-level logger instead
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            if abort:
                logger.debug('aborting')
                cont = False
                break
            continue

        if abort:
            logger.debug('aborting')
            cont = False
            break

    logger.debug('[data] copytool_out thread has finished')