def handle_abort_interrupt(signum, frame, stream=sys.stdout):
    """Signal handler: request an abort, announce the shutdown, restore the default handler.

    signum -- number of the received signal; its handler is reset to SIG_DFL
    frame  -- interrupted stack frame (unused)
    stream -- stream that receives the cursor control characters
    """
    abort(True)
    # Two backspaces and a carriage return - presumably to overwrite the echoed '^C'
    stream.write('\b\b\r')
    stream.flush()
    shutdown_message = 'Quitting grid-control! (This can take a few seconds...)'
    # The activity is stored on the function object so that it stays referenced
    handle_abort_interrupt.log = Activity(shutdown_message, parent='root')
    # From now on the same signal triggers the default action instead of this handler
    signal.signal(signum, signal.SIG_DFL)
def check(self, wms, maxsample = 100):
    """Query the status of processing jobs and react to the result.

    wms       -- backend that is asked for the job status
    maxsample -- number of jobs to sample per call when running in continuous mode
                 (-1 is handed to self.sample otherwise)

    Returns True if something changed, False if nothing changed or if the
    status check signalled an abort (change is None).
    """
    processing_jobs = self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING))
    jobList = self.sample(processing_jobs, utils.QM(self.continuous, maxsample, -1))

    # Check jobs in the joblist and return changes, timeouts and successfully reported jobs
    (change, timeoutList, reported) = self.checkJobList(wms, jobList)
    if change is None:  # neither True or False => abort
        return False

    # Cancel jobs which took too long
    if timeoutList:
        change = True
        # print(...) with a single string behaves the same as the print statement
        # on Python 2 and is also valid Python 3 syntax
        print('\nTimeout for the following jobs:')
        self.cancel(wms, timeoutList)

    # Process task interventions
    self.processIntervention(wms, self._task.getIntervention())

    # Quit when all jobs are finished
    if self.jobDB.getJobsN(ClassSelector(JobClass.ENDSTATE)) == len(self.jobDB):
        self.logDisabled()
        self._eventhandler.onTaskFinish(len(self.jobDB))
        if self._task.canFinish():
            utils.vprint('Task successfully completed. Quitting grid-control!', -1, True)
            utils.abort(True)

    return change
def jobCycle(self, wait = utils.wait):
    """Run the check / retrieve / submit cycle until aborted or, in non-continuous mode, once.

    wait -- callable used to pause between steps; its return value is stored
            as the "did wait" marker that suppresses the idle timeout
    """
    # Timestamp of the last disk space warning. It has to survive loop iterations,
    # otherwise the 5 minute throttle below would never take effect.
    lastSpaceMsg = 0
    while True:
        didWait = False

        # Check whether wms can submit
        if not self.wms.canSubmit(self.task.wallTime, self._submitFlag):
            self._submitFlag = False

        # Check free disk space
        if (self._checkSpace > 0) and utils.freeSpace(self._workDir) < self._checkSpace:
            # Repeat the warning at most every 5 minutes
            if time.time() - lastSpaceMsg > 5 * 60:
                utils.vprint('Not enough space left in working directory', -1, True)
                lastSpaceMsg = time.time()
        else:
            for action in map(str.lower, self._actionList):
                if action.startswith('c') and not utils.abort():  # check for jobs
                    if self.jobManager.check(self.wms):
                        didWait = wait(self.wms.getTimings()[1])
                elif action.startswith('r') and not utils.abort():  # retrieve finished jobs
                    if self.jobManager.retrieve(self.wms):
                        didWait = wait(self.wms.getTimings()[1])
                elif action.startswith('s') and not utils.abort() and self._submitFlag:
                    if self.jobManager.submit(self.wms):
                        didWait = wait(self.wms.getTimings()[1])

        # quit if abort flag is set or not in continuous mode
        if utils.abort() or not self.runContinuous:
            break

        # idle timeout
        if not didWait:
            wait(self.wms.getTimings()[0])
def process(self, wait = utils.wait):
    """Run the check / retrieve / submit cycle until aborted or the configured duration is over.

    wait -- callable used to pause between steps; its return value is stored
            as the "did wait" marker that suppresses the idle timeout

    A negative self.duration disables the time limit. self.monitor.onFinish()
    is called after the loop has ended.
    """
    wmsTiming = self.wms.getTimings()
    t_start = time.time()

    # Set up the rate limited logger once. Adding the filter inside the loop
    # would attach one more filter object to the shared logger on every cycle.
    spaceLogger = logging.getLogger('workflow.space')
    spaceLogger.addFilter(LogEveryNsec(interval = 5 * 60))

    while True:
        didWait = False

        # Check whether wms can submit
        if not self.wms.canSubmit(self._submitTime, self._submitFlag):
            self._submitFlag = False

        # Check free disk space
        if (self._checkSpace > 0) and utils.freeSpace(self._workDir) < self._checkSpace:
            spaceLogger.warning('Not enough space left in working directory')
        else:
            for action in imap(str.lower, self._actionList):
                if action.startswith('c') and not utils.abort():  # check for jobs
                    if self.jobManager.check(self.wms):
                        didWait = wait(wmsTiming.waitBetweenSteps)
                elif action.startswith('r') and not utils.abort():  # retrieve finished jobs
                    if self.jobManager.retrieve(self.wms):
                        didWait = wait(wmsTiming.waitBetweenSteps)
                elif action.startswith('s') and not utils.abort() and self._submitFlag:
                    if self.jobManager.submit(self.wms):
                        didWait = wait(wmsTiming.waitBetweenSteps)

        # quit if abort flag is set or not in continuous mode
        if utils.abort() or ((self.duration >= 0) and (time.time() - t_start > self.duration)):
            break

        # idle timeout
        if not didWait:
            wait(wmsTiming.waitOnIdle)

    self.monitor.onFinish()
def check(self, wms):
    """Query the status of processing jobs and react to the result.

    Returns True if something changed, False if nothing changed or if the
    status check signalled an abort (change is None).
    """
    processing_jobs = self.jobDB.getJobs(ClassSelector(JobClass.PROCESSING))
    sample_size = utils.QM(self._continuous, self._chunks_check, -1)
    jobList = self._sample(processing_jobs, sample_size)

    # Ask the backend: result is (change flag, timed out jobs, jobs that reported)
    (change, timeoutList, reported) = self._checkJobList(wms, jobList)
    unreported = len(jobList) - len(reported)
    if unreported > 0:
        self._log_user_time.critical('%d job(s) did not report their status!', unreported)
    if change is None:
        # None (neither True nor False) marks an aborted check
        return False

    # Jobs that took too long are cancelled
    if len(timeoutList) > 0:
        change = True
        self._log_user.warning('Timeout for the following jobs:')
        self.cancel(wms, timeoutList)

    # Apply interventions requested by the task
    self._processIntervention(wms, self._task.getIntervention())

    # Once every job is in an end state, the task may be finished
    all_jobs_ended = (self.jobDB.getJobsN(ClassSelector(JobClass.ENDSTATE)) == len(self.jobDB))
    if all_jobs_ended:
        self._logDisabledJobs()
        self._eventhandler.onTaskFinish(len(self.jobDB))
        if self._task.canFinish():
            self._log_user_time.info('Task successfully completed. Quitting grid-control!')
            utils.abort(True)

    return change
def run(self):
    """Deploy the task and run the job cycle until aborted or the duration is exceeded.

    A negative self._duration means no time limit. self.job_manager.finish()
    is called after the loop has ended.
    """
    if self._duration < 0:
        self._log.info('Running in continuous mode. Press ^C to exit.')
    elif self._duration > 0:
        self._log.info('Running for %s', str_time_short(self._duration))

    # Prepare work package
    self.backend.deploy_task(self.task,
        transfer_se=self._transfer_se, transfer_sb=self._transfer_sb)

    # Job submission loop
    backend_timing_info = self.backend.get_interval_info()
    t_start = time.time()
    while not abort():
        # Submission is switched off permanently once the backend refuses it
        if not self.backend.can_submit(self._submit_time, self._submit_flag):
            self._submit_flag = False

        # Without free disk space no action is performed in this cycle
        if self._no_disk_space_left():
            did_wait = False
            self._check_space_log.warning('Not enough space left in working directory')
        else:
            did_wait = self._run_actions(backend_timing_info)

        # Leave on abort ...
        if abort():
            break
        # ... or when the time budget of a non-continuous run is used up
        if (self._duration >= 0) and (time.time() - t_start > self._duration):
            break

        # Idle timeout - only needed when no step has paused already
        if not did_wait:
            wait(backend_timing_info.wait_on_idle)

    self.job_manager.finish()
def check(self, task, wms):
    """Query the status of processing jobs and react to the result.

    Returns True if something changed, False if nothing changed or if the
    status check signalled an abort (change is None).
    """
    processing_jobs = self.job_db.get_job_list(ClassSelector(JobClass.PROCESSING))
    jobnum_list = self._sample(processing_jobs, self._get_chunk_size(self._chunks_check))

    # Ask the backend: result is (change flag, timed out jobs, jobs that reported)
    check_result = self._check_get_jobnum_list(task, wms, jobnum_list)
    (change, jobnum_list_timeout, reported) = check_result
    unreported = len(jobnum_list) - len(reported)
    if unreported > 0:
        self._log.log_time(logging.CRITICAL,
            '%d job(s) did not report their status!', unreported)
    if change is None:
        # None (neither True nor False) marks an aborted check
        return False

    # Jobs that took too long are cancelled
    if len(jobnum_list_timeout) > 0:
        change = True
        self._log.warning('Timeout for the following jobs:')
        self._cancel(task, wms, jobnum_list_timeout, interactive=False, show_jobs=True)

    # Apply interventions requested by the task
    self._process_intervention(task, wms)

    # Once every job is in an end state, the task may be finished
    ended_jobs = self.job_db.get_job_len(ClassSelector(JobClass.ENDSTATE))
    if ended_jobs == len(self.job_db):
        self._log_disabled_jobs()
        if task.can_finish():
            self._local_event_handler.on_task_finish(task, len(self.job_db))
            abort(True)

    return change
def iter(self):
    """Yield the lines read from self.proc.fromchild and record them in self.stdout.

    Iteration ends at the first empty read. If reading raises an exception,
    the abort flag is set and iteration ends as well.
    """
    while True:
        try:
            current_line = self.proc.fromchild.readline()
        except Exception:
            # A failed read requests an abort instead of propagating the error
            abort(True)
            return
        if not current_line:
            # An empty read ends the iteration
            return
        self.stdout.append(current_line)
        yield current_line
def execute(self, wms_id_list, wms_name):
    """Create a process for the given ids and yield the parsed results.

    Results are no longer yielded once the abort flag is set, but the parser
    is still consumed to the end. A non-zero process status is passed to
    self._handle_error. wms_name is not used here.
    """
    proc = self._proc_factory.create_proc(wms_id_list)
    for result in self._parse(wms_id_list, proc):
        if abort():
            # keep draining the parser without handing out further results
            continue
        yield result
    exit_status = proc.status(timeout=0, terminate=True)
    if exit_status != 0:
        self._handle_error(proc)
def retrieve(self, wms):
    """Retrieve the output of jobs in the DONE class and update their state.

    Returns True if at least one job changed its state, False if nothing
    changed or if the abort flag was set during retrieval.
    """
    done_jobs = self.jobDB.getJobs(ClassSelector(JobClass.DONE))
    sample_size = utils.QM(self._continuous, self._chunks_retrieve, -1)
    jobList = self._sample(done_jobs, sample_size)

    change = False
    for (jobNum, retCode, data, outputdir) in wms.retrieveJobs(self._wmsArgs(jobList)):
        jobObj = self.jobDB.get(jobNum)
        if jobObj is None:
            continue

        # Translate the return code into a job state
        if retCode == 0:
            if self._outputProcessor.process(outputdir):
                state = Job.SUCCESS
            else:
                # Output processing failed: report code 108 and mark the job as failed
                (retCode, state) = (108, Job.FAILED)
        elif retCode == 107:
            # errorcode 107 gives ABORTED instead of FAILED
            state = Job.ABORTED
        else:
            state = Job.FAILED

        if state != jobObj.state:
            change = True
            jobObj.set('retcode', retCode)
            jobObj.set('runtime', data.get('TIME', -1))
            self._update(jobObj, jobNum, state)
            self._eventhandler.onJobOutput(wms, jobObj, jobNum, retCode)

        if utils.abort():
            return False
    return change
def retrieve(self, wms, maxsample = 100):
    """Retrieve the output of jobs in the DONE class and update their state.

    wms       -- backend that delivers (jobNum, retCode, data) tuples
    maxsample -- number of jobs to sample per call when running in continuous mode
                 (-1 is handed to self.sample otherwise)

    Returns True if at least one job changed its state, False if nothing
    changed or if the abort flag was set during retrieval.
    """
    change = False
    done_jobs = self.jobDB.getJobs(ClassSelector(JobClass.DONE))
    jobList = self.sample(done_jobs, QM(self.continuous, maxsample, -1))

    for jobNum, retCode, data in wms.retrieveJobs(self.wmsArgs(jobList)):
        jobObj = self.jobDB.get(jobNum)
        # Identity test: does not depend on how the job object implements __eq__
        if jobObj is None:
            continue

        if retCode == 0:
            state = Job.SUCCESS
        elif retCode == 107:  # set ABORTED instead of FAILED for errorcode 107
            state = Job.ABORTED
        else:
            state = Job.FAILED

        if state != jobObj.state:
            change = True
            jobObj.set('retcode', retCode)
            jobObj.set('runtime', data.get('TIME', -1))
            self._update(jobObj, jobNum, state)
            self._eventhandler.onJobOutput(wms, jobObj, jobNum, retCode)

        if utils.abort():
            return False
    return change
def retrieve(self, task, wms):
    """Retrieve the output of jobs in the DONE class and update their state.

    Returns True if at least one job changed its state, False if nothing
    changed or if the abort flag was set during retrieval.
    """
    done_jobs = self.job_db.get_job_list(ClassSelector(JobClass.DONE))
    jobnum_list = self._sample(done_jobs, self._get_chunk_size(self._chunks_retrieve))

    change = False
    for (jobnum, exit_code, data, outputdir) in wms.retrieve_jobs(self._get_wms_args(jobnum_list)):
        job_obj = self.job_db.get_job(jobnum)
        if job_obj is None:
            continue

        # Translate the exit code into a job state
        if exit_code == 0:
            if self._output_processor.process(outputdir, task):
                state = Job.SUCCESS
            else:
                # Output processing failed: report code 108 and mark the job as failed
                (exit_code, state) = (108, Job.FAILED)
        elif exit_code == 107:
            # errorcode 107 gives ABORTED instead of FAILED
            state = Job.ABORTED
        else:
            state = Job.FAILED

        if state != job_obj.state:
            change = True
            job_obj.set('retcode', exit_code)
            job_obj.set('runtime', data.get('TIME', -1))
            self._update(task, job_obj, jobnum, state)
            self._local_event_handler.on_job_output(task, wms, job_obj, jobnum, exit_code)

        if abort():
            return False
    return change
def submit(self, task, wms):
    """Submit the jobs selected by self._submit_get_jobs and record the outcome.

    Returns True if the backend processed at least one job, False if there
    was nothing to submit or if the abort flag was set after a submission.
    """
    jobnum_list = self._submit_get_jobs(task)
    if len(jobnum_list) == 0:
        return False

    any_submitted = False
    for (jobnum, gc_id, data) in wms.submit_jobs(jobnum_list, task):
        any_submitted = True
        job_obj = self.job_db.get_job_persistent(jobnum)
        job_obj.clear_old_state()

        if gc_id is None:
            # Could not register at WMS
            self._update(task, job_obj, jobnum, Job.FAILED)
        else:
            job_obj.assign_id(gc_id)
            for (key, value) in data.items():
                job_obj.set(key, value)
            self._update(task, job_obj, jobnum, Job.SUBMITTED)
            self._local_event_handler.on_job_submit(task, wms, job_obj, jobnum)
            # The abort flag is only checked after a successful registration
            if abort():
                return False
    return any_submitted
def execute(self, wmsIDs, wmsName):
    """Create a process for the given ids and yield the parsed results.

    Results are no longer yielded once the abort flag is set, but the parser
    is still consumed to the end. A non-zero process status is passed to
    self._handleError. wmsName is not used here.
    """
    proc = self._proc_factory.create_proc(wmsIDs)
    for result in self._parse(wmsIDs, proc):
        if utils.abort():
            # keep draining the parser without handing out further results
            continue
        yield result
    exit_status = proc.status(timeout = 0, terminate = True)
    if exit_status != 0:
        self._handleError(proc)
def getAllBlocks():
    """Yield the blocks of every provider in self._providerList.

    Exceptions raised by a provider are collected in `ec`; the remaining
    providers are still queried. A DatasetError is raised when the abort
    flag is set. `self`, `silent` and `ec` come from the enclosing scope.
    """
    for current_provider in self._providerList:
        try:
            for dataset_block in current_provider.getBlocks(silent):
                yield dataset_block
        except Exception:
            # remember the failure and continue with the next provider
            ec.collect()
        if utils.abort():
            raise DatasetError('Could not retrieve all datasets!')
def __init__(self, config, name, task=None, backend=None, job_manager=None):
    """Read the workflow settings and create the task, backend and job manager plugins.

    config      -- configuration view the settings and plugins are read from
    name        -- plugin name, passed on to NamedPlugin.__init__
    task        -- passed as `override` when the task plugin is created
    backend     -- passed as `override` when the backend plugin is created
    job_manager -- passed as `override` when the job manager plugin is created

    Initialisation stops early (plain return) if the abort flag is set after
    the task or the backend has been created; attributes assigned further
    down do not exist on the instance in that case.
    """
    NamedPlugin.__init__(self, config, name)

    # Configure workflow settings
    jobs_config = config.change_view(view_class='TaggedConfigView', add_sections=['jobs'])
    self._action_list = jobs_config.get_list('action', ['check', 'retrieve', 'submit'], on_change=None)
    # Default duration is 0; the legacy 'continuous' flag changes the default to -1
    # and an explicit 'duration' setting overrides either default
    self._duration = 0
    if jobs_config.get_bool('continuous', False, on_change=None):  # legacy option
        self._duration = -1
    self._duration = jobs_config.get_time('duration', self._duration, on_change=None)
    self._submit_flag = jobs_config.get_bool('submission', True, on_change=None)

    # Work directory settings
    self._check_space_dn = config.get_work_path()
    self._check_space = config.get_int('workdir space', 10, on_change=None)
    self._check_space_timeout = config.get_time('workdir space timeout', 5, on_change=None)
    self._check_space_log = logging.getLogger('workflow.space')
    # NOTE(review): the filter interval is the fixed value 5 * 60 and is not derived
    # from 'workdir space timeout' read above - confirm that this is intended
    self._check_space_log.addFilter(LogEveryNsec(interval=5 * 60))

    # Configure local/job_manager and remote/backend monitoring module
    # (the returned plugin is not stored here)
    jobs_config.get_plugin(['event handler manager'], 'CompatEventHandlerManager',
        cls=EventHandlerManager, on_change=None)

    # Initialise task module
    self.task = config.get_plugin(['module', 'task'], cls=TaskModule,
        bind_kwargs={'tags': [self]}, override=task)
    if abort():
        return

    # Initialise workload management interface
    self.backend = config.get_composited_plugin('backend', default_compositor='MultiWMS',
        cls=WMS, bind_kwargs={'tags': [self, self.task]}, override=backend)
    if abort():
        return

    # Initialise job database
    self.job_manager = jobs_config.get_plugin('job manager', 'SimpleJobManager',
        cls=JobManager, bind_kwargs={'tags': [self, self.task, self.backend]},
        pargs=(self.task,), override=job_manager)

    # Store submission settings / states
    self._transfer_se = config.get_state('init', detail='storage')
    self._transfer_sb = config.get_state('init', detail='sandbox')
    # The wall time of the task is the default for the submission time requirement
    self._submit_time = jobs_config.get_time('submission time requirement', self.task.wall_time,
        on_change=None)
def _check_jobs_raw(self, wms, jobnum_list):
    """Ask the wms for the job status and yield (jobnum, job_obj, job_status, job_info).

    Jobs for which the wms gave no answer (or whose answer arrived after the
    abort flag was set) are yielded at the end with the state Job.UNKNOWN
    and an empty job_info dictionary.
    """
    map_gc_id2jobnum = self._get_map_gc_id_jobnum(jobnum_list)
    # Hand over a snapshot of the ids: entries are popped from the mapping while
    # the results are consumed, which must not disturb a lazily iterating wms
    gc_id_list = list(map_gc_id2jobnum.keys())
    for (gc_id, job_state, job_info) in wms.check_jobs(gc_id_list):
        if not abort():
            jobnum = map_gc_id2jobnum.pop(gc_id, None)
            if jobnum is not None:
                yield (jobnum, self.job_db.get_job(jobnum), job_state, job_info)
    # missing jobs are returned with Job.UNKNOWN state
    for jobnum in map_gc_id2jobnum.values():
        yield (jobnum, self.job_db.get_job(jobnum), Job.UNKNOWN, {})
def execute(self, wmsIDs):
    """Create a status process for the given ids and yield the parsed job information.

    Yields the result of self._parse_job_info for every non-empty entry as
    long as the abort flag is not set. A non-zero process status is passed to
    self._handleError; the process is logged if self._log_everything is set.
    """
    self._status = CheckStatus.OK
    proc = self._proc_factory.create_proc(wmsIDs)
    for job_info in self._parse(proc):
        # skip empty entries; after an abort the parser is drained without output
        if (not job_info) or utils.abort():
            continue
        yield self._parse_job_info(job_info)

    exit_status = proc.status(timeout = 0, terminate = True)
    if exit_status != 0:
        self._handleError(proc)
    if self._log_everything:
        self._log.log_process(proc, level = logging.DEBUG, msg = 'Finished checking jobs')
def gc_run(args=None, intro=True):
    """Entry point: show the intro, run _gc_run(args) and turn failures into an exit code.

    args  -- argument list handed to _gc_run
    intro -- show logo and revision unless GC_DISABLE_INTRO is set in the environment

    Returns the result of _gc_run. SystemExit is re-raised after setting the
    abort flag; any other Exception is reported through gc_excepthook and
    ends the program with os.EX_SOFTWARE.
    """
    # display the 'grid-control' logo and version
    show_intro = intro and not os.environ.get('GC_DISABLE_INTRO')
    if show_intro:
        logo = SafeFile(get_path_share('logo.txt'), 'r').read_close()
        sys.stdout.write(logo)
        sys.stdout.write('Revision: %s\n' % get_version())

    pyver = tuple(sys.version_info[:2])
    if pyver < (2, 3):
        deprecated('This python version (%d.%d) is not supported anymore!' % pyver)

    # write a final newline when the interpreter exits
    atexit.register(lambda: sys.stdout.write('\n'))

    # main try... except block to catch exceptions and show error message
    try:
        return _gc_run(args)
    except SystemExit:
        # handled separately to avoid getting caught below for Python < 2.5
        abort(True)
        raise
    except Exception:
        # called explicitly, since coverage overrides sys.excepthook
        abort(True)
        gc_excepthook(*sys.exc_info())
        sys.exit(os.EX_SOFTWARE)
def getBlocksInternal(self):
    """Yield the blocks of every provider in self.subprovider.

    Exceptions raised by a provider are collected and handed to
    ec.raise_any after all providers were queried. A DatasetError is raised
    immediately when the abort flag is set.
    """
    error_message = 'Could not retrieve all datasets!'
    ec = ExceptionCollector()
    for current_provider in self.subprovider:
        try:
            for dataset_block in current_provider.getBlocks():
                yield dataset_block
        except Exception:
            # remember the failure and continue with the next provider
            ec.collect()
        if utils.abort():
            raise DatasetError(error_message)
    ec.raise_any(DatasetError(error_message))
def getBlocksInternal(self):
    """Yield the blocks of every provider in self.subprovider.

    Errors raised by a provider are logged via logException and reported
    together in a DatasetError after all providers were queried. A
    DatasetError is raised immediately when the abort flag is set.
    """
    exceptions = ''
    for provider in self.subprovider:
        try:
            for block in provider.getBlocks():
                yield block
        except Exception:
            # Only regular errors are collected. A bare except would also swallow
            # GeneratorExit (raised at the yield when the generator is closed)
            # as well as KeyboardInterrupt / SystemExit.
            exceptions += logException() + '\n'
        if utils.abort():
            raise DatasetError('Could not retrieve all datasets!')
    if exceptions:
        raise DatasetError('Could not retrieve all datasets!\n' + exceptions)
def getDatasets(self):
    """Return the combined dataset list of all providers in self._providerList.

    The result is cached in self._cache_dataset. Exceptions raised by a
    provider are collected and handed to ec.raise_any after all providers
    were queried; a DatasetError is raised immediately when the abort flag
    is set. The cache is only filled after the query went through, so a
    failed call is not answered from an incomplete cache later on.
    """
    if self._cache_dataset is None:
        # Collect into a local list first - assigning the cache up front would
        # leave a partial list behind when an error is raised below
        dataset_list = []
        ec = ExceptionCollector()
        for provider in self._providerList:
            try:
                dataset_list.extend(provider.getDatasets())
            except Exception:
                ec.collect()
            if utils.abort():
                raise DatasetError('Could not retrieve all datasets!')
        ec.raise_any(DatasetError('Could not retrieve all datasets!'))
        self._cache_dataset = dataset_list
    return self._cache_dataset
def _run_actions(self, backend_timing_info):
    """Run the configured actions (check / retrieve / submit) in order.

    An action is selected by the first letter of its lower-cased name. After
    an action that reported a change, wait(wait_between_steps) is called.
    Returns the result of the last such wait call, or False if none happened.
    """
    did_wait = False
    for action in imap(str.lower, self._action_list):
        if abort():
            # no further actions after an abort request
            continue

        if action.startswith('c'):
            # check for jobs
            run_step = self.job_manager.check
        elif action.startswith('r'):
            # retrieve finished jobs
            run_step = self.job_manager.retrieve
        elif action.startswith('s') and self._submit_flag:
            # submit new jobs (only while submission is enabled)
            run_step = self.job_manager.submit
        else:
            continue

        if run_step(self.task, self.backend):
            did_wait = wait(backend_timing_info.wait_between_steps)
    return did_wait
def _gc_run(args):
    """Create config, workflow and gui and run the workflow inside the gui.

    args -- argument list; sys.argv[1:] is used if it is empty or None

    Nothing is run if the abort flag is already set after the workflow was
    created. The gui is ended after the workflow run, and the gui callbacks
    are removed from DebugInterface.callback_list in any case.
    """
    config = gc_create_config(args or sys.argv[1:], use_default_files=True)
    (workflow, gui) = _gc_create_workflow(config)
    if abort():
        return

    DebugInterface.callback_list.append((gui.end_interface, gui.start_interface))
    try:
        try:
            gui.start_interface()
        except Exception:
            # Create the replacement exception first, then try to shut the gui
            # down (errors during shutdown are ignored) and raise it
            ex_value = GUIException('GUI init exception')
            ignore_exception(Exception, None, gui.end_interface)
            raise ex_value
        try:
            workflow.run()
        finally:
            gui.end_interface()
    finally:
        DebugInterface.callback_list.remove((gui.end_interface, gui.start_interface))
def checkJobsDirect(self, ids):
    """Query the status of each job directly and yield (jobNum, id, state, data).

    ids -- sequence of (wmsId, jobNum) pairs; nothing is yielded if it is empty

    Errors for single jobs are collected and printed together at the end;
    the loop is left early if the abort flag is set after an error.
    """
    if len(ids) == 0:
        # A plain return ends the generator. Raising StopIteration here would
        # be turned into a RuntimeError by Python 3.7+ (PEP 479).
        return

    activity = utils.ActivityLog('checking job status')
    errors = []
    for (wmsId, jobNum) in ids:
        try:
            data = utils.filterDict(dict(getStatusDirect(self._splitId(wmsId)[0])),
                vF = lambda v: (v != '') and (v != '0'))
            data['id'] = self._createId(data.get('jobid', wmsId))
            data['dest'] = data.get('destination', 'N/A')
            yield (jobNum, data['id'], self._statusMap[data['status'].lower()], data)
        except Exception:
            # Only regular errors are collected. A bare except would also swallow
            # GeneratorExit (raised at the yield when the generator is closed)
            # as well as KeyboardInterrupt / SystemExit.
            errors.append(repr(sys.exc_info()[1]))
            if utils.abort():
                break
    del activity

    if errors:
        utils.eprint('The following glite errors have occured:\n%s' % str.join('\n', errors))
def _checkJobList(self, wms, jobList):
    """Ask the wms for the status of the given jobs and apply state changes.

    Returns (change, timeoutList, reported):
    change      -- True if a job changed its state, None if the abort flag was set
    timeoutList -- jobs that stayed longer than self._job_timeout in an initial state
    reported    -- job numbers the wms returned a status for
    """
    initial_states = (Job.SUBMITTED, Job.WAITING, Job.READY, Job.QUEUED)
    change = False
    timeoutList = []
    reported = []
    for (jobNum, _, state, info) in wms.checkJobs(self._wmsArgs(jobList)):
        reported.append(jobNum)
        jobObj = self.jobDB.get(jobNum)
        if state == jobObj.state:
            # Unchanged job: cancel it later if it stays too long in an initial state
            if jobObj.state in initial_states:
                if (self._job_timeout > 0) and (time.time() - jobObj.submitted > self._job_timeout):
                    timeoutList.append(jobNum)
        else:
            change = True
            for (key, value) in info.items():
                jobObj.set(key, value)
            self._update(jobObj, jobNum, state)
            self._eventhandler.onJobUpdate(wms, jobObj, jobNum, info)
        if utils.abort():
            # None as change flag tells the caller about the abort
            return (None, timeoutList, reported)
    return (change, timeoutList, reported)
def execute(self, wmsIDs):
    """Query the status of each id and yield (wmsID, job_status, job_info).

    Errors for single ids are collected and handed to ec.raise_any at the
    end; the loop is left early if the abort flag is set after an error.
    """
    def _has_content(value):
        return value not in ('', '0')

    ec = ExceptionCollector()
    for wmsID in wmsIDs:
        try:
            raw_info = dict(self._status_fun(wmsID))
            job_info = utils.filterDict(raw_info, vF = _has_content)
            job_info[CheckInfo.RAW_STATUS] = job_info.pop('status', '').lower()
            if 'destination' in job_info:
                # destination is split at the first '/' into site and queue
                try:
                    dest_info = job_info['destination'].split('/', 1)
                    job_info[CheckInfo.SITE] = dest_info[0].strip()
                    job_info[CheckInfo.QUEUE] = dest_info[1].strip()
                except Exception:
                    # a destination without both parts is not an error
                    clear_current_exception()
            job_status = self._status_map.get(job_info[CheckInfo.RAW_STATUS], Job.UNKNOWN)
            yield (wmsID, job_status, job_info)
        except Exception:
            ec.collect()
            if utils.abort():
                break
    ec.raise_any(BackendError('Encountered errors while checking job status'))
def submit(self, wms):
    """Submit the jobs selected by self._getSubmissionJobs and record the outcome.

    Returns True if the wms processed at least one job, False if there was
    nothing to submit or if the abort flag was set after a submission.
    """
    jobList = self._getSubmissionJobs(self._chunks_submit)
    if len(jobList) == 0:
        return False

    any_submitted = False
    for (jobNum, wmsId, data) in wms.submitJobs(jobList, self._task):
        any_submitted = True
        jobObj = self.jobDB.get(jobNum, create = True)

        if wmsId is None:
            # Could not register at WMS
            self._update(jobObj, jobNum, Job.FAILED)
        else:
            jobObj.assignId(wmsId)
            for (key, value) in data.items():
                jobObj.set(key, value)
            self._update(jobObj, jobNum, Job.SUBMITTED)
            self._eventhandler.onJobSubmit(wms, jobObj, jobNum)
            # The abort flag is only checked after a successful registration
            if utils.abort():
                return False
    return any_submitted
def _check_get_jobnum_list(self, task, wms, jobnum_list):
    """Check the status of the given jobs and apply state changes.

    Returns (change, jobnum_list_timeout, reported):
    change              -- True if a job changed its state, None if the abort flag was set
    jobnum_list_timeout -- jobs that exceeded the queue / unknown state timeout
    reported            -- jobs that were returned with a state other than Job.UNKNOWN
    """
    change = False
    jobnum_list_timeout = []
    reported = []
    if not jobnum_list:
        return (change, jobnum_list_timeout, reported)

    initial_states = (Job.SUBMITTED, Job.WAITING, Job.READY, Job.QUEUED)
    for (jobnum, job_obj, state, info) in self._check_jobs_raw(wms, jobnum_list):
        if state != Job.UNKNOWN:
            reported.append(jobnum)

        if state == job_obj.state:
            # Unchanged job: cancel it later if it stays too long in an initial state ...
            if job_obj.state in initial_states:
                if (self._timeout_queue > 0) and (time.time() - job_obj.submitted > self._timeout_queue):
                    jobnum_list_timeout.append(jobnum)
            # ... or too long in the unknown state
            if job_obj.state == Job.UNKNOWN:
                if (self._timeout_unknown > 0) and (time.time() - job_obj.submitted > self._timeout_unknown):
                    jobnum_list_timeout.append(jobnum)
        else:
            change = True
            for (key, value) in info.items():
                job_obj.set(key, value)
            self._update(task, job_obj, jobnum, state)
            self._local_event_handler.on_job_update(task, wms, job_obj, jobnum, info)

        if abort():
            # None as change flag tells the caller about the abort
            return (None, jobnum_list_timeout, reported)
    return (change, jobnum_list_timeout, reported)
def submit_jobs(self, jobnum_list, task):
    """Submit the given jobs one by one and yield each result of self._submit_job.

    The abort flag is checked before every submission; no further jobs are
    submitted once it is set.
    """
    for current_jobnum in jobnum_list:
        if abort():
            return
        submit_result = self._submit_job(current_jobnum, task)
        yield submit_result
def interrupt(sig, frame):
    """SIGINT handler: request an abort, announce the shutdown and install `handler`.

    Both arguments (signal number and interrupted frame) are unused.
    """
    # `log` is rebound at module level; `handler` is only read from there
    global log
    utils.abort(True)
    shutdown_message = 'Quitting grid-control! (This can take a few seconds...)'
    log = utils.ActivityLog(shutdown_message)
    # The module level `handler` takes over for further SIGINT signals
    signal.signal(signal.SIGINT, handler)
def handle_abort_interrupt(sig, frame):
    """SIGINT handler: request an abort, announce the shutdown, restore the default handler.

    Both arguments (signal number and interrupted frame) are unused.
    """
    utils.abort(True)
    shutdown_message = 'Quitting grid-control! (This can take a few seconds...)'
    # The activity log is stored on the function object so that it stays referenced
    handle_abort_interrupt.log = utils.ActivityLog(shutdown_message)
    # A further SIGINT triggers the default action instead of this handler
    signal.signal(signal.SIGINT, signal.SIG_DFL)
def handle_abort_interrupt(signum, frame):
    """Signal handler: request an abort, announce the shutdown, restore the default handler.

    signum -- number of the received signal; its handler is reset to SIG_DFL
    frame  -- interrupted stack frame (unused)
    """
    utils.abort(True)
    shutdown_message = 'Quitting grid-control! (This can take a few seconds...)'
    # The activity is stored on the function object so that it stays referenced
    handle_abort_interrupt.log = Activity(shutdown_message, parent='root')
    # From now on the same signal triggers the default action instead of this handler
    signal.signal(signum, signal.SIG_DFL)