def master_auto_resubmit(self, rjobs):
    '''Duplicate of the IBackend.master_resubmit but hooked into auto resubmission
    such that the monitoring server is used rather than the user server.

    Resubmits each subjob in *rjobs* via the backend's internal ``_resubmit()``
    (note the underscore: the monitoring-side entry point, unlike the public
    ``resubmit()`` used by master_resubmit).

    Returns 1 when every subjob was resubmitted; returns 0 if the very first
    failure happens before anything was resubmitted; raises
    IncompleteJobSubmissionError if a failure occurs after at least one
    subjob has already been resubmitted (partial bulk state must not be
    silently reported as a clean failure).
    '''
    from Ganga.Core import IncompleteJobSubmissionError, GangaException
    from Ganga.Utility.logging import log_user_exception
    # incomplete == 1 once at least one subjob has been resubmitted; from
    # then on any failure must raise rather than return.
    incomplete = 0

    def handleError(x):
        # Escalate to an exception when we are in a partially-resubmitted
        # state, otherwise report a clean failure (0) to the caller.
        if incomplete:
            raise x
        else:
            return 0
    try:
        for sj in rjobs:
            fqid = sj.getFQID('.')
            logger.info("resubmitting job %s to %s backend", fqid, getName(sj.backend))
            try:
                b = sj.backend
                sj.updateStatus('submitting')
                # monitoring-side resubmission hook (not the public resubmit())
                result = b._resubmit()
                if result:
                    sj.updateStatus('submitted')
                    # sj._commit() # PENDING: TEMPORARY DISABLED
                    incomplete = 1
                else:
                    return handleError(IncompleteJobSubmissionError(fqid, 'resubmission failed'))
            except Exception as x:
                # Full traceback only for non-Ganga (unexpected) exceptions.
                log_user_exception(logger, debug=isType(x, GangaException))
                return handleError(IncompleteJobSubmissionError(fqid, str(x)))
    finally:
        # Always refresh the master job status — even on error paths — so the
        # monitoring loop picks up the new subjob states.
        master = self.getJobObject().master
        if master:
            master.updateMasterJobStatus()
    return 1
def runAutoMerge(job, new_status):
    """Run the automatic merge for a master job entering a final state.

    Only master jobs (jobs without a parent) are merged, and only when they
    enter one of the final states 'completed', 'failed' or 'killed'.  A merge
    on 'failed'/'killed' runs only if the merger's ``ignorefailed`` flag is
    set.

    Args:
        job: the job whose subjobs may be merged; must expose ``_getParent``,
            ``merger``, ``subjobs`` and ``outputdir``.
        new_status: the status the job is transitioning into.

    Returns:
        The merger's result when a merge ran, otherwise False.

    Raises:
        Re-raises any exception from the merger after logging it.
    """
    result = False
    # we only run on master jobs (which have no parent)
    if job._getParent() is not None:
        return result
    allowed_states = ['completed', 'failed', 'killed']
    if new_status not in allowed_states:
        return result
    try:
        if job.merger:
            # run on success, or on failure/kill when the ignorefailed flag is set
            if new_status == 'completed' or job.merger.ignorefailed:
                # leave the output directory to the implementation
                # (fix for http://savannah.cern.ch/bugs/?76445)
                sum_outputdir = None
                if job.merger.set_outputdir_for_automerge:
                    sum_outputdir = job.outputdir
                result = job.merger.merge(job.subjobs, sum_outputdir)
    except Exception:
        log_user_exception()
        raise
    return result
def log_error():
    """Log the exception currently bound in the enclosing scope.

    Reads the closure variables ``x`` (the caught exception) and
    ``show_traceback`` from the surrounding monitoring-loop code.
    """
    # Always report the error message itself via the monitoring logger.
    log.error('Problem in the monitoring loop: %s', str(x))
    # Optionally also dump the full user-level traceback.
    if show_traceback:
        log_user_exception(log)
def submit(self, **opts):
    """Submission of a job. Called by: ganga client.

    Forwards the *submit* event to every attached monitoring service.

    Args:
        **opts: passed through to each monitoring service's ``submit``.

    Returns:
        dict mapping the string form of each service's class to that
        service's return value; services that raise are logged and skipped.
    """
    ret = {}
    for monService in self.monMonServices:
        try:
            monClass = str(monService.__class__)
            ret[monClass] = monService.submit(**opts)
        except Exception:
            # discard errors in initialization of monitoring services
            self._log(level="warning", msg="%s monitoring service failed in job *submit*" % monClass)
            from Ganga.Utility.logging import log_user_exception
            log_user_exception(self.logger)
    # NOTE(review): the sibling implementation of this method returns the
    # collected results; the original here fell off the end returning None.
    return ret
def submit(self, **opts):
    """Submission of a job. Called by: ganga client.

    Broadcasts the *submit* event to every attached monitoring service and
    collects the per-service results keyed by the string form of the
    service's class.  A service that raises is logged with a warning and
    skipped; its result is simply absent from the returned dict.
    """
    ret = {}
    for service in self.monMonServices:
        try:
            service_name = str(service.__class__)
            ret[service_name] = service.submit(**opts)
        except Exception as err:
            # discard errors in initialization of monitoring services
            self.logger.warning("%s monitoring service failed in job *submit*" % service_name)
            from Ganga.Utility.logging import log_user_exception
            log_user_exception(self.logger)
    return ret
def __init__(self, monClasses, jobInfos, configInfos):
    """Create a new composite monitoring service based on the lists of
    monitoring classes, jobs and configs (all the same length).

    If this is called in the Ganga client, i.e. from
    Ganga/GPIDev/MonitoringServices, then jobInfos is a list of Job (all the
    same), configInfos is a list of Config (specific to each monitoring
    class).

    If this is called on the worker node, i.e. from the text generated by
    getWrapperScriptConstructorText(), the jobInfos are dictionaries
    (specific to each monitoring class) and configInfos are dictionaries of
    effective config options (specific to each monitoring class).

    Raises:
        Exception: if the three input lists differ in length.
    """
    if not (len(monClasses) == len(jobInfos) == len(configInfos)):
        raise Exception(
            "cannot create monitoring object, list of monitoring classes, jobs and configs are not the same length.")
    IMonitoringService.__init__(self, jobInfos, configInfos)
    # init the logger
    try:
        import Ganga.Utility.logging
        self.logger = Ganga.Utility.logging.getLogger()
    except ImportError:
        # on the worker node we don't have access to Ganga logging facilities
        # so we simply print out the log message  @see self._log()
        self.logger = None
    # init the monitoring services; iterate the three parallel lists together
    self.monMonServices = []
    for monClass, jobInfo, configInfo in zip(monClasses, jobInfos, configInfos):
        try:
            # allow for existing monitoring classes which do not take
            # config_info in constructor
            if configInfo is None:
                monService = monClass(jobInfo)
            else:
                monService = monClass(jobInfo, configInfo)
            self.monMonServices.append(monService)
        except Exception:
            # discard errors in initialization of monitoring services
            self._log(
                level="warning",
                msg="Failed to init %s monitoring service...discarding it" % str(monClass))
            from Ganga.Utility.logging import log_user_exception
            log_user_exception(self.logger)
def process(self, sj_info):
    """Prepare a single subjob for submission.

    Args:
        sj_info: two-element sequence of (subjob config, subjob).

    Returns:
        True when the backend produced a JDL file on disk, False on any
        failure (the exception is logged, not propagated).
    """
    my_sc = sj_info[0]
    my_sj = sj_info[1]
    try:
        logger.debug("preparing job %s" % my_sj.getFQID('.'))
        jdlpath = my_sj.backend.preparejob(my_sc, master_input_sandbox)
        # the backend must return an existing path for the prepared JDL
        if (not jdlpath) or (not os.path.exists(jdlpath)):
            raise GangaException('job %s not properly prepared' % my_sj.getFQID('.'))
        self.__appendResult__(my_sj.id, jdlpath)
        return True
    except Exception as x:
        log_user_exception()
        return False
def __init__(self, monClasses, jobInfos, configInfos):
    """Create a new composite monitoring service based on the lists of
    monitoring classes, jobs and configs (all the same length).

    If this is called in the Ganga client, i.e. from
    Ganga/GPIDev/MonitoringServices, then jobInfos is a list of Job (all the
    same), configInfos is a list of Config (specific to each monitoring
    class).

    If this is called on the worker node, i.e. from the text generated by
    getWrapperScriptConstructorText(), the jobInfos are dictionaries
    (specific to each monitoring class) and configInfos are dictionaries of
    effective config options (specific to each monitoring class).

    Raises:
        Exception: if the three input lists differ in length.
    """
    if not (len(monClasses) == len(jobInfos) == len(configInfos)):
        raise Exception(
            "cannot create monitoring object, list of monitoring classes, jobs and configs are not the same length.")
    IMonitoringService.__init__(self, jobInfos, configInfos)
    # init the logger
    try:
        import Ganga.Utility.logging
        self.logger = Ganga.Utility.logging.getLogger()
    except ImportError:
        # on the worker node we don't have access to Ganga logging facilities
        # so we simply print out the log message
        self.logger = None
    # init the monitoring services; iterate the three parallel lists together
    self.monMonServices = []
    for monClass, jobInfo, configInfo in zip(monClasses, jobInfos, configInfos):
        try:
            # allow for existing monitoring classes which do not take
            # config_info in constructor
            if configInfo is None:
                monService = monClass(jobInfo)
            else:
                monService = monClass(jobInfo, configInfo)
            self.monMonServices.append(monService)
        except Exception:
            # discard errors in initialization of monitoring services.
            # self.logger is None on the worker node (ImportError branch
            # above) — guard so error handling cannot itself raise.
            if self.logger is not None:
                self.logger.warning("Failed to init %s monitoring service...discarding it" % str(monClass))
            from Ganga.Utility.logging import log_user_exception
            log_user_exception(self.logger)
def master_resubmit(self, rjobs, backend=None):
    """Resubmit (previously submitted) jobs. The configuration phase is skipped.

    The default implementation is an emulated-bulk operation: each subjob in
    *rjobs* is resubmitted individually via its backend's ``resubmit()``.
    If you override this method for bulk optimization then make sure that
    you call updateMasterJobStatus() on the master job, so the master job
    will be monitored by the monitoring loop.

    Args:
        rjobs: subjobs (or the job itself) to resubmit.
        backend: optional backend object forwarded as ``resubmit(backend=...)``
            — presumably a replacement backend configuration; TODO confirm.

    Returns:
        1 on full success, 0 if the first failure happens before anything was
        resubmitted; raises IncompleteJobSubmissionError once at least one
        subjob has already been resubmitted.
    """
    from Ganga.Core import IncompleteJobSubmissionError, GangaException
    from Ganga.Utility.logging import log_user_exception
    # incomplete == 1 once at least one subjob has been resubmitted; from
    # then on any failure must raise rather than return.
    incomplete = 0

    def handleError(x):
        # Escalate to an exception in a partially-resubmitted state,
        # otherwise report a clean failure (0) to the caller.
        if incomplete:
            raise x
        else:
            return 0
    try:
        for sj in rjobs:
            fqid = sj.getFQID('.')
            logger.info("resubmitting job %s to %s backend", fqid, getName(sj.backend))
            try:
                b = sj.backend
                sj.updateStatus('submitting')
                if backend is None:
                    result = b.resubmit()
                else:
                    # caller supplied an explicit backend to resubmit with
                    result = b.resubmit(backend=backend)
                if result:
                    sj.updateStatus('submitted')
                    # sj._commit() # PENDING: TEMPORARY DISABLED
                    incomplete = 1
                else:
                    return handleError(
                        IncompleteJobSubmissionError(
                            fqid, 'resubmission failed'))
            except Exception as x:
                # full traceback only for non-Ganga (unexpected) exceptions
                log_user_exception(logger, debug=isType(x, GangaException))
                return handleError(
                    IncompleteJobSubmissionError(fqid, str(x)))
    finally:
        # Always refresh the master job status — even on error paths — so
        # the monitoring loop picks up the new subjob states.
        master = self.getJobObject().master
        if master:
            master.updateMasterJobStatus()
    return 1
def process(self, sj_info):
    """Prepare one subjob for submission.

    *sj_info* carries the subjob configuration at index 0 and the subjob at
    index 1.  Returns True when a JDL file was produced on disk, False on
    any failure (the exception is logged rather than propagated).
    """
    subjob_config = sj_info[0]
    subjob = sj_info[1]
    try:
        logger.debug("preparing job %s" % subjob.getFQID('.'))
        jdl_path = subjob.backend.preparejob(subjob_config, master_input_sandbox)
        # the backend must hand back an existing file path
        if not (jdl_path and os.path.exists(jdl_path)):
            raise GangaException('job %s not properly prepared' % subjob.getFQID('.'))
        self.__appendResult__(subjob.id, jdl_path)
        return True
    except Exception as err:
        log_user_exception()
        return False
def master_resubmit(self, rjobs, backend=None):
    """Resubmit (previously submitted) jobs. The configuration phase is skipped.

    The default implementation is an emulated-bulk operation.  If you
    override this method for bulk optimization then make sure that you call
    updateMasterJobStatus() on the master job, so the master job will be
    monitored by the monitoring loop.
    """
    from Ganga.Core import IncompleteJobSubmissionError, GangaException
    from Ganga.Utility.logging import log_user_exception
    # flips to 1 once any subjob has been resubmitted; a later failure
    # must then raise instead of returning
    incomplete = 0

    def handleError(err):
        if incomplete:
            raise err
        return 0
    try:
        for subjob in rjobs:
            fqid = subjob.getFQID('.')
            logger.info(
                "resubmitting job %s to %s backend", fqid, subjob.backend._name)
            try:
                backend_obj = subjob.backend
                subjob.updateStatus('submitting')
                if backend is None:
                    resubmitted = backend_obj.resubmit()
                else:
                    resubmitted = backend_obj.resubmit(backend=backend)
                if not resubmitted:
                    return handleError(IncompleteJobSubmissionError(fqid, 'resubmission failed'))
                subjob.updateStatus('submitted')
                # subjob._commit() # PENDING: TEMPORARY DISABLED
                incomplete = 1
            except Exception as exc:
                log_user_exception(
                    logger, debug=isinstance(exc, GangaException))
                return handleError(IncompleteJobSubmissionError(fqid, str(exc)))
    finally:
        # keep the master job visible to the monitoring loop regardless
        # of how we leave this method
        master = self.getJobObject().master
        if master:
            master.updateMasterJobStatus()
    return 1
def _parallel_submit(self, b, sj, sc, master_input_sandbox, fqid, logger): try: sj.updateStatus('submitting') if b.submit(sc, master_input_sandbox): sj.updateStatus('submitted') sj.info.increment() else: raise IncompleteJobSubmissionError(fqid, 'submission failed') except Exception as err: #from Ganga.Utility.logging import log_user_exception sj.updateStatus('failed') from Ganga.Core.exceptions import GangaException if isinstance(err, GangaException): logger.error(str(err)) log_user_exception(logger, debug=True) else: log_user_exception(logger, debug=False) finally: pass
def master_submit(self, rjobs, subjobconfigs, masterjobconfig, keep_going=False, parallel_submit=False):
    """Submit the master job and all its subjobs. The masterjobconfig is
    shared, individual subjob configs are defined in subjobconfigs.

    Submission of individual jobs (not-split) also always goes via this
    method.  In that case the subjobconfigs contains just one element - the
    job itself.

    The default implementation of this method emulates the bulk submission
    calling a submit() method on individual subjob objects.  If submission
    of any of the subjobs fails then the whole process is aborted with
    IncompleteSubmissionError exception. The subjobs which have already been
    submitted stay submitted.

    The default implementation does not process the masterjobconfig.
    Therefore this method may be overriden in the derived class in the
    following way:

    def master_submit(self,masterjobconfig,subjobconfigs,keep_going):
       ...
       do_some_processsing_of(masterjobconfig)
       ...
       return IBackend.master_submit(self,subjobconfigs,masterjobconfig,keep_joing)

    Implementation note: we set keep_going to be optional in the signature
    of IBackend.master_submit() to allow the existing backend
    implementations which do not support keep_going=True and which at some
    point may call IBackend.master_submit() to work without change. It may
    sometimes be non-trivial to enable support for keep_going=True in some
    backends, even if they finally call IBackend.master_submit(). Therefore
    it is left to the decision of backend developer to explicitly enable
    the support for keep_going flag.

    With parallel_submit=True each subjob submission is pushed onto the GPI
    monitoring thread pool and this method polls until every subjob reaches
    a submitted/terminal state.
    """
    from Ganga.Core import IncompleteJobSubmissionError, GangaException
    from Ganga.Utility.logging import log_user_exception
    # NOTE(review): job is assigned but never used below.
    job = self.getJobObject()
    logger.debug("SubJobConfigs: %s" % len(subjobconfigs))
    logger.debug("rjobs: %s" % len(rjobs))
    assert(implies(rjobs, len(subjobconfigs) == len(rjobs)))
    # incomplete == 1 once any subjob has been submitted; a later failure
    # then raises instead of returning cleanly.
    incomplete = 0
    incomplete_subjobs = []

    def handleError(x):
        # keep_going: record the failed fqid and continue the loop.
        # Otherwise: raise if partially submitted, else signal abort (True).
        if keep_going:
            incomplete_subjobs.append(fqid)
            return False
        else:
            if incomplete:
                raise x
            else:
                return True
    master_input_sandbox = self.master_prepare(masterjobconfig)
    if parallel_submit:
        from Ganga.GPI import queues
        # NOTE(review): threads_before is recorded but never used.
        threads_before = queues.totalNumIntThreads()
        for sc, sj in zip(subjobconfigs, rjobs):
            fqid = sj.getFQID('.')
            b = sj.backend
            # FIXME would be nice to move this to the internal threads not user ones
            #from Ganga.GPIDev.Base.Proxy import stripProxy
            #all_queues = stripProxy(queues)
            #all_queues._addSystem( self._parallel_submit, ( b, sj, sc, master_input_sandbox, fqid, logger ) )
            queues._monitoring_threadpool.add_function(self._parallel_submit, (b, sj, sc, master_input_sandbox, fqid, logger))

        def subjob_status_check(rjobs):
            # True once every subjob has left the pre-submission states.
            has_submitted = True
            for sj in rjobs:
                if sj.status not in ["submitted", "failed", "completed", "running", "completing"]:
                    has_submitted = False
                    break
            return has_submitted
        # poll until all thread-pool submissions have settled
        while not subjob_status_check(rjobs):
            import time
            time.sleep(1.)
        # any subjob left 'new' or 'failed' means the bulk submit failed
        for i in rjobs:
            if i.status in ["new", "failed"]:
                return 0
        return 1
    # sequential (emulated-bulk) submission path
    for sc, sj in zip(subjobconfigs, rjobs):
        fqid = sj.getFQID('.')
        logger.info("submitting job %s to %s backend", fqid, sj.backend._name)
        try:
            b = sj.backend
            sj.updateStatus('submitting')
            if b.submit(sc, master_input_sandbox):
                sj.updateStatus('submitted')
                # sj._commit() # PENDING: TEMPORARY DISABLED
                incomplete = 1
                sj.info.increment()
            else:
                if handleError(IncompleteJobSubmissionError(fqid, 'submission failed')):
                    return 0
        except Exception as x:
            #sj.updateStatus('new')
            # full traceback only for unexpected (non-Ganga) exceptions
            if isinstance(x, GangaException):
                logger.error(str(x))
                log_user_exception(logger, debug=True)
            else:
                log_user_exception(logger, debug=False)
            if handleError(IncompleteJobSubmissionError(fqid, str(x))):
                return 0
    # with keep_going=True, report all collected failures at the end
    if incomplete_subjobs:
        raise IncompleteJobSubmissionError(
            incomplete_subjobs, 'submission failed')
    return 1
def master_submit(self, rjobs, subjobconfigs, masterjobconfig, keep_going=False, parallel_submit=False):
    """Submit the master job and all its subjobs. The masterjobconfig is
    shared, individual subjob configs are defined in subjobconfigs.

    Submission of individual jobs (not-split) also always goes via this
    method.  In that case the subjobconfigs contains just one element - the
    job itself.

    The default implementation of this method emulates the bulk submission
    calling a submit() method on individual subjob objects.  If submission
    of any of the subjobs fails then the whole process is aborted with
    IncompleteSubmissionError exception. The subjobs which have already been
    submitted stay submitted.

    The default implementation does not process the masterjobconfig.
    Therefore this method may be overriden in the derived class in the
    following way:

    def master_submit(self,masterjobconfig,subjobconfigs,keep_going):
       ...
       do_some_processsing_of(masterjobconfig)
       ...
       return IBackend.master_submit(self,subjobconfigs,masterjobconfig,keep_joing)

    Implementation note: we set keep_going to be optional in the signature
    of IBackend.master_submit() to allow the existing backend
    implementations which do not support keep_going=True and which at some
    point may call IBackend.master_submit() to work without change. It may
    sometimes be non-trivial to enable support for keep_going=True in some
    backends, even if they finally call IBackend.master_submit(). Therefore
    it is left to the decision of backend developer to explicitly enable
    the support for keep_going flag.
    """
    from Ganga.Utility.logging import log_user_exception
    logger.debug("SubJobConfigs: %s" % len(subjobconfigs))
    logger.debug("rjobs: %s" % len(rjobs))
    assert(implies(rjobs, len(subjobconfigs) == len(rjobs)))
    # incomplete == 1 once any subjob has been submitted; a later failure
    # then raises instead of aborting cleanly.
    incomplete = 0
    incomplete_subjobs = []

    def handleError(x):
        # keep_going: record the failed fqid and continue.
        # Otherwise: raise if partially submitted, else signal abort (True).
        if keep_going:
            incomplete_subjobs.append(fqid)
            return False
        else:
            if incomplete:
                raise x
            else:
                return True
    master_input_sandbox = self.master_prepare(masterjobconfig)
    # Shall we submit in parallel
    if parallel_submit:
        from Ganga.Core.GangaThread.WorkerThreads import getQueues
        # NOTE(review): threads_before is recorded but never used.
        threads_before = getQueues().totalNumIntThreads()
        for sc, sj in zip(subjobconfigs, rjobs):
            b = sj.backend
            # Must check for credentials here as we cannot handle missing credentials on Queues by design!
            if hasattr(b, 'credential_requirements') and b.credential_requirements is not None:
                from Ganga.GPIDev.Credentials.CredentialStore import credential_store
                try:
                    # lookup only: triggers creation below if missing
                    cred = credential_store[b.credential_requirements]
                except GangaKeyError:
                    credential_store.create(b.credential_requirements)
            fqid = sj.getFQID('.')
            # FIXME would be nice to move this to the internal threads not user ones
            getQueues()._monitoring_threadpool.add_function(self._parallel_submit, (b, sj, sc, master_input_sandbox, fqid, logger), callback_func = self._successfulSubmit, callback_args = (sj, incomplete_subjobs))

        def subjob_status_check(rjobs):
            # True once every subjob has either left the pre-submission
            # states or been recorded as incomplete by the callback.
            has_submitted = True
            for sj in rjobs:
                if sj.status not in ["submitted", "failed", "completed", "running", "completing"] and sj.getFQID('.') not in incomplete_subjobs:
                    has_submitted = False
                    break
            return has_submitted
        # poll until all thread-pool submissions have settled
        while not subjob_status_check(rjobs):
            import time
            time.sleep(1.)
        if incomplete_subjobs:
            raise IncompleteJobSubmissionError(
                incomplete_subjobs, 'submission failed for subjobs %s' % incomplete_subjobs)
        return 1
    # Alternatively submit sequentially
    for sc, sj in zip(subjobconfigs, rjobs):
        fqid = sj.getFQID('.')
        logger.info("submitting job %s to %s backend", fqid, getName(sj.backend))
        try:
            b = stripProxy(sj.backend)
            sj.updateStatus('submitting')
            if b.submit(sc, master_input_sandbox):
                sj.updateStatus('submitted')
                # sj._commit() # PENDING: TEMPORARY DISABLED
                incomplete = 1
                stripProxy(sj.info).increment()
            else:
                if handleError(IncompleteJobSubmissionError(fqid, 'submission failed')):
                    raise IncompleteJobSubmissionError(fqid, 'submission failed')
        except Exception as x:
            # roll the subjob back to 'new' before reporting
            sj.updateStatus('new')
            if isType(x, GangaException):
                logger.error("%s" % x)
                log_user_exception(logger, debug=True)
            else:
                log_user_exception(logger, debug=False)
            # NOTE(review): this raises unconditionally, so keep_going is
            # effectively ignored on this exception path — confirm intended.
            raise IncompleteJobSubmissionError(fqid, 'submission failed')
    return 1
def master_submit(self, rjobs, subjobconfigs, masterjobconfig, keep_going=False, parallel_submit=False):
    """Submit the master job and all its subjobs. The masterjobconfig is
    shared, individual subjob configs are defined in subjobconfigs.

    Submission of individual jobs (not-split) also always goes via this
    method.  In that case the subjobconfigs contains just one element - the
    job itself.

    The default implementation of this method emulates the bulk submission
    calling a submit() method on individual subjob objects.  If submission
    of any of the subjobs fails then the whole process is aborted with
    IncompleteSubmissionError exception. The subjobs which have already been
    submitted stay submitted.

    The default implementation does not process the masterjobconfig.
    Therefore this method may be overriden in the derived class in the
    following way:

    def master_submit(self,masterjobconfig,subjobconfigs,keep_going):
       ...
       do_some_processsing_of(masterjobconfig)
       ...
       return IBackend.master_submit(self,subjobconfigs,masterjobconfig,keep_joing)

    Implementation note: we set keep_going to be optional in the signature
    of IBackend.master_submit() to allow the existing backend
    implementations which do not support keep_going=True and which at some
    point may call IBackend.master_submit() to work without change. It may
    sometimes be non-trivial to enable support for keep_going=True in some
    backends, even if they finally call IBackend.master_submit(). Therefore
    it is left to the decision of backend developer to explicitly enable
    the support for keep_going flag.
    """
    from Ganga.Core import IncompleteJobSubmissionError, GangaException
    from Ganga.Utility.logging import log_user_exception
    logger.debug("SubJobConfigs: %s" % len(subjobconfigs))
    logger.debug("rjobs: %s" % len(rjobs))
    assert (implies(rjobs, len(subjobconfigs) == len(rjobs)))
    # incomplete == 1 once any subjob has been submitted; a later failure
    # then raises instead of aborting cleanly.
    incomplete = 0
    incomplete_subjobs = []

    def handleError(x):
        # keep_going: record the failed fqid and continue the loop.
        # Otherwise: raise if partially submitted, else signal abort (True).
        if keep_going:
            incomplete_subjobs.append(fqid)
            return False
        else:
            if incomplete:
                raise x
            else:
                return True
    master_input_sandbox = self.master_prepare(masterjobconfig)
    if parallel_submit:
        from Ganga.GPI import queues
        # NOTE(review): threads_before is recorded but never used.
        threads_before = queues.totalNumIntThreads()
        for sc, sj in zip(subjobconfigs, rjobs):
            fqid = sj.getFQID('.')
            b = sj.backend
            # FIXME would be nice to move this to the internal threads not user ones
            #from Ganga.GPIDev.Base.Proxy import stripProxy
            #all_queues = stripProxy(queues)
            #all_queues._addSystem( self._parallel_submit, ( b, sj, sc, master_input_sandbox, fqid, logger ) )
            queues._monitoring_threadpool.add_function(
                self._parallel_submit, (b, sj, sc, master_input_sandbox, fqid, logger))

        def subjob_status_check(rjobs):
            # True once every subjob has left the pre-submission states.
            has_submitted = True
            for sj in rjobs:
                if sj.status not in [
                        "submitted", "failed", "completed", "running", "completing"
                ]:
                    has_submitted = False
                    break
            return has_submitted
        # poll until all thread-pool submissions have settled
        while not subjob_status_check(rjobs):
            import time
            time.sleep(1.)
        # any subjob left 'new' or 'failed' means the bulk submit failed
        for i in rjobs:
            if i.status in ["new", "failed"]:
                return 0
        return 1
    # sequential (emulated-bulk) submission path
    for sc, sj in zip(subjobconfigs, rjobs):
        fqid = sj.getFQID('.')
        logger.info("submitting job %s to %s backend", fqid, getName(sj.backend))
        try:
            b = stripProxy(sj.backend)
            sj.updateStatus('submitting')
            if b.submit(sc, master_input_sandbox):
                sj.updateStatus('submitted')
                # sj._commit() # PENDING: TEMPORARY DISABLED
                incomplete = 1
                stripProxy(sj.info).increment()
            else:
                if handleError(
                        IncompleteJobSubmissionError(
                            fqid, 'submission failed')):
                    return 0
        except Exception as x:
            #sj.updateStatus('new')
            # full traceback only for unexpected (non-Ganga) exceptions
            if isType(x, GangaException):
                logger.error(str(x))
                log_user_exception(logger, debug=True)
            else:
                log_user_exception(logger, debug=False)
            if handleError(IncompleteJobSubmissionError(fqid, str(x))):
                return 0
    # with keep_going=True, report all collected failures at the end
    if incomplete_subjobs:
        raise IncompleteJobSubmissionError(incomplete_subjobs, 'submission failed')
    return 1
def master_submit(self, rjobs, subjobconfigs, masterjobconfig, keep_going=False):
    """Submit the master job and all its subjobs. The masterjobconfig is
    shared, individual subjob configs are defined in subjobconfigs.

    Submission of individual jobs (not-split) also always goes via this
    method.  In that case the subjobconfigs contains just one element - the
    job itself.

    The default implementation of this method emulates the bulk submission
    calling a submit() method on individual subjob objects.  If submission
    of any of the subjobs fails then the whole process is aborted with
    IncompleteSubmissionError exception. The subjobs which have already been
    submitted stay submitted.

    The default implementation does not process the masterjobconfig.
    Therefore this method may be overriden in the derived class in the
    following way:

    def master_submit(self,masterjobconfig,subjobconfigs,keep_going):
       ...
       do_some_processsing_of(masterjobconfig)
       ...
       return IBackend.master_submit(self,subjobconfigs,masterjobconfig,keep_joing)

    Implementation note: we set keep_going to be optional in the signature
    of IBackend.master_submit() to allow the existing backend
    implementations which do not support keep_going=True and which at some
    point may call IBackend.master_submit() to work without change. It may
    sometimes be non-trivial to enable support for keep_going=True in some
    backends, even if they finally call IBackend.master_submit(). Therefore
    it is left to the decision of backend developer to explicitly enable
    the support for keep_going flag.
    """
    from Ganga.Core import IncompleteJobSubmissionError, GangaException
    from Ganga.Utility.logging import log_user_exception
    job = self.getJobObject()
    assert(implies(rjobs, len(subjobconfigs) == len(rjobs)))
    # incomplete == 1 once any subjob has been submitted; a later failure
    # then raises instead of aborting cleanly.
    incomplete = 0
    incomplete_subjobs = []

    def handleError(x):
        # keep_going: record the failed fqid and continue the loop.
        # Otherwise: raise if partially submitted, else signal abort (True).
        if keep_going:
            incomplete_subjobs.append(fqid)
            return False
        else:
            if incomplete:
                raise x
            else:
                return True
    master_input_sandbox = self.master_prepare(masterjobconfig)
    for sc, sj in zip(subjobconfigs, rjobs):
        fqid = sj.getFQID('.')
        logger.info("submitting job %s to %s backend", fqid, sj.backend._name)
        try:
            b = sj.backend
            sj.updateStatus('submitting')
            if b.submit(sc, master_input_sandbox):
                sj.updateStatus('submitted')
                #sj._commit() # PENDING: TEMPORARY DISABLED
                incomplete = 1
            else:
                if handleError(IncompleteJobSubmissionError(fqid, 'submission failed')):
                    return 0
        except Exception as x:
            # roll the subjob back to 'new' before reporting
            sj.updateStatus('new')
            # full traceback only for unexpected (non-Ganga) exceptions
            if isinstance(x, GangaException):
                logger.error(str(x))
                log_user_exception(logger, debug=True)
            else:
                log_user_exception(logger, debug=False)
            if handleError(IncompleteJobSubmissionError(fqid, str(x))):
                return 0
    # With keep_going=True, report all collected failures at the end;
    # sibling implementations do this and return 1 on success — the
    # original fell off the end returning None.
    if incomplete_subjobs:
        raise IncompleteJobSubmissionError(incomplete_subjobs, 'submission failed')
    return 1
for c in config: logger.error('%s = %s',c,config[c]) s = 'Cannot connect to the repository: '+str(x) logger.error(s) return s reps = [] try: for n in names: reps.append(factory(dir = os.path.join(getLocalRoot(),version,n))) except RepositoryError,x: s = print_error(x) raise except Exception,x: s = print_error(x) log_user_exception(logger) raise from Ganga.GPIDev.Lib.JobRegistry import JobRegistryInstance, JobRegistryInterface, allJobRegistries regs = map(lambda x: JobRegistryInstance(*x), zip(names,reps)) for n,r in zip(names,regs): allJobRegistries['native_'+n] = r if n == 'jobs' and config['DEBUG_startup_profile']: PROFN = 'xml.startup.profile.txt' print 'profiling ON, saving status to',PROFN import profile profile.runctx('r._scan_repository()',globals(),{'r':r},PROFN) else: try: