def testAlertsMessagingBasic(self):
    """
    Send one alert through the alerts API and check that a temporary
    receiver gets it with the expected Component, Level, Source and
    Details fields.

    The sender and the receiver are always torn down, even when an
    assertion fails, and the wait for the alert is bounded so a lost
    message fails the test instead of hanging it.
    """
    config = getConfig("/tmp")
    self.assertTrue(hasattr(config, "Alert"))

    # initialization
    # sender: instance of Alert messages Sender
    # preAlert: pre-defined values for Alert instances generated from this class
    self.config = config  # needed in setUpAlertsMessaging
    preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName="testBasic")
    receiver = None
    try:
        sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)

        # set up a temporary alert message receiver
        handler, receiver = utils.setUpReceiver(config.Alert.address,
                                                config.Alert.controlAddr)

        # test sending alert
        msg = "this is my message Basic"
        sendAlert(100, msg=msg)

        # wait for the alert to arrive, but give up after a while so that
        # the assertion below reports the failure instead of blocking forever
        deadline = time.time() + 30
        while len(handler.queue) == 0 and time.time() < deadline:
            time.sleep(0.3)
            print("%s waiting for alert to arrive ..." % inspect.stack()[0][3])

        self.assertEqual(len(handler.queue), 1)
        alert = handler.queue[0]
        self.assertEqual(alert["Component"], "testBasic")
        self.assertEqual(alert["Level"], 100)
        self.assertEqual(alert["Source"], self.__class__.__name__)
        self.assertEqual(alert["Details"]["msg"], msg)
    finally:
        # same tear-down order as before: sender first, then the receiver
        try:
            sender.unregister()
        finally:
            if receiver is not None:
                receiver.shutdown()
def testAlertsMessagingBasic(self):
    """
    Send one alert through the alerts API and check that a temporary
    receiver gets it with the expected Component, Level, Source and
    Details fields.

    The sender and the receiver are always torn down, even when an
    assertion fails, and the wait for the alert is bounded so a lost
    message fails the test instead of hanging it.
    """
    config = getConfig("/tmp")
    self.assertTrue(hasattr(config, "Alert"))

    # initialization
    # sender: instance of Alert messages Sender
    # preAlert: pre-defined values for Alert instances generated from this class
    self.config = config  # needed in setUpAlertsMessaging
    preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName="testBasic")
    receiver = None
    try:
        sendAlert = alertAPI.getSendAlert(sender=sender, preAlert=preAlert)

        # set up a temporary alert message receiver
        handler, receiver = utils.setUpReceiver(config.Alert.address,
                                                config.Alert.controlAddr)

        # test sending alert
        msg = "this is my message Basic"
        sendAlert(100, msg=msg)

        # wait for the alert to arrive, but give up after a while so that
        # the assertion below reports the failure instead of blocking forever
        deadline = time.time() + 30
        while len(handler.queue) == 0 and time.time() < deadline:
            time.sleep(0.3)
            print("%s waiting for alert to arrive ..." % inspect.stack()[0][3])

        self.assertEqual(len(handler.queue), 1)
        alert = handler.queue[0]
        self.assertEqual(alert["Component"], "testBasic")
        self.assertEqual(alert["Level"], 100)
        self.assertEqual(alert["Source"], self.__class__.__name__)
        self.assertEqual(alert["Details"]["msg"], msg)
    finally:
        # same tear-down order as before: sender first, then the receiver
        try:
            sender.unregister()
        finally:
            if receiver is not None:
                receiver.shutdown()
def initAlerts(self, compName=None):
    """
    _initAlerts_

    Set up alert sending for this object.

    After the call two attributes are available:
      self.sender    - instance of the Alert messages Sender
      self.sendAlert - the callable that sends the actual Alerts
                       (documented in WMCore/Alerts/APIgetSendAlert)

    compName defaults to the name of this object's class.

    note:
        Tests are done in the API_t belonging to Alerts fw.
        This particular method is called from a number of components
        and some have particular tests on alerts sending.
    """
    componentName = compName if compName else self.__class__.__name__
    alertDefaults, alertSender = alertAPI.setUpAlertsMessaging(
        self, compName=componentName)
    alertFunction = alertAPI.getSendAlert(sender=alertSender,
                                          preAlert=alertDefaults)
    # publish both only once the whole set-up has succeeded
    self.sender = alertSender
    self.sendAlert = alertFunction
def initAlerts(self, compName = None):
    """
    _initAlerts_

    Prepare the alerts machinery for the rest of the system.

    Stores on the instance:
      sender    - instance of the Alert messages Sender
      sendAlert - the code that sends the actual Alerts
                  (documented in WMCore/Alerts/APIgetSendAlert)

    When no compName is given, the class name of this object is used.

    note:
        Tests are done in the API_t belonging to Alerts fw.
        This particular method is called from a number of components
        and some have particular tests on alerts sending.
    """
    name = compName or self.__class__.__name__
    messaging = alertAPI.setUpAlertsMessaging(self, compName = name)
    preDefined, messageSender = messaging
    sendFunction = alertAPI.getSendAlert(sender = messageSender,
                                         preAlert = preDefined)
    self.sender = messageSender
    self.sendAlert = sendFunction
def submit(self, jobs, info):
    """
    _submit_

    Submit jobs for one subscription.

    The jobs are grouped by sandbox, cut into chunks of
    config.JobSubmitter.jobsPerWorker, a JDL file is written for every
    chunk and a condor_submit command for it is handed to the worker
    pool. The results are then collected from the result queue and the
    jobs are sorted into successful and failed ones.

    jobs: list of job dictionaries; 'sandbox' and 'id' are read here
    info: not used by this method

    Returns the pair (successfulJobs, failedJobs). When makeSubmit()
    produces no JDL content, {'NoResult': [0]} is returned instead.
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if not jobs:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if not self.pool:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up CondorPlugin worker pool")
        self.input = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for _ in range(self.nProcess):
            worker = multiprocessing.Process(target = submitWorker,
                                             args = (self.input, self.result, timeout))
            worker.start()
            self.pool.append(worker)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    nSubmits = 0
    for job in jobs:
        submitDict.setdefault(job['sandbox'], []).append(job)

    jobsPerWorker = self.config.JobSubmitter.jobsPerWorker

    # Now submit the jobs, one chunk of each sandbox at a time
    queueError = False
    for jobList in submitDict.values():
        if queueError:
            # If the queue has failed, then we must not process
            # any more jobs this cycle.
            break
        while len(jobList) > 0:
            jobsReady = jobList[:jobsPerWorker]
            jobList = jobList[jobsPerWorker:]
            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList = jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                # NOTE(review): this early return is not the usual
                # (successfulJobs, failedJobs) pair and skips the JDL and
                # worker pool clean-up below; kept unchanged because
                # callers may rely on it - confirm and fix together
                # with the callers.
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
            # 'with' closes the file even if writing fails
            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs" %(len(jobsReady)))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.input.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

    # Index the jobs by id once instead of scanning the whole list for
    # every returned id; setdefault keeps the first job seen for an id,
    # which is the one the former linear search would have found.
    jobsById = {}
    for job in jobs:
        jobsById.setdefault(job.get('id', None), job)

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for _ in range(nSubmits):
        try:
            res = self.result.get(block = True, timeout = timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds." % timeout)
            queueError = True
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            queueError = True
            continue

        try:
            # stdout is not used further, but a result without it is
            # treated as broken, exactly like the other keys
            output = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone crticially wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                # the result cannot even be printed; report without it
                pass
            msg += str(ex)
            logging.error(msg)
            queueError = True
            continue

        if not exitCode == 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error = error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                if jobID in jobsById:
                    job = jobsById[jobID]
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                if jobID in jobsById:
                    successfulJobs.append(jobsById[jobID])

        # If we get a lot of errors in a row it's probably time to
        # report this to the operators.
        if self.errorCount > self.errorThreshold:
            try:
                msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                logging.error(msg)
                logging.error("Reporting to Alert system and continuing to process jobs")
                from WMCore.Alerts import API as alertAPI
                preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirCondorPlugin")
                sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert)
                sendAlert(6, msg = msg)
                sender.unregister()
                self.errorCount = 0
            except Exception as ex:
                # Alerting is best effort: there's nothing we can really do
                # here, but leave a trace and keep processing the jobs.
                logging.error("Could not send the alert: %s" % str(ex))

    # Remove JDL files unless commanded otherwise
    if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in CondorPlugin")
    return successfulJobs, failedJobs
self.errorCount -= 1 for jobID in idList: for job in jobs: if job.get('id', None) == jobID: successfulJobs.append(job) break # If we get a lot of errors in a row it's probably time to # report this to the operators. if self.errorCount > self.errorThreshold: try: msg = "Exceeded errorThreshold while submitting to condor. Check condor status." logging.error(msg) logging.error("Reporting to Alert system and continuing to process jobs") from WMCore.Alerts import API as alertAPI preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirCondorPlugin") sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert) sendAlert(6, msg = msg) sender.unregister() self.errorCount = 0 except: # There's nothing we can really do here pass # Remove JDL files unless commanded otherwise if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True): for f in jdlFiles: os.remove(f) # When we're finished, clean up the queue workers in order
def __init__(self, logger = None, dbi = None, **params):
    """
    Create the work queue.

    Fills self.params with defaults for every option that was not
    passed in, creates the couch backend (and the parent queue backend
    when ParentQueueCouchUrl is set), the PhEDEx and SiteDB services,
    the data location mapper and the alert sending client.

    logger, dbi: handed to WorkQueueBase.__init__
    params: queue options; 'CouchUrl' is mandatory (the COUCHURL
            environment variable is the fallback) and 'CacheDir' is
            mandatory when 'PopulateFilesets' is true

    Raises RuntimeError for a missing CouchUrl or CacheDir and when
    something other than blocks would be released on location.
    """
    WorkQueueBase.__init__(self, logger, dbi)
    self.parent_queue = None
    self.params = params

    # config argument (within params) shall be reference to
    # Configuration instance (will later be checked for presence of "Alert")
    self.config = params.get("Config", None)
    self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
    if not self.params.get('CouchUrl'):
        raise RuntimeError('CouchUrl config value mandatory')
    self.params.setdefault('DbName', 'workqueue')
    self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
    self.params.setdefault('ParentQueueCouchUrl', None) # We get work from here

    self.backend = WorkQueueBackend(self.params['CouchUrl'], self.params['DbName'],
                                    self.params['InboxDbName'],
                                    self.params['ParentQueueCouchUrl'],
                                    self.params.get('QueueURL'),
                                    logger = self.logger)
    parentUrl = self.params.get('ParentQueueCouchUrl')
    if parentUrl:
        # everything before the last '/' is the couch url, the rest the db name
        parentParts = parentUrl.rsplit('/', 1)
        self.parent_queue = WorkQueueBackend(parentParts[0], parentParts[1])

    self.params.setdefault("GlobalDBS",
                           "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
    self.params.setdefault('QueueDepth', 2) # when less than this locally
    self.params.setdefault('LocationRefreshInterval', 600)
    self.params.setdefault('FullLocationRefreshInterval', 7200)
    self.params.setdefault('TrackLocationOrSubscription', 'subscription')
    self.params.setdefault('ReleaseIncompleteBlocks', False)
    self.params.setdefault('ReleaseRequireSubscribed', True)
    self.params.setdefault('PhEDExEndpoint', None)
    self.params.setdefault('PopulateFilesets', True)
    self.params.setdefault('LocalQueueFlag', True)

    self.params.setdefault('JobDumpConfig', None)
    self.params.setdefault('BossAirConfig', None)

    self.params['QueueURL'] = self.backend.queueUrl # url this queue is visible on
                                # backend took previous QueueURL and sanitized it
    self.params.setdefault('WMBSUrl', None) # this will only be set on local Queue
    self.params.setdefault('Teams', [''])
    self.params.setdefault('DrainMode', False)
    if self.params.get('CacheDir'):
        try:
            os.makedirs(self.params['CacheDir'])
        except OSError:
            # best effort: any OSError from makedirs is ignored here
            pass
    elif self.params.get('PopulateFilesets'):
        raise RuntimeError('CacheDir mandatory for local queue')

    self.params.setdefault('SplittingMapping', {})
    # start policy name -> name of the splitter used for it by default
    defaultSplitters = (('DatasetBlock', 'Block'),
                        ('MonteCarlo', 'MonteCarlo'),
                        ('Dataset', 'Dataset'),
                        ('Block', 'Block'),
                        ('ResubmitBlock', 'ResubmitBlock'))
    for policyName, splitterName in defaultSplitters:
        self.params['SplittingMapping'].setdefault(policyName,
                                                   {'name': splitterName,
                                                    'args': {}})

    self.params.setdefault('EndPolicySettings', {})

    assert(self.params['TrackLocationOrSubscription'] in ('subscription',
                                                          'location'))
    # Can only release blocks on location
    if self.params['TrackLocationOrSubscription'] == 'location':
        if self.params['SplittingMapping']['DatasetBlock']['name'] != 'Block':
            raise RuntimeError('Only blocks can be released on location')

    if self.params.get('PhEDEx'):
        self.phedexService = self.params['PhEDEx']
    else:
        phedexArgs = {}
        if self.params.get('PhEDExEndpoint'):
            phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
        self.phedexService = PhEDEx(phedexArgs)

    if self.params.get('SiteDB'):
        self.SiteDB = self.params['SiteDB']
    else:
        self.SiteDB = SiteDB()

    # a comma separated string of teams becomes a list of names;
    # isinstance also accepts subclasses of the string types
    if isinstance(self.params['Teams'], types.StringTypes):
        self.params['Teams'] = [x.strip() for x in
                                self.params['Teams'].split(',')]

    # NOTE(review): requireBlocksSubscribed is derived from
    # 'ReleaseIncompleteBlocks' while 'ReleaseRequireSubscribed' is never
    # read in this method - confirm that this is intended.
    self.dataLocationMapper = WorkQueueDataLocationMapper(
        self.logger, self.backend,
        phedex = self.phedexService,
        sitedb = self.SiteDB,
        locationFrom = self.params['TrackLocationOrSubscription'],
        incompleteBlocks = self.params['ReleaseIncompleteBlocks'],
        requireBlocksSubscribed = not self.params['ReleaseIncompleteBlocks'],
        fullRefreshInterval = self.params['FullLocationRefreshInterval'],
        updateIntervalCoarseness = self.params['LocationRefreshInterval'])

    # initialize alerts sending client (self.sendAlert() method)
    # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
    #    1 - 4 - lower levels ; 5 - 10 higher levels
    preAlert, self.alertSender = \
        alertAPI.setUpAlertsMessaging(self, compName = "WorkQueueManager")
    self.sendAlert = alertAPI.getSendAlert(sender = self.alertSender,
                                           preAlert = preAlert)

    self.logger.debug("WorkQueue created successfully")
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription.

    The jobs are grouped by sandbox, cut into chunks of
    config.JobSubmitter.jobsPerWorker, a JDL file is written for every
    chunk and a condor_submit command for it is handed to the worker
    pool. The results are then collected from the result queue and the
    jobs are sorted into successful and failed ones.

    jobs: list of job dictionaries; 'sandbox' and 'id' are read here
    info: not used by this method

    Returns the pair (successfulJobs, failedJobs). When makeSubmit()
    produces no JDL content, {'NoResult': [0]} is returned instead.
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if not jobs:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    if not self.pool:
        # Starting things up
        # This is obviously a submit API
        logging.info("Starting up CondorPlugin worker pool")
        self.input = multiprocessing.Queue()
        self.result = multiprocessing.Queue()
        for _ in range(self.nProcess):
            worker = multiprocessing.Process(target = submitWorker,
                                             args = (self.input, self.result, timeout))
            worker.start()
            self.pool.append(worker)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    nSubmits = 0
    for job in jobs:
        submitDict.setdefault(job['sandbox'], []).append(job)

    jobsPerWorker = self.config.JobSubmitter.jobsPerWorker

    # Now submit the jobs, one chunk of each sandbox at a time
    queueError = False
    for jobList in submitDict.values():
        if queueError:
            # If the queue has failed, then we must not process
            # any more jobs this cycle.
            break
        while len(jobList) > 0:
            jobsReady = jobList[:jobsPerWorker]
            jobList = jobList[jobsPerWorker:]
            idList = [x['id'] for x in jobsReady]
            jdlList = self.makeSubmit(jobList = jobsReady)
            if not jdlList:
                # Then we got nothing
                logging.error("No JDL file made!")
                # NOTE(review): this early return is not the usual
                # (successfulJobs, failedJobs) pair and skips the JDL and
                # worker pool clean-up below; kept unchanged because
                # callers may rely on it - confirm and fix together
                # with the callers.
                return {'NoResult': [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
            # 'with' closes the file even if writing fails
            with open(jdlFile, 'w') as handle:
                handle.writelines(jdlList)
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs" %(len(jobsReady)))
            if self.glexecPath:
                command = 'CS=`which condor_submit`; '
                if self.glexecWrapScript:
                    command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                if self.glexecUnwrapScript:
                    command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                else:
                    command += '%s $CS %s' % (self.glexecPath, jdlFile)
            else:
                command = "condor_submit %s" % jdlFile

            try:
                self.input.put({'command': command, 'idList': idList})
            except AssertionError as ex:
                msg = "Critical error: input pipeline probably closed.\n"
                msg += str(ex)
                msg += "Error Procedure: Something critical has happened in the worker process\n"
                msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                msg += "Then refresh the worker pool\n"
                logging.error(msg)
                queueError = True
                break
            nSubmits += 1

    # Index the jobs by id once instead of scanning the whole list for
    # every returned id; setdefault keeps the first job seen for an id,
    # which is the one the former linear search would have found.
    jobsById = {}
    for job in jobs:
        jobsById.setdefault(job.get('id', None), job)

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for _ in range(nSubmits):
        try:
            res = self.result.get(block = True, timeout = timeout)
        except Queue.Empty:
            # If the queue was empty go to the next submit
            # Those jobs have vanished
            logging.error("Queue.Empty error received!")
            logging.error("This could indicate a critical condor error!")
            logging.error("However, no information of any use was obtained due to process failure.")
            logging.error("Either process failed, or process timed out after %s seconds." % timeout)
            queueError = True
            continue
        except AssertionError as ex:
            msg = "Found Assertion error while retrieving output from worker process.\n"
            msg += str(ex)
            msg += "This indicates something critical happened to a worker process"
            msg += "We will recover what jobs we know were submitted, and resubmit the rest"
            msg += "Refreshing worker pool at end of loop"
            logging.error(msg)
            queueError = True
            continue

        try:
            # stdout is not used further, but a result without it is
            # treated as broken, exactly like the other keys
            output = res['stdout']
            error = res['stderr']
            idList = res['idList']
            exitCode = res['exitCode']
        except KeyError as ex:
            msg = "Error in finding key from result pipe\n"
            msg += "Something has gone critically wrong in the worker\n"
            try:
                msg += "Result: %s\n" % str(res)
            except Exception:
                # the result cannot even be printed; report without it
                pass
            msg += str(ex)
            logging.error(msg)
            queueError = True
            continue

        if not exitCode == 0:
            logging.error("Condor returned non-zero. Printing out command stderr")
            logging.error(error)
            errorCheck, errorMsg = parseError(error = error)
            logging.error("Processing failed jobs and proceeding to the next jobs.")
            logging.error("Do not restart component.")
        else:
            errorCheck = None

        if errorCheck:
            self.errorCount += 1
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                if jobID in jobsById:
                    job = jobsById[jobID]
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
        else:
            if self.errorCount > 0:
                self.errorCount -= 1
            for jobID in idList:
                if jobID in jobsById:
                    successfulJobs.append(jobsById[jobID])

        # If we get a lot of errors in a row it's probably time to
        # report this to the operators.
        if self.errorCount > self.errorThreshold:
            try:
                msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                logging.error(msg)
                logging.error("Reporting to Alert system and continuing to process jobs")
                from WMCore.Alerts import API as alertAPI
                preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirCondorPlugin")
                sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert)
                sendAlert(6, msg = msg)
                sender.unregister()
                self.errorCount = 0
            except Exception as ex:
                # Alerting is best effort: there's nothing we can really do
                # here, but leave a trace and keep processing the jobs.
                logging.error("Could not send the alert: %s" % str(ex))

    # Remove JDL files unless commanded otherwise
    if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
        for f in jdlFiles:
            os.remove(f)

    # When we're finished, clean up the queue workers in order
    # to free up memory (in the midst of the process, the forked
    # memory space shouldn't be touched, so it should still be
    # shared, but after this point any action by the Submitter will
    # result in memory duplication).
    logging.info("Purging worker pool to clean up memory")
    self.close()

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in CondorPlugin")
    return successfulJobs, failedJobs
self.errorCount -= 1 for jobID in idList: for job in jobs: if job.get('id', None) == jobID: successfulJobs.append(job) break # If we get a lot of errors in a row it's probably time to # report this to the operators. if self.errorCount > self.errorThreshold: try: msg = "Exceeded errorThreshold while submitting to condor. Check condor status." logging.error(msg) logging.error("Reporting to Alert system and continuing to process jobs") from WMCore.Alerts import API as alertAPI preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirPyCondorPlugin") sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert) sendAlert(6, msg = msg) sender.unregister() self.errorCount = 0 except: # There's nothing we can really do here pass # Remove JDL files unless commanded otherwise if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True): for f in jdlFiles: os.remove(f) # When we're finished, clean up the queue workers in order
class WorkQueue(WorkQueueBase):
    """
    _WorkQueue_

    WorkQueue object - interface to WorkQueue functionality.
    """

    def __init__(self, logger=None, dbi=None, **params):
        """
        Create the work queue.

        Fills self.params with defaults for every option that was not
        passed in, creates the couch backend (and the parent queue
        backend when ParentQueueCouchUrl is set), the PhEDEx and SiteDB
        services, the data location mapper and the alert sending client.

        logger, dbi: handed to WorkQueueBase.__init__
        params: queue options; 'CouchUrl' is mandatory (the COUCHURL
                environment variable is the fallback) and 'CacheDir' is
                mandatory when 'PopulateFilesets' is true

        Raises RuntimeError for a missing CouchUrl or CacheDir and when
        something other than blocks would be released on location, and
        WorkQueueError when setting up the parent queue backend raises
        an IndexError (probably a ParentQueueCouchUrl without a
        database name).
        """
        WorkQueueBase.__init__(self, logger, dbi)
        self.parent_queue = None
        self.params = params

        # config argument (within params) shall be reference to
        # Configuration instance (will later be checked for presence of "Alert")
        self.config = params.get("Config", None)
        self.params.setdefault('CouchUrl', os.environ.get('COUCHURL'))
        if not self.params.get('CouchUrl'):
            raise RuntimeError('CouchUrl config value mandatory')
        self.params.setdefault('DbName', 'workqueue')
        self.params.setdefault('InboxDbName', self.params['DbName'] + '_inbox')
        self.params.setdefault('ParentQueueCouchUrl', None)  # We get work from here

        self.backend = WorkQueueBackend(self.params['CouchUrl'],
                                        self.params['DbName'],
                                        self.params['InboxDbName'],
                                        self.params['ParentQueueCouchUrl'],
                                        self.params.get('QueueURL'),
                                        logger=self.logger)
        if self.params.get('ParentQueueCouchUrl'):
            try:
                # everything before the last '/' is the couch url,
                # the rest is the database name
                parentParts = self.params['ParentQueueCouchUrl'].rsplit('/', 1)
                self.parent_queue = WorkQueueBackend(parentParts[0],
                                                     parentParts[1])
            except IndexError as ex:
                # Probable cause: Someone didn't put the global WorkQueue name in
                # the ParentCouchUrl
                msg = "Parsing failure for ParentQueueCouchUrl - probably missing dbname in input\n"
                msg += "Exception: %s\n" % str(ex)
                msg += str("ParentQueueCouchUrl: %s\n" % self.params['ParentQueueCouchUrl'])
                self.logger.error(msg)
                raise WorkQueueError(msg)
            self.params['ParentQueueCouchUrl'] = self.parent_queue.queueUrl

        self.params.setdefault(
            "GlobalDBS",
            "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        self.params.setdefault('QueueDepth', 0.5)  # when less than this locally
        self.params.setdefault('LocationRefreshInterval', 600)
        self.params.setdefault('FullLocationRefreshInterval', 7200)
        self.params.setdefault('TrackLocationOrSubscription', 'subscription')
        self.params.setdefault('ReleaseIncompleteBlocks', False)
        self.params.setdefault('ReleaseRequireSubscribed', True)
        self.params.setdefault('PhEDExEndpoint', None)
        self.params.setdefault('PopulateFilesets', True)
        self.params.setdefault('LocalQueueFlag', True)
        self.params.setdefault('QueueRetryTime', 86400)
        self.params.setdefault('stuckElementAlertTime', 86400)
        self.params.setdefault('reqmgrCompleteGraceTime', 604800)
        self.params.setdefault('cancelGraceTime', 604800)

        self.params.setdefault('JobDumpConfig', None)
        self.params.setdefault('BossAirConfig', None)

        self.params['QueueURL'] = self.backend.queueUrl  # url this queue is visible on
        # backend took previous QueueURL and sanitized it
        self.params.setdefault('WMBSUrl', None)  # this will only be set on local Queue
        if self.params.get('WMBSUrl'):
            self.params['WMBSUrl'] = Lexicon.sanitizeURL(
                self.params['WMBSUrl'])['url']
        self.params.setdefault('Teams', [])
        self.params.setdefault('DrainMode', False)
        if self.params.get('CacheDir'):
            try:
                os.makedirs(self.params['CacheDir'])
            except OSError:
                # best effort: any OSError from makedirs is ignored here
                pass
        elif self.params.get('PopulateFilesets'):
            raise RuntimeError('CacheDir mandatory for local queue')

        self.params.setdefault('SplittingMapping', {})
        # start policy name -> name of the splitter used for it by default
        defaultSplitters = (('DatasetBlock', 'Block'),
                            ('MonteCarlo', 'MonteCarlo'),
                            ('Dataset', 'Dataset'),
                            ('Block', 'Block'),
                            ('ResubmitBlock', 'ResubmitBlock'))
        for policyName, splitterName in defaultSplitters:
            self.params['SplittingMapping'].setdefault(policyName, {
                'name': splitterName,
                'args': {}
            })

        self.params.setdefault('EndPolicySettings', {})

        assert (self.params['TrackLocationOrSubscription'] in ('subscription',
                                                               'location'))
        # Can only release blocks on location
        if self.params['TrackLocationOrSubscription'] == 'location':
            if self.params['SplittingMapping']['DatasetBlock'][
                    'name'] != 'Block':
                raise RuntimeError('Only blocks can be released on location')

        if self.params.get('PhEDEx'):
            self.phedexService = self.params['PhEDEx']
        else:
            phedexArgs = {}
            if self.params.get('PhEDExEndpoint'):
                phedexArgs['endpoint'] = self.params['PhEDExEndpoint']
            self.phedexService = PhEDEx(phedexArgs)

        if self.params.get('SiteDB'):
            self.SiteDB = self.params['SiteDB']
        else:
            self.SiteDB = SiteDB()

        # a comma separated string of teams becomes a list of names;
        # isinstance also accepts subclasses of the string types
        if isinstance(self.params['Teams'], types.StringTypes):
            self.params['Teams'] = [x.strip() for x in
                                    self.params['Teams'].split(',')]

        # NOTE(review): requireBlocksSubscribed is derived from
        # 'ReleaseIncompleteBlocks' while 'ReleaseRequireSubscribed' is
        # never read in this method - confirm that this is intended.
        self.dataLocationMapper = WorkQueueDataLocationMapper(
            self.logger,
            self.backend,
            phedex=self.phedexService,
            sitedb=self.SiteDB,
            locationFrom=self.params['TrackLocationOrSubscription'],
            incompleteBlocks=self.params['ReleaseIncompleteBlocks'],
            requireBlocksSubscribed=not self.params['ReleaseIncompleteBlocks'],
            fullRefreshInterval=self.params['FullLocationRefreshInterval'],
            updateIntervalCoarseness=self.params['LocationRefreshInterval'])

        # initialize alerts sending client (self.sendAlert() method)
        # usage: self.sendAlert(levelNum, msg = msg) ; level - integer 1 .. 10
        #    1 - 4 - lower levels ; 5 - 10 higher levels
        preAlert, self.alertSender = \
            alertAPI.setUpAlertsMessaging(self, compName = "WorkQueueManager")
        self.sendAlert = alertAPI.getSendAlert(sender=self.alertSender,
                                               preAlert=preAlert)

        self.logger.debug("WorkQueue created successfully")