def setUpCouchDBReplication(self):
    self.replicatorDocs = []
    # set up common replication code
    wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
    wmstatsTarget = self.config.General.centralWMStatsURL
    self.replicatorDocs.append({'source': wmstatsSource,
                                'target': wmstatsTarget,
                                'filter': "WMStatsAgent/repfilter"})
    if self.isT0agent:
        t0Source = self.config.Tier0Feeder.requestDBName
        t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
        self.replicatorDocs.append({'source': t0Source,
                                    'target': t0Target,
                                    'filter': "T0Request/repfilter"})
    else:
        # set up workqueue replication
        wqfilter = 'WorkQueue/queueFilter'
        parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
        childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
        query_params = {'childUrl': childURL,
                        'parentUrl': sanitizeURL(parentQURL)['url']}
        localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
        self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                    'target': localQInboxURL,
                                    'filter': wqfilter,
                                    'query_params': query_params})
        self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                    'target': parentQURL,
                                    'filter': wqfilter,
                                    'query_params': query_params})

    # delete old replicator docs before setting up
    self.localCouchMonitor.deleteReplicatorDocs()

    for rp in self.replicatorDocs:
        self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                     filter=rp['filter'],
                                                     query_params=rp.get('query_params', False),
                                                     continuous=True)
    # The first cycle needs to be skipped since the documents are not updated that fast
    self.skipReplicationCheck = True
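# Every snippet in this section leans on sanitizeURL(), which strips any
# credentials embedded in a CouchDB URL and hands them back separately.
# Below is a minimal sketch of the presumed contract, written against the
# stdlib only; the real helper lives elsewhere in WMCore, so the exact
# import path and edge-case handling here are assumptions for illustration.
import urlparse

def sanitizeURL_sketch(url):
    """Return {'url': credential-free URL, 'username': ..., 'password': ...}."""
    parts = urlparse.urlparse(url)
    if parts.username is None:
        return {'url': url, 'username': None, 'password': None}
    # rebuild the netloc without the user:pass@ prefix
    host = parts.hostname
    if parts.port:
        host = "%s:%s" % (host, parts.port)
    clean = urlparse.urlunparse((parts.scheme, host, parts.path,
                                 parts.params, parts.query, parts.fragment))
    return {'url': clean, 'username': parts.username, 'password': parts.password}

# e.g. sanitizeURL_sketch('http://user:pass@couch.example:5984/workqueue')
# -> {'url': 'http://couch.example:5984/workqueue', 'username': 'user', 'password': 'pass'}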
def __init__(self, db_url, db_name='workqueue',
             inbox_name='workqueue_inbox', parentQueue=None,
             queueUrl=None, logger=None):
    if logger:
        self.logger = logger
    else:
        import logging
        self.logger = logging
    self.server = CouchServer(db_url)
    self.parentCouchUrlWithAuth = parentQueue
    if parentQueue:
        self.parentCouchUrl = sanitizeURL(parentQueue)['url']
    else:
        self.parentCouchUrl = None
    self.db = self.server.connectDatabase(db_name, create=False, size=10000)
    self.hostWithAuth = db_url
    self.inbox = self.server.connectDatabase(inbox_name, create=False, size=10000)
    self.queueUrl = sanitizeURL(queueUrl or (db_url + '/' + db_name))['url']
def main():
    """
    _main_
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiate central reqmgr and local workqueue
    print "ReqMgr2 URL  : %s" % sanitizeURL(config.JobUpdater.reqMgr2Url)['url']
    print "WorkQueue URL: %s and dbname %s" % (sanitizeURL(config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname)
    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl, config.WorkQueueManager.dbname)

    print "\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue"
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = [x for x in workqueue.getAvailableWorkflows()]
    print "Retrieved %d workflows from workqueue" % len(workflowsToCheck)
    for workflow, priority in workflowsToCheck:
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
            except Exception, ex:
                print "Couldn't retrieve the priority of request %s" % workflow
                print "Error: %s" % ex
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
def __init__(self, db_url, db_name='workqueue',
             inbox_name=None, parentQueue=None,
             queueUrl=None, logger=None):
    if logger:
        self.logger = logger
    else:
        import logging
        self.logger = logging
    if inbox_name is None:
        inbox_name = "%s_inbox" % db_name
    self.server = CouchServer(db_url)
    self.parentCouchUrlWithAuth = parentQueue
    if parentQueue:
        self.parentCouchUrl = sanitizeURL(parentQueue)['url']
    else:
        self.parentCouchUrl = None
    self.db = self.server.connectDatabase(db_name, create=False, size=10000)
    self.hostWithAuth = db_url
    self.inbox = self.server.connectDatabase(inbox_name, create=False, size=10000)
    self.queueUrl = sanitizeURL(queueUrl or (db_url + '/' + db_name))['url']
    self.eleKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
def post(self, workload_pair_list, multi_update_flag=False):
    """
    Create and update couchDB with a new request.
    The request argument is passed from validation
    (validation converts the cherrypy.request.body data into the argument).

    TODO: this method will have some parts factored out so that
    e.g. the clone call can share functionality.

    NOTES:
    1) do not strip spaces, #4705 will fail upon injection with spaces;
       currently the chain relies on a number of things coming in #4705
    2) reqInputArgs = Utilities.unidecode(JsonWrapper.loads(body))
       (from ReqMgrRESTModel.putRequest)
    """
    # store the request document into Couch
    if multi_update_flag:
        return self.put(workload_pair_list)

    out = []
    for workload, request_args in workload_pair_list:
        cherrypy.log("INFO: Create request, input args: %s ..." % request_args)
        request_args['RequestWorkflow'] = sanitizeURL("%s/%s/%s/spec" % (request_args["CouchURL"],
                                                                         request_args["CouchWorkloadDBName"],
                                                                         workload.name()))['url']
        workload.saveCouch(request_args["CouchURL"], request_args["CouchWorkloadDBName"],
                           metadata=request_args)
        out.append({'request': workload.name()})
    return out
def setup(self, parameters):
    """
    Called at startup
    """
    # set the connection for local couchDB call
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                               'useReqMgrForCompletionCheck', True)
    self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
    # TODO: we might need to use local db for Tier0
    self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                  couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

    if self.useReqMgrForCompletionCheck:
        self.deletableStates = ["announced"]
        self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        # TODO: remove this for reqmgr2
        self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
    else:
        # Tier0 case
        self.deletableStates = ["completed"]
        # use local for update
        self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL,
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

    jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
    jobDBName = self.config.JobStateMachine.couchDBName
    self.jobCouchdb = CouchServer(jobDBurl)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
    statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
    self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)
def setup(self, parameters):
    """
    Called at startup
    """
    # set the connection for local couchDB call
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                               'useReqMgrForCompletionCheck', True)
    self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
    self.centralCouchDBReader = WMStatsReader(self.config.TaskArchiver.centralWMStatsURL)

    if self.useReqMgrForCompletionCheck:
        self.deletableStates = ["announced"]
        self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)
        self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
    else:
        # Tier0 case
        self.deletableStates = ["completed"]
        self.centralCouchDBWriter = self.wmstatsCouchDB

    jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
    jobDBName = self.config.JobStateMachine.couchDBName
    self.jobCouchdb = CouchServer(jobDBurl)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
    statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
    self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)
def checkReplicationStatus(self):
    """
    _checkReplicationStatus_

    Check whether the workqueue replication is OK; if not, delete the
    replication documents so that new replications can be triggered
    when appropriate.
    It returns True if there is no error, and False otherwise.
    """
    status = self.server.status()
    replicationError = False
    replicationCount = 0
    expectedReplicationCount = 2  # GQ -> LQ-Inbox & LQ-Inbox -> GQ

    # Remove the protocol from the sanitized url
    inboxUrl = sanitizeURL('%s/%s' % (self.server.url, self.inbox.name))['url'].split('/', 2)[2]
    try:
        for activeTasks in status['active_tasks']:
            if activeTasks['type'] == 'Replication':
                if inboxUrl in activeTasks['task']:
                    replicationCount += 1
        if replicationCount < expectedReplicationCount:
            replicationError = True
    except Exception:
        replicationError = True

    if replicationError:
        # Stop workqueue related replication
        self.logger.error("Stopping replication as it was in error state. It will be restarted.")
        self.pullFromParent(continuous=True, cancel=True)
        self.sendToParent(continuous=True, cancel=True)

    return not replicationError
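# A plausible way to drive the check above (the wrapper below is an
# illustrative assumption, not code from this section): poll it periodically
# and, whenever it reports an error, re-issue the continuous replications.
# checkReplicationStatus() has already cancelled the broken ones via
# pullFromParent/sendToParent(cancel=True), so a plain restart suffices.
def monitorReplication(backend):
    """backend: any queue backend exposing the methods used above."""
    if not backend.checkReplicationStatus():
        backend.pullFromParent(continuous=True)  # GQ -> LQ-Inbox
        backend.sendToParent(continuous=True)    # LQ-Inbox -> GQ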
def _getCouchACDCHtmlBase(acdcCouchURL):
    """
    TODO: currently this is hard-coded to the front page of ACDC.
    When more information becomes available, it can be extended.
    """
    return '%s/_design/ACDC/collections.html' % sanitizeURL(acdcCouchURL)['url']
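# For illustration only (host and credentials are made up): the helper both
# strips embedded credentials and appends the couchapp page path.
# _getCouchACDCHtmlBase('http://user:pass@couch.example:5984/acdcserver')
# -> 'http://couch.example:5984/acdcserver/_design/ACDC/collections.html'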
def __init__(self, couchURL, reqdbURL=None, reqdbCouchApp="ReqMgr"):
    couchURL = sanitizeURL(couchURL)['url']
    # set the connection for local couchDB call
    self._commonInit(couchURL)
    if reqdbURL:
        self.reqDB = RequestDBReader(reqdbURL)
    else:
        self.reqDB = None
def __init__(self, couchURL, dbName=None):
    couchURL = sanitizeURL(couchURL)['url']
    # set the connection for local couchDB call
    if dbName:
        self.couchURL = couchURL
        self.dbName = dbName
    else:
        self.couchURL, self.dbName = splitCouchServiceURL(couchURL)
    self.couchServer = CouchServer(self.couchURL)
    self.couchDB = CouchServer(self.couchURL).connectDatabase(self.dbName, False)
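# splitCouchServiceURL() is not defined in this section; from the call above
# it presumably splits a couch service URL into the server URL and the
# database name. A minimal sketch under that assumption:
def splitCouchServiceURL_sketch(serviceURL):
    """Split 'http://host:5984/dbname' into ('http://host:5984', 'dbname')."""
    base, dbName = serviceURL.rstrip('/').rsplit('/', 1)
    return base, dbName

# e.g. splitCouchServiceURL_sketch('http://couch.example:5984/wmstats')
# -> ('http://couch.example:5984', 'wmstats')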
def setup(self, parameters):
    """
    Called at startup
    """
    # set the connection for local couchDB call
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                               'useReqMgrForCompletionCheck', True)
    self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
    self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)
    self.centralCouchDBReader = WMStatsReader(self.config.TaskArchiver.centralWMStatsURL)
    jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
    jobDBName = self.config.JobStateMachine.couchDBName
    self.jobCouchdb = CouchServer(jobDBurl)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
def archiveCouchSummary(self, workflow, spec):
    """
    _archiveCouchSummary_

    For each workflow pull its information from couch and turn it into
    a summary for archiving
    """
    failedJobs = []
    jobErrors = []
    outputLFNs = []
    workflowData = {}
    workflowName = workflow.task.split('/')[1]

    # Set campaign
    workflowData['campaign'] = spec.getCampaign()

    # Get a list of failed job IDs
    # Make sure you get it for ALL tasks in the spec
    for taskName in spec.listAllTaskPathNames():
        failedTmp = self.jobsdatabase.loadView("JobDump", "failedJobsByWorkflowName",
                                               options={"startkey": [workflowName, taskName],
                                                        "endkey": [workflowName, taskName]})['rows']
        for entry in failedTmp:
            failedJobs.append(entry['value'])

    output = self.fwjrdatabase.loadView("FWJRDump", "outputByWorkflowName",
                                        options={"group_level": 2,
                                                 "startkey": [workflowName],
                                                 "endkey": [workflowName, {}],
                                                 "group": True})['rows']

    perf = self.handleCouchPerformance(workflowName=workflowName)
    workflowData['performance'] = {}
    for key in perf:
        workflowData['performance'][key] = {}
        for attr in perf[key].keys():
            workflowData['performance'][key][attr] = perf[key][attr]

    workflowData["_id"] = workflow.task.split('/')[1]
    try:
        workflowData["ACDCServer"] = sanitizeURL(self.config.ACDC.couchurl)['url']
        workflowData["ACDCDatabase"] = self.config.ACDC.database
    except AttributeError, ex:
        # We're missing the ACDC info.
        # Keep going
        logging.error("ACDC info missing from config. Skipping this step in the workflow summary.")
        logging.debug("Error: %s" % str(ex))
def testResetWork(self):
    """Reset work in global to different child queue"""
    # TODO: This test sometimes fails - I suspect a race condition (maybe a conflict in couch).
    # Cancel code needs reworking, so this will hopefully be fixed then.
    totalBlocks = 2
    self.globalQueue.queueWork(self.processingSpec.specUrl())
    self.globalQueue.updateLocationInfo()
    self.assertEqual(self.localQueue.pullWork({'T2_XX_SiteA': 1000}), totalBlocks)
    syncQueues(self.localQueue)
    work = self.localQueue.getWork({'T2_XX_SiteA': 1000, 'T2_XX_SiteB': 1000})
    self.assertEqual(len(work), totalBlocks)
    self.assertEqual(len(self.localQueue.status(status='Running')), 2)
    syncQueues(self.localQueue)
    self.assertEqual(len(self.globalQueue.status(status='Running')), 2)

    # Re-assign work in global
    self.globalQueue.resetWork([x.id for x in self.globalQueue.status(status='Running')])

    # work should be canceled in local
    # TODO: Note the work in local will be orphaned but not canceled
    syncQueues(self.localQueue)
    work_at_local = [x for x in self.globalQueue.status(status='Running')
                     if x['ChildQueueUrl'] == sanitizeURL(self.localQueue.params['QueueURL'])['url']]
    self.assertEqual(len(work_at_local), 0)

    # now the 2nd queue calls and acquires work
    self.assertEqual(self.localQueue2.pullWork({'T2_XX_SiteA': 1000}), totalBlocks)
    syncQueues(self.localQueue2)

    # check work in global is assigned to local2
    self.assertEqual(len(self.localQueue2.status(status='Available')), 2)
    # work in local2
    work_at_local2 = [x for x in self.globalQueue.status(status='Acquired')
                      if x['ChildQueueUrl'] == sanitizeURL(self.localQueue2.params['QueueURL'])['url']]
    self.assertEqual(len(work_at_local2), 2)
def setUpCouchDBReplication(self):
    self.replicatorDocs = []
    # set up common replication code
    wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
    wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
    self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                'filter': "WMStatsAgent/repfilter"})
    # TODO: tier0 specific code - need to make it generic
    if hasattr(self.config, "Tier0Feeder"):
        t0Source = self.config.Tier0Feeder.requestDBName
        t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
        self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                    'filter': "T0Request/repfilter"})
    else:
        # set up workqueue replication
        wqfilter = 'WorkQueue/queueFilter'
        parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
        childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
        query_params = {'childUrl': childURL,
                        'parentUrl': sanitizeURL(parentQURL)['url']}
        localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
        self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                    'target': localQInboxURL,
                                    'filter': wqfilter,
                                    'query_params': query_params})
        self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                    'target': parentQURL,
                                    'filter': wqfilter,
                                    'query_params': query_params})

    # delete old replicator docs before setting up
    self.localCouchMonitor.deleteReplicatorDocs()

    for rp in self.replicatorDocs:
        self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                     filter=rp['filter'],
                                                     query_params=rp.get('query_params', False),
                                                     continuous=True,
                                                     useReplicator=True)
    # The first cycle needs to be skipped since the documents are not updated that fast
    self.skipReplicationCheck = True
def setup(self, parameters):
    """
    Called at startup
    """
    # set the connection for local couchDB call
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                               'useReqMgrForCompletionCheck', True)
    self.archiveDelayHours = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0)
    self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL,
                                        "WMStatsAgent")
    # TODO: we might need to use local db for Tier0
    self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                  couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

    if self.useReqMgrForCompletionCheck:
        self.deletableState = "announced"
        self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        if self.config.TaskArchiver.reqmgr2Only:
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
        else:
            # TODO: remove this for reqmgr2
            self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
    else:
        # Tier0 case
        self.deletableState = "completed"
        # use local for update
        self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL,
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

    jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
    jobDBName = self.config.JobStateMachine.couchDBName
    self.jobCouchdb = CouchServer(jobDBurl)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
    statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
    self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)
def _update_additional_request_args(self, workload, request_args):
    """
    Add to request_args properties which are not initially set by the user.
    This data will be put into couchdb.
    Update request_args here if additional information needs to be stored in couchdb.
    """
    request_args['RequestWorkflow'] = sanitizeURL("%s/%s/%s/spec" % (request_args["CouchURL"],
                                                                     request_args["CouchWorkloadDBName"],
                                                                     workload.name()))['url']

    # Add the output datasets if necessary
    # for some bizarre reason OutputDatasets is a list of lists
    request_args['OutputDatasets'] = workload.listOutputDatasets()

    # TODO: remove this after reqmgr2 replaces reqmgr (reqmgr2Only)
    request_args['ReqMgr2Only'] = True
    return
def __init__(self, url='http://localhost', idict=None):
    """
    url should really be host - TODO fix that when we have sufficient code
    coverage, and change _getURLOpener if needed
    """
    if not idict:
        idict = {}
    dict.__init__(self, idict)
    self.pycurl = idict.get('pycurl', None)
    self.capath = idict.get('capath', None)
    if self.pycurl:
        self.reqmgr = RequestHandler()

    # set up defaults
    self.setdefault("accept_type", 'text/html')
    self.setdefault("content_type", 'application/x-www-form-urlencoded')
    self.additionalHeaders = {}

    # check for basic auth early, as if found this changes the url
    urlComponent = sanitizeURL(url)
    if urlComponent['username'] is not None:
        self.addBasicAuth(urlComponent['username'], urlComponent['password'])
        url = urlComponent['url']  # remove user, password from url

    self.setdefault("host", url)

    # then update with the incoming dict
    self.update(idict)

    self['endpoint_components'] = urlparse.urlparse(self['host'])

    # If cachepath = None disable caching
    if 'cachepath' in idict and idict['cachepath'] is None:
        self["req_cache_path"] = None
    else:
        cache_dir = self.cachePath(idict.get('cachepath'), idict.get('service_name'))
        self["cachepath"] = cache_dir
        self["req_cache_path"] = os.path.join(cache_dir, '.cache')

    self.setdefault("cert", None)
    self.setdefault("key", None)
    self.setdefault('capath', None)
    self.setdefault("timeout", 300)
    self.setdefault("logger", logging)

    check_server_url(self['host'])
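# What the basic-auth branch above achieves, shown with a made-up endpoint:
# credentials embedded in the URL are moved into a request header and the
# stored host is left credential-free.
# req = Requests('http://user:secret@couch.example:5984')
# req['host']           -> 'http://couch.example:5984'
# req.additionalHeaders -> carries whatever addBasicAuth() added; the exact
#                          header construction lives in addBasicAuth, which
#                          is not shown in this section.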
def __init__(self, url='http://localhost', idict=None):
    """
    url should really be host - TODO fix that when we have sufficient code
    coverage, and change _getURLOpener if needed
    """
    if not idict:
        idict = {}
    dict.__init__(self, idict)
    self.pycurl = idict.get('pycurl', None)
    self.capath = idict.get('capath', None)
    if self.pycurl:
        self.reqmgr = RequestHandler()

    # set up defaults
    self.setdefault("accept_type", 'text/html')
    self.setdefault("content_type", 'application/x-www-form-urlencoded')
    self.additionalHeaders = {}

    # check for basic auth early, as if found this changes the url
    urlComponent = sanitizeURL(url)
    if urlComponent['username'] is not None:
        self.addBasicAuth(urlComponent['username'], urlComponent['password'])
        url = urlComponent['url']  # remove user, password from url

    self.setdefault("host", url)

    # then update with the incoming dict
    self.update(idict)

    self['endpoint_components'] = urlparse.urlparse(self['host'])

    # If cachepath = None disable caching
    if 'cachepath' in idict and idict['cachepath'] is None:
        self["req_cache_path"] = None
    else:
        cache_dir = self.cachePath(idict.get('cachepath'), idict.get('service_name'))
        self["cachepath"] = cache_dir
        self["req_cache_path"] = os.path.join(cache_dir, '.cache')

    self.setdefault("timeout", 300)
    self.setdefault("logger", logging)

    check_server_url(self['host'])

    # and then get the URL opener
    self.setdefault("conn", self._getURLOpener())
def _commonInit(self, couchURL, couchapp):
    """
    Set up common variables for inherited classes.
    Inherited classes should call this in their init function.
    """
    if isinstance(couchURL, Database):
        self.couchDB = couchURL
        self.couchURL = self.couchDB['host']
        self.dbName = self.couchDB.name
        self.couchServer = CouchServer(self.couchURL)
    else:
        couchURL = sanitizeURL(couchURL)['url']
        self.couchURL, self.dbName = splitCouchServiceURL(couchURL)
        self.couchServer = CouchServer(self.couchURL)
        self.couchDB = self.couchServer.connectDatabase(self.dbName, False)
    self.couchapp = couchapp
    self.defaultStale = {"stale": "update_after"}
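# _commonInit() accepts either an already-connected Database object or a
# plain service URL. Both call forms are sketched below; the subclass name
# is a placeholder, since the concrete classes are defined elsewhere.
# URL form - credentials stripped, database connected here:
#   svc = SomeCouchService('http://user:pass@couch.example:5984/wmstats',
#                          couchapp='WMStats')
# Database form - reuse an existing connection:
#   db = CouchServer('http://couch.example:5984').connectDatabase('wmstats', False)
#   svc = SomeCouchService(db, couchapp='WMStats')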
def _update_additional_request_args(self, workload, request_args):
    """
    Add to request_args properties which are not initially set by the user.
    This data will be put into couchdb.
    Update request_args here if additional information needs to be stored in couchdb.
    """
    request_args['RequestWorkflow'] = sanitizeURL("%s/%s/%s/spec" % (request_args["CouchURL"],
                                                                     request_args["CouchWorkloadDBName"],
                                                                     workload.name()))['url']

    # Add the output datasets if necessary
    # for some bizarre reason OutputDatasets is a list of lists
    request_args['OutputDatasets'] = workload.listOutputDatasets()

    # Add the initial priority only at creation of the request
    request_args['InitialPriority'] = request_args["RequestPriority"]
    return
def testProductionMultiQueue(self):
    """Test production with multiple queues"""
    specfile = self.spec.specUrl()
    numUnit = 1
    jobSlot = [10] * numUnit  # array of jobs per block
    total = sum(jobSlot)

    self.globalQueue.queueWork(specfile)
    self.assertEqual(numUnit, len(self.globalQueue))

    # pull work to localQueue2 - check local doesn't get any
    self.assertEqual(numUnit, self.localQueue2.pullWork({'T2_XX_SiteA': total}))
    self.assertEqual(0, self.localQueue.pullWork({'T2_XX_SiteA': total}))
    syncQueues(self.localQueue)
    syncQueues(self.localQueue2)
    self.assertEqual(numUnit, len(self.localQueue2.status(status='Available')))
    self.assertEqual(0, len(self.localQueue.status(status='Available')))
    self.assertEqual(numUnit, len(self.globalQueue.status(status='Acquired')))
    self.assertEqual(sanitizeURL(self.localQueue2.params['QueueURL'])['url'],
                     self.globalQueue.status()[0]['ChildQueueUrl'])
def setSpecUrl(self, url):
    self.data.persistency.specUrl = sanitizeURL(url)["url"]
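# The setter strips credentials before the spec URL is persisted, so a
# user:pass pair never ends up inside a stored workload document.
# Illustration with a made-up URL:
# workload.setSpecUrl('http://user:pass@couch.example:5984/reqmgr_workload_cache/wf/spec')
# workload.data.persistency.specUrl
# -> 'http://couch.example:5984/reqmgr_workload_cache/wf/spec'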
def testA_BasicFunctionTest(self):
    """
    _BasicFunctionTest_

    Tests the components, by seeing if they can process a simple set of closeouts
    """
    myThread = threading.currentThread()

    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=workloadPath)
    testJobGroup = self.createTestJobGroup(config=config,
                                           name=workload.name(),
                                           specLocation=workloadPath,
                                           error=False)

    # Create second workload
    testJobGroup2 = self.createTestJobGroup(config=config,
                                            name=workload.name(),
                                            filesetName="TestFileset_2",
                                            specLocation=workloadPath,
                                            task="/TestWorkload/ReReco/LogCollect")

    cachePath = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    cachePath2 = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "LogCollect")
    os.makedirs(cachePath2)
    self.assertTrue(os.path.exists(cachePath2))

    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 2)

    workflowName = "TestWorkload"
    dbname = config.TaskArchiver.workloadSummaryCouchDBName
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase(dbname)
    jobdb = couchdb.connectDatabase("%s/jobs" % self.databaseName)
    fwjrdb = couchdb.connectDatabase("%s/fwjrs" % self.databaseName)

    jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                          options={"startkey": [workflowName],
                                   "endkey": [workflowName, {}]})['rows']
    self.assertEqual(len(jobs), 2 * self.nJobs)

    from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
    create = CreateWMBSBase()
    tables = []
    for x in create.requiredTables:
        tables.append(x[2:])

    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    result = myThread.dbi.processData("SELECT * FROM wmbs_job")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_jobgroup")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
    self.assertEqual(len(result), 0)

    # Make sure we deleted the directory
    self.assertFalse(os.path.exists(cachePath))
    self.assertFalse(os.path.exists(os.path.join(self.testDir, 'workloadTest/TestWorkload')))

    testWMBSFileset = Fileset(id=1)
    self.assertEqual(testWMBSFileset.exists(), False)

    workloadSummary = workdatabase.document(id="TestWorkload")

    # Check ACDC
    self.assertEqual(workloadSummary['ACDCServer'], sanitizeURL(config.ACDC.couchurl)['url'])

    # Check the output
    self.assertEqual(workloadSummary['output'].keys(),
                     ['/Electron/MorePenguins-v0/RECO',
                      '/Electron/MorePenguins-v0/ALCARECO'])

    # Check performance
    # Check histograms
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['average'],
                            0.062651899999999996, places=2)
    self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['nEvents'],
                     5)

    # Check standard performance
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['average'],
                            9.4950600000000005, places=2)
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['stdDev'],
                            8.2912400000000002, places=2)

    # Check worstOffenders
    self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['worstOffenders'],
                     [{'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                      {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 2},
                      {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 3}])

    # Check retryData
    self.assertEqual(workloadSummary['retryData']['/TestWorkload/ReReco'], {'0': 10})

    # LogCollect task is made out of identical FWJRs
    # assert that it is identical
    for x in workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'].keys():
        if x in config.TaskArchiver.histogramKeys:
            continue
        for y in ['average', 'stdDev']:
            self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'][x][y],
                                    workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1'][x][y],
                                    places=2)

    # The TestWorkload should have no jobs left
    workflowName = "TestWorkload"
    jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                          options={"startkey": [workflowName],
                                   "endkey": [workflowName, {}]})['rows']
    self.assertEqual(len(jobs), 0)
    jobs = fwjrdb.loadView("FWJRDump", "fwjrsByWorkflowName",
                           options={"startkey": [workflowName],
                                    "endkey": [workflowName, {}]})['rows']
    self.assertEqual(len(jobs), 0)
    return
def testA_BasicFunctionTest(self):
    """
    _BasicFunctionTest_

    Tests the components, by seeing if they can process a simple set of closeouts
    """
    myThread = threading.currentThread()

    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=workloadPath)
    testJobGroup = self.createTestJobGroup(config=config,
                                           name=workload.name(),
                                           specLocation=workloadPath,
                                           error=False)

    # Create second workload
    testJobGroup2 = self.createTestJobGroup(config=config,
                                            name=workload.name(),
                                            filesetName="TestFileset_2",
                                            specLocation=workloadPath,
                                            task="/TestWorkload/ReReco/LogCollect",
                                            type="LogCollect")

    cachePath = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    cachePath2 = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "LogCollect")
    os.makedirs(cachePath2)
    self.assertTrue(os.path.exists(cachePath2))

    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 2)

    workflowName = "TestWorkload"
    dbname = config.TaskArchiver.workloadSummaryCouchDBName
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase(dbname)
    jobdb = couchdb.connectDatabase("%s/jobs" % self.databaseName)
    fwjrdb = couchdb.connectDatabase("%s/fwjrs" % self.databaseName)

    jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                          options={"startkey": [workflowName],
                                   "endkey": [workflowName, {}]})['rows']
    fwjrdb.loadView("FWJRDump", "fwjrsByWorkflowName",
                    options={"startkey": [workflowName],
                             "endkey": [workflowName, {}]})['rows']
    self.assertEqual(len(jobs), 2 * self.nJobs)

    from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
    create = CreateWMBSBase()
    tables = []
    for x in create.requiredTables:
        tables.append(x[2:])

    self.populateWorkflowWithCompleteStatus()
    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    cleanCouch = CleanCouchPoller(config=config)
    cleanCouch.setup()
    cleanCouch.algorithm()

    result = myThread.dbi.processData("SELECT * FROM wmbs_job")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_jobgroup")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
    self.assertEqual(len(result), 0)

    # Make sure we deleted the directory
    self.assertFalse(os.path.exists(cachePath))
    self.assertFalse(os.path.exists(os.path.join(self.testDir, 'workloadTest/TestWorkload')))

    testWMBSFileset = Fileset(id=1)
    self.assertEqual(testWMBSFileset.exists(), False)

    workloadSummary = workdatabase.document(id="TestWorkload")

    # Check ACDC
    self.assertEqual(workloadSummary['ACDCServer'], sanitizeURL(config.ACDC.couchurl)['url'])

    # Check the output
    self.assertEqual(workloadSummary['output'].keys(), ['/Electron/MorePenguins-v0/RECO'])
    self.assertEqual(sorted(workloadSummary['output']['/Electron/MorePenguins-v0/RECO']['tasks']),
                     ['/TestWorkload/ReReco', '/TestWorkload/ReReco/LogCollect'])

    # Check performance
    # Check histograms
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['average'],
                            0.89405199999999996, places=2)
    self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['nEvents'],
                     10)

    # Check standard performance
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['average'],
                            17.786300000000001, places=2)
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['stdDev'],
                            0.0, places=2)

    # Check worstOffenders
    self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['worstOffenders'],
                     [{'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                      {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                      {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 2}])

    # Check retryData
    self.assertEqual(workloadSummary['retryData']['/TestWorkload/ReReco'], {'1': 10})

    logCollectPFN = 'srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms/store/logs/prod/2012/11/WMAgent/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8-AlcaSkimLogCollect-1-logs.tar'
    self.assertEqual(workloadSummary['logArchives'],
                     {'/TestWorkload/ReReco/LogCollect': [logCollectPFN for _ in range(10)]})

    # LogCollect task is made out of identical FWJRs
    # assert that it is identical
    for x in workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'].keys():
        if x in config.TaskArchiver.histogramKeys:
            continue
        for y in ['average', 'stdDev']:
            self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'][x][y],
                                    workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1'][x][y],
                                    places=2)
    return
def _sanitizeURL(self, couchURL):
    return sanitizeURL(couchURL)['url']
def recordInCouch(self, jobs, newstate, oldstate, updatesummary=False):
    """
    _recordInCouch_

    Record relevant job information in couch. If the job does not yet
    exist in couch it will be saved as a separate document. If the job
    has a FWJR attached that will be saved as a separate document.
    """
    if not self._connectDatabases():
        logging.error('Databases not connected properly')
        return

    timestamp = int(time.time())
    couchRecordsToUpdate = []

    for job in jobs:
        couchDocID = job.get("couch_record", None)

        if newstate == "new":
            oldstate = "none"

        if job.get("site_cms_name", None):
            if newstate == "executing":
                jobLocation = job["site_cms_name"]
            else:
                jobLocation = "Agent"
        else:
            jobLocation = "Agent"

        if couchDocID == None:
            jobDocument = {}
            jobDocument["_id"] = str(job["id"])
            job["couch_record"] = jobDocument["_id"]
            jobDocument["jobid"] = job["id"]
            jobDocument["workflow"] = job["workflow"]
            jobDocument["task"] = job["task"]
            jobDocument["owner"] = job["owner"]

            jobDocument["inputfiles"] = []
            for inputFile in job["input_files"]:
                docInputFile = inputFile.json()
                docInputFile["parents"] = []
                for parent in inputFile["parents"]:
                    docInputFile["parents"].append({"lfn": parent["lfn"]})
                jobDocument["inputfiles"].append(docInputFile)

            jobDocument["states"] = {"0": {"oldstate": oldstate,
                                           "newstate": newstate,
                                           "location": jobLocation,
                                           "timestamp": timestamp}}

            jobDocument["jobgroup"] = job["jobgroup"]
            jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                   "LastEvent": job["mask"]["LastEvent"],
                                   "FirstLumi": job["mask"]["FirstLumi"],
                                   "LastLumi": job["mask"]["LastLumi"],
                                   "FirstRun": job["mask"]["FirstRun"],
                                   "LastRun": job["mask"]["LastRun"]}

            if job['mask']['runAndLumis'] != {}:
                # Then we have to save the mask runAndLumis
                jobDocument['mask']['runAndLumis'] = {}
                for key in job['mask']['runAndLumis'].keys():
                    jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

            jobDocument["name"] = job["name"]
            jobDocument["type"] = "job"
            jobDocument["user"] = job.get("user", None)
            jobDocument["group"] = job.get("group", None)
            jobDocument["taskType"] = job.get("taskType", "Unknown")
            jobDocument["jobType"] = job.get("jobType", "Unknown")

            couchRecordsToUpdate.append({"jobid": job["id"],
                                         "couchid": jobDocument["_id"]})
            self.jobsdatabase.queue(jobDocument, callback=discardConflictingDocument)
        else:
            # We send a PUT request to the stateTransition update handler.
            # Couch expects the parameters to be passed as arguments in
            # the URI while the Requests class will only encode arguments
            # this way for GET requests. Changing the Requests class to
            # encode PUT arguments as couch expects broke a bunch of code so
            # we'll just do our own encoding here.
            updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
            updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                newstate,
                                                                                jobLocation,
                                                                                timestamp)
            self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)

            # update the status of the summary doc only when it is explicitly requested
            # (the doc is already in couch)
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStats/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary status for job %s" % jobSummaryId)

                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStats/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary state history for job %s" % jobSummaryId)

        if job.get("fwjr", None):
            # If there are too many input files, strip them out of the FWJR,
            # as they should already be in the database.
            # This is not critical
            try:
                if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                    job['fwjr'].stripInputFiles()
            except Exception:
                logging.error("Error while trying to strip input files from FWJR. Ignoring.")

            # complete fwjr document
            job["fwjr"].setTaskName(job["task"])
            fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                            "jobid": job["id"],
                            "retrycount": job["retry_count"],
                            "fwjr": job["fwjr"].__to_json__(None),
                            "type": "fwjr"}

            self.fwjrdatabase.queue(fwjrDocument, timestamp=True, callback=discardConflictingDocument)

            updateSummaryDB(self.statsumdatabase, job)

            # TODO: could add a config switch to turn this on and off
            # if self.config.JobStateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
            if (job["retry_count"] > 0) or (newstate != 'success'):
                jobSummaryId = job["name"]
                # build a summary of the fwjr
                logging.debug("Pushing job summary for job %s" % jobSummaryId)
                errmsgs = {}
                inputs = []
                if "steps" in fwjrDocument["fwjr"]:
                    for step in fwjrDocument["fwjr"]["steps"]:
                        if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                            errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                        if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in fwjrDocument["fwjr"]["steps"][step]["input"]:
                            inputs.extend([source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"]
                                           if "runs" in source])

                outputs = []
                outputDataset = None
                for singlestep in job["fwjr"].listSteps():
                    for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                        if singlefile:
                            outputs.append({'type': 'output' if CMSSTEP.match(singlestep) else singlefile.get('module_label', None),
                                            'lfn': singlefile.get('lfn', None),
                                            'location': list(singlefile.get('locations', set([]))) if len(singlefile.get('locations', set([]))) > 1 else singlefile['locations'].pop(),
                                            'checksums': singlefile.get('checksums', {}),
                                            'size': singlefile.get('size', None)})
                            # it should have one output dataset for all the files
                            outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset

                inputFiles = []
                for inputFileStruct in job["fwjr"].getAllInputFiles():
                    # check if inputFileSummary needs to be extended
                    inputFileSummary = {}
                    inputFileSummary["lfn"] = inputFileStruct["lfn"]
                    inputFileSummary["input_type"] = inputFileStruct["input_type"]
                    inputFiles.append(inputFileSummary)

                # Don't record the intermediate jobfailed status in the jobsummary;
                # change it to jobcooloff, which will be overwritten by the error handler anyway
                if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                    summarystate = 'jobcooloff'
                else:
                    summarystate = newstate

                jobSummary = {"_id": jobSummaryId,
                              "wmbsid": job["id"],
                              "type": "jobsummary",
                              "retrycount": job["retry_count"],
                              "workflow": job["workflow"],
                              "task": job["task"],
                              "jobtype": job["jobType"],
                              "state": summarystate,
                              "site": job.get("location", None),
                              "cms_location": job["fwjr"].getSiteName(),
                              "exitcode": job["fwjr"].getExitCode(),
                              "errors": errmsgs,
                              "lumis": inputs,
                              "outputdataset": outputDataset,
                              "inputfiles": inputFiles,
                              "acdc_url": "%s/%s" % (sanitizeURL(self.config.ACDC.couchurl)['url'],
                                                     self.config.ACDC.database),
                              "agent_name": self.config.Agent.hostName,
                              "output": outputs}

                if couchDocID is not None:
                    try:
                        currentJobDoc = self.jsumdatabase.document(id=jobSummaryId)
                        jobSummary['_rev'] = currentJobDoc['_rev']
                        jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                        # record the final status transition
                        if newstate == 'success':
                            finalStateDict = {'oldstate': oldstate,
                                              'newstate': newstate,
                                              'location': job["location"],
                                              'timestamp': timestamp}
                            jobSummary['state_history'].append(finalStateDict)

                        noEmptyList = ["inputfiles", "lumis"]
                        for prop in noEmptyList:
                            jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                    except CouchNotFoundError:
                        pass

                self.jsumdatabase.queue(jobSummary, timestamp=True)

    if len(couchRecordsToUpdate) > 0:
        self.setCouchDAO.execute(bulkList=couchRecordsToUpdate,
                                 conn=self.getDBConn(),
                                 transaction=self.existingTransaction())

    self.jobsdatabase.commit(callback=discardConflictingDocument)
    self.fwjrdatabase.commit(callback=discardConflictingDocument)
    self.jsumdatabase.commit()
    return
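# For illustration (document id and values are made up), the hand-built
# stateTransition URI from the else-branch above ends up looking like:
#
#   /<jobs db name>/_design/JobDump/_update/stateTransition/1234
#       ?oldstate=created&newstate=executing&location=Agent&timestamp=1400000000
#
# CouchDB hands these query arguments to the stateTransition update function
# inside the JobDump design document, which is why they must be encoded in
# the URI rather than in the PUT body.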
destJobsDb.loadView("JobDump", "statusByWorkflowName", options = {"limit": 1}) print " Triggering view generation for fwjrs database..." destFwjrsDb.loadView("FWJRDump", "outputByWorkflowName", options = {"limit": 1}) print "" # Query destination DB for list of workflows summaryBase = "%s/%s%%2Ffwjrs/_design/FWJRDump/_show/workflowSummary/%s" # dest host, dest db base, workflow name successBase = "%s/%s%%2Fjobs/_design/JobDump/_list/successJobs/statusByWorkflowName?startkey=%%5B%%22%s%%22%%5D&endkey=%%5B%%22%s%%22%%2C%%7B%%7D%%5D&reduce=false" # dest host, dest db base, workflow, workflow failedBase = "%s/%s%%2Fjobs/_design/JobDump/_list/failedJobs/statusByWorkflowName?startkey=%%5B%%22%s%%22%%5D&endkey=%%5B%%22%s%%22%%2C%%7B%%7D%%5D&reduce=false" # dest host, dest db base, workflow, workflow srcJobsDb = srcCouchServer.connectDatabase(srcDbBase + "/jobs") statusResult = srcJobsDb.loadView("JobDump", "statusByWorkflowName", options = {"group_level": 1}) fileHandle = open("archived.html", "w") fileHandle.write("<html><head><title>Archived Workflows</title></head>\n") fileHandle.write("<body>\n") workflowNames = [] for statusRow in statusResult["rows"]: wfName = statusRow["key"][0] summaryUrl = summaryBase % (destCouchHost, destDbBase, wfName) successUrl = successBase % (destCouchHost, destDbBase, wfName, wfName) failedUrl = successBase % (destCouchHost, destDbBase, wfName, wfName) fileHandle.write("%s " % wfName) fileHandle.write("<a href=%s>(summary)</a>" % sanitizeURL(summaryUrl)["url"]) fileHandle.write(" <a href=%s>(success)</a>" % sanitizeURL(successUrl)["url"]) fileHandle.write(" <a href=%s>(failure)</a><br>\n" % sanitizeURL(failedUrl)["url"]) fileHandle.write("</body></html>\n") fileHandle.close()
def __init__(self, couchURL, couchapp="ReqMgr"): couchURL = sanitizeURL(couchURL)['url'] # set the connection for local couchDB call self._commonInit(couchURL, couchapp)
def testA_BasicFunctionTest(self):
    """
    _BasicFunctionTest_

    Tests the components, by seeing if they can process a simple set of closeouts
    """
    myThread = threading.currentThread()

    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=workloadPath)
    testJobGroup = self.createTestJobGroup(config=config,
                                           name=workload.name(),
                                           specLocation=workloadPath,
                                           error=False)

    # Create second workload
    testJobGroup2 = self.createTestJobGroup(config=config,
                                            name=workload.name(),
                                            filesetName="TestFileset_2",
                                            specLocation=workloadPath,
                                            task="/TestWorkload/ReReco/LogCollect")

    cachePath = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    cachePath2 = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "LogCollect")
    os.makedirs(cachePath2)
    self.assertTrue(os.path.exists(cachePath2))

    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 2)

    workflowName = "TestWorkload"
    dbname = config.TaskArchiver.workloadSummaryCouchDBName
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase(dbname)
    jobdb = couchdb.connectDatabase("%s/jobs" % self.databaseName)
    fwjrdb = couchdb.connectDatabase("%s/fwjrs" % self.databaseName)

    jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                          options={"startkey": [workflowName],
                                   "endkey": [workflowName, {}]})['rows']
    self.assertEqual(len(jobs), 2 * self.nJobs)

    from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
    create = CreateWMBSBase()
    tables = []
    for x in create.requiredTables:
        tables.append(x[2:])

    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    result = myThread.dbi.processData("SELECT * FROM wmbs_job")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_jobgroup")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
    self.assertEqual(len(result), 0)

    # Make sure we deleted the directory
    self.assertFalse(os.path.exists(cachePath))
    self.assertFalse(os.path.exists(os.path.join(self.testDir, 'workloadTest/TestWorkload')))

    testWMBSFileset = Fileset(id=1)
    self.assertEqual(testWMBSFileset.exists(), False)

    workloadSummary = workdatabase.document(id="TestWorkload")

    # Check ACDC
    self.assertEqual(workloadSummary['ACDCServer'], sanitizeURL(config.ACDC.couchurl)['url'])

    # Check the output
    self.assertEqual(workloadSummary['output'].keys(), ['/Electron/MorePenguins-v0/RECO'])
    self.assertEqual(sorted(workloadSummary['output']['/Electron/MorePenguins-v0/RECO']['tasks']),
                     ['/TestWorkload/ReReco', '/TestWorkload/ReReco/LogCollect'])

    # Check performance
    # Check histograms
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['average'],
                            0.89405199999999996, places=2)
    self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['nEvents'],
                     10)

    # Check standard performance
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['average'],
                            17.786300000000001, places=2)
    self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['stdDev'],
                            0.0, places=2)

    # Check worstOffenders
    self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['worstOffenders'],
                     [{'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                      {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                      {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 2}])

    # Check retryData
    self.assertEqual(workloadSummary['retryData']['/TestWorkload/ReReco'], {'1': 10})

    logCollectPFN = 'srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms/store/logs/prod/2012/11/WMAgent/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8-AlcaSkimLogCollect-1-logs.tar'
    self.assertEqual(workloadSummary['logArchives'],
                     {'/TestWorkload/ReReco/LogCollect': [logCollectPFN for _ in range(10)]})

    # LogCollect task is made out of identical FWJRs
    # assert that it is identical
    for x in workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'].keys():
        if x in config.TaskArchiver.histogramKeys:
            continue
        for y in ['average', 'stdDev']:
            self.assertAlmostEquals(workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'][x][y],
                                    workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1'][x][y],
                                    places=2)
    return
def archiveWorkflowSummary(self, spec):
    """
    _archiveWorkflowSummary_

    For each workflow pull its information from couch and WMBS and turn it into
    a summary for archiving
    """
    failedJobs = []
    workflowData = {'retryData': {}}
    workflowName = spec.name()

    # First make sure that we didn't upload something already.
    # It could be that the WMBS deletion failed,
    # so we can skip this if there is a summary already up there.
    # TODO: With multiple agents sharing workflows, we will need to differentiate and combine summaries for a request
    if self.workdatabase.documentExists(workflowName):
        logging.info("Couch summary for %s already exists, proceeding only with cleanup" % workflowName)
        return

    # Set campaign
    workflowData['campaign'] = spec.getCampaign()
    # Set inputdataset
    workflowData['inputdatasets'] = spec.listInputDatasets()

    # Set histograms
    histograms = {'workflowLevel': {'failuresBySite': DiscreteSummaryHistogram('Failed jobs by site', 'Site')},
                  'taskLevel': {},
                  'stepLevel': {}}

    # Get a list of failed job IDs
    # Make sure you get it for ALL tasks in the spec
    for taskName in spec.listAllTaskPathNames():
        failedTmp = self.jobsdatabase.loadView("JobDump", "failedJobsByWorkflowName",
                                               options={"startkey": [workflowName, taskName],
                                                        "endkey": [workflowName, taskName]})['rows']
        for entry in failedTmp:
            failedJobs.append(entry['value'])

    retryData = self.jobsdatabase.loadView("JobDump", "retriesByTask",
                                           options={'group_level': 3,
                                                    'startkey': [workflowName],
                                                    'endkey': [workflowName, {}]})['rows']
    for row in retryData:
        taskName = row['key'][2]
        count = str(row['key'][1])
        if taskName not in workflowData['retryData'].keys():
            workflowData['retryData'][taskName] = {}
        workflowData['retryData'][taskName][count] = row['value']

    output = self.fwjrdatabase.loadView("FWJRDump", "outputByWorkflowName",
                                        options={"group_level": 2,
                                                 "startkey": [workflowName],
                                                 "endkey": [workflowName, {}],
                                                 "group": True})['rows']
    outputListStr = self.fwjrdatabase.loadList("FWJRDump", "workflowOutputTaskMapping",
                                               "outputByWorkflowName",
                                               options={"startkey": [workflowName],
                                                        "endkey": [workflowName, {}],
                                                        "reduce": False})
    outputList = json.loads(outputListStr)

    perf = self.handleCouchPerformance(workflowName=workflowName)
    workflowData['performance'] = {}
    for key in perf:
        workflowData['performance'][key] = {}
        for attr in perf[key].keys():
            workflowData['performance'][key][attr] = perf[key][attr]

    workflowData["_id"] = workflowName
    try:
        workflowData["ACDCServer"] = sanitizeURL(self.config.ACDC.couchurl)['url']
        workflowData["ACDCDatabase"] = self.config.ACDC.database
    except AttributeError, ex:
        # We're missing the ACDC info.
        # Keep going
        logging.error("ACDC info missing from config. Skipping this step in the workflow summary.")
        logging.error("Error: %s" % str(ex))
def __init__(self, config):
    """
    Initialise class members
    """
    BaseWorkerThread.__init__(self)

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.config = config
    self.jobCacheDir = self.config.JobCreator.jobCacheDir

    if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
        # Get workqueue setup from config unless overridden
        if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
            self.workQueue = localQueue(**self.config.TaskArchiver.WorkQueueParams)
        else:
            from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
            self.workQueue = queueFromConfig(self.config)
    else:
        self.workQueue = None

    self.maxProcessSize = getattr(self.config.TaskArchiver, 'maxProcessSize', 250)
    self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
    self.nOffenders = getattr(self.config.TaskArchiver, 'nOffenders', 3)
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
    self.uploadPublishInfo = getattr(self.config.TaskArchiver, 'uploadPublishInfo', False)
    self.uploadPublishDir = getattr(self.config.TaskArchiver, 'uploadPublishDir', None)
    self.userFileCacheURL = getattr(self.config.TaskArchiver, 'userFileCacheURL', None)

    # Set up optional histograms
    self.histogramKeys = getattr(self.config.TaskArchiver, "histogramKeys", [])
    self.histogramBins = getattr(self.config.TaskArchiver, "histogramBins", 10)
    self.histogramLimit = getattr(self.config.TaskArchiver, "histogramLimit", 5.0)

    if not self.useReqMgrForCompletionCheck:
        # sets the local monitor summary couch db
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBWriter = self.wmstatsCouchDB
    else:
        self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)

    # Start a couch server for getting job info
    # from the FWJRs for committal to archive
    try:
        workDBName = getattr(self.config.TaskArchiver, 'workloadSummaryCouchDBName', 'workloadsummary')
        workDBurl = getattr(self.config.TaskArchiver, 'workloadSummaryCouchURL')
        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.workCouchdb = CouchServer(workDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.workdatabase = self.workCouchdb.connectDatabase(workDBName)
        logging.debug("Using url %s/%s for job" % (jobDBurl, jobDBName))
        logging.debug("Writing to %s/%s for workloadSummary" % (sanitizeURL(workDBurl)['url'], workDBName))
        self.requireCouch = getattr(self.config.TaskArchiver, 'requireCouch', False)
    except Exception, ex:
        msg = "Error in connecting to couch.\n"
        msg += str(ex)
        logging.error(msg)
        self.jobsdatabase = None
        self.fwjrdatabase = None
        if getattr(self.config.TaskArchiver, 'requireCouch', False):
            raise TaskArchiverPollerException(msg)
def availableWork(self, thresholds, siteJobCounts, teams=None, wfs=None):
    """
    Get work which is available to be run

    Assumes thresholds is a dictionary: keys are site names, values are
    the maximum number of running jobs at that site.

    Assumes siteJobCounts is a dictionary of dictionaries: keyed first by
    site name, then by task priority; the value is the number of jobs
    running at that priority.
    """
    self.logger.info("Getting available work from %s/%s" %
                     (sanitizeURL(self.server.url)['url'], self.db.name))
    elements = []

    # We used to pre-filter sites, looking to see if there are idle job slots.
    # We don't do this anymore, as we may over-allocate jobs to sites
    # if the new jobs have a higher priority.

    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds are set: please check")
        return elements, thresholds, siteJobCounts

    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if teams:
        options['teams'] = teams
        self.logger.info("setting teams %s" % teams)
    if wfs:
        result = []
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
        # sort final list
        result.sort(key=lambda x: x['WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement']['Priority'])
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    if len(result) == 0:
        self.logger.info("No available work in WQ or it didn't pass the workqueue "
                         "restrictions - check pileup, site white list, etc.")
    self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))

    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions. Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        prio = element['Priority']

        possibleSite = None
        sites = thresholds.keys()
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                prio = element['Priority']
                curJobCount = sum(map(lambda x: x[1] if x[0] >= prio else 0,
                                      siteJobCounts.get(site, {}).items()))
                self.logger.debug("Job Count: %s, site: %s, thresholds: %s" %
                                  (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break

        if possibleSite:
            elements.append(element)
            if site not in siteJobCounts:
                siteJobCounts[site] = {}
            siteJobCounts[site][prio] = siteJobCounts[site].setdefault(prio, 0) + element['Jobs']
        else:
            self.logger.info("No possible site for %s" % element)

    # sort elements to get them in priority-first and then timestamp order
    elements.sort(key=lambda element: element['CreationTime'])
    elements.sort(key=lambda x: x['Priority'], reverse=True)

    return elements, thresholds, siteJobCounts
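# A minimal sketch of the data shapes availableWork() expects, with
# hypothetical numbers. thresholds maps site -> max running jobs;
# siteJobCounts maps site -> {priority: jobs running at that priority}.
thresholds = {'T1_US_FNAL': 100, 'T2_CH_CERN': 50}
siteJobCounts = {'T1_US_FNAL': {500000: 80, 100000: 40}}

# The per-site check only counts jobs at or above the element's priority:
prio = 500000
curJobCount = sum(n for p, n in siteJobCounts.get('T1_US_FNAL', {}).items() if p >= prio)
# curJobCount == 80, below the threshold of 100, so this site can take the work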
def __init__(self, config):
    """
    Initialise class members
    """
    BaseWorkerThread.__init__(self)

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.config = config
    self.jobCacheDir = self.config.JobCreator.jobCacheDir

    if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
        # Get workqueue setup from config unless overridden
        if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
            self.workQueue = localQueue(**self.config.TaskArchiver.WorkQueueParams)
        else:
            from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
            self.workQueue = queueFromConfig(self.config)
    else:
        self.workQueue = None

    self.maxProcessSize = getattr(self.config.TaskArchiver, 'maxProcessSize', 250)
    self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
    self.nOffenders = getattr(self.config.TaskArchiver, 'nOffenders', 3)
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
    self.uploadPublishInfo = getattr(self.config.TaskArchiver, 'uploadPublishInfo', False)
    self.uploadPublishDir = getattr(self.config.TaskArchiver, 'uploadPublishDir', None)
    self.userFileCacheURL = getattr(self.config.TaskArchiver, 'userFileCacheURL', None)

    # Set up optional histograms
    self.histogramKeys = getattr(self.config.TaskArchiver, "histogramKeys", [])
    self.histogramBins = getattr(self.config.TaskArchiver, "histogramBins", 10)
    self.histogramLimit = getattr(self.config.TaskArchiver, "histogramLimit", 5.0)

    if not self.useReqMgrForCompletionCheck:
        # sets the local monitor summary couch db
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)

    # Start a couch server for getting job info
    # from the FWJRs for committal to archive
    try:
        workDBName = getattr(self.config.TaskArchiver, 'workloadSummaryCouchDBName', 'workloadsummary')
        workDBurl = getattr(self.config.TaskArchiver, 'workloadSummaryCouchURL')
        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.workCouchdb = CouchServer(workDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.workdatabase = self.workCouchdb.connectDatabase(workDBName)
        logging.debug("Using url %s/%s for job" % (jobDBurl, jobDBName))
        logging.debug("Writing to %s/%s for workloadSummary" % (sanitizeURL(workDBurl)['url'], workDBName))
        self.requireCouch = getattr(self.config.TaskArchiver, 'requireCouch', False)
    except Exception, ex:
        msg = "Error in connecting to couch.\n"
        msg += str(ex)
        logging.error(msg)
        self.jobsdatabase = None
        self.fwjrdatabase = None
        if getattr(self.config.TaskArchiver, 'requireCouch', False):
            raise TaskArchiverPollerException(msg)
def availableWork(self, thresholds, siteJobCounts, teams=None, wfs=None):
    """
    Get work which is available to be run

    Assumes thresholds is a dictionary: keys are site names, values are
    the maximum number of running jobs at that site.

    Assumes siteJobCounts is a dictionary of dictionaries: keyed first by
    site name, then by task priority; the value is the number of jobs
    running at that priority.
    """
    self.logger.info("Getting available work from %s/%s" %
                     (sanitizeURL(self.server.url)['url'], self.db.name))
    elements = []

    # We used to pre-filter sites, looking to see if there are idle job slots.
    # We don't do this anymore, as we may over-allocate jobs to sites
    # if the new jobs have a higher priority.

    # If there are no sites, punt early.
    if not thresholds:
        self.logger.error("No thresholds are set: please check")
        return elements, thresholds, siteJobCounts

    options = {}
    options['include_docs'] = True
    options['descending'] = True
    options['resources'] = thresholds
    if teams:
        options['teams'] = teams
        self.logger.info("setting teams %s" % teams)
    if wfs:
        result = []
        for i in xrange(0, len(wfs), 20):
            options['wfs'] = wfs[i:i + 20]
            data = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result.extend(json.loads(data))
        # sort final list
        result.sort(key=lambda x: x['WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement']['Priority'])
    else:
        result = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
        result = json.loads(result)
    if len(result) == 0:
        self.logger.info("No available work in WQ or it didn't pass the workqueue "
                         "restrictions - check pileup, site white list, etc.")
    self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))

    # Iterate through the results; apply whitelist / blacklist / data
    # locality restrictions. Only assign jobs if they are high enough
    # priority.
    for i in result:
        element = CouchWorkQueueElement.fromDocument(self.db, i)
        prio = element['Priority']

        possibleSite = None
        sites = thresholds.keys()
        random.shuffle(sites)
        for site in sites:
            if element.passesSiteRestriction(site):
                # Count the number of jobs currently running of greater priority
                prio = element['Priority']
                curJobCount = sum(map(lambda x: x[1] if x[0] >= prio else 0,
                                      siteJobCounts.get(site, {}).items()))
                self.logger.debug("Job Count: %s, site: %s, thresholds: %s" %
                                  (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break

        if possibleSite:
            self.logger.debug("Possible site exists %s" % str(possibleSite))
            elements.append(element)
            if site not in siteJobCounts:
                siteJobCounts[site] = {}
            siteJobCounts[site][prio] = siteJobCounts[site].setdefault(prio, 0) + \
                                        element['Jobs'] * element.get('blowupFactor', 1.0)
        else:
            self.logger.info("No possible site for %s" % element['RequestName'])

    # sort elements to get them in priority-first and then timestamp order
    elements.sort(key=lambda element: element['CreationTime'])
    elements.sort(key=lambda x: x['Priority'], reverse=True)

    return elements, thresholds, siteJobCounts
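# A minimal sketch of the two-pass sort used at the end of availableWork():
# Python's sort is stable, so sorting by CreationTime first and then by
# Priority (descending) yields elements ordered by priority, with ties
# broken by creation time.
elems = [{'Priority': 1, 'CreationTime': 30},
         {'Priority': 2, 'CreationTime': 20},
         {'Priority': 2, 'CreationTime': 10}]
elems.sort(key=lambda e: e['CreationTime'])
elems.sort(key=lambda e: e['Priority'], reverse=True)
# -> Priority 2 (CreationTime 10), Priority 2 (CreationTime 20), Priority 1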
def __init__(self, config):
    """
    __init__

    Create all DAO objects that are used by this class.
    """
    WMConnectionBase.__init__(self, "WMCore.WMBS")
    myThread = threading.currentThread()
    self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                    logger=myThread.logger,
                                    dbinterface=myThread.dbi)

    self.getOutputMapAction = self.daofactory(classname="Jobs.GetOutputMap")
    self.bulkAddToFilesetAction = self.daofactory(classname="Fileset.BulkAddByLFN")
    self.bulkParentageAction = self.daofactory(classname="Files.AddBulkParentage")
    self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
    self.getParentInfoAction = self.daofactory(classname="Files.GetParentInfo")
    self.setParentageByJob = self.daofactory(classname="Files.SetParentageByJob")
    self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
    self.setFileLocation = self.daofactory(classname="Files.SetLocationByLFN")
    self.setFileAddChecksum = self.daofactory(classname="Files.AddChecksumByLFN")
    self.addFileAction = self.daofactory(classname="Files.Add")
    self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
    self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
    self.getWorkflowSpec = self.daofactory(classname="Workflow.GetSpecAndNameFromTask")
    self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
    self.getFullJobInfo = self.daofactory(classname="Jobs.LoadForErrorHandler")

    self.dbsStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.SetStatus")
    self.dbsParentStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetParentStatus")
    self.dbsChildrenAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetChildren")
    self.dbsCreateFiles = self.dbsDaoFactory(classname="DBSBufferFiles.Add")
    self.dbsSetLocation = self.dbsDaoFactory(classname="DBSBufferFiles.SetLocationByLFN")
    self.dbsInsertLocation = self.dbsDaoFactory(classname="DBSBufferFiles.AddLocation")
    self.dbsSetChecksum = self.dbsDaoFactory(classname="DBSBufferFiles.AddChecksumByLFN")
    self.dbsSetRunLumi = self.dbsDaoFactory(classname="DBSBufferFiles.AddRunLumi")
    self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")
    self.dbsLFNHeritage = self.dbsDaoFactory(classname="DBSBufferFiles.BulkHeritageParent")

    self.stateChanger = ChangeState(config)

    # Decide whether or not to attach jobReport to returned value
    self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

    # Store location for the specs for DBS
    self.specDir = getattr(config.JobAccountant, 'specDir', None)

    # ACDC service
    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
    jobDBName = config.JobStateMachine.couchDBName
    jobCouchdb = CouchServer(jobDBurl)
    self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
    self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL)

    # Hold data for later committal
    self.dbsFilesToCreate = []
    self.wmbsFilesToBuild = []
    self.fileLocation = None
    self.mergedOutputFiles = []
    self.listOfJobsToSave = []
    self.listOfJobsToFail = []
    self.filesetAssoc = []
    self.parentageBinds = []
    self.jobsWithSkippedFiles = {}
    self.count = 0
    self.datasetAlgoID = collections.deque(maxlen=1000)
    self.datasetAlgoPaths = collections.deque(maxlen=1000)
    self.dbsLocations = collections.deque(maxlen=1000)
    self.workflowIDs = collections.deque(maxlen=1000)
    self.workflowPaths = collections.deque(maxlen=1000)

    self.phedex = PhEDEx()
    self.locLists = self.phedex.getNodeMap()

    return
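# A minimal sketch of the bounded caches built above: a deque with maxlen
# silently evicts its oldest entry once full, so membership checks stay cheap
# without the cache growing without bound.
import collections

recentIDs = collections.deque(maxlen=3)
for algoID in [1, 2, 3, 4]:
    if algoID not in recentIDs:
        recentIDs.append(algoID)
# recentIDs is now deque([2, 3, 4], maxlen=3); 1 was evicted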
def __init__(self, couchURL, dbName=None):
    couchURL = sanitizeURL(couchURL)['url']
    # set the connection for the local couchDB call
    self._commonInit(couchURL, dbName)
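# A minimal sketch of what sanitizeURL() is relied on for throughout this
# code, based on how it is used here rather than on its full contract: it
# returns a dict whose 'url' entry has any user:password credentials
# stripped, so URLs can be logged or stored safely.
from WMCore.Lexicon import sanitizeURL  # assumed import path

clean = sanitizeURL('http://user:secret@localhost:5984')['url']
# clean is expected to be 'http://localhost:5984', credentials removed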
def testA_BasicFunctionTest(self):
    """
    _BasicFunctionTest_

    Tests the components by seeing if they can process a simple set of closeouts
    """
    myThread = threading.currentThread()

    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, "specDir", "spec.pkl")
    workload = self.createWorkload(workloadName=workloadPath)
    testJobGroup = self.createTestJobGroup(config=config,
                                           name=workload.name(),
                                           specLocation=workloadPath,
                                           error=False)

    # Create second workload
    testJobGroup2 = self.createTestJobGroup(config=config,
                                            name="%s_2" % workload.name(),
                                            specLocation=workloadPath,
                                            task="/TestWorkload/ReReco/LogCollect")

    cachePath = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    cachePath2 = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "LogCollect")
    os.makedirs(cachePath2)
    self.assertTrue(os.path.exists(cachePath2))

    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 2)

    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    result = myThread.dbi.processData("SELECT * FROM wmbs_job")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_jobgroup")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
    result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
    self.assertEqual(len(result), 0)

    # Make sure we deleted the directory
    self.assertFalse(os.path.exists(cachePath))
    self.assertFalse(os.path.exists(os.path.join(self.testDir, "workloadTest/TestWorkload")))

    testWMBSFileset = Fileset(id=1)
    self.assertEqual(testWMBSFileset.exists(), False)

    dbname = getattr(config.JobStateMachine, "couchDBName")
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase(dbname)

    workloadSummary = workdatabase.document(id="TestWorkload")

    # Check ACDC
    self.assertEqual(workloadSummary["ACDCServer"], sanitizeURL(config.ACDC.couchurl)["url"])

    # Check the output
    self.assertEqual(workloadSummary["output"].keys(),
                     ["/Electron/MorePenguins-v0/RECO", "/Electron/MorePenguins-v0/ALCARECO"])

    # Check performance
    # Check histograms
    self.assertEqual(workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"]["AvgEventTime"]["histogram"][0]["average"],
                     0.062651899999999996)
    self.assertEqual(workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"]["AvgEventTime"]["histogram"][0]["nEvents"],
                     5)

    # Check standard performance
    self.assertEqual(workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"]["TotalJobCPU"]["average"],
                     9.4950600000000005)
    self.assertEqual(workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"]["TotalJobCPU"]["stdDev"],
                     8.2912400000000002)

    # Check worstOffenders
    self.assertEqual(workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"]["AvgEventTime"]["worstOffenders"],
                     [{"logCollect": None, "log": None, "value": "0.894052", "jobID": 1},
                      {"logCollect": None, "log": None, "value": "0.894052", "jobID": 2},
                      {"logCollect": None, "log": None, "value": "0.894052", "jobID": 3}])

    # LogCollect task is made out of identical FWJRs
    # assert that it is identical
    for x in workloadSummary["performance"]["/TestWorkload/ReReco/LogCollect"]["cmsRun1"].keys():
        if x in config.TaskArchiver.histogramKeys:
            continue
        for y in ["average", "stdDev"]:
            self.assertEqual(workloadSummary["performance"]["/TestWorkload/ReReco/LogCollect"]["cmsRun1"][x][y],
                             workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"][x][y])

    return
def __init__(self, config):
    """
    __init__

    Create all DAO objects that are used by this class.
    """
    WMConnectionBase.__init__(self, "WMCore.WMBS")
    myThread = threading.currentThread()
    self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                    logger=myThread.logger,
                                    dbinterface=myThread.dbi)

    self.getOutputMapAction = self.daofactory(classname="Jobs.GetOutputMap")
    self.bulkAddToFilesetAction = self.daofactory(classname="Fileset.BulkAddByLFN")
    self.bulkParentageAction = self.daofactory(classname="Files.AddBulkParentage")
    self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
    self.getParentInfoAction = self.daofactory(classname="Files.GetParentInfo")
    self.setParentageByJob = self.daofactory(classname="Files.SetParentageByJob")
    self.setParentageByMergeJob = self.daofactory(classname="Files.SetParentageByMergeJob")
    self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
    self.setFileLocation = self.daofactory(classname="Files.SetLocationByLFN")
    self.setFileAddChecksum = self.daofactory(classname="Files.AddChecksumByLFN")
    self.addFileAction = self.daofactory(classname="Files.Add")
    self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
    self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
    self.getWorkflowSpec = self.daofactory(classname="Workflow.GetSpecAndNameFromTask")
    self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
    self.getFullJobInfo = self.daofactory(classname="Jobs.LoadForErrorHandler")
    self.getJobTaskNameAction = self.daofactory(classname="Jobs.GetFWJRTaskName")
    self.pnn_to_psn = self.daofactory(classname="Locations.GetPNNtoPSNMapping").execute()

    self.dbsStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.SetStatus")
    self.dbsParentStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetParentStatus")
    self.dbsChildrenAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetChildren")
    self.dbsCreateFiles = self.dbsDaoFactory(classname="DBSBufferFiles.Add")
    self.dbsSetLocation = self.dbsDaoFactory(classname="DBSBufferFiles.SetLocationByLFN")
    self.dbsInsertLocation = self.dbsDaoFactory(classname="DBSBufferFiles.AddLocation")
    self.dbsSetChecksum = self.dbsDaoFactory(classname="DBSBufferFiles.AddChecksumByLFN")
    self.dbsSetRunLumi = self.dbsDaoFactory(classname="DBSBufferFiles.AddRunLumi")
    self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")
    self.dbsLFNHeritage = self.dbsDaoFactory(classname="DBSBufferFiles.BulkHeritageParent")

    self.stateChanger = ChangeState(config)

    # Decide whether or not to attach jobReport to returned value
    self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

    # Store location for the specs for DBS
    self.specDir = getattr(config.JobAccountant, 'specDir', None)

    # Maximum RAW EDM size for Repack output before the data is put into
    # an Error dataset and skips PromptReco
    self.maxAllowedRepackOutputSize = getattr(config.JobAccountant, 'maxAllowedRepackOutputSize',
                                              12 * 1024 * 1024 * 1024)

    # ACDC service
    self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                database=config.ACDC.database)

    jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
    jobDBName = config.JobStateMachine.couchDBName
    jobCouchdb = CouchServer(jobDBurl)
    self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
    self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent")

    # Hold data for later committal
    self.dbsFilesToCreate = []
    self.wmbsFilesToBuild = []
    self.wmbsMergeFilesToBuild = []
    self.fileLocation = None
    self.mergedOutputFiles = []
    self.listOfJobsToSave = []
    self.listOfJobsToFail = []
    self.filesetAssoc = []
    self.parentageBinds = []
    self.parentageBindsForMerge = []
    self.jobsWithSkippedFiles = {}
    self.count = 0
    self.datasetAlgoID = collections.deque(maxlen=1000)
    self.datasetAlgoPaths = collections.deque(maxlen=1000)
    self.dbsLocations = set()
    self.workflowIDs = collections.deque(maxlen=1000)
    self.workflowPaths = collections.deque(maxlen=1000)

    self.phedex = PhEDEx()
    self.locLists = self.phedex.getNodeMap()

    return
def archiveWorkflowSummary(self, spec):
    """
    _archiveWorkflowSummary_

    For each workflow pull its information from couch and WMBS and turn it into
    a summary for archiving
    """
    failedJobs = []
    workflowData = {'retryData': {}}
    workflowName = spec.name()

    # First make sure that we didn't upload something already.
    # Could be that the WMBS deletion failed, so we can skip this
    # if there is a summary already up there.
    # TODO: With multiple agents sharing workflows, we will need to
    # differentiate and combine summaries for a request
    if self.workdatabase.documentExists(workflowName):
        logging.info("Couch summary for %s already exists, proceeding only with cleanup" % workflowName)
        return

    # Set campaign
    workflowData['campaign'] = spec.getCampaign()

    # Get a list of failed job IDs
    # Make sure you get it for ALL tasks in the spec
    for taskName in spec.listAllTaskPathNames():
        failedTmp = self.jobsdatabase.loadView("JobDump", "failedJobsByWorkflowName",
                                               options={"startkey": [workflowName, taskName],
                                                        "endkey": [workflowName, taskName]})['rows']
        for entry in failedTmp:
            failedJobs.append(entry['value'])

    retryData = self.jobsdatabase.loadView("JobDump", "retriesByTask",
                                           options={'group_level': 3,
                                                    'startkey': [workflowName],
                                                    'endkey': [workflowName, {}]})['rows']
    for row in retryData:
        taskName = row['key'][2]
        count = str(row['key'][1])
        if taskName not in workflowData['retryData']:
            workflowData['retryData'][taskName] = {}
        workflowData['retryData'][taskName][count] = row['value']

    output = self.fwjrdatabase.loadView("FWJRDump", "outputByWorkflowName",
                                        options={"group_level": 2,
                                                 "startkey": [workflowName],
                                                 "endkey": [workflowName, {}],
                                                 "group": True})['rows']

    perf = self.handleCouchPerformance(workflowName=workflowName)
    workflowData['performance'] = {}
    for key in perf:
        workflowData['performance'][key] = {}
        for attr in perf[key].keys():
            workflowData['performance'][key][attr] = perf[key][attr]

    workflowData["_id"] = workflowName
    try:
        workflowData["ACDCServer"] = sanitizeURL(self.config.ACDC.couchurl)['url']
        workflowData["ACDCDatabase"] = self.config.ACDC.database
    except AttributeError, ex:
        # We're missing the ACDC info.
        # Keep going
        logging.error("ACDC info missing from config. Skipping this step in the workflow summary.")
        logging.error("Error: %s" % str(ex))
def recordInCouch(self, jobs, newstate, oldstate, updatesummary=False):
    """
    _recordInCouch_

    Record relevant job information in couch. If the job does not yet exist
    in couch it will be saved as a separate document.  If the job has a FWJR
    attached that will be saved as a separate document.
    """
    if not self._connectDatabases():
        logging.error('Databases not connected properly')
        return

    timestamp = int(time.time())
    couchRecordsToUpdate = []

    for job in jobs:
        couchDocID = job.get("couch_record", None)

        if newstate == "new":
            oldstate = "none"

        if job.get("site_cms_name", None):
            if newstate == "executing":
                jobLocation = job["site_cms_name"]
            else:
                jobLocation = "Agent"
        else:
            jobLocation = "Agent"

        if couchDocID is None:
            jobDocument = {}
            jobDocument["_id"] = str(job["id"])
            job["couch_record"] = jobDocument["_id"]
            jobDocument["jobid"] = job["id"]
            jobDocument["workflow"] = job["workflow"]
            jobDocument["task"] = job["task"]
            jobDocument["owner"] = job["owner"]

            jobDocument["inputfiles"] = []
            for inputFile in job["input_files"]:
                docInputFile = inputFile.json()
                docInputFile["parents"] = []
                for parent in inputFile["parents"]:
                    docInputFile["parents"].append({"lfn": parent["lfn"]})
                jobDocument["inputfiles"].append(docInputFile)

            jobDocument["states"] = {"0": {"oldstate": oldstate,
                                           "newstate": newstate,
                                           "location": jobLocation,
                                           "timestamp": timestamp}}

            jobDocument["jobgroup"] = job["jobgroup"]
            jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                   "LastEvent": job["mask"]["LastEvent"],
                                   "FirstLumi": job["mask"]["FirstLumi"],
                                   "LastLumi": job["mask"]["LastLumi"],
                                   "FirstRun": job["mask"]["FirstRun"],
                                   "LastRun": job["mask"]["LastRun"]}

            if job['mask']['runAndLumis'] != {}:
                # Then we have to save the mask runAndLumis
                jobDocument['mask']['runAndLumis'] = {}
                for key in job['mask']['runAndLumis'].keys():
                    jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

            jobDocument["name"] = job["name"]
            jobDocument["type"] = "job"
            jobDocument["user"] = job.get("user", None)
            jobDocument["group"] = job.get("group", None)
            jobDocument["taskType"] = job.get("taskType", "Unknown")
            jobDocument["jobType"] = job.get("jobType", "Unknown")

            couchRecordsToUpdate.append({"jobid": job["id"],
                                         "couchid": jobDocument["_id"]})
            self.jobsdatabase.queue(jobDocument, callback=discardConflictingDocument)
        else:
            # We send a PUT request to the stateTransition update handler.
            # Couch expects the parameters to be passed as arguments in the
            # URI, while the Requests class will only encode arguments this
            # way for GET requests. Changing the Requests class to encode PUT
            # arguments as couch expects broke a bunch of code, so we'll just
            # do our own encoding here.
            updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
            updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate, newstate,
                                                                                jobLocation, timestamp)
            self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)

        # update the status of the summary doc only when it is explicitly
        # requested (the doc is already in couch)
        if updatesummary:
            jobSummaryId = job["name"]
            updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobSummaryState/" + jobSummaryId

            # map retrydone state to jobfailed state for monitoring
            if newstate == "retrydone":
                monitorState = "jobfailed"
            else:
                monitorState = newstate
            updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
            self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
            logging.debug("Updated job summary status for job %s", jobSummaryId)

            updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobStateTransition/" + jobSummaryId
            updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate, monitorState,
                                                                                job["location"], timestamp)
            self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
            logging.debug("Updated job summary state history for job %s", jobSummaryId)

        if job.get("fwjr", None):
            cachedByWorkflow = self.workloadCache.setdefault(
                job['workflow'],
                getDataFromSpecFile(self.getWorkflowSpecDAO.execute(job['task'])[job['task']]['spec']))
            job['fwjr'].setCampaign(cachedByWorkflow.get('Campaign', ''))
            job['fwjr'].setPrepID(cachedByWorkflow.get(job['task'], ''))

            # If there are too many input files, strip them out of the FWJR,
            # as they should already be in the database.
            # This is not critical
            try:
                if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                    job['fwjr'].stripInputFiles()
            except Exception as ex:
                logging.error("Error while trying to strip input files from FWJR. Ignoring. : %s", str(ex))

            if newstate == "retrydone":
                jobState = "jobfailed"
            else:
                jobState = newstate

            # There is a race condition between updating the couch record
            # location and the job completing; for a fast-failing job the
            # location update could be missed.
            job["location"] = job["fwjr"].getSiteName() or job.get("location", "Unknown")

            # complete the fwjr document
            job["fwjr"].setTaskName(job["task"])
            jsonFWJR = job["fwjr"].__to_json__(None)

            # Don't archive cleanup job reports
            if job["jobType"] == "Cleanup":
                archStatus = "skip"
            else:
                archStatus = "ready"

            fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                            "jobid": job["id"],
                            "jobtype": job["jobType"],
                            "jobstate": jobState,
                            "retrycount": job["retry_count"],
                            "archivestatus": archStatus,
                            "fwjr": jsonFWJR,
                            "type": "fwjr"}
            self.fwjrdatabase.queue(fwjrDocument, timestamp=True, callback=discardConflictingDocument)

            updateSummaryDB(self.statsumdatabase, job)

            # TODO: could add a config switch to turn this on and off
            # if self.config.JobStateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
            if (job["retry_count"] > 0) or (newstate != 'success'):
                jobSummaryId = job["name"]
                # build a summary of the fwjr
                logging.debug("Pushing job summary for job %s", jobSummaryId)
                errmsgs = {}
                inputs = []
                if "steps" in fwjrDocument["fwjr"]:
                    for step in fwjrDocument["fwjr"]["steps"]:
                        if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                            errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                        if "input" in fwjrDocument["fwjr"]["steps"][step] and \
                                "source" in fwjrDocument["fwjr"]["steps"][step]["input"]:
                            inputs.extend([source["runs"]
                                           for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"]
                                           if "runs" in source])

                outputs = []
                outputDataset = None
                for singlestep in job["fwjr"].listSteps():
                    for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                        if singlefile:
                            if len(singlefile.get('locations', set())) > 1:
                                locations = list(singlefile.get('locations'))
                            elif singlefile.get('locations'):
                                locations = singlefile['locations'].pop()
                            else:
                                locations = set()
                            if CMSSTEP.match(singlestep):
                                outType = 'output'
                            else:
                                outType = singlefile.get('module_label', None)
                            outputs.append({'type': outType,
                                            'lfn': singlefile.get('lfn', None),
                                            'location': locations,
                                            'checksums': singlefile.get('checksums', {}),
                                            'size': singlefile.get('size', None)})
                            # it should have one output dataset for all the files
                            outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset

                inputFiles = []
                for inputFileStruct in job["fwjr"].getAllInputFiles():
                    # check if inputFileSummary needs to be extended
                    inputFileSummary = {}
                    inputFileSummary["lfn"] = inputFileStruct["lfn"]
                    inputFileSummary["input_type"] = inputFileStruct["input_type"]
                    inputFiles.append(inputFileSummary)

                # Don't record the intermediate jobfailed status in the job summary;
                # change it to jobcooloff, which will be overwritten by the error handler anyway
                if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                    summarystate = 'jobcooloff'
                else:
                    summarystate = newstate

                jobSummary = {"_id": jobSummaryId,
                              "wmbsid": job["id"],
                              "type": "jobsummary",
                              "retrycount": job["retry_count"],
                              "workflow": job["workflow"],
                              "task": job["task"],
                              "jobtype": job["jobType"],
                              "state": summarystate,
                              "site": job.get("location", None),
                              "cms_location": job["fwjr"].getSiteName(),
                              "exitcode": job["fwjr"].getExitCode(),
                              "eos_log_url": job["fwjr"].getLogURL(),
                              "worker_node_info": job["fwjr"].getWorkerNodeInfo(),
                              "errors": errmsgs,
                              "lumis": inputs,
                              "outputdataset": outputDataset,
                              "inputfiles": inputFiles,
                              "acdc_url": "%s/%s" % (sanitizeURL(self.config.ACDC.couchurl)['url'],
                                                     self.config.ACDC.database),
                              "agent_name": self.config.Agent.hostName,
                              "output": outputs}

                if couchDocID is not None:
                    try:
                        currentJobDoc = self.jsumdatabase.document(id=jobSummaryId)
                        jobSummary['_rev'] = currentJobDoc['_rev']
                        jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                        # record final status transition
                        if newstate == 'success':
                            finalStateDict = {'oldstate': oldstate,
                                              'newstate': newstate,
                                              'location': job["location"],
                                              'timestamp': timestamp}
                            jobSummary['state_history'].append(finalStateDict)

                        noEmptyList = ["inputfiles", "lumis"]
                        for prop in noEmptyList:
                            jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                    except CouchNotFoundError:
                        pass

                self.jsumdatabase.queue(jobSummary, timestamp=True)

    if len(couchRecordsToUpdate) > 0:
        self.setCouchDAO.execute(bulkList=couchRecordsToUpdate,
                                 conn=self.getDBConn(),
                                 transaction=self.existingTransaction())

    self.jobsdatabase.commit(callback=discardConflictingDocument)
    self.fwjrdatabase.commit(callback=discardConflictingDocument)
    self.jsumdatabase.commit()
    return
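# A minimal sketch of the hand-rolled query-string encoding done above for
# the CouchDB update-handler PUTs, using only the standard library. Values
# that may contain reserved characters should really be escaped, which
# urlencode takes care of. The database and document names are hypothetical.
try:
    from urllib import urlencode          # Python 2
except ImportError:
    from urllib.parse import urlencode    # Python 3

params = {'oldstate': 'executing', 'newstate': 'complete',
          'location': 'Agent', 'timestamp': 1352000000}
updateUri = "/jobs/_design/JobDump/_update/stateTransition/1234?" + urlencode(params)
# e.g. /jobs/_design/JobDump/_update/stateTransition/1234?oldstate=executing&...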