def main():
    """
    Find all workqueue elements of a given workflow that are not yet in a
    terminal state and request their cancellation in batches of 100.
    Exits the process with status 0 when done.
    """
    # FIXME update the workflow name here
    wf = "mcremone_task_EXO-RunIISummer15wmLHEGS-04802__v1_T_170811_181808_305"
    print("Looking for problematic inbox elements...")
    wq = WorkQueue("https://cmsweb.cern.ch/couchdb/workqueue")
    print("Workqueue config: server %s and db %s" % (wq.server.url, wq.db.name))

    # elements already in one of these states cannot be cancelled
    nonCancelableElements = ['Done', 'Canceled', 'Failed']
    data = wq.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                          {'startkey': [wf], 'endkey': [wf, {}], 'reduce': False})
    elements = [x['id'] for x in data.get('rows', [])
                if x['key'][1] not in nonCancelableElements]
    print("Found %d elements for wf %s" % (len(elements), wf))

    total = 0
    for eleSlice in grouper(elements, 100):
        try:
            wq.updateElements(*eleSlice, Status='CancelRequested')
        except Exception as ex:
            print("Exception happened, but keep going: %s" % str(ex))
        else:
            # count the real slice size: the last slice is usually shorter
            # than 100, so a fixed "+= 100" would over-report the total
            total += len(eleSlice)
    print("Elements updated: %s" % total)
    print("Done!")
    sys.exit(0)
def main():
    """
    _main_

    Compare the priority of workflows Available in the local workqueue
    against the current RequestPriority in central ReqMgr2 and collect the
    workflows whose priority is out of date.
    """
    # fall back to the standard agent configuration location
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiating central reqmgr and local workqueue
    print "ReqMgr2 URL : %s" % sanitizeURL(config.JobUpdater.reqMgr2Url)['url']
    print "WorkQueue URL: %s and dbname %s" % (sanitizeURL(config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname)
    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl, config.WorkQueueManager.dbname)

    print "\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue"
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = [x for x in workqueue.getAvailableWorkflows()]
    print "Retrieved %d workflows from workqueue" % len(workflowsToCheck)

    for workflow, priority in workflowsToCheck:
        # cache the ReqMgr2 lookup so each workflow is fetched at most once
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
            except Exception, ex:
                print "Couldn't retrieve the priority of request %s" % workflow
                print "Error: %s" % ex
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
    # NOTE(review): block appears truncated here -- workflowsToUpdate is
    # presumably applied to the queue further down; confirm in the full file.
def main():
    """
    Find all workqueue elements of a given workflow that are not yet in a
    terminal state and request their cancellation in batches of 100.
    Exits the process with status 0 when done.
    """
    # FIXME update the workflow name here
    wf = "mcremone_task_EXO-RunIISummer15wmLHEGS-04802__v1_T_170811_181808_305"
    print("Looking for problematic inbox elements...")
    wq = WorkQueue("https://cmsweb.cern.ch/couchdb/workqueue")
    print("Workqueue config: server %s and db %s" % (wq.server.url, wq.db.name))

    # elements already in one of these states cannot be cancelled
    nonCancelableElements = ['Done', 'Canceled', 'Failed']
    data = wq.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                          {'startkey': [wf], 'endkey': [wf, {}], 'reduce': False})
    elements = [x['id'] for x in data.get('rows', [])
                if x['key'][1] not in nonCancelableElements]
    print("Found %d elements for wf %s" % (len(elements), wf))

    total = 0
    for eleSlice in grouper(elements, 100):
        try:
            wq.updateElements(*eleSlice, Status='CancelRequested')
        except Exception as ex:
            print("Exception happened, but keep going: %s" % str(ex))
        else:
            # count the real slice size: the last slice is usually shorter
            # than 100, so a fixed "+= 100" would over-report the total
            total += len(eleSlice)
    print("Elements updated: %s" % total)
    print("Done!")
    sys.exit(0)
def __init__(self, app, api, config, mount):
    """
    Set up the CouchDB handle for the request database and the global
    workqueue service used when validating requests.
    """
    RESTEntity.__init__(self, app, api, config, mount)
    # main CouchDB database where requests/workloads are stored
    self.reqmgr_db = api.db_handler.get_db(config.couch_reqmgr_db)
    self.reqmgr_db_service = RequestDBWriter(self.reqmgr_db, couchapp="ReqMgr")
    # this is needed for the post validation
    self.gq_service = WorkQueue(config.couch_host, config.couch_workqueue_db)
def testCompletedWorkflow(self):
    """
    Queue work for a ReReco spec, cancel all of its elements and verify
    the workflow is reported as completed with the expected job counts.
    """
    # test getWork
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(
        specName, "file",
        assignKwargs={'SiteWhitelist': ['T2_XX_SiteA']})
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl,
                          UnittestFlag=True, **self.queueParams)
    self.assertTrue(globalQ.queueWork(specUrl, specName, "teamA") > 0)
    wqApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    wqApi.defaultOptions = {'reduce': True, 'group': True}
    # This only checks minimum client call not exactly correctness of return
    # values.
    self.assertEqual(wqApi.getTopLevelJobsByRequest(),
                     [{'total_jobs': 339, 'request_name': specName}])
    results = wqApi.getJobsByStatus()
    self.assertEqual(results['Available']['sum_jobs'], 339)
    results = wqApi.getJobsByStatusAndPriority()
    resultsPrio = set([item['priority'] for item in results.get('Available')])
    self.assertItemsEqual(resultsPrio, [8000])
    resultsJobs = sum([item['sum_jobs'] for item in results.get('Available')
                       if item['priority'] == 8000])
    # was assertTrue(resultsJobs, 339): assertTrue treats 339 as the failure
    # message and never compares it, so the count went unchecked -- use
    # assertEqual to actually verify the job count
    self.assertEqual(resultsJobs, 339)
    result = wqApi.getElementsCountAndJobsByWorkflow()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[specName]['Available']['Jobs'], 339)
    data = wqApi.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                             {'startkey': [specName], 'endkey': [specName, {}],
                              'reduce': False})
    elements = [x['id'] for x in data.get('rows', [])]
    wqApi.updateElements(*elements, Status='Canceled')
    # load this view once again to make sure it will be updated in the next assert..
    data = wqApi.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                             {'startkey': [specName], 'endkey': [specName, {}],
                              'reduce': False})
    self.assertEqual(len(wqApi.getCompletedWorkflow(stale=False)), 1)
    results = wqApi.getJobsByStatusAndPriority()
    resultsPrio = set([item['priority'] for item in results.get('Canceled')])
    self.assertItemsEqual(resultsPrio, [8000])
def getSiteInfoFromGlobalQueue(serviceURL): url, dbName = splitCouchServiceURL(serviceURL) globalQ = WorkQueue(url, dbName) try: queues = globalQ.getChildQueues() except Exception, ex: errorInfo = {} errorInfo['site_name'] = serviceURL return [errorInfo]
def testCompletedWorkflow(self):
    """
    Queue work for a ReReco spec, cancel all of its elements and verify
    the workflow is reported as completed.
    """
    # test getWork
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file")
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl,
                          UnittestFlag=True)
    self.assertTrue(globalQ.queueWork(specUrl, specName, "teamA") > 0)
    wqApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    wqApi.defaultOptions = {'reduce': True, 'group': True}
    # This only checks minimum client call not exactly correctness of return
    # values.
    self.assertEqual(wqApi.getTopLevelJobsByRequest(),
                     [{'total_jobs': 339, 'request_name': specName}])
    results = wqApi.getJobsByStatusAndPriority()
    self.assertEqual(results.keys(), ['Available'])
    self.assertEqual(results['Available'].keys(), [8000])
    # was assertTrue(..., 339): assertTrue treats 339 as the failure message
    # and never compares it, so the sum went unchecked -- use assertEqual
    self.assertEqual(results['Available'][8000]['sum'], 339)
    result = wqApi.getElementsCountAndJobsByWorkflow()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[specName]['Available']['Jobs'], 339)
    data = wqApi.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                             {'startkey': [specName], 'endkey': [specName, {}],
                              'reduce': False})
    elements = [x['id'] for x in data.get('rows', [])]
    wqApi.updateElements(*elements, Status='Canceled')
    # load this view once again to make sure it will be updated in the next assert..
    data = wqApi.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus',
                             {'startkey': [specName], 'endkey': [specName, {}],
                              'reduce': False})
    self.assertEqual(len(wqApi.getCompletedWorkflow(stale=False)), 1)
    self.assertEqual(wqApi.getJobsByStatusAndPriority().keys(), ['Canceled'])
def getRequestInfoFromLocalQueue(serviceURL):
    """
    get the request info from local queue
    """
    url, dbName = splitCouchServiceURL(serviceURL)
    service = WorkQueue(url, dbName)
    try:
        wmbsUrls = service.getWMBSUrl()
        jobStatusInfo = service.getJobInjectStatusByRequest()
    except Exception, ex:
        # report the unreachable service through the standard error format
        logging.error("%s: %s" % (serviceURL, str(ex)))
        return DFormatter.errorFormatter(serviceURL, "LocalQueue Down")
    # NOTE(review): block appears truncated -- the success path presumably
    # continues below using wmbsUrls/jobStatusInfo; confirm in the full file.
def getSiteInfoFromGlobalQueue(serviceURL):
    """
    Get site-level info from a global workqueue; on failure, log a warning
    and return a minimal error record identifying the service.
    """
    url, dbName = splitCouchServiceURL(serviceURL)
    globalQ = WorkQueue(url, dbName)
    try:
        queues = globalQ.getChildQueues()
    except Exception, ex:
        logging.warning("Error: %s" % str(ex))
        errorInfo = {}
        errorInfo['site_name'] = serviceURL
        return [errorInfo]
    # NOTE(review): block appears truncated -- the success path presumably
    # continues below using `queues`; confirm in the full file.
def getSiteInfoFromLocalQueue(serviceURL): """ get agent status from local agent """ url, dbName = splitCouchServiceURL(serviceURL) wqService = WorkQueue(url, dbName) try: wmbsUrls = wqService.getWMBSUrl() except Exception, ex: errorInfo = {} errorInfo['site_name'] = serviceURL return [errorInfo]
def getAgentInfoFromGlobalQueue(serviceURL): url, dbName = splitCouchServiceURL(serviceURL) globalQ = WorkQueue(url, dbName) try: childQueues = globalQ.getChildQueues() except Exception, ex: errorInfo = {} errorInfo['url'] = serviceURL errorInfo['status'] = "Global Queue down: %s" % serviceURL errorInfo['acdc'] = 'N/A' return [errorInfo]
def getSiteInfoFromLocalQueue(serviceURL):
    """
    get agent status from local agent
    """
    url, dbName = splitCouchServiceURL(serviceURL)
    wqService = WorkQueue(url, dbName)
    try:
        wmbsUrls = wqService.getWMBSUrl()
    except Exception, ex:
        # queue unreachable: log and return a minimal error record
        logging.warning("Error: %s" % str(ex))
        errorInfo = {}
        errorInfo['site_name'] = serviceURL
        return [errorInfo]
    # NOTE(review): block appears truncated -- the success path presumably
    # continues below using wmbsUrls; confirm in the full file.
def getAgentInfoFromGlobalQueue(serviceURL):
    """
    Get agent info from a global workqueue; on failure, log a warning and
    return a record flagging the global queue as down.
    """
    url, dbName = splitCouchServiceURL(serviceURL)
    globalQ = WorkQueue(url, dbName)
    try:
        childQueues = globalQ.getChildQueues()
    except Exception, ex:
        logging.warning("Error: %s" % str(ex))
        errorInfo = {}
        errorInfo['url'] = serviceURL
        errorInfo['status'] = "Global Queue down: %s" % serviceURL
        errorInfo['acdc'] = 'N/A'
        return [errorInfo]
    # NOTE(review): block appears truncated -- the success path presumably
    # continues below using childQueues; confirm in the full file.
def getAgentInfoFromLocalQueue(serviceURL): """ get agent status from local agent """ url, dbName = splitCouchServiceURL(serviceURL) localQ = WorkQueue(url, dbName) try: wmbsUrl = localQ.getWMBSUrl() except Exception, ex: errorInfo = {} errorInfo['url'] = serviceURL errorInfo['status'] = "Local Queue down: %s" % serviceURL errorInfo['acdc'] = 'N/A' return errorInfo
def testWorkQueueService(self):
    """
    Smoke-test the WorkQueue data-service calls right after queueing work
    into a fresh global queue (no child queues are registered yet).
    """
    # test getWork
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file",
                                                  assignKwargs={'SiteWhitelist': ['T2_XX_SiteA']})
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl,
                          UnittestFlag=True)
    self.assertTrue(globalQ.queueWork(specUrl, specName, "teamA") > 0)
    wqApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    wqApi.defaultOptions = {'reduce': True, 'group': True}
    # This only checks minimum client call not exactly correctness of return
    # values.
    self.assertEqual(wqApi.getTopLevelJobsByRequest(),
                     [{'total_jobs': 339, 'request_name': specName}])
    # work still available, so no childQueue
    results = wqApi.getChildQueuesAndStatus()
    self.assertItemsEqual(set([item['agent_name'] for item in results]),
                          ["AgentNotDefined"])
    result = wqApi.getElementsCountAndJobsByWorkflow()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[specName]['Available']['Jobs'], 339)
    results = wqApi.getChildQueuesAndPriority()
    resultsPrio = set([item['priority'] for item in results
                       if item['agent_name'] == "AgentNotDefined"])
    self.assertItemsEqual(resultsPrio, [8000])
    self.assertEqual(wqApi.getWMBSUrl(), [])
    self.assertEqual(wqApi.getWMBSUrlByRequest(), [])
def testWorkQueueService(self):
    """
    Smoke-test the WorkQueue data-service calls right after queueing work
    (no child queues exist yet, hence the None keys in the results).
    """
    # test getWork
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file",
                                                  assignKwargs={'SiteWhitelist': ['T2_XX_SiteA']})
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl,
                          UnittestFlag=True)
    self.assertTrue(globalQ.queueWork(specUrl, specName, "teamA") > 0)
    wqApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    wqApi.defaultOptions = {'reduce': True, 'group': True}
    # This only checks minimum client call not exactly correctness of return
    # values.
    self.assertEqual(wqApi.getTopLevelJobsByRequest(),
                     [{'total_jobs': 339, 'request_name': specName}])
    # work still available, so no childQueue
    self.assertEqual(wqApi.getChildQueuesAndStatus().keys(), [None])
    result = wqApi.getElementsCountAndJobsByWorkflow()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[specName]['Available']['Jobs'], 339)
    self.assertEqual(wqApi.getChildQueuesAndPriority()[None].keys(), [8000])
    self.assertEqual(wqApi.getWMBSUrl(), [])
    self.assertEqual(wqApi.getWMBSUrlByRequest(), [])
def getAgentInfoFromLocalQueue(serviceURL):
    """
    get agent status from local agent
    """
    url, dbName = splitCouchServiceURL(serviceURL)
    localQ = WorkQueue(url, dbName)
    try:
        wmbsUrl = localQ.getWMBSUrl()
    except Exception, ex:
        logging.warning("Error: %s" % str(ex))
        errorInfo = {}
        errorInfo['url'] = serviceURL
        errorInfo['status'] = "Local Queue down: %s" % serviceURL
        errorInfo['acdc'] = 'N/A'
        # NOTE: returns a bare dict here, not a list as the global variant does
        return errorInfo
    # NOTE(review): block appears truncated -- the success path presumably
    # continues below using wmbsUrl; confirm in the full file.
def testWorkQueueService(self):
    """
    Smoke-test the basic WorkQueue data-service calls right after queueing
    work into a fresh global queue.
    """
    # test getWork
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file")
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl)
    # use the specName variable instead of repeating the "RerecoSpec"
    # literal, so the queued name always matches the one asserted below
    self.assertTrue(globalQ.queueWork(specUrl, specName, "teamA") > 0)
    wqApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    wqApi.defaultOptions = {'reduce': True, 'group': True}
    # This only checks minimum client call not exactly correctness of return
    # values.
    self.assertEqual(wqApi.getTopLevelJobsByRequest(),
                     [{'total_jobs': 10, 'request_name': specName}])
    self.assertEqual(wqApi.getChildQueues(), [])
    self.assertEqual(wqApi.getJobStatusByRequest(),
                     [{'status': 'Available', 'jobs': 10, 'request_name': specName}])
    self.assertEqual(wqApi.getChildQueuesByRequest(), [])
    self.assertEqual(wqApi.getWMBSUrl(), [])
    self.assertEqual(wqApi.getWMBSUrlByRequest(), [])
def getRequestInfoFromGlobalQueue(serviceURL):
    """
    get the request info from global queue
    """
    url, dbName = splitCouchServiceURL(serviceURL)
    service = WorkQueue(url, dbName)
    try:
        jobInfo = service.getTopLevelJobsByRequest()
        qInfo = service.getChildQueuesByRequest()
        siteWhitelists = service.getSiteWhitelistByRequest()
        # collect the distinct local queue URLs feeding off this global queue
        childQueueURLs = set()
        for item in qInfo:
            childQueueURLs.add(item['local_queue'])
    except Exception, ex:
        # report the unreachable service through the standard error format
        logging.error("%s: %s" % (serviceURL, str(ex)))
        return DFormatter.errorFormatter(serviceURL, "GlobalQueue Down")
    # NOTE(review): block appears truncated -- the success path presumably
    # continues below using the collected data; confirm in the full file.
def advanceStatus(self, config):
    """
    Advance request statuses according to the state of their global
    workqueue elements, archiving requests with no jobs left.
    """
    requestDB = RequestDBWriter(config.reqmgrdb_url)
    globalQueueSvc = WorkQueue(config.workqueue_url)

    self.logger.info("Getting GQ data for status check")
    statusByWorkflow = globalQueueSvc.getWorkflowStatusFromWQE()

    self.logger.info("Advancing status")
    moveForwardStatus(requestDB, statusByWorkflow, self.logger)
    moveToArchivedForNoJobs(requestDB, statusByWorkflow, self.logger)

    return
def __init__(self, config):
    """
    initialize properties specified from config
    """
    BaseWorkerThread.__init__(self)
    # set the workqueue service for REST call
    self.config = config
    # need to get campaign, user, owner info
    self.agentInfo = initAgentInfo(self.config)
    self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

    proxyArgs = {'logger': logging.getLogger(), 'cleanEnvironment': True}
    self.proxy = Proxy(proxyArgs)
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
    self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
    # credential lifetime warning/error thresholds, in days
    self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                           'certificate': {'error': 10, 'warning': 20}}

    # Monitoring setup
    self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
    self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
    self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
    self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
    self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

    # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
    if hasattr(self.config, "Tier0Feeder"):
        self.isT0agent = True
        self.producer = "tier0wmagent"
    else:
        self.isT0agent = False
        self.producer = "wmagent"

    localWQUrl = config.AnalyticsDataCollector.localQueueURL
    self.workqueueDS = WorkQueueDS(localWQUrl)
def __init__(self, config):
    """
    initialize properties specified from config
    """
    BaseWorkerThread.__init__(self)
    # set the workqueue service for REST call
    self.config = config
    # need to get campaign, user, owner info
    self.agentInfo = initAgentInfo(self.config)
    self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

    proxyArgs = {'logger': logging.getLogger()}
    self.proxy = Proxy(proxyArgs)
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
    self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
    # credential lifetime warning/error thresholds, in days
    self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                           'certificate': {'error': 10, 'warning': 20}}

    # Monitoring setup
    self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
    self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
    self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
    self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
    self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

    # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
    if hasattr(self.config, "Tier0Feeder"):
        self.isT0agent = True
        self.producer = "tier0wmagent"
    else:
        self.isT0agent = False
        self.producer = "wmagent"

    localWQUrl = config.AnalyticsDataCollector.localQueueURL
    self.workqueueDS = WorkQueueDS(localWQUrl)
def test40WorkQueueAcquires(self):
    """WorkQueue picks up request"""
    if not self.__class__.request_name:
        raise nose.SkipTest
    start = time.time()
    # poll until the request appears in a workqueue with active elements,
    # or give up after 20 minutes
    while True:
        workqueue = self.reqmgr.getWorkQueue(request=self.__class__.request_name)
        if workqueue:
            self.__class__.workqueue = WorkQueue(workqueue[0])
            self.__class__.request = self.__class__.reqmgr.getRequest(self.__class__.request_name)
            self.assertTrue(self.__class__.request['RequestStatus'] in ('acquired', 'running'))
            request = [x for x in self.__class__.workqueue.getJobStatusByRequest() if \
                       x['request_name'] == self.__class__.request_name]
            # done once at least one element is in an active state
            if [x for x in request
                    if x['status'] in ('Available', 'Negotiating', 'Acquired', 'Running')]:
                break
        if start + (60 * 20) < time.time():
            raise RuntimeError('timeout waiting for workqueue to acquire')
        time.sleep(15)
def setup(self, parameters): """ set db connection(couchdb, wmbs) to prepare to gather information """ # set the connection to local queue self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL) # set the connection for local couchDB call self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, self.config.JobStateMachine.summaryStatsDBName, self.summaryLevel) # interface to WMBS/BossAir db myThread = threading.currentThread() # set wmagent db data self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger) # set the connection for local couchDB call self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL, "WMStatsAgent") if hasattr(self.config, "Tier0Feeder"): #use local db for tier0 centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL else: centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL, couchapp = self.config.AnalyticsDataCollector.RequestCouchApp) #TODO: change the config to hold couch url self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl) if self.pluginName != None: pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins") self.plugin = pluginFactory.loadObject(classname = self.pluginName)
def setup(self, parameters): """ set db connection(couchdb, wmbs) to prepare to gather information """ # self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL) # set the connection for local couchDB call self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, self.summaryLevel) # interface to WMBS/BossAir db myThread = threading.currentThread() # set wmagent db data self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger) # set the connection for local couchDB call self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL) logging.info("Setting the replication to central monitor ...") self.localSummaryCouchDB.replicate(self.config.AnalyticsDataCollector.centralWMStatsURL) self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL) if self.pluginName != None: pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins") self.plugin = pluginFactory.loadObject(classname = self.pluginName)
def setup(self, parameters): """ set db connection(couchdb, wmbs) to prepare to gather information """ # set the connection to local queue if not hasattr(self.config, "Tier0Feeder"): self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL) # set the connection for local couchDB call self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, self.config.JobStateMachine.summaryStatsDBName, self.summaryLevel) # interface to WMBS/BossAir db myThread = threading.currentThread() # set wmagent db data self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger) # set the connection for local couchDB call self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL, appName="WMStatsAgent") if hasattr(self.config, "Tier0Feeder"): #use local db for tier0 centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL else: centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL, couchapp = self.config.AnalyticsDataCollector.RequestCouchApp) #TODO: change the config to hold couch url self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl) if self.pluginName != None: pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins") self.plugin = pluginFactory.loadObject(classname = self.pluginName)
def __init__(self, app, api, config, mount):
    """
    Set up CouchDB handles for the request database and the auxiliary
    database, plus the global workqueue service used during validation.
    """
    RESTEntity.__init__(self, app, api, config, mount)
    # main CouchDB database where requests/workloads are stored
    self.reqmgr_db = api.db_handler.get_db(config.couch_reqmgr_db)
    self.reqmgr_db_service = RequestDBWriter(self.reqmgr_db, couchapp="ReqMgr")
    # this is needed for the post validation
    self.reqmgr_aux_db = api.db_handler.get_db(config.couch_reqmgr_aux_db)
    self.gq_service = WorkQueue(config.couch_host, config.couch_workqueue_db)
def advanceStatus(self, config):
    """
    Advance the request status based on the global workqueue elements status
    """
    reqMgrService = ReqMgr(config.reqmgr2_url, logger=self.logger)
    globalQueue = WorkQueue(config.workqueue_url)

    self.logger.info("Getting GQ data for status check")
    wqeStatusByWf = globalQueue.getWorkflowStatusFromWQE()

    self.logger.info("Advancing statuses")
    if getattr(config, "enableMSStatusTransition", False):
        # transferor status moves are only performed when explicitly enabled
        moveTransferorStatus(reqMgrService, self.logger)
    moveForwardStatus(reqMgrService, wqeStatusByWf, self.logger)
    moveToCompletedForNoWQJobs(reqMgrService, wqeStatusByWf, self.logger)

    self.logger.info("Done advancing status")
    return
def advanceStatus(self, config):
    """
    Advance the request status based on the global workqueue elements status
    """
    reqMgrService = ReqMgr(config.reqmgr2_url, logger=self.logger)
    globalQueue = WorkQueue(config.workqueue_url)
    wmstatsService = WMStatsServer(config.wmstats_url, logger=self.logger)

    self.logger.info("Getting GQ data for status check")
    wqeStatusByWf = globalQueue.getWorkflowStatusFromWQE()

    self.logger.info("Advancing status")
    moveForwardStatus(reqMgrService, wqeStatusByWf, self.logger)
    moveToCompletedForNoWQJobs(reqMgrService, wqeStatusByWf, self.logger)
    # archive completed requests after the configured delay
    moveToArchived(wmstatsService, reqMgrService, self.logDB,
                   config.archiveDelayHours, self.logger)

    self.logger.info("Done advancing status")
    return
def testCancelWorkGlobal(self): """Cancel work in global queue""" # queue to global & pull to local self.globalQueue.queueWork(self.processingSpec.specUrl()) self.globalQueue.updateLocationInfo() self.assertEqual(self.localQueue.pullWork({'T2_XX_SiteA' : 1000}), 2) syncQueues(self.localQueue) work = self.localQueue.getWork({'T2_XX_SiteA' : 1000, 'T2_XX_SiteB' : 1000}) self.assertEqual(len(work), 2) syncQueues(self.localQueue) # cancel in global, and propagate down to local #service = WorkQueueService({'endpoint': self.localQueue.backend.parentCouchUrl}) service = WorkQueueService(self.localQueue.backend.parentCouchUrlWithAuth) service.cancelWorkflow(self.processingSpec.name()) #self.globalQueue.cancelWork(WorkflowName = self.spec.name()) self.globalQueue.performQueueCleanupActions() self.assertEqual(len(self.globalQueue.statusInbox(status='CancelRequested')), 1) self.assertEqual(len(self.globalQueue.status(status='CancelRequested')), 2) syncQueues(self.localQueue) self.assertEqual(len(self.localQueue.statusInbox(status='Canceled')), 2) self.assertEqual(len(self.localQueue.status()), 0) # check cancel propagated back to global syncQueues(self.localQueue) self.assertEqual(len(self.globalQueue.status(status='Canceled')), 2) self.globalQueue.performQueueCleanupActions() syncQueues(self.localQueue) self.assertEqual(len(self.localQueue.statusInbox()), 0) self.assertEqual(len(self.globalQueue.statusInbox(status='Canceled')), 1) self.assertEqual(len(self.globalQueue.status()), 0) self.globalQueue.deleteWorkflows(self.processingSpec.name()) # cancel work in global before it reaches a local queue self.globalQueue.queueWork(self.spec.specUrl()) self.assertEqual(len(self.globalQueue.status(status='Available')), 1) service.cancelWorkflow(self.spec.name()) self.globalQueue.performQueueCleanupActions() self.assertEqual(len(self.globalQueue.status()), 0) self.assertEqual(len(self.globalQueue.statusInbox(status='Canceled')), 1) self.globalQueue.deleteWorkflows(self.spec.name())
def advanceStatus(self, config):
    """
    Advance the request status based on the global workqueue elements status
    """
    reqMgrService = ReqMgr(config.reqmgr2_url, logger=self.logger)
    globalQueue = WorkQueue(config.workqueue_url)
    wmstatsService = WMStatsServer(config.wmstats_url, logger=self.logger)
    logDB = LogDB(config.central_logdb_url, config.log_reporter)

    self.logger.info("Getting GQ data for status check")
    wqeStatusByWf = globalQueue.getWorkflowStatusFromWQE()

    self.logger.info("Advancing status")
    moveForwardStatus(reqMgrService, wqeStatusByWf, self.logger)
    moveToCompletedForNoWQJobs(reqMgrService, wqeStatusByWf, self.logger)
    # archive completed requests after the configured delay
    moveToArchived(wmstatsService, reqMgrService, logDB,
                   config.archiveDelayHours, self.logger)

    self.logger.info("Done advancing status")
    return
def advanceStatus(self, config):
    """
    Advance request statuses according to the state of their global
    workqueue elements, archiving completed requests after a delay.
    """
    reqMgrService = ReqMgr(config.reqmgr2_url, logger=self.logger)
    globalQueue = WorkQueue(config.workqueue_url)
    wmstatsService = WMStatsServer(config.wmstats_url, logger=self.logger)

    self.logger.info("Getting GQ data for status check")
    wqeStatusByWf = globalQueue.getWorkflowStatusFromWQE()

    self.logger.info("Advancing status")
    moveForwardStatus(reqMgrService, wqeStatusByWf, self.logger)
    moveToCompletedForNoWQJobs(reqMgrService, wqeStatusByWf, self.logger)
    moveToArchived(wmstatsService, reqMgrService, config.archiveDelayHours, self.logger)

    self.logger.info("Done advancing status")
    return
def __init__(self, config):
    """
    __init__

    Wire up BossAir, the central ReqMgr2 service, the local workqueue and
    the WMBS DAOs needed to update workflow priorities.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.bossAir = BossAirAPI(config=self.config)
    self.reqmgr2 = ReqMgr(self.config.General.ReqMgr2ServiceURL)
    self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                               self.config.WorkQueueManager.dbname)

    # WMBS data-access objects, bound to the current thread's db interface
    currentThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=currentThread.logger,
                                 dbinterface=currentThread.dbi)
    self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
    self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
    self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
def main():
    """
    _main_

    Compare the priority of workflows Available in the local workqueue
    against the current RequestPriority in central ReqMgr2 and collect the
    workflows whose priority is out of date.
    """
    # fall back to the standard agent configuration location
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiating central reqmgr and local workqueue
    print "ReqMgr2 URL : %s" % sanitizeURL(config.JobUpdater.reqMgr2Url)['url']
    print "WorkQueue URL: %s and dbname %s" % (sanitizeURL(config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname)
    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl, config.WorkQueueManager.dbname)

    print "\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue"
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = [x for x in workqueue.getAvailableWorkflows()]
    print "Retrieved %d workflows from workqueue" % len(workflowsToCheck)

    for workflow, priority in workflowsToCheck:
        # cache the ReqMgr2 lookup so each workflow is fetched at most once
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
            except Exception, ex:
                print "Couldn't retrieve the priority of request %s" % workflow
                print "Error: %s" % ex
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
    # NOTE(review): block appears truncated here -- workflowsToUpdate is
    # presumably applied to the queue further down; confirm in the full file.
def testUpdatePriorityService(self):
    """
    _testUpdatePriorityService_

    Check that we can update the priority correctly also
    check the available workflows feature
    """
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file")
    globalQ = globalQueue(DbName = 'workqueue_t',
                          QueueURL = self.testInit.couchUrl)
    localQ = localQueue(DbName = 'local_workqueue_t',
                        QueueURL = self.testInit.couchUrl,
                        CacheDir = self.testInit.testDir,
                        ParentQueueCouchUrl = '%s/workqueue_t' % self.testInit.couchUrl,
                        ParentQueueInboxCouchDBName = 'workqueue_t_inbox'
                        )
    # Try a full chain of priority update and propagation
    self.assertTrue(globalQ.queueWork(specUrl, "RerecoSpec", "teamA") > 0)
    globalApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # update in global, check both the spec and its elements pick it up
    globalApi.updatePriority(specName, 100)
    self.assertEqual(globalQ.backend.getWMSpec(specName).priority(), 100)
    storedElements = globalQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    # pull work into the local queue and verify the priority propagated
    self.assertTrue(localQ.pullWork({'T2_XX_SiteA' : 10}, continuousReplication = False) > 0)
    localQ.processInboundWork(continuous = False)
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    # update again at the local queue level
    localApi = WorkQueueDS(self.testInit.couchUrl, 'local_workqueue_t')
    localApi.updatePriority(specName, 500)
    self.assertEqual(localQ.backend.getWMSpec(specName).priority(), 500)
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 500)
    self.assertEqual(localApi.getAvailableWorkflows(), set([(specName, 500)]))
    # Attempt to update an inexistent workflow in the queue
    try:
        globalApi.updatePriority('NotExistent', 2)
    except:
        self.fail('No exception should be raised.')
def __init__(self, config):
    """
    Initialize thread state from the component configuration: agent info,
    the grid proxy handle and the local workqueue data service.
    """
    BaseWorkerThread.__init__(self)
    # keep the full config around for campaign/user/owner lookups
    self.config = config
    self.agentInfo = initAgentInfo(self.config)
    self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
    self.jsonFile = config.AgentStatusWatcher.jsonFile

    self.proxy = Proxy({'logger': logging.getLogger()})
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

    # REST handle to the local workqueue couch database
    self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)
def testWorkQueueService(self):
    """
    Smoke-test the basic WorkQueue data-service calls right after queueing
    work into a fresh global queue.
    """
    # test getWork
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file")
    globalQ = globalQueue(DbName = 'workqueue_t',
                          QueueURL = self.testInit.couchUrl)
    # use the specName variable instead of repeating the "RerecoSpec"
    # literal, so the queued name always matches the one asserted below
    self.assertTrue(globalQ.queueWork(specUrl, specName, "teamA") > 0)
    wqApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    wqApi.defaultOptions = {'reduce' : True, 'group' : True}
    # This only checks minimum client call not exactly correctness of return
    # values.
    self.assertEqual(wqApi.getTopLevelJobsByRequest(),
                     [{'total_jobs': 10, 'request_name': specName}])
    self.assertEqual(wqApi.getChildQueues(), [])
    self.assertEqual(wqApi.getJobStatusByRequest(),
                     [{'status': 'Available', 'jobs': 10, 'request_name': specName}])
    self.assertEqual(wqApi.getChildQueuesByRequest(), [])
    self.assertEqual(wqApi.getWMBSUrl(), [])
    self.assertEqual(wqApi.getWMBSUrlByRequest(), [])
def setup(self, parameters):
    """
    Set up db connections (couchdb, wmbs) to prepare to gather information.

    :param parameters: component thread parameters (unused here)
    """
    # set the connection to local queue
    self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

    # set the connection for local couchDB call
    self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                         self.summaryLevel)

    # interface to WMBS/BossAir db
    myThread = threading.currentThread()
    # set wmagent db data
    self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

    # set the connection for local couchDB call
    self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL)
    self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

    # fixed: identity comparison against None (was 'self.pluginName != None')
    if self.pluginName is not None:
        pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
        self.plugin = pluginFactory.loadObject(classname=self.pluginName)
def __init__(self, config):
    """
    __init__
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    # handles to the batch system and to the central/local services
    self.bossAir = BossAirAPI(config=self.config)
    self.reqmgr2 = ReqMgr(self.config.JobUpdater.reqMgr2Url)
    self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                               self.config.WorkQueueManager.dbname)

    # WMBS data-access objects
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    self.daoFactory = daoFactory
    self.listWorkflowsDAO = daoFactory(classname="Workflow.ListForJobUpdater")
    self.updateWorkflowPrioDAO = daoFactory(classname="Workflow.UpdatePriority")
    self.executingJobsDAO = daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
def __init__(self, config):
    """
    Initialize thread state from the component configuration.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    # campaign, user and owner information for this agent
    self.agentInfo = initAgentInfo(self.config)
    self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

    # credentials: grid proxy and user certificate locations
    self.proxy = Proxy({'logger': logging.getLogger()})
    self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
    self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
    # credential lifetime warning/error thresholds, in days
    self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                           'certificate': {'error': 10, 'warning': 20}}

    # REST interface to the local workqueue couch database
    self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)

    # AMQ/MonIT monitoring setup
    watcherConf = config.AgentStatusWatcher
    self.userAMQ = getattr(watcherConf, "userAMQ", None)
    self.passAMQ = getattr(watcherConf, "passAMQ", None)
    self.postToAMQ = getattr(watcherConf, "enableAMQ", False)
    self.topicAMQ = getattr(watcherConf, "topicAMQ", None)
    self.hostPortAMQ = getattr(watcherConf, "hostPortAMQ", [('dashb-mb.cern.ch', 61113)])
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and wmbs/boss air, and populate the summary db for
    monitoring. Optionally pushes MonIT documents to AMQ.
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                               'certificate': {'error': 10, 'warning': 20}}

        # Monitoring setup
        self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
        self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
        self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
        self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
        self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        if hasattr(self.config, "Tier0Feeder"):
            self.isT0agent = True
            self.producer = "tier0wmagent"
        else:
            self.isT0agent = False
            self.producer = "wmagent"
            localWQUrl = config.AnalyticsDataCollector.localQueueURL
            self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        """
        Define and (re)create the couchdb replication documents: always the
        wmstats job-summary replication, plus either the T0 request replication
        or the global<->local workqueue replications depending on agent type.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.General.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        if self.isT0agent:
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'], filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # First cicle need to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkCredLifetime(agentInfo, "proxy")
            self.checkCredLifetime(agentInfo, "certificate")

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not self.isT0agent:
                # workqueue info only exists for production agents, not T0
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            self.uploadAgentInfoToCentralWMStats(agentInfo)

            self.buildMonITDocs(agentInfo)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with work statistics by status, priority and site
        """
        results = {}
        wqStates = ['Available', 'Acquired']

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(wqStates)
        uniSites, posSites = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of every couchdb replication set up in
        setUpCouchDBReplication; reports an overall ok/error summary.
        """
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes
        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Add some required fields to the document before it can get uploaded
        to WMStats.
        :param agentInfo: dict with agent stats to be posted to couchdb
        """
        agentInfo['_id'] = agentInfo["agent_url"]
        agentInfo['timestamp'] = int(time.time())
        agentInfo['type'] = "agent_info"
        # directly upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo,
                                                       propertiesToKeep=["data_last_update", "data_error"])
        except Exception as e:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(e))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Check the credential lifetime. Usually X509_USER_PROXY or X509_USER_CERT
        and raise either a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: same dictionary object plus additional keys/values if needed.
        """
        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        elif credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        daysLeft = secsLeft / (60. * 60 * 24)

        if daysLeft <= self.credThresholds[credType]['error']:
            credWarning = True
            agInfo['status'] = "error"
        elif daysLeft <= self.credThresholds[credType]['warning']:
            credWarning = True
            # only downgrade an 'ok' status to 'warning', never mask an error
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            credWarning = False

        if credWarning:
            warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile)
            warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.)
            agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
            logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. It creates 5 different type of documents:
         * priority information
         * site information
         * work information
         * agent information
         * agent health information
        Note that the internal methods are popping some metrics out of dataStats
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        allDocs = self._buildMonITPrioDocs(dataStats)
        allDocs.extend(self._buildMonITSitesDocs(dataStats))
        allDocs.extend(self._buildMonITWorkDocs(dataStats))
        allDocs.extend(self._buildMonITWMBSDocs(dataStats))
        allDocs.extend(self._buildMonITAgentDocs(dataStats))
        allDocs.extend(self._buildMonITHealthDocs(dataStats))
        allDocs.extend(self._buildMonITSummaryDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within that
        priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        # NOTE: .pop() mutates dataStats, so this metric is consumed here
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop('sitePendCountByPrio', [])

        for site, item in sitePendCountByPrio.iteritems():
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            if item.keys() == [0]:
                continue
            for prio, jobs in item.iteritems():
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs

    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in order
        to build a `site_info` document type for MonIT.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            # T0 has no local workqueue, so no per-site WQE statistics
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop('possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop('uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite.keys():
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0
                if site in possibleJobsPerSite[status]:
                    siteDoc[jobKey] = possibleJobsPerSite[status][site]['sum_jobs']
                    siteDoc[elemKey] = possibleJobsPerSite[status][site]['num_elem']
                if site in uniqueJobsPerSite[status]:
                    siteDoc[uniJobKey] = uniqueJobsPerSite[status][site]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information order by WQE status and build
        statistics for the workload in terms of workqueue elements and top
        level jobs.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        workDocs = []
        if self.isT0agent:
            # no local workqueue in T0 agents
            return workDocs

        docType = "wma_work_info"
        workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        for status, info in workByStatus.items():
            workDoc = {}
            workDoc['type'] = docType
            workDoc['status'] = status
            workDoc['num_elem'] = info.get('num_elem', 0)
            workDoc['sum_jobs'] = info.get('sum_jobs', 0)
            workDocs.append(workDoc)
        return workDocs

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work
        in 'created' and 'executing' WMBS status.
        It also builds a document for every single wmbs_status in the database.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        docType = "wma_wmbs_info"
        wmbsDocs = []
        wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop('wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop('wmbsExecutingTypeCount', {})
        # NOTE(review): assumes both dicts carry the same jobType keys — a
        # jobType present only in wmbsCreatedTypeCount would raise KeyError below
        for jobType in wmbsCreatedTypeCount:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['job_type'] = jobType
            wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType]
            wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType]
            wmbsDocs.append(wmbsDoc)

        docType = "wma_wmbs_state_info"
        wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['wmbs_status'] = wmbsStatus
            wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus]
            wmbsDocs.append(wmbsDoc)

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of amount of jobs in different statuses.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop('activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop('completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDoc = {}
            agentDoc['type'] = docType
            agentDoc['schedd_status'] = schedStatus
            agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus]
            agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus]
            agentDocs.append(agentDoc)

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Creates documents with specific agent information, status of
        each component and worker thread (similar to what is shown in
        wmstats) and also some very basic performance numbers.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_health_info"
        healthDocs = []
        workersStatus = dataStats.pop('workers', {})
        for worker in workersStatus:
            healthDoc = {}
            healthDoc['type'] = docType
            healthDoc['worker_name'] = worker['name']
            healthDoc['worker_state'] = worker['state']
            healthDoc['worker_poll'] = worker['poll_interval']
            healthDoc['worker_last_hb'] = worker['last_updated']
            healthDoc['worker_cycle_time'] = worker['cycle_time']
            healthDocs.append(healthDoc)
        return healthDocs

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a document with the very basic agent info used
        in the wmstats monitoring tab.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_summary_info MonIT docs
        """
        docType = "wma_summary_info"
        summaryDocs = []
        summaryDoc = {}
        summaryDoc['type'] = docType
        summaryDoc['agent_team'] = dataStats['agent_team']
        summaryDoc['agent_version'] = dataStats['agent_version']
        summaryDoc['agent_status'] = dataStats['status']
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO']['total_query_time']
        summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO']['total_query_time']
        summaryDoc['drain_mode'] = dataStats['drain_mode']
        summaryDoc['down_components'] = dataStats['down_components']
        summaryDocs.append(summaryDoc)
        return summaryDocs

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        :param docs: list of documents/dicts to be posted
        :param agentUrl: agent URL stamped on every document
        :param timeS: timestamp used for the AMQ notifications
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return
        # add mandatory information for every single document
        for doc in docs:
            doc['agent_url'] = agentUrl

        docType = "cms_%s_info" % self.producer
        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=self.producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=logging)

            notifications = [stompSvc.make_notification(payload=doc, docType=docType, ts=timeS,
                                                        dataSubfield="payload") for doc in docs]

            failures = stompSvc.send(notifications)
            logging.info("%i docs successfully sent to AMQ", len(notifications) - len(failures))
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s", str(ex))

        return
class JobUpdaterPoller(BaseWorkerThread):
    """
    _JobUpdaterPoller_

    Poller class for the JobUpdater: keeps job and workqueue element
    priorities in sync with the request priority in ReqMgr.
    """

    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # handle to the batch system
        self.bossAir = BossAirAPI(config = self.config)
        # central request manager client
        self.reqmgr = RequestManager({'endpoint' : self.config.JobUpdater.reqMgrUrl})
        # local workqueue couch service
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.listWorkflowsDAO = self.daoFactory(classname = "Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname = "Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname = "Jobs.GetNumberOfJobsForWorkflowTaskStatus")

    def setup(self, parameters = None):
        """
        _setup_
        """
        pass

    def terminate(self, parameters = None):
        """
        _terminate_

        Terminate gracefully.
        """
        pass

    def algorithm(self, parameters = None):
        """
        _algorithm_
        """
        logging.info("Synchronizing priorities with ReqMgr...")
        self.synchronizeJobPriority()

    def synchronizeJobPriority(self):
        """
        _synchronizeJobPriority_

        Check WMBS and WorkQueue for active workflows and compare with the
        ReqMgr for priority changes. If a priority change occurs
        then update the job priority in the batch system and
        the elements in the local queue that have not been injected yet.
        """
        # Update the priority of workflows that are not in WMBS and just in local queue
        priorityCache = {}
        workflowsToUpdate = {}
        workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()]
        for workflow, priority in workflowsToCheck:
            if workflow not in priorityCache:
                try:
                    # cache ReqMgr priorities so each request is fetched only once
                    priorityCache[workflow] = self.reqmgr.getRequest(workflow)['RequestPriority']
                except Exception, ex:
                    # best-effort: log and move on to the next workflow
                    logging.error("Couldn't retrieve the priority of request %s" % workflow)
                    logging.error("Error: %s" % ex)
                    continue
            if priority != priorityCache[workflow]:
                workflowsToUpdate[workflow] = priorityCache[workflow]

        for workflow in workflowsToUpdate:
            self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow])

        # Check the workflows in WMBS
        priorityCache = {}
        workflowsToUpdateWMBS = {}
        workflowsToCheck = self.listWorkflowsDAO.execute()
        for workflowEntry in workflowsToCheck:
            workflow = workflowEntry['name']
            if workflow not in priorityCache:
                try:
                    priorityCache[workflow] = self.reqmgr.getRequest(workflow)['RequestPriority']
                except Exception, ex:
                    logging.error("Couldn't retrieve the priority of request %s" % workflow)
                    logging.error("Error: %s" % ex)
                    continue
            requestPriority = priorityCache[workflow]
            if requestPriority != workflowEntry['workflow_priority']:
                # Update the workqueue priority for the Available elements
                self.workqueue.updatePriority(workflow, priorityCache[workflow])
                # Check if there are executing jobs for this particular task
                if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0:
                    self.bossAir.updateJobInformation(workflow, workflowEntry['task'],
                                                      requestPriority = priorityCache[workflow],
                                                      taskPriority = workflowEntry['task_priority'])
                # NOTE(review): workflowsToUpdateWMBS is populated here but its
                # consumer (presumably updateWorkflowPrioDAO) is not visible in
                # this chunk — confirm against the full file
                workflowsToUpdateWMBS[workflow] = priorityCache[workflow]
class Request(RESTEntity):
    """
    ReqMgr2 REST entity exposing CRUD operations on request documents
    stored in the main ReqMgr CouchDB database.
    """

    def __init__(self, app, api, config, mount):
        # main CouchDB database where requests/workloads are stored
        RESTEntity.__init__(self, app, api, config, mount)
        self.reqmgr_db = api.db_handler.get_db(config.couch_reqmgr_db)
        self.reqmgr_db_service = RequestDBWriter(self.reqmgr_db, couchapp="ReqMgr")
        # this is needed for the post validation
        self.reqmgr_aux_db = api.db_handler.get_db(config.couch_reqmgr_aux_db)
        self.gq_service = WorkQueue(config.couch_host, config.couch_workqueue_db)

    def _requestArgMapFromBrowser(self, request_args):
        """
        This is specific mapping function data from browser
        TODO: give a key word so it doesn't have to loop though in general
        """
        docs = []
        for doc in request_args:
            for key in doc.keys():
                if key.startswith('request'):
                    rid = key.split('request-')[-1]
                    if rid != 'all':
                        docs.append(rid)
                    del doc[key]
        return docs

    def _validateGET(self, param, safe):
        """
        Move GET query arguments into `safe.kwargs`, normalizing scalar
        values to lists (except for the flags in no_multi_key).
        Raises InvalidSpecParameterValue for bulk archived-status queries
        with detail=True, which would be too expensive.
        """
        # TODO: need proper validation but for now pass everything
        args_length = len(param.args)
        if args_length == 1:
            safe.kwargs["name"] = param.args[0]
            param.args.pop()
            return
        no_multi_key = ["detail", "_nostale", "date_range", "common_dict"]
        for key, value in param.kwargs.items():
            # convert string to list
            if key not in no_multi_key and isinstance(value, basestring):
                param.kwargs[key] = [value]
        detail = param.kwargs.get('detail', True)
        if detail in (False, "false", "False", "FALSE"):
            detail = False
        if "status" in param.kwargs and detail:
            for status in param.kwargs["status"]:
                if status.endswith("-archived"):
                    raise InvalidSpecParameterValue(
                        """Can't retrieve bulk archived status requests with detail option True, set detail=false or use other search arguments""")
        for prop in param.kwargs:
            safe.kwargs[prop] = param.kwargs[prop]
        for prop in safe.kwargs:
            del param.kwargs[prop]
        return

    def _validateRequestBase(self, param, safe, valFunc, requestName=None):
        """
        Validate a request create/update payload (JSON body or, as a
        fallback, query kwargs) with `valFunc` and store the resulting
        (workload, args) pairs in safe.kwargs['workload_pair_list'].
        """
        data = cherrypy.request.body.read()
        if data:
            request_args = json.loads(data)
            if requestName:
                request_args["RequestName"] = requestName
        else:
            # actually this is error case
            # cherrypy.log(str(param.kwargs))
            request_args = {}
            for prop in param.kwargs:
                request_args[prop] = param.kwargs[prop]
            for prop in request_args:
                del param.kwargs[prop]
            if requestName:
                request_args["RequestName"] = requestName
            request_args = [request_args]

        safe.kwargs['workload_pair_list'] = []
        if isinstance(request_args, dict):
            request_args = [request_args]
        for args in request_args:
            workload, r_args = valFunc(args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))

    def _get_request_names(self, ids):
        "Extract request names from given documents"
        # cherrypy.log("request names %s" % ids)
        doc = {}
        if isinstance(ids, list):
            for rid in ids:
                doc[rid] = 'on'
        elif isinstance(ids, basestring):
            doc[ids] = 'on'
        docs = []
        for key in doc.keys():
            if key.startswith('request'):
                rid = key.split('request-')[-1]
                if rid != 'all':
                    docs.append(rid)
                del doc[key]
        return docs

    def _getMultiRequestArgs(self, multiRequestForm):
        """
        Split a browser multi-update form into (request_names, request_args);
        'ids' becomes the name list, 'new_status' maps to 'RequestStatus'.
        """
        request_args = {}
        for prop in multiRequestForm:
            if prop == "ids":
                request_names = self._get_request_names(multiRequestForm["ids"])
            elif prop == "new_status":
                request_args["RequestStatus"] = multiRequestForm[prop]
            # remove this
            # elif prop in ["CustodialSites", "AutoApproveSubscriptionSites"]:
            #    request_args[prop] = [multiRequestForm[prop]]
            else:
                request_args[prop] = multiRequestForm[prop]
        return request_names, request_args

    def _validateMultiRequests(self, param, safe, valFunc):
        """
        Validate a multi-request update (one set of args applied to many
        request names) and store the pairs in safe.kwargs.
        """
        data = cherrypy.request.body.read()
        if data:
            request_names, request_args = self._getMultiRequestArgs(json.loads(data))
        else:
            # actually this is error case
            # cherrypy.log(str(param.kwargs))
            request_names, request_args = self._getMultiRequestArgs(param.kwargs)
        for prop in request_args:
            if prop == "RequestStatus":
                del param.kwargs["new_status"]
            else:
                del param.kwargs[prop]
        del param.kwargs["ids"]
        # remove this
        # tmp = []
        # for prop in param.kwargs:
        #    tmp.append(prop)
        # for prop in tmp:
        #    del param.kwargs[prop]
        safe.kwargs['workload_pair_list'] = []
        for request_name in request_names:
            request_args["RequestName"] = request_name
            workload, r_args = valFunc(request_args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))
        safe.kwargs["multi_update_flag"] = True

    def _getRequestNamesFromBody(self, param, safe, valFunc):
        """Read a plain list of request names from the request body."""
        request_names = json.loads(cherrypy.request.body.read())
        safe.kwargs['workload_pair_list'] = request_names
        safe.kwargs["multi_names_flag"] = True

    def validate(self, apiobj, method, api, param, safe):
        """
        REST validation entry point: move validated arguments to `safe`
        and empty `param`; any failure is surfaced as
        InvalidSpecParameterValue.
        """
        # to make validate successful
        # move the validated argument to safe
        # make param empty
        # other wise raise the error
        try:
            if method == 'GET':
                self._validateGET(param, safe)
            if method == 'PUT':
                args_length = len(param.args)
                if args_length == 1:
                    requestName = param.args[0]
                    param.args.pop()
                else:
                    requestName = None
                self._validateRequestBase(param, safe, validate_request_update_args, requestName)
                # TO: handle multiple clone
                # if len(param.args) == 2:
                #    #validate clone case
                #    if param.args[0] == "clone":
                #        param.args.pop()
                #        return None, request_args
            if method == 'POST':
                args_length = len(param.args)
                if args_length == 1 and param.args[0] == "multi_update":
                    # special case for multi update from browser.
                    param.args.pop()
                    self._validateMultiRequests(param, safe, validate_request_update_args)
                elif args_length == 1 and param.args[0] == "bynames":
                    # special case for multi update from browser.
                    param.args.pop()
                    self._getRequestNamesFromBody(param, safe, validate_request_update_args)
                else:
                    self._validateRequestBase(param, safe, validate_request_create_args)
        except InvalidSpecParameterValue as ex:
            raise ex
        except Exception as ex:
            # TODO add proper error message instead of trace back
            msg = traceback.format_exc()
            cherrypy.log("Error: %s" % msg)
            if hasattr(ex, "message"):
                if hasattr(ex.message, '__call__'):
                    msg = ex.message()
                else:
                    msg = str(ex)
            else:
                msg = str(ex)
            raise InvalidSpecParameterValue(msg)

    def initialize_clone(self, request_name):
        """
        Build a new (workload, args) pair cloned from an existing request;
        the name and timestamps are overwritten by initialize_request_args.
        """
        requests = self.reqmgr_db_service.getRequestByNames(request_name)
        clone_args = requests.values()[0]
        # overwrite the name and time stamp.
        initialize_request_args(clone_args, self.config, clone=True)
        # timestamp status update
        spec = loadSpecByType(clone_args["RequestType"])
        workload = spec.factoryWorkloadConstruction(clone_args["RequestName"],
                                                    clone_args)
        return (workload, clone_args)

    def _maskTaskStepChain(self, masked_dict, req_dict, chain_name, mask_key):
        """
        Expand `mask_key` in `masked_dict` into a per-Task/per-Step list
        when any TaskN/StepN dict of the chained request overrides it.
        Chains that do not override the key fall back to the top-level
        default value.
        """
        mask_exist = False
        num_loop = req_dict["%sChain" % chain_name]
        for i in range(num_loop):
            if mask_key in req_dict["%s%s" % (chain_name, i + 1)]:
                mask_exist = True
                break

        if mask_exist:
            defaultValue = masked_dict[mask_key]
            masked_dict[mask_key] = []
            # bind the chain name key up front: the original code only
            # assigned it inside the override branch, so the fallback
            # branch could hit an unbound local
            chain_key = "%sName" % chain_name
            # assume mask_key is list if the condition doesn't meet.
            for i in range(num_loop):
                chain = req_dict["%s%s" % (chain_name, i + 1)]
                if mask_key in chain:
                    masked_dict[mask_key].append({chain_key: chain[chain_key],
                                                  mask_key: chain[mask_key]})
                else:
                    if isinstance(defaultValue, dict):
                        value = defaultValue.get(chain_key, None)
                    else:
                        value = defaultValue
                    # use the computed fallback: chain[mask_key] does not
                    # exist in this branch and raised KeyError before
                    masked_dict[mask_key].append({chain_key: chain[chain_key],
                                                  mask_key: value})
        return

    def _mask_result(self, mask, result):
        """
        Reduce each request document in `result` down to the keys listed
        in `mask`; the special mask ["DAS"] expands to the configured
        DAS_RESULT_FILTER list. An empty mask returns `result` untouched.
        """
        if len(mask) == 1 and mask[0] == "DAS":
            mask = ReqMgrConfigDataCache.getConfig("DAS_RESULT_FILTER")["filter_list"]

        if len(mask) > 0:
            masked_result = {}
            for req_name, req_info in result.items():
                masked_result.setdefault(req_name, {})
                for mask_key in mask:
                    masked_result[req_name].update({mask_key: req_info.get(mask_key, None)})
                    if "TaskChain" in req_info:
                        self._maskTaskStepChain(masked_result[req_name], req_info, "Task", mask_key)
                    elif "StepChain" in req_info:
                        self._maskTaskStepChain(masked_result[req_name], req_info, "Step", mask_key)
            return masked_result
        else:
            return result

    @restcall(formats=[('text/plain', PrettyJSONFormat()), ('application/json', JSONFormat())])
    def get(self, **kwargs):
        """
        Returns request info depending on the conditions set by kwargs
        Currently defined kwargs are following.
        statusList, requestNames, requestType, prepID, inputDataset, outputDataset, dateRange
        If jobInfo is True, returns jobInfomation about the request as well.

        TODO: stuff like this has to masked out from result of this call:
            _attachments: {u'spec': {u'stub': True, u'length': 51712, u'revpos': 2, u'content_type': u'application/json'}}
            _id: maxa_RequestString-OVERRIDE-ME_130621_174227_9225
            _rev: 4-c6ceb2737793aaeac3f1cdf591593da4
        """
        # list of status
        status = kwargs.get("status", [])
        # list of request names
        name = kwargs.get("name", [])
        request_type = kwargs.get("request_type", [])
        prep_id = kwargs.get("prep_id", [])
        inputdataset = kwargs.get("inputdataset", [])
        outputdataset = kwargs.get("outputdataset", [])
        date_range = kwargs.get("date_range", False)
        campaign = kwargs.get("campaign", [])
        workqueue = kwargs.get("workqueue", [])
        team = kwargs.get("team", [])
        mc_pileup = kwargs.get("mc_pileup", [])
        data_pileup = kwargs.get("data_pileup", [])
        requestor = kwargs.get("requestor", [])
        mask = kwargs.get("mask", [])
        detail = kwargs.get("detail", True)
        # set the return format. default format has requset name as a key
        # if is set to one it returns list of dictionary with RequestName field.
        common_dict = int(kwargs.get("common_dict", 0))
        if detail in (False, "false", "False", "FALSE"):
            option = {"include_docs": False}
        else:
            option = {"include_docs": True}
        # everything should be a stale view; this only needs for test
        _nostale = kwargs.get("_nostale", False)
        if _nostale:
            self.reqmgr_db_service._setNoStale()

        request_info = []

        if len(status) == 1 and status[0] == "ACTIVE":
            status = ACTIVE_STATUS
        if status and not team and not request_type and not requestor:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bystatus", option, status))
        if status and team:
            query_keys = [[t, s] for t in team for s in status]
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byteamandstatus", option, query_keys))
        if status and request_type:
            query_keys = [[s, rt] for rt in request_type for s in status]
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("requestsbystatusandtype", option,
                                                                             query_keys))
        if status and requestor:
            query_keys = [[s, r] for r in requestor for s in status]
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("bystatusandrequestor", option, query_keys))
        if name:
            request_info.append(self.reqmgr_db_service.getRequestByNames(name))
        if prep_id:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byprepid", option, prep_id))
        if inputdataset:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byinputdataset", option, inputdataset))
        if outputdataset:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byoutputdataset", option, outputdataset))
        if date_range:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bydate", option, date_range))
        if campaign:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bycampaign", option, campaign))
        if workqueue:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byworkqueue", option, workqueue))
        if mc_pileup:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bymcpileup", option, mc_pileup))
        if data_pileup:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bydatapileup", option, data_pileup))

        # get interaction of the request
        result = self._intersection_of_request_info(request_info)
        if len(result) == 0:
            return []
        result = self._mask_result(mask, result)
        # If detail is set to False return just list of request name
        if not option["include_docs"]:
            return result.keys()
        if common_dict == 1:
            response_list = result.values()
        else:
            response_list = [result]
        return rows(response_list)

    def _intersection_of_request_info(self, request_info):
        """
        Intersect the per-view query results by request name; documents
        are taken from the first result set.
        """
        requests = {}
        if len(request_info) < 1:
            return requests

        request_key_set = set(request_info[0].keys())
        for info in request_info:
            request_key_set = set(request_key_set) & set(info.keys())
        # TODO: need to assume some data maight not contains include docs
        for request_name in request_key_set:
            requests[request_name] = request_info[0][request_name]
        return requests

    # TODO move this out of this class
    def filterCouchInfo(self, couchInfo):
        """Strip CouchDB bookkeeping fields from a document, in place."""
        for key in ['_rev', '_attachments']:
            if key in couchInfo:
                del couchInfo[key]

    def _combine_request(self, request_info, requestAgentUrl, cache):
        """
        Pair each request in `request_info` with the agent URLs found in
        the `requestAgentUrl` couch view rows.
        """
        keys = {}
        requestAgentUrlList = []
        for row in requestAgentUrl["rows"]:
            request = row["key"][0]
            # initialize on first sight: the original tested keys[request]
            # before the key existed, raising KeyError on every first hit
            if request not in keys:
                keys[request] = []
            keys[request].append(row["key"][1])

        for request in request_info:
            for agentUrl in keys[request]:
                requestAgentUrlList.append([request, agentUrl])

        return requestAgentUrlList

    def _retrieveResubmissionChildren(self, request_name):
        """
        Recursively collect the ids of all Resubmission requests that
        descend from `request_name`.
        """
        result = self.reqmgr_db.loadView('ReqMgr', 'childresubmissionrequests', keys=[request_name])['rows']
        childrenRequestNames = []
        for child in result:
            childrenRequestNames.append(child['id'])
            childrenRequestNames.extend(self._retrieveResubmissionChildren(child['id']))
        return childrenRequestNames

    def _handleNoStatusUpdate(self, workload, request_args):
        """
        only few values can be updated without state transition involved
        currently 'RequestPriority' and 'total_jobs', 'input_lumis', 'input_events', 'input_num_files'
        """
        if 'RequestPriority' in request_args:
            # must update three places: GQ elements, workload_cache and workload spec
            self.gq_service.updatePriority(workload.name(), request_args['RequestPriority'])
            report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args)
            workload.setPriority(request_args['RequestPriority'])
            workload.saveCouchUrl(workload.specUrl())
        elif "total_jobs" in request_args:
            # only GQ updates these stats
            # request_args should contain only 4 keys 'total_jobs', 'input_lumis', 'input_events', 'input_num_files'}
            report = self.reqmgr_db_service.updateRequestStats(workload.name(), request_args)
        else:
            raise InvalidSpecParameterValue("can't update value without state transition: %s" % request_args)

        return report

    def _handleAssignmentApprovedTransition(self, workload, request_args, dn):
        """Plain property update for the assignment-approved transition."""
        report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)
        return report

    def _handleAssignmentStateTransition(self, workload, request_args, dn):
        """
        Transition to 'assigned': validates the Team, normalizes timeouts,
        applies the new arguments to the workload spec, re-derives and
        validates the output datasets, and persists both the request
        document and the spec.
        """
        req_status = request_args["RequestStatus"]
        if req_status == "assigned" and not request_args.get('Team', '').strip():
            raise InvalidSpecParameterValue("Team must be set during workflow assignment: %s" % request_args)

        if ('SoftTimeout' in request_args) and ('GracePeriod' in request_args):
            request_args['SoftTimeout'] = int(request_args['SoftTimeout'])
            # TODO: not sure why GracePeriod when passed from web ingerface but convert here
            request_args['GracePeriod'] = int(request_args['GracePeriod'])
            request_args['HardTimeout'] = request_args['SoftTimeout'] + request_args['GracePeriod']

        # Only allow extra value update for assigned status
        cherrypy.log("INFO: Assign request, input args: %s ..." % request_args)
        try:
            workload.updateArguments(request_args)
        except Exception as ex:
            msg = traceback.format_exc()
            cherrypy.log("Error for request args %s: %s" % (request_args, msg))
            raise InvalidSpecParameterValue(str(ex))

        # validate/update OutputDatasets after ProcessingString and AcquisionEra is updated
        request_args['OutputDatasets'] = workload.listOutputDatasets()
        validateOutputDatasets(request_args['OutputDatasets'], workload.getDbsUrl())

        # legacy update schema to support ops script
        loadRequestSchema(workload, request_args)

        report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)
        workload.saveCouch(self.config.couch_host, self.config.couch_reqmgr_db)
        return report

    def _handleCascadeUpdate(self, workload, request_args, dn):
        """
        only closed-out and announced has this option
        """
        req_status = request_args["RequestStatus"]
        # check whehter it is casecade option
        if request_args["cascade"]:
            cascade_list = self._retrieveResubmissionChildren(workload.name())
            for req_name in cascade_list:
                self.reqmgr_db_service.updateRequestStatus(req_name, req_status, dn)

        # update original workflow status
        report = self.reqmgr_db_service.updateRequestStatus(workload.name(), req_status, dn)
        return report

    def _handleOnlyStateTransition(self, workload, req_status, dn):
        """
        It handles only the state transition. Special handling needed if a
        request is aborted or force completed.
        """
        if req_status in ["aborted", "force-complete"]:
            # cancel the workflow first
            self.gq_service.cancelWorkflow(workload.name())
        # update the request status in couchdb
        report = self.reqmgr_db_service.updateRequestStatus(workload.name(), req_status, dn)
        return report

    def _updateRequest(self, workload, request_args):
        """
        Dispatch a single request update to the appropriate handler
        depending on the presence of 'RequestStatus' and the target state;
        a None workload triggers the clone path.
        """
        dn = cherrypy.request.user.get("dn", "unknown")
        if workload is None:
            (workload, request_args) = self.initialize_clone(request_args["OriginalRequestName"])
            return self.post([workload, request_args])

        if "RequestStatus" not in request_args:
            report = self._handleNoStatusUpdate(workload, request_args)
        else:
            req_status = request_args["RequestStatus"]
            # assignment-approved only allow Priority update
            if len(request_args) == 2 and req_status == "assignment-approved":
                report = self._handleAssignmentApprovedTransition(workload, request_args, dn)
            elif len(request_args) > 1 and req_status == "assigned":
                report = self._handleAssignmentStateTransition(workload, request_args, dn)
            elif len(request_args) == 2 and req_status in ["closed-out", "announced"] and \
                    "cascade" in request_args:
                report = self._handleCascadeUpdate(workload, request_args, dn)
            elif len(request_args) == 1:
                # If status chnage is to aborted, force-complete, rejected, ignore other argument
                report = self._handleOnlyStateTransition(workload, req_status, dn)
            else:
                raise InvalidSpecParameterValue(
                    "can't update value except transition to assigned status: %s" % request_args)

        if report == 'OK':
            return {workload.name(): "OK"}
        else:
            return {workload.name(): "ERROR"}

    @restcall(formats=[('application/json', JSONFormat())])
    def put(self, workload_pair_list):
        """workloadPairList is a list of tuple containing (workload, requeat_args)"""
        report = []
        for workload, request_args in workload_pair_list:
            result = self._updateRequest(workload, request_args)
            report.append(result)
        return report

    @restcall(formats=[('application/json', JSONFormat())])
    def delete(self, request_name):
        """Delete the request document; raises HTTP 404 on Couch errors."""
        cherrypy.log("INFO: Deleting request document '%s' ..." % request_name)
        try:
            self.reqmgr_db.delete_doc(request_name)
        except CouchError as ex:
            msg = "ERROR: Delete failed."
            cherrypy.log(msg + " Reason: %s" % ex)
            raise cherrypy.HTTPError(404, msg)
        # TODO
        # delete should also happen on WMStats
        cherrypy.log("INFO: Delete '%s' done." % request_name)

    def _update_additional_request_args(self, workload, request_args):
        """
        add to request_args properties which is not initially set from user.
        This data will put in to couchdb.
        Update request_args here if additional information need to be put in couchdb
        """
        request_args['RequestWorkflow'] = sanitizeURL("%s/%s/%s/spec" % (request_args["CouchURL"],
                                                                         request_args["CouchWorkloadDBName"],
                                                                         workload.name()))['url']

        # Add the output datasets if necessary
        # for some bizarre reason OutpuDatasets is list of lists
        request_args['OutputDatasets'] = workload.listOutputDatasets()

        # Add initial priority only for the creation of the request
        request_args['InitialPriority'] = request_args["RequestPriority"]

        # TODO: remove this after reqmgr2 replice reqmgr (reqmgr2Only)
        request_args['ReqMgr2Only'] = True
        return

    @restcall(formats=[('application/json', JSONFormat())])
    def post(self, workload_pair_list, multi_update_flag=False, multi_names_flag=False):
        """
        Create and update couchDB with a new request.
        request argument is passed from validation
        (validation convert cherrypy.request.body data to argument)

        TODO:
        this method will have some parts factored out so that e.g. clone call
        can share functionality.

        NOTES:
        1) do not strip spaces, #4705 will fails upon injection with spaces;
            currently the chain relies on a number of things coming in #4705
        2) reqInputArgs = Utilities.unidecode(json.loads(body))
            (from ReqMgrRESTModel.putRequest)
        """
        # storing the request document into Couch
        if multi_update_flag:
            return self.put(workload_pair_list)
        if multi_names_flag:
            return self.get(name=workload_pair_list)

        out = []
        for workload, request_args in workload_pair_list:
            self._update_additional_request_args(workload, request_args)

            # legacy update schema to support ops script
            loadRequestSchema(workload, request_args)
            cherrypy.log("INFO: Create request, input args: %s ..." % request_args)
            workload.saveCouch(request_args["CouchURL"], request_args["CouchWorkloadDBName"],
                               metadata=request_args)
            out.append({'request': workload.name()})
        return out
def testUpdatePriorityService(self):
    """
    _testUpdatePriorityService_

    Check that we can update the priority correctly also
    check the available workflows feature
    """
    # build a ReReco spec and a global + local queue pair backed by the
    # test CouchDB instance
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file",
                                                  assignKwargs={'SiteWhitelist': ["T2_XX_SiteA"]})
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl,
                          UnittestFlag=True)
    localQ = localQueue(DbName='local_workqueue_t',
                        QueueURL=self.testInit.couchUrl,
                        CacheDir=self.testInit.testDir,
                        ParentQueueCouchUrl='%s/workqueue_t' % self.testInit.couchUrl,
                        ParentQueueInboxCouchDBName='workqueue_t_inbox')
    # Try a full chain of priority update and propagation
    self.assertTrue(globalQ.queueWork(specUrl, "RerecoSpec", "teamA") > 0)
    globalApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    globalApi.defaultOptions = {'reduce': True, 'group': True}
    # priority update must be reflected both in the spec and in every
    # stored workqueue element
    globalApi.updatePriority(specName, 100)
    self.assertEqual(globalQ.backend.getWMSpec(specName).priority(), 100)
    storedElements = globalQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    numWorks = localQ.pullWork({'T2_XX_SiteA': 10})
    self.assertTrue(numWorks > 0)
    # replicate from GQ to LQ manually
    localQ.backend.pullFromParent(continuous=False)
    # wait until replication is done
    time.sleep(2)
    localQ.processInboundWork(continuous=False)
    # elements pulled into the local queue keep the updated priority
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    localApi = WorkQueueDS(self.testInit.couchUrl, 'local_workqueue_t')
    # overwrite default - can't test with stale view
    localApi.defaultOptions = {'reduce': True, 'group': True}
    # a second update through the local queue API
    localApi.updatePriority(specName, 500)
    self.assertEqual(localQ.backend.getWMSpec(specName).priority(), 500)
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 500)
    # getAvailableWorkflows reports (name, priority) pairs
    availableWF = localApi.getAvailableWorkflows()
    self.assertEqual(availableWF, set([(specName, 500)]))
    # Attempt to update an inexistent workflow in the queue:
    # this must be a silent no-op, never an exception
    try:
        globalApi.updatePriority('NotExistent', 2)
    except Exception as ex:
        self.fail('No exception should be raised.: %s' % str(ex))
def testUpdatePriorityService(self):
    """
    _testUpdatePriorityService_

    Check that we can update the priority correctly also
    check the available workflows feature
    """
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file")
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl)
    localQ = localQueue(DbName='local_workqueue_t',
                        QueueURL=self.testInit.couchUrl,
                        CacheDir=self.testInit.testDir,
                        ParentQueueCouchUrl='%s/workqueue_t' % self.testInit.couchUrl,
                        ParentQueueInboxCouchDBName='workqueue_t_inbox')
    # Try a full chain of priority update and propagation
    self.assertTrue(globalQ.queueWork(specUrl, "RerecoSpec", "teamA") > 0)
    globalApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    globalApi.defaultOptions = {'reduce': True, 'group': True}
    globalApi.updatePriority(specName, 100)
    self.assertEqual(globalQ.backend.getWMSpec(specName).priority(), 100)
    storedElements = globalQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    self.assertTrue(localQ.pullWork({'T2_XX_SiteA': 10}) > 0)
    localQ.processInboundWork(continuous=False)
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    localApi = WorkQueueDS(self.testInit.couchUrl, 'local_workqueue_t')
    # overwrite default - can't test with stale view
    localApi.defaultOptions = {'reduce': True, 'group': True}
    localApi.updatePriority(specName, 500)
    self.assertEqual(localQ.backend.getWMSpec(specName).priority(), 500)
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 500)
    self.assertEqual(localApi.getAvailableWorkflows(), set([(specName, 500)]))
    # Attempt to update an inexistent workflow in the queue:
    # this must be a silent no-op, never an exception.
    try:
        globalApi.updatePriority('NotExistent', 2)
    except Exception as ex:
        # a bare "except:" also swallowed KeyboardInterrupt/SystemExit and
        # hid the failure reason; catch Exception and report the details,
        # matching the other variant of this test in the file
        self.fail('No exception should be raised.: %s' % str(ex))
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: WMAgent configuration; AnalyticsDataCollector,
                       AgentStatusWatcher and (optionally) Tier0Feeder
                       sections are consulted.
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        # local JSON dump of the collected monitoring info
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        """
        Build the list of couch replication specs (self.replicatorDocs) and
        (re)start continuous replications: agent wmstats -> central wmstats,
        plus either T0 request replication (Tier0) or the bidirectional
        workqueue inbox replication (production agent).
        NOTE: must run after setup() — it relies on self.localCouchMonitor.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox, and local inbox -> parent queue
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'], rp['target'], filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)
        # First cycle needs to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)
        except Exception as ex:
            # best-effort poller: log and retry on the next cycle
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with job counts by status/priority and per-site
                 unique/possible job estimates (wrapped by @timeFunction)
        """
        results = {}

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Check the health of every replication registered in
        setUpCouchDBReplication; returns a {name, status, error_message}
        summary dict. The very first cycle after (re)setup is skipped.
        """
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round; set to False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'],
                                                                  rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        # Escalation order matters: drain/disk issues only downgrade 'ok' to
        # 'warning'; data errors or too many couch processes force 'error'.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Push the collected agent monitoring document to central WMStats.
        """
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s",
                      results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (usually X509_USER_CERT) and raise either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)
        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring.
    """

    def __init__(self, config):
        """
        Initialize properties specified from config.

        :param config: WMAgent configuration object; reads the
            AnalyticsDataCollector section (summaryLevel, optional pluginName).
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None

    def setup(self, parameters):
        """
        Set db connections (couchdb, wmbs) to prepare to gather information.

        :param parameters: unused, required by the BaseWorkerThread interface.
        """
        # set the connection to local queue (a Tier-0 agent has no local queue)
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                 appName="WMStatsAgent")

        if hasattr(self.config, "Tier0Feeder"):
            # use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                     couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        # TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)

        # PEP 8: identity comparison against None ('is not None', not '!= None')
        if self.pluginName is not None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch, combine it and
        upload the resulting summary documents. Any failure is logged and
        recorded via DataUploadTime so the cycle can retry later.

        :param parameters: unused, required by the BaseWorkerThread interface.
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            # (lazy %-args: formatting is deferred to the logging framework)
            logging.info("""Combining data from Job Couch(%s), FWJR(%s), Batch Job(%s),
                         Finished Tasks(%s), Local Queue(%s) ...""",
                         len(jobInfoFromCouch), len(fwjrInfoFromCouch),
                         len(batchJobInfo), len(finishedTasks), len(localQInfo))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData, localQInfo)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            logging.info("%s requests Data combined,\n uploading request data...",
                         len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch,
                                                   finishedTasks, self.agentInfo,
                                                   uploadTime, self.summaryLevel)

            if self.plugin is not None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info("Request data upload success\n %s request, \nsleep for next cycle",
                         len(requestDocs))
            DataUploadTime.setInfo(uploadTime, "ok")
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            DataUploadTime.setInfo(False, str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
class Request(RESTEntity):
    """
    REST entity handling CRUD operations on ReqMgr request documents stored
    in the CouchDB request database (legacy implementation).
    """

    def __init__(self, app, api, config, mount):
        # main CouchDB database where requests/workloads are stored
        RESTEntity.__init__(self, app, api, config, mount)
        self.reqmgr_db = api.db_handler.get_db(config.couch_reqmgr_db)
        self.reqmgr_db_service = RequestDBWriter(self.reqmgr_db, couchapp="ReqMgr")
        # this is needed for the post validation
        self.reqmgr_aux_db = api.db_handler.get_db(config.couch_reqmgr_aux_db)
        # global workqueue, used to cancel work on aborted/force-complete requests
        self.gq_service = WorkQueue(config.couch_host, config.couch_workqueue_db)

    def _requestArgMapFromBrowser(self, request_args):
        """
        This is specific mapping function data from browser

        TO: give a key word so it doesn't have to loop though in general
        """
        # extracts request ids from "request-<id>" form keys, dropping the
        # "request-all" pseudo entry; mutates the input dicts in place
        docs = []
        for doc in request_args:
            for key in doc.keys():
                if key.startswith('request'):
                    rid = key.split('request-')[-1]
                    if rid != 'all':
                        docs.append(rid)
                    del doc[key]
        return docs

    def _validateGET(self, param, safe):
        """
        Move GET query arguments from param into safe; a single positional
        argument is interpreted as the request name.
        """
        # TODO: need proper validation but for now pass everything
        args_length = len(param.args)
        if args_length == 1:
            safe.kwargs["name"] = param.args[0]
            param.args.pop()
            return
        for prop in param.kwargs:
            safe.kwargs[prop] = param.kwargs[prop]
        for prop in safe.kwargs:
            del param.kwargs[prop]
        return

    def _validateRequestBase(self, param, safe, valFunc, requestName=None):
        """
        Validate a PUT/POST request body (or query args when no body is
        given) with valFunc and store the resulting (workload, args) pairs
        in safe.kwargs['workload_pair_list'].
        """
        data = cherrypy.request.body.read()
        if data:
            request_args = JsonWrapper.loads(data)
            if requestName:
                request_args["RequestName"] = requestName
            if isinstance(request_args, dict):
                request_args = [request_args]
        else:
            # actually this is error case
            #cherrypy.log(str(param.kwargs))
            request_args = {}
            for prop in param.kwargs:
                request_args[prop] = param.kwargs[prop]
            for prop in request_args:
                del param.kwargs[prop]
            if requestName:
                request_args["RequestName"] = requestName
            request_args = [request_args]

        safe.kwargs['workload_pair_list'] = []
        if isinstance(request_args, dict):
            request_args = [request_args]
        for args in request_args:
            workload, r_args = valFunc(args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))

    def _get_request_names(self, ids):
        "Extract request names from given documents"
        #cherrypy.log("request names %s" % ids)
        doc = {}
        if isinstance(ids, list):
            for rid in ids:
                doc[rid] = 'on'
        elif isinstance(ids, basestring):
            doc[ids] = 'on'
        docs = []
        for key in doc.keys():
            if key.startswith('request'):
                rid = key.split('request-')[-1]
                if rid != 'all':
                    docs.append(rid)
                del doc[key]
        return docs

    def _getMultiRequestArgs(self, multiRequestForm):
        """
        Split a browser multi-update form into (request_names, request_args);
        'ids' lists the requests, 'new_status' maps to RequestStatus.
        """
        request_args = {}
        for prop in multiRequestForm:
            if prop == "ids":
                request_names = self._get_request_names(multiRequestForm["ids"])
            elif prop == "new_status":
                request_args["RequestStatus"] = multiRequestForm[prop]
            # remove this
            #elif prop in ["CustodialSites", "AutoApproveSubscriptionSites"]:
            #    request_args[prop] = [multiRequestForm[prop]]
            else:
                request_args[prop] = multiRequestForm[prop]
        return request_names, request_args

    def _validateMultiRequests(self, param, safe, valFunc):
        """
        Validate a browser multi-update (same args applied to many requests);
        sets safe.kwargs['workload_pair_list'] and 'multi_update_flag'.
        """
        data = cherrypy.request.body.read()
        if data:
            request_names, request_args = self._getMultiRequestArgs(JsonWrapper.loads(data))
        else:
            # actually this is error case
            #cherrypy.log(str(param.kwargs))
            request_names, request_args = self._getMultiRequestArgs(param.kwargs)

        for prop in request_args:
            if prop == "RequestStatus":
                del param.kwargs["new_status"]
            else:
                del param.kwargs[prop]
        del param.kwargs["ids"]

        #remove this
        #tmp = []
        #for prop in param.kwargs:
        #    tmp.append(prop)
        #for prop in tmp:
        #    del param.kwargs[prop]

        safe.kwargs['workload_pair_list'] = []
        for request_name in request_names:
            request_args["RequestName"] = request_name
            workload, r_args = valFunc(request_args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))
        safe.kwargs["multi_update_flag"] = True

    def validate(self, apiobj, method, api, param, safe):
        # to make validate successful
        # move the validated argument to safe
        # make param empty
        # other wise raise the error
        try:
            if method in ['GET']:
                self._validateGET(param, safe)

            if method == 'PUT':
                args_length = len(param.args)
                if args_length == 1:
                    requestName = param.args[0]
                    param.args.pop()
                else:
                    requestName = None
                self._validateRequestBase(param, safe, validate_request_update_args, requestName)
                #TO: handle multiple clone
                # if len(param.args) == 2:
                #     #validate clone case
                #     if param.args[0] == "clone":
                #         param.args.pop()
                #         return None, request_args

            if method == 'POST':
                args_length = len(param.args)
                if args_length == 1 and param.args[0] == "multi_update":
                    #special case for multi update from browser.
                    param.args.pop()
                    self._validateMultiRequests(param, safe, validate_request_update_args)
                else:
                    self._validateRequestBase(param, safe, validate_request_create_args)
        except Exception as ex:
            #TODO add proper error message instead of trace back
            msg = traceback.format_exc()
            cherrypy.log("Error: %s" % msg)
            if hasattr(ex, "message"):
                if hasattr(ex.message, '__call__'):
                    msg = ex.message()
                else:
                    msg = str(ex)
            else:
                msg = str(ex)
            raise InvalidSpecParameterValue(msg)

    def initialize_clone(self, request_name):
        """
        Load an existing request document and build a new workload from it,
        with name and timestamps re-initialized for the clone.
        """
        requests = self.reqmgr_db_service.getRequestByNames(request_name)
        clone_args = requests.values()[0]
        # overwrite the name and time stamp.
        initialize_request_args(clone_args, self.config, clone=True)
        # timestamp status update
        spec = loadSpecByType(clone_args["RequestType"])
        workload = spec.factoryWorkloadConstruction(clone_args["RequestName"], clone_args)
        return (workload, clone_args)

    @restcall
    def get(self, **kwargs):
        """
        Returns request info depending on the conditions set by kwargs
        Currently defined kwargs are following.
        statusList, requestNames, requestType, prepID, inputDataset, outputDataset, dateRange
        If jobInfo is True, returns jobInfomation about the request as well.

        TODO:
        stuff like this has to filtered out from result of this call:
            _attachments: {u'spec': {u'stub': True, u'length': 51712, u'revpos': 2,
                           u'content_type': u'application/json'}}
            _id: maxa_RequestString-OVERRIDE-ME_130621_174227_9225
            _rev: 4-c6ceb2737793aaeac3f1cdf591593da4
        """
        # no query string at all: default to the latest 200 "running" requests
        if len(kwargs) == 0:
            kwargs['status'] = "running"
            options = {"descending": True, 'include_docs': True, 'limit': 200}
            request_docs = self.reqmgr_db.loadView("ReqMgr", "bystatus", options)
            return rows([request_docs])

        # list of status
        status = kwargs.get("status", False)
        # list of request names
        name = kwargs.get("name", False)
        request_type = kwargs.get("request_type", False)
        prep_id = kwargs.get("prep_id", False)
        inputdataset = kwargs.get("inputdataset", False)
        outputdataset = kwargs.get("outputdataset", False)
        date_range = kwargs.get("date_range", False)
        campaign = kwargs.get("campaign", False)
        workqueue = kwargs.get("workqueue", False)
        team = kwargs.get("team", False)
        # eventhing should be stale view. this only needs for test
        _nostale = kwargs.get("_nostale", False)
        option = {}
        if _nostale:
            self.reqmgr_db_service._setNoStale()

        request_info = []

        if status and not team and not request_type:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("bystatus", option, status))
        if status and team:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byteamandstatus", option,
                                                             [[team, status]]))
        if status and request_type:
            # NOTE(review): this branch queries the same "byteamandstatus" view
            # with the same [team, status] key as the branch above, ignoring
            # request_type — looks like a copy-paste; verify the intended view.
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byteamandstatus", option,
                                                             [[team, status]]))
        if name:
            request_info.append(self.reqmgr_db_service.getRequestByNames(name))
        if prep_id:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byprepid", option, prep_id))
        if inputdataset:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byinputdataset", option, inputdataset))
        if outputdataset:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byoutputdataset", option, outputdataset))
        if date_range:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("bydate", option, date_range))
        if campaign:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("bycampaign", option, campaign))
        if workqueue:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byworkqueue", option, workqueue))

        #get interaction of the request
        result = self._intersection_of_request_info(request_info)
        if len(result) == 0:
            return []
        return rows([result])

    def _intersection_of_request_info(self, request_info):
        """
        Return the requests present in every result dict of request_info,
        taking the document from the first result set.
        """
        requests = {}
        if len(request_info) < 1:
            return requests

        request_key_set = set(request_info[0].keys())
        for info in request_info:
            request_key_set = set(request_key_set) & set(info.keys())
        #TODO: need to assume some data maight not contains include docs
        for request_name in request_key_set:
            requests[request_name] = request_info[0][request_name]
        return requests

    def _get_couch_view(self, couchdb, couchapp, view, options, keys):
        """
        Query a couch view and return {doc_id: doc} with couch-internal
        fields stripped from each document.
        """
        if not options:
            options = {}
        options.setdefault("include_docs", True)
        if isinstance(keys, basestring):
            keys = [keys]
        result = couchdb.loadView(couchapp, view, options, keys)

        request_info = {}
        for item in result["rows"]:
            request_info[item["id"]] = item.get('doc', None)
            if request_info[item["id"]] != None:
                self.filterCouchInfo(request_info[item["id"]])
        return request_info

    #TODO move this out of this class
    def filterCouchInfo(self, couchInfo):
        # drop couch-internal bookkeeping fields from the document in place
        for key in ['_rev', '_attachments']:
            if key in couchInfo:
                del couchInfo[key]

    def get_wmstats_view(self, view, options, keys):
        # NOTE(review): self.wmstatsCouch is not initialized in __init__ as far
        # as visible here — confirm it is set elsewhere before this is called.
        return self._get_couch_view(self.wmstatsCouch, "WMStats", view, options, keys)

    def _combine_request(self, request_info, requestAgentUrl, cache):
        """
        Build [request, agentUrl] pairs for the requests in request_info.
        """
        keys = {}
        requestAgentUrlList = []
        for row in requestAgentUrl["rows"]:
            request = row["key"][0]
            # NOTE(review): 'keys' starts empty, so keys[request] raises
            # KeyError on first access — this likely should be
            # 'if request not in keys:'. Confirm before relying on this method.
            if not keys[request]:
                keys[request] = []
            keys[request].append(row["key"][1])

        for request in request_info:
            for agentUrl in keys[request]:
                requestAgentUrlList.append([request, agentUrl])

        return requestAgentUrlList

    def _updateRequest(self, workload, request_args):
        """
        Update an existing request document; when no workload is given the
        call is treated as a clone and delegated to post().
        """
        if workload == None:
            (workload, request_args) = self.initialize_clone(request_args["OriginalRequestName"])
            return self.post(workload, request_args)

        dn = cherrypy.request.user.get("dn", "unknown")

        if "total_jobs" in request_args:
            # only GQ update this stats
            # request_args should contain only 4 keys 'total_jobs', 'input_lumis', 'input_events', 'input_num_files'}
            report = self.reqmgr_db_service.updateRequestStats(workload.name(), request_args)
        # if is not just updating status
        else:
            if len(request_args) > 1 or "RequestStatus" not in request_args:
                try:
                    workload.updateArguments(request_args)
                except Exception as ex:
                    msg = traceback.format_exc()
                    cherrypy.log("Error for request args %s: %s" % (request_args, msg))
                    raise InvalidSpecParameterValue(str(ex))
                # trailing / is needed for the savecouchUrl function
                workload.saveCouch(self.config.couch_host, self.config.couch_reqmgr_db)

            req_status = request_args.get("RequestStatus", None)
            # If it is aborted or force-complete transition call workqueue to cancel the request
            if req_status == "aborted" or req_status == "force-complete":
                self.gq_service.cancelWorkflow(workload.name())

            report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)

        if report == 'OK':
            return {workload.name(): "OK"}
        else:
            return {workload.name(): "ERROR"}

    @restcall
    def put(self, workload_pair_list):
        "workloadPairList is a list of tuple containing (workload, requeat_args)"
        report = []
        for workload, request_args in workload_pair_list:
            result = self._updateRequest(workload, request_args)
            report.append(result)
        return report

    @restcall
    def delete(self, request_name):
        """
        Delete the request document from the request CouchDB; raises HTTP 404
        when the deletion fails.
        """
        cherrypy.log("INFO: Deleting request document '%s' ..." % request_name)
        try:
            self.reqmgr_db.delete_doc(request_name)
        except CouchError as ex:
            msg = "ERROR: Delete failed."
            cherrypy.log(msg + " Reason: %s" % ex)
            raise cherrypy.HTTPError(404, msg)
        # TODO
        # delete should also happen on WMStats
        cherrypy.log("INFO: Delete '%s' done." % request_name)

    @restcall
    def post(self, workload_pair_list, multi_update_flag=False):
        """
        Create and update couchDB with a new request.
        request argument is passed from validation
        (validation convert cherrypy.request.body data to argument)

        TODO:
        this method will have some parts factored out so that e.g. clone call
        can share functionality.

        NOTES:
        1) do not strip spaces, #4705 will fails upon injection with spaces ;
            currently the chain relies on a number of things coming in #4705
        2) reqInputArgs = Utilities.unidecode(JsonWrapper.loads(body))
            (from ReqMgrRESTModel.putRequest)
        """
        # storing the request document into Couch
        if multi_update_flag:
            return self.put(workload_pair_list)

        out = []
        for workload, request_args in workload_pair_list:
            cherrypy.log("INFO: Create request, input args: %s ..." % request_args)
            request_args['RequestWorkflow'] = sanitizeURL(
                "%s/%s/%s/spec" % (request_args["CouchURL"],
                                   request_args["CouchWorkloadDBName"],
                                   workload.name()))['url']
            workload.saveCouch(request_args["CouchURL"], request_args["CouchWorkloadDBName"],
                               metadata=request_args)
            out.append({'request': workload.name()})
        return out
class Request(RESTEntity):
    """
    REST entity handling CRUD operations on ReqMgr2 request documents stored
    in the CouchDB request database.
    """

    def __init__(self, app, api, config, mount):
        # main CouchDB database where requests/workloads are stored
        RESTEntity.__init__(self, app, api, config, mount)
        self.reqmgr_db = api.db_handler.get_db(config.couch_reqmgr_db)
        self.reqmgr_db_service = RequestDBWriter(self.reqmgr_db, couchapp="ReqMgr")
        # this need for the post validtiaon
        self.gq_service = WorkQueue(config.couch_host, config.couch_workqueue_db)

    def _validateGET(self, param, safe):
        """
        Validate GET arguments: one positional arg means a request name,
        otherwise multi-valued query parameters are normalized into lists
        and moved to safe.kwargs.
        """
        # TODO: need proper validation but for now pass everything
        args_length = len(param.args)
        if args_length == 1:
            safe.kwargs["name"] = param.args[0]
            param.args.pop()
            return

        # these parameters only ever carry a single value
        no_multi_key = ["detail", "_nostale", "date_range", "common_dict"]
        for key, value in param.kwargs.items():
            # convert string to list
            if key not in no_multi_key and isinstance(value, basestring):
                param.kwargs[key] = [value]

        detail = param.kwargs.get('detail', True)
        if detail in (False, "false", "False", "FALSE"):
            detail = False

        # bulk detail queries over archived statuses are too expensive
        if "status" in param.kwargs and detail:
            for status in param.kwargs["status"]:
                if status.endswith("-archived"):
                    raise InvalidSpecParameterValue(
                        """Can't retrieve bulk archived status requests with detail option True,
                           set detail=false or use other search arguments""")

        for prop in param.kwargs.keys():
            safe.kwargs[prop] = param.kwargs.pop(prop)
        return

    def _validateRequestBase(self, param, safe, valFunc, requestName=None):
        """
        Validate a PUT/POST request body with valFunc and store the
        resulting (workload, args) pairs in safe.kwargs['workload_pair_list'].
        """
        data = cherrypy.request.body.read()
        if data:
            request_args = json.loads(data)
        else:
            request_args = {}

        cherrypy.log('Updating request "%s" with these user-provided args: %s'
                     % (requestName, request_args))

        # In case key args are also passed and request body also exists.
        # If the request.body is dictionary update the key args value as well
        if isinstance(request_args, dict):
            for prop in param.kwargs.keys():
                request_args[prop] = param.kwargs.pop(prop)
            if requestName:
                request_args["RequestName"] = requestName
            request_args = [request_args]

        safe.kwargs['workload_pair_list'] = []
        for args in request_args:
            workload, r_args = valFunc(args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))

    def _get_request_names(self, ids):
        "Extract request names from given documents"
        # cherrypy.log("request names %s" % ids)
        doc = {}
        if isinstance(ids, list):
            for rid in ids:
                doc[rid] = 'on'
        elif isinstance(ids, basestring):
            doc[ids] = 'on'
        docs = []
        for key in doc.keys():
            if key.startswith('request'):
                rid = key.split('request-')[-1]
                if rid != 'all':
                    docs.append(rid)
                del doc[key]
        return docs

    def _getMultiRequestArgs(self, multiRequestForm):
        """
        Split a browser multi-update form into (request_names, request_args);
        'ids' lists the requests, 'new_status' maps to RequestStatus.
        """
        request_args = {}
        for prop in multiRequestForm:
            if prop == "ids":
                request_names = self._get_request_names(multiRequestForm["ids"])
            elif prop == "new_status":
                request_args["RequestStatus"] = multiRequestForm[prop]
            # remove this
            # elif prop in ["CustodialSites", "AutoApproveSubscriptionSites"]:
            #    request_args[prop] = [multiRequestForm[prop]]
            else:
                request_args[prop] = multiRequestForm[prop]
        return request_names, request_args

    def _validateMultiRequests(self, param, safe, valFunc):
        """
        Validate a browser multi-update (same args applied to many requests);
        sets safe.kwargs['workload_pair_list'] and 'multi_update_flag'.
        """
        data = cherrypy.request.body.read()
        if data:
            request_names, request_args = self._getMultiRequestArgs(json.loads(data))
        else:
            # actually this is error case
            # cherrypy.log(str(param.kwargs))
            request_names, request_args = self._getMultiRequestArgs(param.kwargs)

        for prop in request_args:
            if prop == "RequestStatus":
                del param.kwargs["new_status"]
            else:
                del param.kwargs[prop]
        del param.kwargs["ids"]

        safe.kwargs['workload_pair_list'] = []
        for request_name in request_names:
            request_args["RequestName"] = request_name
            workload, r_args = valFunc(request_args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))
        safe.kwargs["multi_update_flag"] = True

    def _getRequestNamesFromBody(self, safe):
        """
        Read the request body as a JSON list of request names and flag the
        call as a by-names lookup.
        """
        request_names = json.loads(cherrypy.request.body.read())
        safe.kwargs['workload_pair_list'] = request_names
        safe.kwargs["multi_names_flag"] = True

    def validate(self, apiobj, method, api, param, safe):
        # to make validate successful
        # move the validated argument to safe
        # make param empty
        # other wise raise the error
        try:
            if method == 'GET':
                self._validateGET(param, safe)
            elif method == 'PUT':
                args_length = len(param.args)
                if args_length == 1:
                    requestName = param.args[0]
                    param.args.pop()
                else:
                    requestName = None
                self._validateRequestBase(param, safe, validate_request_update_args, requestName)
            elif method == 'POST':
                args_length = len(param.args)
                if args_length == 2 and param.args[0] == "clone":
                    # handles clone workflow.- don't validtate args here
                    param.kwargs['OriginalRequestName'] = param.args[1]
                    param.args.pop()
                    param.args.pop()
                    self._validateRequestBase(param, safe, validate_clone_create_args)
                elif args_length == 1 and param.args[0] == "multi_update":
                    # special case for multi update from browser.
                    param.args.pop()
                    self._validateMultiRequests(param, safe, validate_request_update_args)
                elif args_length == 1 and param.args[0] == "bynames":
                    # special case for multi update from browser.
                    param.args.pop()
                    self._getRequestNamesFromBody(safe)
                else:
                    self._validateRequestBase(param, safe, validate_request_create_args)
        except InvalidSpecParameterValue as ex:
            raise ex
        except Exception as ex:
            # TODO add proper error message instead of trace back
            msg = traceback.format_exc()
            cherrypy.log("Error: %s" % msg)
            if hasattr(ex, "message"):
                if hasattr(ex.message, '__call__'):
                    msg = ex.message()
                else:
                    msg = str(ex)
            else:
                msg = str(ex)
            raise InvalidSpecParameterValue(msg)

    def _maskResult(self, mask, result):
        """
        If a mask of parameters was provided in the query string, then filter
        the request key/values accordingly.
        :param mask: a list of strings (keys of the request dictionary)
        :param result: a dict key'ed by the request name, with the whole
            request dictionary as a value
        :return: updates the result object in place and returns it (dict)
        """
        if len(mask) == 1 and mask[0] == "DAS":
            mask = ReqMgrConfigDataCache.getConfig("DAS_RESULT_FILTER")["filter_list"]

        if len(mask) > 0:
            maskedResult = {}
            for reqName, reqDict in result.items():
                reqInfo = RequestInfo(reqDict)
                maskedResult.setdefault(reqName, {})
                for maskKey in mask:
                    foundValue = reqInfo.get(maskKey, None)
                    maskedResult[reqName].update({maskKey: foundValue})
            return maskedResult
        else:
            return result

    @restcall(formats=[('text/plain', PrettyJSONFormat()), ('application/json', JSONFormat())])
    def get(self, **kwargs):
        """
        Returns request info depending on the conditions set by kwargs
        Currently defined kwargs are following.
        statusList, requestNames, requestType, prepID, inputDataset, outputDataset, dateRange
        If jobInfo is True, returns jobInfomation about the request as well.

        TODO:
        stuff like this has to masked out from result of this call:
            _attachments: {u'spec': {u'stub': True, u'length': 51712, u'revpos': 2,
                           u'content_type': u'application/json'}}
            _id: maxa_RequestString-OVERRIDE-ME_130621_174227_9225
            _rev: 4-c6ceb2737793aaeac3f1cdf591593da4
        """
        ### pop arguments unrelated to the user query
        mask = kwargs.pop("mask", [])
        detail = kwargs.pop("detail", True)
        common_dict = int(kwargs.pop("common_dict", 0))  # modifies the response format
        nostale = kwargs.pop("_nostale", False)

        ### these are the query strings supported by this API
        status = kwargs.get("status", [])
        name = kwargs.get("name", [])
        request_type = kwargs.get("request_type", [])
        prep_id = kwargs.get("prep_id", [])
        inputdataset = kwargs.get("inputdataset", [])
        outputdataset = kwargs.get("outputdataset", [])
        date_range = kwargs.get("date_range", False)
        campaign = kwargs.get("campaign", [])
        team = kwargs.get("team", [])
        mc_pileup = kwargs.get("mc_pileup", [])
        data_pileup = kwargs.get("data_pileup", [])
        requestor = kwargs.get("requestor", [])

        # further tweaks to the couch queries
        if len(status) == 1 and status[0] == "ACTIVE":
            status = ACTIVE_STATUS
        if detail in (False, "false", "False", "FALSE"):
            option = {"include_docs": False}
        else:
            option = {"include_docs": True}
        # everything should be stale view. this only needs for test
        if nostale:
            self.reqmgr_db_service._setNoStale()

        request_info = []
        queryMatched = False  # flag to avoid calling the same view twice

        # composite-key views for the common two/three-parameter combinations
        if len(kwargs) == 2:
            if status and team:
                query_keys = [[t, s] for t in team for s in status]
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "byteamandstatus", option, query_keys))
                queryMatched = True
            elif status and request_type:
                query_keys = [[s, rt] for rt in request_type for s in status]
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "requestsbystatusandtype", option, query_keys))
                queryMatched = True
            elif status and requestor:
                query_keys = [[s, r] for r in requestor for s in status]
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bystatusandrequestor", option, query_keys))
                queryMatched = True
        elif len(kwargs) == 3:
            if status and request_type and requestor:
                query_keys = [[s, rt, req] for s in status for rt in request_type for req in requestor]
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bystatusandtypeandrequestor", option, query_keys))
                queryMatched = True

        # anything else that hasn't matched the query combination above
        if not queryMatched:
            if status:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bystatus", option, status))
            if name:
                request_info.append(self.reqmgr_db_service.getRequestByNames(name))
            if request_type:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bytype", option, request_type))
            if prep_id:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "byprepid", option, prep_id))
            if inputdataset:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "byinputdataset", option, inputdataset))
            if outputdataset:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "byoutputdataset", option, outputdataset))
            if date_range:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bydate", option, date_range))
            if campaign:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bycampaign", option, campaign))
            if mc_pileup:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bymcpileup", option, mc_pileup))
            if data_pileup:
                request_info.append(self.reqmgr_db_service.getRequestByCouchView(
                    "bydatapileup", option, data_pileup))

        # get the intersection of the request data
        result = self._intersection_of_request_info(request_info)
        if not result:
            return []

        result = self._maskResult(mask, result)

        if not option["include_docs"]:
            return result.keys()

        # set the return format. default format has request name as a key
        # if is set to one it returns list of dictionary with RequestName field.
        if common_dict == 1:
            response_list = result.values()
        else:
            response_list = [result]
        return rows(response_list)

    def _intersection_of_request_info(self, request_info):
        """
        Return the requests present in every result dict of request_info,
        taking the document from the first result set.
        """
        requests = {}
        if len(request_info) < 1:
            return requests

        request_key_set = set(request_info[0].keys())
        for info in request_info:
            request_key_set = set(request_key_set) & set(info.keys())
        # TODO: need to assume some data maight not contains include docs
        for request_name in request_key_set:
            requests[request_name] = request_info[0][request_name]
        return requests

    def _retrieveResubmissionChildren(self, request_name):
        """
        Fetches all the direct children requests from CouchDB.
        Response from CouchDB view is in the following format:
            [{u'id': u'child_workflow_name',
              u'key': u'parent_workflow_name',
              u'value': 'current_request_status'}]
        :param request_name: string with the parent workflow name
        :return: a list of dictionaries with the parent and child workflow and the child status
        """
        result = self.reqmgr_db.loadView('ReqMgr', 'childresubmissionrequests',
                                         keys=[request_name])['rows']
        childrenRequestAndStatus = []
        for childInfo in result:
            childrenRequestAndStatus.append(childInfo)
            # recurse to pick up grandchildren and deeper resubmissions
            childrenRequestAndStatus.extend(self._retrieveResubmissionChildren(childInfo['id']))
        return childrenRequestAndStatus

    def _handleNoStatusUpdate(self, workload, request_args, dn):
        """
        For no-status update, we only support the following parameters:
         1. RequestPriority
         2. Global workqueue statistics, while acquiring a workflow
        """
        if 'RequestPriority' in request_args:
            # Yes, we completely ignore any other arguments posted by the user (web UI case)
            request_args = {'RequestPriority': request_args['RequestPriority']}
            validate_request_priority(request_args)
            # must update three places: GQ elements, workload_cache and workload spec
            self.gq_service.updatePriority(workload.name(), request_args['RequestPriority'])
            report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)
            workload.setPriority(request_args['RequestPriority'])
            workload.saveCouchUrl(workload.specUrl())
            cherrypy.log('Updated priority of "{}" to: {}'.format(workload.name(),
                                                                  request_args['RequestPriority']))
        elif workqueue_stat_validation(request_args):
            report = self.reqmgr_db_service.updateRequestStats(workload.name(), request_args)
            cherrypy.log('Updated workqueue statistics of "{}", with: {}'.format(workload.name(),
                                                                                 request_args))
        else:
            msg = "There are invalid arguments for no-status update: %s" % request_args
            raise InvalidSpecParameterValue(msg)

        return report

    def _handleAssignmentApprovedTransition(self, workload, request_args, dn):
        """
        Allows only two arguments: RequestStatus and RequestPriority
        """
        if "RequestPriority" not in request_args:
            msg = "There are invalid arguments for assignment-approved transition: %s" % request_args
            raise InvalidSpecParameterValue(msg)

        validate_request_priority(request_args)
        report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)
        return report

    def _handleAssignmentStateTransition(self, workload, request_args, dn):
        """
        Handle the transition to 'assigned': update the workload arguments,
        derive output datasets/LFN bases, save the spec and then update the
        request document.
        """
        if ('SoftTimeout' in request_args) and ('GracePeriod' in request_args):
            request_args['HardTimeout'] = request_args['SoftTimeout'] + request_args['GracePeriod']

        # Only allow extra value update for assigned status
        cherrypy.log("Assign request %s, input args: %s ..." % (workload.name(), request_args))
        try:
            workload.updateArguments(request_args)
        except Exception as ex:
            msg = traceback.format_exc()
            cherrypy.log("Error for request args %s: %s" % (request_args, msg))
            raise InvalidSpecParameterValue(str(ex))

        # validate/update OutputDatasets after ProcessingString and AcquisionEra is updated
        request_args['OutputDatasets'] = workload.listOutputDatasets()
        validateOutputDatasets(request_args['OutputDatasets'], workload.getDbsUrl())

        # by default, it contains all unmerged LFNs (used by sites to protect the unmerged area)
        request_args['OutputModulesLFNBases'] = workload.listAllOutputModulesLFNBases()

        # Add parentage relation for step chain, task chain:
        chainMap = workload.getChainParentageSimpleMapping()
        if chainMap:
            request_args["ChainParentageMap"] = chainMap

        # save the spec first before update the reqmgr request status to prevent race condition
        # when workflow is pulled to GQ before site white list is updated
        workload.saveCouch(self.config.couch_host, self.config.couch_reqmgr_db)
        report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)
        return report

    def _handleOnlyStateTransition(self, workload, request_args, dn):
        """
        It handles only the state transition. Special handling needed if
        a request is aborted or force completed.
        """
        # if we got here, then the main workflow has been already validated
        # and the status transition is allowed
        req_status = request_args["RequestStatus"]
        cascade = request_args.get("cascade", False)

        if req_status in ["aborted", "force-complete"]:
            # cancel the workflow first
            self.gq_service.cancelWorkflow(workload.name())

        # cascade option is only supported for these 3 statuses. If set, we need to
        # find all the children requests and perform the same status transition
        if req_status in ["rejected", "closed-out", "announced"] and cascade:
            childrenNamesAndStatus = self._retrieveResubmissionChildren(workload.name())
            msg = "Workflow {} has {} ".format(workload.name(), len(childrenNamesAndStatus))
            msg += "children workflows to have a status transition to: {}".format(req_status)
            cherrypy.log(msg)
            for childInfo in childrenNamesAndStatus:
                if check_allowed_transition(childInfo['value'], req_status):
                    cherrypy.log('Updating request status for {} to {}.'.format(childInfo['id'],
                                                                                req_status))
                    self.reqmgr_db_service.updateRequestStatus(childInfo['id'], req_status, dn)
                else:
                    msg = "Status transition from {} to {} ".format(childInfo['value'], req_status)
                    msg += "not allowed for workflow: {}, skipping it!".format(childInfo['id'])
                    cherrypy.log(msg)

        # then update the original/parent workflow status in couchdb
        cherrypy.log('Updating request status for {} to {}.'.format(workload.name(), req_status))
        report = self.reqmgr_db_service.updateRequestStatus(workload.name(), req_status, dn)
        return report

    def _updateRequest(self, workload, request_args):
        """
        Dispatch a request update to the proper handler based on the
        presence of RequestStatus and the target status.
        """
        dn = get_user_info().get("dn", "unknown")

        if "RequestStatus" not in request_args:
            report = self._handleNoStatusUpdate(workload, request_args, dn)
        else:
            req_status = request_args["RequestStatus"]
            if len(request_args) == 2 and req_status == "assignment-approved":
                report = self._handleAssignmentApprovedTransition(workload, request_args, dn)
            elif len(request_args) > 1 and req_status == "assigned":
                report = self._handleAssignmentStateTransition(workload, request_args, dn)
            elif len(request_args) == 1 or (len(request_args) == 2 and "cascade" in request_args):
                report = self._handleOnlyStateTransition(workload, request_args, dn)
            else:
                msg = "There are invalid arguments with this status transition: %s" % request_args
                raise InvalidSpecParameterValue(msg)

        if report == 'OK':
            return {workload.name(): "OK"}
        return {workload.name(): "ERROR"}

    @restcall(formats=[('application/json', JSONFormat())])
    def put(self, workload_pair_list):
        """workloadPairList is a list of tuple containing (workload, request_args)"""
        report = []
        for workload, request_args in workload_pair_list:
            result = self._updateRequest(workload, request_args)
            report.append(result)
        return report

    @restcall(formats=[('application/json', JSONFormat())])
    def delete(self, request_name):
        """
        Delete the request document from the request CouchDB; raises HTTP 404
        when the deletion fails.
        """
        cherrypy.log("INFO: Deleting request document '%s' ..." % request_name)
        try:
            self.reqmgr_db.delete_doc(request_name)
        except CouchError as ex:
            msg = "ERROR: Delete failed."
            cherrypy.log(msg + " Reason: %s" % ex)
            raise cherrypy.HTTPError(404, msg)
        # TODO
        # delete should also happen on WMStats
        cherrypy.log("INFO: Delete '%s' done." % request_name)

    def _update_additional_request_args(self, workload, request_args):
        """
        add to request_args properties which is not initially set from user.
        This data will put in to couchdb.
Update request_args here if additional information need to be put in couchdb """ request_args['RequestWorkflow'] = sanitizeURL("%s/%s/%s/spec" % (request_args["CouchURL"], request_args["CouchWorkloadDBName"], workload.name()))['url'] # Add the output datasets if necessary # for some bizarre reason OutpuDatasets is list of lists request_args['OutputDatasets'] = workload.listOutputDatasets() # Add initial priority only for the creation of the request request_args['InitialPriority'] = request_args["RequestPriority"] return @restcall(formats=[('application/json', JSONFormat())]) def post(self, workload_pair_list, multi_update_flag=False, multi_names_flag=False): """ Create and update couchDB with a new request. request argument is passed from validation (validation convert cherrypy.request.body data to argument) TODO: this method will have some parts factored out so that e.g. clone call can share functionality. NOTES: 1) do not strip spaces, #4705 will fails upon injection with spaces; currently the chain relies on a number of things coming in #4705 2) reqInputArgs = Utilities.unidecode(json.loads(body)) (from ReqMgrRESTModel.putRequest) """ # storing the request document into Couch if multi_update_flag: return self.put(workload_pair_list) if multi_names_flag: return self.get(name=workload_pair_list) out = [] for workload, request_args in workload_pair_list: self._update_additional_request_args(workload, request_args) cherrypy.log("Create request, input args: %s ..." % request_args) try: workload.saveCouch(request_args["CouchURL"], request_args["CouchWorkloadDBName"], metadata=request_args) out.append({'request': workload.name()}) except Exception as ex: # then it failed to add the spec file as attachment # we better delete the original request to avoid confusion in wmstats cherrypy.log("Error saving request spec to couch: %s " % str(ex)) self.delete(request_args['RequestName']) return out
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        proxyArgs = {'logger': logging.getLogger(), 'cleanEnvironment': True}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                               'certificate': {'error': 10, 'warning': 20}}

        # Monitoring setup
        self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
        self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
        self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
        self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
        self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        if hasattr(self.config, "Tier0Feeder"):
            self.isT0agent = True
            self.producer = "tier0wmagent"
        else:
            self.isT0agent = False
            self.producer = "wmagent"
            localWQUrl = config.AnalyticsDataCollector.localQueueURL
            self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        """
        Defines the set of couchdb replications this agent needs (wmstats,
        plus either T0 request db or the workqueue inbox pair), removes any
        stale replicator documents and starts continuous replication for each.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.General.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        if self.isT0agent:
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            # replicate in both directions: global queue -> local inbox and back
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'], filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # First cycle need to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkCredLifetime(agentInfo, "proxy")
            self.checkCredLifetime(agentInfo, "certificate")

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not self.isT0agent:
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            self.uploadAgentInfoToCentralWMStats(agentInfo)

            self.buildMonITDocs(agentInfo)
        except Exception as ex:
            # poller cycle must never die; log and retry on the next cycle
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with workqueue statistics by status/priority and per-site summaries
        """
        results = {}
        wqStates = ['Available', 'Acquired']

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(wqStates)
        uniSites, posSites = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Checks the health of every replication set up in setUpCouchDBReplication.
        :return: dict with 'name', 'status' ('ok'/'error') and 'error_message'
        """
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Add some required fields to the document before it can get uploaded
        to WMStats.
        :param agentInfo: dict with agent stats to be posted to couchdb
        """
        agentInfo['_id'] = agentInfo["agent_url"]
        agentInfo['timestamp'] = int(time.time())
        agentInfo['type'] = "agent_info"
        # directly upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo,
                                                       propertiesToKeep=["data_last_update", "data_error"])
        except Exception as e:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(e))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s",
                      results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s",
                      results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s",
                      results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s",
                      results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s",
                      results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s",
                      results['sitePendCountByPrio'])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Check the credential lifetime. Usually X509_USER_PROXY or X509_USER_CERT
        and raise either a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: same dictionary object plus additional keys/values if needed.
        """
        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        elif credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        daysLeft = secsLeft / (60 * 60 * 24)

        # escalate agent status according to the per-credential day thresholds
        if daysLeft <= self.credThresholds[credType]['error']:
            credWarning = True
            agInfo['status'] = "error"
        elif daysLeft <= self.credThresholds[credType]['warning']:
            credWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            credWarning = False

        if credWarning:
            warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile)
            warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.)
            # both proxy and certificate warnings are appended under 'proxy_warning'
            agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
            logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. It creates 5 different type of documents:
         * priority information
         * site information
         * work information
         * agent information
         * agent health information
        Note that the internal methods are popping some metrics out of dataStats
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        allDocs = self._buildMonITPrioDocs(dataStats)
        allDocs.extend(self._buildMonITSitesDocs(dataStats))
        allDocs.extend(self._buildMonITWorkDocs(dataStats))
        allDocs.extend(self._buildMonITWMBSDocs(dataStats))
        allDocs.extend(self._buildMonITAgentDocs(dataStats))
        allDocs.extend(self._buildMonITHealthDocs(dataStats))
        allDocs.extend(self._buildMonITSummaryDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within that
        priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop('sitePendCountByPrio', [])

        for site, item in viewitems(sitePendCountByPrio):
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            if list(item) == [0]:
                continue
            for prio, jobs in viewitems(item):
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs

    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in order
        to build a `site_info` document type for MonIT.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            # T0 has no local workqueue, thus no per-site WQ job estimates
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop('possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop('uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite:
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0
                if site in possibleJobsPerSite[status]:
                    siteDoc[jobKey] = possibleJobsPerSite[status][site]['sum_jobs']
                    siteDoc[elemKey] = possibleJobsPerSite[status][site]['num_elem']
                # NOTE(review): assumes uniqueJobsPerSite has the same status keys as
                # possibleJobsPerSite (both come from getGlobalSiteStatusSummary) — confirm
                if site in uniqueJobsPerSite[status]:
                    siteDoc[uniJobKey] = uniqueJobsPerSite[status][site]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information order by WQE status and build
        statistics for the workload in terms of workqueue elements and top
        level jobs.
        Using the WMBS data, also builds documents to show the amount of
        work in 'created' and 'executing' WMBS status.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        workDocs = []
        if self.isT0agent:
            return workDocs

        docType = "wma_work_info"
        workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        for status, info in viewitems(workByStatus):
            workDoc = {}
            workDoc['type'] = docType
            workDoc['status'] = status
            workDoc['num_elem'] = info.get('num_elem', 0)
            workDoc['sum_jobs'] = info.get('sum_jobs', 0)
            workDocs.append(workDoc)

        return workDocs

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work
        in 'created' and 'executing' WMBS status.
        It also builds a document for every single wmbs_status in the database.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        docType = "wma_wmbs_info"
        wmbsDocs = []
        wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop('wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop('wmbsExecutingTypeCount', {})
        for jobType in wmbsCreatedTypeCount:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['job_type'] = jobType
            wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType]
            # NOTE(review): raises KeyError if a job type exists in the 'created' map but
            # not in the 'executing' map — presumably getAgentMonitoring keeps them in
            # sync; confirm, otherwise use .get(jobType, 0)
            wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType]
            wmbsDocs.append(wmbsDoc)

        docType = "wma_wmbs_state_info"
        wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['wmbs_status'] = wmbsStatus
            wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus]
            wmbsDocs.append(wmbsDoc)

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of amount of jobs in different statuses.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop('activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop('completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDoc = {}
            agentDoc['type'] = docType
            agentDoc['schedd_status'] = schedStatus
            agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus]
            # NOTE(review): raises KeyError if a schedd status is present only in the
            # active map — presumably both maps share the same status keys; confirm
            agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus]
            agentDocs.append(agentDoc)

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Creates documents with specific agent information, status of
        each component and worker thread (similar to what is shown in
        wmstats) and also some very basic performance numbers.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_health_info"
        healthDocs = []
        workersStatus = dataStats.pop('workers', {})
        for worker in workersStatus:
            healthDoc = {}
            healthDoc['type'] = docType
            healthDoc['worker_name'] = worker['name']
            healthDoc['worker_state'] = worker['state']
            healthDoc['worker_poll'] = worker['poll_interval']
            healthDoc['worker_last_hb'] = worker['last_updated']
            healthDoc['worker_cycle_time'] = worker['cycle_time']
            healthDocs.append(healthDoc)
        return healthDocs

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a document with the very basic agent info used
        in the wmstats monitoring tab.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_summary_info"
        summaryDocs = []
        summaryDoc = {}
        summaryDoc['type'] = docType
        summaryDoc['agent_team'] = dataStats['agent_team']
        summaryDoc['agent_version'] = dataStats['agent_version']
        summaryDoc['agent_status'] = dataStats['status']
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO']['total_query_time']
        summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO']['total_query_time']
        summaryDoc['drain_mode'] = dataStats['drain_mode']
        summaryDoc['down_components'] = dataStats['down_components']
        summaryDocs.append(summaryDoc)
        return summaryDocs

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        :param docs: list of documents/dicts to be posted
        :param agentUrl: agent URL stamped onto every document
        :param timeS: timestamp used for every AMQ notification
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return
        # add mandatory information for every single document
        for doc in docs:
            doc['agent_url'] = agentUrl

        docType = "cms_%s_info" % self.producer
        notifications = []

        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=self.producer,
                                topic=self.topicAMQ,
                                validation_schema=None,
                                host_and_ports=self.hostPortAMQ,
                                logger=logging)

            for doc in docs:
                singleNotif, _, _ = stompSvc.make_notification(payload=doc, docType=docType,
                                                               ts=timeS, dataSubfield="payload")
                notifications.append(singleNotif)

            failures = stompSvc.send(notifications)
            msg = "%i out of %i documents successfully sent to AMQ" % (len(notifications) - len(failures),
                                                                       len(notifications))
            logging.info(msg)
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s", str(ex))

        return
class JobUpdaterPoller(BaseWorkerThread):
    """
    _JobUpdaterPoller_

    Poller class for the JobUpdater: keeps the priorities of active
    workflows in WMBS/WorkQueue/BossAir in sync with ReqMgr2.
    """

    def __init__(self, config):
        """
        __init__

        :param config: WMAgent configuration object; must provide the
                       JobUpdater and WorkQueueManager sections.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config=self.config)
        self.reqmgr2 = ReqMgr(self.config.JobUpdater.reqMgr2Url)
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")

    def setup(self, parameters=None):
        """
        _setup_
        """
        pass

    def terminate(self, parameters=None):
        """
        _terminate_

        Terminate gracefully.
        """
        pass

    def algorithm(self, parameters=None):
        """
        _algorithm_

        One poller cycle: synchronize the job priorities with ReqMgr2.
        Couch errors are only logged so the cycle can be retried; any
        other unexpected error (except connection refusals) is re-raised
        as a JobUpdaterException.
        """
        try:
            logging.info("Synchronizing priorities with ReqMgr...")
            self.synchronizeJobPriority()
            logging.info("Priorities were synchronized, wait until the next cycle")
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except CouchConflictError as ex:
            msg = "Caught CouchConflictError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            if 'Connection refused' in str(ex):
                # transient service outage; retry next cycle
                # NOTE: logging.warn is deprecated, use logging.warning
                logging.warning("Failed to sync priorities. Trying in the next cycle")
            else:
                msg = "Caught unexpected exception in JobUpdater: %s\n" % str(ex)
                logging.exception(msg)
                raise JobUpdaterException(msg)

    def _getRequestPriority(self, workflow, priorityCache):
        """
        Fetch and memoize the current ReqMgr2 priority for a workflow.

        Failures are logged and NOT cached, so a later entry for the same
        workflow will retry the lookup (same behavior as the previous
        inline implementation).

        :param workflow: string with the workflow name
        :param priorityCache: dict used as a memoization cache, mutated in place
        :return: the request priority, or None when it could not be retrieved
        """
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = self.reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
            except Exception as ex:
                logging.error("Couldn't retrieve the priority of request %s", workflow)
                logging.error("Error: %s", str(ex))
                return None
        return priorityCache[workflow]

    def synchronizeJobPriority(self):
        """
        _synchronizeJobPriority_

        Check WMBS and WorkQueue for active workflows and compare with the
        ReqMgr for priority changes. If a priority change occurs
        then update the job priority in the batch system and
        the elements in the local queue that have not been injected yet.
        """
        # Update the priority of workflows that are not in WMBS and just in local queue
        priorityCache = {}
        workflowsToUpdate = {}
        workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()]
        for workflow, priority in workflowsToCheck:
            reqPriority = self._getRequestPriority(workflow, priorityCache)
            if reqPriority is None:
                continue
            if priority != reqPriority:
                workflowsToUpdate[workflow] = reqPriority

        logging.info("Found %d workflows to update in workqueue", len(workflowsToUpdate))
        for workflow in workflowsToUpdate:
            self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow])

        # Check the workflows in WMBS
        priorityCache = {}
        workflowsToUpdateWMBS = {}
        workflowsToCheck = self.listWorkflowsDAO.execute()
        for workflowEntry in workflowsToCheck:
            workflow = workflowEntry['name']
            reqPriority = self._getRequestPriority(workflow, priorityCache)
            if reqPriority is None:
                continue
            requestPriority = int(reqPriority)
            if requestPriority != int(workflowEntry['workflow_priority']):
                # Update the workqueue priority for the Available elements
                self.workqueue.updatePriority(workflow, requestPriority)
                # Check if there are executing jobs for this particular task
                if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0:
                    self.bossAir.updateJobInformation(workflow, workflowEntry['task'],
                                                      requestPriority=reqPriority,
                                                      taskPriority=workflowEntry['task_priority'])
                workflowsToUpdateWMBS[workflow] = reqPriority

        if workflowsToUpdateWMBS:
            logging.info("Updating %d workflows in WMBS.", len(workflowsToUpdateWMBS))
            self.updateWorkflowPrioDAO.execute(workflowsToUpdateWMBS)
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: WMAgent configuration; must provide the
                       AnalyticsDataCollector section.
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue (T0 has no WorkQueue)
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                 appName="WMStatsAgent")

        # use local db for tier0
        if hasattr(self.config, "Tier0Feeder"):
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                     couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
        # TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.dbsBufferUtil = DBSBufferUtil()

        if self.pluginName is not None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    @timeFunction
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch

        Combines job/fwjr/batch/queue data into request documents and
        bulk-uploads them to central WMStats; on failure, records the
        error message in the agent info document instead.
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()
            skippedInfoFromCouch = self.localCouchDB.getSkippedFilesSummaryByWorkflow()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            logging.info("Getting DBS PhEDEx upload status ...")
            completedWfs = self.dbsBufferUtil.getPhEDExDBSStatusForCompletedWorkflows(summary=True)

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info("""Combining data from Job Couch(%s),
                          FWJR(%s), WorkflowsWithSkippedFile(%s), Batch Job(%s),
                          Finished Tasks(%s), Local Queue(%s)  Completed workflows(%s)..  ...""",
                         len(jobInfoFromCouch), len(fwjrInfoFromCouch), len(skippedInfoFromCouch),
                         len(batchJobInfo), len(finishedTasks), len(localQInfo), len(completedWfs))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            tempCombinedData2 = combineAnalyticsData(tempCombinedData, localQInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData2, completedWfs)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            logging.info("%s requests Data combined,\n uploading request data...", len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch, finishedTasks,
                                                   skippedInfoFromCouch, self.agentInfo,
                                                   uploadTime, self.summaryLevel)

            # fixed: comparison to None must use "is not" (PEP 8), was "!= None"
            if self.plugin is not None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            existingDocs = self.centralWMStatsCouchDB.getAllAgentRequestRevByID(self.agentInfo["agent_url"])
            self.centralWMStatsCouchDB.bulkUpdateData(requestDocs, existingDocs)
            logging.info("Request data upload success\n %s request, \nsleep for next cycle", len(requestDocs))
            self.centralWMStatsCouchDB.updateAgentInfoInPlace(self.agentInfo["agent_url"],
                                                              {"data_last_update": uploadTime,
                                                               "data_error": "ok"})
        except Exception as ex:
            msg = str(ex)
            logging.exception("Error occurred, will retry later: %s", msg)
            try:
                # best-effort: record the failure in the central agent info doc
                self.centralWMStatsCouchDB.updateAgentInfoInPlace(self.agentInfo["agent_url"],
                                                                  {"data_error": msg})
            # fixed: was a bare "except:", which also swallows KeyboardInterrupt/SystemExit
            except Exception:
                logging.error("upload Agent Info to central couch failed")
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring.
    """

    def __init__(self, config):
        """
        Initialize properties specified from config.
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None

    def setup(self, parameters):
        """
        Set db connections (couchdb, wmbs) to prepare to gather information.
        """
        # set the connection to local queue (not present on Tier-0 agents)
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                 appName="WMStatsAgent")

        if hasattr(self.config, "Tier0Feeder"):
            # use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                     couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        # TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)

        # fixed: identity comparison was "!= None"
        if self.pluginName is not None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch, combine it per
        request and upload the summary documents to the local WMStats couch.
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            # fixed: use lazy logging args instead of eager "%" formatting
            logging.info("""Combining data from Job Couch(%s), FWJR(%s), Batch Job(%s), Finished Tasks(%s), Local Queue(%s)  ...""",
                         len(jobInfoFromCouch), len(fwjrInfoFromCouch),
                         len(batchJobInfo), len(finishedTasks), len(localQInfo))
            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData, localQInfo)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            logging.info("%s requests Data combined,\n uploading request data...",
                         len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch,
                                                   finishedTasks, self.agentInfo,
                                                   uploadTime, self.summaryLevel)

            # fixed: identity comparison was "!= None"
            if self.plugin is not None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info("Request data upload success\n %s request, \nsleep for next cycle",
                         len(requestDocs))
            DataUploadTime.setInfo(uploadTime, "ok")
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            DataUploadTime.setInfo(False, str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
def testUpdatePriorityService(self):
    """
    _testUpdatePriorityService_

    Check that we can update the priority correctly; also
    check the available workflows feature.

    Exercises the full propagation chain: priority set on the global
    queue, work pulled and replicated to the local queue, then priority
    overridden locally.
    """
    specName = "RerecoSpec"
    specUrl = self.specGenerator.createReRecoSpec(specName, "file",
                                                  assignKwargs={'SiteWhitelist': ["T2_XX_SiteA"]})
    # build a global queue and a local queue chained to it through couch
    globalQ = globalQueue(DbName='workqueue_t',
                          QueueURL=self.testInit.couchUrl,
                          UnittestFlag=True)
    localQ = localQueue(DbName='local_workqueue_t',
                        QueueURL=self.testInit.couchUrl,
                        CacheDir=self.testInit.testDir,
                        ParentQueueCouchUrl='%s/workqueue_t' % self.testInit.couchUrl,
                        ParentQueueInboxCouchDBName='workqueue_t_inbox'
                        )
    # Try a full chain of priority update and propagation
    self.assertTrue(globalQ.queueWork(specUrl, "RerecoSpec", "teamA") > 0)
    globalApi = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t')
    # overwrite default - can't test with stale view
    globalApi.defaultOptions = {'reduce': True, 'group': True}
    globalApi.updatePriority(specName, 100)
    # both the stored spec and every queue element must carry the new priority
    self.assertEqual(globalQ.backend.getWMSpec(specName).priority(), 100)
    storedElements = globalQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    numWorks = localQ.pullWork({'T2_XX_SiteA': 10})
    self.assertTrue(numWorks > 0)
    # replicate from GQ to LQ manually
    localQ.backend.pullFromParent(continuous=False)
    # wait until replication is done
    # NOTE(review): fixed 2s sleep; may flake on a slow couch instance
    time.sleep(2)
    localQ.processInboundWork(continuous=False)
    # the global priority (100) must have propagated to the local elements
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 100)
    localApi = WorkQueueDS(self.testInit.couchUrl, 'local_workqueue_t')
    # overwrite default - can't test with stale view
    localApi.defaultOptions = {'reduce': True, 'group': True}
    # a local override (500) must win over the replicated value
    localApi.updatePriority(specName, 500)
    self.assertEqual(localQ.backend.getWMSpec(specName).priority(), 500)
    storedElements = localQ.backend.getElementsForWorkflow(specName)
    for element in storedElements:
        self.assertEqual(element['Priority'], 500)
    availableWF = localApi.getAvailableWorkflows()
    self.assertEqual(availableWF, set([(specName, 500)]))
    # Attempt to update an inexistent workflow in the queue:
    # this must be a silent no-op, not an error
    try:
        globalApi.updatePriority('NotExistent', 2)
    except Exception as ex:
        self.fail('No exception should be raised.: %s' % str(ex))
class Request(RESTEntity):
    """
    REST entity for request documents: validates incoming GET/PUT/POST/DELETE
    calls, reads/writes the request couch database and propagates priority
    changes and cancellations to the global workqueue.
    """

    def __init__(self, app, api, config, mount):
        # main CouchDB database where requests/workloads are stored
        RESTEntity.__init__(self, app, api, config, mount)
        self.reqmgr_db = api.db_handler.get_db(config.couch_reqmgr_db)
        self.reqmgr_db_service = RequestDBWriter(self.reqmgr_db, couchapp="ReqMgr")
        # this is needed for the post validation
        self.reqmgr_aux_db = api.db_handler.get_db(config.couch_reqmgr_aux_db)
        self.gq_service = WorkQueue(config.couch_host, config.couch_workqueue_db)

    def _requestArgMapFromBrowser(self, request_args):
        """
        This is specific mapping function for data from the browser.
        TODO: give a key word so it doesn't have to loop through in general
        """
        docs = []
        for doc in request_args:
            # iterate over a keys() snapshot since we delete while looping
            # (py2: keys() returns a list, so this is safe)
            for key in doc.keys():
                if key.startswith('request'):
                    rid = key.split('request-')[-1]
                    if rid != 'all':
                        docs.append(rid)
                    del doc[key]
        return docs

    def _validateGET(self, param, safe):
        """
        Move validated GET arguments from param to safe.
        A single positional argument is treated as a request name.
        """
        # TODO: need proper validation but for now pass everything
        args_length = len(param.args)
        if args_length == 1:
            safe.kwargs["name"] = param.args[0]
            param.args.pop()
            return
        if "status" in param.kwargs and isinstance(param.kwargs["status"], basestring):
            param.kwargs["status"] = [param.kwargs["status"]]
        if "status" in param.kwargs:
            for status in param.kwargs["status"]:
                if status.endswith("-archived"):
                    raise InvalidSpecParameterValue(
                        "Can't retrieve bulk archived status requests, use other search arguments")
        for prop in param.kwargs:
            safe.kwargs[prop] = param.kwargs[prop]
        for prop in safe.kwargs:
            del param.kwargs[prop]
        return

    def _validateRequestBase(self, param, safe, valFunc, requestName=None):
        """
        Validate a request body (or, failing that, the query kwargs) with
        valFunc and store the resulting (workload, args) pairs in safe.
        """
        data = cherrypy.request.body.read()
        if data:
            request_args = JsonWrapper.loads(data)
            if requestName:
                request_args["RequestName"] = requestName
            if isinstance(request_args, dict):
                request_args = [request_args]
        else:
            # actually this is an error case
            request_args = {}
            for prop in param.kwargs:
                request_args[prop] = param.kwargs[prop]
            for prop in request_args:
                del param.kwargs[prop]
            if requestName:
                request_args["RequestName"] = requestName
            request_args = [request_args]

        safe.kwargs['workload_pair_list'] = []
        if isinstance(request_args, dict):
            request_args = [request_args]
        for args in request_args:
            workload, r_args = valFunc(args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))

    def _get_request_names(self, ids):
        "Extract request names from given documents"
        doc = {}
        if isinstance(ids, list):
            for rid in ids:
                doc[rid] = 'on'
        elif isinstance(ids, basestring):
            doc[ids] = 'on'

        docs = []
        # NOTE(review): only keys shaped like 'request-<name>' survive this
        # filter; plain request names are silently dropped - confirm intended
        for key in doc.keys():
            if key.startswith('request'):
                rid = key.split('request-')[-1]
                if rid != 'all':
                    docs.append(rid)
                del doc[key]
        return docs

    def _getMultiRequestArgs(self, multiRequestForm):
        """
        Split a multi-request form into (request_names, request_args).
        NOTE(review): raises NameError if the form lacks an 'ids' key - the
        callers appear to guarantee it; confirm before hardening.
        """
        request_args = {}
        for prop in multiRequestForm:
            if prop == "ids":
                request_names = self._get_request_names(multiRequestForm["ids"])
            elif prop == "new_status":
                request_args["RequestStatus"] = multiRequestForm[prop]
            else:
                request_args[prop] = multiRequestForm[prop]
        return request_names, request_args

    def _validateMultiRequests(self, param, safe, valFunc):
        """
        Validate a bulk status-update call and store the validated
        (workload, args) pairs in safe; sets the multi_update_flag.
        """
        data = cherrypy.request.body.read()
        if data:
            request_names, request_args = self._getMultiRequestArgs(JsonWrapper.loads(data))
        else:
            # actually this is an error case
            request_names, request_args = self._getMultiRequestArgs(param.kwargs)

        for prop in request_args:
            if prop == "RequestStatus":
                del param.kwargs["new_status"]
            else:
                del param.kwargs[prop]

        del param.kwargs["ids"]

        safe.kwargs['workload_pair_list'] = []
        for request_name in request_names:
            request_args["RequestName"] = request_name
            workload, r_args = valFunc(request_args, self.config, self.reqmgr_db_service, param)
            safe.kwargs['workload_pair_list'].append((workload, r_args))
        safe.kwargs["multi_update_flag"] = True

    def _getRequestNamesFromBody(self, param, safe, valFunc):
        """
        Take a plain list of request names from the body; sets multi_names_flag.
        """
        request_names = JsonWrapper.loads(cherrypy.request.body.read())
        safe.kwargs['workload_pair_list'] = request_names
        safe.kwargs["multi_names_flag"] = True

    def validate(self, apiobj, method, api, param, safe):
        """
        Dispatch per-method validation: move validated arguments to safe and
        empty param; otherwise raise InvalidSpecParameterValue.
        """
        try:
            if method in ['GET']:
                self._validateGET(param, safe)

            if method == 'PUT':
                args_length = len(param.args)
                if args_length == 1:
                    requestName = param.args[0]
                    param.args.pop()
                else:
                    requestName = None
                self._validateRequestBase(param, safe, validate_request_update_args, requestName)

            if method == 'POST':
                args_length = len(param.args)
                if args_length == 1 and param.args[0] == "multi_update":
                    # special case for multi update from browser.
                    param.args.pop()
                    self._validateMultiRequests(param, safe, validate_request_update_args)
                elif args_length == 1 and param.args[0] == "bynames":
                    # special case for multi update from browser.
                    param.args.pop()
                    self._getRequestNamesFromBody(param, safe, validate_request_update_args)
                else:
                    self._validateRequestBase(param, safe, validate_request_create_args)
        except InvalidSpecParameterValue:
            # fixed: re-raise with the original traceback (was "raise ex")
            raise
        except Exception as ex:
            # TODO add proper error message instead of trace back
            msg = traceback.format_exc()
            cherrypy.log("Error: %s" % msg)
            if hasattr(ex, "message"):
                if hasattr(ex.message, '__call__'):
                    msg = ex.message()
                else:
                    msg = str(ex)
            else:
                msg = str(ex)
            raise InvalidSpecParameterValue(msg)

    def initialize_clone(self, request_name):
        """
        Build a new workload cloned from request_name, with name and
        timestamps overwritten.
        """
        requests = self.reqmgr_db_service.getRequestByNames(request_name)
        clone_args = requests.values()[0]
        # overwrite the name and time stamp.
        initialize_request_args(clone_args, self.config, clone=True)
        # timestamp status update
        spec = loadSpecByType(clone_args["RequestType"])
        workload = spec.factoryWorkloadConstruction(clone_args["RequestName"],
                                                    clone_args)
        return (workload, clone_args)

    @restcall(formats=[('application/json', JSONFormat())])
    def get(self, **kwargs):
        """
        Returns request info depending on the conditions set by kwargs.
        Currently defined kwargs are following.
        statusList, requestNames, requestType, prepID, inputDataset,
        outputDataset, dateRange
        If jobInfo is True, returns jobInformation about the request as well.

        TODO: stuff like this has to be filtered out from the result of this call:
        _attachments: {u'spec': {u'stub': True, u'length': 51712, u'revpos': 2,
                       u'content_type': u'application/json'}}
        _id: maxa_RequestString-OVERRIDE-ME_130621_174227_9225
        _rev: 4-c6ceb2737793aaeac3f1cdf591593da4
        """
        # no filter at all: return the most recent "running" requests
        if len(kwargs) == 0:
            kwargs['status'] = "running"
            options = {"descending": True, 'include_docs': True, 'limit': 200}
            request_docs = self.reqmgr_db.loadView("ReqMgr", "bystatus", options)
            return rows([request_docs])

        # list of status
        status = kwargs.get("status", False)
        # list of request names
        name = kwargs.get("name", False)
        request_type = kwargs.get("request_type", False)
        prep_id = kwargs.get("prep_id", False)
        inputdataset = kwargs.get("inputdataset", False)
        outputdataset = kwargs.get("outputdataset", False)
        date_range = kwargs.get("date_range", False)
        campaign = kwargs.get("campaign", False)
        workqueue = kwargs.get("workqueue", False)
        team = kwargs.get("team", False)
        mc_pileup = kwargs.get("mc_pileup", False)
        data_pileup = kwargs.get("data_pileup", False)
        detail = kwargs.get("detail", True)
        if detail in (False, "false", "False"):
            option = {"include_docs": False}
        else:
            option = {"include_docs": True}
        # everything should use a stale view; _nostale is only needed for tests
        _nostale = kwargs.get("_nostale", False)
        if _nostale:
            self.reqmgr_db_service._setNoStale()

        request_info = []

        if status and not team and not request_type:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bystatus", option, status))
        if status and team:
            request_info.append(
                self.reqmgr_db_service.getRequestByCouchView("byteamandstatus", option, [[team, status]]))
        if status and request_type:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("requestsbystatusandtype", option,
                                                                             [[status, request_type]]))
        if name:
            request_info.append(self.reqmgr_db_service.getRequestByNames(name))
        if prep_id:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byprepid", option, prep_id))
        if inputdataset:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byinputdataset", option, inputdataset))
        if outputdataset:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byoutputdataset", option, outputdataset))
        if date_range:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bydate", option, date_range))
        if campaign:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bycampaign", option, campaign))
        if workqueue:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("byworkqueue", option, workqueue))
        if mc_pileup:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bymcpileup", option, mc_pileup))
        if data_pileup:
            request_info.append(self.reqmgr_db_service.getRequestByCouchView("bydatapileup", option, data_pileup))

        # get the intersection of the request info from all filters
        result = self._intersection_of_request_info(request_info)
        if len(result) == 0:
            return []
        return rows([result])

    def _intersection_of_request_info(self, request_info):
        """
        Intersect the per-filter result dicts by request name; values are
        taken from the first filter's result.
        """
        requests = {}
        if len(request_info) < 1:
            return requests

        request_key_set = set(request_info[0].keys())
        for info in request_info:
            request_key_set = set(request_key_set) & set(info.keys())
        # TODO: need to assume some data might not contain include_docs
        for request_name in request_key_set:
            requests[request_name] = request_info[0][request_name]
        return requests

    # TODO move this out of this class
    def filterCouchInfo(self, couchInfo):
        """Strip couch-internal fields from a request document, in place."""
        for key in ['_rev', '_attachments']:
            if key in couchInfo:
                del couchInfo[key]

    def _combine_request(self, request_info, requestAgentUrl, cache):
        """
        Return [request, agentUrl] pairs for the given requests, based on the
        (request, agentUrl) keys of the requestAgentUrl view rows.
        """
        keys = {}
        requestAgentUrlList = []
        for row in requestAgentUrl["rows"]:
            request = row["key"][0]
            # fixed: original did "if not keys[request]" which raises
            # KeyError on the first occurrence of each request
            if request not in keys:
                keys[request] = []
            keys[request].append(row["key"][1])

        for request in request_info:
            # fixed: tolerate requests with no known agent (was keys[request])
            for agentUrl in keys.get(request, []):
                requestAgentUrlList.append([request, agentUrl])

        return requestAgentUrlList

    def _retrieveResubmissionChildren(self, request_name):
        """
        Recursively collect the ids of all Resubmission children of
        request_name.
        """
        result = self.reqmgr_db.loadView('ReqMgr', 'childresubmissionrequests',
                                         keys=[request_name])['rows']
        childrenRequestNames = []
        for child in result:
            childrenRequestNames.append(child['id'])
            childrenRequestNames.extend(self._retrieveResubmissionChildren(child['id']))
        return childrenRequestNames

    def _updateRequest(self, workload, request_args):
        """
        Apply request_args to an existing request: clone creation, stats
        update, spec-argument update, cascade status update or plain status /
        property update. Returns {request_name: "OK"|"ERROR"}.
        """
        # no workload means this is a clone request: build and create it
        if workload is None:
            (workload, request_args) = self.initialize_clone(request_args["OriginalRequestName"])
            return self.post(workload, request_args)

        dn = cherrypy.request.user.get("dn", "unknown")

        if ('SoftTimeout' in request_args) and ('GracePeriod' in request_args):
            request_args['HardTimeout'] = request_args['SoftTimeout'] + request_args['GracePeriod']

        if 'RequestPriority' in request_args:
            self.gq_service.updatePriority(workload.name(), request_args['RequestPriority'])

        # fixed: initialize report so paths that only save couch (or an empty
        # cascade) don't hit an unbound local below
        report = 'OK'
        if "total_jobs" in request_args:
            # only GQ updates these stats
            # request_args should contain only the 4 keys 'total_jobs',
            # 'input_lumis', 'input_events', 'input_num_files'
            report = self.reqmgr_db_service.updateRequestStats(workload.name(), request_args)
        # if it is not just updating status
        else:
            req_status = request_args.get("RequestStatus", None)

            if len(request_args) >= 1 and req_status is None:
                try:
                    workload.updateArguments(request_args)
                except Exception as ex:
                    msg = traceback.format_exc()
                    cherrypy.log("Error for request args %s: %s" % (request_args, msg))
                    raise InvalidSpecParameterValue(str(ex))
                # trailing / is needed for the savecouchUrl function
                workload.saveCouch(self.config.couch_host, self.config.couch_reqmgr_db)
            # fixed: original list was ["closed-out" "announced"] - implicit
            # string concatenation ("closed-outannounced") made this branch
            # unreachable; also workload.name was passed without calling it
            elif (req_status in ["closed-out", "announced"]) and request_args.get("cascade", False):
                cascade_list = self._retrieveResubmissionChildren(workload.name())
                for req_name in cascade_list:
                    report = self.reqmgr_db_service.updateRequestStatus(req_name, req_status)
            # If it is aborted or force-complete transition, call workqueue to
            # cancel the request
            else:
                if req_status == "aborted" or req_status == "force-complete":
                    self.gq_service.cancelWorkflow(workload.name())
                report = self.reqmgr_db_service.updateRequestProperty(workload.name(), request_args, dn)

        if report == 'OK':
            return {workload.name(): "OK"}
        else:
            return {workload.name(): "ERROR"}

    @restcall(formats=[('application/json', JSONFormat())])
    def put(self, workload_pair_list):
        "workload_pair_list is a list of tuples containing (workload, request_args)"
        report = []
        for workload, request_args in workload_pair_list:
            result = self._updateRequest(workload, request_args)
            report.append(result)
        return report

    @restcall(formats=[('application/json', JSONFormat())])
    def delete(self, request_name):
        """Delete the request document; 404 if the couch delete fails."""
        cherrypy.log("INFO: Deleting request document '%s' ..." % request_name)
        try:
            self.reqmgr_db.delete_doc(request_name)
        except CouchError as ex:
            msg = "ERROR: Delete failed."
            cherrypy.log(msg + " Reason: %s" % ex)
            raise cherrypy.HTTPError(404, msg)
        # TODO
        # delete should also happen on WMStats
        cherrypy.log("INFO: Delete '%s' done." % request_name)

    def _update_additional_request_args(self, workload, request_args):
        """
        Add to request_args properties which are not initially set by the
        user. This data will be put into couchdb.
        Update request_args here if additional information needs to be put in
        couchdb.
        """
        request_args['RequestWorkflow'] = sanitizeURL("%s/%s/%s/spec" % (request_args["CouchURL"],
                                                                         request_args["CouchWorkloadDBName"],
                                                                         workload.name()))['url']

        # Add the output datasets if necessary
        # for some bizarre reason OutpuDatasets is list of lists
        request_args['OutputDatasets'] = workload.listOutputDatasets()

        # TODO: remove this after reqmgr2 replaces reqmgr (reqmgr2Only)
        request_args['ReqMgr2Only'] = True
        return

    @restcall(formats=[('application/json', JSONFormat())])
    def post(self, workload_pair_list, multi_update_flag=False, multi_names_flag=False):
        """
        Create and update couchDB with a new request.
        request argument is passed from validation
        (validation converts cherrypy.request.body data to argument)

        TODO: this method will have some parts factored out so that e.g. clone
        call can share functionality.

        NOTES:
        1) do not strip spaces, #4705 will fail upon injection with spaces;
           currently the chain relies on a number of things coming in #4705
        2) reqInputArgs = Utilities.unidecode(JsonWrapper.loads(body))
           (from ReqMgrRESTModel.putRequest)
        """
        # storing the request document into Couch
        if multi_update_flag:
            return self.put(workload_pair_list)
        if multi_names_flag:
            return self.get(name=workload_pair_list)

        out = []
        for workload, request_args in workload_pair_list:
            self._update_additional_request_args(workload, request_args)
            cherrypy.log("INFO: Create request, input args: %s ..." % request_args)
            workload.saveCouch(request_args["CouchURL"], request_args["CouchWorkloadDBName"],
                               metadata=request_args)
            out.append({'request': workload.name()})
        return out
class JobUpdaterPoller(BaseWorkerThread):
    """
    _JobUpdaterPoller_

    Poller class for the JobUpdater
    """

    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config=self.config)
        self.reqmgr2 = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")

    def setup(self, parameters=None):
        """
        _setup_
        """
        pass

    def terminate(self, parameters=None):
        """
        _terminate_

        Terminate gracefully.
        """
        pass

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_
        """
        try:
            logging.info("Synchronizing priorities with ReqMgr...")
            self.synchronizeJobPriority()
            logging.info("Priorities were synchronized, wait until the next cycle")
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except CouchConflictError as ex:
            msg = "Caught CouchConflictError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            if 'Connection refused' in str(ex):
                # fixed: logging.warn is a deprecated alias of warning
                logging.warning("Failed to sync priorities. Trying in the next cycle")
            else:
                msg = "Caught unexpected exception in JobUpdater: %s\n" % str(ex)
                logging.exception(msg)
                raise JobUpdaterException(msg)

    def _getRequestPriority(self, workflow, priorityCache):
        """
        Look up the ReqMgr2 priority for workflow, memoizing successful
        lookups in priorityCache. Returns the priority, or None when the
        lookup fails (failure is logged and the caller should skip).
        """
        if workflow not in priorityCache:
            try:
                result = self.reqmgr2.getRequestByNames(workflow)[0]
                priorityCache[workflow] = result[workflow]['RequestPriority']
            except Exception as ex:
                logging.error("Couldn't retrieve the priority of request %s", workflow)
                logging.error("Error: %s", str(ex))
                return None
        return priorityCache[workflow]

    def synchronizeJobPriority(self):
        """
        _synchronizeJobPriority_

        Check WMBS and WorkQueue for active workflows and compare with
        the ReqMgr for priority changes. If a priority change occurs
        then update the job priority in the batch system and the
        elements in the local queue that have not been injected yet.
        """
        # Update the priority of workflows that are not in WMBS and just in local queue
        priorityCache = {}
        workflowsToUpdate = {}
        workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()]
        for workflow, priority in workflowsToCheck:
            reqmgrPriority = self._getRequestPriority(workflow, priorityCache)
            if reqmgrPriority is None:
                continue
            if priority != reqmgrPriority:
                workflowsToUpdate[workflow] = reqmgrPriority

        logging.info("Found %d workflows to update in workqueue", len(workflowsToUpdate))
        for workflow in workflowsToUpdate:
            self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow])

        # Check the workflows in WMBS
        priorityCache = {}
        workflowsToUpdateWMBS = {}
        workflowsToCheck = self.listWorkflowsDAO.execute()
        for workflowEntry in workflowsToCheck:
            workflow = workflowEntry['name']
            reqmgrPriority = self._getRequestPriority(workflow, priorityCache)
            if reqmgrPriority is None:
                continue
            requestPriority = int(reqmgrPriority)
            if requestPriority != int(workflowEntry['workflow_priority']):
                # Update the workqueue priority for the Available elements
                self.workqueue.updatePriority(workflow, requestPriority)
                # Check if there are executing jobs for this particular task
                if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0:
                    self.bossAir.updateJobInformation(workflow, workflowEntry['task'],
                                                      requestPriority=reqmgrPriority,
                                                      taskPriority=workflowEntry['task_priority'])
                workflowsToUpdateWMBS[workflow] = reqmgrPriority

        if workflowsToUpdateWMBS:
            logging.info("Updating %d workflows in WMBS.", len(workflowsToUpdateWMBS))
            self.updateWorkflowPrioDAO.execute(workflowsToUpdateWMBS)