class RequestDBTest(unittest.TestCase):
    """
    Unit tests for the RequestDB service (RequestDBWriter / RequestDBReader)
    running against a throw-away CouchDB database.
    """
    def setUp(self):
        """
        _setUp_

        Create a fresh couch database with the ReqMgr couchapp and point one
        writer and one reader at it.  Stale view options are cleared so every
        read in the tests sees the latest writes immediately.
        """
        self.schema = []
        self.couchApps = ["ReqMgr"]
        self.testInit = TestInitCouchApp('RequestDBServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        # NOTE(review): 'requsetdb_t' looks like a typo for 'requestdb_t',
        # but it is only an internal test-db name, so it is left untouched.
        dbName = 'requsetdb_t'
        self.testInit.setupCouch(dbName, *self.couchApps)
        reqDBURL = "%s/%s" % (self.testInit.couchUrl, dbName)
        self.requestWriter = RequestDBWriter(reqDBURL)
        self.requestReader = RequestDBReader(reqDBURL)
        # disable stale reads so assertions observe fresh view data
        self.requestWriter.defaultStale = {}
        self.requestReader.defaultStale = {}
        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        self.testInit.tearDownCouch()

    def testRequestDBWriter(self):
        # test getWork
        schema = generate_reqmgr_schema(3)
        result = self.requestWriter.insertGenericRequest(schema[0])

        # a successful insert reports exactly one committed document
        self.assertEqual(len(result), 1, 'insert fail')

        self.assertEqual(
            self.requestWriter.updateRequestStatus(schema[0]['RequestName'],
                                                   "failed"), 'OK',
            'update fail')
        # updating a non-existent document must report an error, not raise
        self.assertEqual(
            self.requestWriter.updateRequestStatus("not_exist_schema",
                                                   "assigned"),
            'Error: document not found')
        result = self.requestWriter.updateRequestProperty(
            schema[0]['RequestName'], {'Teams': ['teamA']})
        # property updates are idempotent: re-applying the same value is 'OK'
        self.assertEqual(
            self.requestWriter.updateRequestProperty(schema[0]['RequestName'],
                                                     {'Teams': ['teamA']}),
            'OK', 'update fail')
        self.assertEqual(
            self.requestWriter.updateRequestProperty("not_exist_schema",
                                                     {'Teams': 'teamA'}),
            'Error: document not found')

        result = self.requestReader.getRequestByNames(
            [schema[0]['RequestName']])
        self.assertEqual(len(result), 1, "should be 1")
        result = self.requestReader.getRequestByStatus(["failed"], False, 1)
        self.assertEqual(len(result), 1, "should be 1")

        result = self.requestReader.getStatusAndTypeByRequest(
            [schema[0]['RequestName']])
        self.assertEqual(result[schema[0]['RequestName']][0], 'failed',
                         "should be failed")

        result = self.requestWriter.insertGenericRequest(schema[1])
        # ensure the two inserts get distinct couch timestamps so the
        # endTime-based queries below can separate them
        time.sleep(2)
        result = self.requestWriter.insertGenericRequest(schema[2])
        # cutoff before the second insert: only schema[1] matches
        endTime = int(time.time()) - 1
        result = self.requestReader.getRequestByStatusAndEndTime(
            "new", False, endTime)
        self.assertEqual(len(result), 1, "should be 1")
        # cutoff after both inserts: schema[1] and schema[2] match
        endTime = int(time.time()) + 1
        result = self.requestReader.getRequestByStatusAndEndTime(
            "new", False, endTime)
        self.assertEqual(len(result), 2, "should be 2")
class WMStatsReader(object):
    """
    Read-only client for a WMStats couch database.

    Optionally wraps a RequestDBReader (when reqdbURL is given) so that
    request documents can be fetched and combined with the per-agent job
    summaries stored in wmstats.
    """
    # TODO need to get this from reqmgr api
    ACTIVE_STATUS = ["new", "assignment-approved", "assigned", "acquired",
                     "running", "running-open", "running-closed", "failed",
                     "force-complete", "completed", "closed-out", "announced",
                     "aborted", "aborted-completed", "rejected"]

    T0_ACTIVE_STATUS = ["new", "Closed", "Merge", "Harvesting",
                        "Processing Done", "AlcaSkim", "completed"]

    def __init__(self, couchURL, appName="WMStats", reqdbURL=None, reqdbCouchApp="ReqMgr"):
        # NOTE(review): the sanitized URL is discarded and the raw couchURL
        # is used below.  This looks like a bug, but it is left as-is because
        # callers may rely on credentials remaining in the URL -- TODO confirm.
        self._sanitizeURL(couchURL)
        # set the connection for local couchDB call
        self._commonInit(couchURL, appName)
        if reqdbURL:
            self.reqDB = RequestDBReader(reqdbURL, reqdbCouchApp)
        else:
            self.reqDB = None

    def _sanitizeURL(self, couchURL):
        """Return couchURL with credentials stripped (via sanitizeURL)."""
        return sanitizeURL(couchURL)['url']

    def _commonInit(self, couchURL, appName="WMStats"):
        """
        Set up common variables for inherited classes.  Inherited classes
        should call this in their init function.
        """
        self.couchURL, self.dbName = splitCouchServiceURL(couchURL)
        self.couchServer = CouchServer(self.couchURL)
        self.couchDB = self.couchServer.connectDatabase(self.dbName, False)
        self.couchapp = appName
        # default couch view staleness; overridable per-instance
        self.defaultStale = {"stale": "update_after"}

    def setDefaultStaleOptions(self, options):
        """Return options with the default stale setting applied if absent."""
        if not options:
            options = {}
        if 'stale' not in options:
            options.update(self.defaultStale)
        return options

    def getLatestJobInfoByRequests(self, requestNames):
        """Return the latest agent job docs for the given request names."""
        jobInfoByRequestAndAgent = {}

        if len(requestNames) > 0:
            requestAndAgentKey = self._getRequestAndAgent(requestNames)
            jobDocIds = self._getLatestJobInfo(requestAndAgentKey)
            jobInfoByRequestAndAgent = self._getAllDocsByIDs(jobDocIds)
        return jobInfoByRequestAndAgent

    def _updateRequestInfoWithJobInfo(self, requestInfo):
        """Attach the latest agent job info to each entry of requestInfo."""
        # was: len(requestInfo.keys()) != 0 -- plain dict truthiness is enough
        if requestInfo:
            jobInfoByRequestAndAgent = self.getLatestJobInfoByRequests(requestInfo.keys())
            self._combineRequestAndJobData(requestInfo, jobInfoByRequestAndAgent)

    def _getCouchView(self, view, options, keys=None):
        """
        Load a view from the wmstats couchapp.

        :param keys: optional key or list of keys; a bare string is wrapped
                     in a list.  (Default changed from a mutable [] to None.)
        """
        keys = keys or []
        options = self.setDefaultStaleOptions(options)

        if keys and isinstance(keys, str):
            keys = [keys]
        return self.couchDB.loadView(self.couchapp, view, options, keys)

    def _formatCouchData(self, data, key="id"):
        """Map each non-error view row's key to its doc (or None)."""
        result = {}
        for row in data['rows']:
            if 'error' in row:
                continue
            if "doc" in row:
                result[row[key]] = row["doc"]
            else:
                result[row[key]] = None
        return result

    def _combineRequestAndJobData(self, requestData, jobData):
        """
        Update the request data with job info.

        For every row of jobData (a couch view result carrying docs), the
        agent doc is stored under
        requestData[workflow]["AgentJobInfo"][agent_url].  Each agent doc
        holds the per-site / per-task job status summary (status counts,
        subscription status, CPU performance numbers, dataset statistics).
        Rows whose doc is None (document deleted between calls) are ignored.
        """
        if jobData:
            for row in jobData["rows"]:
                # condition checks if documents are deleted between calls.
                # just ignore in that case
                if row["doc"]:
                    jobInfo = requestData[row["doc"]["workflow"]]
                    jobInfo.setdefault("AgentJobInfo", {})
                    jobInfo["AgentJobInfo"][row["doc"]["agent_url"]] = row["doc"]

    def _getRequestAndAgent(self, filterRequest=None):
        """
        Returns the [['request_name', 'agent_url'], ....]
        """
        options = {}
        options["reduce"] = True
        options["group"] = True
        result = self._getCouchView("requestAgentUrl", options)

        if filterRequest is None:
            keys = [row['key'] for row in result["rows"]]
        else:
            keys = [row['key'] for row in result["rows"]
                    if row['key'][0] in filterRequest]
        return keys

    def _getLatestJobInfo(self, keys):
        """
        keys is [['request_name', 'agent_url'], ....]
        returns ids
        """
        if len(keys) == 0:
            return []
        options = {}
        options["reduce"] = True
        options["group"] = True
        result = self._getCouchView("latestRequest", options, keys)
        ids = [row['value']['id'] for row in result["rows"]]
        return ids

    def _getAllDocsByIDs(self, ids, include_docs=True):
        """
        keys is [id, ....]
        returns document
        """
        if len(ids) == 0:
            return None
        options = {}
        options["include_docs"] = include_docs
        result = self.couchDB.allDocs(options, ids)
        return result

    def _getAgentInfo(self):
        """
        Returns all the agents status on wmstats.
        """
        options = {}
        result = self._getCouchView("agentInfo", options)
        return result

    def agentsByTeam(self, ignoreDrain=True):
        """
        Return a dictionary like {team: #agents, ...}.  When ignoreDrain is
        True, agents in drain_mode are not counted.
        """
        result = self._getAgentInfo()
        response = dict()
        for agentInfo in result["rows"]:
            # an agent may serve several comma-separated teams
            teams = agentInfo['value']['agent_team'].split(',')
            for team in teams:
                if team not in response:
                    response[team] = 0

            if ignoreDrain:
                if not agentInfo['value']['drain_mode']:
                    for team in teams:
                        response[team] += 1
            else:
                for team in teams:
                    response[team] += 1
        return response

    def getServerInstance(self):
        """Return the underlying CouchServer."""
        return self.couchServer

    def getDBInstance(self):
        """Return the connected couch database handle."""
        return self.couchDB

    def getRequestDBInstance(self):
        """Return the RequestDBReader (or None if not configured)."""
        return self.reqDB

    def getHeartbeat(self):
        """Return couch db info, or an error dict if unreachable."""
        try:
            return self.couchDB.info()
        except Exception as ex:
            return {'error_message': str(ex)}

    def getRequestByNames(self, requestNames, jobInfoFlag=False):
        """
        To use this function reqDBURL needs to be set when wmstats is
        initialized.  This will be deprecated, so please don't use it.
        """
        requestInfo = self.reqDB.getRequestByNames(requestNames, True)

        if jobInfoFlag:
            # get request and agent info
            self._updateRequestInfoWithJobInfo(requestInfo)
        return requestInfo

    def getActiveData(self, jobInfoFlag=False):
        """Return requests in any active (non-archived) ReqMgr state."""
        return self.getRequestByStatus(WMStatsReader.ACTIVE_STATUS, jobInfoFlag)

    def getT0ActiveData(self, jobInfoFlag=False):
        """Return requests in any active Tier-0 state."""
        return self.getRequestByStatus(WMStatsReader.T0_ACTIVE_STATUS, jobInfoFlag)

    def getRequestByStatus(self, statusList, jobInfoFlag=False, limit=None, skip=None,
                           legacyFormat=False):
        """
        To use this function reqDBURL needs to be set when wmstats is
        initialized.  This will be deprecated, so please don't use it.
        If legacyFormat is True, convert data to the old wmstats format from
        the current reqmgr format.
        Shouldn't be set to True unless existing code breaks.
        """
        requestInfo = self.reqDB.getRequestByStatus(statusList, True, limit, skip)

        if legacyFormat:
            # convert the format to wmstats old format
            for requestName, doc in requestInfo.items():
                requestInfo[requestName] = convertToLegacyFormat(doc)

        if jobInfoFlag:
            # get request and agent info
            self._updateRequestInfoWithJobInfo(requestInfo)
        return requestInfo

    def getRequestSummaryWithJobInfo(self, requestName):
        """
        Get request info with job status.
        """
        requestInfo = self.reqDB.getRequestByNames(requestName)
        self._updateRequestInfoWithJobInfo(requestInfo)
        return requestInfo

    def getArchivedRequests(self):
        """
        Get the list of archived workflows in the wmstats db.
        """
        options = {"group_level": 1, "reduce": True}

        results = self.couchDB.loadView(self.couchapp, "allWorkflows", options=options)['rows']
        requestNames = [x['key'] for x in results]

        workflowDict = self.reqDB.getStatusAndTypeByRequest(requestNames)
        archivedRequests = []
        for request, value in workflowDict.items():
            if value[0].endswith("-archived"):
                archivedRequests.append(request)

        return archivedRequests

    def isWorkflowCompletedWithLogCollectAndCleanUp(self, requestName):
        """
        Check whether a workflow is completed including LogCollect and CleanUp
        tasks.
        TODO: If the parent tasks all failed and the next tasks are not
        created at all, it can't detect complete status.
        If one of the tasks doesn't contain any jobs, it will return False.
        """
        requestInfo = self.getRequestSummaryWithJobInfo(requestName)
        reqInfoInstance = RequestInfo(requestInfo[requestName])
        return reqInfoInstance.isWorkflowFinished()
class WMStatsReader(object):
    """
    Read-only client for a WMStats couch database.

    Optionally wraps a RequestDBReader (when reqdbURL is given) so that
    request documents can be fetched and combined with the per-agent job
    summaries stored in wmstats.
    """
    # TODO need to get this from reqmgr api
    ACTIVE_STATUS = ["new", "assignment-approved", "assigned", "acquired",
                     "running", "running-open", "running-closed", "failed",
                     "force-complete", "completed", "closed-out", "announced",
                     "aborted", "aborted-completed", "rejected"]

    T0_ACTIVE_STATUS = ["new", "Closed", "Merge", "Harvesting",
                        "Processing Done", "AlcaSkim", "completed"]

    def __init__(self, couchURL, appName="WMStats", reqdbURL=None, reqdbCouchApp="ReqMgr"):
        # NOTE(review): the sanitized URL is discarded and the raw couchURL
        # is used below.  This looks like a bug, but it is left as-is because
        # callers may rely on credentials remaining in the URL -- TODO confirm.
        self._sanitizeURL(couchURL)
        # set the connection for local couchDB call
        self._commonInit(couchURL, appName)
        if reqdbURL:
            self.reqDB = RequestDBReader(reqdbURL, reqdbCouchApp)
        else:
            self.reqDB = None

    def _sanitizeURL(self, couchURL):
        """Return couchURL with credentials stripped (via sanitizeURL)."""
        return sanitizeURL(couchURL)['url']

    def _commonInit(self, couchURL, appName="WMStats"):
        """
        Set up common variables for inherited classes.  Inherited classes
        should call this in their init function.
        """
        self.couchURL, self.dbName = splitCouchServiceURL(couchURL)
        self.couchServer = CouchServer(self.couchURL)
        self.couchDB = self.couchServer.connectDatabase(self.dbName, False)
        self.couchapp = appName
        # default couch view staleness; overridable per-instance
        self.defaultStale = {"stale": "update_after"}

    def setDefaultStaleOptions(self, options):
        """Return options with the default stale setting applied if absent."""
        if not options:
            options = {}
        if 'stale' not in options:
            options.update(self.defaultStale)
        return options

    def getLatestJobInfoByRequests(self, requestNames):
        """Return the latest agent job info view result for requestNames."""
        jobInfoByRequestAndAgent = {}

        if len(requestNames) > 0:
            requestAndAgentKey = self._getRequestAndAgent(requestNames)
            jobInfoByRequestAndAgent = self._getLatestJobInfo(requestAndAgentKey)
        return jobInfoByRequestAndAgent

    def _updateRequestInfoWithJobInfo(self, requestInfo):
        """Attach the latest agent job info to each entry of requestInfo."""
        # was: len(requestInfo.keys()) != 0 -- plain dict truthiness is enough
        if requestInfo:
            jobInfoByRequestAndAgent = self.getLatestJobInfoByRequests(requestInfo.keys())
            self._combineRequestAndJobData(requestInfo, jobInfoByRequestAndAgent)

    def _getCouchView(self, view, options, keys=None):
        """
        Load a view from the wmstats couchapp.

        :param keys: optional key or list of keys; a bare string is wrapped
                     in a list.
        """
        keys = keys or []
        options = self.setDefaultStaleOptions(options)

        if keys and isinstance(keys, str):
            keys = [keys]
        return self.couchDB.loadView(self.couchapp, view, options, keys)

    def _formatCouchData(self, data, key="id"):
        """Map each non-error view row's key to its doc (or None)."""
        result = {}
        for row in data['rows']:
            if 'error' in row:
                continue
            if "doc" in row:
                result[row[key]] = row["doc"]
            else:
                result[row[key]] = None
        return result

    def _combineRequestAndJobData(self, requestData, jobData):
        """
        Update the request data with job info.

        For every row of jobData (a couch view result carrying docs), the
        agent doc is stored under
        requestData[workflow]["AgentJobInfo"][agent_url].  Each agent doc
        holds the per-site / per-task job status summary (status counts,
        subscription status, CPU performance numbers, dataset statistics).
        Rows whose doc is None (document deleted between calls) are ignored.
        """
        if jobData:
            for row in jobData["rows"]:
                # condition checks if documents are deleted between calls.
                # just ignore in that case
                if row["doc"]:
                    jobInfo = requestData[row["doc"]["workflow"]]
                    jobInfo.setdefault("AgentJobInfo", {})
                    jobInfo["AgentJobInfo"][row["doc"]["agent_url"]] = row["doc"]

    def _getRequestAndAgent(self, filterRequest=None):
        """
        Returns the [['request_name', 'agent_url'], ....]
        """
        options = {}
        options["reduce"] = True
        options["group"] = True
        result = self._getCouchView("requestAgentUrl", options)

        if filterRequest is None:
            keys = [row['key'] for row in result["rows"]]
        else:
            keys = [row['key'] for row in result["rows"]
                    if row['key'][0] in filterRequest]
        return keys

    def _getLatestJobInfo(self, keys):
        """
        keys is [['request_name', 'agent_url'], ....]
        returns the raw view result including docs
        """
        if len(keys) == 0:
            return []
        options = {"include_docs": True}
        options["reduce"] = False
        result = self._getCouchView("latestRequest", options, keys)
        return result

    def _getAllDocsByIDs(self, ids, include_docs=True):
        """
        keys is [id, ....]
        returns document
        """
        if len(ids) == 0:
            return None
        options = {}
        options["include_docs"] = include_docs
        result = self.couchDB.allDocs(options, ids)
        return result

    def _getAgentInfo(self):
        """
        Returns all the agents status on wmstats.
        """
        options = {}
        result = self._getCouchView("agentInfo", options)
        return result

    def agentsByTeam(self, filterDrain=False):
        """
        Return a dictionary like {team: #agents, ...}.  When filterDrain is
        True, agents in drain_mode are not counted.
        """
        result = self._getAgentInfo()
        response = dict()
        for agentInfo in result["rows"]:
            # filtering empty string
            team = agentInfo['value']['agent_team']
            if not team:
                continue

            response.setdefault(team, 0)
            if filterDrain:
                if not agentInfo['value'].get('drain_mode', False):
                    response[team] += 1
            else:
                response[team] += 1
        return response

    def getServerInstance(self):
        """Return the underlying CouchServer."""
        return self.couchServer

    def getDBInstance(self):
        """Return the connected couch database handle."""
        return self.couchDB

    def getRequestDBInstance(self):
        """Return the RequestDBReader (or None if not configured)."""
        return self.reqDB

    def getHeartbeat(self):
        """Return couch db info, or an error dict if unreachable."""
        try:
            return self.couchDB.info()
        except Exception as ex:
            return {'error_message': str(ex)}

    def getRequestByNames(self, requestNames, jobInfoFlag=False):
        """
        To use this function reqDBURL needs to be set when wmstats is
        initialized.  This will be deprecated, so please don't use it.
        """
        requestInfo = self.reqDB.getRequestByNames(requestNames, True)

        if jobInfoFlag:
            # get request and agent info
            self._updateRequestInfoWithJobInfo(requestInfo)
        return requestInfo

    def getActiveData(self, jobInfoFlag=False):
        """Return requests in any active (non-archived) ReqMgr state."""
        return self.getRequestByStatus(WMStatsReader.ACTIVE_STATUS, jobInfoFlag)

    def getT0ActiveData(self, jobInfoFlag=False):
        """Return requests in any active Tier-0 state."""
        return self.getRequestByStatus(WMStatsReader.T0_ACTIVE_STATUS, jobInfoFlag)

    def getRequestByStatus(self, statusList, jobInfoFlag=False, limit=None, skip=None,
                           legacyFormat=False):
        """
        To use this function reqDBURL needs to be set when wmstats is
        initialized.  This will be deprecated, so please don't use it.
        If legacyFormat is True, convert data to the old wmstats format from
        the current reqmgr format.
        Shouldn't be set to True unless existing code breaks.
        """
        requestInfo = self.reqDB.getRequestByStatus(statusList, True, limit, skip)

        if legacyFormat:
            # convert the format to wmstats old format
            for requestName, doc in requestInfo.items():
                requestInfo[requestName] = convertToLegacyFormat(doc)

        if jobInfoFlag:
            # get request and agent info
            self._updateRequestInfoWithJobInfo(requestInfo)
        return requestInfo

    def getRequestSummaryWithJobInfo(self, requestName):
        """
        Get request info with job status.
        """
        requestInfo = self.reqDB.getRequestByNames(requestName)
        self._updateRequestInfoWithJobInfo(requestInfo)
        return requestInfo

    def getArchivedRequests(self):
        """
        Get the list of archived workflows in the wmstats db.
        """
        options = {"group_level": 1, "reduce": True}

        results = self.couchDB.loadView(self.couchapp, "allWorkflows", options=options)['rows']
        requestNames = [x['key'] for x in results]

        workflowDict = self.reqDB.getStatusAndTypeByRequest(requestNames)
        archivedRequests = []
        for request, value in workflowDict.items():
            if value[0].endswith("-archived"):
                archivedRequests.append(request)

        return archivedRequests

    def isWorkflowCompletedWithLogCollectAndCleanUp(self, requestName):
        """
        Check whether a workflow is completed including LogCollect and CleanUp
        tasks.
        TODO: If the parent tasks all failed and the next tasks are not
        created at all, it can't detect complete status.
        If one of the tasks doesn't contain any jobs, it will return False.
        """
        requestInfo = self.getRequestSummaryWithJobInfo(requestName)
        reqInfoInstance = RequestInfo(requestInfo[requestName])
        return reqInfoInstance.isWorkflowFinished()

    def getTaskJobSummaryByRequest(self, requestName, sampleSize=1):
        """
        Return a nested job-status summary per task for one request, with up
        to sampleSize example job docs attached per (task, status, exitCode,
        site) leaf.
        """
        options = {'reduce': True, 'group_level': 5, 'startkey': [requestName],
                   'endkey': [requestName, {}]}
        results = self.couchDB.loadView(self.couchapp, "jobsByStatusWorkflow", options=options)
        jobDetails = {}
        for row in results['rows']:
            # row["key"] = ['workflow', 'task', 'jobstatus', 'exitCode', 'site']
            startKey = row["key"][:4]
            endKey = []
            site = row["key"][4]
            if site:
                startKey.append(site)

            endKey.extend(startKey)
            endKey.append({})
            numOfError = row["value"]

            jobInfo = self.jobDetailByTasks(startKey, endKey, numOfError, sampleSize)
            jobDetails = nestedDictUpdate(jobDetails, jobInfo)
        return jobDetails

    def jobDetailByTasks(self, startKey, endKey, numOfError, limit=1):
        """
        Return {workflow: {task: {status: {exitCode: {site: {...}}}}}} with
        an error count and up to `limit` sample job docs at the leaf.
        """
        options = {'include_docs': True, 'reduce': False,
                   'startkey': startKey, 'endkey': endKey,
                   'limit': limit}
        result = self.couchDB.loadView(self.couchapp, "jobsByStatusWorkflow", options=options)
        jobInfoDoc = {}
        for row in result['rows']:
            keys = row['key']
            workflow = keys[0]
            task = keys[1]
            jobStatus = keys[2]
            exitCode = keys[3]
            site = keys[4]

            jobInfoDoc.setdefault(workflow, {})
            jobInfoDoc[workflow].setdefault(task, {})
            jobInfoDoc[workflow][task].setdefault(jobStatus, {})
            jobInfoDoc[workflow][task][jobStatus].setdefault(exitCode, {})
            jobInfoDoc[workflow][task][jobStatus][exitCode].setdefault(site, {})
            finalStruct = jobInfoDoc[workflow][task][jobStatus][exitCode][site]
            finalStruct["errorCount"] = numOfError
            finalStruct.setdefault("samples", [])
            finalStruct["samples"].append(row["doc"])

        return jobInfoDoc

    def getAllAgentRequestRevByID(self, agentURL):
        """Return {doc_id: rev} for every request doc owned by agentURL."""
        options = {"reduce": False}
        results = self.couchDB.loadView(self.couchapp, "byAgentURL", options=options,
                                        keys=[agentURL])
        idRevMap = {}
        for row in results['rows']:
            idRevMap[row['id']] = row['value']['rev']

        return idRevMap
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according to the given condition.
    1. Cleans local couch db when request is completed and reported to
       central db.  This will clean up local couchdb, local summary db,
       local queue.
    2. Cleans old couchdocs which were created earlier than the time
       threshold.
    """

    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config

    def setup(self, parameters):
        """
        Called at startup.  Connects all the couch databases and request
        managers needed by the cleanup cycle.
        """
        self.teamName = self.config.Agent.teamName
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                                   'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL,
                                            appName="WMStatsAgent")

        # TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        if self.useReqMgrForCompletionCheck:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                          couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            # TODO: remove this when reqmgr2 replaces reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL,
                                                          couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch.
          - It deletes old wmstats docs
          - Archive workflows
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(self.config.TaskArchiver.DataKeepDays)
            # lazy %-style args instead of eager string formatting
            logging.info("%s docs deleted", report)

            logging.info("Cleaning up the archived request docs")
            report = self.cleanAlreadyArchivedWorkflows()
            logging.info("%s archived workflows deleted", report)

            # archiving only workflows that I own (same team)
            logging.info("Getting requests in '%s' state for team '%s'",
                         self.deletableState, self.teamName)
            # only archive workflows whose end time is older than the delay
            endTime = int(time.time()) - self.archiveDelayHours * 3600
            wfs = self.centralRequestDBReader.getRequestByTeamAndStatus(self.teamName,
                                                                        self.deletableState)
            commonWfs = self.centralRequestDBReader.getRequestByStatusAndStartTime(self.deletableState,
                                                                                   False, endTime)
            deletableWorkflows = list(set(wfs) & set(commonWfs))
            logging.info("Ready to archive normal %s workflows", len(deletableWorkflows))
            numUpdated = self.archiveWorkflows(deletableWorkflows, "normal-archived")
            logging.info("archive normal %s workflows", numUpdated)

            abortedWorkflows = self.centralRequestDBReader.getRequestByStatus(["aborted-completed"])
            logging.info("Ready to archive aborted %s workflows", len(abortedWorkflows))
            numUpdated = self.archiveWorkflows(abortedWorkflows, "aborted-archived")
            logging.info("archive aborted %s workflows", numUpdated)

            rejectedWorkflows = self.centralRequestDBReader.getRequestByStatus(["rejected"])
            logging.info("Ready to archive rejected %s workflows", len(rejectedWorkflows))
            numUpdated = self.archiveWorkflows(rejectedWorkflows, "rejected-archived")
            logging.info("archive rejected %s workflows", numUpdated)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")

    def archiveWorkflows(self, workflows, archiveState):
        """
        Clean each workflow's local couch data and move it to archiveState in
        the request db; returns the number of workflows updated.
        """
        updated = 0
        for workflowName in workflows:
            if self.cleanAllLocalCouchDB(workflowName):
                if self.useReqMgrForCompletionCheck:
                    try:
                        # TODO: try reqmgr1 call; if it fails fall back
                        # (reqmgr2Only - remove this when reqmgr is replaced)
                        self.reqmgrSvc.updateRequestStatus(workflowName, archiveState)
                        # And replace with this - remove all the exception
                        # self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                    except HTTPException as ex:
                        # If we get an HTTPException of 404 it means a reqmgr2 request
                        if ex.status == 404:
                            # try reqmgr2 call
                            msg = "%s : reqmgr2 request: %s" % (workflowName, str(ex))
                            logging.warning(msg)
                            self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                        else:
                            msg = "%s : fail to update status with HTTP error: %s" % (workflowName, str(ex))
                            logging.error(msg)
                            raise ex

                    updated += 1
                    logging.debug("status updated to %s %s", archiveState, workflowName)
                else:
                    # tier0 update case
                    self.centralRequestDBWriter.updateRequestStatus(workflowName, archiveState)
        return updated

    def deleteWorkflowFromJobCouch(self, workflowName, db):
        """
        _deleteWorkflowFromCouch_

        If we are asked to delete the workflow from couch, delete it
        to clear up some space.

        Load the document IDs and revisions out of couch by workflowName,
        then order a delete on them.
        """
        options = {"startkey": [workflowName], "endkey": [workflowName, {}], "reduce": False}
        if db == "JobDump":
            couchDB = self.jobsdatabase
            view = "jobsByWorkflowName"
        elif db == "FWJRDump":
            couchDB = self.fwjrdatabase
            view = "fwjrsByWorkflowName"
        elif db == "SummaryStats":
            couchDB = self.statsumdatabase
            view = None
        elif db == "WMStatsAgent":
            couchDB = self.wmstatsCouchDB.getDBInstance()
            view = "allWorkflows"
            options = {"key": workflowName, "reduce": False}

        if view is None:
            # SummaryStats keeps a single doc per workflow, keyed by name
            try:
                committed = couchDB.delete_doc(workflowName)
            except CouchNotFoundError as ex:
                return {'status': 'warning', 'message': "%s: %s" % (workflowName, str(ex))}
        else:
            try:
                jobs = couchDB.loadView(db, view, options=options)['rows']
            except Exception as ex:
                errorMsg = "Error on loading jobs for %s" % workflowName
                # was "%s/n%s" -- '/n' was a typo for a newline
                logging.warning("%s\n%s", str(ex), errorMsg)
                return {'status': 'error', 'message': errorMsg}

            for j in jobs:
                doc = {}
                doc["_id"] = j['value']['id']
                doc["_rev"] = j['value']['rev']
                couchDB.queueDelete(doc)
            committed = couchDB.commit()

        if committed:
            # create the error report
            errorReport = {}
            deleted = 0
            status = "ok"
            for data in committed:
                if 'error' in data:
                    errorReport.setdefault(data['error'], 0)
                    errorReport[data['error']] += 1
                    status = "error"
                else:
                    deleted += 1
            return {'status': status, 'delete': deleted, 'message': errorReport}
        else:
            return {'status': 'warning', 'message': "no %s exist" % workflowName}

    def cleanAllLocalCouchDB(self, workflowName):
        """
        Delete one workflow's docs from all local couch databases; returns
        True only when none of the deletions reported an error.
        """
        logging.info("Deleting %s from JobCouch", workflowName)

        jobReport = self.deleteWorkflowFromJobCouch(workflowName, "JobDump")
        logging.debug("%s docs deleted from JobDump", jobReport)

        fwjrReport = self.deleteWorkflowFromJobCouch(workflowName, "FWJRDump")
        logging.debug("%s docs deleted from FWJRDump", fwjrReport)

        summaryReport = self.deleteWorkflowFromJobCouch(workflowName, "SummaryStats")
        logging.debug("%s docs deleted from SummaryStats", summaryReport)

        wmstatsReport = self.deleteWorkflowFromJobCouch(workflowName, "WMStatsAgent")
        logging.debug("%s docs deleted from wmagent_summary", wmstatsReport)

        # if one of the procedures fails return False
        if (jobReport["status"] == "error" or fwjrReport["status"] == "error" or
                wmstatsReport["status"] == "error"):
            return False
        # otherwise return True.
        return True

    def cleanAlreadyArchivedWorkflows(self):
        """
        Loop through the workflows in couchdb; if archived, delete all the
        data in couchdb.  Returns the number of requests cleaned.
        """
        numDeletedRequests = 0
        try:
            localWMStats = self.wmstatsCouchDB.getDBInstance()
            options = {"group_level": 1, "reduce": True}

            results = localWMStats.loadView("WMStatsAgent", "allWorkflows", options=options)['rows']
            requestNames = [x['key'] for x in results]
            logging.info("There are %s workfows to check for archived status", len(requestNames))

            workflowDict = self.centralRequestDBReader.getStatusAndTypeByRequest(requestNames)

            for request, value in workflowDict.items():
                if value[0].endswith("-archived"):
                    self.cleanAllLocalCouchDB(request)
                    numDeletedRequests += 1

        except Exception as ex:
            errorMsg = "Error on loading workflow list from wmagent_summary db"
            # was "%s/n%s" -- '/n' was a typo for a newline
            logging.warning("%s\n%s", errorMsg, str(ex))

        return numDeletedRequests
class RequestDBTest(unittest.TestCase):
    """
    Round-trip tests for RequestDBWriter / RequestDBReader against a
    disposable couch database created per test.
    """

    def setUp(self):
        """
        _setUp_
        """
        self.schema = []
        self.couchApps = ["ReqMgr"]
        self.testInit = TestInitCouchApp('RequestDBServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        dbName = 'requsetdb_t'
        self.testInit.setupCouch(dbName, *self.couchApps)
        reqDBURL = "%s/%s" % (self.testInit.couchUrl, dbName)
        self.requestWriter = RequestDBWriter(reqDBURL)
        self.requestReader = RequestDBReader(reqDBURL)
        # disable stale views so reads see writes immediately
        self.requestWriter.defaultStale = {}
        self.requestReader.defaultStale = {}
        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        self.testInit.tearDownCouch()

    def testRequestDBWriter(self):
        # test getWork
        schema = generate_reqmgr_schema(3)

        # insert + status update round trip
        result = self.requestWriter.insertGenericRequest(schema[0])
        self.assertEqual(len(result), 1, 'insert fail')
        self.assertEqual(
            self.requestWriter.updateRequestStatus(schema[0]['RequestName'], "failed"),
            'OK', 'update fail')
        self.assertEqual(
            self.requestWriter.updateRequestStatus("not_exist_schema", "assigned"),
            'Error: document not found')

        # property updates, including the missing-document error path
        result = self.requestWriter.updateRequestProperty(
            schema[0]['RequestName'], {'Teams': ['teamA']})
        self.assertEqual(
            self.requestWriter.updateRequestProperty(
                schema[0]['RequestName'], {'Teams': ['teamA']}),
            'OK', 'update fail')
        self.assertEqual(
            self.requestWriter.updateRequestProperty(
                "not_exist_schema", {'Teams': 'teamA'}),
            'Error: document not found')

        # reader lookups by name and by status
        result = self.requestReader.getRequestByNames([schema[0]['RequestName']])
        self.assertEqual(len(result), 1, "should be 1")
        result = self.requestReader.getRequestByStatus(["failed"], False, 1)
        self.assertEqual(len(result), 1, "should be 1")
        result = self.requestReader.getStatusAndTypeByRequest([schema[0]['RequestName']])
        self.assertEqual(result[schema[0]['RequestName']][0], 'failed',
                         "should be failed")

        # two more inserts separated in time, then query by status + end time
        result = self.requestWriter.insertGenericRequest(schema[1])
        time.sleep(2)
        result = self.requestWriter.insertGenericRequest(schema[2])
        endTime = int(time.time()) - 1
        result = self.requestReader.getRequestByStatusAndEndTime("new", False, endTime)
        self.assertEqual(len(result), 1, "should be 1")
        endTime = int(time.time()) + 1
        result = self.requestReader.getRequestByStatusAndEndTime("new", False, endTime)
        self.assertEqual(len(result), 2, "should be 2")
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according the the given condition.
    1. Cleans local couch db when request is completed and reported to cental db.
       This will clean up local couchdb, local summary db, local queue
    2. Cleans old couchdoc which is created older than the time threshold
    """

    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config

    def setup(self, parameters):
        """
        Called at startup: cache config values and open the couch/ReqMgr
        connections used by each polling cycle.
        """
        self.teamName = self.config.Agent.teamName
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                                   'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL,
                                            appName="WMStatsAgent")
        # TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(
            self.config.AnalyticsDataCollector.centralRequestDBURL,
            couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        if self.useReqMgrForCompletionCheck:
            # ReqMgr-managed agent: workflows become deletable once "announced"
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.centralRequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            # TODO: remove this when reqmgr2 replace reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager(
                {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.localT0RequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch.
          - It deletes old wmstats docs
          - Archive workflows
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted" % report)
            logging.info("Cleaning up the archived request docs")
            report = self.cleanAlreadyArchivedWorkflows()
            logging.info("%s archived workflows deleted" % report)

            # archiving only workflows that I own (same team)
            logging.info("Getting requests in '%s' state for team '%s'",
                         self.deletableState, self.teamName)
            endTime = int(time.time()) - self.archiveDelayHours * 3600
            wfs = self.centralRequestDBReader.getRequestByTeamAndStatus(
                self.teamName, self.deletableState)
            commonWfs = self.centralRequestDBReader.getRequestByStatusAndStartTime(
                self.deletableState, False, endTime)
            # only archive workflows that are owned by this team AND old enough
            deletableWorkflows = list(set(wfs) & set(commonWfs))
            logging.info("Ready to archive normal %s workflows", len(deletableWorkflows))
            numUpdated = self.archiveWorkflows(deletableWorkflows, "normal-archived")
            logging.info("archive normal %s workflows", numUpdated)

            abortedWorkflows = self.centralRequestDBReader.getRequestByStatus(
                ["aborted-completed"])
            logging.info("Ready to archive aborted %s workflows", len(abortedWorkflows))
            numUpdated = self.archiveWorkflows(abortedWorkflows, "aborted-archived")
            logging.info("archive aborted %s workflows", numUpdated)

            rejectedWorkflows = self.centralRequestDBReader.getRequestByStatus(["rejected"])
            logging.info("Ready to archive rejected %s workflows", len(rejectedWorkflows))
            numUpdated = self.archiveWorkflows(rejectedWorkflows, "rejected-archived")
            logging.info("archive rejected %s workflows", numUpdated)
        except Exception as ex:
            # keep the poller alive; the work is retried on the next cycle
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")

    def archiveWorkflows(self, workflows, archiveState):
        """
        Delete each workflow's docs from the local couch dbs and, on success,
        advance its request status to archiveState.

        :param workflows: iterable of workflow (request) names
        :param archiveState: target status, e.g. "normal-archived"
        :return: number of workflows whose status was updated
        """
        updated = 0
        for workflowName in workflows:
            if self.cleanAllLocalCouchDB(workflowName):
                if self.useReqMgrForCompletionCheck:
                    try:
                        # TODO: try reqmgr1 call if it fails (reqmgr2Only - remove this line when reqmgr is replaced)
                        self.reqmgrSvc.updateRequestStatus(workflowName, archiveState)
                        # And replace with this - remove all the excption
                        # self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                    except HTTPException as ex:
                        # If we get an HTTPException of 404 means reqmgr2 request
                        if ex.status == 404:
                            # try reqmgr2 call
                            msg = "%s : reqmgr2 request: %s" % (workflowName, str(ex))
                            logging.warning(msg)
                            self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                        else:
                            msg = "%s : fail to update status with HTTP error: %s" % (
                                workflowName, str(ex))
                            logging.error(msg)
                            raise ex
                else:
                    # tier0 update case
                    self.centralRequestDBWriter.updateRequestStatus(workflowName,
                                                                    archiveState)
                # FIX: count the update in the Tier0 branch too; previously only
                # the ReqMgr branch incremented, so Tier0 always returned 0.
                updated += 1
                logging.debug("status updated to %s %s", archiveState, workflowName)
        return updated

    def deleteWorkflowFromJobCouch(self, workflowName, db):
        """
        _deleteWorkflowFromCouch_

        If we are asked to delete the workflow from couch, delete it
        to clear up some space.

        Load the document IDs and revisions out of couch by workflowName,
        then order a delete on them.

        :param workflowName: name of the workflow to purge
        :param db: one of "JobDump", "FWJRDump", "SummaryStats", "WMStatsAgent"
        :return: dict with 'status' ("ok"/"warning"/"error") and details
        """
        options = {"startkey": [workflowName], "endkey": [workflowName, {}],
                   "reduce": False}
        if db == "JobDump":
            couchDB = self.jobsdatabase
            view = "jobsByWorkflowName"
        elif db == "FWJRDump":
            couchDB = self.fwjrdatabase
            view = "fwjrsByWorkflowName"
        elif db == "SummaryStats":
            couchDB = self.statsumdatabase
            view = None
        elif db == "WMStatsAgent":
            couchDB = self.wmstatsCouchDB.getDBInstance()
            view = "allWorkflows"
            options = {"key": workflowName, "reduce": False}
        # NOTE(review): an unknown db value leaves couchDB/view unbound and
        # raises NameError below -- callers only pass the four names above.

        if view is None:
            # SummaryStats keeps a single doc per workflow, keyed by its name
            try:
                committed = couchDB.delete_doc(workflowName)
            except CouchNotFoundError as ex:
                return {'status': 'warning',
                        'message': "%s: %s" % (workflowName, str(ex))}
        else:
            try:
                jobs = couchDB.loadView(db, view, options=options)['rows']
            except Exception as ex:
                errorMsg = "Error on loading jobs for %s" % workflowName
                # FIX: "%s/n%s" printed a literal "/n"; use a real newline
                logging.warning("%s\n%s" % (str(ex), errorMsg))
                return {'status': 'error', 'message': errorMsg}
            # queue a bulk delete of every doc the view returned
            for j in jobs:
                doc = {}
                doc["_id"] = j['value']['id']
                doc["_rev"] = j['value']['rev']
                couchDB.queueDelete(doc)
            committed = couchDB.commit()

        if committed:
            # create the error report: tally per-error counts from the bulk result
            errorReport = {}
            deleted = 0
            status = "ok"
            for data in committed:
                if 'error' in data:
                    errorReport.setdefault(data['error'], 0)
                    errorReport[data['error']] += 1
                    status = "error"
                else:
                    deleted += 1
            return {'status': status, 'delete': deleted, 'message': errorReport}
        else:
            return {'status': 'warning', 'message': "no %s exist" % workflowName}

    def cleanAllLocalCouchDB(self, workflowName):
        """
        Delete workflowName's documents from every local couch database.

        :return: True when the mandatory deletions succeeded, False otherwise
        """
        logging.info("Deleting %s from JobCouch" % workflowName)
        jobReport = self.deleteWorkflowFromJobCouch(workflowName, "JobDump")
        logging.debug("%s docs deleted from JobDump", jobReport)
        fwjrReport = self.deleteWorkflowFromJobCouch(workflowName, "FWJRDump")
        logging.debug("%s docs deleted from FWJRDump", fwjrReport)
        summaryReport = self.deleteWorkflowFromJobCouch(workflowName, "SummaryStats")
        logging.debug("%s docs deleted from SummaryStats", summaryReport)
        wmstatsReport = self.deleteWorkflowFromJobCouch(workflowName, "WMStatsAgent")
        logging.debug("%s docs deleted from wmagent_summary", wmstatsReport)
        # if one of the procedure fails return False
        # NOTE(review): summaryReport is not part of this check, so SummaryStats
        # deletion is best-effort -- confirm this is intentional before tightening.
        if (jobReport["status"] == "error" or
                fwjrReport["status"] == "error" or
                wmstatsReport["status"] == "error"):
            return False
        # other wise return True.
        return True

    def cleanAlreadyArchivedWorkflows(self):
        """
        loop through the workflows in couchdb, if archived delete all the data
        in couchdb

        :return: number of archived workflows cleaned this cycle
        """
        numDeletedRequests = 0
        try:
            localWMStats = self.wmstatsCouchDB.getDBInstance()
            options = {"group_level": 1, "reduce": True}
            results = localWMStats.loadView("WMStatsAgent", "allWorkflows",
                                            options=options)['rows']
            requestNames = [x['key'] for x in results]
            logging.info("There are %s workflows to check for archived status"
                         % len(requestNames))
            workflowDict = self.centralRequestDBReader.getStatusAndTypeByRequest(
                requestNames)
            for request, value in workflowDict.items():
                # value[0] is the request status; clean anything already archived
                if value[0].endswith("-archived"):
                    self.cleanAllLocalCouchDB(request)
                    numDeletedRequests += 1
        except Exception as ex:
            errorMsg = "Error on loading workflow list from wmagent_summary db"
            # FIX: "%s/n%s" printed a literal "/n"; use a real newline
            logging.warning("%s\n%s" % (errorMsg, str(ex)))
        return numDeletedRequests