def setup(self, parameters):
    """
    Set up the DB connections (CouchDB, WMBS) used to gather information.
    """
    adcConfig = self.config.AnalyticsDataCollector
    # Connection to the local work queue service.
    self.localQueue = WorkQueueService(adcConfig.localQueueURL)
    # Local CouchDB access for job summary data.
    self.localCouchDB = LocalCouchDBData(adcConfig.localCouchURL,
                                         self.config.JobStateMachine.summaryStatsDBName,
                                         self.summaryLevel)
    # Interface to the WMBS/BossAir database via the current thread.
    currentThread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, currentThread.dbi, currentThread.logger)
    # Writer against the local WMStats couch instance.
    self.localSummaryCouchDB = WMStatsWriter(adcConfig.localWMStatsURL, "WMStatsAgent")
    # Tier0 keeps its request DB locally; other agents use the central one.
    if hasattr(self.config, "Tier0Feeder"):
        requestDBURL = adcConfig.localT0RequestDBURL
    else:
        requestDBURL = adcConfig.centralRequestDBURL
    self.centralRequestCouchDB = RequestDBWriter(requestDBURL,
                                                 couchapp=adcConfig.RequestCouchApp)
    # TODO: change the config to hold couch url
    self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
    if self.pluginName is not None:
        factory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
        self.plugin = factory.loadObject(classname=self.pluginName)
def assignRequest(requestName, teamName, prodMgr=None, wmstatUrl=None):
    """
    _assignRequest_

    Assign a request to a team.  This does the following:
      - Changes the status to assigned
      - Creates an association to the team provided
      - Optionally associates the request to a prod mgr instance

    :param requestName: name of the request to assign
    :param teamName: name of the team to own the request
    :param prodMgr: optional prod mgr instance to associate
    :param wmstatUrl: optional WMStats couch URL to mirror the team update
    :raises RuntimeError: if the team is not known in the database
    """
    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    teamId = factory(classname="Team.ID").execute(teamName)
    if teamId is None:
        # Bug fix: the two sentences previously ran together with no
        # separator, producing "...databaseFailed to assign...".
        msg = "Team named %s not known in database. " % teamName
        msg += "Failed to assign request %s to team %s" % (requestName, teamName)
        # Python-3-compatible raise (was the Py2-only "raise RuntimeError, msg").
        raise RuntimeError(msg)
    if wmstatUrl:
        # Mirror the team assignment into WMStats.
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateTeam(requestName, teamName)
    assigner = factory(classname="Assignment.New")
    assigner.execute(reqId, teamId)
    changeRequestStatus(requestName, "assigned", priority=None, wmstatUrl=wmstatUrl)
    if prodMgr is not None:
        addPM = factory(classname="Progress.ProdMgr")
        addPM.execute(reqId, prodMgr)
def setup(self, parameters):
    """
    Called at startup: wire up WMStats readers/writers and the JSM couch DBs.
    """
    taConfig = self.config.TaskArchiver
    self.useReqMgrForCompletionCheck = getattr(taConfig, 'useReqMgrForCompletionCheck', True)
    # Local writer and central reader for WMStats.
    self.wmstatsCouchDB = WMStatsWriter(taConfig.localWMStatsURL)
    self.centralCouchDBReader = WMStatsReader(taConfig.centralWMStatsURL)
    if self.useReqMgrForCompletionCheck:
        self.deletableStates = ["announced"]
        self.centralCouchDBWriter = WMStatsWriter(taConfig.centralWMStatsURL)
        self.reqmgrSvc = RequestManager({'endpoint': taConfig.ReqMgrServiceURL})
    else:
        # Tier0 case: no central instance; updates go to the local writer.
        self.deletableStates = ["completed"]
        self.centralCouchDBWriter = self.wmstatsCouchDB
    # Job-state-machine couch databases (jobs, fwjrs, stat summaries).
    jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
    jobDBName = self.config.JobStateMachine.couchDBName
    self.jobCouchdb = CouchServer(jobDBurl)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
    self.statsumdatabase = self.jobCouchdb.connectDatabase(self.config.JobStateMachine.summaryStatsDBName)
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Periodic task that reports component thread status (heartbeat) to WMStats.
    Subclasses extend the report via addAdditionalMonitorReport().
    """

    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list

    def setConcurrentTasks(self, config):
        """
        Register the single concurrent task: the WMStats heartbeat report.
        """
        self.concurrentTasks = [{'func': self.reportToWMStats,
                                 'duration': config.heartbeatCheckDuration}]

    def reportToWMStats(self, config):
        """
        Report thread status and heartbeat, plus any extra monitoring
        information supplied by addAdditionalMonitorReport().
        """
        self.logger.info("Checking Thread status...")
        statusReport = self.logDB.wmstats_down_components_report(self.threadList)
        statusReport.update(self.addAdditionalMonitorReport(config))
        serviceDoc = convertToServiceCouchDoc(statusReport, config.log_reporter)
        self.centralWMStats.updateAgentInfo(serviceDoc)
        self.logger.info("Uploaded to WMStats...")
        return

    def addAdditionalMonitorReport(self, config):
        """
        Hook for subclasses: return extra monitoring info to merge into the
        heartbeat report (must follow the format displayed in wmstats).
        """
        return {}
def __init__(self, rest, config):
    """
    Store the central WMStats writer, the monitored thread list and the
    optional ActiveMQ reporting settings.
    """
    super(HeartbeatMonitorBase, self).__init__(config)
    self.centralWMStats = WMStatsWriter(config.wmstats_url)
    self.threadList = config.thread_list
    # ActiveMQ reporting is optional; unset values leave it disabled.
    self.postToAMQ = getattr(config, "post_to_amq", False)
    self.userAMQ = getattr(config, "user_amq", None)
    self.passAMQ = getattr(config, "pass_amq", None)
    self.topicAMQ = getattr(config, "topic_amq", None)
    self.hostPortAMQ = getattr(config, "host_port_amq", None)
def saveWorkload(helper, workload, wmstatUrl=None):
    """
    Persist the changes held in *helper* to the given workload location.

    A couch URL target is saved remotely (optionally refreshing WMStats);
    anything else is treated as a local path.
    """
    if not workload.startswith('http'):
        # Plain file/path target: persist directly and stop.
        helper.save(workload)
        return
    # Couch URL target: save there, then optionally refresh WMStats.
    helper.saveCouchUrl(workload)
    if wmstatUrl:
        WMStatsWriter(wmstatUrl).updateFromWMSpec(helper)
def saveWorkload(helper, workload, wmstatUrl = None):
    """
    Save the changes carried by *helper* to *workload*.

    When *workload* is a couch URL the spec is saved remotely and, if a
    WMStats URL is supplied, WMStats is updated from the spec as well.
    Otherwise *workload* is treated as a local target.
    """
    isCouchTarget = workload.startswith('http')
    if isCouchTarget:
        helper.saveCouchUrl(workload)
        if wmstatUrl:
            statsWriter = WMStatsWriter(wmstatUrl)
            statsWriter.updateFromWMSpec(helper)
    else:
        helper.save(workload)
class CleanUpTask(CherryPyPeriodicTask):
    """
    Periodic cleanup of WMStats documents.  Serves both T0WMStats and
    WMStats, selected via config.reqdb_couch_app.
    """

    def __init__(self, rest, config):
        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp=config.reqdb_couch_app)

    def setConcurrentTasks(self, config):
        """
        Register the concurrently-running cleanup functions.
        """
        keepSeconds = config.DataKeepDays * 24 * 60 * 60
        self.concurrentTasks = [{'func': self.cleanUpOldRequests, 'duration': keepSeconds},
                                {'func': self.cleanUpArchivedRequests,
                                 'duration': config.archivedCleanUpDuration}]

    def cleanUpOldRequests(self, config):
        """
        Delete wmstats documents older than the configured number of days.
        """
        self.logger.info("deleting %s hours old docs", (config.DataKeepDays * 24))
        deleted = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted", deleted)
        return

    def cleanUpArchivedRequests(self, config):
        """
        Walk the workflows in couchdb and, for archived ones, delete all
        their couchdb data.
        """
        self.logger.info("getting archived data")
        archived = self.wmstatsDB.getArchivedRequests()
        self.logger.info("archived list %s", archived)
        for workflow in archived:
            self.logger.info("Deleting data for: %s", workflow)
            try:
                deleted = self.wmstatsDB.deleteDocsByWorkflow(workflow)
            except Exception as ex:
                self.logger.error("deleting %s failed: %s", workflow, str(ex))
                for line in traceback.format_exc().rstrip().split("\n"):
                    self.logger.error(" " + line)
            else:
                if deleted is None:
                    self.logger.info("there were no documents to delete.")
                else:
                    self.logger.info("%s docs deleted", len(deleted))
        return
def setUp(self):
    """
    _setUp_

    Prepare a couch backend loaded with the WMStats couchapp and build a
    writer against it.
    """
    self.schema = []
    self.couchApps = ["WMStats"]
    init = TestInitCouchApp('WorkQueueServiceTest')
    init.setLogging()
    init.setDatabaseConnection()
    init.setSchema(customModules=self.schema, useDefault=False)
    init.setupCouch('wmstats_t', *self.couchApps)
    self.testInit = init
    self.wmstatsWriter = WMStatsWriter(init.couchUrl, 'wmstats_t')
    return
def setup(self, parameters):
    """
    Establish the DB connections (couchdb, wmbs) needed to gather
    information, then start couch replication.
    """
    # WMBS/BossAir access goes through the current worker thread.
    workerThread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, workerThread.dbi, workerThread.logger)
    self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
    self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
    self.setUpCouchDBReplication()
class CleanUpTask(CherryPyPeriodicTask):
    """
    Cleanup task shared by T0WMStats and WMStats; the couchapp in use is
    chosen through config.reqdb_couch_app.
    """

    def __init__(self, rest, config):
        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp=config.reqdb_couch_app)

    def setConcurrentTasks(self, config):
        """
        Declare the two cleanup functions that run concurrently.
        """
        self.concurrentTasks = [
            {'func': self.cleanUpOldRequests,
             'duration': (config.DataKeepDays * 24 * 60 * 60)},
            {'func': self.cleanUpArchivedRequests,
             'duration': config.archivedCleanUpDuration},
        ]

    def cleanUpOldRequests(self, config):
        """
        Remove wmstats documents older than config.DataKeepDays days.
        """
        self.logger.info("deleting %s hours old docs", (config.DataKeepDays * 24))
        count = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted", count)
        return

    def cleanUpArchivedRequests(self, config):
        """
        For every archived workflow found in couchdb, delete all of its
        couchdb documents; failures are logged and do not stop the loop.
        """
        self.logger.info("getting archived data")
        workflows = self.wmstatsDB.getArchivedRequests()
        self.logger.info("archived list %s", workflows)
        for name in workflows:
            self.logger.info("Deleting data for: %s", name)
            try:
                docs = self.wmstatsDB.deleteDocsByWorkflow(name)
            except Exception as ex:
                self.logger.error("deleting %s failed: %s", name, str(ex))
                for traceLine in traceback.format_exc().rstrip().split("\n"):
                    self.logger.error(" " + traceLine)
            else:
                if docs is None:
                    self.logger.info("there were no documents to delete.")
                else:
                    self.logger.info("%s docs deleted", len(docs))
        return
def setup(self, parameters):
    """
    Startup hook: wire up local/central WMStats services and the
    job-state-machine couch databases.
    """
    archiverConfig = self.config.TaskArchiver
    self.useReqMgrForCompletionCheck = getattr(archiverConfig, 'useReqMgrForCompletionCheck', True)
    self.wmstatsCouchDB = WMStatsWriter(archiverConfig.localWMStatsURL)
    self.centralCouchDBWriter = WMStatsWriter(archiverConfig.centralWMStatsURL)
    self.centralCouchDBReader = WMStatsReader(archiverConfig.centralWMStatsURL)
    # Jobs and framework-job-report databases live under the JSM couch.
    couchURL = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
    dbName = self.config.JobStateMachine.couchDBName
    self.jobCouchdb = CouchServer(couchURL)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % dbName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % dbName)
def processInboundWork(self, inbound_work = None, throw = False):
    """Retrieve work from inbox, split and store
    If request passed then only process that request

    NOTE(review): this view of the function ends at the final `continue`;
    the success path (accumulating into `result`) appears to live past the
    visible end — confirm against the full file.
    """
    if self.params['LocalQueueFlag']:
        self.backend.fixConflicts()  # db should be consistent
    result = []
    # No explicit request list given: pull everything ready for splitting.
    if not inbound_work:
        inbound_work = self.backend.getElementsForSplitting()
    for inbound in inbound_work:
        # Check we haven't already split the work
        work = self.backend.getElementsForParent(inbound)
        try:
            if work:
                self.logger.info('Request "%s" already split - Resuming' % inbound['RequestName'])
            else:
                work, totalStats = self._splitWork(inbound['WMSpec'], None, inbound['Inputs'], inbound['Mask'])
                self.backend.insertElements(work, parent = inbound)  # if this fails, rerunning will pick up here
                # save inbound work to signal we have completed queueing
                # add the total work on wmstat summary
                self.backend.updateInboxElements(inbound.id, Status = 'Acquired')
                if not self.params.get('LocalQueueFlag') and self.params.get('WMStatsCouchUrl'):
                    # only update global stats for global queue
                    try:
                        wmstatSvc = WMStatsWriter(self.params.get('WMStatsCouchUrl'))
                        wmstatSvc.insertTotalStats(inbound['WMSpec'].name(), totalStats)
                    except Exception, ex:
                        # Best-effort: a WMStats failure must not fail the split.
                        self.logger.info('Error publishing %s to WMStats: %s' % (inbound['RequestName'], str(ex)))
        except TERMINAL_EXCEPTIONS, ex:
            # Known-fatal errors: fail the workflow immediately.
            self.logger.info('Failing workflow "%s": %s' % (inbound['RequestName'], str(ex)))
            self.backend.updateInboxElements(inbound.id, Status = 'Failed')
            if throw:
                raise
        except Exception, ex:
            # if request has been failing for too long permanently fail it.
            # last update time was when element was assigned to this queue
            if (float(inbound.updatetime) + self.params['QueueRetryTime']) < time.time():
                self.logger.info('Failing workflow "%s" as not queued in %d secs: %s' % (inbound['RequestName'], self.params['QueueRetryTime'], str(ex)))
                self.backend.updateInboxElements(inbound.id, Status = 'Failed')
            else:
                self.logger.info('Exception splitting work for wmspec "%s": %s' % (inbound['RequestName'], str(ex)))
                if throw:
                    raise
            # NOTE(review): indentation of `if throw` / `continue` inferred
            # from the collapsed source — verify against upstream.
            continue
def setup(self, parameters): """ Called at startup """ # set the connection for local couchDB call self.useReqMgrForCompletionCheck = getattr( self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) self.archiveDelayHours = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0) self.wmstatsCouchDB = WMStatsWriter( self.config.TaskArchiver.localWMStatsURL, "WMStatsAgent") #TODO: we might need to use local db for Tier0 self.centralRequestDBReader = RequestDBReader( self.config.AnalyticsDataCollector.centralRequestDBURL, couchapp=self.config.AnalyticsDataCollector.RequestCouchApp) if self.useReqMgrForCompletionCheck: self.deletableState = "announced" self.centralRequestDBWriter = RequestDBWriter( self.config.AnalyticsDataCollector.centralRequestDBURL, couchapp=self.config.AnalyticsDataCollector.RequestCouchApp) if self.config.TaskArchiver.reqmgr2Only: self.reqmgr2Svc = ReqMgr( self.config.TaskArchiver.ReqMgr2ServiceURL) else: #TODO: remove this for reqmgr2 self.reqmgrSvc = RequestManager( {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL}) else: # Tier0 case self.deletableState = "completed" # use local for update self.centralRequestDBWriter = RequestDBWriter( self.config.AnalyticsDataCollector.localT0RequestDBURL, couchapp=self.config.AnalyticsDataCollector.RequestCouchApp) jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url'] jobDBName = self.config.JobStateMachine.couchDBName self.jobCouchdb = CouchServer(jobDBurl) self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName) self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName) statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName self.statsumdatabase = self.jobCouchdb.connectDatabase( statSummaryDBName)
def assignRequest(requestName, teamName, priorityModifier=0, prodMgr=None, wmstatUrl=None):
    """
    _assignRequest_

    Assign a request to a team.  This does the following:
      - Changes the status to assigned
      - Creates an association to the team provided
      - Optionally associates the request to a prod mgr instance
      - Optionally sets the priority modifier for the team (allows same
        request to be shared between two teams with different priorities)

    :param requestName: name of the request to assign
    :param teamName: name of the team to own the request
    :param priorityModifier: per-team priority offset (default 0)
    :param prodMgr: optional prod mgr instance to associate
    :param wmstatUrl: optional WMStats couch URL to mirror the team update
    :raises RuntimeError: if the team is not known in the database
    """
    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    teamId = factory(classname="Team.ID").execute(teamName)
    if teamId is None:
        # Bug fix: the two sentences previously ran together with no
        # separator, producing "...databaseFailed to assign...".
        msg = "Team named %s not known in database. " % teamName
        msg += "Failed to assign request %s to team %s" % (requestName, teamName)
        # Python-3-compatible raise (was the Py2-only "raise RuntimeError, msg").
        raise RuntimeError(msg)
    if wmstatUrl:
        # Mirror the team assignment into WMStats.
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateTeam(requestName, teamName)
    assigner = factory(classname="Assignment.New")
    assigner.execute(reqId, teamId, priorityModifier)
    changeRequestStatus(requestName, 'assigned', priority=None, wmstatUrl=wmstatUrl)
    if prodMgr is not None:
        addPM = factory(classname="Progress.ProdMgr")
        addPM.execute(reqId, prodMgr)
    return
def setup(self, parameters):
    """
    Called at startup: build the WMStats services, the ReqMgr client (when
    enabled) and the job-state-machine couch database handles.
    """
    archiver = self.config.TaskArchiver
    jsm = self.config.JobStateMachine
    self.useReqMgrForCompletionCheck = getattr(archiver, 'useReqMgrForCompletionCheck', True)
    self.wmstatsCouchDB = WMStatsWriter(archiver.localWMStatsURL)
    self.centralCouchDBReader = WMStatsReader(archiver.centralWMStatsURL)
    if self.useReqMgrForCompletionCheck:
        self.deletableStates = ["announced"]
        self.centralCouchDBWriter = WMStatsWriter(archiver.centralWMStatsURL)
        self.reqmgrSvc = RequestManager({'endpoint': archiver.ReqMgrServiceURL})
    else:
        # Tier0 case: write locally instead of to a central instance.
        self.deletableStates = ["completed"]
        self.centralCouchDBWriter = self.wmstatsCouchDB
    couchBase = sanitizeURL(jsm.couchurl)['url']
    self.jobCouchdb = CouchServer(couchBase)
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jsm.couchDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jsm.couchDBName)
    self.statsumdatabase = self.jobCouchdb.connectDatabase(jsm.summaryStatsDBName)
def setup(self, parameters):
    """
    Called at startup: connect WMStats, the request DB reader/writer and
    the job-state-machine couch databases.
    """
    taskArchiver = self.config.TaskArchiver
    analytics = self.config.AnalyticsDataCollector
    self.useReqMgrForCompletionCheck = getattr(taskArchiver, 'useReqMgrForCompletionCheck', True)
    self.wmstatsCouchDB = WMStatsWriter(taskArchiver.localWMStatsURL)
    # TODO: we might need to use local db for Tier0
    self.centralRequestDBReader = RequestDBReader(analytics.centralRequestDBURL,
                                                  couchapp=analytics.RequestCouchApp)
    if self.useReqMgrForCompletionCheck:
        self.deletableStates = ["announced"]
        self.centralRequestDBWriter = RequestDBWriter(analytics.centralRequestDBURL,
                                                      couchapp=analytics.RequestCouchApp)
        # TODO: remove this for reqmgr2
        self.reqmgrSvc = RequestManager({'endpoint': taskArchiver.ReqMgrServiceURL})
    else:
        # Tier0 case: the request DB lives locally and is used for updates.
        self.deletableStates = ["completed"]
        self.centralRequestDBWriter = RequestDBWriter(analytics.localT0RequestDBURL,
                                                      couchapp=analytics.RequestCouchApp)
    jsm = self.config.JobStateMachine
    self.jobCouchdb = CouchServer(sanitizeURL(jsm.couchurl)['url'])
    self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jsm.couchDBName)
    self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jsm.couchDBName)
    self.statsumdatabase = self.jobCouchdb.connectDatabase(jsm.summaryStatsDBName)
def setup(self, parameters):
    """
    Prepare DB connections (couchdb, wmbs) used to gather monitoring
    information.
    """
    adc = self.config.AnalyticsDataCollector
    isTier0 = hasattr(self.config, "Tier0Feeder")
    # Tier0 runs without a local workqueue service.
    if not isTier0:
        self.localQueue = WorkQueueService(adc.localQueueURL)
    self.localCouchDB = LocalCouchDBData(adc.localCouchURL,
                                         self.config.JobStateMachine.summaryStatsDBName,
                                         self.summaryLevel)
    # WMBS/BossAir access via the current thread.
    thread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, thread.dbi, thread.logger)
    self.localSummaryCouchDB = WMStatsWriter(adc.localWMStatsURL, appName="WMStatsAgent")
    # use local db for tier0, central otherwise
    if isTier0:
        requestDBURL = adc.localT0RequestDBURL
    else:
        requestDBURL = adc.centralRequestDBURL
    self.centralRequestCouchDB = RequestDBWriter(requestDBURL, couchapp=adc.RequestCouchApp)
    # TODO: change the config to hold couch url
    self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
    if self.pluginName is not None:
        loader = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
        self.plugin = loader.loadObject(classname=self.pluginName)
def setup(self, parameters):
    """
    Prepare couch/wmbs connections and start replicating the local summary
    database to the central monitor.
    """
    adc = self.config.AnalyticsDataCollector
    self.localCouchDB = LocalCouchDBData(adc.localCouchURL, self.summaryLevel)
    # WMBS/BossAir access via the current thread.
    agentThread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, agentThread.dbi, agentThread.logger)
    self.localSummaryCouchDB = WMStatsWriter(adc.localWMStatsURL)
    logging.info("Setting the replication to central monitor ...")
    self.localSummaryCouchDB.replicate(adc.centralWMStatsURL)
    self.centralWMStatsCouchDB = WMStatsWriter(adc.centralWMStatsURL)
    if self.pluginName is not None:
        loader = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
        self.plugin = loader.loadObject(classname=self.pluginName)
class CleanUpTask(CherryPyPeriodicTask):
    """
    This class is used for both T0WMStats and WMStats
    controlled by config.reqdb_couch_app value
    """

    def __init__(self, rest, config):
        # Cooperative super() for consistency with the other periodic tasks.
        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp=config.reqdb_couch_app)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which run concurrently
        """
        self.concurrentTasks = [{'func': self.cleanUpOldRequests,
                                 'duration': (config.DataKeepDays * 24 * 60 * 60)},
                                {'func': self.cleanUpArchivedRequests,
                                 'duration': config.archivedCleanUpDuration}]

    def cleanUpOldRequests(self, config):
        """
        clean up wmstats data older then given days
        """
        # Lazy %-args: formatting happens only when the level is enabled.
        self.logger.info("deleting %s hours old docs", config.DataKeepDays * 24)
        result = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted", result)
        return

    def cleanUpArchivedRequests(self, config):
        """
        loop through the workflows in couchdb, if archived delete all the
        data in couchdb
        """
        requestNames = self.wmstatsDB.getArchivedRequests()
        for req in requestNames:
            self.logger.info("deleting %s data", req)
            result = self.wmstatsDB.deleteDocsByWorkflow(req)
            self.logger.info("%s deleted", result)
        return
class WMStatsTest(unittest.TestCase):
    """
    Unit tests for the WMStatsWriter service API.
    """

    def setUp(self):
        """
        _setUp_
        """
        self.schema = []
        self.couchApps = ["WMStats"]
        self.testInit = TestInitCouchApp('WorkQueueServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        self.testInit.setupCouch('wmstats_t', *self.couchApps)
        self.wmstatsWriter = WMStatsWriter(self.testInit.couchUrl, 'wmstats_t')
        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        self.testInit.tearDownCouch()

    def testWMStatsWriter(self):
        # test getWork
        schema = generate_reqmgr_schema()
        # assertEquals is a deprecated alias; use assertEqual throughout.
        self.assertEqual(self.wmstatsWriter.insertRequest(schema[0]), 'OK', 'insert fail')
        self.assertEqual(self.wmstatsWriter.updateRequestStatus(schema[0]['RequestName'], "failed"),
                         'OK', 'update fail')
        self.assertEqual(self.wmstatsWriter.updateRequestStatus("not_exist_schema", "assigned"),
                         'ERROR: request not found - not_exist_schema')
        self.assertEqual(self.wmstatsWriter.updateTeam(schema[0]['RequestName'], 'teamA'),
                         'OK', 'update fail')
        self.assertEqual(self.wmstatsWriter.updateTeam("not_exist_schema", 'teamA'),
                         'ERROR: request not found - not_exist_schema')
        totalStats = {'total_jobs': 100, 'input_events': 1000,
                      'input_lumis': 1234, 'input_num_files': 5}
        self.assertEqual(self.wmstatsWriter.insertTotalStats(schema[0]['RequestName'], totalStats),
                         'INSERTED', 'update fail')
        self.assertEqual(self.wmstatsWriter.insertTotalStats(schema[0]['RequestName'], totalStats),
                         'UPDATED', 'update fail')
        self.assertEqual(self.wmstatsWriter.insertTotalStats("not_exist_schema", totalStats),
                         'ERROR: request not found - not_exist_schema')
        spec1 = newWorkload(schema[0]['RequestName'])
        production = spec1.newTask("Production")
        production.setTaskType("Merge")
        production.setSiteWhitelist(['TEST_SITE'])
        self.assertEqual(self.wmstatsWriter.updateFromWMSpec(spec1), 'OK', 'update fail')
        spec2 = newWorkload("not_exist_schema")
        production = spec2.newTask("Production")
        production.setTaskType("Merge")
        self.assertEqual(self.wmstatsWriter.updateFromWMSpec(spec2),
                         'ERROR: request not found - not_exist_schema')
def setup(self, parameters):
    """
    Wire up WMBS/BossAir DB access and the WMStats couch writer, then
    start couch replication.
    """
    agentThread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, agentThread.dbi, agentThread.logger)
    # Tier0 writes to its local WMStats instance; other agents write centrally.
    if hasattr(self.config, "Tier0Feeder"):
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                   appName="WMStatsAgent")
    else:
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
    self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
    self.setUpCouchDBReplication()
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Move a request to a new state, optionally updating its priority.

    - *requestName* : name of the request to be modified
    - *newState* : name of the new status for the request
    - *priority* : optional integer priority

    The priority argument exists because the assignment page allows
    changing priority together with the state.
    """
    # MySQL/Oracle side first.
    factory = DBConnect.getConnection()
    requestId = getRequestID(factory, requestName)
    changeRequestIDStatus(requestId, newState, priority)
    # CouchDB side: locate the request document from the workflow spec URL
    # stored in the reqmgr_request.workflow table field, e.g.
    # http://localhost:5984/reqmgr_workload_cache/<request>/spec
    requestData = factory(classname="Request.Get").execute(requestId)
    specUrl = requestData['workflow']
    # strip the trailing /<request>/spec part
    couchUrl = specUrl.replace('/' + requestName + "/spec", '')
    couchDbName = couchUrl[couchUrl.rfind('/') + 1:]
    # strip the database name off the URL
    baseUrl = couchUrl.replace('/' + couchDbName, '')
    couchDb = Database(couchDbName, baseUrl)
    couchDb.updateDocument(requestName, "ReqMgr", "updaterequest",
                           fields={"RequestStatus": newState}, useBody=True)
    #TODO: should we make this mendatory?
    if wmstatUrl:
        WMStatsWriter(wmstatUrl).updateRequestStatus(requestName, newState)
class CleanUpTask(CherryPyPeriodicTask):
    """
    This class is used for both T0WMStats and WMStats
    controlled by config.reqdb_couch_app value
    """

    def __init__(self, rest, config):
        # Cooperative super() for consistency with the other periodic tasks.
        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(
            config.wmstats_url, reqdbURL=config.reqmgrdb_url, reqdbCouchApp=config.reqdb_couch_app
        )

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which run concurrently
        """
        self.concurrentTasks = [
            {"func": self.cleanUpOldRequests, "duration": (config.DataKeepDays * 24 * 60 * 60)},
            {"func": self.cleanUpArchivedRequests, "duration": config.archivedCleanUpDuration},
        ]

    def cleanUpOldRequests(self, config):
        """
        clean up wmstats data older then given days
        """
        # Lazy %-args: formatting happens only when the level is enabled.
        self.logger.info("deleting %s hours old docs", config.DataKeepDays * 24)
        result = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted", result)
        return

    def cleanUpArchivedRequests(self, config):
        """
        loop through the workflows in couchdb, if archived delete all the
        data in couchdb
        """
        requestNames = self.wmstatsDB.getArchivedRequests()
        for req in requestNames:
            self.logger.info("deleting %s data", req)
            result = self.wmstatsDB.deleteDocsByWorkflow(req)
            self.logger.info("%s deleted", result)
        return
def setUp(self):
    """
    _setUp_

    Set up the test environment: WMBS schema, a WMStats couch database
    and a writer against it, plus empty plugin bookkeeping state.
    """
    dbName = 'wmstats_plugin_t'
    init = TestInit(__file__)
    init.setDatabaseConnection()
    init.setSchema(["WMCore.WMBS"])
    init.setupCouch(dbName, 'WMStats')
    self.testInit = init
    self.wmstatsCouchDB = dbName
    self.testDir = init.generateWorkDir()
    self.wmstatsWriter = WMStatsWriter(os.environ['COUCHURL'], dbName)
    # Plugin bookkeeping starts empty.
    self.stateMap = {}
    self.orderedStates = []
    self.plugin = None
    return
def changeRequestStatus(requestName, newState, priority = None, wmstatUrl = None):
    """
    _changeRequestStatus_

    Move a request to a new state, optionally updating its priority and
    mirroring the status change into WMStats.

    - *requestName* : name of the request to be modified
    - *newState* : name of the new status for the request
    - *priority* : optional integer priority
    """
    #TODO: should we make this mendatory?
    if wmstatUrl:
        WMStatsWriter(wmstatUrl).updateRequestStatus(requestName, newState)
    connection = DBConnect.getConnection()
    requestId = getRequestID(connection, requestName)
    changeRequestIDStatus(requestId, newState, priority)
    return
def setup(self, parameters):
    """
    Set up the DB connections (couchdb, wmbs) used to gather information.
    """
    adc = self.config.AnalyticsDataCollector
    # Connection to the local work queue service.
    self.localQueue = WorkQueueService(adc.localQueueURL)
    # Local CouchDB access.
    self.localCouchDB = LocalCouchDBData(adc.localCouchURL, self.summaryLevel)
    # WMBS/BossAir access via the current thread.
    monitorThread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, monitorThread.dbi, monitorThread.logger)
    # Local and central WMStats writers.
    self.localSummaryCouchDB = WMStatsWriter(adc.localWMStatsURL)
    self.centralWMStatsCouchDB = WMStatsWriter(adc.centralWMStatsURL)
    if self.pluginName is not None:
        pluginLoader = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
        self.plugin = pluginLoader.loadObject(classname=self.pluginName)
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, also includes optional
    priority change for the request.

    - *requestName* : name of the request to be modified
    - *newState* : name of the new status for the request
    - *priority* : optional integer priority

    The priority argument exists because the assignment page allows
    changing priority together with the state.
    """
    # Relational DB update (MySQL/Oracle).
    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    changeRequestIDStatus(reqId, newState, priority)
    # Couch update: the workflow field of reqmgr_request points at the spec,
    # e.g. http://localhost:5984/reqmgr_workload_cache/<request>/spec,
    # from which the couch database location is derived.
    reqData = factory(classname="Request.Get").execute(reqId)
    wfUrl = reqData["workflow"]
    couchUrl = wfUrl.replace("/" + requestName + "/spec", "")
    couchDbName = couchUrl[couchUrl.rfind("/") + 1 :]
    url = couchUrl.replace("/" + couchDbName, "")
    statusFields = {"RequestStatus": newState}
    Database(couchDbName, url).updateDocument(requestName, "ReqMgr", "updaterequest",
                                              fields=statusFields, useBody=True)
    # TODO: should we make this mendatory?
    if wmstatUrl:
        statsWriter = WMStatsWriter(wmstatUrl)
        statsWriter.updateRequestStatus(requestName, newState)
def setup(self, parameters):
    """
    Set up DB connections (couchdb, wmbs) and start couch replication.
    """
    adc = self.config.AnalyticsDataCollector
    # WMBS/BossAir access via the current thread.
    thread = threading.currentThread()
    self.wmagentDB = WMAgentDBData(self.summaryLevel, thread.dbi, thread.logger)
    # Tier0 writes to its local WMStats; production agents write centrally.
    if hasattr(self.config, "Tier0Feeder"):
        self.centralWMStatsCouchDB = WMStatsWriter(adc.localWMStatsURL, appName="WMStatsAgent")
    else:
        self.centralWMStatsCouchDB = WMStatsWriter(adc.centralWMStatsURL)
    self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
    self.setUpCouchDBReplication()
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Base periodic task that publishes component thread health (heartbeat)
    into the central WMStats instance.
    """

    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list

    def setConcurrentTasks(self, config):
        """
        Register reportToWMStats as the single periodic task.
        """
        self.concurrentTasks = [{'func': self.reportToWMStats,
                                 'duration': config.heartbeatCheckDuration}]

    def reportToWMStats(self, config):
        """
        Publish thread status and heartbeat; subclasses can contribute
        extra data through addAdditionalMonitorReport().
        """
        self.logger.info("Checking Thread status...")
        threadReport = self.logDB.wmstats_down_components_report(self.threadList)
        extraReport = self.addAdditionalMonitorReport(config)
        threadReport.update(extraReport)
        doc = convertToServiceCouchDoc(threadReport, config.log_reporter)
        self.centralWMStats.updateAgentInfo(doc)
        self.logger.info("Uploaded to WMStats...")
        return

    def addAdditionalMonitorReport(self, config):
        """
        Override point: return additional monitoring info to merge into the
        heartbeat report (must follow the format displayed in wmstats).
        """
        return {}
def __init__(self, config):
    """
    _init_

    Set up the DAO factories and database/CouchDB connections used by the
    Tier0 feeder thread:
      - T0/WMBS DAOs on the agent database
      - WMStats writer for the local analytics CouchDB
      - HLT configuration and StorageManager database interfaces
      - optional PopConLog database DAO (express-ready runs), when configured
    """
    BaseWorkerThread.__init__(self)

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="T0.WMBS",
                                 logger=logging,
                                 dbinterface=myThread.dbi)

    self.tier0ConfigFile = config.Tier0Feeder.tier0ConfigFile
    self.specDirectory = config.Tier0Feeder.specDirectory
    self.dropboxuser = config.Tier0Feeder.dropboxuser
    self.dropboxpass = config.Tier0Feeder.dropboxpass

    # transfer system base dir is only usable when it actually exists on disk
    self.transferSystemBaseDir = getattr(config.Tier0Feeder, "transferSystemBaseDir", None)
    if self.transferSystemBaseDir is not None and not os.path.exists(self.transferSystemBaseDir):
        self.transferSystemBaseDir = None

    self.dqmUploadProxy = config.WMBSService.proxy
    self.localSummaryCouchDB = WMStatsWriter(config.AnalyticsDataCollector.localWMStatsURL)

    # HLT configuration database connection and DAO
    hltConfConnectUrl = config.HLTConfDatabase.connectUrl
    dbFactoryHltConf = DBFactory(logging, dburl=hltConfConnectUrl, options={})
    dbInterfaceHltConf = dbFactoryHltConf.connect()
    daoFactoryHltConf = DAOFactory(package="T0.WMBS",
                                   logger=logging,
                                   dbinterface=dbInterfaceHltConf)
    self.getHLTConfigDAO = daoFactoryHltConf(classname="RunConfig.GetHLTConfig")

    # StorageManager database connection (interface kept, DAOs built elsewhere)
    storageManagerConnectUrl = config.StorageManagerDatabase.connectUrl
    dbFactoryStorageManager = DBFactory(logging, dburl=storageManagerConnectUrl, options={})
    self.dbInterfaceStorageManager = dbFactoryStorageManager.connect()

    # optional PopConLog database: only build the express-ready-runs DAO
    # when a connect URL is configured
    self.getExpressReadyRunsDAO = None
    if hasattr(config, "PopConLogDatabase"):
        popConLogConnectUrl = getattr(config.PopConLogDatabase, "connectUrl", None)
        if popConLogConnectUrl is not None:
            dbFactoryPopConLog = DBFactory(logging, dburl=popConLogConnectUrl, options={})
            dbInterfacePopConLog = dbFactoryPopConLog.connect()
            daoFactoryPopConLog = DAOFactory(package="T0.WMBS",
                                             logger=logging,
                                             dbinterface=dbInterfacePopConLog)
            self.getExpressReadyRunsDAO = daoFactoryPopConLog(classname="Tier0Feeder.GetExpressReadyRuns")
def setUp(self):
    """
    _setUp_

    Initialize the CouchApp test harness, create the 'wmstats_t' test
    database with its WMStats couchapp, then construct the writer under test.
    """
    self.schema = []
    self.couchApps = ["WMStats"]
    self.testInit = TestInitCouchApp('WorkQueueServiceTest')
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(customModules=self.schema, useDefault=False)
    self.testInit.setupCouch('wmstats_t', *self.couchApps)
    self.wmstatsWriter = WMStatsWriter(self.testInit.couchUrl, 'wmstats_t')
def setup(self, parameters):
    """
    Prepare the database connections (WMBS and central WMStats CouchDB)
    needed to gather monitoring information.
    """
    agentThread = threading.currentThread()
    # WMBS/BossAir database interface
    self.wmagentDB = WMAgentDBData(self.summaryLevel, agentThread.dbi,
                                   agentThread.logger)
    # central WMStats CouchDB connection
    self.centralWMStatsCouchDB = WMStatsWriter(
        self.config.AnalyticsDataCollector.centralWMStatsURL)
    # monitor of the local couch server
    self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, optionally updating the
    request priority at the same time.

    - *requestName*: name of the request to be modified
    - *newState*: name of the new status for the request
    - *priority*: optional integer priority
    - *wmstatUrl*: optional WMStats URL; when given, the new state is
      mirrored there before the relational database is updated
    """
    # TODO: should we make this mandatory?
    if wmstatUrl:
        WMStatsWriter(wmstatUrl).updateRequestStatus(requestName, newState)

    connection = DBConnect.getConnection()
    requestId = getRequestID(connection, requestName)
    changeRequestIDStatus(requestId, newState, priority)
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Switch a request into *newState*, with an optional *priority* change.
    When *wmstatUrl* is provided the status change is first pushed to the
    WMStats instance, then applied to the relational database.
    """
    # TODO: should we make this mandatory?
    if wmstatUrl:
        statsWriter = WMStatsWriter(wmstatUrl)
        statsWriter.updateRequestStatus(requestName, newState)

    factory = DBConnect.getConnection()
    changeRequestIDStatus(getRequestID(factory, requestName), newState, priority)
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring.
    Also ships MonIT documents to AMQ when enabled.
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        # grid credentials used for the lifetime checks below
        proxyArgs = {'logger': logging.getLogger(), 'cleanEnvironment': True}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                               'certificate': {'error': 10, 'warning': 20}}

        # Monitoring setup (AMQ/MonIT upload is optional, driven by enableAMQ)
        self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
        self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
        self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
        self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
        self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        if hasattr(self.config, "Tier0Feeder"):
            self.isT0agent = True
            self.producer = "tier0wmagent"
        else:
            self.isT0agent = False
            self.producer = "wmagent"
            localWQUrl = config.AnalyticsDataCollector.localQueueURL
            self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        """
        Define and (re)create the CouchDB replications this agent needs:
        job summary -> central WMStats, plus either the T0 request DB
        replication (Tier0 agent) or the WorkQueue inbox replications.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.General.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        if self.isT0agent:
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication (parent inbox <-> local inbox)
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'], filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # First cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        Main polling cycle: collect agent/WMBS (and WorkQueue, for non-T0
        agents) information, upload it to central WMStats and build the
        MonIT documents. Errors are logged and retried on the next cycle.
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkCredLifetime(agentInfo, "proxy")
            self.checkCredLifetime(agentInfo, "certificate")

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not self.isT0agent:
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            self.uploadAgentInfoToCentralWMStats(agentInfo)

            self.buildMonITDocs(agentInfo)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with work-by-status summaries and per-site job counts
        """
        results = {}
        wqStates = ['Available', 'Acquired']

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(wqStates)
        uniSites, posSites = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of every replication set up in
        setUpCouchDBReplication; returns a dict with 'name', 'status'
        ('ok' or 'error') and an 'error_message'.
        """
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes
        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Add some required fields to the document before it can get uploaded to WMStats.
        :param agentInfo: dict with agent stats to be posted to couchdb
        """
        agentInfo['_id'] = agentInfo["agent_url"]
        agentInfo['timestamp'] = int(time.time())
        agentInfo['type'] = "agent_info"
        # directly upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo,
                                                       propertiesToKeep=["data_last_update", "data_error"])
        except Exception as e:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(e))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Check the credential lifetime. Usually X509_USER_PROXY or X509_USER_CERT
        and raise either a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: same dictionary object plus additional keys/values if needed.
        """
        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        elif credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        daysLeft = secsLeft / (60 * 60 * 24)

        if daysLeft <= self.credThresholds[credType]['error']:
            credWarning = True
            agInfo['status'] = "error"
        elif daysLeft <= self.credThresholds[credType]['warning']:
            credWarning = True
            # do not downgrade an already-bad status to "warning"
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            credWarning = False

        if credWarning:
            warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile)
            warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.)
            agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
            logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. It creates 5 different type of documents:
         * priority information
         * site information
         * work information
         * agent information
         * agent health information
        Note that the internal methods are popping some metrics out of dataStats
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        allDocs = self._buildMonITPrioDocs(dataStats)
        allDocs.extend(self._buildMonITSitesDocs(dataStats))
        allDocs.extend(self._buildMonITWorkDocs(dataStats))
        allDocs.extend(self._buildMonITWMBSDocs(dataStats))
        allDocs.extend(self._buildMonITAgentDocs(dataStats))
        allDocs.extend(self._buildMonITHealthDocs(dataStats))
        allDocs.extend(self._buildMonITSummaryDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within
        that priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop('sitePendCountByPrio', [])

        for site, item in viewitems(sitePendCountByPrio):
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            if list(item) == [0]:
                continue
            for prio, jobs in viewitems(item):
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs

    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in
        order to build a `site_info` document type for MonIT.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            # T0 has no local WorkQueue, so there is no per-site WQ data
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop('possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop('uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite:
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0
                if site in possibleJobsPerSite[status]:
                    siteDoc[jobKey] = possibleJobsPerSite[status][site]['sum_jobs']
                    siteDoc[elemKey] = possibleJobsPerSite[status][site]['num_elem']
                if site in uniqueJobsPerSite[status]:
                    siteDoc[uniJobKey] = uniqueJobsPerSite[status][site]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information order by WQE status and build
        statistics for the workload in terms of workqueue elements and top
        level jobs. Using the WMBS data, also builds documents to show the
        amount of work in 'created' and 'executing' WMBS status.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        workDocs = []
        if self.isT0agent:
            # no local WorkQueue on T0 agents -> nothing to report
            return workDocs

        docType = "wma_work_info"
        workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        for status, info in viewitems(workByStatus):
            workDoc = {}
            workDoc['type'] = docType
            workDoc['status'] = status
            workDoc['num_elem'] = info.get('num_elem', 0)
            workDoc['sum_jobs'] = info.get('sum_jobs', 0)
            workDocs.append(workDoc)

        return workDocs

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work
        in 'created' and 'executing' WMBS status.
        It also builds a document for every single wmbs_status in the database.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        docType = "wma_wmbs_info"
        wmbsDocs = []
        wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop('wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop('wmbsExecutingTypeCount', {})
        for jobType in wmbsCreatedTypeCount:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['job_type'] = jobType
            wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType]
            wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType]
            wmbsDocs.append(wmbsDoc)

        docType = "wma_wmbs_state_info"
        wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['wmbs_status'] = wmbsStatus
            wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus]
            wmbsDocs.append(wmbsDoc)

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of amount of jobs in different statuses.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop('activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop('completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDoc = {}
            agentDoc['type'] = docType
            agentDoc['schedd_status'] = schedStatus
            agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus]
            agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus]
            agentDocs.append(agentDoc)

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Creates documents with specific agent information, status of
        each component and worker thread (similar to what is shown in
        wmstats) and also some very basic performance numbers.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_health_info"
        healthDocs = []
        workersStatus = dataStats.pop('workers', {})
        for worker in workersStatus:
            healthDoc = {}
            healthDoc['type'] = docType
            healthDoc['worker_name'] = worker['name']
            healthDoc['worker_state'] = worker['state']
            healthDoc['worker_poll'] = worker['poll_interval']
            healthDoc['worker_last_hb'] = worker['last_updated']
            healthDoc['worker_cycle_time'] = worker['cycle_time']
            healthDocs.append(healthDoc)

        return healthDocs

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a document with the very basic agent info used
        in the wmstats monitoring tab.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_summary_info"
        summaryDocs = []
        summaryDoc = {}
        summaryDoc['type'] = docType
        summaryDoc['agent_team'] = dataStats['agent_team']
        summaryDoc['agent_version'] = dataStats['agent_version']
        summaryDoc['agent_status'] = dataStats['status']
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO']['total_query_time']
        summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO']['total_query_time']
        summaryDoc['drain_mode'] = dataStats['drain_mode']
        summaryDoc['down_components'] = dataStats['down_components']
        summaryDocs.append(summaryDoc)
        return summaryDocs

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        :param docs: list of documents/dicts to be posted
        :param agentUrl: agent URL stamped onto every document
        :param timeS: timestamp used for every notification
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return
        # add mandatory information for every single document
        for doc in docs:
            doc['agent_url'] = agentUrl

        docType = "cms_%s_info" % self.producer
        notifications = []

        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=self.producer,
                                topic=self.topicAMQ,
                                validation_schema=None,
                                host_and_ports=self.hostPortAMQ,
                                logger=logging)

            for doc in docs:
                singleNotif, _, _ = stompSvc.make_notification(payload=doc, docType=docType, ts=timeS,
                                                               dataSubfield="payload")
                notifications.append(singleNotif)

            failures = stompSvc.send(notifications)
            msg = "%i out of %i documents successfully sent to AMQ" % (len(notifications) - len(failures),
                                                                       len(notifications))
            logging.info(msg)
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s", str(ex))

        return
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()

    def setUpCouchDBReplication(self):
        """
        Define and (re)create the CouchDB replications this agent needs:
        job summary -> central WMStats, plus either the T0 request DB
        replication (Tier0) or the WorkQueue inbox replications.
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication (parent inbox <-> local inbox)
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL,
                            'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                        'target': localQInboxURL,
                                        'filter': wqfilter,
                                        'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter,
                                        'query_params': query_params})

        # delete old replicator docs before setting up new ones
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # first cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        # Tier0 agents write to the local WMStats database, others to the central one
        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                       appName="WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            logging.info("Getting Agent info ...")
            agentInfo = self.collectAgentInfo()

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            logging.info("Agent components down:\n %s" % agentInfo['down_components'])
            logging.info("Agent in drain mode:\n %s \nsleep for next WMStats alarm updating cycle"
                         % agentInfo['drain_mode'])
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):
        """
        Check the status of every replication set up in
        setUpCouchDBReplication; returns a dict with 'status'
        ('ok' or 'error') and an aggregated 'error_message'.
        """
        couchInfo = {'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        msg = ""
        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                # BUGFIX: accumulate the per-replication error detail; previously
                # `msg` was never filled, so error_message was always empty
                msg += cInfo['error_message']
                couchInfo['error_message'] = msg

        return couchInfo

    def collectAgentInfo(self):
        """
        Collect the agent health summary: component status from the database,
        drain mode, CouchDB replication status, disk usage, number of couch
        processes and last data-upload information. The aggregated 'status'
        is downgraded to "warning"/"error" accordingly.
        :return: a dict with all the info collected
        """
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['status'] = "warning"
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append("CouchServer")
            agentInfo['status'] = couchInfo['status']
            couchInfo['name'] = "CouchServer"
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            if float(disk['percent'].strip('%')) >= diskUseThreshold and \
                            disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        couchProcessThreshold = float(self.config.AnalyticsDataCollector.couchProcessThreshold)
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo(self)
        if lastDataUpload['data_last_update'] != 0:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error'] != "":
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok':
            if agentInfo['disk_warning'] != []:
                agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if ('data_error' in agentInfo and agentInfo['data_error'] != 'ok') or \
                    ('couch_process_warning' in agentInfo and agentInfo['couch_process_warning'] != 0):
                agentInfo['status'] = "error"

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the collected info to an agent couch document and upload it.
        :param agentInfo: dict with the collected agent statistics
        :param uploadTime: common timestamp used for all the docs
        """
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)
for ds in helper.listOutputDatasets(): if ds not in request['OutputDatasets']: request['OutputDatasets'].append(ds) # don't want to JSONify the whole workflow del metadata['WorkloadSpec'] workloadUrl = helper.saveCouch(couchUrl, couchDB, metadata=metadata) request['RequestWorkflow'] = removePasswordFromUrl(workloadUrl) try: CheckIn.checkIn(request, reqSchema['RequestType']) except CheckIn.RequestCheckInError, ex: msg = ex._message raise HTTPError(400, "Error in Request check-in: %s" % msg) try: wmstatSvc = WMStatsWriter(wmstatUrl) wmstatSvc.insertRequest(request) except Exception as ex: webApi.error("Could not update WMStats, reason: %s" % ex) raise HTTPError(400, "Creating request failed, could not update WMStats.") return request def makeRequest(webApi, reqInputArgs, couchUrl, couchDB, wmstatUrl): """ Handles the submission of requests. """
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        # summary level is lower-cased so comparisons elsewhere are case-insensitive
        self.summaryLevel = (
            config.AnalyticsDataCollector.summaryLevel).lower()
        # optional plugin; loaded lazily in setup() only when configured
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName",
                                  None)
        self.plugin = None

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue
        # Tier0 agents have no local WorkQueue, so skip the service there
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(
                self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(
            self.config.AnalyticsDataCollector.localCouchURL,
            self.config.JobStateMachine.summaryStatsDBName, self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.localWMStatsURL,
            appName="WMStatsAgent")

        if hasattr(self.config, "Tier0Feeder"):
            #use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(
            centralRequestCouchDBURL,
            couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        #TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(
            self.config.JobStateMachine.couchurl)

        if self.pluginName != None:
            pluginFactory = WMFactory(
                "plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            #jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            #fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info(
                """Combining data from Job Couch(%s), FWJR(%s), Batch Job(%s), Finished Tasks(%s), Local Queue(%s) ...""" %
                (len(jobInfoFromCouch), len(fwjrInfoFromCouch),
                 len(batchJobInfo), len(finishedTasks), len(localQInfo)))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch,
                                                    batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData,
                                                    localQInfo)

            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            logging.info(
                "%s requests Data combined,\n uploading request data..."
                % len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests,
                                                   fwjrInfoFromCouch,
                                                   finishedTasks,
                                                   self.agentInfo, uploadTime,
                                                   self.summaryLevel)

            # plugin hook: may post-process / forward docs before upload
            if self.plugin != None:
                self.plugin(requestDocs, self.localSummaryCouchDB,
                            self.centralRequestCouchDB)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info(
                "Request data upload success\n %s request, \nsleep for next cycle"
                % len(requestDocs))
            # record the successful upload time so collectAgentInfo can report it
            DataUploadTime.setInfo(uploadTime, "ok")
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            # False marks "no successful upload"; the error string is surfaced
            # later as agentInfo['data_error']
            DataUploadTime.setInfo(False, str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        # path where the collected monitoring dict is dumped as JSON
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        # Build the replication document list and start continuous couch
        # replications (wmstats always; T0 request DB or WorkQueue depending
        # on the agent flavor).
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:
            # set up workqueue replication (both directions:
            # parent inbox -> local inbox and local inbox -> parent)
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            self.replicatorDocs.append({
                'source': sanitizeURL(localQInboxURL)['url'],
                'target': parentQURL,
                'filter': wqfilter,
                'query_params': query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'],
                                                         rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get(
                                                             'query_params',
                                                             False),
                                                         continuous=True)
        # First cicle need to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs",
                             timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s",
                              str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with work summaries per status/priority and per site
        """
        results = {}
        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(
            ['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements,
                                                        dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        # Report overall couch replication health; first cycle after
        # (re)setting up replications is skipped on purpose.
        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config,
                                                               updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode']
                                            or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get(
                    'couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s",
                     agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC,
                                           uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s",
                      results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s",
                      results['wmbsCountByState'])
        logging.debug(
            "Total number of 'created' jobs in WMBS sorted by type: %s",
            results['wmbsCreatedTypeCount'])
        logging.debug(
            "Total number of 'executing' jobs in WMBS sorted by type: %s",
            results['wmbsExecutingTypeCount'])
        logging.debug(
            "Total number of active jobs in BossAir sorted by status: %s",
            results['activeRunJobByStatus'])
        logging.debug(
            "Total number of complete jobs in BossAir sorted by status: %s",
            results['completeRunJobByStatus'])
        logging.debug(
            "Available slots thresholds to pull work from GQ to LQ: %s",
            results['thresholdsGQ2LQ'])
        logging.debug(
            "List of jobs pending for each site, sorted by priority: %s",
            results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (usually X509_USER_CERT) and raise either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile,
                      secsLeft)

        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
import os import sys from optparse import OptionParser from WMCore.Services.WMStats.WMStatsWriter import WMStatsWriter from WMCore.Configuration import loadConfigurationFile if __name__ == "__main__": if "WMAGENT_CONFIG" not in os.environ: print("The WMAGENT_CONFIG environment variable needs to be set before this can run") sys.exit(1) wmagentConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"]) if hasattr(wmagentConfig, "AnalyticsDataCollector") and hasattr(wmagentConfig.AnalyticsDataCollector, "centralWMStatsURL"): wmstats = WMStatsWriter(wmagentConfig.AnalyticsDataCollector.centralWMStatsURL) else: print("AnalyticsDataCollector.centralWMStatsURL is not specified") sys.exit(1) parser = OptionParser() parser.set_usage("wmstats-request-status-chagne [agent_url:port]") parser.add_option("-r", "--request", dest = "request", help = "resquest name") parser.add_option("-s", "--status", dest = "newstatus", help = "set to new status") (options, args) = parser.parse_args()
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Base periodic task that reports component thread status ("heartbeat")
    to central WMStats and, optionally, monitoring documents to ActiveMQ.
    """

    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
        # optional AMQ credentials/settings; absent attributes default to None/False
        self.userAMQ = getattr(config, "user_amq", None)
        self.passAMQ = getattr(config, "pass_amq", None)
        self.postToAMQ = getattr(config, "post_to_amq", False)
        self.topicAMQ = getattr(config, "topic_amq", None)
        self.hostPortAMQ = getattr(config, "host_port_amq", None)

    def setConcurrentTasks(self, config):
        """
        sets the list of function reference for concurrent tasks
        """
        self.concurrentTasks = [{'func': self.reportToWMStats,
                                 'duration': config.heartbeatCheckDuration}]

    def reportToWMStats(self, config):
        """
        report thread status and heartbeat.
        Also can report additional monitoring information
        by rewriting addAdditionalMonitorReport method
        """
        self.logger.info("Checking Thread status...")
        # NOTE(review): self.logDB is not set in this __init__; presumably
        # provided by a subclass or the CherryPyPeriodicTask framework — confirm.
        downThreadInfo = self.logDB.wmstats_down_components_report(self.threadList)
        monitorInfo = self.addAdditionalMonitorReport(config)
        downThreadInfo.update(monitorInfo)
        wqSummaryDoc = convertToServiceCouchDoc(downThreadInfo, config.log_reporter)
        self.centralWMStats.updateAgentInfo(wqSummaryDoc)

        self.logger.info("Uploaded to WMStats...")

        return

    def addAdditionalMonitorReport(self, config):
        """
        add Additonal report with heartbeat report
        overwite the method with each applications monitoring info.
        (Need to follow the format displayed in wmstats)
        """
        return {}

    def uploadToAMQ(self, docs, producer=None):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in elastic search.
        :param docs: list of documents/dicts to be posted
        :param producer: service name that's providing this info
        """
        if not docs:
            self.logger.info("There are no documents to send to AMQ")
            return
        # NOTE(review): when producer is None this falls back to self.producer,
        # which is not set in this __init__ — presumably set by a subclass.
        producer = producer or self.producer
        self.logger.debug("Sending the following data to AMQ %s", pformat(docs))

        ts = int(time.time())
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=self.logger)
            # one notification per document
            notifications = [stompSvc.make_notification(payload=doc,
                                                        docType=self.docTypeAMQ,
                                                        ts=ts,
                                                        dataSubfield="payload")
                             for doc in docs]
            failures = stompSvc.send(notifications)
            self.logger.info("%i docs successfully sent to Stomp AMQ",
                             len(notifications) - len(failures))
        except Exception as ex:
            self.logger.exception("Failed to send data to StompAMQ. Error %s",
                                  str(ex))

        return
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Base periodic task that reports component thread status ("heartbeat")
    to central WMStats and, optionally, monitoring documents to ActiveMQ.
    """

    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
        # optional AMQ credentials/settings; absent attributes default to None/False
        self.userAMQ = getattr(config, "user_amq", None)
        self.passAMQ = getattr(config, "pass_amq", None)
        self.postToAMQ = getattr(config, "post_to_amq", False)
        self.topicAMQ = getattr(config, "topic_amq", None)
        self.hostPortAMQ = getattr(config, "host_port_amq", None)

    def setConcurrentTasks(self, config):
        """
        sets the list of function reference for concurrent tasks
        """
        self.concurrentTasks = [{
            'func': self.reportToWMStats,
            'duration': config.heartbeatCheckDuration
        }]

    def reportToWMStats(self, config):
        """
        report thread status and heartbeat.
        Also can report additional monitoring information
        by rewriting addAdditionalMonitorReport method
        """
        self.logger.info("Checking Thread status...")
        # NOTE(review): self.logDB is not set in this __init__; presumably
        # provided by a subclass or the CherryPyPeriodicTask framework — confirm.
        downThreadInfo = self.logDB.wmstats_down_components_report(
            self.threadList)
        monitorInfo = self.addAdditionalMonitorReport(config)
        downThreadInfo.update(monitorInfo)
        wqSummaryDoc = convertToServiceCouchDoc(downThreadInfo,
                                                config.log_reporter)
        self.centralWMStats.updateAgentInfo(wqSummaryDoc)

        self.logger.info("Uploaded to WMStats...")

        return

    def addAdditionalMonitorReport(self, config):
        """
        add Additonal report with heartbeat report
        overwite the method with each applications monitoring info.
        (Need to follow the format displayed in wmstats)
        """
        return {}

    def uploadToAMQ(self, docs, producer=None):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in elastic search.
        :param docs: list of documents/dicts to be posted
        :param producer: service name that's providing this info
        """
        if not docs:
            self.logger.info("There are no documents to send to AMQ")
            return
        # NOTE(review): when producer is None this falls back to self.producer,
        # which is not set in this __init__ — presumably set by a subclass.
        producer = producer or self.producer
        self.logger.debug("Sending the following data to AMQ %s",
                          pformat(docs))

        ts = int(time.time())
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=self.logger)
            # NOTE(review): unlike the per-doc variant of this class, the whole
            # docs list is sent as one notification payload keyed by the
            # producer name — confirm StompAMQ.make_notification accepts this
            # and returns a list usable with len().
            notifications = stompSvc.make_notification(payload=docs,
                                                       docType=self.docTypeAMQ,
                                                       docId=producer,
                                                       ts=ts)
            failures = stompSvc.send(notifications)
            self.logger.info("%i docs successfully sent to Stomp AMQ",
                             len(notifications) - len(failures))
        except Exception as ex:
            self.logger.exception("Failed to send data to StompAMQ. Error %s",
                                  str(ex))

        return
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile
        # counter for deep agent monitoring. Every 15min (3 cycles of the component)
        self.monitorCounter = 0
        self.monitorInterval = getattr(config.AgentStatusWatcher,
                                       'monitorPollInterval', 3)

    def setUpCouchDBReplication(self):
        # Build the replication document list and start continuous couch
        # replications (wmstats always; T0 request DB or WorkQueue depending
        # on the agent flavor).
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        #TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:
            # set up workqueue replication (both directions:
            # parent inbox -> local inbox and local inbox -> parent)
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            self.replicatorDocs.append({
                'source': sanitizeURL(localQInboxURL)['url'],
                'target': parentQURL,
                'filter': wqfilter,
                'query_params': query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'],
                rp['target'],
                filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)

        # First cicle need to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()

            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # deep monitoring (WMBS queries) only every monitorInterval cycles
            if self.monitorCounter % self.monitorInterval == 0:
                monitoring = self.collectWMBSInfo()
                monitoring['components'] = agentInfo['down_components']
                monitoring['timestamp'] = int(time.time())

                with open(self.jsonFile, 'w') as outFile:
                    json.dump(monitoring, outFile, indent=2)

            self.monitorCounter += 1
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):
        # Report overall couch replication health; first cycle after
        # (re)setting up replications is skipped on purpose.
        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning: flag any mount over the configured threshold
        # unless it is explicitly ignored in the config.
        diskUseList = diskUse()
        diskUseThreshold = float(
            self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            if float(disk['percent'].strip('%')) >= diskUseThreshold and \
                            disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode']
                                            or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get(
                    'couch_process_warning', 0):
                agentInfo['status'] = "error"

        if agentInfo['down_components']:
            logging.info("List of agent components down: %s" %
                         agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        #direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC,
                                           uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        results = {}
        logging.info("Getting wmbs job info ...")

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.info("Running and pending site thresholds: %s",
                     results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.info("Total number of jobs in WMBS sorted by status: %s",
                     results['wmbsCountByState'])
        logging.info(
            "Total number of 'created' jobs in WMBS sorted by type: %s",
            results['wmbsCreatedTypeCount'])
        logging.info(
            "Total number of 'executing' jobs in WMBS sorted by type: %s",
            results['wmbsExecutingTypeCount'])
        logging.info(
            "Total number of active jobs in BossAir sorted by status: %s",
            results['activeRunJobByStatus'])
        logging.info(
            "Total number of complete jobs in BossAir sorted by status: %s",
            results['completeRunJobByStatus'])
        logging.info(
            "Available slots thresholds to pull work from GQ to LQ: %s",
            results['thresholdsGQ2LQ'])
        logging.info(
            "List of jobs pending for each site, sorted by priority: %s",
            results['sitePendCountByPrio'])

        return results
def __init__(self, config):
    """
    Initialise class members.

    Wires up database access (DAOFactory), the optional local WorkQueue,
    the WMStats couch writers, and the couch databases used to read
    job/FWJR documents and write workload summaries.

    Fixes: Python-2-only ``except Exception, ex`` syntax (a SyntaxError on
    Python 3) replaced with ``except Exception as ex``, consistent with the
    rest of this file; ``getattr(...) != False`` replaced with a plain
    truthiness test (same behavior for the boolean config flag).

    :param config: WMAgent configuration object; reads the TaskArchiver,
        JobCreator and JobStateMachine sections.
    :raises TaskArchiverPollerException: when couch cannot be reached and
        TaskArchiver.requireCouch is set.
    """
    BaseWorkerThread.__init__(self)
    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.config = config
    self.jobCacheDir = self.config.JobCreator.jobCacheDir

    if getattr(self.config.TaskArchiver, "useWorkQueue", False):
        # Get workqueue setup from config unless overridden
        if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
            self.workQueue = localQueue(**self.config.TaskArchiver.WorkQueueParams)
        else:
            from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
            self.workQueue = queueFromConfig(self.config)
    else:
        self.workQueue = None

    self.maxProcessSize = getattr(self.config.TaskArchiver, 'maxProcessSize', 250)
    self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
    self.nOffenders = getattr(self.config.TaskArchiver, 'nOffenders', 3)
    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                               'useReqMgrForCompletionCheck', True)
    self.uploadPublishInfo = getattr(self.config.TaskArchiver, 'uploadPublishInfo', False)
    self.uploadPublishDir = getattr(self.config.TaskArchiver, 'uploadPublishDir', None)
    self.userFileCacheURL = getattr(self.config.TaskArchiver, 'userFileCacheURL', None)

    # Set up optional histograms
    self.histogramKeys = getattr(self.config.TaskArchiver, "histogramKeys", [])
    self.histogramBins = getattr(self.config.TaskArchiver, "histogramBins", 10)
    self.histogramLimit = getattr(self.config.TaskArchiver, "histogramLimit", 5.0)

    if not self.useReqMgrForCompletionCheck:
        #sets the local monitor summary couch db
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBWriter = self.wmstatsCouchDB
    else:
        self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)

    # Start a couch server for getting job info
    # from the FWJRs for committal to archive
    try:
        workDBName = getattr(self.config.TaskArchiver,
                             'workloadSummaryCouchDBName', 'workloadsummary')
        workDBurl = getattr(self.config.TaskArchiver, 'workloadSummaryCouchURL')
        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.workCouchdb = CouchServer(workDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.workdatabase = self.workCouchdb.connectDatabase(workDBName)
        logging.debug("Using url %s/%s for job" % (jobDBurl, jobDBName))
        logging.debug("Writing to %s/%s for workloadSummary" %
                      (sanitizeURL(workDBurl)['url'], workDBName))
        self.requireCouch = getattr(self.config.TaskArchiver, 'requireCouch', False)
    except Exception as ex:
        msg = "Error in connecting to couch.\n"
        msg += str(ex)
        logging.error(msg)
        # best-effort: keep running without couch unless it is required
        self.jobsdatabase = None
        self.fwjrdatabase = None
        if getattr(self.config.TaskArchiver, 'requireCouch', False):
            raise TaskArchiverPollerException(msg)
def __init__(self, rest, config):
    """Initialise the heartbeat monitor: base periodic-task setup plus the
    central WMStats writer and the list of component threads to watch."""
    super(HeartbeatMonitorBase, self).__init__(config)
    # writer used to push heartbeat/status docs to the central WMStats couch
    self.centralWMStats = WMStatsWriter(config.wmstats_url)
    # component threads whose status is reported each cycle
    self.threadList = config.thread_list
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring.
    """

    def __init__(self, config):
        """
        Initialize properties specified from config.

        :param config: WMAgent configuration; reads the
                       AnalyticsDataCollector section.
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None

    def setup(self, parameters):
        """
        Set db connections (couchdb, wmbs) to prepare to gather information.
        """
        # set the connection to local queue (Tier-0 agents have no local queue)
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                 appName="WMStatsAgent")

        if hasattr(self.config, "Tier0Feeder"):
            # use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                     couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        # TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)

        # fixed: `!= None` -> `is not None` (PEP 8, matches the rest of the codebase)
        if self.pluginName is not None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch, combine it and
        upload the resulting request documents to the local summary couch.
        Failures are logged and recorded via DataUploadTime for retry on the
        next cycle.
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info("""Combining data from Job Couch(%s), FWJR(%s), Batch Job(%s), Finished Tasks(%s), Local Queue(%s)  ...""" % (
                len(jobInfoFromCouch), len(fwjrInfoFromCouch), len(batchJobInfo),
                len(finishedTasks), len(localQInfo)))
            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData, localQInfo)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            logging.info("%s requests Data combined,\n uploading request data..."
                         % len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch,
                                                   finishedTasks, self.agentInfo,
                                                   uploadTime, self.summaryLevel)

            # fixed: `!= None` -> `is not None`
            if self.plugin is not None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info("Request data upload success\n %s request, \nsleep for next cycle"
                         % len(requestDocs))
            DataUploadTime.setInfo(uploadTime, "ok")
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            DataUploadTime.setInfo(False, str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())
def buildWorkloadAndCheckIn(webApi, reqSchema, couchUrl, couchDB, wmstatUrl, clone=False):
    """
    Build the workload for a request schema, save it to couch, check the
    request in (Oracle) and register it with WMStats.

    If clone is True, the function is called on a cloned request in which
    case no modification of the reqSchema shall happen and should be checked
    in as is.

    :param webApi: web API object used for error reporting
    :param reqSchema: dict describing the request (must contain "RequestType")
    :param couchUrl: couch server url to save the workload spec into
    :param couchDB: couch database name for the workload spec
    :param wmstatUrl: WMStats couch url for the request registration
    :param clone: skip output-dataset rewriting when True
    :returns: the (possibly updated) request dict
    :raises HTTPError: on workload validation, check-in or WMStats failure
    """
    try:
        request = buildWorkloadForRequest(typename=reqSchema["RequestType"],
                                          schema=reqSchema)
    except WMSpecFactoryException as ex:
        logging.error(traceback.format_exc())
        raise HTTPError(400, "Error in Workload Validation: %s" % ex.message())

    helper = WMWorkloadHelper(request['WorkloadSpec'])

    # update request as well for wmstats update
    # there is a better way to do this (passing helper to request but make sure all the information is there)
    request["Campaign"] = helper.getCampaign()

    # Add the output datasets if necessary
    # for some bizarre reason OutpuDatasets is list of lists, when cloning
    # [['/MinimumBias/WMAgentCommissioning10-v2/RECO'], ['/MinimumBias/WMAgentCommissioning10-v2/ALCARECO']]
    # #3743
    if not clone:
        for ds in helper.listOutputDatasets():
            if ds not in request['OutputDatasets']:
                request['OutputDatasets'].append(ds)
    # TODO: need to update output dataset by Task for task chain requests

    # can't save Request object directly, because it makes it hard to retrieve the _rev
    metadata = {}
    metadata.update(request)

    # don't want to JSONify the whole workflow
    del metadata['WorkloadSpec']
    workloadUrl = helper.saveCouch(couchUrl, couchDB, metadata=metadata)
    request['RequestWorkflow'] = removePasswordFromUrl(workloadUrl)
    try:
        CheckIn.checkIn(request, reqSchema['RequestType'])
    except CheckIn.RequestCheckInError as ex:
        raise HTTPError(400, "Error in Request check-in: %s" % str(ex))

    # Inconsistent request parameters between Oracle and Couch (#4380, #4388)
    # metadata above is what is saved into couch to represent a request document.
    # Number of request arguments on a corresponding couch document
    # is not set, has default null/None values, update those accordingly now.
    # It's a mess to have two mutually inconsistent database backends.
    # Not easy to handle this earlier since couch is stored first and
    # some parameters are worked out later when storing into Oracle.
    reqDetails = requestDetails(request["RequestName"])
    # couchdb request parameters which are null at the injection time and remain so
    # fixed: "RequestStatus" was listed twice in this list
    paramsToUpdate = ["RequestStatus",
                      "RequestSizeFiles",
                      "AcquisitionEra",
                      "RequestWorkflow",
                      "RequestType",
                      "RequestPriority",
                      "Requestor",
                      "Group",
                      "SizePerEvent",
                      "PrepID",
                      "RequestNumEvents",
                      "ProcessingString",
                      "ProcessingVersion",
                      ]

    couchDb = Database(reqDetails["CouchWorkloadDBName"], reqDetails["CouchURL"])
    fields = {}
    for key in paramsToUpdate:
        fields[key] = reqDetails[key]
    couchDb.updateDocument(request["RequestName"], "ReqMgr", "updaterequest",
                           fields=fields, useBody=True)

    try:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.insertRequest(request)
    except Exception as ex:
        webApi.error("Could not update WMStats, reason: %s" % ex)
        raise HTTPError(400, "Creating request failed, could not update WMStats.")

    return request
def __init__(self, rest, config): super(CleanUpTask, self).__init__(config) self.wmstatsDB = WMStatsWriter(config.wmstats_url, reqdbURL=config.reqmgrdb_url, reqdbCouchApp=config.reqdb_couch_app)
import sys from optparse import OptionParser from WMCore.Services.WMStats.WMStatsWriter import WMStatsWriter from WMCore.Configuration import loadConfigurationFile if __name__ == "__main__": if "WMAGENT_CONFIG" not in os.environ: print "The WMAGENT_CONFIG environment variable needs to be set before this can run" sys.exit(1) wmagentConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"]) if hasattr(wmagentConfig, "AnalyticsDataCollector") and hasattr( wmagentConfig.AnalyticsDataCollector, "centralWMStatsURL"): wmstats = WMStatsWriter( wmagentConfig.AnalyticsDataCollector.centralWMStatsURL) else: print "AnalyticsDataCollector.centralWMStatsURL is not specified" sys.exit(1) parser = OptionParser() parser.set_usage("wmstats-request-status-chagne [agent_url:port]") parser.add_option("-r", "--request", dest="request", help="resquest name") parser.add_option("-s", "--status", dest="newstatus", help="set to new status") (options, args) = parser.parse_args()
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring.
    """

    def __init__(self, config):
        """
        Initialize properties specified from config.

        :param config: WMAgent configuration; reads the
                       AnalyticsDataCollector and General sections.
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None

    def setup(self, parameters):
        """
        Set db connections (couchdb, wmbs) to prepare to gather information.
        """
        # set the connection to local queue (Tier-0 agents have no local queue)
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                 appName="WMStatsAgent")

        # use local db for tier0
        if hasattr(self.config, "Tier0Feeder"):
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                     couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
        # TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)

        self.dbsBufferUtil = DBSBufferUtil()

        if self.pluginName is not None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    @timeFunction
    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch, combine it and
        bulk-upload the request documents to the central WMStats couch.
        On failure the error message is pushed to the agent document so the
        problem is visible centrally; the cycle retries later.
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()
            skippedInfoFromCouch = self.localCouchDB.getSkippedFilesSummaryByWorkflow()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            logging.info("Getting DBS PhEDEx upload status ...")
            completedWfs = self.dbsBufferUtil.getPhEDExDBSStatusForCompletedWorkflows(summary=True)

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info("""Combining data from Job Couch(%s), FWJR(%s), WorkflowsWithSkippedFile(%s), Batch Job(%s), Finished Tasks(%s), Local Queue(%s) Completed workflows(%s)..  ...""",
                         len(jobInfoFromCouch), len(fwjrInfoFromCouch), len(skippedInfoFromCouch),
                         len(batchJobInfo), len(finishedTasks), len(localQInfo), len(completedWfs))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            tempCombinedData2 = combineAnalyticsData(tempCombinedData, localQInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData2, completedWfs)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            logging.info("%s requests Data combined,\n uploading request data...",
                         len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch,
                                                   finishedTasks, skippedInfoFromCouch,
                                                   self.agentInfo, uploadTime, self.summaryLevel)

            # fixed: `!= None` -> `is not None` for consistency with setup()
            if self.plugin is not None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            existingDocs = self.centralWMStatsCouchDB.getAllAgentRequestRevByID(self.agentInfo["agent_url"])
            self.centralWMStatsCouchDB.bulkUpdateData(requestDocs, existingDocs)
            logging.info("Request data upload success\n %s request, \nsleep for next cycle",
                         len(requestDocs))
            self.centralWMStatsCouchDB.updateAgentInfoInPlace(self.agentInfo["agent_url"],
                                                              {"data_last_update": uploadTime,
                                                               "data_error": "ok"})
        except Exception as ex:
            msg = str(ex)
            logging.exception("Error occurred, will retry later: %s", msg)
            # best-effort: report the failure on the central agent doc
            try:
                self.centralWMStatsCouchDB.updateAgentInfoInPlace(self.agentInfo["agent_url"],
                                                                  {"data_error": msg})
            # fixed: bare `except:` also caught SystemExit/KeyboardInterrupt
            except Exception:
                logging.error("upload Agent Info to central couch failed")
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config

        :param config: WMAgent configuration; reads AnalyticsDataCollector,
                       AgentStatusWatcher, JobStateMachine and (optionally)
                       Tier0Feeder / WorkQueueManager sections
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        # path where the collected agent info is also dumped as JSON
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        # Build the list of couch replications this agent needs (wmstats
        # push, plus either T0 request db or workqueue inbox replication),
        # then delete any stale replicator docs and start continuous
        # replications. NOTE: relies on self.localCouchMonitor, which is
        # created in setup() before this is called.
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            # bidirectional replication between the global queue and the
            # local queue inbox, both filtered on this agent's elements
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                        'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'], rp['target'], filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)
        # First cicle need to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with job counts by status/priority and per-site summaries
        """
        results = {}
        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        # Check the health of every replication set up in
        # setUpCouchDBReplication(); any failing replication marks the
        # whole CouchServer entry as 'error'.
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        # NOTE: ordering matters here — drain/disk only downgrade 'ok' to
        # 'warning', while data/couch problems escalate either to 'error'.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s",
                      results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s",
                      results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s",
                      results['wmbsExecutingTypeCount'])
        logging.debug("Total number of active jobs in BossAir sorted by status: %s",
                      results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s",
                      results['completeRunJobByStatus'])
        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s",
                      results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s",
                      results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (usually X509_USER_CERT) and raise either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)
        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
class AccountantWorker(WMConnectionBase): """ Class that actually does the work of parsing FWJRs for the Accountant Run through ProcessPool """ def __init__(self, config): """ __init__ Create all DAO objects that are used by this class. """ WMConnectionBase.__init__(self, "WMCore.WMBS") myThread = threading.currentThread() self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=myThread.logger, dbinterface=myThread.dbi) self.getOutputMapAction = self.daofactory( classname="Jobs.GetOutputMap") self.bulkAddToFilesetAction = self.daofactory( classname="Fileset.BulkAddByLFN") self.bulkParentageAction = self.daofactory( classname="Files.AddBulkParentage") self.getJobTypeAction = self.daofactory(classname="Jobs.GetType") self.getParentInfoAction = self.daofactory( classname="Files.GetParentInfo") self.setParentageByJob = self.daofactory( classname="Files.SetParentageByJob") self.setParentageByMergeJob = self.daofactory( classname="Files.SetParentageByMergeJob") self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi") self.setFileLocation = self.daofactory( classname="Files.SetLocationByLFN") self.setFileAddChecksum = self.daofactory( classname="Files.AddChecksumByLFN") self.addFileAction = self.daofactory(classname="Files.Add") self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput") self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk") self.getWorkflowSpec = self.daofactory( classname="Workflow.GetSpecAndNameFromTask") self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID") self.getFullJobInfo = self.daofactory( classname="Jobs.LoadForErrorHandler") self.getJobTaskNameAction = self.daofactory( classname="Jobs.GetFWJRTaskName") self.pnn_to_psn = self.daofactory( classname="Locations.GetPNNtoPSNMapping").execute() self.dbsStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.SetStatus") self.dbsParentStatusAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetParentStatus") 
self.dbsChildrenAction = self.dbsDaoFactory( classname="DBSBufferFiles.GetChildren") self.dbsCreateFiles = self.dbsDaoFactory( classname="DBSBufferFiles.Add") self.dbsSetLocation = self.dbsDaoFactory( classname="DBSBufferFiles.SetLocationByLFN") self.dbsInsertLocation = self.dbsDaoFactory( classname="DBSBufferFiles.AddLocation") self.dbsSetChecksum = self.dbsDaoFactory( classname="DBSBufferFiles.AddChecksumByLFN") self.dbsSetRunLumi = self.dbsDaoFactory( classname="DBSBufferFiles.AddRunLumi") self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow") self.dbsLFNHeritage = self.dbsDaoFactory( classname="DBSBufferFiles.BulkHeritageParent") self.stateChanger = ChangeState(config) # Decide whether or not to attach jobReport to returned value self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False) # Store location for the specs for DBS self.specDir = getattr(config.JobAccountant, 'specDir', None) # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco self.maxAllowedRepackOutputSize = getattr( config.JobAccountant, 'maxAllowedRepackOutputSize', 12 * 1024 * 1024 * 1024) # ACDC service self.dataCollection = DataCollectionService( url=config.ACDC.couchurl, database=config.ACDC.database) jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url'] jobDBName = config.JobStateMachine.couchDBName jobCouchdb = CouchServer(jobDBurl) self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName) self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent") # Hold data for later commital self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} self.count = 0 self.datasetAlgoID = collections.deque(maxlen=1000) 
self.datasetAlgoPaths = collections.deque(maxlen=1000) self.dbsLocations = set() self.workflowIDs = collections.deque(maxlen=1000) self.workflowPaths = collections.deque(maxlen=1000) self.phedex = PhEDEx() self.locLists = self.phedex.getNodeMap() return def reset(self): """ _reset_ Reset all global vars between runs. """ self.dbsFilesToCreate = [] self.wmbsFilesToBuild = [] self.wmbsMergeFilesToBuild = [] self.fileLocation = None self.mergedOutputFiles = [] self.listOfJobsToSave = [] self.listOfJobsToFail = [] self.filesetAssoc = [] self.parentageBinds = [] self.parentageBindsForMerge = [] self.jobsWithSkippedFiles = {} gc.collect() return def loadJobReport(self, parameters): """ _loadJobReport_ Given a framework job report on disk, load it and return a FwkJobReport instance. If there is any problem loading or parsing the framework job report return None. """ # The jobReportPath may be prefixed with "file://" which needs to be # removed so it doesn't confuse the FwkJobReport() parser. jobReportPath = parameters.get("fwjr_path", None) if not jobReportPath: logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty") jobReportPath = jobReportPath.replace("file://", "") if not os.path.exists(jobReportPath): logging.error("Bad FwkJobReport Path: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath) if os.path.getsize(jobReportPath) == 0: logging.error("Empty FwkJobReport: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath) jobReport = Report() try: jobReport.load(jobReportPath) except Exception as ex: msg = "Error loading jobReport %s\n" % jobReportPath msg += str(ex) logging.error(msg) logging.debug("Failing job: %s\n" % parameters) return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport') if len(jobReport.listSteps()) == 0: 
logging.error("FwkJobReport with no steps: %s" % jobReportPath) return self.createMissingFWKJR( parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath) return jobReport def isTaskExistInFWJR(self, jobReport, jobStatus): """ If taskName is not available in the FWJR, then tries to recover it getting data from the SQL database. """ if not jobReport.getTaskName(): logging.warning( "Trying to recover a corrupted FWJR for a %s job with job id %s" % (jobStatus, jobReport.getJobID())) jobInfo = self.getJobTaskNameAction.execute( jobId=jobReport.getJobID(), conn=self.getDBConn(), transaction=self.existingTransaction()) jobReport.setTaskName(jobInfo['taskName']) jobReport.save(jobInfo['fwjr_path']) if not jobReport.getTaskName(): msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % ( jobStatus, jobReport.getJobID()) raise AccountantWorkerException(msg) else: logging.info( "TaskName '%s' successfully recovered and added to fwjr id %s." % (jobReport.getTaskName(), jobReport.getJobID())) return def __call__(self, parameters): """ __call__ Handle a completed job. The parameters dictionary will contain the job ID and the path to the framework job report. 
""" returnList = [] self.reset() for job in parameters: logging.info("Handling %s" % job["fwjr_path"]) # Load the job and set the ID fwkJobReport = self.loadJobReport(job) fwkJobReport.setJobID(job['id']) jobSuccess = self.handleJob(jobID=job["id"], fwkJobReport=fwkJobReport) if self.returnJobReport: returnList.append({ 'id': job["id"], 'jobSuccess': jobSuccess, 'jobReport': fwkJobReport }) else: returnList.append({'id': job["id"], 'jobSuccess': jobSuccess}) self.count += 1 self.beginTransaction() # Now things done at the end of the job # Do what we can with WMBS files self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds) # handle merge files separately since parentage need to set # separately to support robust merge self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge) # Create DBSBufferFiles self.createFilesInDBSBuffer() # Handle filesetAssoc if len(self.filesetAssoc) > 0: self.bulkAddToFilesetAction.execute( binds=self.filesetAssoc, conn=self.getDBConn(), transaction=self.existingTransaction()) # Move successful jobs to successful if len(self.listOfJobsToSave) > 0: idList = [x['id'] for x in self.listOfJobsToSave] outcomeBinds = [{ 'jobid': x['id'], 'outcome': x['outcome'] } for x in self.listOfJobsToSave] self.setBulkOutcome.execute(binds=outcomeBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.jobCompleteInput.execute( id=idList, lfnsToSkip=self.jobsWithSkippedFiles, conn=self.getDBConn(), transaction=self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete") # If we have failed jobs, fail them if len(self.listOfJobsToFail) > 0: outcomeBinds = [{ 'jobid': x['id'], 'outcome': x['outcome'] } for x in self.listOfJobsToFail] self.setBulkOutcome.execute(binds=outcomeBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete") # Arrange WMBS parentage if len(self.parentageBinds) > 
0: self.setParentageByJob.execute( binds=self.parentageBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) if len(self.parentageBindsForMerge) > 0: self.setParentageByMergeJob.execute( binds=self.parentageBindsForMerge, conn=self.getDBConn(), transaction=self.existingTransaction()) # Straighten out DBS Parentage if len(self.mergedOutputFiles) > 0: self.handleDBSBufferParentage() if len(self.jobsWithSkippedFiles) > 0: self.handleSkippedFiles() self.commitTransaction(existingTransaction=False) return returnList def outputFilesetsForJob(self, outputMap, merged, moduleLabel): """ _outputFilesetsForJob_ Determine if the file should be placed in any other fileset. Note that this will not return the JobGroup output fileset as all jobs will have their output placed there. """ if moduleLabel not in outputMap: logging.info("Output module label missing from output map.") return [] outputFilesets = [] for outputFileset in outputMap[moduleLabel]: if merged == False and outputFileset["output_fileset"] != None: outputFilesets.append(outputFileset["output_fileset"]) else: if outputFileset["merged_output_fileset"] != None: outputFilesets.append( outputFileset["merged_output_fileset"]) return outputFilesets def addFileToDBS(self, jobReportFile, task, errorDataset=False): """ _addFileToDBS_ Add a file that was output from a job to the DBS buffer. 
""" datasetInfo = jobReportFile["dataset"] dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"], size=jobReportFile["size"], events=jobReportFile["events"], checksums=jobReportFile["checksums"], status="NOTUPLOADED") dbsFile.setAlgorithm(appName=datasetInfo["applicationName"], appVer=datasetInfo["applicationVersion"], appFam=jobReportFile["module_label"], psetHash="GIBBERISH", configContent=jobReportFile.get('configURL')) if errorDataset: dbsFile.setDatasetPath( "/%s/%s/%s" % (datasetInfo["primaryDataset"] + "-Error", datasetInfo["processedDataset"], datasetInfo["dataTier"])) else: dbsFile.setDatasetPath( "/%s/%s/%s" % (datasetInfo["primaryDataset"], datasetInfo["processedDataset"], datasetInfo["dataTier"])) dbsFile.setValidStatus( validStatus=jobReportFile.get("validStatus", None)) dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None)) dbsFile.setAcquisitionEra( era=jobReportFile.get('acquisitionEra', None)) dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None)) #TODO need to find where to get the prep id dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None)) dbsFile['task'] = task for run in jobReportFile["runs"]: newRun = Run(runNumber=run.run) newRun.extend(run.lumis) dbsFile.addRun(newRun) dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0], immediateSave=False) self.dbsFilesToCreate.append(dbsFile) return def findDBSParents(self, lfn): """ _findDBSParents_ Find the parent of the file in DBS This is meant to be called recursively """ parentsInfo = self.getParentInfoAction.execute( [lfn], conn=self.getDBConn(), transaction=self.existingTransaction()) newParents = set() for parentInfo in parentsInfo: # This will catch straight to merge files that do not have redneck # parents. We will mark the straight to merge file from the job # as a child of the merged parent. 
if int(parentInfo["merged"]) == 1: newParents.add(parentInfo["lfn"]) elif parentInfo['gpmerged'] == None: continue # Handle the files that result from merge jobs that aren't redneck # children. We have to setup parentage and then check on whether or # not this file has any redneck children and update their parentage # information. elif int(parentInfo["gpmerged"]) == 1: newParents.add(parentInfo["gplfn"]) # If that didn't work, we've reached the great-grandparents # And we have to work via recursion else: parentSet = self.findDBSParents(lfn=parentInfo['gplfn']) for parent in parentSet: newParents.add(parent) return newParents def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None): """ _addFileToWMBS_ Add a file that was produced in a job to WMBS. """ fwjrFile["first_event"] = jobMask["FirstEvent"] if fwjrFile["first_event"] == None: fwjrFile["first_event"] = 0 if jobType == "Merge" and fwjrFile["module_label"] != "logArchive": setattr(fwjrFile["fileRef"], 'merged', True) fwjrFile["merged"] = True wmbsFile = self.createFileFromDataStructsFile(file=fwjrFile, jobID=jobID) if jobType == "Merge": self.wmbsMergeFilesToBuild.append(wmbsFile) else: self.wmbsFilesToBuild.append(wmbsFile) if fwjrFile["merged"]: self.addFileToDBS( fwjrFile, task, jobType == "Repack" and fwjrFile["size"] > self.maxAllowedRepackOutputSize) return wmbsFile def _mapLocation(self, fwkJobReport): for file in fwkJobReport.getAllFileRefs(): if file and hasattr(file, 'location'): file.location = self.phedex.getBestNodeName( file.location, self.locLists) def handleJob(self, jobID, fwkJobReport): """ _handleJob_ Figure out if a job was successful or not, handle it appropriately (parse FWJR, update WMBS) and return the success status as a boolean """ jobSuccess = fwkJobReport.taskSuccessful() outputMap = self.getOutputMapAction.execute( jobID=jobID, conn=self.getDBConn(), transaction=self.existingTransaction()) jobType = self.getJobTypeAction.execute( jobID=jobID, conn=self.getDBConn(), 
transaction=self.existingTransaction()) if jobSuccess: fileList = fwkJobReport.getAllFiles() # consistency check comparing outputMap to fileList # they should match except for some limited special cases outputModules = set([]) for fwjrFile in fileList: outputModules.add(fwjrFile['outputModule']) if set(outputMap.keys()) == outputModules: pass elif jobType == "LogCollect" and len( outputMap.keys()) == 0 and outputModules == set( ['LogCollect']): pass elif jobType == "Merge" and set(outputMap.keys()) == set([ 'Merged', 'MergedError', 'logArchive' ]) and outputModules == set(['Merged', 'logArchive']): pass elif jobType == "Merge" and set(outputMap.keys()) == set([ 'Merged', 'MergedError', 'logArchive' ]) and outputModules == set(['MergedError', 'logArchive']): pass elif jobType == "Express" and set( outputMap.keys()).difference(outputModules) == set( ['write_RAW']): pass else: failJob = True if jobType in ["Processing", "Production"]: cmsRunSteps = 0 for step in fwkJobReport.listSteps(): if step.startswith("cmsRun"): cmsRunSteps += 1 if cmsRunSteps > 1: failJob = False if failJob: jobSuccess = False logging.error( "Job %d , list of expected outputModules does not match job report, failing job", jobID) logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys())) logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules)) fileList = fwkJobReport.getAllFilesFromStep( step='logArch1') else: logging.debug( "Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID) else: fileList = fwkJobReport.getAllFilesFromStep(step='logArch1') if jobSuccess: logging.info("Job %d , handle successful job", jobID) else: logging.warning("Job %d , bad jobReport, failing job", jobID) # make sure the task name is present in FWJR (recover from WMBS if needed) if len(fileList) > 0: if jobSuccess: self.isTaskExistInFWJR(fwkJobReport, "success") else: self.isTaskExistInFWJR(fwkJobReport, "failed") # 
special check for LogCollect jobs skipLogCollect = False if jobSuccess and jobType == "LogCollect": for fwjrFile in fileList: try: # this assumes there is only one file for LogCollect jobs, not sure what happend if that changes self.associateLogCollectToParentJobsInWMStats( fwkJobReport, fwjrFile["lfn"], fwkJobReport.getTaskName()) except Exception as ex: skipLogCollect = True logging.error( "Error occurred: associating log collect location, will try again\n %s" % str(ex)) break # now handle the job (unless the special LogCollect check failed) if not skipLogCollect: wmbsJob = Job(id=jobID) wmbsJob.load() outputID = wmbsJob.loadOutputID() wmbsJob.getMask() wmbsJob["fwjr"] = fwkJobReport if jobSuccess: wmbsJob["outcome"] = "success" else: wmbsJob["outcome"] = "failure" for fwjrFile in fileList: logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"]) wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"], jobID=jobID, task=fwkJobReport.getTaskName()) merged = fwjrFile['merged'] moduleLabel = fwjrFile["module_label"] if merged: self.mergedOutputFiles.append(wmbsFile) self.filesetAssoc.append({ "lfn": wmbsFile["lfn"], "fileset": outputID }) # LogCollect jobs have no output fileset if jobType == "LogCollect": pass # Repack jobs that wrote too large merged output skip output filesets elif jobType == "Repack" and merged and wmbsFile[ "size"] > self.maxAllowedRepackOutputSize: pass else: outputFilesets = self.outputFilesetsForJob( outputMap, merged, moduleLabel) for outputFileset in outputFilesets: self.filesetAssoc.append({ "lfn": wmbsFile["lfn"], "fileset": outputFileset }) # Check if the job had any skipped files, put them in ACDC containers # We assume full file processing (no job masks) if jobSuccess: skippedFiles = fwkJobReport.getAllSkippedFiles() if skippedFiles and jobType not in ['LogCollect', 'Cleanup']: self.jobsWithSkippedFiles[jobID] = skippedFiles # Only save once job is done, and we're sure we made it through okay 
self._mapLocation(wmbsJob['fwjr']) if jobSuccess: self.listOfJobsToSave.append(wmbsJob) else: self.listOfJobsToFail.append(wmbsJob) return jobSuccess def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task): """ _associateLogCollectToParentJobsInWMStats_ Associate a logArchive output to its parent job """ inputFileList = fwkJobReport.getAllInputFiles() requestName = task.split('/')[1] keys = [] for inputFile in inputFileList: keys.append([requestName, inputFile["lfn"]]) resultRows = self.fwjrCouchDB.loadView( "FWJRDump", 'jobsByOutputLFN', options={"stale": "update_after"}, keys=keys)['rows'] if len(resultRows) > 0: #get data from wmbs parentWMBSJobIDs = [] for row in resultRows: parentWMBSJobIDs.append({"jobid": row["value"]}) #update Job doc in wmstats results = self.getJobInfoByID.execute(parentWMBSJobIDs) parentJobNames = [] if isinstance(results, list): for jobInfo in results: parentJobNames.append(jobInfo['name']) else: parentJobNames.append(results['name']) self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN) else: #TODO: if the couch db is consistent with DB this should be removed (checking resultRow > 0) #It need to be failed and retried. logging.error( "job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList) return def createMissingFWKJR(self, parameters, errorCode=999, errorDescription='Failure of unknown type'): """ _createMissingFWJR_ Create a missing FWJR if the report can't be found by the code in the path location. """ report = Report() report.addError("cmsRun1", 84, errorCode, errorDescription) report.data.cmsRun1.status = "Failed" return report def createFilesInDBSBuffer(self): """ _createFilesInDBSBuffer_ It does the actual job of creating things in DBSBuffer WARNING: This assumes all files in a job have the same final location """ if len(self.dbsFilesToCreate) == 0: # Whoops, nothing to do! 
return dbsFileTuples = [] dbsFileLoc = [] dbsCksumBinds = [] runLumiBinds = [] selfChecksums = None jobLocations = set() for dbsFile in self.dbsFilesToCreate: # Append a tuple in the format specified by DBSBufferFiles.Add # Also run insertDatasetAlgo assocID = None datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % ( dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"], dbsFile["appFam"], dbsFile["psetHash"], dbsFile['processingVer'], dbsFile['acquisitionEra'], dbsFile['globalTag']) # First, check if this is in the cache if datasetAlgoPath in self.datasetAlgoPaths: for da in self.datasetAlgoID: if da['datasetAlgoPath'] == datasetAlgoPath: assocID = da['assocID'] break if not assocID: # Then we have to get it ourselves try: assocID = dbsFile.insertDatasetAlgo() self.datasetAlgoPaths.append(datasetAlgoPath) self.datasetAlgoID.append({ 'datasetAlgoPath': datasetAlgoPath, 'assocID': assocID }) except WMException: raise except Exception as ex: msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath msg += str(ex) logging.error(msg) raise AccountantWorkerException(msg) # Associate the workflow to the file using the taskPath and the requestName # TODO: debug why it happens and then drop/recover these cases automatically taskPath = dbsFile.get('task') if not taskPath: msg = "Can't do workflow association, report this error to a developer.\n" msg += "DbsFile : %s" % str(dbsFile) raise AccountantWorkerException(msg) workflowName = taskPath.split('/')[1] workflowPath = '%s:%s' % (workflowName, taskPath) if workflowPath in self.workflowPaths: for wf in self.workflowIDs: if wf['workflowPath'] == workflowPath: workflowID = wf['workflowID'] break else: result = self.dbsGetWorkflow.execute( workflowName, taskPath, conn=self.getDBConn(), transaction=self.existingTransaction()) workflowID = result['id'] self.workflowPaths.append(workflowPath) self.workflowIDs.append({ 'workflowPath': workflowPath, 'workflowID': workflowID }) lfn = dbsFile['lfn'] 
selfChecksums = dbsFile['checksums'] jobLocation = dbsFile.getLocations()[0] jobLocations.add(jobLocation) dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'], assocID, dbsFile['status'], workflowID)) dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation}) if dbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): dbsCksumBinds.append({ 'lfn': lfn, 'cksum': selfChecksums[entry], 'cktype': entry }) try: diffLocation = jobLocations.difference(self.dbsLocations) for jobLocation in diffLocation: self.dbsInsertLocation.execute( siteName=jobLocation, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsLocations.add(jobLocation) self.dbsCreateFiles.execute(files=dbsFileTuples, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsSetLocation.execute(binds=dbsFileLoc, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dbsSetChecksum.execute(bulkList=dbsCksumBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) if len(runLumiBinds) > 0: self.dbsSetRunLumi.execute( file=runLumiBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Got exception while inserting files into DBSBuffer!\n" msg += str(ex) logging.error(msg) logging.debug("Listing binds:") logging.debug("jobLocation: %s\n" % jobLocation) logging.debug("dbsFiles: %s\n" % dbsFileTuples) logging.debug("dbsFileLoc: %s\n" % dbsFileLoc) logging.debug("Checksum binds: %s\n" % dbsCksumBinds) logging.debug("RunLumi binds: %s\n" % runLumiBinds) raise AccountantWorkerException(msg) # Now that we've created those files, clear the list self.dbsFilesToCreate = [] return def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds): """ _handleWMBSFiles_ Do what can be done in bulk in bulk """ if len(wmbsFilesToBuild) == 0: # Nothing to do 
return runLumiBinds = [] fileCksumBinds = [] fileLocations = [] fileCreate = [] for wmbsFile in wmbsFilesToBuild: lfn = wmbsFile['lfn'] if lfn == None: continue selfChecksums = wmbsFile['checksums'] # by jobType add to different parentage relation # if it is the merge job, don't include the parentage on failed input files. # otherwise parentage is set for all input files. parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']}) if wmbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']}) if len(wmbsFile.getLocations()) > 0: outpnn = wmbsFile.getLocations()[0] if self.pnn_to_psn.get(outpnn, None): fileLocations.append({'lfn': lfn, 'location': outpnn}) else: msg = "PNN doesn't exist in wmbs_location_sename table: %s (investigate)" % outpnn logging.error(msg) raise AccountantWorkerException(msg) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): fileCksumBinds.append({ 'lfn': lfn, 'cksum': selfChecksums[entry], 'cktype': entry }) fileCreate.append([ lfn, wmbsFile['size'], wmbsFile['events'], None, wmbsFile["first_event"], wmbsFile['merged'] ]) if len(fileCreate) == 0: return try: self.addFileAction.execute(files=fileCreate, conn=self.getDBConn(), transaction=self.existingTransaction()) if runLumiBinds: self.setFileRunLumi.execute( file=runLumiBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.setFileAddChecksum.execute( bulkList=fileCksumBinds, conn=self.getDBConn(), transaction=self.existingTransaction()) self.setFileLocation.execute( lfn=fileLocations, location=self.fileLocation, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while adding files to WMBS!\n" msg += str(ex) logging.error(msg) logging.debug("Printing binds: \n") logging.debug("FileCreate binds: %s\n" % fileCreate) logging.debug("Runlumi binds: %s\n" % runLumiBinds) logging.debug("Checksum binds: 
%s\n" % fileCksumBinds) logging.debug("FileLocation binds: %s\n" % fileLocations) raise AccountantWorkerException(msg) # Clear out finished files wmbsFilesToBuild = [] return def createFileFromDataStructsFile(self, file, jobID): """ _createFileFromDataStructsFile_ This function will create a WMBS File given a DataStructs file """ wmbsFile = File() wmbsFile.update(file) if isinstance(file["locations"], set): pnn = list(file["locations"])[0] elif isinstance(file["locations"], list): if len(file['locations']) > 1: logging.error( "Have more then one location for a file in job %i" % (jobID)) logging.error("Choosing location %s" % (file['locations'][0])) pnn = file["locations"][0] else: pnn = file["locations"] wmbsFile["locations"] = set() if pnn != None: wmbsFile.setLocation(pnn=pnn, immediateSave=False) wmbsFile['jid'] = jobID return wmbsFile def handleDBSBufferParentage(self): """ _handleDBSBufferParentage_ Handle all the DBSBuffer Parentage in bulk if you can """ outputLFNs = [f['lfn'] for f in self.mergedOutputFiles] bindList = [] for lfn in outputLFNs: newParents = self.findDBSParents(lfn=lfn) for parentLFN in newParents: bindList.append({'child': lfn, 'parent': parentLFN}) # Now all the parents should exist # Commit them to DBSBuffer logging.info("About to commit all DBSBuffer Heritage information") logging.info(len(bindList)) if len(bindList) > 0: try: self.dbsLFNHeritage.execute( binds=bindList, conn=self.getDBConn(), transaction=self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while trying to handle the DBS LFN heritage\n" msg += str(ex) msg += "BindList: %s" % bindList logging.error(msg) raise AccountantWorkerException(msg) return def handleSkippedFiles(self): """ _handleSkippedFiles_ Handle all the skipped files in bulk, the way it handles the skipped files imposes an important restriction: Skipped files should have been processed by a single job in the task and no job mask exists in it. 
This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms. Here ACDC records and created and the file are moved to wmbs_sub_files_failed from completed. """ jobList = self.getFullJobInfo.execute( [{ 'jobid': x } for x in self.jobsWithSkippedFiles.keys()], fileSelection=self.jobsWithSkippedFiles, conn=self.getDBConn(), transaction=self.existingTransaction()) self.dataCollection.failedJobs(jobList, useMask=False) return
def updateRequestStatus(couchURL, requestList, status):
    """
    _updateRequestStatus_

    Set every request in requestList to the given status in the WMStats
    couch instance at couchURL, printing one confirmation line per request.

    :param couchURL: URL of the WMStats couch database
    :param requestList: iterable of request names to update
    :param status: target status string
    """
    ww = WMStatsWriter(couchURL)
    for request in requestList:
        ww.updateRequestStatus(request, status)
        # Fixed typo in user-facing message: "udated" -> "updated"
        print("%s is updated to %s" % (request, status))
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.

        Wires up WMBS and DBS3Buffer DAOs, the state changer, the ACDC and
        couch/WMStats services, and initializes the per-cycle accumulators
        that __call__ commits in bulk.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        # Separate DAO factory for the DBS3Buffer schema (WMBS factory comes
        # from WMConnectionBase as self.daofactory).
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        # WMBS DAOs
        self.getOutputMapAction = self.daofactory(classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(classname="Jobs.GetFWJRTaskName")
        # Eagerly executed once: PNN -> PSN mapping used to validate output locations.
        self.pnn_to_psn = self.daofactory(classname="Locations.GetPNNtoPSNMapping").execute()

        # DBS3Buffer DAOs
        self.dbsStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker',
                                       False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(config.JobAccountant, 'maxAllowedRepackOutputSize',
                                                  12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent")

        # Hold data for later commital
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        # Bounded caches (deque drops oldest entries past maxlen) for
        # dataset/algo and workflow ID lookups in createFilesInDBSBuffer.
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return