Example #1
0
    def moveToArchived(self, config):
        """
        Archive aborted and rejected workflows that have no jobs.

        For each of the "aborted" and "rejected" statuses, the matching requests
        are fetched from WMStats together with their job information. Every
        request whose job summary reports zero total jobs is moved, in order,
        through the follow-up statuses configured for its current status.

        :param config: object providing the `wmstats_url` and `reqmgrdb_url` attributes
        :return: None
        """
        testbedWMStats = WMStatsReader(config.wmstats_url, reqdbURL=config.reqmgrdb_url)
        reqdbWriter = RequestDBWriter(config.reqmgrdb_url)

        # current status -> ordered list of statuses to apply to the request
        statusTransition = {"aborted": ["aborted-completed", "aborted-archived"], "rejected": ["rejected-archived"]}

        for status, nextStatusList in statusTransition.items():

            requests = testbedWMStats.getRequestByStatus([status], jobInfoFlag=True, legacyFormat=True)

            # arguments are handed to the logger so formatting only happens
            # when the record is actually emitted
            self.logger.info("checking %s workflows: %d", status, len(requests))

            if len(requests) > 0:

                requestCollection = RequestInfoCollection(requests)

                requestsDict = requestCollection.getData()
                numOfArchived = 0

                for requestName, requestInfo in requestsDict.items():

                    # only requests without any job are eligible for archival
                    if requestInfo.getJobSummary().getTotalJobs() == 0:
                        for nextStatus in nextStatusList:
                            reqdbWriter.updateRequestStatus(requestName, nextStatus)
                        numOfArchived += 1

                # NOTE: the "archieved" spelling is kept so existing log searches still match
                self.logger.info("Total %s-archieved: %d", status, numOfArchived)

        return
Example #2
0
    def setup(self, parameters):
        """
        Called at startup.

        Creates the WMStats reader/writer handles from the TaskArchiver
        configuration, picks the deletable states depending on whether ReqMgr
        is used for the completion check, and connects to the jobs, fwjrs and
        summary-stats couch databases named in the JobStateMachine configuration.
        """
        archiverConfig = self.config.TaskArchiver
        self.useReqMgrForCompletionCheck = getattr(archiverConfig, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(archiverConfig.localWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(archiverConfig.centralWMStatsURL)

        if not self.useReqMgrForCompletionCheck:
            # Tier0 case: the local writer doubles as the central one
            self.deletableStates = ["completed"]
            self.centralCouchDBWriter = self.wmstatsCouchDB
        else:
            self.deletableStates = ["announced"]
            self.centralCouchDBWriter = WMStatsWriter(archiverConfig.centralWMStatsURL)
            self.reqmgrSvc = RequestManager({'endpoint': archiverConfig.ReqMgrServiceURL})

        stateMachineConfig = self.config.JobStateMachine
        jobDBurl = sanitizeURL(stateMachineConfig.couchurl)['url']
        jobDBName = stateMachineConfig.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)

        self.statsumdatabase = self.jobCouchdb.connectDatabase(stateMachineConfig.summaryStatsDBName)
Example #3
0
 def __init__(self, couchURL, appName="WMStats", reqdbURL=None, reqdbCouchApp="ReqMgr"):
     """
     Initialize by forwarding every argument, unchanged and in the same
     order, to the WMStatsReader initializer.
     """
     readerArgs = (couchURL, appName, reqdbURL, reqdbCouchApp)
     WMStatsReader.__init__(self, *readerArgs)
Example #4
0
    def __init__(self, config):
        """
        Initialize the worker thread from the agent configuration.

        Stores the SSB-to-agent status mapping, the CPU/IO task type lists and
        minimum slot counts, then reads the AgentStatusWatcher settings and
        creates the Grafana, ResourceControl and WMStatsReader handles.

        :param config: agent configuration object exposing the
            AgentStatusWatcher, Agent and General sections read below
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # SSB site status -> agent site status
        self.ssb2AgentStatus = {'enabled': 'Normal',
                                'drain': 'Draining',
                                'disabled': 'Down',
                                'test': 'Draining',
                                'unknown': None}
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherConfig = config.AgentStatusWatcher

        # Grafana access: token, endpoint and SSB API name come from the config
        grafanaToken = watcherConfig.grafanaToken
        self.grafanaURL = watcherConfig.grafanaURL
        self.grafanaAPIName = watcherConfig.grafanaSSB
        self.grafana = Grafana(grafanaToken, configDict={"endpoint": self.grafanaURL})

        # pending/running percentages
        self.pendingSlotsSitePercent = watcherConfig.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherConfig.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherConfig.runningExpressPercent
        self.runningRepackPercent = watcherConfig.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(watcherConfig, 'forceSiteDown', [])

        # agent team and default number of agents per team (optional setting)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherConfig, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherConfig.onlySSB

        # tier0 mode is flagged by the presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherConfig.t1SitesCores

        # component on/off switch (optional setting, on by default)
        self.enabled = getattr(watcherConfig, 'enabled', True)

        # resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.General.centralWMStatsURL)
Example #5
0
 def __init__(self, app, api, config, mount, t0flag=False):
     """
     Initialize the REST entity and create its WMStatsReader.

     The reader is built from the wmstats and reqmgr database URLs under the
     configured couch host; the request couch app is "T0Request" when
     `t0flag` is true and "ReqMgr" otherwise.
     """
     RESTEntity.__init__(self, app, api, config, mount)
     couchHost = self.config.couch_host
     wmstats_url = "%s/%s" % (couchHost, self.config.couch_wmstats_db)
     reqdb_url = "%s/%s" % (couchHost, self.config.couch_reqmgr_db)
     couchAppName = "T0Request" if t0flag else "ReqMgr"
     self.wmstats = WMStatsReader(wmstats_url, reqdbURL=reqdb_url, reqdbCouchApp=couchAppName)
Example #6
0
 def __init__(self, app, api, config, mount):
     """
     Initialize the REST entity and create a WMStatsReader for the wmstats
     and reqmgr databases under the configured couch host ("ReqMgr" couch app).
     """
     RESTEntity.__init__(self, app, api, config, mount)
     couchHost = self.config.couch_host
     readerOptions = {"reqdbURL": "%s/%s" % (couchHost, self.config.couch_reqmgr_db),
                      "reqdbCouchApp": "ReqMgr"}
     self.wmstats = WMStatsReader("%s/%s" % (couchHost, self.config.couch_wmstats_db), **readerOptions)
Example #7
0
 def setup(self, parameters):
     """
     Prepare resource control and the central WMStats reader.

     :param parameters: not used by this method
     """
     # set resource control
     self.resourceControl = ResourceControl(config=self.config)

     # wmstats connection
     self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
def main():
    """
    Drop the latest status entry of "Processing Done" workflows whose previous
    status was 'normal-archived'.

    Each matching request document has the last element of its
    'request_status' list removed and is queued on the database handle; all
    queued documents are committed once at the end.
    """
    reader = WMStatsReader("http://dummy.cern.ch:5984", "wmagent_summary")
    wmstats = Database('wmagent_summary', 'http://dummy.cern.ch:5984')
    suspiciousWorkflows = reader.workflowsByStatus(["Processing Done"], stale=False)
    for entry in suspiciousWorkflows:
        requestDoc = wmstats.document(entry)
        statusList = requestDoc['request_status']
        # a history with fewer than two entries has no previous status to
        # inspect; skip it instead of failing with IndexError before commit
        if len(statusList) < 2:
            continue
        if statusList[-2]['status'] == 'normal-archived':
            requestDoc['request_status'] = statusList[:-1]
            wmstats.queue(requestDoc)

    wmstats.commit()
Example #9
0
 def gatherT0ActiveDataStats(self, config):
     """
     Refresh the DataCache with the T0 active data when the cached job data
     has expired.

     :param config: object providing the `wmstats_url` and `reqmgrdb_url` attributes
     :return: None; any exception is logged as an error and not re-raised
     """
     try:
         if DataCache.islatestJobDataExpired():
             # the request DB URL is passed by keyword so it cannot be bound
             # to another positional parameter of the reader (e.g. appName)
             wmstatsDB = WMStatsReader(config.wmstats_url, reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp="T0Request")
             jobData = wmstatsDB.getT0ActiveData(jobInfoFlag=True)
             DataCache.setlatestJobData(jobData)
             self.logger.info("DataCache is updated: %s", len(jobData))
     except Exception as ex:
         self.logger.error(str(ex))
     return
 def gatherT0ActiveDataStats(self, config):
     """
     Update the DataCache with the T0 active data (job info included) if the
     cached job data has expired; otherwise do nothing.

     Any exception is logged as an error and not re-raised.
     """
     try:
         if not DataCache.islatestJobDataExpired():
             return
         reader = WMStatsReader(config.wmstats_url, reqdbURL=config.reqmgrdb_url,
                                reqdbCouchApp="T0Request")
         activeData = reader.getT0ActiveData(jobInfoFlag=True)
         DataCache.setlatestJobData(activeData)
         self.logger.info("DataCache is updated: %s", len(activeData))
     except Exception as err:
         self.logger.error(str(err))
     return
Example #11
0
 def gatherActiveDataStats(self, config):
     """
     Refresh the DataCache with the active data (job info included) when the
     cached job data has expired.

     :param config: object providing the `wmstats_url` and `reqmgrdb_url` attributes
     :return: None; any exception is written to the cherrypy error log and not re-raised
     """
     try:
         if DataCache.islatestJobDataExpired():
             # the request DB URL is passed by keyword so it cannot be bound
             # to another positional parameter of the reader (e.g. appName)
             wmstatsDB = WMStatsReader(config.wmstats_url, reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp="ReqMgr")
             jobData = wmstatsDB.getActiveData(jobInfoFlag=True)
             DataCache.setlatestJobData(jobData)

     except Exception as ex:
         cherrypy.log.error(str(ex))
     return
Example #12
0
    def __init__(self, config):
        """
        Initialize the worker thread from the agent configuration.

        Stores the CPU/IO task type lists and minimum slot counts, reads the
        AgentStatusWatcher settings (dashboard and metric columns, percentages,
        forced-down sites, SSB-only flag, T1 cores) and creates the Dashboard,
        ResourceControl and WMStatsReader handles.

        :param config: agent configuration object exposing the
            AgentStatusWatcher and Agent sections read below
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherConfig = config.AgentStatusWatcher

        # dashboard url and metric columns
        self.dashboard = watcherConfig.dashboard
        self.siteStatusMetric = watcherConfig.siteStatusMetric
        self.cpuBoundMetric = watcherConfig.cpuBoundMetric
        self.ioBoundMetric = watcherConfig.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # pending/running percentages
        self.pendingSlotsSitePercent = watcherConfig.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherConfig.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherConfig.runningExpressPercent
        self.runningRepackPercent = watcherConfig.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(watcherConfig, 'forceSiteDown', [])

        # agent team and default number of agents per team (optional setting)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherConfig, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherConfig.onlySSB

        # tier0 mode is flagged by the presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherConfig.t1SitesCores

        # component on/off switch (optional setting, on by default)
        self.enabled = getattr(watcherConfig, 'enabled', True)

        # resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
Example #13
0
    def gatherActiveDataStats(self, config):
        """
        Refresh the DataCache with the active data (job info included) when the
        cached job data has expired.

        :param config: object providing the `wmstats_url` and `reqmgrdb_url` attributes
        :return: None; any exception is written to the cherrypy error log and not re-raised
        """
        try:
            if DataCache.islatestJobDataExpired():
                # the request DB URL is passed by keyword so it cannot be bound
                # to another positional parameter of the reader (e.g. appName)
                wmstatsDB = WMStatsReader(config.wmstats_url,
                                          reqdbURL=config.reqmgrdb_url,
                                          reqdbCouchApp="ReqMgr")
                jobData = wmstatsDB.getActiveData(jobInfoFlag=True)
                DataCache.setlatestJobData(jobData)

        except Exception as ex:
            cherrypy.log.error(str(ex))
        return
Example #14
0
 def gatherActiveDataStats(self, config):
     """
     Refresh the DataCache with the latest job information of the active
     requests when the cached job data has expired.

     The request names in ACTIVE_STATUS are read from the request DB and
     their latest job info is fetched from WMStats.

     :param config: object providing the `requestDBURL` and `wmstatsURL` attributes
     :return: None; any exception is written to the cherrypy error log and not re-raised
     """
     try:
         if DataCache.islatestJobDataExpired():
             reqDB = RequestDBReader(config.requestDBURL)
             wmstatsDB = WMStatsReader(config.wmstatsURL)

             requestNames = reqDB.getRequestByStatus(ACTIVE_STATUS)
             jobData = wmstatsDB.getLatestJobInfoByRequests(requestNames)
             DataCache.setlatestJobData(jobData)

     # "except X as name" is valid on Python 2.6+ and Python 3, unlike the
     # comma form which is a syntax error on Python 3
     except Exception as ex:
         cherrypy.log.error(str(ex))
Example #15
0
class RequestInfo(RESTEntity):
    """
    REST entity returning the WMStats request summary, with job info, of one request.

    Meant to move under the WMStats server once that server is created.
    """

    def __init__(self, app, api, config, mount):
        """
        Initialize the REST entity and create a WMStatsReader for the wmstats
        and reqmgr databases under the configured couch host.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        couchHost = self.config.couch_host
        wmstats_url = "%s/%s" % (couchHost, self.config.couch_wmstats_db)
        reqdb_url = "%s/%s" % (couchHost, self.config.couch_reqmgr_db)
        self.wmstats = WMStatsReader(wmstats_url, reqdbURL=reqdb_url, reqdbCouchApp="ReqMgr")

    def validate(self, apiobj, method, api, param, safe):
        """
        Move a single positional argument from `param.args` to `safe.args`;
        any other number of arguments is left untouched.
        """
        if len(param.args) != 1:
            return
        onlyArg = param.args[0]
        safe.args.append(onlyArg)
        param.args.pop()

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self, request_name):
        """Return the request summary with job info for `request_name`, wrapped in rows()."""
        return rows([self.wmstats.getRequestSummaryWithJobInfo(request_name)])
Example #16
0
class FinishedStatusInfo(RESTEntity):
    """
    REST entity reporting the result of WMStats'
    isWorkflowCompletedWithLogCollectAndCleanUp check for one request.

    Meant to move under the WMStats server once that server is created.
    """

    def __init__(self, app, api, config, mount):
        """
        Initialize the REST entity and create a WMStatsReader for the wmstats
        and reqmgr databases under the configured couch host.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        couchHost = self.config.couch_host
        self.wmstats = WMStatsReader("%s/%s" % (couchHost, self.config.couch_wmstats_db),
                                     reqdbURL="%s/%s" % (couchHost, self.config.couch_reqmgr_db),
                                     reqdbCouchApp="ReqMgr")

    def validate(self, apiobj, method, api, param, safe):
        """
        Move a single positional argument from `param.args` to `safe.args`;
        any other number of arguments is left untouched.
        """
        if len(param.args) != 1:
            return
        safe.args.append(param.args[0])
        param.args.pop()

    @restcall(formats = [('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self, request_name):
        """
        Return the completion check result for `request_name`, wrapped in rows().

        A KeyError from the reader is turned into an HTTP 404 error.
        """
        try:
            completed = self.wmstats.isWorkflowCompletedWithLogCollectAndCleanUp(request_name)
        except KeyError:
            raise cherrypy.HTTPError(404, "Cannot find request: %s" % request_name)
        else:
            return rows([completed])
Example #17
0
class FinishedStatusInfo(RESTEntity):
    """
    REST entity reporting the result of WMStats'
    isWorkflowCompletedWithLogCollectAndCleanUp check for one request.

    Meant to move under the WMStats server once that server is created.
    """

    def __init__(self, app, api, config, mount):
        """
        Initialize the REST entity and create a WMStatsReader for the wmstats
        and reqmgr databases under the configured couch host.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        urlTemplate = "%s/%s"
        couchHost = self.config.couch_host
        wmstats_url = urlTemplate % (couchHost, self.config.couch_wmstats_db)
        reqdb_url = urlTemplate % (couchHost, self.config.couch_reqmgr_db)
        self.wmstats = WMStatsReader(wmstats_url, reqdbURL=reqdb_url, reqdbCouchApp="ReqMgr")

    def validate(self, apiobj, method, api, param, safe):
        """
        Move a single positional argument from `param.args` to `safe.args`;
        any other number of arguments is left untouched.
        """
        positional = param.args
        if len(positional) == 1:
            safe.args.append(positional[0])
            positional.pop()

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self, request_name):
        """
        Return the completion check result for `request_name`, wrapped in rows().

        A KeyError from the reader is turned into an HTTP 404 error.
        """
        checkCompleted = self.wmstats.isWorkflowCompletedWithLogCollectAndCleanUp
        try:
            outcome = checkCompleted(request_name)
        except KeyError:
            message = "Cannot find request: %s" % request_name
            raise cherrypy.HTTPError(404, message)

        return rows([outcome])
Example #18
0
 def setup(self, parameters):
     """
     Called at startup.

     Creates the WMStats reader/writer handles from the TaskArchiver
     configuration, picks the deletable states depending on whether ReqMgr
     is used for the completion check, and connects to the jobs, fwjrs and
     summary-stats couch databases named in the JobStateMachine configuration.
     """
     archiverConfig = self.config.TaskArchiver
     self.useReqMgrForCompletionCheck = getattr(archiverConfig, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(archiverConfig.localWMStatsURL)
     self.centralCouchDBReader = WMStatsReader(archiverConfig.centralWMStatsURL)

     if not self.useReqMgrForCompletionCheck:
         # Tier0 case: the local writer doubles as the central one
         self.deletableStates = ["completed"]
         self.centralCouchDBWriter = self.wmstatsCouchDB
     else:
         self.deletableStates = ["announced"]
         self.centralCouchDBWriter = WMStatsWriter(archiverConfig.centralWMStatsURL)
         self.reqmgrSvc = RequestManager({'endpoint': archiverConfig.ReqMgrServiceURL})

     stateMachineConfig = self.config.JobStateMachine
     jobDBurl = sanitizeURL(stateMachineConfig.couchurl)['url']
     jobDBName = stateMachineConfig.couchDBName
     self.jobCouchdb = CouchServer(jobDBurl)
     connect = self.jobCouchdb.connectDatabase
     self.jobsdatabase = connect("%s/jobs" % jobDBName)
     self.fwjrdatabase = connect("%s/fwjrs" % jobDBName)

     self.statsumdatabase = connect(stateMachineConfig.summaryStatsDBName)
Example #19
0
    def gatherActiveDataStats(self, config):
        """
        Update the DataCache with the latest job info of the requests in
        ACTIVE_STATUS if the cached job data has expired; otherwise do nothing.

        Any exception is logged as an error and not re-raised.
        """
        try:
            if not DataCache.islatestJobDataExpired():
                return
            requestReader = RequestDBReader(config.requestDBURL)
            statsReader = WMStatsReader(config.wmstatsURL)

            activeNames = requestReader.getRequestByStatus(ACTIVE_STATUS)
            DataCache.setlatestJobData(statsReader.getLatestJobInfoByRequests(activeNames))
        except Exception as err:
            self.logger.error(str(err))
        return
Example #20
0
class WMStatsInfo(RESTEntity):
    """
    REST entity returning the WMStats request summary, with job info, of one request.

    Meant to move under the WMStats server once that server is created.
    """

    def __init__(self, app, api, config, mount):
        """
        Initialize the REST entity and create a WMStatsReader for the wmstats
        and reqmgr databases under the configured couch host.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        couchHost = self.config.couch_host
        self.wmstats = WMStatsReader("%s/%s" % (couchHost, self.config.couch_wmstats_db),
                                     reqdbURL="%s/%s" % (couchHost, self.config.couch_reqmgr_db),
                                     reqdbCouchApp="ReqMgr")

    def validate(self, apiobj, method, api, param, safe):
        """
        Require exactly one positional argument and move it from `param.args`
        to `safe.args`; raise MethodWithoutQueryString otherwise.
        """
        if len(param.args) != 1:
            raise MethodWithoutQueryString
        safe.args.append(param.args[0])
        param.args.pop()

    @restcall(formats = [('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self, request_name):
        """Return the request summary with job info for `request_name`, wrapped in rows()."""
        summary = self.wmstats.getRequestSummaryWithJobInfo(request_name)
        return rows([summary])
Example #21
0
class JobDetailInfo(RESTEntity):
    """
    REST entity returning the per-task job summary of one request.

    Meant to move under the WMStats server once that server is created.
    """

    def __init__(self, app, api, config, mount, t0flag=False):
        """
        Initialize the REST entity and create its WMStatsReader.

        The request couch app is "T0Request" when `t0flag` is true and
        "ReqMgr" otherwise.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        couchHost = self.config.couch_host
        wmstats_url = "%s/%s" % (couchHost, self.config.couch_wmstats_db)
        reqdb_url = "%s/%s" % (couchHost, self.config.couch_reqmgr_db)
        couchAppName = "T0Request" if t0flag else "ReqMgr"
        self.wmstats = WMStatsReader(wmstats_url, reqdbURL=reqdb_url, reqdbCouchApp=couchAppName)

    def validate(self, apiobj, method, api, param, safe):
        """
        Move a single positional argument to `safe.args` and the integer
        'sample_size' keyword (default 1) to `safe.kwargs`.
        """
        if len(param.args) == 1:
            safe.args.append(param.args[0])
            param.args.pop()

        prop = 'sample_size'
        # convert first, so a bad value fails before the raw kwarg is removed
        safe.kwargs[prop] = int(param.kwargs.get(prop, 1))
        param.kwargs.pop(prop, None)

    @restcall(formats=[('text/plain', PrettyJSONFormat()), ('text/html', PrettyJSONHTMLFormat()), ('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self, request_name, sample_size):
        """Return the task job summary of `request_name` for `sample_size`, wrapped in rows()."""
        return rows([self.wmstats.getTaskJobSummaryByRequest(request_name, sample_size)])
Example #22
0
 def setUp(self):
     """
     _setUp_

     Set up logging, the database connection and schema, install the
     WMStats couch app into the 'wmstats_t' database and create the WMStats
     writer and reader with an empty default stale option on the reader.
     """
     self.schema = []
     self.couchApps = ["WMStats"]
     testInit = TestInitCouchApp('WorkQueueServiceTest')
     self.testInit = testInit
     testInit.setLogging()
     testInit.setDatabaseConnection()
     testInit.setSchema(customModules=self.schema, useDefault=False)
     couchDBName = 'wmstats_t'
     testInit.setupCouch(couchDBName, *self.couchApps)
     self.wmstatsWriter = WMStatsWriter(testInit.couchUrl, couchDBName)
     self.wmstatsReader = WMStatsReader(testInit.couchUrl, couchDBName)
     self.wmstatsReader.defaultStale = {}
     return
Example #23
0
class ActiveRequestJobInfo(RESTEntity):
    """
    REST entity returning the active request data with job information.

    Meant to move under the WMStats server once that server is created.
    """
    def __init__(self, app, api, config, mount):
        """
        Initialize the REST entity and create a WMStatsReader for the wmstats
        and reqmgr databases under the configured couch host.
        """
        # main CouchDB database where requests/workloads are stored
        RESTEntity.__init__(self, app, api, config, mount)
        wmstats_url = "%s/%s" % (self.config.couch_host,
                                 self.config.couch_wmstats_db)
        reqdb_url = "%s/%s" % (self.config.couch_host,
                               self.config.couch_reqmgr_db)
        # the request DB URL is passed by keyword so it cannot be bound to
        # another positional parameter of the reader (e.g. appName)
        self.wmstats = WMStatsReader(wmstats_url,
                                     reqdbURL=reqdb_url,
                                     reqdbCouchApp="ReqMgr")

    def validate(self, apiobj, method, api, param, safe):
        """Accept the call as is; no argument is moved to `safe`."""
        return

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self):
        """
        Return the cached job data, wrapped in rows().

        When the cache holds nothing or has expired, the active data is
        fetched directly from WMStats instead.
        """
        results = DataCache.getlatestJobData()
        if results is None or DataCache.islatestJobDataExpired():
            results = self.wmstats.getActiveData(jobInfoFlag=True)
        return rows([results])
def gatherActiveDataStats():
    """
    Refresh the DataCache from the cmsweb WMStats and ReqMgr couch databases
    when the cached job data has expired, printing the outcome and the
    elapsed time of the cycle.

    Active data is requested without job information. Any exception is
    printed together with its traceback and not re-raised.
    """
    wmstats_url = "https://cmsweb.cern.ch/couchdb/wmstats"
    reqmgrdb_url = "https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache"
    jobInfoFlag = False
    tStart = time.time()
    try:
        if not DataCache.islatestJobDataExpired():
            print("DataCache is up-to-date")
        else:
            reader = WMStatsReader(wmstats_url, reqdbURL=reqmgrdb_url, reqdbCouchApp="ReqMgr")
            activeData = reader.getActiveData(jobInfoFlag=jobInfoFlag)
            DataCache.setlatestJobData(activeData)
            print("DataCache is updated: {}".format(len(activeData)))
    except Exception as err:
        failureReport = "Exception updating cache. Details: {}\nTraceback: {}".format(
            str(err), str(traceback.format_exc()))
        print(failureReport)
    elapsed = time.time() - tStart
    print("Total time executing this cycle: {}".format(elapsed))
Example #25
0
    def moveToArchived(self, config):
        """
        Archive aborted and rejected workflows that have no jobs.

        For each of the "aborted" and "rejected" statuses, the matching requests
        are fetched from WMStats together with their job information. Every
        request whose job summary reports zero total jobs is moved, in order,
        through the follow-up statuses configured for its current status.

        :param config: object providing the `wmstats_url` and `reqmgrdb_url` attributes
        :return: None
        """

        testbedWMStats = WMStatsReader(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url)
        reqdbWriter = RequestDBWriter(config.reqmgrdb_url)

        # current status -> ordered list of statuses to apply to the request
        statusTransition = {
            "aborted": ["aborted-completed", "aborted-archived"],
            "rejected": ["rejected-archived"]
        }

        for status, nextStatusList in statusTransition.items():

            requests = testbedWMStats.getRequestByStatus([status],
                                                         jobInfoFlag=True,
                                                         legacyFormat=True)

            # arguments are handed to the logger so formatting only happens
            # when the record is actually emitted
            self.logger.info("checking %s workflows: %d",
                             status, len(requests))

            if len(requests) > 0:

                requestCollection = RequestInfoCollection(requests)

                requestsDict = requestCollection.getData()
                numOfArchived = 0

                for requestName, requestInfo in requestsDict.items():

                    # only requests without any job are eligible for archival
                    if requestInfo.getJobSummary().getTotalJobs() == 0:
                        for nextStatus in nextStatusList:
                            reqdbWriter.updateRequestStatus(
                                requestName, nextStatus)
                        numOfArchived += 1

                # NOTE: the "archieved" spelling is kept so existing log searches still match
                self.logger.info("Total %s-archieved: %d",
                                 status, numOfArchived)

        return
Example #26
0
 def setUp(self):
     """
     _setUp_

     Set up logging, the database connection and schema, install the
     WMStats and ReqMgr couch apps into their test databases and create the
     request DB writer and the WMStats reader, with empty default stale
     options on the reader and on its request DB.
     """
     self.schema = []
     self.couchApps = ["WMStats"]
     testInit = TestInitCouchApp('WorkQueueServiceTest')
     self.testInit = testInit
     testInit.setLogging()
     testInit.setDatabaseConnection()
     testInit.setSchema(customModules=self.schema, useDefault=False)
     statsDBName = 'wmstats_t'
     testInit.setupCouch(statsDBName, "WMStats")
     requestDBName = "reqmgrdb_t"
     testInit.setupCouch(requestDBName, "ReqMgr")
     urlTemplate = "%s/%s"
     wmstatsURL = urlTemplate % (testInit.couchUrl, statsDBName)
     reqDBURL = urlTemplate % (testInit.couchUrl, requestDBName)
     self.reqDBWriter = RequestDBWriter(reqDBURL)
     self.wmstatsReader = WMStatsReader(wmstatsURL, reqdbURL=reqDBURL)
     self.wmstatsReader.defaultStale = {}
     self.wmstatsReader.reqDB.defaultStale = {}
     return
Example #27
0
 def gatherActiveDataStats(self, config):
     """
     Refresh the DataCache when the cached job data has expired.

     Active data is fetched twice: for the WMSTATS_JOB_INFO statuses with
     the job info flag taken from `self.getJobInfo`, and for the
     WMSTATS_NO_JOB_INFO statuses without job info; the second result is
     merged into the first before the cache is set. Exceptions are logged
     with their traceback and not re-raised; the total time is always logged.
     """
     self.logger.info("Starting gatherActiveDataStats with jobInfo set to: %s", self.getJobInfo)
     try:
         tStart = time.time()
         cacheExpired = DataCache.islatestJobDataExpired()
         if cacheExpired:
             reader = WMStatsReader(config.wmstats_url, reqdbURL=config.reqmgrdb_url,
                                    reqdbCouchApp="ReqMgr", logger=self.logger)
             self.logger.info("Getting active data with job info for statuses: %s", WMSTATS_JOB_INFO)
             activeData = reader.getActiveData(WMSTATS_JOB_INFO, jobInfoFlag=self.getJobInfo)
             self.logger.info("Getting active data with NO job info for statuses: %s", WMSTATS_NO_JOB_INFO)
             activeData.update(reader.getActiveData(WMSTATS_NO_JOB_INFO, jobInfoFlag=False))
             self.logger.info("Running setlatestJobData...")
             DataCache.setlatestJobData(activeData)
             self.logger.info("DataCache is up-to-date with %d requests data", len(activeData))
     except Exception as err:
         self.logger.exception("Exception updating DataCache. Error: %s", str(err))
     elapsed = time.time() - tStart
     self.logger.info("Total time loading data from ReqMgr2 and WMStats: %s", elapsed)
     return
Example #28
0
 def setup(self, parameters):
     """
     Called at startup.

     Creates the local and central WMStats handles from the TaskArchiver
     configuration and connects to the jobs and fwjrs couch databases named
     in the JobStateMachine configuration.
     """
     archiverConfig = self.config.TaskArchiver
     self.useReqMgrForCompletionCheck = getattr(archiverConfig, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(archiverConfig.localWMStatsURL)
     self.centralCouchDBWriter = WMStatsWriter(archiverConfig.centralWMStatsURL)
     self.centralCouchDBReader = WMStatsReader(archiverConfig.centralWMStatsURL)

     stateMachineConfig = self.config.JobStateMachine
     jobDBurl = sanitizeURL(stateMachineConfig.couchurl)['url']
     jobDBName = stateMachineConfig.couchDBName
     self.jobCouchdb = CouchServer(jobDBurl)
     connect = self.jobCouchdb.connectDatabase
     self.jobsdatabase = connect("%s/jobs" % jobDBName)
     self.fwjrdatabase = connect("%s/fwjrs" % jobDBName)
def getAssignedApprovedWork():
    """
    Split the un-split, using a local couch for it.

    Workflows in 'assignment-approved' status are read from WMStats; every
    one whose name does not contain 'TEST' is queued in the work queue. The
    jobs of the work queue elements are then summed per request, for all
    requests read.

    :return: dict mapping each request name to its total number of jobs
    """
    wmstatsReader = WMStatsReader(wmstatsEndpoint)
    unAssignedRequests = wmstatsReader.workflowsByStatus(['assignment-approved'], stale=False)

    workqueue = queueFromConfig(queueConfigFromConfigObject(workqueueConfig()))

    for requestName in unAssignedRequests:
        if 'TEST' not in requestName:
            specUrl = '%s/reqmgr_workload_cache/%s/spec' % (externalCouchDb, requestName)
            workqueue.queueWork(specUrl, requestName, 'notreallyateam')

    workStatistics = {}
    for requestName in unAssignedRequests:
        jobCount = 0
        for element in workqueue.backend.getElementsForWorkflow(requestName):
            jobCount += element['Jobs']
        workStatistics[requestName] = jobCount

    return workStatistics
Example #30
0
    def __init__(self, config):
        """
        Initialize the worker thread from the agent configuration.

        Stores the CPU/IO task type lists and minimum slot counts, reads the
        AgentStatusWatcher settings (dashboard and metric columns, percentages,
        forced-down sites, SSB-only flag, T1 cores) and creates the Dashboard,
        ResourceControl and WMStatsReader handles.

        :param config: agent configuration object exposing the
            AgentStatusWatcher and Agent sections read below
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        settings = config.AgentStatusWatcher

        # dashboard url and metric columns
        self.dashboard = settings.dashboard
        self.siteStatusMetric = settings.siteStatusMetric
        self.cpuBoundMetric = settings.cpuBoundMetric
        self.ioBoundMetric = settings.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # pending/running percentages
        self.pendingSlotsSitePercent = settings.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = settings.pendingSlotsTaskPercent
        self.runningExpressPercent = settings.runningExpressPercent
        self.runningRepackPercent = settings.runningRepackPercent

        # sites forced to down (optional setting)
        self.forceSiteDown = getattr(settings, 'forceSiteDown', [])

        # agent team and default number of agents per team (optional setting)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(settings, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = settings.onlySSB

        # tier0 mode is flagged by the presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = settings.t1SitesCores

        # component on/off switch (optional setting, on by default)
        self.enabled = getattr(settings, 'enabled', True)

        # resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
Example #31
0
 def setUp(self):
     """
     _setUp_

     Set up logging, the database connection and schema, install the
     WMStats couch app into the 'wmstats_t' database and create the WMStats
     writer and reader with an empty default stale option on the reader.
     """
     self.schema = []
     self.couchApps = ["WMStats"]
     initializer = TestInitCouchApp('WorkQueueServiceTest')
     self.testInit = initializer
     initializer.setLogging()
     initializer.setDatabaseConnection()
     initializer.setSchema(customModules=self.schema, useDefault=False)
     dbName = 'wmstats_t'
     initializer.setupCouch(dbName, *self.couchApps)
     connectionArgs = (initializer.couchUrl, dbName)
     self.wmstatsWriter = WMStatsWriter(*connectionArgs)
     self.wmstatsReader = WMStatsReader(*connectionArgs)
     self.wmstatsReader.defaultStale = {}
     return
Example #32
0
 def setUp(self):
     """
     _setUp_

     Set up logging, the database connection and schema, install the
     WMStats and ReqMgr couch apps into their test databases and create the
     request DB writer and the WMStats reader, with empty default stale
     options on the reader and on its request DB.
     """
     self.schema = []
     self.couchApps = ["WMStats"]
     self.testInit = TestInitCouchApp("WorkQueueServiceTest")
     self.testInit.setLogging()
     self.testInit.setDatabaseConnection()
     self.testInit.setSchema(customModules=self.schema, useDefault=False)
     dbName = "wmstats_t"
     self.testInit.setupCouch(dbName, "WMStats")
     reqDBName = "reqmgrdb_t"
     self.testInit.setupCouch(reqDBName, "ReqMgr")
     wmstatsURL = "%s/%s" % (self.testInit.couchUrl, dbName)
     reqDBURL = "%s/%s" % (self.testInit.couchUrl, reqDBName)
     self.reqDBWriter = RequestDBWriter(reqDBURL)
     # the request DB URL is passed by keyword so it cannot be bound to
     # another positional parameter of the reader (e.g. appName); reqDB is
     # dereferenced right below, so it must really be configured
     self.wmstatsReader = WMStatsReader(wmstatsURL, reqdbURL=reqDBURL)
     self.wmstatsReader.defaultStale = {}
     self.wmstatsReader.reqDB.defaultStale = {}
     return
Example #33
0
class TeamInfo(RESTEntity):
    """
    REST entity returning the teams found in the WMStats agentsByTeam view.

    This class needs to move under the WMStats server once that server is
    created.
    """
    def __init__(self, app, api, config, mount):
        RESTEntity.__init__(self, app, api, config, mount)
        # WMStats couch database URL: <couch_host>/<couch_wmstats_db>
        wmstatsDB = "%s/%s" % (self.config.couch_host, self.config.couch_wmstats_db)
        self.wmstats = WMStatsReader(wmstatsDB)

    def validate(self, apiobj, method, api, param, safe):
        """
        When exactly one positional argument was given, move it from
        param.args to safe.args; otherwise leave both untouched.
        """
        if len(param.args) != 1:
            return
        safe.args.append(param.args[0])
        param.args.pop()
        return

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self):
        """
        Return the keys of the agentsByTeam view (drain filter disabled)
        wrapped by rows().
        """
        teams = self.wmstats.agentsByTeam(filterDrain=False)
        return rows(teams.keys())
Example #34
0
class TeamInfo(RESTEntity):
    """
    REST entity returning the WMStats agentsByTeam view.

    This class needs to move under the WMStats server once that server is
    created.
    """
    def __init__(self, app, api, config, mount):
        RESTEntity.__init__(self, app, api, config, mount)
        # WMStats couch database URL: <couch_host>/<couch_wmstats_db>
        wmstatsDB = "%s/%s" % (self.config.couch_host, self.config.couch_wmstats_db)
        self.wmstats = WMStatsReader(wmstatsDB)

    def validate(self, apiobj, method, api, param, safe):
        """
        When exactly one positional argument was given, move it from
        param.args to safe.args; otherwise leave both untouched.
        """
        if len(param.args) != 1:
            return
        safe.args.append(param.args[0])
        param.args.pop()
        return

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self):
        """
        Return the agentsByTeam view result (drain filter disabled)
        wrapped by rows().
        """
        agentsPerTeam = self.wmstats.agentsByTeam(filterDrain=False)
        return rows(agentsPerTeam)
Example #35
0
class ActiveRequestJobInfo(RESTEntity):
    """
    REST entity returning job information for the active requests.

    This class needs to move under the WMStats server once that server is
    created.
    """
    def __init__(self, app, api, config, mount):
        RESTEntity.__init__(self, app, api, config, mount)
        # couch database URLs: <couch_host>/<database name>
        wmstats_url = "%s/%s" % (self.config.couch_host, self.config.couch_wmstats_db)
        reqdb_url = "%s/%s" % (self.config.couch_host, self.config.couch_reqmgr_db)
        self.wmstats = WMStatsReader(wmstats_url, reqdb_url, reqdbCouchApp="ReqMgr")

    def validate(self, apiobj, method, api, param, safe):
        """
        No-op: nothing is copied from param to safe.
        """
        return

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self):
        """
        Return the latest job data held by DataCache; when there is none, or
        DataCache reports it as expired, query WMStats for the active data
        (with job information) instead. The result is wrapped in a
        one-element list before being handed to rows().
        """
        results = DataCache.getlatestJobData()
        # identity check: None is a singleton and must not go through __eq__
        if results is None or DataCache.islatestJobDataExpired():
            results = self.wmstats.getActiveData(jobInfoFlag=True)
        return rows([results])
Example #36
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Initialize the updater from the agent configuration.

        Copies the AgentStatusWatcher and Agent settings onto the instance
        and creates the Dashboard, ResourceControl and WMStatsReader objects
        used by the other methods.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types treated as CPU or IO tasks, and the slot floor of each group
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        watcherCfg = config.AgentStatusWatcher

        # dashboard url and metric columns
        self.dashboard = watcherCfg.dashboard
        self.siteStatusMetric = watcherCfg.siteStatusMetric
        self.cpuBoundMetric = watcherCfg.cpuBoundMetric
        self.ioBoundMetric = watcherCfg.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # pending/running percentages
        self.pendingSlotsSitePercent = watcherCfg.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherCfg.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherCfg.runningExpressPercent
        self.runningRepackPercent = watcherCfg.runningRepackPercent

        # optional list of sites that must always be Down
        self.forceSiteDown = getattr(watcherCfg, 'forceSiteDown', [])

        # team of this agent and the fallback number of agents per team
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherCfg, 'defaultAgentsNumByTeam', 5)

        # whether sites unknown to SSB are forced to Down
        self.onlySSB = watcherCfg.onlySSB

        # tier0 mode is enabled by the mere presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherCfg.t1SitesCores

        # component on/off switch (on unless configured otherwise)
        self.enabled = getattr(watcherCfg, 'enabled', True)

        self.resourceControl = ResourceControl(config=self.config)
        self.centralCouchDBReader = WMStatsReader(watcherCfg.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        One polling cycle: bring the resource control database in line with
        the information published in SSB.
            1. Read the site slots from resource control
            2. Read the site states from SSB and apply state changes
            3. Read the CPU/IO slots from SSB
            4. Refresh the number of agents in this team (WMStats)
            5. Apply site (and task) threshold changes

        Does nothing when the component is disabled. Any exception raised
        during the cycle is logged and swallowed.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            rcSites = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", rcSites)

            # site states are handled before any threshold work
            siteStates = self.getSiteStatus()
            self.checkStatusChanges(rcSites, siteStates)

            ssbSlots = self.getInfoFromSSB()
            if ssbSlots:
                logging.debug("Info from SSB: %s", ssbSlots)
                # agents in the same team (not in DrainMode)
                self.getAgentsByTeam()
                # update the slots that differ from resource control
                self.checkSlotsChanges(rcSites, ssbSlots)
            else:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh self.agentsNumByTeam from the WMStats agentsByTeam view.

        In drain mode the value is forced to 1. When the view cannot be read
        or comes back empty, the current value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        try:
            teamView = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            teamView = {}
            logging.error("WMStats is not available or is unresponsive.")

        if teamView:
            # a team absent from the view keeps the current value
            self.agentsNumByTeam = teamView.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        else:
            logging.warning("agentInfo couch view is not available, use default value %s",
                            self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Fetch the CPU bound and IO bound metrics from the dashboard (SSB)
        and return whatever thresholdsByVOName builds out of them.
        """
        cpuMetric = self.ssb.getMetric(self.cpuBoundMetric)
        ioMetric = self.ssb.getMetric(self.ioBoundMetric)
        return self.thresholdsByVOName(cpuMetric, ioMetric)

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Decide which sites need a state change in resource control:
          1. sites listed in forceSiteDown are set to Down and removed from
             infoSSB (the caller's dict is modified)
          2. with onlySSB set, sites known to resource control but absent
             from infoSSB are set to Down
          3. sites present in both get the SSB state when it differs
        """
        # forced sites first (HLT @FNAL is an example)
        for forcedSite in self.forceSiteDown:
            if forcedSite in infoRC and infoRC[forcedSite]['state'] != 'Down':
                logging.info("Forcing site %s to Down", forcedSite)
                self.updateSiteState(forcedSite, 'Down')
            infoSSB.pop(forcedSite, None)

        rcNames = set(infoRC)
        ssbNames = set(infoSSB)

        if self.onlySSB:
            # everything SSB does not know about goes Down
            for site in rcNames.difference(ssbNames):
                if infoRC[site]['state'] == 'Down':
                    continue
                logging.info('Only SSBsites, forcing site %s to Down', site)
                self.updateSiteState(site, 'Down')

        # remaining sites simply follow the SSB state
        for site in rcNames.intersection(ssbNames):
            currentState = infoRC[site]['state']
            wantedState = infoSSB[site]['state']
            if currentState != wantedState:
                logging.info('Changing %s state from %s to %s', site, currentState, wantedState)
                self.updateSiteState(site, wantedState)
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        Only sites present in both infoRC and infoSSB are considered. The
        adjusted slot values are written back into infoSSB (the caller's
        dict is modified). Task level thresholds are checked for every such
        site through checkTaskSlotsChanges.
        """
        logging.debug(
            "Settings for site and task pending slots: %s%% and %s%%",
            self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            ssbSlots = infoSSB[site]
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0: keep t1SitesCores percent of
                # the slots. Multiply before dividing and truncate, so the
                # result is a whole number of slots (dividing first gives 0
                # under integer division and a float otherwise).
                ssbSlots['slotsCPU'] = int(ssbSlots['slotsCPU'] * self.t1SitesCores / 100)
                ssbSlots['slotsIO'] = int(ssbSlots['slotsIO'] * self.t1SitesCores / 100)
            else:
                # round very small sites to the bare minimum
                ssbSlots['slotsCPU'] = max(ssbSlots['slotsCPU'], self.minCPUSlots)
                ssbSlots['slotsIO'] = max(ssbSlots['slotsIO'], self.minIOSlots)
            CPUBound = ssbSlots['slotsCPU']
            IOBound = ssbSlots['slotsIO']

            # pending slots are shared among the agents of the team
            sitePending = max(
                int(CPUBound / self.agentsNumByTeam *
                    self.pendingSlotsSitePercent / 100), self.minCPUSlots)

            # update site slots, if needed
            if (infoRC[site]['running_slots'] != CPUBound or
                    infoRC[site]['pending_slots'] != sitePending):
                logging.info(
                    "Updating %s site thresholds for pend/runn: %d/%d", site,
                    sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(
                    site,
                    pendingJobSlots=sitePending,
                    runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.

        :param infoCpu: iterable of dicts with 'VOName' and 'Value' keys (CPU slots)
        :param infoIo: iterable of dicts with 'VOName' and 'Value' keys (IO slots)
        :return: dict {site: {'slotsCPU': int, 'slotsIO': int}}
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning(
                    'Site %s has invalid CPU thresholds in SSB. Taking no action',
                    entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']] = {
                    'slotsCPU': int(entry['Value'])
                }

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['Value'] is None:
                logging.warning(
                    'Site %s has invalid IO thresholds in SSB. Taking no action',
                    entry['VOName'])
            else:
                # the site may have no valid CPU entry; create it on demand
                # instead of failing, it is dropped as incomplete below
                siteSlots = ssbSiteSlots.setdefault(entry['VOName'], {})
                siteSlots['slotsIO'] = int(entry['Value'])

        # Before proceeding, remove sites without both metrics.
        # Iterate over a copy of the keys since entries are removed on the way.
        for site in list(ssbSiteSlots):
            if len(ssbSiteSlots[site]) != 2:
                logging.warning("Site: %s has incomplete SSB metrics, see %s",
                                site, ssbSiteSlots[site])
                ssbSiteSlots.pop(site)

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Read the site status metric from SSB and translate every status into
        the agent state via getState.

        Returns a dict {site: {'state': agentState}}. Entries with an
        untranslatable status are logged and left out; a second entry for a
        site already stored is logged and ignored.
        """
        ssbSiteState = {}
        for record in self.ssb.getMetric(self.siteStatusMetric):
            voname, status = record['VOName'], record['Status']
            if voname in ssbSiteState:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)
                continue

            statusAgent = self.getState(str(status))
            if statusAgent:
                ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.error("Unknown status '%s' for site %s, please check SSB",
                              status, voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translate an SSB state into the resource control state.
        Returns None for a state that is not known.
        """
        # 'test' state behaviour varies between production and tier0 agents
        if self.tier0Mode:
            testState = 'Normal'
        else:
            testState = "Draining"

        translation = {
            'enabled': 'Normal',
            'drain': 'Draining',
            'disabled': 'Down',
            'test': testState,
        }
        return translation.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Change only the state of the given site in the resource control
        database. A failure is logged (message, error and traceback) and
        not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
            return
        except Exception as ex:
            errorDetails = str(ex)
            stackTrace = traceback.format_exc()

        logging.error("Failed to update %s state to %s:", siteName, state)
        logging.error(errorDetails)
        logging.error("Traceback: \n%s", stackTrace)
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the task thresholds of a site when needed.

        The pending slots of the first task threshold returned by resource
        control are compared with the freshly computed value for its group
        (CPU or IO tasks); when they differ, the thresholds of all CPU and
        all IO task types are rewritten. In tier0 mode the Express and
        Repack thresholds are written on every call.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)

        def pendingFor(bound, floor):
            # share of the bound for one agent of the team, never below the floor
            return max(int(bound / self.agentsNumByTeam *
                           self.pendingSlotsTaskPercent / 100), floor)

        taskCPUPending = pendingFor(CPUBound, self.minCPUSlots)
        taskIOPending = pendingFor(IOBound, self.minIOSlots)

        firstTask = siteTaskSlots[0]
        cpuOutdated = (firstTask['task_type'] in self.tasksCPU and
                       firstTask['task_pending_slots'] != taskCPUPending)
        updateTasks = cpuOutdated or (
            firstTask['task_type'] in self.tasksIO and
            firstTask['task_pending_slots'] != taskIOPending)

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d",
                         siteName, taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksCPU,
                                                 maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d",
                         siteName, taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksIO,
                                                 maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if not self.tier0Mode:
            return

        # Set task thresholds for Tier0
        logging.debug("Updating %s Express and Repack task thresholds.", siteName)
        taskPercent = self.pendingSlotsTaskPercent

        expressSlots = int(CPUBound * self.runningExpressPercent / 100)
        pendingExpress = int(expressSlots * taskPercent / 100)
        self.resourceControl.insertThreshold(siteName, 'Express',
                                             expressSlots, pendingExpress)

        repackSlots = int(CPUBound * self.runningRepackPercent / 100)
        pendingRepack = int(repackSlots * taskPercent / 100)
        self.resourceControl.insertThreshold(siteName, 'Repack',
                                             repackSlots, pendingRepack)
Example #37
0
 def __init__(self, app, api, config, mount):
     """
     Initialize the REST entity and create the WMStats reader, pointing it
     at the wmstats and reqmgr couch databases of the configured couch host.
     """
     RESTEntity.__init__(self, app, api, config, mount)

     def couchDBUrl(dbName):
         # <couch_host>/<database name>
         return "%s/%s" % (self.config.couch_host, dbName)

     self.wmstats = WMStatsReader(couchDBUrl(self.config.couch_wmstats_db),
                                  reqdbURL=couchDBUrl(self.config.couch_reqmgr_db),
                                  reqdbCouchApp="ReqMgr")
Example #38
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Initialize the worker thread, keep the configuration and load
        the component variables from it.
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # copy the settings onto the instance (repeated on every cycle)
        self.setVariables(self.config)
        
    def setVariables(self, config):
        """
        Copy the settings used by this component from the configuration
        object onto the instance.
        """
        watcherCfg = config.AgentStatusWatcher

        # dashboard url and metric columns
        self.dashboard = watcherCfg.dashboard
        self.siteStatusMetric = watcherCfg.siteStatusMetric
        self.cpuBoundMetric = watcherCfg.cpuBoundMetric
        self.ioBoundMetric = watcherCfg.ioBoundMetric

        # pending/running percentages
        self.pendingSlotsSitePercent = watcherCfg.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherCfg.pendingSlotsTaskPercent
        self.runningExpressPercentCPUBound = watcherCfg.runningExpressPercentCPUBound
        self.runningRepackPercentIOBound = watcherCfg.runningRepackPercentIOBound

        # forced site list
        self.forcedSiteList = watcherCfg.forcedSiteList

        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.queueParams = config.WorkQueueManager.queueParams

        # only SSB sites
        self.onlySSB = watcherCfg.onlySSB

        # tier0 mode is enabled by the mere presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        
    def setup(self, parameters):
        """
        Create the resource control interface and the WMStats reader, and
        reset the agents-by-team cache.

        The `parameters` argument is not used.
        """
        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

        # init variables
        self.agentsByTeam = {}

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site info about state and thresholds
            1. Get information from SSB
            2. Get information about teams and agents from WMStats
            3. Set site status and set thresholds for each valid site

        The `parameters` argument is not used. Any exception raised during
        the cycle is logged and swallowed, so the next cycle can retry.
        """
        try:
            # set variables every polling cycle
            self.setVariables(self.config)

            # Get sites in Resource Control
            currentSites = self.resourceControl.listCurrentSites()

            logging.debug("Starting algorithm, getting site info from SSB")
            stateBySite, slotsCPU, slotsIO = self.getInfoFromSSB()

            if not stateBySite or not slotsCPU or not slotsIO:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Setting status and thresholds for all sites, site pending: %s%%, task pending: %s%%",
                          self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

            if self.queueParams.get('DrainMode', False):
                agentsNum = 1
                logging.debug("This agent is in DrainMode, don't divide pending thresholds")
            else:
                # get number of agents working in the same team (not in DrainMode)
                agentsByTeam = self.centralCouchDBReader.agentsByTeam()
                if not agentsByTeam:
                    agentsNum = 1
                    logging.debug("agentInfo couch view is not available, don't divide pending thresholds")
                else:
                    self.agentsByTeam = agentsByTeam
                    # a team missing from the view, or listed with zero agents,
                    # counts as a single agent
                    agentsCount = [agentsByTeam.get(team) or 1
                                   for team in self.teamNames.split(',')]
                    # If agent is in several teams, we choose the team with less agents
                    agentsNum = min(agentsCount)
                    logging.debug("Number of agents not in DrainMode running in the same team: %s", agentsNum)

            # set site status and thresholds
            listSites = stateBySite.keys()
            if self.forcedSiteList:
                forcedNames = ', '.join(self.forcedSiteList)
                if set(self.forcedSiteList).issubset(set(listSites)):
                    logging.info("Forcing site list: %s", forcedNames)
                else:
                    logging.warning("Forcing site list: %s. Some site(s) are not in SSB", forcedNames)
                listSites = self.forcedSiteList

            for site in listSites:
                if site not in currentSites:
                    logging.debug("Site '%s' has not been added to Resource Control", site)
                    continue

                sitestate = stateBySite.get(site, 'Normal')
                # forced sites may be absent from SSB: treat them as sites
                # without slots information instead of failing the whole cycle
                cpuSlots = slotsCPU[site] if site in slotsCPU else None
                ioSlots = slotsIO[site] if site in slotsIO else None

                if not cpuSlots or not ioSlots:
                    if self.updateSiteInfo(site, sitestate, 0, 0, agentsNum):
                        logging.error('Setting site %s to %s, forcing CPUBound: 0, IOBound: 0 due to missing information in SSB',
                                      site, sitestate)
                    continue

                if self.updateSiteInfo(site, sitestate, cpuSlots, ioSlots, agentsNum):
                    logging.info('Setting site %s to %s, CPUBound: %s, IOBound: %s',
                                 site, sitestate, cpuSlots, ioSlots)

            # if onlySSB sites or forcedSiteList, force to down all the sites not in SSB/forcedSiteList
            if self.onlySSB or self.forcedSiteList:
                for site in set(currentSites).difference(set(listSites)):
                    # NOTE(review): agentsNum is not passed here, unlike the calls
                    # above -- presumably updateSiteInfo has a default; confirm
                    if self.updateSiteInfo(site, 'Down', 0, 0):
                        logging.info('Only SSBsites/forcedSiteList, forcing site %s to Down, CPUBound: 0, IOBound: 0', site)

            logging.info("Resource update is completed, waiting for the next cycle.\n")

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
Example #39
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Initialize the worker thread, keep the configuration and load
        the component variables from it.
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # copy the settings onto the instance (repeated on every cycle)
        self.setVariables(self.config)
        
    def setVariables(self, config):
        """
        Copy the settings used by this component from the configuration
        object onto the instance. Optional settings fall back to a default.
        """
        watcherCfg = config.AgentStatusWatcher

        # dashboard url and metric columns
        self.dashboard = watcherCfg.dashboard
        self.siteStatusMetric = watcherCfg.siteStatusMetric
        self.cpuBoundMetric = watcherCfg.cpuBoundMetric
        self.ioBoundMetric = watcherCfg.ioBoundMetric

        # pending/running percentages
        self.pendingSlotsSitePercent = watcherCfg.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherCfg.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherCfg.runningExpressPercent
        self.runningRepackPercent = watcherCfg.runningRepackPercent

        # optional list of sites that must always be Down
        self.forceSiteDown = getattr(watcherCfg, 'forceSiteDown', [])

        # agent teams and the fallback number of agents per team
        self.teamNames = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherCfg, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcherCfg.onlySSB

        # tier0 mode is enabled by the mere presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherCfg.t1SitesCores

        # component on/off switch (on unless configured otherwise)
        self.enabled = getattr(watcherCfg, 'enabled', True)
       
        
    def setup(self, parameters):
        """
        Create the resource control interface and the WMStats reader.

        The `parameters` argument is not used.
        """
        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
        
    def algorithm(self, parameters):
        """
        _algorithm_

        One polling cycle: bring the resource control database in line with
        the information published in SSB.
            1. Reload the variables from the configuration
            2. Read the site slots from resource control and the site info
               from SSB
            3. Apply site state changes
            4. Refresh the number of agents per team (WMStats)
            5. Apply site (and task) threshold changes

        Does nothing when the component is disabled. Any exception raised
        during the cycle is logged and swallowed.
        """
        # set variables every polling cycle
        self.setVariables(self.config)
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            rcSites = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", rcSites)

            ssbSites = self.getInfoFromSSB()
            if not ssbSites:
                # nothing usable from SSB, give up on this cycle
                return
            logging.debug("Info from SSB: %s", ssbSites)

            # site states first; the returned dict replaces the resource control info
            rcSites = self.checkStatusChanges(rcSites, ssbSites)

            # agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # finally the slots that differ from resource control
            self.checkSlotsChanges(rcSites, ssbSites, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")


    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and refresh
        self.agentsNumByTeam from it.

        When the view cannot be read or comes back empty, the current
        (default) value of self.agentsNumByTeam is kept. Otherwise the view
        is stored in self.agentsByTeam and self.agentsNumByTeam becomes the
        smallest agent count among the teams listed in self.teamNames
        (comma separated).
        """
        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam()
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.debug("agentInfo couch view is not available, use default value %s",
                          self.agentsNumByTeam)
            return

        self.agentsByTeam = agentsByTeam
        # a team missing from the view, or listed with zero agents, counts as
        # a single agent
        agentsCount = [agentsByTeam.get(team) or 1
                       for team in self.teamNames.split(',')]
        # If agent is in several teams, we choose the team with less agents
        self.agentsNumByTeam = min(agentsCount)
        logging.debug("Agents connected to the same team (not in DrainMode): %d",
                      self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name and the
        inner keys are 'state', 'slotsCPU' and 'slotsIO' (a slots value is
        None when the site is missing from that metric). An empty dict is
        returned when any of the three metrics yields nothing.
        """
        def fetchMetric(metricId):
            """Download one SSB metric column and return its 'csvdata' payload."""
            url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(metricId)
            response = urllib2.urlopen(url)
            try:
                # parse from json format to dictionary, get only 'csvdata'
                return json.loads(response.read())['csvdata']
            finally:
                # always release the connection, even when reading or parsing fails
                response.close()

        # get info from dashboard
        site_state = fetchMetric(self.siteStatusMetric)
        cpu_slots = fetchMetric(self.cpuBoundMetric)
        io_slots = fetchMetric(self.ioBoundMetric)

        # dictionaries with status/thresholds info by VOName
        stateBySite = self.siteStatusByVOName(site_state)
        slotsCPU = self.thresholdsByVOName(cpu_slots)
        slotsIO = self.thresholdsByVOName(io_slots)

        sitesSSB = {}
        if not stateBySite or not slotsCPU or not slotsIO:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
            return sitesSSB

        for site, state in stateBySite.items():
            sitesSSB[site] = {
                'state': state,
                'slotsCPU': slotsCPU[site] if site in slotsCPU else None,
                'slotsIO': slotsIO[site] if site in slotsIO else None,
            }
        return sitesSSB

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Decides which sites need a new state in resource control and
        applies it through self.updateSiteState. Three passes are made:
          1. every site in self.forceSiteDown is set to Down
          2. if self.onlySSB is set, every site unknown to SSB is set to Down
          3. sites known to both RC and SSB follow the SSB state

        Sites handled by passes 1 and 2 are removed from infoRC, so the
        returned (same, mutated) infoRC dict only holds sites whose slots
        may still need to be updated.
        """
        # Pass 1: forced sites go Down (HLT @FNAL is an example)
        for forcedSite in self.forceSiteDown:
            if forcedSite in infoRC:
                if infoRC[forcedSite]['state'] != 'Down':
                    logging.info("Forcing site %s to Down" % forcedSite)
                    self.updateSiteState(forcedSite, 'Down')
            infoRC.pop(forcedSite, None)

        # Pass 2: in onlySSB mode, whatever SSB does not know goes Down
        if self.onlySSB:
            for unknownSite in set(infoRC).difference(set(infoSSB)):
                alreadyDown = infoRC[unknownSite]['state'] == 'Down'
                if not alreadyDown:
                    logging.info('Only SSBsites, forcing site %s to Down' % unknownSite)
                    self.updateSiteState(unknownSite, 'Down')
                infoRC.pop(unknownSite, None)

        # Pass 3: keep these sites in infoRC, their slots are updated later
        for sharedSite in set(infoRC).intersection(set(infoSSB)):
            rcState = infoRC[sharedSite]['state']
            ssbState = infoSSB[sharedSite]['state']
            if rcState == ssbState:
                continue
            logging.info('Changing %s state from %s to %s' % (sharedSite, rcState, ssbState))
            self.updateSiteState(sharedSite, ssbState)
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.

        Only sites present in both infoRC and infoSSB are considered.
        infoSSB is updated in place with the effective (scaled and
        rounded-up) 'slotsCPU' and 'slotsIO' values. A slots value of
        None (site missing from an SSB metric) is treated as 0 and is
        therefore rounded up to the bare minimum.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug("Settings for site and task pending slots: %s%% and %s%%" %
                      (self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent))

        for site in set(infoRC).intersection(set(infoSSB)):
            ssbSite = infoSSB[site]
            # Missing thresholds are stored as None; count them as 0 so the
            # arithmetic and comparisons below never operate on None.
            CPUBound = ssbSite['slotsCPU'] or 0
            IOBound = ssbSite['slotsIO'] or 0

            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                CPUBound = CPUBound * self.t1SitesCores/100
                IOBound = IOBound * self.t1SitesCores/100

            # round very small sites to the bare minimum
            CPUBound = max(CPUBound, minCPUSlots)
            IOBound = max(IOBound, minIOSlots)
            ssbSite['slotsCPU'] = CPUBound
            ssbSite['slotsIO'] = IOBound

            # pending slots are a share of the per-agent running slots
            sitePending = max(int(CPUBound/agentsCount * self.pendingSlotsSitePercent/100), minCPUSlots)
            taskCPUPending = max(int(CPUBound/agentsCount * self.pendingSlotsTaskPercent/100), minCPUSlots)
            taskIOPending = max(int(IOBound/agentsCount * self.pendingSlotsTaskPercent/100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.debug("Updating %s site thresholds for pend/runn: %d/%d" % (site, sitePending, CPUBound))
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots = sitePending,
                                                        runningJobSlots = CPUBound)
                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d" % (site, taskCPUPending,
                                                                                        CPUBound))
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType = task, maxSlots = CPUBound,
                                                         pendingSlots = taskCPUPending)
                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d" % (site, taskIOPending,
                                                                                       IOBound))
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType = task, maxSlots = IOBound,
                                                         pendingSlots = taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0 (done on every call, even
                # when the site thresholds above did not change)
                logging.debug("Updating %s Express and Repack task thresholds." % site)
                expressSlots = int(CPUBound * self.runningExpressPercent/100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent/100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent/100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent/100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)


    def thresholdsByVOName(self, sites):
        """
        _thresholdsByVOName_

        Creates a dictionary with keys->VOName and values->threshold.

        sites is an iterable of dicts carrying 'VOName' and 'Value'.
        A Value of None is stored as 0 (with a warning); any other value
        is converted with int(). When a VOName shows up more than once
        only the first entry is kept and an error is logged.
        """
        thresholdbyVOName = {}
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in thresholdbyVOName:
                logging.error('I have a duplicated threshold entry in SSB for %s' % voname)
                continue
            if value is None:
                logging.warning('Site %s does not have thresholds in SSB, assuming 0' % voname)
                value = 0
            thresholdbyVOName[voname] = int(value)
        return thresholdbyVOName
    
    def siteStatusByVOName(self, sites):
        """
        _siteStatusByVOName_

        Builds a VOName -> resource control state mapping out of the SSB
        site status entries (dicts carrying 'VOName' and 'Status').

        Entries with an empty status, or with a status that self.getState
        cannot translate, are logged and left out. For duplicated VONames
        the first valid entry wins and an error is logged.
        """
        statusBySite = {}
        for entry in sites:
            voname, status = entry['VOName'], entry['Status']

            if not status:
                logging.error('Site %s does not have status in SSB' % voname)
                continue

            if voname in statusBySite:
                logging.error('I have a duplicated status entry in SSB for %s' % voname)
                continue

            translated = self.getState(str(status))
            if translated:
                statusBySite[voname] = translated
            else:
                logging.error("Unkwown status '%s' for site %s, please check SSB" % (status, voname))
        return statusBySite

    def getState(self, stateSSB):
        """
        _getState_

        Maps an SSB state string onto the matching resource control state.

        'on' -> 'Normal', 'drain' -> 'Draining', 'down'/'skip' -> 'Down'.
        'tier0' becomes 'Normal' when self.tier0Mode is set and 'Draining'
        otherwise. Any other value yields None.
        """
        ssb2agent = {'on': 'Normal',
                     'drain': 'Draining',
                     'down': 'Down',
                     'skip': 'Down'}

        agentState = ssb2agent.get(stateSSB)
        if agentState is not None:
            return agentState

        if stateSSB != "tier0":
            return None

        # tier0 sites are only usable by an agent running in Tier0 mode
        logging.debug('There is a site in tier0 status (Tier0Mode is %s)' % self.tier0Mode )
        return "Normal" if self.tier0Mode else "Draining"

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Changes the state of a single site in the resource control
        database via self.resourceControl.changeSiteState.

        Failures are not propagated: the error, its message and the
        traceback are written to the log and None is returned.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            failureReport = [
                "Failed to update %s state to %s:" % (siteName, state),
                str(ex),
                "Traceback: \n%s" % traceback.format_exc(),
            ]
            for line in failureReport:
                logging.error(line)
        return
Example #40
0
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according to the given condition.
    1. Cleans local couch db when request is completed and reported to central db.
       This will clean up local couchdb, local summary db, local queue

    2. Cleans old couchdoc which is created older than the time threshold

    """
    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # keep the configuration; the couch connections are made in setup()
        self.config = config

    def setup(self, parameters):
        """
        Called at startup

        Creates the local/central WMStats connections and the job and fwjr
        couch database handles, and picks the workflow states that are
        considered deletable.
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(self.config.TaskArchiver.centralWMStatsURL)

        if self.useReqMgrForCompletionCheck:
            self.deletableStates = ["announced"]
            self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)
        else:
            # Tier0 case: status updates are written to the local wmstats
            self.deletableStates = ["completed"]
            self.centralCouchDBWriter = self.wmstatsCouchDB

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)

    def algorithm(self, parameters):
        """
        Deletes old request docs from the local wmstats, then cleans the
        local couch data of deletable and aborted-completed workflows and
        moves them to their archived status.

        Any exception is logged and swallowed; the log states that the
        work is tried again on the next cycle.
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted" % report)
            logging.info("getting complete and announced requests")

            deletableWorkflows = self.centralCouchDBReader.workflowsByStatus(self.deletableStates)

            logging.info("Ready to delete %s" % deletableWorkflows)
            for workflowName in deletableWorkflows:
                # only change the status once the local cleanup succeeded
                if self.cleanAllLocalCouchDB(workflowName):
                    self.centralCouchDBWriter.updateRequestStatus(workflowName, "normal-archived")
                    logging.info("status updated to normal-archived %s" % workflowName)

            abortedWorkflows = self.centralCouchDBReader.workflowsByStatus(["aborted-completed"])
            logging.info("Ready to delete aborted %s" % abortedWorkflows)
            for workflowName in abortedWorkflows:
                if self.cleanAllLocalCouchDB(workflowName):
                    self.centralCouchDBWriter.updateRequestStatus(workflowName, "aborted-archived")
                    logging.info("status updated to aborted-archived %s" % workflowName)

            #TODO: following code is temporary - remove after production archived data is cleaned
            removableWorkflows = self.centralCouchDBReader.workflowsByStatus(["archived"])

            logging.info("Ready to delete %s from wmagent_summary" % removableWorkflows)
            for workflowName in removableWorkflows:
                logging.info("Deleting %s from WMAgent Summary Couch" % workflowName)
                report = self.deleteWorkflowFromJobCouch(workflowName, "WMStats")
                logging.info("%s docs deleted from wmagent_summary" % report)
                # only update the status when delete is successful
                # TODO: need to handle the case when there are multiple agent running the same request.
                if report["status"] == "ok":
                    self.centralCouchDBWriter.updateRequestStatus(workflowName, "normal-archived")
                    logging.info("status updated to normal-archived from archived (this is temp solution for production) %s" % workflowName)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")
Example #41
0
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according to the given condition.
    1. Cleans local couch db when request is completed and reported to central db.
       This will clean up local couchdb, local summary db, local queue

    2. Cleans old couchdoc which is created older than the time threshold

    """
    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # keep the configuration; the couch connections are made in setup()
        self.config = config

    def setup(self, parameters):
        """
        Called at startup

        Creates the local/central WMStats connections, the ReqMgr service
        (non-Tier0 case only) and the jobs, fwjrs and summary stats couch
        database handles, and picks the deletable workflow states.
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(
            self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(
            self.config.TaskArchiver.centralWMStatsURL)

        if self.useReqMgrForCompletionCheck:
            self.deletableStates = ["announced"]
            self.centralCouchDBWriter = WMStatsWriter(
                self.config.TaskArchiver.centralWMStatsURL)
            self.reqmgrSvc = RequestManager(
                {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case: status updates are written to the local wmstats
            self.deletableStates = ["completed"]
            self.centralCouchDBWriter = self.wmstatsCouchDB

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                            jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                            jobDBName)

        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(
            statSummaryDBName)

    def algorithm(self, parameters):
        """
        Deletes old request docs from the local wmstats, then archives the
        deletable, aborted-completed and rejected workflows through
        self.archiveWorkflows, logging how many were updated each time.

        Any exception is logged and swallowed; the log states that the
        work is tried again on the next cycle.
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(
                self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted" % report)
            logging.info("getting complete and announced requests")

            deletableWorkflows = self.centralCouchDBReader.workflowsByStatus(
                self.deletableStates)

            logging.info("Ready to archive normal %s workflows" %
                         len(deletableWorkflows))
            numUpdated = self.archiveWorkflows(deletableWorkflows,
                                               "normal-archived")
            logging.info("archive normal %s workflows" % numUpdated)

            abortedWorkflows = self.centralCouchDBReader.workflowsByStatus(
                ["aborted-completed"])
            logging.info("Ready to archive aborted %s workflows" %
                         len(abortedWorkflows))
            numUpdated = self.archiveWorkflows(abortedWorkflows,
                                               "aborted-archived")
            logging.info("archive aborted %s workflows" % numUpdated)

            rejectedWorkflows = self.centralCouchDBReader.workflowsByStatus(
                ["rejected"])
            logging.info("Ready to archive rejected %s workflows" %
                         len(rejectedWorkflows))
            numUpdated = self.archiveWorkflows(rejectedWorkflows,
                                               "rejected-archived")
            logging.info("archive rejected %s workflows" % numUpdated)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")
Example #42
0
class WMStatsTest(unittest.TestCase):
    """
    Exercises RequestDBWriter updates and WMStatsReader queries against
    the test couch databases created in setUp.
    """
    def setUp(self):
        """
        _setUp_

        Set up the wmstats and reqmgr test couch databases and the
        reader/writer objects pointing at them.
        """
        self.schema = []
        self.couchApps = ["WMStats"]
        self.testInit = TestInitCouchApp('WorkQueueServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        dbName = 'wmstats_t'
        self.testInit.setupCouch(dbName, "WMStats")
        reqDBName = "reqmgrdb_t"
        self.testInit.setupCouch(reqDBName, "ReqMgr")
        wmstatsURL = "%s/%s" % (self.testInit.couchUrl, dbName)
        reqDBURL = "%s/%s" % (self.testInit.couchUrl, reqDBName)
        self.reqDBWriter = RequestDBWriter(reqDBURL)
        self.wmstatsReader = WMStatsReader(wmstatsURL, reqdbURL=reqDBURL)
        # override the default stale option on both readers with an empty one
        self.wmstatsReader.defaultStale = {}
        self.wmstatsReader.reqDB.defaultStale = {}
        return

    def tearDown(self):
        """
        _tearDown_

        Tear down the couch databases through testInit.tearDownCouch().
        """
        self.testInit.tearDownCouch()

    def testWMStatsWriter(self):
        """
        Insert one request, update its status and properties (also for a
        request name that does not exist) and read it back through the
        different WMStatsReader queries.
        """
        schema = generate_reqmgr_schema()

        result = self.reqDBWriter.insertGenericRequest(schema[0])
        self.assertEqual(result[0]['ok'], True, 'insert fail')

        result = self.reqDBWriter.updateRequestStatus(schema[0]['RequestName'],
                                                      "failed")
        self.assertEqual(result, 'OK', 'update fail')

        result = self.reqDBWriter.updateRequestStatus("not_exist_schema",
                                                      "assigned")
        self.assertEqual(result, 'Error: document not found')

        result = self.reqDBWriter.updateRequestProperty(
            schema[0]['RequestName'], {"Teams": ['teamA']})
        self.assertEqual(result, 'OK', 'update fail')

        result = self.reqDBWriter.updateRequestProperty(
            "not_exist_schema", {"Teams": ['teamA']})
        self.assertEqual(result, 'Error: document not found')

        totalStats = {
            'TotalEstimatedJobs': 100,
            'TotalInputEvents': 1000,
            'TotalInputLumis': 1234,
            'TotalInputFiles': 5
        }
        result = self.reqDBWriter.updateRequestProperty(
            schema[0]['RequestName'], totalStats)
        self.assertEqual(result, 'OK', 'update fail')

        # the same update is applied a second time and must still succeed
        result = self.reqDBWriter.updateRequestProperty(
            schema[0]['RequestName'], totalStats)
        self.assertEqual(result, 'OK', 'update fail')

        result = self.reqDBWriter.updateRequestProperty(
            "not_exist_schema", totalStats)
        self.assertEqual(result, 'Error: document not found')

        spec1 = newWorkload(schema[0]['RequestName'])
        production = spec1.newTask("Production")
        production.setTaskType("Merge")
        production.setSiteWhitelist(['TEST_SITE'])
        properties = {
            "RequestPriority": spec1.priority(),
            'SiteWhitelist': spec1.getTopLevelTask()[0].siteWhitelist(),
            'OutputDatasets': spec1.listOutputDatasets()
        }
        result = self.reqDBWriter.updateRequestProperty(
            spec1.name(), properties)
        self.assertEqual(result, 'OK', 'update fail')

        spec2 = newWorkload("not_exist_schema")
        production = spec2.newTask("Production")
        production.setTaskType("Merge")
        properties = {
            "RequestPriority": spec2.priority(),
            'SiteWhitelist': spec2.getTopLevelTask()[0].siteWhitelist(),
            'OutputDatasets': spec2.listOutputDatasets()
        }
        result = self.reqDBWriter.updateRequestProperty(
            spec2.name(), properties)
        self.assertEqual(result, 'Error: document not found')

        # keys() is wrapped in list() so the comparison against a list
        # does not depend on keys() itself returning a list
        requests = self.wmstatsReader.getRequestByStatus(["failed"],
                                                         jobInfoFlag=False,
                                                         legacyFormat=True)
        self.assertEqual(list(requests.keys()), [schema[0]['RequestName']])

        requestCollection = RequestInfoCollection(requests)
        result = requestCollection.getJSONData()
        self.assertEqual(list(result.keys()), [schema[0]['RequestName']])

        requests = self.wmstatsReader.getActiveData()
        self.assertEqual(list(requests.keys()), [schema[0]['RequestName']])
        requests = self.wmstatsReader.getRequestByStatus(["failed"])
        self.assertEqual(list(requests.keys()), [schema[0]['RequestName']])

        requests = self.wmstatsReader.getRequestSummaryWithJobInfo(
            schema[0]['RequestName'])
        self.assertEqual(list(requests.keys()), [schema[0]['RequestName']])
Example #43
0
    # Command line options: target couch server/database for the results
    # and whether archived workflows are inspected instead of active ones
    parser = OptionParser()
    parser.add_option("-s", "--server", dest="server",
                    help="CouchDB server to write results to", )
    parser.add_option("-d", "--database", dest="database", default='latency_analytics',
                    help="CouchDB database for results")
    parser.add_option("-a", "--archived",
                  action="store_true", dest="archived", default=False,
                  help="Request info on archived workflows instead")

    (options, args) = parser.parse_args()

    # connection to the database named by the --server/--database options
    analyticsServer = CouchServer(options.server)
    couchdb = analyticsServer.connectDatabase(options.database)

    # the wmstats and reqmgr sources are hard-coded to the cmsweb instance
    url = "https://cmsweb.cern.ch/couchdb/wmstats"
    WMStats = WMStatsReader(url)
    reqMgr = CouchServer('https://cmsweb.cern.ch/couchdb/').connectDatabase('reqmgr_workload_cache', False)
    print "Getting job information from %s. Please wait." % url

    # archived workflows are queried without job information
    if options.archived:
        checkStates = ['normal-archived', 'rejected-archived', 'aborted-archived']
        jobInfoFlag = False
    else:
        checkStates = WMStatsReader.ACTIVE_STATUS
        jobInfoFlag = True
    requests = WMStats.getRequestByStatus(checkStates, jobInfoFlag = jobInfoFlag)

    # keep both views of the retrieved requests: getJSONData() and getData()
    requestCollection = RequestInfoCollection(requests)
    result = requestCollection.getJSONData()
    requestsDict = requestCollection.getData()
    print "Total %s requests retrieved\n" % len(result)
Example #44
0
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according to the given condition.
    1. Cleans local couch db when request is completed and reported to central db.
       This will clean up local couchdb, local summary db, local queue

    2. Cleans old couchdoc which is created older than the time threshold

    """
    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # keep the configuration; the couch connections are made in setup()
        self.config = config

    def setup(self, parameters):
        """
        Called at startup

        Creates the local and central WMStats connections and the jobs
        and fwjrs couch database handles.
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(self.config.TaskArchiver.centralWMStatsURL)
        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)

    def algorithm(self, parameters):
        """
        Deletes old request docs from the local wmstats, then removes the
        JobDump and FWJRDump documents of every deletable workflow and
        sets its status to archived.

        Any exception is logged and swallowed; the log states that the
        work is tried again on the next cycle.
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted" % report)
            logging.info("getting complete and announced requests")

            #TODO: define what is deletable status. Also add the code to delete summary document,
            # request summary and job summary
            if self.useReqMgrForCompletionCheck:
                deletableWorkflows = self.centralCouchDBReader.workflowsByStatus(["announced"])
            else:
                deletableWorkflows = self.centralCouchDBReader.workflowsByStatus(["completed"])

            logging.info("Ready to delete %s" % deletableWorkflows)
            for workflowName in deletableWorkflows:
                logging.info("Deleting %s from JobCouch" % workflowName)

                report = self.deleteWorkflowFromJobCouch(workflowName, "JobDump")
                logging.info("%s docs deleted from JobDump" % report)
                report = self.deleteWorkflowFromJobCouch(workflowName, "FWJRDump")
                logging.info("%s docs deleted from FWJRDump" % report)

                self.centralCouchDBWriter.updateRequestStatus(workflowName, "archived")
                logging.info("status updated to archived %s" % workflowName)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")
Example #45
0
class WMStatsTest(unittest.TestCase):
    """
    Exercises WMStatsWriter inserts/updates and WMStatsReader queries
    against the test couch database created in setUp.
    """
    def setUp(self):
        """
        _setUp_

        Set up the wmstats test couch database and the reader/writer
        objects pointing at it.
        """
        self.schema = []
        self.couchApps = ["WMStats"]
        self.testInit = TestInitCouchApp('WorkQueueServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        dbName = 'wmstats_t'
        self.testInit.setupCouch(dbName, *self.couchApps)
        self.wmstatsWriter = WMStatsWriter(self.testInit.couchUrl, dbName)
        self.wmstatsReader = WMStatsReader(self.testInit.couchUrl, dbName)
        # override the reader's default stale option with an empty one
        self.wmstatsReader.defaultStale = {}
        return

    def tearDown(self):
        """
        _tearDown_

        Tear down the couch databases through testInit.tearDownCouch().
        """
        self.testInit.tearDownCouch()

    def testWMStatsWriter(self):
        """
        Insert one request, update its status, team, total stats and spec
        information (also for a request name that does not exist) and
        read it back through the WMStatsReader queries.
        """
        schema = generate_reqmgr_schema()
        self.assertEqual(self.wmstatsWriter.insertRequest(schema[0]), 'OK',
                         'insert fail')
        self.assertEqual(
            self.wmstatsWriter.updateRequestStatus(schema[0]['RequestName'],
                                                   "failed"), 'OK',
            'update fail')
        self.assertEqual(
            self.wmstatsWriter.updateRequestStatus("not_exist_schema",
                                                   "assigned"),
            'ERROR: request not found - not_exist_schema')
        self.assertEqual(
            self.wmstatsWriter.updateTeam(schema[0]['RequestName'], 'teamA'),
            'OK', 'update fail')
        self.assertEqual(
            self.wmstatsWriter.updateTeam("not_exist_schema", 'teamA'),
            'ERROR: request not found - not_exist_schema')
        totalStats = {
            'total_jobs': 100,
            'input_events': 1000,
            'input_lumis': 1234,
            'input_num_files': 5
        }
        # first call inserts the stats, the second one updates them
        self.assertEqual(
            self.wmstatsWriter.insertTotalStats(schema[0]['RequestName'],
                                                totalStats), 'INSERTED',
            'update fail')
        self.assertEqual(
            self.wmstatsWriter.insertTotalStats(schema[0]['RequestName'],
                                                totalStats), 'UPDATED',
            'update fail')
        self.assertEqual(
            self.wmstatsWriter.insertTotalStats("not_exist_schema",
                                                totalStats),
            'ERROR: request not found - not_exist_schema')
        spec1 = newWorkload(schema[0]['RequestName'])
        production = spec1.newTask("Production")
        production.setTaskType("Merge")
        production.setSiteWhitelist(['TEST_SITE'])
        self.assertEqual(self.wmstatsWriter.updateFromWMSpec(spec1), 'OK',
                         'update fail')
        spec2 = newWorkload("not_exist_schema")
        production = spec2.newTask("Production")
        production.setTaskType("Merge")
        self.assertEqual(self.wmstatsWriter.updateFromWMSpec(spec2),
                         'ERROR: request not found - not_exist_schema')

        # keys() is wrapped in list() so the comparison against a list
        # does not depend on keys() itself returning a list
        requests = self.wmstatsReader.getRequestByStatus(["failed"],
                                                         jobInfoFlag=False)
        self.assertEqual(list(requests.keys()), [schema[0]['RequestName']])

        requestCollection = RequestInfoCollection(requests)
        result = requestCollection.getJSONData()
        self.assertEqual(list(result.keys()), [schema[0]['RequestName']])

        requests = self.wmstatsReader.getActiveData()
        self.assertEqual(list(requests.keys()), [schema[0]['RequestName']])
        requests = self.wmstatsReader.workflowsByStatus(["failed"])
        self.assertEqual(requests, [schema[0]['RequestName']])
Example #46
0
def gatherWMDataMiningStats(
    wmstatsUrl, reqmgrUrl, wmMiningUrl, mcmUrl, mcmCert, mcmKey, tmpDir, archived=False, log=logging.info
):
    """
    Gather statistics for active (or archived) requests and queue the changed
    per-workflow summary documents in the data mining CouchDB.

    For every request returned by WMStats, the stored summary document is
    merged with freshly computed values (ReqMgr and McM details, output tier,
    progress percentages, job times and status transition times).  Documents
    that differ from the stored version are queued; everything queued is
    committed once at the end.

    Arguments:
        wmstatsUrl  - URL handed to WMStatsReader to fetch the requests
        reqmgrUrl   - CouchDB service URL of the ReqMgr database, used to look
                      up RunWhiteList and FilterEfficiency
        wmMiningUrl - CouchDB service URL of the database the summary
                      documents are read from and queued to
        mcmUrl, mcmCert, mcmKey, tmpDir - passed through to the McM client
        archived    - when True the "*-archived" states are processed without
                      job information, otherwise WMStatsReader.ACTIVE_STATUS
                      is processed with job information
        log         - callable taking a single message string

    Returns None.
    """
    # Request status -> report key that stores the time of that transition
    statusTimeKeys = {
        "new": "newTime",
        "assignment-approved": "approvedTime",
        "assigned": "assignedTime",
        "acquired": "acquireTime",
        "completed": "completedTime",
        "closed-out": "closeoutTime",
        "announced": "announcedTime",
        "normal-archived": "archivedTime",
    }
    # Completion milestones (in percent) whose first-reached time is recorded
    progressThresholds = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 99, 100]

    server, database = splitCouchServiceURL(wmMiningUrl)
    analyticsServer = CouchServer(server)
    couchdb = analyticsServer.connectDatabase(database)

    WMStats = WMStatsReader(wmstatsUrl)

    reqMgrServer, reqMgrDB = splitCouchServiceURL(reqmgrUrl)

    reqMgr = CouchServer(reqMgrServer).connectDatabase(reqMgrDB, False)

    if archived:
        funcName = "Archived Requests"
        checkStates = ["normal-archived", "rejected-archived", "aborted-archived"]
        jobInfoFlag = False
    else:
        funcName = "Active Requests"
        checkStates = WMStatsReader.ACTIVE_STATUS
        jobInfoFlag = True

    log("INFO: %s: Getting job information from %s and %s. Please wait." % (funcName, wmstatsUrl, reqmgrUrl))

    requests = WMStats.getRequestByStatus(checkStates, jobInfoFlag=jobInfoFlag)

    requestCollection = RequestInfoCollection(requests)
    result = requestCollection.getJSONData()
    requestsDict = requestCollection.getData()
    log("INFO: %s: Total %s requests retrieved\n" % (funcName, len(result)))

    report = {}
    nMCMCalls = 0
    with McM(cert=mcmCert, key=mcmKey, url=mcmUrl, tmpDir=tmpDir) as mcm:
        for wf in result:

            # Store a copy of the CouchDB document so we can compare later before updating
            if couchdb.documentExists(wf):
                oldCouchDoc = couchdb.document(wf)
            else:
                oldCouchDoc = CouchDoc(id=wf)

            newCouchDoc = copy.deepcopy(oldCouchDoc)
            ancientCouchDoc = copy.deepcopy(oldCouchDoc)
            # report[wf] and oldCouchDoc are the same object from here on
            report[wf] = oldCouchDoc
            # FIXME: remove report, only have two instances of couchDoc

            if "filterEfficiency" not in oldCouchDoc or "runWhiteList" not in oldCouchDoc:
                runWhiteList = []
                filterEfficiency = None
                try:
                    # log("DEBUG: Looking up %s in ReqMgr" % wf)
                    rmDoc = reqMgr.document(wf)
                    runWhiteList = rmDoc.get("RunWhiteList", [])
                    filterEfficiency = rmDoc.get("FilterEfficiency", None)
                except Exception:
                    pass  # ReqMgr no longer has the workflow
                report[wf].update({"filterEfficiency": filterEfficiency, "runWhiteList": runWhiteList})

            if "mcmTotalEvents" not in oldCouchDoc or "mcmApprovalTime" not in oldCouchDoc:
                prepID = oldCouchDoc.get("prepID", None)
                # NOTE(review): maxMCMCalls is not defined in this function;
                # presumably a module-level limit - confirm.
                if prepID and nMCMCalls <= maxMCMCalls:
                    nMCMCalls += 1
                    mcmHistory = mcm.getHistory(prepID=prepID)
                    mcmRequest = mcm.getRequest(prepID=prepID)
                    report[wf].update({"mcmTotalEvents": mcmRequest.get("total_events", "Unknown")})

                    if "mcmApprovalTime" not in oldCouchDoc:
                        report[wf].update({"mcmApprovalTime": "Unknown"})
                    for entry in mcmHistory:
                        if entry["action"] == "set status" and entry["step"] == "announced":
                            dateString = entry["updater"]["submission_date"]
                            dt = datetime.strptime(dateString, "%Y-%m-%d-%H-%M")
                            report[wf].update({"mcmApprovalTime": time.mktime(dt.timetuple())})

            # Basic parameters of the workflow
            priority = requests[wf]["priority"]
            requestType = requests[wf]["request_type"]
            targetLumis = requests[wf].get("input_lumis", 0)
            targetEvents = requests[wf].get("input_events", 0)
            campaign = requests[wf]["campaign"]
            prep_id = requests[wf].get("prep_id", None)
            outputdatasets = requests[wf].get("outputdatasets", [])

            # Can be an empty list, full list, empty string, or non-empty string!
            inputdataset = requests[wf]["inputdataset"]
            if isinstance(inputdataset, list):
                if inputdataset:
                    inputdataset = inputdataset[0]
                else:
                    inputdataset = ""

            # The data tier is the last component of a dataset path
            outputTier = "Unknown"
            outputTiers = []
            try:
                for ds in outputdatasets:
                    if isinstance(ds, list):
                        outputTiers.append(ds[0].split("/")[-1])
                    else:
                        outputTiers.append(ds.split("/")[-1])
            except Exception:
                log(
                    "ERROR: Could not decode outputdatasets: %s" % outputdatasets
                )  # Sometimes is a list of lists, not just a list. Bail
            if inputdataset:
                inputTier = inputdataset.split("/")[-1]
                if inputTier in ["GEN"]:
                    outputTier = "LHE"
                elif inputTier in ["RAW", "RECO"]:
                    outputTier = "AOD"
                elif inputTier in ["GEN-SIM"]:
                    outputTier = "AODSIM"
                elif "AODSIM" in outputTiers:
                    outputTier = "AODSIM"

            else:
                if len(outputTiers) == 1 and "GEN" in outputTiers:
                    if "STEP0ATCERN" in wf:
                        outputTier = "STEP0"
                    else:
                        outputTier = "FullGen"
                elif "GEN-SIM" in outputTiers and "AODSIM" in outputTiers and requestType == "TaskChain":
                    outputTier = "RelVal"
                elif "RECO" in outputTiers and requestType == "TaskChain":
                    outputTier = "RelVal"
                elif "GEN-SIM" in outputTiers:
                    outputTier = "GEN-SIM"
                elif "AODSIM" in outputTiers:
                    outputTier = "AODSIM"
                elif "RECO" in outputTiers:
                    outputTier = "AOD"
                elif "AOD" in outputTiers:
                    outputTier = "AOD"
                else:
                    outputTier = "GEN-SIM"

            # Calculate completion ratios for events and lumi sections, take minimum for all datasets
            # 200 is a sentinel meaning "nothing measured"; it is reset to 0 below
            eventPercent = 200
            lumiPercent = 200
            datasetReports = requestsDict[wf].getProgressSummaryByOutputDataset()
            for dataset in datasetReports:
                dsr = datasetReports[dataset].getReport()
                events = dsr.get("events", 0)
                lumis = dsr.get("totalLumis", 0)
                # float() avoids integer floor division, which would keep the
                # ratio at 0 until the target is fully reached
                if targetLumis:
                    lumiPercent = min(lumiPercent, float(lumis) / targetLumis * 100)
                if targetEvents:
                    eventPercent = min(eventPercent, float(events) / targetEvents * 100)
            if eventPercent > 100:
                eventPercent = 0
            if lumiPercent > 100:
                lumiPercent = 0

            # Sum up all jobs across agents to see if we've run the first, last
            successJobs = 0
            totalJobs = 0
            for agent in result[wf]:
                jobs = result[wf][agent]
                successJobs += jobs["sucess"]
                totalJobs += jobs["created"]
            try:
                if totalJobs and not report[wf].get("firstJobTime", None):
                    report[wf].update({"firstJobTime": int(time.time())})
                if totalJobs and successJobs == totalJobs and not report[wf].get("lastJobTime", None):
                    report[wf].update({"lastJobTime": int(time.time())})
            except Exception:
                pass  # best effort, job times are optional

            # Figure out current status of workflow and transition times.
            # The last entry of the status history is the current status and
            # the last occurrence of each status provides its time.
            finalStatus = None
            transitionTimes = {}
            for status in requests[wf]["request_status"]:
                finalStatus = status["status"]
                timeKey = statusTimeKeys.get(finalStatus)
                if timeKey:
                    transitionTimes[timeKey] = status["update_time"]

            # Record each transition time only once, never overwrite a stored one
            for timeKey, updateTime in transitionTimes.items():
                if updateTime and not report[wf].get(timeKey, None):
                    report[wf].update({timeKey: updateTime})

            try:
                dt = requests[wf]["request_date"]
                requestDate = "%4.4d-%2.2d-%2.2d %2.2d:%2.2d:%2.2d" % tuple(dt)
                report[wf].update({"requestDate": requestDate})
            except Exception:
                pass  # request_date missing or not in the expected 6-field form

            report[wf].update({"priority": priority, "status": finalStatus, "type": requestType})
            report[wf].update({"totalLumis": targetLumis, "totalEvents": targetEvents})
            report[wf].update({"campaign": campaign, "prepID": prep_id, "outputTier": outputTier})
            report[wf].update({"outputDatasets": outputdatasets, "inputDataset": inputdataset})

            # Stamp the time at which each completion milestone was first seen
            report[wf].setdefault("lumiPercents", {})
            report[wf].setdefault("eventPercents", {})
            lumiProgress = 0
            eventProgress = 0
            for percentage in progressThresholds:
                percent = str(percentage)
                percentReported = report[wf]["lumiPercents"].get(percent, None)
                if not percentReported and lumiPercent >= percentage:
                    report[wf]["lumiPercents"][percent] = int(time.time())
                if lumiPercent >= percentage:
                    lumiProgress = percentage

                percentReported = report[wf]["eventPercents"].get(percent, None)
                if not percentReported and eventPercent >= percentage:
                    report[wf]["eventPercents"][percent] = int(time.time())
                if eventPercent >= percentage:
                    eventProgress = percentage

            report[wf].update({"eventProgress": eventProgress, "lumiProgress": lumiProgress})

            newCouchDoc.update(report[wf])

            # Queue the updated document for addition if it's changed.
            if ancientCouchDoc != newCouchDoc:
                try:
                    newCouchDoc["updateTime"] = int(time.time())
                    report[wf]["updateTime"] = int(time.time())
                    cjson.encode(newCouchDoc)  # Make sure it encodes before trying to queue
                    couchdb.queue(newCouchDoc)
                except Exception:
                    # pformat returns the text; pprint.pprint would print it
                    # to stdout and put "None" in the log message
                    log("ERROR: Failed to queue document:%s \n" % pprint.pformat(newCouchDoc))

    log("INFO: %s: Finished getting job. wait for the next Cycle" % funcName)
    # Commit all changes to CouchDB
    couchdb.commit()
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Initialize

        Reads the AgentStatusWatcher settings from the given configuration
        and creates the resource control and WMStats reader handles.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor classAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception is logged and swallowed so the cycle is retried later.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and store the number of
        agents of this agent's team in self.agentsNumByTeam. The current value
        is kept when the view is unavailable or does not list the team.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)
        return

    def _fetchSSBMetric(self, metric):
        """
        Download the latest data of one SSB metric column from the dashboard
        and return the content of its 'csvdata' field.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(metric)
        response = urllib2.urlopen(url)
        try:
            # parse from json format to dictionary, get only 'csvdata'
            return json.loads(response.read())['csvdata']
        finally:
            # always release the connection, also when reading/parsing fails
            response.close()

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name. Only
        sites having all of state, CPU slots and IO slots are returned, so the
        result is empty when any of the metrics provides no usable data.
        """
        # get info from dashboard
        ssbSiteState = self._fetchSSBMetric(self.siteStatusMetric)
        ssbCpuSlots = self._fetchSSBMetric(self.cpuBoundMetric)
        ssbIoSlots = self._fetchSSBMetric(self.ioBoundMetric)

        # dict updated by these methods with status/thresholds info keyed by the site name
        ssbSiteSlots = {}
        self.siteStatusByVOName(ssbSiteState, ssbSiteSlots)
        self.thresholdsByVOName(ssbCpuSlots, ssbSiteSlots, slotsType='slotsCPU')
        self.thresholdsByVOName(ssbIoSlots, ssbSiteSlots, slotsType='slotsIO')

        # Now remove sites with state only, such that no updates are applied to them
        ssbSiteSlots = {k: v for k, v in ssbSiteSlots.items() if len(v) == 3}

        if not ssbSiteSlots:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site, infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        Note that the slot values inside infoSSB are adjusted in place.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug("Settings for site and task pending slots: %s%% and %s%%", self.pendingSlotsSitePercent,
                      self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] = infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100
                infoSSB[site]['slotsIO'] = infoSSB[site]['slotsIO'] * self.t1SitesCores / 100

            # round very small sites to the bare minimum
            if infoSSB[site]['slotsCPU'] < minCPUSlots:
                infoSSB[site]['slotsCPU'] = minCPUSlots
            if infoSSB[site]['slotsIO'] < minIOSlots:
                infoSSB[site]['slotsIO'] = minIOSlots

            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']
            # pending slots are the per-agent share of the site slots, scaled
            # by the configured percentage, but never below the minimum
            sitePending = max(int(CPUBound / agentsCount * self.pendingSlotsSitePercent / 100), minCPUSlots)
            taskCPUPending = max(int(CPUBound / agentsCount * self.pendingSlotsTaskPercent / 100), minCPUSlots)
            taskIOPending = max(int(IOBound / agentsCount * self.pendingSlotsTaskPercent / 100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)
                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d", site, taskCPUPending,
                              CPUBound)
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType=task, maxSlots=CPUBound,
                                                         pendingSlots=taskCPUPending)
                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d", site, taskIOPending,
                              IOBound)
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType=task, maxSlots=IOBound,
                                                         pendingSlots=taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites, ssbSiteSlots, slotsType):
        """
        _thresholdsByVOName_

        Updates the dict with CPU and IO slots, only for sites with a valid state.
        A site whose value is None is removed from the dict.
        """
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in ssbSiteSlots:
                if value is None:
                    logging.warning('Site %s does not have thresholds in SSB. Taking no action', voname)
                    # then we better remove this site from our final dict
                    ssbSiteSlots.pop(voname)
                else:
                    ssbSiteSlots[voname][slotsType] = int(value)
            else:
                logging.warning('Found %s thresholds for site %s which has no state in SSB', slotsType, voname)
        return

    def siteStatusByVOName(self, sites, ssbSiteSlots):
        """
        _siteStatusByVOName_

        Creates an inner dictionary for each site that will contain
        the site state and the number of slots. Sites with an unknown
        status and duplicated entries are reported and skipped.
        """
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteSlots:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unkwown status '%s' for site %s, please check SSB", status, voname)
                else:
                    ssbSiteSlots[voname] = {'state': statusAgent}
            else:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
        return

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that has no translation.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else 'Draining'}

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return
Example #48
0
 def __init__(self, couchURL, appName="WMStats", reqdbURL=None, reqdbCouchApp="ReqMgr"):
     """
     Initialize by handing all arguments, unchanged and in the same order,
     to WMStatsReader.__init__.
     """
     # set the connection for local couchDB call
     WMStatsReader.__init__(self, couchURL, appName, reqdbURL, reqdbCouchApp)
Example #49
0
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according to the given condition.
    1. Cleans local couch db when request is completed and reported to central db.
       This will clean up local couchdb, local summary db, local queue

    2. Cleans old couchdoc which is created older than the time threshold

    """
    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config

    def setup(self, parameters):
        """
        Called at startup

        Creates the local/central WMStats handles and the connections to the
        job and fwjr couch databases.
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(
            self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(
            self.config.TaskArchiver.centralWMStatsURL)

        if self.useReqMgrForCompletionCheck:
            self.deletableStates = ["announced"]
            self.centralCouchDBWriter = WMStatsWriter(
                self.config.TaskArchiver.centralWMStatsURL)
        else:
            # Tier0 case
            self.deletableStates = ["completed"]
            self.centralCouchDBWriter = self.wmstatsCouchDB

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                            jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                            jobDBName)

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch

        Deletes old local docs, then archives the workflows found in the
        deletable, "aborted-completed" and legacy "archived" states.
        Any exception is logged and the work is retried on the next cycle.

        NOTE(review): cleanAllLocalCouchDB and deleteWorkflowFromJobCouch are
        not defined in the visible part of this class - confirm they are
        provided elsewhere.
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(
                self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted", report)
            logging.info("getting complete and announced requests")

            deletableWorkflows = self.centralCouchDBReader.workflowsByStatus(
                self.deletableStates)

            logging.info("Ready to delete %s", deletableWorkflows)
            for workflowName in deletableWorkflows:
                # only archive centrally once the local cleanup succeeded
                if self.cleanAllLocalCouchDB(workflowName):
                    self.centralCouchDBWriter.updateRequestStatus(
                        workflowName, "normal-archived")
                    logging.info("status updated to normal-archived %s",
                                 workflowName)

            abortedWorkflows = self.centralCouchDBReader.workflowsByStatus(
                ["aborted-completed"])
            logging.info("Ready to delete aborted %s", abortedWorkflows)
            for workflowName in abortedWorkflows:
                if self.cleanAllLocalCouchDB(workflowName):
                    self.centralCouchDBWriter.updateRequestStatus(
                        workflowName, "aborted-archived")
                    logging.info("status updated to aborted-archived %s",
                                 workflowName)

            # TODO: following code is temporary - remove after production archived data is cleaned
            removableWorkflows = self.centralCouchDBReader.workflowsByStatus(
                ["archived"])

            logging.info("Ready to delete %s from wmagent_summary",
                         removableWorkflows)
            for workflowName in removableWorkflows:
                logging.info("Deleting %s from WMAgent Summary Couch",
                             workflowName)
                report = self.deleteWorkflowFromJobCouch(
                    workflowName, "WMStats")
                logging.info("%s docs deleted from wmagent_summary", report)
                # only update the status when delete is successful
                # TODO: need to handle the case when there are multiple agent running the same request.
                if report["status"] == "ok":
                    self.centralCouchDBWriter.updateRequestStatus(
                        workflowName, "normal-archived")
                    logging.info(
                        "status updated to normal-archived from archived (this is temp solution for production) %s",
                        workflowName)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")
Example #50
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Build the worker thread from the agent configuration.

        Copies the AgentStatusWatcher settings onto the instance (dashboard
        URL and metric columns, pending/running percentages, sites forced
        down, SSB-only switch, T1 cores share, enabled switch), records the
        agent team name, and creates the ResourceControl and central
        WMStats reader handles.

        :param config: agent configuration object exposing the
                       AgentStatusWatcher and Agent sections
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        watcherCfg = config.AgentStatusWatcher

        # dashboard location and the SSB metric columns to query
        self.dashboard = watcherCfg.dashboard
        self.siteStatusMetric = watcherCfg.siteStatusMetric
        self.cpuBoundMetric = watcherCfg.cpuBoundMetric
        self.ioBoundMetric = watcherCfg.ioBoundMetric

        # percentages applied when deriving pending/running thresholds
        self.pendingSlotsSitePercent = watcherCfg.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcherCfg.pendingSlotsTaskPercent
        self.runningExpressPercent = watcherCfg.runningExpressPercent
        self.runningRepackPercent = watcherCfg.runningRepackPercent

        # optional list of sites that must always be set to Down
        self.forceSiteDown = getattr(watcherCfg, 'forceSiteDown', [])

        # team of this agent and the fallback number of agents per team
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcherCfg, 'defaultAgentsNumByTeam', 5)

        # when set, sites unknown to SSB are forced to Down
        self.onlySSB = watcherCfg.onlySSB

        # Tier0 agents are recognised by the presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcherCfg.t1SitesCores

        # component on/off switch (on unless configured otherwise)
        self.enabled = getattr(watcherCfg, 'enabled', True)

        # resource control database access
        self.resourceControl = ResourceControl(config=self.config)

        # central WMStats connection
        self.centralCouchDBReader = WMStatsReader(watcherCfg.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names
        """
        if not self.enabled:
            logging.info(
                "This component is not enabled in the configuration. Doing nothing."
            )
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info(
            "Resource control cycle finished updating site state and thresholds."
        )

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh self.agentsNumByTeam from the WMStats agents-by-team view.

        In drain mode the count is forced to 1. When the view cannot be
        read or comes back empty, the current value is kept and a warning
        is logged; otherwise the count for this agent's team is used
        (keeping the current value if the team is not listed).
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        try:
            teamCounts = self.centralCouchDBReader.agentsByTeam(
                filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")
            teamCounts = {}

        if teamCounts:
            self.agentsNumByTeam = teamCounts.get(self.teamName,
                                                  self.agentsNumByTeam)
            logging.debug(
                "Agents connected to the same team (not in DrainMode): %d",
                self.agentsNumByTeam)
            return

        logging.warning(
            "agentInfo couch view is not available, use default value %s",
            self.agentsNumByTeam)

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Download the site status, CPU bound and IO bound metrics from the
        dashboard (SSB) and merge them per site.

        Returns a dict keyed by site name; only sites for which all three
        pieces of information ('state', 'slotsCPU', 'slotsIO') were
        collected are kept. An empty dict is returned, after logging an
        error, when no site is complete.
        """
        query = '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1'
        metricColumns = (self.siteStatusMetric, self.cpuBoundMetric,
                         self.ioBoundMetric)
        metricUrls = [
            self.dashboard + (query % str(column)) for column in metricColumns
        ]

        # download everything first, then decode the JSON payloads and keep
        # only their 'csvdata' part
        payloads = [urllib2.urlopen(url).read() for url in metricUrls]
        stateRows, cpuRows, ioRows = [
            json.loads(payload)['csvdata'] for payload in payloads
        ]

        # the helper methods fill this dict in place, keyed by site name
        collected = {}
        self.siteStatusByVOName(stateRows, collected)
        self.thresholdsByVOName(cpuRows, collected, slotsType='slotsCPU')
        self.thresholdsByVOName(ioRows, collected, slotsType='slotsIO')

        # drop incomplete sites so that no updates are applied to them
        complete = {
            name: info
            for name, info in collected.iteritems() if len(info) == 3
        }

        if not complete:
            logging.error(
                "One or more of the SSB metrics is down. Please contact the Dashboard team."
            )
        return complete

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down',
                                 site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug(
            "Settings for site and task pending slots: %s%% and %s%%",
            self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] = infoSSB[site][
                    'slotsCPU'] * self.t1SitesCores / 100
                infoSSB[site]['slotsIO'] = infoSSB[site][
                    'slotsIO'] * self.t1SitesCores / 100

            # round very small sites to the bare minimum
            if infoSSB[site]['slotsCPU'] < minCPUSlots:
                infoSSB[site]['slotsCPU'] = minCPUSlots
            if infoSSB[site]['slotsIO'] < minIOSlots:
                infoSSB[site]['slotsIO'] = minIOSlots

            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']
            sitePending = max(
                int(CPUBound / agentsCount * self.pendingSlotsSitePercent /
                    100), minCPUSlots)
            taskCPUPending = max(
                int(CPUBound / agentsCount * self.pendingSlotsTaskPercent /
                    100), minCPUSlots)
            taskIOPending = max(
                int(IOBound / agentsCount * self.pendingSlotsTaskPercent /
                    100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site][
                    'pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info(
                    "Updating %s site thresholds for pend/runn: %d/%d", site,
                    sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(
                    site,
                    pendingJobSlots=sitePending,
                    runningJobSlots=CPUBound)
                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug(
                    "Updating %s tasksCPU thresholds for pend/runn: %d/%d",
                    site, taskCPUPending, CPUBound)
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(
                        site,
                        taskType=task,
                        maxSlots=CPUBound,
                        pendingSlots=taskCPUPending)
                # Update site IO tasks running and pending slots
                logging.debug(
                    "Updating %s tasksIO thresholds for pend/runn: %d/%d",
                    site, taskIOPending, IOBound)
                for task in tasksIO:
                    self.resourceControl.insertThreshold(
                        site,
                        taskType=task,
                        maxSlots=IOBound,
                        pendingSlots=taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug(
                    "Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots *
                                     self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express',
                                                     expressSlots,
                                                     pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots *
                                    self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack',
                                                     repackSlots,
                                                     pendingRepack)

    def thresholdsByVOName(self, sites, ssbSiteSlots, slotsType):
        """
        _thresholdsByVOName_

        Updates the dict with CPU and IO slots, only for sites with a valid state
        """
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in ssbSiteSlots:
                if value is None:
                    logging.warn(
                        'Site %s does not have thresholds in SSB. Taking no action',
                        voname)
                    # then we better remove this site from our final dict
                    ssbSiteSlots.pop(voname)
                else:
                    ssbSiteSlots[voname][slotsType] = int(value)
            else:
                logging.warn(
                    'Found %s thresholds for site %s which has no state in SSB',
                    slotsType, voname)
        return

    def siteStatusByVOName(self, sites, ssbSiteSlots):
        """
        _siteStatusByVOName_

        Creates an inner dictionary for each site that will contain
        the site state and the number of slots
        """
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteSlots:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error(
                        "Unkwown status '%s' for site %s, please check SSB",
                        status, voname)
                else:
                    ssbSiteSlots[voname] = {'state': statusAgent}
            else:
                logging.error('I have a duplicated status entry in SSB for %s',
                              voname)
        return

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state
        """
        ssb2agent = {
            'enabled': 'Normal',
            'drain': 'Draining',
            'disabled': 'Down',
            'test': 'Draining'
        }
        # 'test' state behaviour varies between production and tier0 agents
        ssb2agent['test'] = 'Normal' if self.tier0Mode else "Draining"

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return
Example #51
0
def gatherWMDataMiningStats(wmstatsUrl,
                            reqmgrUrl,
                            wmMiningUrl,
                            mcmUrl,
                            mcmCert,
                            mcmKey,
                            tmpDir,
                            archived=False,
                            log=logging.info):
    """
    Gather per-workflow statistics and queue them into the data-mining couch.

    For every request returned by WMStats (archived or active states) the
    matching document of the data-mining database is loaded (or created),
    enriched with ReqMgr, McM, progress and status-transition information,
    and queued for writing when it differs from the stored version. All
    queued documents are committed at the end.

    :param wmstatsUrl: URL of the WMStats couch database
    :param reqmgrUrl: URL of the ReqMgr couch database
    :param wmMiningUrl: URL of the data-mining couch database to update
    :param mcmUrl: McM service URL
    :param mcmCert: certificate used for McM
    :param mcmKey: key used for McM
    :param tmpDir: temporary directory handed to the McM client
    :param archived: when True look at the *-archived states (without job
                     information), otherwise at the active states
    :param log: logger-like object offering info() and error(); anything
                without those methods (including the historical default,
                the logging.info function) is replaced by the logging module
    """
    # The default value is a function, not a logger: calling log.info() on
    # it raises AttributeError. Fall back to the logging module, which
    # exposes the same call names.
    if not (hasattr(log, 'info') and hasattr(log, 'error')):
        log = logging

    server, database = splitCouchServiceURL(wmMiningUrl)
    analyticsServer = CouchServer(server)
    couchdb = analyticsServer.connectDatabase(database)

    WMStats = WMStatsReader(wmstatsUrl,
                            reqdbURL=reqmgrUrl,
                            reqdbCouchApp="ReqMgr")

    reqMgrServer, reqMgrDB = splitCouchServiceURL(reqmgrUrl)

    reqMgr = CouchServer(reqMgrServer).connectDatabase(reqMgrDB, False)

    funcName = "Archived Requests" if archived else "Active Requests"

    log.info("%s: Getting job information from %s and %s. Please wait." %
             (funcName, wmstatsUrl, reqmgrUrl))

    if archived:
        checkStates = [
            'normal-archived', 'rejected-archived', 'aborted-archived'
        ]
        jobInfoFlag = False
    else:
        checkStates = WMStatsReader.ACTIVE_STATUS
        jobInfoFlag = True
    requests = WMStats.getRequestByStatus(checkStates,
                                          jobInfoFlag=jobInfoFlag,
                                          legacyFormat=True)

    requestCollection = RequestInfoCollection(requests)
    result = requestCollection.getJSONData()
    requestsDict = requestCollection.getData()
    log.info("%s: Total %s requests retrieved\n" % (funcName, len(result)))

    # request status -> report field holding the time of that transition,
    # listed in the order in which the fields are written to the report
    statusTimeFields = (
        ('assignment-approved', 'approvedTime'),
        ('assigned', 'assignedTime'),
        ('acquired', 'acquireTime'),
        ('closed-out', 'closeoutTime'),
        ('announced', 'announcedTime'),
        ('completed', 'completedTime'),
        ('new', 'newTime'),
        ('normal-archived', 'archivedTime'),
    )
    fieldByStatus = dict(statusTimeFields)

    report = {}
    nMCMCalls = 0
    with McM(cert=mcmCert, key=mcmKey, url=mcmUrl, tmpDir=tmpDir) as mcm:
        for wf in result.keys():

            # Store a copy of the CouchDB document so we can compare later before updating
            if couchdb.documentExists(wf):
                oldCouchDoc = couchdb.document(wf)
            else:
                oldCouchDoc = CouchDoc(id=wf)

            newCouchDoc = copy.deepcopy(oldCouchDoc)
            ancientCouchDoc = copy.deepcopy(oldCouchDoc)
            # report[wf] IS oldCouchDoc: every report update below also
            # changes oldCouchDoc, hence the untouched ancientCouchDoc copy
            report[wf] = oldCouchDoc
            # FIXME: remove report, only have two instances of couchDoc

            if 'filterEfficiency' not in oldCouchDoc or 'runWhiteList' not in oldCouchDoc:
                runWhiteList = []
                filterEfficiency = None
                try:
                    rmDoc = reqMgr.document(wf)
                    runWhiteList = rmDoc.get('RunWhiteList', [])
                    filterEfficiency = rmDoc.get('FilterEfficiency', None)
                except Exception:
                    pass  # ReqMgr no longer has the workflow
                report[wf].update({
                    'filterEfficiency': filterEfficiency,
                    'runWhiteList': runWhiteList
                })

            if oldCouchDoc.get('mcmTotalEvents', 'Unknown') == 'Unknown' or \
                oldCouchDoc.get('mcmApprovalTime', 'Unknown') == 'Unknown':

                prepID = oldCouchDoc.get('prepID', None)
                # NOTE(review): maxMCMCalls is not defined in this function;
                # presumably a module-level constant - confirm
                if prepID and nMCMCalls <= maxMCMCalls:
                    log.info("Trying to update McM info for %s, PREPID %s" %
                             (wf, prepID))
                    # Get information from McM. Don't call too many times, can take a long time
                    nMCMCalls += 1
                    try:
                        mcmHistory = mcm.getHistory(prepID=prepID)
                        if 'mcmApprovalTime' not in oldCouchDoc:
                            report[wf].update({'mcmApprovalTime': 'NoMcMData'})
                        found = False
                        for entry in mcmHistory:
                            if entry['action'] == 'set status' and entry[
                                    'step'] == 'announced':
                                dateString = entry['updater'][
                                    'submission_date']
                                dt = datetime.strptime(dateString,
                                                       '%Y-%m-%d-%H-%M')
                                report[wf].update({
                                    'mcmApprovalTime':
                                    time.mktime(dt.timetuple())
                                })
                                found = True
                        if not found:
                            log.error(
                                "History found but no approval time for %s" %
                                wf)
                    except McMNoDataError:
                        log.error("Setting NoMcMData for %s" % wf)
                        report[wf].update({'mcmApprovalTime': 'NoMcMData'})
                    except (RuntimeError, IOError):
                        exc_type = sys.exc_info()[0]
                        log.error(
                            "%s getting history from McM for PREP ID %s. May be transient and/or SSO problem."
                            % (exc_type, prepID))
                    except Exception:
                        exc_type = sys.exc_info()[0]
                        log.error(
                            "%s getting history from McM for PREP ID %s. Unknown error."
                            % (exc_type, prepID))

                    try:
                        mcmRequest = mcm.getRequest(prepID=prepID)
                        report[wf].update({
                            'mcmTotalEvents':
                            mcmRequest.get('total_events', 'NoMcMData')
                        })
                    except (RuntimeError, IOError):
                        exc_type = sys.exc_info()[0]
                        log.error(
                            "%s getting request from McM for PREP ID %s. May be transient and/or SSO problem."
                            % (exc_type, prepID))
                    except Exception:
                        exc_type = sys.exc_info()[0]
                        log.error(
                            "%s getting request from McM for PREP ID %s. Unknown error."
                            % (exc_type, prepID))

            # Basic parameters of the workflow
            priority = requests[wf].get('priority', 0)
            requestType = requests[wf].get('request_type', 'Unknown')
            targetLumis = requests[wf].get('input_lumis', 0)
            targetEvents = requests[wf].get('input_events', 0)
            campaign = requests[wf].get('campaign', 'Unknown')
            prep_id = requests[wf].get('prep_id', None)
            outputdatasets = requests[wf].get('outputdatasets', [])
            statuses = requests[wf].get('request_status', [])

            if not statuses:
                log.error("Could not find any status from workflow: %s" %
                          wf)  # Should not happen but it does.

            # Remove a single task_ from the start of PREP ID if it exists.
            # Strings are immutable: the result has to be assigned back.
            if prep_id and prep_id.startswith('task_'):
                prep_id = prep_id.replace('task_', '', 1)

            # Can be an empty list, full list, empty string, or non-empty string!
            inputdataset = requests[wf].get('inputdataset', "")
            if isinstance(inputdataset, list):
                if inputdataset:
                    inputdataset = inputdataset[0]
                else:
                    inputdataset = ''

            outputTier = 'Unknown'
            outputTiers = []
            try:
                for ds in outputdatasets:
                    if isinstance(ds, list):
                        outputTiers.append(ds[0].split('/')[-1])
                    else:
                        outputTiers.append(ds.split('/')[-1])
            except Exception:
                log.error(
                    "Could not decode outputdatasets: %s" % outputdatasets
                )  # Sometimes is a list of lists, not just a list. Bail
            if inputdataset:
                inputTier = inputdataset.split('/')[-1]
                if inputTier in ['GEN']:
                    outputTier = 'LHE'
                elif inputTier in ['RAW', 'RECO']:
                    outputTier = 'AOD'
                elif inputTier in ['GEN-SIM']:
                    outputTier = 'AODSIM'
                elif 'AODSIM' in outputTiers:
                    outputTier = 'AODSIM'

            else:
                if len(outputTiers) == 1 and 'GEN' in outputTiers:
                    if 'STEP0ATCERN' in wf:
                        outputTier = 'STEP0'
                    else:
                        outputTier = 'FullGen'
                elif 'GEN-SIM' in outputTiers and 'AODSIM' in outputTiers and requestType == 'TaskChain':
                    outputTier = 'RelVal'
                elif 'RECO' in outputTiers and requestType == 'TaskChain':
                    outputTier = 'RelVal'
                elif 'GEN-SIM' in outputTiers:
                    outputTier = 'GEN-SIM'
                elif 'AODSIM' in outputTiers:
                    outputTier = 'AODSIM'
                elif 'RECO' in outputTiers:
                    outputTier = 'AOD'
                elif 'AOD' in outputTiers:
                    outputTier = 'AOD'
                else:
                    outputTier = 'GEN-SIM'

            # Calculate completion ratios for events and lumi sections, take minimum for all datasets.
            # 200 is a sentinel above any accepted percentage; whatever is
            # still above 100 after the loop is reported as 0.
            # NOTE(review): with Python 2 integer operands these divisions
            # truncate unless true division is enabled for the module - confirm
            eventPercent = 200
            lumiPercent = 200
            datasetReports = requestsDict[
                wf].getProgressSummaryByOutputDataset()
            for dataset in datasetReports:
                dsr = datasetReports[dataset].getReport()
                events = dsr.get('events', 0)
                lumis = dsr.get('totalLumis', 0)
                if targetLumis:
                    lumiPercent = min(lumiPercent, lumis / targetLumis * 100)
                if targetEvents:
                    eventPercent = min(eventPercent,
                                       events / targetEvents * 100)
            if eventPercent > 100:
                eventPercent = 0
            if lumiPercent > 100:
                lumiPercent = 0

            # Sum up all jobs across agents to see if we've run the first, last
            successJobs = 0
            totalJobs = 0
            for agent in result[wf]:
                jobs = result[wf][agent]
                successJobs += jobs['sucess']
                totalJobs += jobs['created']
            try:
                if totalJobs and not report[wf].get('firstJobTime', None):
                    report[wf].update({'firstJobTime': int(time.time())})
                if totalJobs and successJobs == totalJobs and not report[
                        wf].get('lastJobTime', None):
                    report[wf].update({'lastJobTime': int(time.time())})
            except Exception:
                pass  # best effort, job times are simply left out

            # Figure out current status of workflow and transition times:
            # the last status listed is the current one, and for each known
            # status the last reported update time is kept
            finalStatus = None
            transitionTimes = {}
            for status in statuses:
                finalStatus = status['status']
                field = fieldByStatus.get(finalStatus)
                if field is not None:
                    transitionTimes[field] = status['update_time']

            # Build or modify the report dictionary for the WF
            report.setdefault(wf, {})

            # transition times already stored in the document are never overwritten
            for dummyStatus, field in statusTimeFields:
                stamp = transitionTimes.get(field)
                if stamp and not report[wf].get(field, None):
                    report[wf].update({field: stamp})

            try:
                dt = requests[wf]['request_date']
                requestDate = '%4.4d-%2.2d-%2.2d %2.2d:%2.2d:%2.2d' % tuple(dt)
                report[wf].update({'requestDate': requestDate})
            except Exception:
                pass  # no usable request date for this workflow

            report[wf].update({
                'priority': priority,
                'status': finalStatus,
                'type': requestType
            })
            report[wf].update({
                'totalLumis': targetLumis,
                'totalEvents': targetEvents,
            })
            report[wf].update({
                'campaign': campaign,
                'prepID': prep_id,
                'outputTier': outputTier,
            })
            report[wf].update({
                'outputDatasets': outputdatasets,
                'inputDataset': inputdataset,
            })

            # record the first time each completion percentage was reached
            report[wf].setdefault('lumiPercents', {})
            report[wf].setdefault('eventPercents', {})
            lumiProgress = 0
            eventProgress = 0
            for percentage in [
                    1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 99, 100
            ]:
                percent = str(percentage)
                percentReported = report[wf]['lumiPercents'].get(percent, None)
                if not percentReported and lumiPercent >= percentage:
                    report[wf]['lumiPercents'][percent] = int(time.time())
                if lumiPercent >= percentage:
                    lumiProgress = percentage

                percentReported = report[wf]['eventPercents'].get(
                    percent, None)
                if not percentReported and eventPercent >= percentage:
                    report[wf]['eventPercents'][percent] = int(time.time())
                if eventPercent >= percentage:
                    eventProgress = percentage

            report[wf].update({
                'eventProgress': eventProgress,
                'lumiProgress': lumiProgress,
            })

            newCouchDoc.update(report[wf])

            # Queue the updated document for addition if it's changed.
            if ancientCouchDoc != newCouchDoc:
                try:
                    newCouchDoc['updateTime'] = int(time.time())
                    report[wf]['updateTime'] = int(time.time())
                    # Make sure it encodes before trying to queue
                    json.dumps(newCouchDoc)
                    couchdb.queue(newCouchDoc)
                except Exception:
                    # pformat returns the text; pprint would print it to
                    # stdout and put "None" into the log message
                    log.error("Failed to queue document:%s \n" %
                              pprint.pformat(newCouchDoc))

    log.info("%s: Finished getting job. wait for the next Cycle" % funcName)
    # Commit all changes to CouchDB
    couchdb.commit()
Example #52
0
class WMStatsTest(unittest.TestCase):
    """
    Integration test for RequestDBWriter updates and the WMStatsReader
    queries that should reflect them, run against test Couch databases.
    """

    def setUp(self):
        """
        _setUp_

        Set up the 'wmstats_t' and 'reqmgrdb_t' test Couch databases and
        create the writer/reader services that point at them.
        """
        self.schema = []
        self.couchApps = ["WMStats"]
        self.testInit = TestInitCouchApp('WorkQueueServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema,
                                useDefault=False)
        dbName = 'wmstats_t'
        self.testInit.setupCouch(dbName, "WMStats")
        reqDBName = "reqmgrdb_t"
        self.testInit.setupCouch(reqDBName, "ReqMgr")
        wmstatsURL = "%s/%s" % (self.testInit.couchUrl, dbName)
        reqDBURL = "%s/%s" % (self.testInit.couchUrl, reqDBName)
        self.reqDBWriter = RequestDBWriter(reqDBURL)
        self.wmstatsReader = WMStatsReader(wmstatsURL, reqdbURL=reqDBURL)
        # NOTE(review): presumably clears the default "stale" view option so
        # reads see the documents written by this test -- confirm.
        self.wmstatsReader.defaultStale = {}
        self.wmstatsReader.reqDB.defaultStale = {}
        return

    def tearDown(self):
        """
        _tearDown_

        Tear down the Couch databases created in setUp.
        """
        self.testInit.tearDownCouch()

    def testWMStatsWriter(self):
        """
        Insert one request, update its status and properties through
        RequestDBWriter, and check that every WMStatsReader query returns
        exactly that request. Updates against a non-existent request must
        report 'Error: document not found'.
        """
        schema = generate_reqmgr_schema()
        requestName = schema[0]['RequestName']
        expectedNames = [requestName]

        result = self.reqDBWriter.insertGenericRequest(schema[0])
        self.assertEqual(result[0]['ok'], True, 'insert fail')

        result = self.reqDBWriter.updateRequestStatus(requestName, "failed")
        self.assertEqual(result, 'OK', 'update fail')

        result = self.reqDBWriter.updateRequestStatus("not_exist_schema", "assigned")
        self.assertEqual(result, 'Error: document not found')

        result = self.reqDBWriter.updateRequestProperty(requestName, {"Teams": ['teamA']})
        self.assertEqual(result, 'OK', 'update fail')

        result = self.reqDBWriter.updateRequestProperty("not_exist_schema", {"Teams": ['teamA']})
        self.assertEqual(result, 'Error: document not found')

        totalStats = {'TotalEstimatedJobs': 100, 'TotalInputEvents': 1000,
                      'TotalInputLumis': 1234, 'TotalInputFiles': 5}
        result = self.reqDBWriter.updateRequestProperty(requestName, totalStats)
        self.assertEqual(result, 'OK', 'update fail')

        # Applying the same update a second time must still succeed.
        result = self.reqDBWriter.updateRequestProperty(requestName, totalStats)
        self.assertEqual(result, 'OK', 'update fail')

        result = self.reqDBWriter.updateRequestProperty("not_exist_schema", totalStats)
        self.assertEqual(result, 'Error: document not found')

        spec1 = newWorkload(requestName)
        production = spec1.newTask("Production")
        production.setTaskType("Merge")
        production.setSiteWhitelist(['TEST_SITE'])
        properties = {"RequestPriority": spec1.priority(),
                      'SiteWhitelist': spec1.getTopLevelTask()[0].siteWhitelist(),
                      'OutputDatasets': spec1.listOutputDatasets()}
        result = self.reqDBWriter.updateRequestProperty(spec1.name(), properties)
        self.assertEqual(result, 'OK', 'update fail')

        spec2 = newWorkload("not_exist_schema")
        production = spec2.newTask("Production")
        production.setTaskType("Merge")
        properties = {"RequestPriority": spec2.priority(),
                      'SiteWhitelist': spec2.getTopLevelTask()[0].siteWhitelist(),
                      'OutputDatasets': spec2.listOutputDatasets()}
        result = self.reqDBWriter.updateRequestProperty(spec2.name(), properties)
        self.assertEqual(result, 'Error: document not found')

        # keys() is wrapped in list() so the comparison against a list also
        # holds on Python 3, where keys() returns a view object.
        requests = self.wmstatsReader.getRequestByStatus(["failed"], jobInfoFlag=False, legacyFormat=True)
        self.assertEqual(list(requests.keys()), expectedNames)

        requestCollection = RequestInfoCollection(requests)
        result = requestCollection.getJSONData()
        self.assertEqual(list(result.keys()), expectedNames)

        requests = self.wmstatsReader.getActiveData()
        self.assertEqual(list(requests.keys()), expectedNames)
        requests = self.wmstatsReader.getRequestByStatus(["failed"])
        self.assertEqual(list(requests.keys()), expectedNames)

        requests = self.wmstatsReader.getRequestSummaryWithJobInfo(requestName)
        self.assertEqual(list(requests.keys()), expectedNames)
Example #53
0
def gatherWMDataMiningStats(wmstatsUrl, reqmgrUrl, wmMiningUrl,
                            mcmUrl, mcmCert, mcmKey, tmpDir,
                            archived = False, log = logging.info):
    """
    Build one summary document per workflow and queue it in the data-mining
    Couch database whenever it differs from the stored copy.

    For every request returned by WMStats the summary merges: ReqMgr fields
    (run whitelist, filter efficiency), McM history/request data, basic
    request parameters, a derived output tier, event/lumi completion
    milestones, job timing and status transition times.

    :param wmstatsUrl: WMStats couch URL, passed to WMStatsReader.
    :param reqmgrUrl: ReqMgr couch URL, passed to WMStatsReader and also
        opened directly to read the request documents.
    :param wmMiningUrl: couch URL of the database the summaries are queued
        in and committed to.
    :param mcmUrl: forwarded to McM as ``url``.
    :param mcmCert: forwarded to McM as ``cert``.
    :param mcmKey: forwarded to McM as ``key``.
    :param tmpDir: forwarded to McM as ``tmpDir``.
    :param archived: when True, process the '*-archived' states without job
        information; otherwise process WMStatsReader.ACTIVE_STATUS with it.
    :param log: logger-like object providing ``info`` and ``error``.
    :return: None
    """
    # The historical default (logging.info) is a plain function without
    # .info/.error, so every log call below would raise AttributeError.
    # Fall back to the logging module, which offers the same methods.
    if not (hasattr(log, 'info') and hasattr(log, 'error')):
        log = logging

    server, database = splitCouchServiceURL(wmMiningUrl)
    analyticsServer = CouchServer(server)
    couchdb = analyticsServer.connectDatabase(database)

    WMStats = WMStatsReader(wmstatsUrl, reqmgrUrl, reqdbCouchApp = "ReqMgr")

    reqMgrServer, reqMgrDB = splitCouchServiceURL(reqmgrUrl)

    reqMgr = CouchServer(reqMgrServer).connectDatabase(reqMgrDB, False)

    if archived:
        funcName = "Archived Requests"
        checkStates = ['normal-archived', 'rejected-archived', 'aborted-archived']
        jobInfoFlag = False
    else:
        funcName = "Active Requests"
        checkStates = WMStatsReader.ACTIVE_STATUS
        jobInfoFlag = True

    log.info("%s: Getting job information from %s and %s. Please wait." % (
                  funcName, wmstatsUrl, reqmgrUrl))

    requests = WMStats.getRequestByStatus(checkStates, jobInfoFlag = jobInfoFlag, legacyFormat = True)

    requestCollection = RequestInfoCollection(requests)
    result = requestCollection.getJSONData()
    requestsDict = requestCollection.getData()
    log.info("%s: Total %s requests retrieved\n" % (funcName, len(result)))

    # Request status -> report key holding the time of that transition
    statusTimeKeys = {
        'new': 'newTime',
        'assignment-approved': 'approvedTime',
        'assigned': 'assignedTime',
        'acquired': 'acquireTime',
        'completed': 'completedTime',
        'closed-out': 'closeoutTime',
        'announced': 'announcedTime',
        'normal-archived': 'archivedTime',
    }
    # Completion milestones (in percent) whose first-reached time is recorded
    milestones = (1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 99, 100)

    report = {}
    nMCMCalls = 0
    with McM(cert=mcmCert, key=mcmKey, url=mcmUrl, tmpDir=tmpDir) as mcm:
        for wf in result.keys():

            # Store a copy of the CouchDB document so we can compare later before updating
            if couchdb.documentExists(wf):
                oldCouchDoc = couchdb.document(wf)
            else:
                oldCouchDoc = CouchDoc(id=wf)

            newCouchDoc = copy.deepcopy(oldCouchDoc)
            ancientCouchDoc = copy.deepcopy(oldCouchDoc)
            # report[wf] is the same object as oldCouchDoc, so updates to
            # either one are visible through the other.
            report[wf] = oldCouchDoc
            # FIXME: remove report, only have two instances of couchDoc

            if 'filterEfficiency' not in oldCouchDoc or 'runWhiteList' not in oldCouchDoc:
                runWhiteList = []
                filterEfficiency = None
                try:
                    rmDoc = reqMgr.document(wf)
                    runWhiteList = rmDoc.get('RunWhiteList', [])
                    filterEfficiency = rmDoc.get('FilterEfficiency', None)
                except Exception:
                    pass # ReqMgr no longer has the workflow
                report[wf].update({'filterEfficiency':filterEfficiency, 'runWhiteList':runWhiteList})

            if ('mcmTotalEvents' not in oldCouchDoc or
                'mcmApprovalTime' not in oldCouchDoc or
                oldCouchDoc.get('mcmTotalEvents', 'Unknown') == 'Unknown' or
                oldCouchDoc.get('mcmApprovalTime', 'Unknown') == 'Unknown'):

                prepID = oldCouchDoc.get('prepID', None)
                # NOTE(review): maxMCMCalls is not defined in this function;
                # presumably a module-level constant -- confirm.
                if prepID and nMCMCalls <= maxMCMCalls:
                    log.info("Trying to update McM info for %s, PREPID %s" % (wf, prepID))
                    # Get information from McM. Don't call too many times, can take a long time
                    nMCMCalls += 1
                    try:
                        mcmHistory = mcm.getHistory(prepID = prepID)
                        if 'mcmApprovalTime' not in oldCouchDoc:
                            report[wf].update({'mcmApprovalTime':'NoMcMData'})
                        found = False
                        for entry in mcmHistory:
                            if entry['action'] == 'set status' and entry['step'] == 'announced':
                                dateString = entry['updater']['submission_date']
                                dt = datetime.strptime(dateString, '%Y-%m-%d-%H-%M')
                                report[wf].update({'mcmApprovalTime':time.mktime(dt.timetuple())})
                                found = True
                        if not found:
                            log.error("History found but no approval time for %s" % wf)
                    except McMNoDataError:
                        log.error("Setting NoMcMData for %s" % wf)
                        report[wf].update({'mcmApprovalTime':'NoMcMData'})
                    except (RuntimeError, IOError) as ex:
                        log.error("%s getting history from McM for PREP ID %s. May be transient and/or SSO problem." %
                            (type(ex), prepID))
                    except Exception as ex:
                        log.error("%s getting history from McM for PREP ID %s. Unknown error." %
                            (type(ex), prepID))

                    try:
                        mcmRequest = mcm.getRequest(prepID = prepID)
                        report[wf].update({'mcmTotalEvents': mcmRequest.get('total_events', 'NoMcMData')})
                    except (RuntimeError, IOError) as ex:
                        log.error("%s getting request from McM for PREP ID %s. May be transient and/or SSO problem." %
                            (type(ex), prepID))
                    except Exception as ex:
                        log.error("%s getting request from McM for PREP ID %s. Unknown error." %
                            (type(ex), prepID))

            # Basic parameters of the workflow
            priority = requests[wf]['priority']
            requestType = requests[wf]['request_type']
            targetLumis = requests[wf].get('input_lumis', 0)
            targetEvents = requests[wf].get('input_events', 0)
            campaign = requests[wf]['campaign']
            prep_id = requests[wf].get('prep_id', None)
            outputdatasets = requests[wf].get('outputdatasets', [])
            statuses = requests[wf].get('request_status', [])

            if not statuses:
                log.error("Could not find any status from workflow: %s" % wf) # Should not happen but it does.

            # Remove a single task_ from the start of PREP ID if it exists.
            # str.replace returns a new string, so the result must be rebound.
            if prep_id and prep_id.startswith('task_'):
                prep_id = prep_id.replace('task_', '', 1)

            # Can be an empty list, full list, empty string, or non-empty string!
            inputdataset = requests[wf].get('inputdataset', "")
            if isinstance(inputdataset, list):
                if inputdataset:
                    inputdataset = inputdataset[0]
                else:
                    inputdataset = ''

            # Derive a single output tier from the last path component of the
            # input dataset and of each output dataset.
            outputTier = 'Unknown'
            try:
                outputTiers = []
                for ds in outputdatasets:
                    if isinstance(ds, list):
                        outputTiers.append(ds[0].split('/')[-1])
                    else:
                        outputTiers.append(ds.split('/')[-1])
            except Exception:
                log.error("Could not decode outputdatasets: %s" % outputdatasets) # Sometimes is a list of lists, not just a list. Bail
            if inputdataset:
                inputTier = inputdataset.split('/')[-1]
                if inputTier in ['GEN']:
                    outputTier = 'LHE'
                elif inputTier in ['RAW', 'RECO']:
                    outputTier = 'AOD'
                elif inputTier in ['GEN-SIM']:
                    outputTier = 'AODSIM'
                elif 'AODSIM' in outputTiers:
                    outputTier = 'AODSIM'

            else:
                if len(outputTiers) == 1 and 'GEN' in outputTiers:
                    if 'STEP0ATCERN' in wf:
                        outputTier = 'STEP0'
                    else:
                        outputTier = 'FullGen'
                elif 'GEN-SIM' in outputTiers and 'AODSIM' in outputTiers and requestType == 'TaskChain':
                    outputTier = 'RelVal'
                elif 'RECO' in outputTiers and requestType == 'TaskChain':
                    outputTier = 'RelVal'
                elif 'GEN-SIM' in outputTiers:
                    outputTier = 'GEN-SIM'
                elif 'AODSIM' in outputTiers:
                    outputTier = 'AODSIM'
                elif 'RECO' in outputTiers:
                    outputTier = 'AOD'
                elif 'AOD' in outputTiers:
                    outputTier = 'AOD'
                else:
                    outputTier = 'GEN-SIM'

            # Calculate completion ratios for events and lumi sections, take minimum for all datasets.
            # 200 is a sentinel meaning "nothing reported yet".
            eventPercent = 200
            lumiPercent = 200
            datasetReports = requestsDict[wf].getProgressSummaryByOutputDataset()
            for dataset in datasetReports:
                dsr = datasetReports[dataset].getReport()
                events = dsr.get('events', 0)
                lumis = dsr.get('totalLumis', 0)
                # Multiply by 100.0 first: with integer operands Python 2
                # would otherwise truncate the ratio to 0 until completion.
                if targetLumis:
                    lumiPercent = min(lumiPercent, lumis * 100.0 / targetLumis)
                if targetEvents:
                    eventPercent = min(eventPercent, events * 100.0 / targetEvents)
            # Anything above 100 (including the untouched sentinel) counts as 0
            if eventPercent > 100:
                eventPercent = 0
            if lumiPercent > 100:
                lumiPercent = 0

            # Sum up all jobs across agents to see if we've run the first, last
            successJobs = 0
            totalJobs = 0
            for agent in result[wf]:
                jobs = result[wf][agent]
                successJobs += jobs['sucess']  # key spelling as read from the job summary data
                totalJobs += jobs['created']
            try:
                if totalJobs and not report[wf].get('firstJobTime', None):
                    report[wf].update({'firstJobTime' : int(time.time())})
                if totalJobs and successJobs == totalJobs and not report[wf].get('lastJobTime', None):
                    report[wf].update({'lastJobTime' : int(time.time())})
            except Exception:
                log.error("Could not record job times for %s" % wf)

            # Figure out current status of workflow and transition times.
            # The last entry in statuses is the current status; for a status
            # that occurs more than once the last update_time wins.
            finalStatus = None
            transitionTimes = {}
            for status in statuses:
                finalStatus = status['status']
                timeKey = statusTimeKeys.get(finalStatus)
                if timeKey:
                    transitionTimes[timeKey] = status['update_time']

            # Only fill in transition times that are not already recorded
            for timeKey, updateTime in transitionTimes.items():
                if updateTime and not report[wf].get(timeKey, None):
                    report[wf].update({timeKey: updateTime})

            try:
                dt = requests[wf]['request_date']
                requestDate = '%4.4d-%2.2d-%2.2d %2.2d:%2.2d:%2.2d' % tuple(dt)
                report[wf].update({'requestDate' : requestDate})
            except Exception:
                pass # request_date missing or not a 6-field date; leave requestDate unset

            report[wf].update({'priority':priority, 'status':finalStatus, 'type':requestType})
            report[wf].update({'totalLumis':targetLumis, 'totalEvents':targetEvents, })
            report[wf].update({'campaign' : campaign, 'prepID' : prep_id, 'outputTier' : outputTier, })
            report[wf].update({'outputDatasets' : outputdatasets, 'inputDataset' : inputdataset, })

            # Record the first time each milestone is reached and remember the
            # highest milestone reached so far.
            report[wf].setdefault('lumiPercents', {})
            report[wf].setdefault('eventPercents', {})
            lumiProgress = 0
            eventProgress = 0
            for percentage in milestones:
                percent = str(percentage)
                percentReported = report[wf]['lumiPercents'].get(percent, None)
                if not percentReported and lumiPercent >= percentage:
                    report[wf]['lumiPercents'][percent] = int(time.time())
                if lumiPercent >= percentage:
                    lumiProgress = percentage

                percentReported = report[wf]['eventPercents'].get(percent, None)
                if not percentReported and eventPercent >= percentage:
                    report[wf]['eventPercents'][percent] = int(time.time())
                if eventPercent >= percentage:
                    eventProgress = percentage

            report[wf].update({'eventProgress' : eventProgress, 'lumiProgress' : lumiProgress,  })

            newCouchDoc.update(report[wf])

            # Queue the updated document for addition if it's changed.
            if ancientCouchDoc != newCouchDoc:
                try:
                    newCouchDoc['updateTime'] = int(time.time())
                    report[wf]['updateTime'] = int(time.time())
                    cjson.encode(newCouchDoc) # Make sure it encodes before trying to queue
                    couchdb.queue(newCouchDoc)
                except Exception:
                    # pformat returns the text; pprint.pprint would print to
                    # stdout and put "None" in the log message.
                    log.error("Failed to queue document:%s \n" % pprint.pformat(newCouchDoc))

    log.info("%s: Finished getting job. wait for the next Cycle" % funcName)
    # Commit all changes to CouchDB
    couchdb.commit()
Example #54
0
def gatherWMDataMiningStats(wmstatsUrl, reqmgrUrl, wmminigUrl, archived = False, log = logging.info):
    """
    Build one summary document per workflow and queue it in the data-mining
    Couch database whenever it differs from the stored copy.

    For every request returned by WMStats the summary merges: ReqMgr fields
    (run whitelist, filter efficiency), basic request parameters, a derived
    output tier, event/lumi completion milestones, job timing and status
    transition times.

    :param wmstatsUrl: WMStats couch URL, passed to WMStatsReader.
    :param reqmgrUrl: ReqMgr couch URL, opened directly to read the request
        documents.
    :param wmminigUrl: couch URL of the database the summaries are queued in
        and committed to.
    :param archived: when True, process the '*-archived' states without job
        information; otherwise process WMStatsReader.ACTIVE_STATUS with it.
    :param log: callable taking one message string (messages carry their own
        'INFO:' / 'ERROR:' prefix).
    :return: None
    """
    server, database = splitCouchServiceURL(wmminigUrl)
    analyticsServer = CouchServer(server)
    couchdb = analyticsServer.connectDatabase(database)

    WMStats = WMStatsReader(wmstatsUrl)

    reqMgrServer, reqMgrDB = splitCouchServiceURL(reqmgrUrl)

    reqMgr = CouchServer(reqMgrServer).connectDatabase(reqMgrDB, False)

    if archived:
        funcName = "Archived Requests"
        checkStates = ['normal-archived', 'rejected-archived', 'aborted-archived']
        jobInfoFlag = False
    else:
        funcName = "Active Requests"
        checkStates = WMStatsReader.ACTIVE_STATUS
        jobInfoFlag = True

    log("INFO: %s: Getting job information from %s and %s. Please wait." % (
                  funcName, wmstatsUrl, reqmgrUrl))

    requests = WMStats.getRequestByStatus(checkStates, jobInfoFlag = jobInfoFlag)

    requestCollection = RequestInfoCollection(requests)
    result = requestCollection.getJSONData()
    requestsDict = requestCollection.getData()
    log("INFO: %s: Total %s requests retrieved\n" % (funcName, len(result)))

    # Request status -> report key holding the time of that transition
    statusTimeKeys = {
        'new': 'newTime',
        'assignment-approved': 'approvedTime',
        'assigned': 'assignedTime',
        'acquired': 'acquireTime',
        'completed': 'completedTime',
        'closed-out': 'closeoutTime',
        'announced': 'announcedTime',
        'normal-archived': 'archivedTime',
    }
    # Completion milestones (in percent) whose first-reached time is recorded
    milestones = (1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 99, 100)

    report = {}
    for wf in result.keys():

        # Store a copy of the CouchDB document so we can compare later before updating
        if couchdb.documentExists(wf):
            oldCouchDoc = couchdb.document(wf)
        else:
            oldCouchDoc = CouchDoc(id=wf)

        newCouchDoc = copy.deepcopy(oldCouchDoc)
        ancientCouchDoc = copy.deepcopy(oldCouchDoc)
        # report[wf] is the same object as oldCouchDoc, so updates to either
        # one are visible through the other.
        report[wf] = oldCouchDoc
        # FIXME: remove report, only have two instances of couchDoc

        # "in" instead of dict.has_key(), which no longer exists on Python 3
        if 'filterEfficiency' not in oldCouchDoc or 'runWhiteList' not in oldCouchDoc:
            runWhiteList = []
            filterEfficiency = None
            try:
                rmDoc = reqMgr.document(wf)
                runWhiteList = rmDoc.get('RunWhiteList', [])
                filterEfficiency = rmDoc.get('FilterEfficiency', None)
            except Exception:
                pass # ReqMgr no longer has the workflow
            report[wf].update({'filterEfficiency':filterEfficiency, 'runWhiteList':runWhiteList})

        # Basic parameters of the workflow
        priority = requests[wf]['priority']
        requestType = requests[wf]['request_type']
        targetLumis = requests[wf].get('input_lumis', 0)
        targetEvents = requests[wf].get('input_events', 0)
        campaign = requests[wf]['campaign']
        prep_id = requests[wf].get('prep_id', None)
        outputdatasets = requests[wf].get('outputdatasets', [])

        # Can be an empty list, full list, empty string, or non-empty string!
        # A missing key is treated like an empty string instead of aborting
        # the whole run with a KeyError.
        inputdataset = requests[wf].get('inputdataset', '')
        if isinstance(inputdataset, list):
            if inputdataset:
                inputdataset = inputdataset[0]
            else:
                inputdataset = ''

        # Derive a single output tier from the last path component of the
        # input dataset and of each output dataset.
        outputTier = 'Unknown'
        try:
            outputTiers = []
            for ds in outputdatasets:
                if isinstance(ds, list):
                    outputTiers.append(ds[0].split('/')[-1])
                else:
                    outputTiers.append(ds.split('/')[-1])
        except Exception:
            log("ERROR: Could not decode outputdatasets: %s" % outputdatasets) # Sometimes is a list of lists, not just a list. Bail
        if inputdataset:
            inputTier = inputdataset.split('/')[-1]
            if inputTier in ['GEN']:
                outputTier = 'LHE'
            elif inputTier in ['RAW', 'RECO']:
                outputTier = 'AOD'
            elif inputTier in ['GEN-SIM']:
                outputTier = 'AODSIM'
            elif 'AODSIM' in outputTiers:
                outputTier = 'AODSIM'

        else:
            if len(outputTiers) == 1 and 'GEN' in outputTiers:
                if 'STEP0ATCERN' in wf:
                    outputTier = 'STEP0'
                else:
                    outputTier = 'FullGen'
            elif 'GEN-SIM' in outputTiers and 'AODSIM' in outputTiers and requestType == 'TaskChain':
                outputTier = 'RelVal'
            elif 'RECO' in outputTiers and requestType == 'TaskChain':
                outputTier = 'RelVal'
            elif 'GEN-SIM' in outputTiers:
                outputTier = 'GEN-SIM'
            elif 'AODSIM' in outputTiers:
                outputTier = 'AODSIM'
            elif 'RECO' in outputTiers:
                outputTier = 'AOD'
            elif 'AOD' in outputTiers:
                outputTier = 'AOD'
            else:
                outputTier = 'GEN-SIM'

        # Calculate completion ratios for events and lumi sections, take minimum for all datasets.
        # 200 is a sentinel meaning "nothing reported yet".
        eventPercent = 200
        lumiPercent = 200
        datasetReports = requestsDict[wf].getProgressSummaryByOutputDataset()
        for dataset in datasetReports:
            dsr = datasetReports[dataset].getReport()
            events = dsr.get('events', 0)
            lumis = dsr.get('totalLumis', 0)
            # Multiply by 100.0 first: with integer operands Python 2 would
            # otherwise truncate the ratio to 0 until completion.
            if targetLumis:
                lumiPercent = min(lumiPercent, lumis * 100.0 / targetLumis)
            if targetEvents:
                eventPercent = min(eventPercent, events * 100.0 / targetEvents)
        # Anything above 100 (including the untouched sentinel) counts as 0
        if eventPercent > 100:
            eventPercent = 0
        if lumiPercent > 100:
            lumiPercent = 0

        # Sum up all jobs across agents to see if we've run the first, last
        successJobs = 0
        totalJobs = 0
        for agent in result[wf]:
            jobs = result[wf][agent]
            successJobs += jobs['sucess']  # key spelling as read from the job summary data
            totalJobs += jobs['created']
        try:
            if totalJobs and not report[wf].get('firstJobTime', None):
                report[wf].update({'firstJobTime' : int(time.time())})
            if totalJobs and successJobs == totalJobs and not report[wf].get('lastJobTime', None):
                report[wf].update({'lastJobTime' : int(time.time())})
        except Exception:
            log("ERROR: Could not record job times for %s" % wf)

        # Figure out current status of workflow and transition times.
        # The last entry is the current status; for a status that occurs more
        # than once the last update_time wins.
        finalStatus = None
        transitionTimes = {}
        for status in requests[wf]['request_status']:
            finalStatus = status['status']
            timeKey = statusTimeKeys.get(finalStatus)
            if timeKey:
                transitionTimes[timeKey] = status['update_time']

        # Only fill in transition times that are not already recorded
        for timeKey, updateTime in transitionTimes.items():
            if updateTime and not report[wf].get(timeKey, None):
                report[wf].update({timeKey: updateTime})

        try:
            dt = requests[wf]['request_date']
            requestDate = '%4.4d-%2.2d-%2.2d %2.2d:%2.2d:%2.2d' % tuple(dt)
            report[wf].update({'requestDate' : requestDate})
        except Exception:
            pass # request_date missing or not a 6-field date; leave requestDate unset

        report[wf].update({'priority':priority, 'status':finalStatus, 'type':requestType})
        report[wf].update({'totalLumis':targetLumis, 'totalEvents':targetEvents, })
        report[wf].update({'campaign' : campaign, 'prepID' : prep_id, 'outputTier' : outputTier, })
        report[wf].update({'outputDatasets' : outputdatasets, 'inputDataset' : inputdataset, })

        # Record the first time each milestone is reached and remember the
        # highest milestone reached so far.
        report[wf].setdefault('lumiPercents', {})
        report[wf].setdefault('eventPercents', {})
        lumiProgress = 0
        eventProgress = 0
        for percentage in milestones:
            percent = str(percentage)
            percentReported = report[wf]['lumiPercents'].get(percent, None)
            if not percentReported and lumiPercent >= percentage:
                report[wf]['lumiPercents'][percent] = int(time.time())
            if lumiPercent >= percentage:
                lumiProgress = percentage

            percentReported = report[wf]['eventPercents'].get(percent, None)
            if not percentReported and eventPercent >= percentage:
                report[wf]['eventPercents'][percent] = int(time.time())
            if eventPercent >= percentage:
                eventProgress = percentage

        report[wf].update({'eventProgress' : eventProgress, 'lumiProgress' : lumiProgress,  })

        newCouchDoc.update(report[wf])

        # Queue the updated document for addition if it's changed.
        if ancientCouchDoc != newCouchDoc:
            try:
                newCouchDoc['updateTime'] = int(time.time())
                report[wf]['updateTime'] = int(time.time())
                cjson.encode(newCouchDoc) # Make sure it encodes before trying to queue
                couchdb.queue(newCouchDoc)
            except Exception:
                # pformat returns the text; pprint.pprint would print to
                # stdout and put "None" in the log message.
                log("ERROR: Failed to queue document:%s \n" % pprint.pformat(newCouchDoc))

    log("INFO: %s: Finished getting job. wait for the next Cycle" % funcName)
    # Commit all changes to CouchDB
    couchdb.commit()