Example #1
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue
        self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, 
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL, 
                                                 "WMStatsAgent")
        
        if hasattr(self.config, "Tier0Feeder"):
            #use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL
        
        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
        #TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
        
        if self.pluginName != None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname = self.pluginName)
Example #2
def assignRequest(requestName, teamName, prodMgr=None, wmstatUrl=None):
    """
    _assignRequest_

    Assign a request to a team.

    This does the following:

    - Changes the status to assigned
    - Creates an association to the team provided
    - Optionally associates the request to a prod mgr instance

    """

    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)

    teamId = factory(classname="Team.ID").execute(teamName)
    if teamId is None:
        msg = "Team named %s not known in database. " % teamName
        msg += "Failed to assign request %s to team %s" % (requestName, teamName)
        raise RuntimeError(msg)

    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateTeam(requestName, teamName)

    assigner = factory(classname="Assignment.New")
    assigner.execute(reqId, teamId)

    changeRequestStatus(requestName, "assigned", priority=None, wmstatUrl=wmstatUrl)

    if prodMgr != None:
        addPM = factory(classname="Progress.ProdMgr")
        addPM.execute(reqId, prodMgr)
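A hedged call sketch, not part of the original listing: the request name, team and WMStats URL below are placeholders, and assignRequest is assumed to be in scope as defined above.

# Illustrative values only; any real request and team must already exist in the ReqMgr database.
assignRequest("maxa_RequestString-OVERRIDE-ME_130306_205649_8066",
              teamName="production",
              prodMgr=None,
              wmstatUrl="http://localhost:5984/wmstats")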
Example #3
    def setup(self, parameters):
        """
        Called at startup
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(
            self.config.TaskArchiver.localWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(
            self.config.TaskArchiver.centralWMStatsURL)

        if self.useReqMgrForCompletionCheck:
            self.deletableStates = ["announced"]
            self.centralCouchDBWriter = WMStatsWriter(
                self.config.TaskArchiver.centralWMStatsURL)
            self.reqmgrSvc = RequestManager(
                {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableStates = ["completed"]
            self.centralCouchDBWriter = self.wmstatsCouchDB

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                            jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                            jobDBName)

        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(
            statSummaryDBName)
Example #4
class HeartbeatMonitorBase(CherryPyPeriodicTask):

    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list

    def setConcurrentTasks(self, config):
        """
        sets the list of function references for concurrent tasks
        """
        self.concurrentTasks = [{'func': self.reportToWMStats, 'duration': config.heartbeatCheckDuration}]

    def reportToWMStats(self, config):
        """
        Report thread status and heartbeat.
        Additional monitoring information can also be reported by overriding the addAdditionalMonitorReport method.
        """
        self.logger.info("Checking Thread status...")
        downThreadInfo = self.logDB.wmstats_down_components_report(self.threadList)
        monitorInfo = self.addAdditionalMonitorReport(config)
        downThreadInfo.update(monitorInfo)
        wqSummaryDoc = convertToServiceCouchDoc(downThreadInfo, config.log_reporter)
        self.centralWMStats.updateAgentInfo(wqSummaryDoc)

        self.logger.info("Uploaded to WMStats...")

        return

    def addAdditionalMonitorReport(self, config):
        """
        Add an additional report to the heartbeat report.
        Override this method with each application's monitoring info. (It needs to follow the format displayed in wmstats.)
        """
        return {}
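The base class is meant to be subclassed; a minimal sketch (the class name and the reported keys are illustrative, not from the listing) of how an application could override the hook:

class ExampleHeartbeatMonitor(HeartbeatMonitorBase):
    """Hypothetical subclass; the thread checks and WMStats upload are inherited from the base class above."""

    def addAdditionalMonitorReport(self, config):
        # Extra key/value pairs; reportToWMStats() merges them into the thread-status
        # dict before it is converted into the service couch document.
        return {"pending_requests": 0, "last_cycle_duration": 42}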
Example #5
 def __init__(self, rest, config):
     super(HeartbeatMonitorBase, self).__init__(config)
     self.centralWMStats = WMStatsWriter(config.wmstats_url)
     self.threadList = config.thread_list
     self.userAMQ = getattr(config, "user_amq", None)
     self.passAMQ = getattr(config, "pass_amq", None)
     self.postToAMQ = getattr(config, "post_to_amq", False)
     self.topicAMQ = getattr(config, "topic_amq", None)
     self.hostPortAMQ = getattr(config, "host_port_amq", None)
Example #6
def saveWorkload(helper, workload, wmstatUrl=None):
    """ Saves the changes to this workload """
    if workload.startswith('http'):
        helper.saveCouchUrl(workload)
        if wmstatUrl:
            wmstatSvc = WMStatsWriter(wmstatUrl)
            wmstatSvc.updateFromWMSpec(helper)
    else:
        helper.save(workload)
Example #7
def saveWorkload(helper, workload, wmstatUrl = None):
    """ Saves the changes to this workload """
    if workload.startswith('http'):
        helper.saveCouchUrl(workload)
        if wmstatUrl:
            wmstatSvc = WMStatsWriter(wmstatUrl)
            wmstatSvc.updateFromWMSpec(helper)
    else:
        helper.save(workload)
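A hedged usage sketch (the helper object, spec URL and WMStats URL are placeholders): when the workload argument is a Couch URL, the spec is saved back to CouchDB and WMStats is refreshed from it; otherwise it is written to the given path.

# 'helper' stands for an already-loaded workload helper object.
saveWorkload(helper,
             "http://localhost:5984/reqmgr_workload_cache/ExampleRequest/spec",
             wmstatUrl="http://localhost:5984/wmstats")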
Example #8
class CleanUpTask(CherryPyPeriodicTask):
    """
    This class is used for both T0WMStats and WMStats
    controlled by config.reqdb_couch_app value
    """
    def __init__(self, rest, config):

        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp=config.reqdb_couch_app)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which run concurrently
        """
        self.concurrentTasks = [{
            'func':
            self.cleanUpOldRequests,
            'duration': (config.DataKeepDays * 24 * 60 * 60)
        }, {
            'func': self.cleanUpArchivedRequests,
            'duration': config.archivedCleanUpDuration
        }]

    def cleanUpOldRequests(self, config):
        """
        clean up wmstats data older than the given number of days
        """
        self.logger.info("deleting %s hours old docs",
                         (config.DataKeepDays * 24))
        result = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted", result)
        return

    def cleanUpArchivedRequests(self, config):
        """
        Loop through the workflows in CouchDB; if a workflow is archived, delete all of its data from CouchDB.
        """
        self.logger.info("getting archived data")
        requestNames = self.wmstatsDB.getArchivedRequests()
        self.logger.info("archived list %s", requestNames)

        for req in requestNames:
            self.logger.info("Deleting data for: %s", req)
            try:
                result = self.wmstatsDB.deleteDocsByWorkflow(req)
            except Exception as ex:
                self.logger.error("deleting %s failed: %s", req, str(ex))
                for line in traceback.format_exc().rstrip().split("\n"):
                    self.logger.error(" " + line)
            else:
                if result is None:
                    self.logger.info("there were no documents to delete.")
                else:
                    self.logger.info("%s docs deleted", len(result))
        return
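The task only reads a handful of configuration attributes; a hedged sketch of what they might look like (only the attribute names come from the code above, all values are illustrative assumptions):

class ExampleCleanUpConfig(object):
    """Hypothetical config object for CleanUpTask."""
    wmstats_url = "http://localhost:5984/wmstats"                # WMStats couch database
    reqmgrdb_url = "http://localhost:5984/reqmgr_workload_cache" # request couch database
    reqdb_couch_app = "ReqMgr"          # assumed value; switches between WMStats and T0WMStats behaviour
    DataKeepDays = 90                   # docs older than this many days are deleted
    archivedCleanUpDuration = 60 * 60   # seconds between cleanUpArchivedRequests runs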
Example #9
 def setUp(self):
     """
     _setUp_
     """
     self.schema = []
     self.couchApps = ["WMStats"]
     self.testInit = TestInitCouchApp('WorkQueueServiceTest')
     self.testInit.setLogging()
     self.testInit.setDatabaseConnection()
     self.testInit.setSchema(customModules=self.schema, useDefault=False)
     self.testInit.setupCouch('wmstats_t', *self.couchApps)
     self.wmstatsWriter = WMStatsWriter(self.testInit.couchUrl, 'wmstats_t')
     return
Example #10
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()
Example #11
class CleanUpTask(CherryPyPeriodicTask):
    """
    This class is used for both T0WMStats and WMStats
    controlled by config.reqdb_couch_app value
    """

    def __init__(self, rest, config):

        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url, reqdbURL=config.reqmgrdb_url,
                              reqdbCouchApp=config.reqdb_couch_app)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which run concurrently
        """
        self.concurrentTasks = [{'func': self.cleanUpOldRequests, 'duration': (config.DataKeepDays * 24 * 60 * 60)},
                                {'func': self.cleanUpArchivedRequests, 'duration': config.archivedCleanUpDuration}]

    def cleanUpOldRequests(self, config):
        """
        clean up wmstats data older than the given number of days
        """
        self.logger.info("deleting %s hours old docs", (config.DataKeepDays * 24))
        result = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted", result)
        return

    def cleanUpArchivedRequests(self, config):
        """
        Loop through the workflows in CouchDB; if a workflow is archived, delete all of its data from CouchDB.
        """
        self.logger.info("getting archived data")
        requestNames = self.wmstatsDB.getArchivedRequests()
        self.logger.info("archived list %s", requestNames)

        for req in requestNames:
            self.logger.info("Deleting data for: %s", req)
            try:
                result = self.wmstatsDB.deleteDocsByWorkflow(req)
            except Exception as ex:
                self.logger.error("deleting %s failed: %s", req, str(ex))
                for line in traceback.format_exc().rstrip().split("\n"):
                    self.logger.error(" " + line)
            else:
                if result is None:
                    self.logger.info("there were no documents to delete.")
                else:
                    self.logger.info("%s docs deleted", len(result))
        return
Example #12
 def setup(self, parameters):
     """
     Called at startup
     """
     # set the connection for local couchDB call
     self.useReqMgrForCompletionCheck   = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
     self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)
     self.centralCouchDBReader = WMStatsReader(self.config.TaskArchiver.centralWMStatsURL)
     jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
     jobDBName = self.config.JobStateMachine.couchDBName
     self.jobCouchdb  = CouchServer(jobDBurl)
     self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
     self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
Example #13
    def processInboundWork(self, inbound_work = None, throw = False):
        """Retrieve work from inbox, split and store
        If request passed then only process that request
        """
        if self.params['LocalQueueFlag']:
            self.backend.fixConflicts() # db should be consistent

        result = []
        if not inbound_work:
            inbound_work = self.backend.getElementsForSplitting()
        for inbound in inbound_work:
            # Check we haven't already split the work
            work = self.backend.getElementsForParent(inbound)
            try:
                if work:
                    self.logger.info('Request "%s" already split - Resuming' % inbound['RequestName'])
                else:
                    work, totalStats = self._splitWork(inbound['WMSpec'], None, inbound['Inputs'], inbound['Mask'])
                    self.backend.insertElements(work, parent = inbound) # if this fails, rerunning will pick up here
                    # save inbound work to signal we have completed queueing

                    # add the total work on wmstat summary
                    self.backend.updateInboxElements(inbound.id, Status = 'Acquired')

                    if not self.params.get('LocalQueueFlag') and self.params.get('WMStatsCouchUrl'):
                        # only update global stats for global queue
                        try:
                            wmstatSvc = WMStatsWriter(self.params.get('WMStatsCouchUrl'))
                            wmstatSvc.insertTotalStats(inbound['WMSpec'].name(), totalStats)
                        except Exception as ex:
                            self.logger.info('Error publishing %s to WMStats: %s' % (inbound['RequestName'], str(ex)))

            except TERMINAL_EXCEPTIONS as ex:
                self.logger.info('Failing workflow "%s": %s' % (inbound['RequestName'], str(ex)))
                self.backend.updateInboxElements(inbound.id, Status = 'Failed')
                if throw:
                    raise
            except Exception as ex:
                # if request has been failing for too long permanently fail it.
                # last update time was when element was assigned to this queue
                if (float(inbound.updatetime) + self.params['QueueRetryTime']) < time.time():
                    self.logger.info('Failing workflow "%s" as not queued in %d secs: %s' % (inbound['RequestName'],
                                                                                             self.params['QueueRetryTime'],
                                                                                             str(ex)))
                    self.backend.updateInboxElements(inbound.id, Status = 'Failed')
                else:
                    self.logger.info('Exception splitting work for wmspec "%s": %s' % (inbound['RequestName'], str(ex)))
                if throw:
                    raise
                continue
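The totalStats payload pushed to WMStats above has the same shape as the one used in the WMStatsWriter unit test later in this listing; a hedged standalone sketch (workflow name, URL and numbers are placeholders, with WMStatsWriter assumed to be importable as in the snippet above):

# Keys mirror the unit test shown in a later example of this listing.
totalStats = {'total_jobs': 100, 'input_events': 1000,
              'input_lumis': 1234, 'input_num_files': 5}
wmstatSvc = WMStatsWriter("http://localhost:5984/wmstats")
wmstatSvc.insertTotalStats("ExampleWorkflow_130306_205649_8066", totalStats)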
Example #14
    def setup(self, parameters):
        """
        Called at startup
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(self.config.TaskArchiver,
                                         'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(
            self.config.TaskArchiver.localWMStatsURL, "WMStatsAgent")

        #TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(
            self.config.AnalyticsDataCollector.centralRequestDBURL,
            couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        if self.useReqMgrForCompletionCheck:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.centralRequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            if self.config.TaskArchiver.reqmgr2Only:
                self.reqmgr2Svc = ReqMgr(
                    self.config.TaskArchiver.ReqMgr2ServiceURL)
            else:
                #TODO: remove this for reqmgr2
                self.reqmgrSvc = RequestManager(
                    {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.localT0RequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                            jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                            jobDBName)

        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(
            statSummaryDBName)
Example #15
def assignRequest(requestName,
                  teamName,
                  priorityModifier=0,
                  prodMgr=None,
                  wmstatUrl=None):
    """
    _assignRequest_

    Assign a request to a team.

    This does the following:

    - Changes the status to assigned
    - Creates an association to the team provided
    - Optionally associates the request to a prod mgr instance
    - Optionally sets the priority modifier for the team (allows the same request to be
      shared between two teams with different priorities)


    """

    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)

    teamId = factory(classname="Team.ID").execute(teamName)
    if teamId is None:
        msg = "Team named %s not known in database. " % teamName
        msg += "Failed to assign request %s to team %s" % (requestName,
                                                           teamName)
        raise RuntimeError(msg)

    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateTeam(requestName, teamName)

    assigner = factory(classname="Assignment.New")
    assigner.execute(reqId, teamId, priorityModifier)

    changeRequestStatus(requestName,
                        'assigned',
                        priority=None,
                        wmstatUrl=wmstatUrl)

    if prodMgr != None:
        addPM = factory(classname="Progress.ProdMgr")
        addPM.execute(reqId, prodMgr)

    return
Example #16
 def setup(self, parameters):
     """
     Called at startup
     """
     # set the connection for local couchDB call
     self.useReqMgrForCompletionCheck   = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
     self.centralCouchDBReader = WMStatsReader(self.config.TaskArchiver.centralWMStatsURL)
     
     if self.useReqMgrForCompletionCheck:
         self.deletableStates = ["announced"]
         self.centralCouchDBWriter = WMStatsWriter(self.config.TaskArchiver.centralWMStatsURL)
         self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
     else:
         # Tier0 case
         self.deletableStates = ["completed"]
         self.centralCouchDBWriter = self.wmstatsCouchDB
     
     jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
     jobDBName = self.config.JobStateMachine.couchDBName
     self.jobCouchdb  = CouchServer(jobDBurl)
     self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
     self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
     
     statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
     self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)
Example #17
 def setup(self, parameters):
     """
     Called at startup
     """
     # set the connection for local couchDB call
     self.useReqMgrForCompletionCheck   = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)
     
     #TODO: we might need to use local db for Tier0
     self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL, 
                                                couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
     
     if self.useReqMgrForCompletionCheck:
         self.deletableStates = ["announced"]
         self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL, 
                                                couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
         #TODO: remove this for reqmgr2
         self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
     else:
         # Tier0 case
         self.deletableStates = ["completed"]
         # use local for update
         self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL, 
                                                couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
     
     jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
     jobDBName = self.config.JobStateMachine.couchDBName
     self.jobCouchdb  = CouchServer(jobDBurl)
     self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
     self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
     
     statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
     self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)
Example #18
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, 
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL, 
                                                 appName="WMStatsAgent")
        
        if hasattr(self.config, "Tier0Feeder"):
            #use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL
        
        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
        #TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
        
        if self.pluginName != None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname = self.pluginName)
Example #19
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # set the connection to local queue
        self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL)
        logging.info("Setting the replication to central monitor ...")
        self.localSummaryCouchDB.replicate(self.config.AnalyticsDataCollector.centralWMStatsURL)
        
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        
        if self.pluginName != None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname = self.pluginName)
Example #20
class CleanUpTask(CherryPyPeriodicTask):
    """
    This class is used for both T0WMStats and WMStats
    controlled by config.reqdb_couch_app value
    """
    def __init__(self, rest, config):

        CherryPyPeriodicTask.__init__(self, config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp=config.reqdb_couch_app)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which run concurrently
        """
        self.concurrentTasks = [{
            'func':
            self.cleanUpOldRequests,
            'duration': (config.DataKeepDays * 24 * 60 * 60)
        }, {
            'func': self.cleanUpArchivedRequests,
            'duration': config.archivedCleanUpDuration
        }]

    def cleanUpOldRequests(self, config):
        """
        clean up wmstats data older than the given number of days
        """
        self.logger.info("deleting %s hours old docs" %
                         (config.DataKeepDays * 24))
        result = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted" % result)
        return

    def cleanUpArchivedRequests(self, config):
        """
        Loop through the workflows in CouchDB; if a workflow is archived, delete all of its data from CouchDB.
        """

        requestNames = self.wmstatsDB.getArchivedRequests()
        for req in requestNames:
            self.logger.info("deleting %s data" % req)
            result = self.wmstatsDB.deleteDocsByWorkflow(req)
            self.logger.info("%s deleted" % result)
        return
Example #21
class WMStatsTest(unittest.TestCase):
    """
    """
    def setUp(self):
        """
        _setUp_
        """
        self.schema = []
        self.couchApps = ["WMStats"]
        self.testInit = TestInitCouchApp('WorkQueueServiceTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules = self.schema,
                                useDefault = False)
        self.testInit.setupCouch('wmstats_t', *self.couchApps)
        self.wmstatsWriter = WMStatsWriter(self.testInit.couchUrl, 'wmstats_t')
        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        self.testInit.tearDownCouch()

    def testWMStatsWriter(self):
        # test getWork
        schema = generate_reqmgr_schema()
        self.assertEquals(self.wmstatsWriter.insertRequest(schema[0]), 'OK', 'insert fail')
        self.assertEquals(self.wmstatsWriter.updateRequestStatus(schema[0]['RequestName'], "failed"), 'OK', 'update fail')
        self.assertEquals(self.wmstatsWriter.updateRequestStatus("not_exist_schema", "assigned"),
                          'ERROR: request not found - not_exist_schema')
        self.assertEquals(self.wmstatsWriter.updateTeam(schema[0]['RequestName'], 'teamA'), 'OK', 'update fail')
        self.assertEquals(self.wmstatsWriter.updateTeam("not_exist_schema", 'teamA'),
                          'ERROR: request not found - not_exist_schema')
        totalStats = {'total_jobs': 100, 'input_events': 1000, 'input_lumis': 1234, 'input_num_files': 5}
        self.assertEquals(self.wmstatsWriter.insertTotalStats(schema[0]['RequestName'], totalStats), 'INSERTED', 'update fail')
        self.assertEquals(self.wmstatsWriter.insertTotalStats(schema[0]['RequestName'], totalStats), 'UPDATED', 'update fail')
        self.assertEquals(self.wmstatsWriter.insertTotalStats("not_exist_schema", totalStats),
                          'ERROR: request not found - not_exist_schema')
        spec1 = newWorkload(schema[0]['RequestName'])
        production = spec1.newTask("Production")
        production.setTaskType("Merge")
        production.setSiteWhitelist(['TEST_SITE'])
        self.assertEquals(self.wmstatsWriter.updateFromWMSpec(spec1), 'OK', 'update fail')
        spec2 = newWorkload("not_exist_schema")
        production = spec2.newTask("Production")
        production.setTaskType("Merge")
        self.assertEquals(self.wmstatsWriter.updateFromWMSpec(spec2),
                          'ERROR: request not found - not_exist_schema')
Example #22
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        
        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL, 
                                                       appName= "WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()
Example #23
 def __init__(self, rest, config):
     super(HeartbeatMonitorBase, self).__init__(config)
     self.centralWMStats = WMStatsWriter(config.wmstats_url)
     self.threadList = config.thread_list
     self.userAMQ = getattr(config, "user_amq", None)
     self.passAMQ = getattr(config, "pass_amq", None)
     self.postToAMQ = getattr(config, "post_to_amq", False)
     self.topicAMQ = getattr(config, "topic_amq", None)
     self.hostPortAMQ = getattr(config, "host_port_amq", None)
Example #24
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, also
    includes optional priority change for the request

    - *requestName* : name of the request to be modified
    - *newState*    : name of the new status for the request
    - *priority* : optional integer priority
    
    Apparently when changing request state (on assignment page),
    it's possible to change priority at one go. Hence the argument is here.

    """
    # MySQL/Oracle
    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    changeRequestIDStatus(reqId, newState, priority)

    # CouchDB
    # have to first get information where the request Couch document is,
    # extracting the information from reqmgr_request.workflow table field
    reqData = factory(classname="Request.Get").execute(reqId)
    # this would be something like this:
    # http://localhost:5984/reqmgr_workload_cache/maxa_RequestString-OVERRIDE-ME_130306_205649_8066/spec
    wfUrl = reqData['workflow']
    # cut off /maxa_RequestString-OVERRIDE-ME_130306_205649_8066/spec
    couchUrl = wfUrl.replace('/' + requestName + "/spec", '')
    couchDbName = couchUrl[couchUrl.rfind('/') + 1:]
    # cut off database name from the URL
    url = couchUrl.replace('/' + couchDbName, '')
    couchDb = Database(couchDbName, url)
    fields = {"RequestStatus": newState}
    couchDb.updateDocument(requestName,
                           "ReqMgr",
                           "updaterequest",
                           fields=fields,
                           useBody=True)

    #TODO: should we make this mandatory?
    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateRequestStatus(requestName, newState)
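A hedged call sketch (the WMStats URL is a placeholder; the request name is the example value from the comments above); passing wmstatUrl keeps the WMStats document in sync with the state change:

changeRequestStatus("maxa_RequestString-OVERRIDE-ME_130306_205649_8066",
                    "assigned",
                    priority=None,
                    wmstatUrl="http://localhost:5984/wmstats")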
Example #25
class CleanUpTask(CherryPyPeriodicTask):
    """
    This class is used for both T0WMStats and WMStats
    controlled by config.reqdb_couch_app value
    """

    def __init__(self, rest, config):

        CherryPyPeriodicTask.__init__(self, config)
        self.wmstatsDB = WMStatsWriter(
            config.wmstats_url, reqdbURL=config.reqmgrdb_url, reqdbCouchApp=config.reqdb_couch_app
        )

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which run concurrently
        """
        self.concurrentTasks = [
            {"func": self.cleanUpOldRequests, "duration": (config.DataKeepDays * 24 * 60 * 60)},
            {"func": self.cleanUpArchivedRequests, "duration": config.archivedCleanUpDuration},
        ]

    def cleanUpOldRequests(self, config):
        """
        clean up wmstats data older than the given number of days
        """
        self.logger.info("deleting %s hours old docs" % (config.DataKeepDays * 24))
        result = self.wmstatsDB.deleteOldDocs(config.DataKeepDays)
        self.logger.info("%s old doc deleted" % result)
        return

    def cleanUpArchivedRequests(self, config):
        """
        Loop through the workflows in CouchDB; if a workflow is archived, delete all of its data from CouchDB.
        """

        requestNames = self.wmstatsDB.getArchivedRequests()
        for req in requestNames:
            self.logger.info("deleting %s data" % req)
            result = self.wmstatsDB.deleteDocsByWorkflow(req)
            self.logger.info("%s deleted" % result)
        return
Example #26
    def setUp(self):
        """
        _setUp_

        Setup the test environment
        """
        self.testInit = TestInit(__file__)
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(["WMCore.WMBS"])
        self.wmstatsCouchDB = 'wmstats_plugin_t'
        self.testInit.setupCouch(self.wmstatsCouchDB, 'WMStats')
        self.testDir = self.testInit.generateWorkDir()

        self.wmstatsWriter = WMStatsWriter(os.environ['COUCHURL'], self.wmstatsCouchDB)

        self.stateMap = {}
        self.orderedStates = []
        self.plugin = None

        return
Example #27
def changeRequestStatus(requestName, newState, priority = None, wmstatUrl = None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, also
    includes optional priority change for the request

    - *requestName* : name of the request to be modified
    - *newState*    : name of the new status for the request
    - *priority* : optional integer priority

    """
    #TODO: should we make this mandatory?
    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateRequestStatus(requestName, newState)

    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    changeRequestIDStatus(reqId, newState, priority)
    return
Example #28
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue
        self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        
        if self.pluginName != None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname = self.pluginName)
Example #29
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, also
    includes optional priority change for the request

    - *requestName* : name of the request to be modified
    - *newState*    : name of the new status for the request
    - *priority* : optional integer priority
    
    Apparently when changing request state (on assignment page),
    it's possible to change priority at one go. Hence the argument is here.

    """
    # MySQL/Oracle
    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    changeRequestIDStatus(reqId, newState, priority)

    # CouchDB
    # have to first get information where the request Couch document is,
    # extracting the information from reqmgr_request.workflow table field
    reqData = factory(classname="Request.Get").execute(reqId)
    # this would be something like this:
    # http://localhost:5984/reqmgr_workload_cache/maxa_RequestString-OVERRIDE-ME_130306_205649_8066/spec
    wfUrl = reqData["workflow"]
    # cut off /maxa_RequestString-OVERRIDE-ME_130306_205649_8066/spec
    couchUrl = wfUrl.replace("/" + requestName + "/spec", "")
    couchDbName = couchUrl[couchUrl.rfind("/") + 1 :]
    # cut off database name from the URL
    url = couchUrl.replace("/" + couchDbName, "")
    couchDb = Database(couchDbName, url)
    fields = {"RequestStatus": newState}
    couchDb.updateDocument(requestName, "ReqMgr", "updaterequest", fields=fields, useBody=True)

    # TODO: should we make this mandatory?
    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateRequestStatus(requestName, newState)
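The Couch URL bookkeeping above is easiest to follow with the concrete value from the comment; a short walk-through using only string handling (no CouchDB access), with the same example request name:

requestName = "maxa_RequestString-OVERRIDE-ME_130306_205649_8066"
wfUrl = "http://localhost:5984/reqmgr_workload_cache/" + requestName + "/spec"
couchUrl = wfUrl.replace("/" + requestName + "/spec", "")   # http://localhost:5984/reqmgr_workload_cache
couchDbName = couchUrl[couchUrl.rfind("/") + 1:]            # reqmgr_workload_cache
url = couchUrl.replace("/" + couchDbName, "")               # http://localhost:5984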
Example #30
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(
                self.config.AnalyticsDataCollector.localWMStatsURL,
                appName="WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(
                self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()
Example #31
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list

    def setConcurrentTasks(self, config):
        """
        sets the list of function references for concurrent tasks
        """
        self.concurrentTasks = [{
            'func': self.reportToWMStats,
            'duration': config.heartbeatCheckDuration
        }]

    def reportToWMStats(self, config):
        """
        Report thread status and heartbeat.
        Additional monitoring information can also be reported by overriding the addAdditionalMonitorReport method.
        """
        self.logger.info("Checking Thread status...")
        downThreadInfo = self.logDB.wmstats_down_components_report(
            self.threadList)
        monitorInfo = self.addAdditionalMonitorReport(config)
        downThreadInfo.update(monitorInfo)
        wqSummaryDoc = convertToServiceCouchDoc(downThreadInfo,
                                                config.log_reporter)
        self.centralWMStats.updateAgentInfo(wqSummaryDoc)

        self.logger.info("Uploaded to WMStats...")

        return

    def addAdditionalMonitorReport(self, config):
        """
        Add an additional report to the heartbeat report.
        Override this method with each application's monitoring info. (It needs to follow the format displayed in wmstats.)
        """
        return {}
Example #32
    def __init__(self, config):
        """
        _init_

        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "T0.WMBS",
                                     logger = logging,
                                     dbinterface = myThread.dbi)

        self.tier0ConfigFile = config.Tier0Feeder.tier0ConfigFile
        self.specDirectory = config.Tier0Feeder.specDirectory
        self.dropboxuser = config.Tier0Feeder.dropboxuser
        self.dropboxpass = config.Tier0Feeder.dropboxpass

        self.transferSystemBaseDir = getattr(config.Tier0Feeder, "transferSystemBaseDir", None)
        if self.transferSystemBaseDir != None:
            if not os.path.exists(self.transferSystemBaseDir):
                self.transferSystemBaseDir = None

        self.dqmUploadProxy = config.WMBSService.proxy

        self.localSummaryCouchDB = WMStatsWriter(config.AnalyticsDataCollector.localWMStatsURL)

        hltConfConnectUrl = config.HLTConfDatabase.connectUrl
        dbFactoryHltConf = DBFactory(logging, dburl = hltConfConnectUrl, options = {})
        dbInterfaceHltConf = dbFactoryHltConf.connect()
        daoFactoryHltConf = DAOFactory(package = "T0.WMBS",
                                       logger = logging,
                                       dbinterface = dbInterfaceHltConf)
        self.getHLTConfigDAO = daoFactoryHltConf(classname = "RunConfig.GetHLTConfig")

        storageManagerConnectUrl = config.StorageManagerDatabase.connectUrl
        dbFactoryStorageManager = DBFactory(logging, dburl = storageManagerConnectUrl, options = {})
        self.dbInterfaceStorageManager = dbFactoryStorageManager.connect()

        self.getExpressReadyRunsDAO = None
        if hasattr(config, "PopConLogDatabase"):
            popConLogConnectUrl = getattr(config.PopConLogDatabase, "connectUrl", None)
            if popConLogConnectUrl != None:
                dbFactoryPopConLog = DBFactory(logging, dburl = popConLogConnectUrl, options = {})
                dbInterfacePopConLog = dbFactoryPopConLog.connect()
                daoFactoryPopConLog = DAOFactory(package = "T0.WMBS",
                                                 logger = logging,
                                                 dbinterface = dbInterfacePopConLog)
                self.getExpressReadyRunsDAO = daoFactoryPopConLog(classname = "Tier0Feeder.GetExpressReadyRuns")

        return
Example #33
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()
Example #34
 def setUp(self):
     """
     _setUp_
     """
     self.schema = []
     self.couchApps = ["WMStats"]
     self.testInit = TestInitCouchApp('WorkQueueServiceTest')
     self.testInit.setLogging()
     self.testInit.setDatabaseConnection()
     self.testInit.setSchema(customModules = self.schema,
                             useDefault = False)
     self.testInit.setupCouch('wmstats_t', *self.couchApps)
     self.wmstatsWriter = WMStatsWriter(self.testInit.couchUrl, 'wmstats_t')
     return
Example #35
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        #self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
Example #36
def changeRequestStatus(requestName, newState, priority=None, wmstatUrl=None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, also
    includes optional priority change for the request

    - *requestName* : name of the request to be modified
    - *newState*    : name of the new status for the request
    - *priority* : optional integer priority
    
    Apparently when changing request state (on assignment page),
    it's possible to change priority at one go. Hence the argument is here.

    """
    #TODO: should we make this mandatory?
    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateRequestStatus(requestName, newState)

    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    changeRequestIDStatus(reqId, newState, priority)
    return
Example #37
def changeRequestStatus(requestName, newState, priority = None, wmstatUrl = None):
    """
    _changeRequestStatus_

    Basic API to change a request to a new state, also
    includes optional priority change for the request

    - *requestName* : name of the request to be modified
    - *newState*    : name of the new status for the request
    - *priority* : optional integer priority
    
    Apparently when changing request state (on assignment page),
    it's possible to change priority at one go. Hence the argument is here.

    """
    #TODO: should we make this mandatory?
    if wmstatUrl:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.updateRequestStatus(requestName, newState)

    factory = DBConnect.getConnection()
    reqId = getRequestID(factory, requestName)
    changeRequestIDStatus(reqId, newState, priority)
    return
Example #38
    def setUp(self):
        """
        _setUp_

        Setup the test environment
        """
        self.testInit = TestInit(__file__)
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(["WMCore.WMBS"])
        self.wmstatsCouchDB = 'wmstats_plugin_t'
        self.testInit.setupCouch(self.wmstatsCouchDB, 'WMStats')
        self.testDir = self.testInit.generateWorkDir()

        self.wmstatsWriter = WMStatsWriter(os.environ['COUCHURL'], self.wmstatsCouchDB)

        self.stateMap = {}
        self.orderedStates = []
        self.plugin = None

        return
Example #39
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and WMBS/BossAir, and populate the summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        proxyArgs = {'logger': logging.getLogger(), 'cleanEnvironment': True}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = {
            'proxy': {
                'error': 3,
                'warning': 5
            },
            'certificate': {
                'error': 10,
                'warning': 20
            }
        }

        # Monitoring setup
        self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
        self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
        self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
        self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
        self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ",
                                   [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        if hasattr(self.config, "Tier0Feeder"):
            self.isT0agent = True
            self.producer = "tier0wmagent"
        else:
            self.isT0agent = False
            self.producer = "wmagent"
            localWQUrl = config.AnalyticsDataCollector.localQueueURL
            self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):

        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.General.centralWMStatsURL
        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        if self.isT0agent:
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            self.replicatorDocs.append({
                'source': sanitizeURL(localQInboxURL)['url'],
                'target': parentQURL,
                'filter': wqfilter,
                'query_params': query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'],
                                                         rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get(
                                                             'query_params',
                                                             False),
                                                         continuous=True)
        # The first cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.General.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkCredLifetime(agentInfo, "proxy")
            self.checkCredLifetime(agentInfo, "certificate")

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not self.isT0agent:
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs",
                             timeSpent)

            self.uploadAgentInfoToCentralWMStats(agentInfo)

            self.buildMonITDocs(agentInfo)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s",
                              str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return:
        """
        results = {}
        wqStates = ['Available', 'Acquired']

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(wqStates)
        uniSites, posSites = getGlobalSiteStatusSummary(elements,
                                                        status=wqStates,
                                                        dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):

        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skip the check this round and set the flag to False so it can be checked next round
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, such as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config,
                                                               updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode']
                                            or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get(
                    'couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s",
                     agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Add some required fields to the document before it can get uploaded
        to WMStats.
        :param agentInfo: dict with agent stats to be posted to couchdb
        """
        agentInfo['_id'] = agentInfo["agent_url"]
        agentInfo['timestamp'] = int(time.time())
        agentInfo['type'] = "agent_info"
        # directly upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(
                agentInfo, propertiesToKeep=["data_last_update", "data_error"])
        except Exception as e:
            logging.error(
                "Failed to upload agent statistics to WMStats. Error: %s",
                str(e))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s",
                      results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s",
                      results['wmbsCountByState'])
        logging.debug(
            "Total number of 'created' jobs in WMBS sorted by type: %s",
            results['wmbsCreatedTypeCount'])
        logging.debug(
            "Total number of 'executing' jobs in WMBS sorted by type: %s",
            results['wmbsExecutingTypeCount'])

        logging.debug(
            "Total number of active jobs in BossAir sorted by status: %s",
            results['activeRunJobByStatus'])
        logging.debug(
            "Total number of complete jobs in BossAir sorted by status: %s",
            results['completeRunJobByStatus'])

        logging.debug(
            "Available slots thresholds to pull work from GQ to LQ: %s",
            results['thresholdsGQ2LQ'])
        logging.debug(
            "List of jobs pending for each site, sorted by priority: %s",
            results['sitePendCountByPrio'])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Check the credential lifetime (usually X509_USER_PROXY or X509_USER_CERT)
        and raise either a warning or an error if its validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: same dictionary object plus additional keys/values if needed.
        """
        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        elif credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            logging.error(
                "Unknown credential type. Available options are: [proxy, certificate]"
            )
            return

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile,
                      secsLeft)

        daysLeft = secsLeft / (60 * 60 * 24)

        if daysLeft <= self.credThresholds[credType]['error']:
            credWarning = True
            agInfo['status'] = "error"
        elif daysLeft <= self.credThresholds[credType]['warning']:
            credWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            credWarning = False

        if credWarning:
            warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType,
                                                                credFile)
            warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.)
            agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
            logging.warning(warnMsg)

        return
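    # Note: self.credThresholds is assumed to be set in __init__ (not part of this
    # snippet) and to map each credential type to day thresholds, for instance:
    #   self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
    #                          'certificate': {'error': 10, 'warning': 20}}
    # The numbers above are purely illustrative.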

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. It creates 7 different types of documents:
         * priority information
         * site information
         * work information
         * WMBS information
         * agent information
         * agent health information
         * agent summary information
        Note that the internal methods are popping some metrics out of dataStats
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        allDocs = self._buildMonITPrioDocs(dataStats)
        allDocs.extend(self._buildMonITSitesDocs(dataStats))
        allDocs.extend(self._buildMonITWorkDocs(dataStats))
        allDocs.extend(self._buildMonITWMBSDocs(dataStats))
        allDocs.extend(self._buildMonITAgentDocs(dataStats))
        allDocs.extend(self._buildMonITHealthDocs(dataStats))
        allDocs.extend(self._buildMonITSummaryDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'],
                         dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within that
        priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop(
            'sitePendCountByPrio', [])

        for site, item in viewitems(sitePendCountByPrio):
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            if list(item) == [0]:
                continue
            for prio, jobs in viewitems(item):
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs
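    # A purely illustrative example of the transformation performed above:
    #   sitePendCountByPrio = {"T1_US_FNAL": {63000: 120, 85000: 4},
    #                          "T2_CH_CERN": {0: 0}}    # sites without jobs are skipped
    # would yield:
    #   [{'site_name': 'T1_US_FNAL', 'type': 'wma_prio_info', 'priority': 63000, 'job_count': 120},
    #    {'site_name': 'T1_US_FNAL', 'type': 'wma_prio_info', 'priority': 85000, 'job_count': 4}]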

    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in order
        to build a `site_info` document type for MonIT.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop(
                'possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop(
                'uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite:
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0
                if site in possibleJobsPerSite[status]:
                    siteDoc[jobKey] = possibleJobsPerSite[status][site][
                        'sum_jobs']
                    siteDoc[elemKey] = possibleJobsPerSite[status][site][
                        'num_elem']
                if site in uniqueJobsPerSite[status]:
                    siteDoc[uniJobKey] = uniqueJobsPerSite[status][site][
                        'sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs
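    # The possibleJobsPerSite/uniqueJobsPerSite summaries consumed above are assumed
    # to be nested as {status: {site: {'sum_jobs': N, 'num_elem': M}}}, e.g.:
    #   {'Available': {'T1_US_FNAL': {'sum_jobs': 500, 'num_elem': 3}}}
    # (numbers are illustrative only)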

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information, grouped by WQE status, to build
        statistics for the workload in terms of workqueue elements and top
        level jobs. For Tier-0 agents, which have no local workqueue, no
        documents are built.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        workDocs = []
        if self.isT0agent:
            return workDocs

        docType = "wma_work_info"
        workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        for status, info in viewitems(workByStatus):
            workDoc = {}
            workDoc['type'] = docType
            workDoc['status'] = status
            workDoc['num_elem'] = info.get('num_elem', 0)
            workDoc['sum_jobs'] = info.get('sum_jobs', 0)
            workDocs.append(workDoc)

        return workDocs

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work in
        'created' and 'executing' WMBS status.
        It also builds a document for every single wmbs_status in the database.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        docType = "wma_wmbs_info"
        wmbsDocs = []
        wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop(
            'wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop(
            'wmbsExecutingTypeCount', {})
        for jobType in wmbsCreatedTypeCount:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['job_type'] = jobType
            wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType]
            wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType]
            wmbsDocs.append(wmbsDoc)

        docType = "wma_wmbs_state_info"
        wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['wmbs_status'] = wmbsStatus
            wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus]
            wmbsDocs.append(wmbsDoc)

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of the number of jobs in the different statuses.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop(
            'activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop(
            'completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDoc = {}
            agentDoc['type'] = docType
            agentDoc['schedd_status'] = schedStatus
            agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus]
            agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus]
            agentDocs.append(agentDoc)

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Creates documents with specific agent information, status of
        each component and worker thread (similar to what is shown in
        wmstats) and also some very basic performance numbers.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_health_info"
        healthDocs = []
        workersStatus = dataStats.pop('workers', {})
        for worker in workersStatus:
            healthDoc = {}
            healthDoc['type'] = docType
            healthDoc['worker_name'] = worker['name']
            healthDoc['worker_state'] = worker['state']
            healthDoc['worker_poll'] = worker['poll_interval']
            healthDoc['worker_last_hb'] = worker['last_updated']
            healthDoc['worker_cycle_time'] = worker['cycle_time']
            healthDocs.append(healthDoc)

        return healthDocs

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a document with the very basic agent info used
        in the wmstats monitoring tab.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_summary_info MonIT docs
        """
        docType = "wma_summary_info"
        summaryDocs = []
        summaryDoc = {}
        summaryDoc['type'] = docType
        summaryDoc['agent_team'] = dataStats['agent_team']
        summaryDoc['agent_version'] = dataStats['agent_version']
        summaryDoc['agent_status'] = dataStats['status']
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO'][
                'total_query_time']
        summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO'][
            'total_query_time']
        summaryDoc['drain_mode'] = dataStats['drain_mode']
        summaryDoc['down_components'] = dataStats['down_components']
        summaryDocs.append(summaryDoc)
        return summaryDocs

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        :param docs: list of documents/dicts to be posted
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return
        # add mandatory information for every single document
        for doc in docs:
            doc['agent_url'] = agentUrl

        docType = "cms_%s_info" % self.producer
        notifications = []

        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=self.producer,
                                topic=self.topicAMQ,
                                validation_schema=None,
                                host_and_ports=self.hostPortAMQ,
                                logger=logging)

            for doc in docs:
                singleNotif, _, _ = stompSvc.make_notification(
                    payload=doc,
                    docType=docType,
                    ts=timeS,
                    dataSubfield="payload")
                notifications.append(singleNotif)

            failures = stompSvc.send(notifications)
            msg = "%i out of %i documents successfully sent to AMQ" % (
                len(notifications) - len(failures), len(notifications))
            logging.info(msg)
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s",
                              str(ex))

        return
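For context, the AMQ settings referenced by buildMonITDocs and uploadToAMQ above (self.postToAMQ, self.userAMQ, self.passAMQ, self.topicAMQ, self.hostPortAMQ, self.producer) come from the component __init__, which is not shown in this snippet. Below is a minimal sketch of how such settings might be expressed in a WMAgent-style configuration; the attribute names on the AgentStatusWatcher section are illustrative assumptions, not the actual configuration keys.

from WMCore.Configuration import Configuration

config = Configuration()
config.component_("AgentStatusWatcher")
# hypothetical attribute names, shown only to illustrate the kind of settings needed
config.AgentStatusWatcher.enableAMQ = True
config.AgentStatusWatcher.userAMQ = "wma-monit-user"
config.AgentStatusWatcher.passAMQ = "change-me"
config.AgentStatusWatcher.topicAMQ = "/topic/cms.wmagent.metrics"
config.AgentStatusWatcher.hostPortAMQ = [("amq-broker.example.org", 61313)]
config.AgentStatusWatcher.producer = "wmagent"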
Beispiel #40
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and wmbs/boss air, and populate the summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (
            config.AnalyticsDataCollector.summaryLevel).lower()

    def setUpCouchDBReplication(self):
        """
        Set up the CouchDB replication documents (WMStats job summary plus either
        the T0 request DB or the WorkQueue inbox, depending on the agent type).
        """

        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        #TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            self.replicatorDocs.append({
                'source': sanitizeURL(localQInboxURL)['url'],
                'target': parentQURL,
                'filter': wqfilter,
                'query_params': query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'],
                                                         rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get(
                                                             'query_params',
                                                             False),
                                                         continuous=True)
        # The first cycle needs to be skipped since the documents are not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the DB connections (CouchDB, WMBS) to prepare for gathering information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(
                self.config.AnalyticsDataCollector.localWMStatsURL,
                appName="WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(
                self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            logging.info("Getting Agent info ...")
            agentInfo = self.collectAgentInfo()

            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            logging.info("Agent components down:\n %s" %
                         agentInfo['down_components'])
            logging.info(
                "Agent in drain mode:\n %s \nsleep for next WMStats alarm updating cycle"
                % agentInfo['drain_mode'])

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):
        """
        Check the status of the CouchDB replications set up by setUpCouchDBReplication;
        the very first polling cycle is skipped.
        """

        couchInfo = {'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skip the check this round; set it to False so it can be checked next round
            self.skipReplicationCheck = False
            return couchInfo

        msg = ""
        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'

        couchInfo['error_message'] = msg
        return couchInfo

    def collectAgentInfo(self):
        """
        Collect the component status, drain mode, CouchDB replication, disk usage
        and couch process information for this agent.
        """

        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['status'] = "warning"

        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()

        if (couchInfo['status'] != 'ok'):
            agentInfo['down_components'].append("CouchServer")
            agentInfo['status'] = couchInfo['status']
            couchInfo['name'] = "CouchServer"
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(
            self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            if (float(disk['percent'].strip('%')) >= diskUseThreshold and
                    disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk):
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        couchProcessThreshold = float(
            self.config.AnalyticsDataCollector.couchProcessThreshold)
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo(self)
        if lastDataUpload['data_last_update'] != 0:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error'] != "":
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok':
            if agentInfo['disk_warning'] != []:
                agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if ('data_error' in agentInfo and agentInfo['data_error'] != 'ok') or \
               ('couch_process_warning' in agentInfo and agentInfo['couch_process_warning'] != 0):
                agentInfo['status'] = "error"

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # direct data upload to the remote to prevent data conflicts when the agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC,
                                           uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)
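For readability, the ok -> warning -> error escalation performed by collectAgentInfo above can be summarized in a small helper; the function below is a sketch written only for this example and is not part of the component.

def escalateAgentStatus(status, diskWarnings, dataError, couchProcWarning):
    """Sketch of the ok -> warning -> error escalation used by collectAgentInfo above."""
    # disk problems downgrade a healthy agent to 'warning'
    if status == 'ok' and diskWarnings:
        status = 'warning'
    # data upload errors or an excessive number of couch processes escalate to 'error'
    if status in ('ok', 'warning') and (dataError != 'ok' or couchProcWarning):
        status = 'error'
    return status

# e.g. escalateAgentStatus('ok', [{'mounted': '/data'}], 'ok', 0)   returns 'warning'
#      escalateAgentStatus('warning', [], 'replication failed', 0)  returns 'error'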
Beispiel #41
0
        for ds in helper.listOutputDatasets():
            if ds not in request['OutputDatasets']:
                request['OutputDatasets'].append(ds)

    # don't want to JSONify the whole workflow
    del metadata['WorkloadSpec']
    workloadUrl = helper.saveCouch(couchUrl, couchDB, metadata=metadata)
    request['RequestWorkflow'] = removePasswordFromUrl(workloadUrl)
    try:
        CheckIn.checkIn(request, reqSchema['RequestType'])
    except CheckIn.RequestCheckInError as ex:
        msg = ex._message
        raise HTTPError(400, "Error in Request check-in: %s" % msg)

    try:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.insertRequest(request)
    except Exception as ex:
        webApi.error("Could not update WMStats, reason: %s" % ex)
        raise HTTPError(400,
                        "Creating request failed, could not update WMStats.")

    return request


def makeRequest(webApi, reqInputArgs, couchUrl, couchDB, wmstatUrl):
    """
    Handles the submission of requests.
    
    """
Beispiel #42
0
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and wmbs/boss air, and populate the summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (
            config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName",
                                  None)
        self.plugin = None

    def setup(self, parameters):
        """
        Set up the DB connections (CouchDB, WMBS) to prepare for gathering information
        """
        # set the connection to local queue
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(
                self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(
            self.config.AnalyticsDataCollector.localCouchURL,
            self.config.JobStateMachine.summaryStatsDBName, self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.localWMStatsURL,
            appName="WMStatsAgent")

        if hasattr(self.config, "Tier0Feeder"):
            #use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(
            centralRequestCouchDBURL,
            couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        #TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(
            self.config.JobStateMachine.couchurl)

        if self.pluginName != None:
            pluginFactory = WMFactory(
                "plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:

            #jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            #fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")

            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info(
                """Combining data from
                                   Job Couch(%s),
                                   FWJR(%s), 
                                   Batch Job(%s),
                                   Finished Tasks(%s),
                                   Local Queue(%s)  ...""" %
                (len(jobInfoFromCouch), len(fwjrInfoFromCouch),
                 len(batchJobInfo), len(finishedTasks), len(localQInfo)))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch,
                                                    batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData,
                                                    localQInfo)

            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            logging.info(
                "%s requests Data combined,\n uploading request data..." %
                len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests,
                                                   fwjrInfoFromCouch,
                                                   finishedTasks,
                                                   self.agentInfo, uploadTime,
                                                   self.summaryLevel)

            if self.plugin != None:
                self.plugin(requestDocs, self.localSummaryCouchDB,
                            self.centralRequestCouchDB)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info(
                "Request data upload success\n %s request, \nsleep for next cycle"
                % len(requestDocs))
            DataUploadTime.setInfo(uploadTime, "ok")

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            DataUploadTime.setInfo(False, str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())
Beispiel #43
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and wmbs/boss air, and populate the summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        """
        Set up the CouchDB replication documents (WMStats job summary plus either
        the T0 request DB or the WorkQueue inbox, depending on the agent type).
        """

        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            self.replicatorDocs.append({
                'source': sanitizeURL(localQInboxURL)['url'],
                'target': parentQURL,
                'filter': wqfilter,
                'query_params': query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'],
                                                         rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get(
                                                             'query_params',
                                                             False),
                                                         continuous=True)
        # The first cycle needs to be skipped since the documents are not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the DB connections (CouchDB, WMBS) to prepare for gathering information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs",
                             timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s",
                              str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return: dict with workqueue job and element summaries
        """
        results = {}

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(
            ['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements,
                                                        dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of the CouchDB replications set up by setUpCouchDBReplication;
        the very first polling cycle is skipped.
        """

        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skip the check this round; set it to False so it can be checked next round
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, such as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config,
                                                               updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode']
                                            or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get(
                    'couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s",
                     agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC,
                                           uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s",
                      results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s",
                      results['wmbsCountByState'])
        logging.debug(
            "Total number of 'created' jobs in WMBS sorted by type: %s",
            results['wmbsCreatedTypeCount'])
        logging.debug(
            "Total number of 'executing' jobs in WMBS sorted by type: %s",
            results['wmbsExecutingTypeCount'])

        logging.debug(
            "Total number of active jobs in BossAir sorted by status: %s",
            results['activeRunJobByStatus'])
        logging.debug(
            "Total number of complete jobs in BossAir sorted by status: %s",
            results['completeRunJobByStatus'])

        logging.debug(
            "Available slots thresholds to pull work from GQ to LQ: %s",
            results['thresholdsGQ2LQ'])
        logging.debug(
            "List of jobs pending for each site, sorted by priority: %s",
            results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (usually X509_USER_PROXY) and raise either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile,
                      secsLeft)

        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
Beispiel #44
0
import os
import sys
from optparse import OptionParser
from WMCore.Services.WMStats.WMStatsWriter import WMStatsWriter
from WMCore.Configuration import loadConfigurationFile

if __name__ == "__main__":

    if "WMAGENT_CONFIG" not in os.environ:
        print("The WMAGENT_CONFIG environment variable needs to be set before this can run")
        sys.exit(1)

    wmagentConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    
    if hasattr(wmagentConfig, "AnalyticsDataCollector") and hasattr(wmagentConfig.AnalyticsDataCollector, "centralWMStatsURL"):
        wmstats = WMStatsWriter(wmagentConfig.AnalyticsDataCollector.centralWMStatsURL)
    else:
        print("AnalyticsDataCollector.centralWMStatsURL is not specified")
        sys.exit(1)
        
    parser = OptionParser()
    parser.set_usage("wmstats-request-status-change [agent_url:port]")
    
    parser.add_option("-r", "--request", dest = "request",
                      help = "resquest name")
    
    parser.add_option("-s", "--status", dest = "newstatus",
                      help = "set to new status")
    
    (options, args) = parser.parse_args()
    
Beispiel #45
0
class HeartbeatMonitorBase(CherryPyPeriodicTask):

    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
        self.userAMQ = getattr(config, "user_amq", None)
        self.passAMQ = getattr(config, "pass_amq", None)
        self.postToAMQ = getattr(config, "post_to_amq", False)
        self.topicAMQ = getattr(config, "topic_amq", None)
        self.hostPortAMQ = getattr(config, "host_port_amq", None)

    def setConcurrentTasks(self, config):
        """
        Sets the list of function references for the concurrent tasks
        """
        self.concurrentTasks = [{'func': self.reportToWMStats, 'duration': config.heartbeatCheckDuration}]

    def reportToWMStats(self, config):
        """
        Report thread status and heartbeat.
        Additional monitoring information can be reported by overriding the addAdditionalMonitorReport method.
        """
        self.logger.info("Checking Thread status...")
        downThreadInfo = self.logDB.wmstats_down_components_report(self.threadList)
        monitorInfo = self.addAdditionalMonitorReport(config)
        downThreadInfo.update(monitorInfo)
        wqSummaryDoc = convertToServiceCouchDoc(downThreadInfo, config.log_reporter)
        self.centralWMStats.updateAgentInfo(wqSummaryDoc)

        self.logger.info("Uploaded to WMStats...")

        return

    def addAdditionalMonitorReport(self, config):
        """
        Add an additional report to the heartbeat report.
        Override this method with each application's monitoring info (it needs to follow
        the format displayed in wmstats); a minimal override sketch follows this class.
        """
        return {}

    def uploadToAMQ(self, docs, producer=None):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in Elasticsearch.
        :param docs: list of documents/dicts to be posted
        :param producer: service name that's providing this info
        """
        if not docs:
            self.logger.info("There are no documents to send to AMQ")
            return

        producer = producer or self.producer
        self.logger.debug("Sending the following data to AMQ %s", pformat(docs))
        ts = int(time.time())

        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=self.logger)

            notifications = [stompSvc.make_notification(payload=doc, docType=self.docTypeAMQ, ts=ts,
                                                        dataSubfield="payload") for doc in docs]

            failures = stompSvc.send(notifications)
            self.logger.info("%i docs successfully sent to Stomp AMQ", len(notifications) - len(failures))
        except Exception as ex:
            self.logger.exception("Failed to send data to StompAMQ. Error %s", str(ex))

        return
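Since addAdditionalMonitorReport is meant to be overridden by each service, a minimal hypothetical override could look like the following (the subclass name and the reported key are illustrative only):

class ExampleServiceMonitor(HeartbeatMonitorBase):
    """Hypothetical subclass adding one service-specific metric to the heartbeat report."""

    def addAdditionalMonitorReport(self, config):
        # the dict returned here is merged into the heartbeat document by
        # reportToWMStats via downThreadInfo.update(monitorInfo)
        return {'example_backlog_size': 42}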
Beispiel #46
0
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
        self.userAMQ = getattr(config, "user_amq", None)
        self.passAMQ = getattr(config, "pass_amq", None)
        self.postToAMQ = getattr(config, "post_to_amq", False)
        self.topicAMQ = getattr(config, "topic_amq", None)
        self.hostPortAMQ = getattr(config, "host_port_amq", None)

    def setConcurrentTasks(self, config):
        """
        Sets the list of function references for the concurrent tasks
        """
        self.concurrentTasks = [{
            'func': self.reportToWMStats,
            'duration': config.heartbeatCheckDuration
        }]

    def reportToWMStats(self, config):
        """
        Report thread status and heartbeat.
        Additional monitoring information can be reported by overriding the addAdditionalMonitorReport method.
        """
        self.logger.info("Checking Thread status...")
        downThreadInfo = self.logDB.wmstats_down_components_report(
            self.threadList)
        monitorInfo = self.addAdditionalMonitorReport(config)
        downThreadInfo.update(monitorInfo)
        wqSummaryDoc = convertToServiceCouchDoc(downThreadInfo,
                                                config.log_reporter)
        self.centralWMStats.updateAgentInfo(wqSummaryDoc)

        self.logger.info("Uploaded to WMStats...")

        return

    def addAdditionalMonitorReport(self, config):
        """
        Add an additional report to the heartbeat report.
        Override this method with each application's monitoring info (it needs to follow
        the format displayed in wmstats).
        """
        return {}

    def uploadToAMQ(self, docs, producer=None):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in Elasticsearch.
        :param docs: list of documents/dicts to be posted
        :param producer: service name that's providing this info
        """
        if not docs:
            self.logger.info("There are no documents to send to AMQ")
            return

        producer = producer or self.producer
        self.logger.debug("Sending the following data to AMQ %s",
                          pformat(docs))
        ts = int(time.time())

        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=self.logger)

            notifications = stompSvc.make_notification(payload=docs,
                                                       docType=self.docTypeAMQ,
                                                       docId=producer,
                                                       ts=ts)

            failures = stompSvc.send(notifications)
            self.logger.info("%i docs successfully sent to Stomp AMQ",
                             len(notifications) - len(failures))
        except Exception as ex:
            self.logger.exception("Failed to send data to StompAMQ. Error %s",
                                  str(ex))

        return
Beispiel #47
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and wmbs/boss air, and populate the summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile
        # counter for deep agent monitoring. Every 15min (3 cycles of the component)
        self.monitorCounter = 0
        self.monitorInterval = getattr(config.AgentStatusWatcher,
                                       'monitorPollInterval', 3)

    def setUpCouchDBReplication(self):
        """
        Set up the CouchDB replication documents (WMStats job summary plus either
        the T0 request DB or the WorkQueue inbox, depending on the agent type).
        """

        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        #TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            self.replicatorDocs.append({
                'source':
                sanitizeURL(localQInboxURL)['url'],
                'target':
                parentQURL,
                'filter':
                wqfilter,
                'query_params':
                query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'],
                rp['target'],
                filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)
        # The first cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            if self.monitorCounter % self.monitorInterval == 0:
                monitoring = self.collectWMBSInfo()
                monitoring['components'] = agentInfo['down_components']
                monitoring['timestamp'] = int(time.time())
                with open(self.jsonFile, 'w') as outFile:
                    json.dump(monitoring, outFile, indent=2)
            self.monitorCounter += 1
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):

        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skip the check this round; set it to False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, covering:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(
            self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            if float(disk['percent'].strip('%')) >= diskUseThreshold and \
                            disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode']
                                            or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get(
                    'couch_process_warning', 0):
                agentInfo['status'] = "error"

        if agentInfo['down_components']:
            logging.info("List of agent components down: %s" %
                         agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        #direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC,
                                           uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        results = {}
        logging.info("Getting wmbs job info ...")
        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.info("Running and pending site thresholds: %s",
                     results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())
        logging.info("Total number of jobs in WMBS sorted by status: %s",
                     results['wmbsCountByState'])
        logging.info(
            "Total number of 'created' jobs in WMBS sorted by type: %s",
            results['wmbsCreatedTypeCount'])
        logging.info(
            "Total number of 'executing' jobs in WMBS sorted by type: %s",
            results['wmbsExecutingTypeCount'])

        logging.info(
            "Total number of active jobs in BossAir sorted by status: %s",
            results['activeRunJobByStatus'])
        logging.info(
            "Total number of complete jobs in BossAir sorted by status: %s",
            results['completeRunJobByStatus'])

        logging.info(
            "Available slots thresholds to pull work from GQ to LQ: %s",
            results['thresholdsGQ2LQ'])
        logging.info(
            "List of jobs pending for each site, sorted by priority: %s",
            results['sitePendCountByPrio'])

        return results
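
As a side note, the status escalation applied in collectAgentInfo() above can be restated as a small pure function. The sketch below is illustrative only (names are made up) and mirrors the rules in the method: drain mode or a disk warning demotes an 'ok' agent to 'warning', and a data error or a maxed-out couch process count demotes it further to 'error'.

def escalateAgentStatus(status, drainMode, diskWarnings, dataError="ok", couchProcWarning=0):
    """Illustrative restatement of the status rules used in collectAgentInfo()."""
    if status == 'ok' and (drainMode or diskWarnings):
        status = "warning"
    if status in ('ok', 'warning'):
        if dataError != 'ok' or couchProcWarning:
            status = "error"
    return status

print(escalateAgentStatus('ok', drainMode=True, diskWarnings=[]))                         # warning
print(escalateAgentStatus('ok', drainMode=False, diskWarnings=[], dataError='NoSuchDB'))  # error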
Beispiel #48
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.config = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
                self.workQueue = localQueue(
                    **self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.maxProcessSize = getattr(self.config.TaskArchiver,
                                      'maxProcessSize', 250)
        self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
        self.nOffenders = getattr(self.config.TaskArchiver, 'nOffenders', 3)
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.uploadPublishInfo = getattr(self.config.TaskArchiver,
                                         'uploadPublishInfo', False)
        self.uploadPublishDir = getattr(self.config.TaskArchiver,
                                        'uploadPublishDir', None)
        self.userFileCacheURL = getattr(self.config.TaskArchiver,
                                        'userFileCacheURL', None)

        # Set up optional histograms
        self.histogramKeys = getattr(self.config.TaskArchiver, "histogramKeys",
                                     [])
        self.histogramBins = getattr(self.config.TaskArchiver, "histogramBins",
                                     10)
        self.histogramLimit = getattr(self.config.TaskArchiver,
                                      "histogramLimit", 5.0)

        if not self.useReqMgrForCompletionCheck:
            #sets the local monitor summary couch db
            self.wmstatsCouchDB = WMStatsWriter(
                self.config.TaskArchiver.localWMStatsURL)
            self.centralCouchDBWriter = self.wmstatsCouchDB
        else:
            self.centralCouchDBWriter = WMStatsWriter(
                self.config.TaskArchiver.centralWMStatsURL)
        # Start a couch server for getting job info
        # from the FWJRs for committal to archive
        try:
            workDBName = getattr(self.config.TaskArchiver,
                                 'workloadSummaryCouchDBName',
                                 'workloadsummary')
            workDBurl = getattr(self.config.TaskArchiver,
                                'workloadSummaryCouchURL')
            jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
            jobDBName = self.config.JobStateMachine.couchDBName
            self.jobCouchdb = CouchServer(jobDBurl)
            self.workCouchdb = CouchServer(workDBurl)

            self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                                jobDBName)
            self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                                jobDBName)
            self.workdatabase = self.workCouchdb.connectDatabase(workDBName)

            logging.debug("Using url %s/%s for job" % (jobDBurl, jobDBName))
            logging.debug("Writing to  %s/%s for workloadSummary" %
                          (sanitizeURL(workDBurl)['url'], workDBName))
            self.requireCouch = getattr(self.config.TaskArchiver,
                                        'requireCouch', False)
        except Exception as ex:
            msg = "Error in connecting to couch.\n"
            msg += str(ex)
            logging.error(msg)
            self.jobsdatabase = None
            self.fwjrdatabase = None
            if getattr(self.config.TaskArchiver, 'requireCouch', False):
                raise TaskArchiverPollerException(msg)
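
Most of the knobs above are read with getattr() and a built-in default, so a missing attribute in the TaskArchiver section silently falls back to that default. A self-contained illustration of the pattern follows; the config object here is a made-up stand-in, not the WMCore Configuration class.

class _Section(object):
    """Made-up stand-in for a configuration section."""
    pass

taskArchiver = _Section()
taskArchiver.maxProcessSize = 500            # explicitly configured
# timeOut and nOffenders are not set, so the defaults below apply

maxProcessSize = getattr(taskArchiver, 'maxProcessSize', 250)   # -> 500
timeout = getattr(taskArchiver, 'timeOut', None)                # -> None
nOffenders = getattr(taskArchiver, 'nOffenders', 3)             # -> 3
print(maxProcessSize, timeout, nOffenders)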
Beispiel #49
0
    def __init__(self, rest, config):
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
Beispiel #50
0
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for a request (workflow) from the local queue,
    local job couchdb and wmbs/bossair, and populate the summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None
                    
    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, 
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL, 
                                                 appName="WMStatsAgent")
        
        if hasattr(self.config, "Tier0Feeder"):
            #use local db for tier0
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL
        
        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
        #TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
        
        if self.pluginName != None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname = self.pluginName)

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            
            #jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            #fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            
            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()
            
            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()
            
            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else: 
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info("""Combining data from
                                   Job Couch(%s),
                                   FWJR(%s), 
                                   Batch Job(%s),
                                   Finished Tasks(%s),
                                   Local Queue(%s)  ...""" 
                    % (len(jobInfoFromCouch), len(fwjrInfoFromCouch), len(batchJobInfo), len(finishedTasks), len(localQInfo)))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData, localQInfo)

            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            
            logging.info("%s requests Data combined,\n uploading request data..." % len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch, finishedTasks,
                                                   self.agentInfo, uploadTime, self.summaryLevel)


            if self.plugin != None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info("Request data upload success\n %s request, \nsleep for next cycle" % len(requestDocs))
            DataUploadTime.setInfo(uploadTime, "ok")
            
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            DataUploadTime.setInfo(False, str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())
Beispiel #51
0
def buildWorkloadAndCheckIn(webApi, reqSchema, couchUrl, couchDB, wmstatUrl, clone=False):
    """
    If clone is True, the function is called on a cloned request, in which
    case the reqSchema shall not be modified and should be checked in as is.
    
    """
    try:
        request = buildWorkloadForRequest(typename = reqSchema["RequestType"], 
                                          schema = reqSchema)
    except WMSpecFactoryException as ex:
        logging.error(traceback.format_exc())
        raise HTTPError(400, "Error in Workload Validation: %s" % ex.message())
    
    helper = WMWorkloadHelper(request['WorkloadSpec'])
    
    # update request as well for wmstats update
    # there is a better way to do this (pass the helper to the request, but make sure all the information is there)
    request["Campaign"] = helper.getCampaign()

    # Add the output datasets if necessary
    # for some bizarre reason OutputDatasets is a list of lists when cloning
    # [['/MinimumBias/WMAgentCommissioning10-v2/RECO'], ['/MinimumBias/WMAgentCommissioning10-v2/ALCARECO']]
    # #3743
    if not clone:
        for ds in helper.listOutputDatasets():
            if ds not in request['OutputDatasets']:
                request['OutputDatasets'].append(ds)
    #TODO: need to update output dataset by Task for task chain requests
    # can't save Request object directly, because it makes it hard to retrieve the _rev
    metadata = {}
    metadata.update(request)            
    # don't want to JSONify the whole workflow
    del metadata['WorkloadSpec']
    workloadUrl = helper.saveCouch(couchUrl, couchDB, metadata=metadata)
    request['RequestWorkflow'] = removePasswordFromUrl(workloadUrl)
    try:
        CheckIn.checkIn(request, reqSchema['RequestType'])
    except CheckIn.RequestCheckInError as ex:
        raise HTTPError(400, "Error in Request check-in: %s" % str(ex))
        
    # Inconsistent request parameters between Oracle and Couch (#4380, #4388)
    # metadata above is what is saved into couch to represent a request document.
    # Number of request arguments on a corresponding couch document
    # is not set, has default null/None values, update those accordingly now.
    # It's a mess to have two mutually inconsistent database backends.
    # Not easy to handle this earlier since couch is stored first and
    # some parameters are worked out later when storing into Oracle.
    reqDetails = requestDetails(request["RequestName"])
    # couchdb request parameters which are null at the injection time and remain so
    paramsToUpdate = ["RequestStatus",
                      "RequestSizeFiles",
                      "AcquisitionEra",
                      "RequestWorkflow",
                      "RequestType",
                      "RequestStatus",
                      "RequestPriority",
                      "Requestor",
                      "Group",
                      "SizePerEvent",
                      "PrepID",
                      "RequestNumEvents",
                      "ProcessingString",
                      "ProcessingVersion",
                      ]
    
    couchDb = Database(reqDetails["CouchWorkloadDBName"], reqDetails["CouchURL"])
    fields = {}
    for key in paramsToUpdate:
        fields[key] = reqDetails[key]
    couchDb.updateDocument(request["RequestName"], "ReqMgr", "updaterequest", fields=fields, useBody=True) 
        
    try:
        wmstatSvc = WMStatsWriter(wmstatUrl)
        wmstatSvc.insertRequest(request)
    except Exception as ex:
        webApi.error("Could not update WMStats, reason: %s" % ex)
        raise HTTPError(400, "Creating request failed, could not update WMStats.")

    return request
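
The Oracle-to-Couch sync near the end above simply copies a whitelist of request parameters into the couch update call. A self-contained illustration with made-up values:

paramsToUpdate = ["RequestStatus", "RequestPriority", "Requestor", "Group"]
reqDetails = {"RequestStatus": "new", "RequestPriority": 100000,
              "Requestor": "jdoe", "Group": "DATAOPS",
              "CouchWorkloadDBName": "reqmgr_workload_cache"}  # extra keys are ignored

fields = {key: reqDetails[key] for key in paramsToUpdate}
print(fields)
# fields is what gets passed to couchDb.updateDocument(..., fields=fields, useBody=True)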
Beispiel #52
0
    def __init__(self, rest, config):

        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url, reqdbURL=config.reqmgrdb_url, 
                              reqdbCouchApp=config.reqdb_couch_app)
Beispiel #53
0
import os
import sys
from optparse import OptionParser
from WMCore.Services.WMStats.WMStatsWriter import WMStatsWriter
from WMCore.Configuration import loadConfigurationFile

if __name__ == "__main__":

    if "WMAGENT_CONFIG" not in os.environ:
        print "The WMAGENT_CONFIG environment variable needs to be set before this can run"
        sys.exit(1)

    wmagentConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    if hasattr(wmagentConfig, "AnalyticsDataCollector") and hasattr(
            wmagentConfig.AnalyticsDataCollector, "centralWMStatsURL"):
        wmstats = WMStatsWriter(
            wmagentConfig.AnalyticsDataCollector.centralWMStatsURL)
    else:
        print "AnalyticsDataCollector.centralWMStatsURL is not specified"
        sys.exit(1)

    parser = OptionParser()
    parser.set_usage("wmstats-request-status-chagne [agent_url:port]")

    parser.add_option("-r", "--request", dest="request", help="resquest name")

    parser.add_option("-s",
                      "--status",
                      dest="newstatus",
                      help="set to new status")

    (options, args) = parser.parse_args()
Beispiel #54
0
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for a request (workflow) from the local queue,
    local job couchdb and wmbs/bossair, and populate the summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()
        self.pluginName = getattr(config.AnalyticsDataCollector, "pluginName", None)
        self.plugin = None

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """
        # set the connection to local queue
        if not hasattr(self.config, "Tier0Feeder"):
            self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # set the connection for local couchDB call
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                             self.config.JobStateMachine.summaryStatsDBName,
                                             self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # set the connection for local couchDB call
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                 appName="WMStatsAgent")

        # use local db for tier0
        if hasattr(self.config, "Tier0Feeder"):
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
        else:
            centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

        self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                     couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)

        #TODO: change the config to hold couch url
        self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)

        self.dbsBufferUtil = DBSBufferUtil()

        if self.pluginName is not None:
            pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
            self.plugin = pluginFactory.loadObject(classname=self.pluginName)

    @timeFunction
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")

            fwjrInfoFromCouch = self.localCouchDB.getJobPerformanceByTaskAndSiteFromSummaryDB()
            skippedInfoFromCouch = self.localCouchDB.getSkippedFilesSummaryByWorkflow()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            logging.info("Getting Finished Task Data ...")
            finishedTasks = self.wmagentDB.getFinishedSubscriptionByTask()

            logging.info("Getting DBS PhEDEx upload status ...")
            completedWfs = self.dbsBufferUtil.getPhEDExDBSStatusForCompletedWorkflows(summary=True)

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = {}
            if not hasattr(self.config, "Tier0Feeder"):
                localQInfo = self.localQueue.getAnalyticsData()
            else:
                logging.debug("Tier-0 instance, not checking WorkQueue")

            # combine all the data from 3 sources
            logging.info("""Combining data from
                                   Job Couch(%s),
                                   FWJR(%s),
                                   WorkflowsWithSkippedFile(%s),
                                   Batch Job(%s),
                                   Finished Tasks(%s),
                                   Local Queue(%s)
                                   Completed workflows(%s) ...""",
                         len(jobInfoFromCouch), len(fwjrInfoFromCouch), len(skippedInfoFromCouch),
                         len(batchJobInfo), len(finishedTasks), len(localQInfo), len(completedWfs))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            tempCombinedData2 = combineAnalyticsData(tempCombinedData, localQInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData2, completedWfs)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            logging.info("%s requests Data combined,\n uploading request data...", len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch, finishedTasks,
                                                   skippedInfoFromCouch, self.agentInfo,
                                                   uploadTime, self.summaryLevel)

            if self.plugin != None:
                self.plugin(requestDocs, self.localSummaryCouchDB, self.centralRequestCouchDB)

            existingDocs = self.centralWMStatsCouchDB.getAllAgentRequestRevByID(self.agentInfo["agent_url"])
            self.centralWMStatsCouchDB.bulkUpdateData(requestDocs, existingDocs)

            logging.info("Request data upload success\n %s request, \nsleep for next cycle", len(requestDocs))

            self.centralWMStatsCouchDB.updateAgentInfoInPlace(self.agentInfo["agent_url"],
                                                              {"data_last_update": uploadTime, "data_error": "ok"})

        except Exception as ex:
            msg = str(ex)
            logging.exception("Error occurred, will retry later: %s", msg)

            try:
                self.centralWMStatsCouchDB.updateAgentInfoInPlace(self.agentInfo["agent_url"], {"data_error": msg})
            except Exception:
                logging.exception("Upload of agent info to central couch failed")
Beispiel #55
0
    def __init__(self, rest, config):

        super(CleanUpTask, self).__init__(config)
        self.wmstatsDB = WMStatsWriter(config.wmstats_url,
                                       reqdbURL=config.reqmgrdb_url,
                                       reqdbCouchApp=config.reqdb_couch_app)
Beispiel #56
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for a request (workflow) from the local queue,
    local job couchdb and wmbs/bossair, and populate the summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):

        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'], rp['target'], filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)
        # The first cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        """

        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from local workqueue database
        :return:
        """
        results = {}

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):

        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skip the check this round; set it to False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'],
                                                                  rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, covering:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])

        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s",
                      results['completeRunJobByStatus'])

        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (the X509_USER_PROXY credential) and set either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)


        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
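
The proxy check above boils down to two thresholds on the remaining lifetime (86400 seconds per day). A self-contained restatement, with the thresholds copied from the method:

def proxyStatus(secsLeft, currentStatus="ok"):
    """Illustrative restatement of checkProxyLifetime(): under 3 days -> error,
    under 5 days -> warning (unless the agent is already worse), else untouched.
    Returns the new status and whether a proxy warning message should be set."""
    if secsLeft <= 86400 * 3:
        return "error", True
    if secsLeft <= 86400 * 5:
        return ("warning" if currentStatus == "ok" else currentStatus), True
    return currentStatus, False

print(proxyStatus(86400 * 2))   # ('error', True)
print(proxyStatus(86400 * 4))   # ('warning', True)
print(proxyStatus(86400 * 10))  # ('ok', False)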
Beispiel #57
0
class AccountantWorker(WMConnectionBase):
    """
    Class that actually does the work of parsing FWJRs for the Accountant.
    Run through ProcessPool.
    """
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        self.pnn_to_psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for later committal
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return

    def reset(self):
        """
        _reset_

        Reset all global vars between runs.
        """
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        gc.collect()
        return

    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If there is any problem loading or parsing the
        framework job report return None.
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        jobReportPath = parameters.get("fwjr_path", None)
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999,
                                           "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://", "")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(
                parameters, 99999,
                'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % jobReportPath)
            return self.createMissingFWKJR(
                parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            msg = "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997,
                                           'Cannot load jobReport')

        if len(jobReport.listSteps()) == 0:
            logging.error("FwkJobReport with no steps: %s" % jobReportPath)
            return self.createMissingFWKJR(
                parameters, 99997,
                'jobReport with no steps: %s ' % jobReportPath)

        return jobReport
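
    # Note on the sentinel error codes used above when the report cannot be
    # loaded: 99999 marks a missing or unreachable fwjr path, 99998 an empty
    # (zero-byte) report, and 99997 a report that cannot be parsed or that
    # contains no steps. createMissingFWKJR() (not shown here) presumably builds
    # a placeholder report carrying that code and message so the job can still
    # be handled as a failure.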

    def isTaskExistInFWJR(self, jobReport, jobStatus):
        """
        If taskName is not available in the FWJR, then tries to
        recover it getting data from the SQL database.
        """
        if not jobReport.getTaskName():
            logging.warning(
                "Trying to recover a corrupted FWJR for a %s job with job id %s"
                % (jobStatus, jobReport.getJobID()))
            jobInfo = self.getJobTaskNameAction.execute(
                jobId=jobReport.getJobID(),
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

            jobReport.setTaskName(jobInfo['taskName'])
            jobReport.save(jobInfo['fwjr_path'])
            if not jobReport.getTaskName():
                msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % (
                    jobStatus, jobReport.getJobID())
                raise AccountantWorkerException(msg)
            else:
                logging.info(
                    "TaskName '%s' successfully recovered and added to fwjr id %s."
                    % (jobReport.getTaskName(), jobReport.getJobID()))

        return

    def __call__(self, parameters):
        """
        __call__

        Handle a completed job.  The parameters dictionary will contain the job
        ID and the path to the framework job report.
        """
        returnList = []
        self.reset()

        for job in parameters:
            logging.info("Handling %s" % job["fwjr_path"])

            # Load the job and set the ID
            fwkJobReport = self.loadJobReport(job)
            fwkJobReport.setJobID(job['id'])

            jobSuccess = self.handleJob(jobID=job["id"],
                                        fwkJobReport=fwkJobReport)

            if self.returnJobReport:
                returnList.append({
                    'id': job["id"],
                    'jobSuccess': jobSuccess,
                    'jobReport': fwkJobReport
                })
            else:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess})

            self.count += 1

        self.beginTransaction()

        # Now things done at the end of the job
        # Do what we can with WMBS files
        self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds)

        # handle merge files separately since parentage need to set
        # separately to support robust merge
        self.handleWMBSFiles(self.wmbsMergeFilesToBuild,
                             self.parentageBindsForMerge)

        # Create DBSBufferFiles
        self.createFilesInDBSBuffer()

        # Handle filesetAssoc
        if len(self.filesetAssoc) > 0:
            self.bulkAddToFilesetAction.execute(
                binds=self.filesetAssoc,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        # Move successful jobs to successful
        if len(self.listOfJobsToSave) > 0:
            idList = [x['id'] for x in self.listOfJobsToSave]
            outcomeBinds = [{
                'jobid': x['id'],
                'outcome': x['outcome']
            } for x in self.listOfJobsToSave]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.jobCompleteInput.execute(
                id=idList,
                lfnsToSkip=self.jobsWithSkippedFiles,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToSave, "success",
                                        "complete")

        # If we have failed jobs, fail them
        if len(self.listOfJobsToFail) > 0:
            outcomeBinds = [{
                'jobid': x['id'],
                'outcome': x['outcome']
            } for x in self.listOfJobsToFail]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed",
                                        "complete")

        # Arrange WMBS parentage
        if len(self.parentageBinds) > 0:
            self.setParentageByJob.execute(
                binds=self.parentageBinds,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
        if len(self.parentageBindsForMerge) > 0:
            self.setParentageByMergeJob.execute(
                binds=self.parentageBindsForMerge,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        # Straighten out DBS Parentage
        if len(self.mergedOutputFiles) > 0:
            self.handleDBSBufferParentage()

        if len(self.jobsWithSkippedFiles) > 0:
            self.handleSkippedFiles()

        self.commitTransaction(existingTransaction=False)

        return returnList

    def outputFilesetsForJob(self, outputMap, merged, moduleLabel):
        """
        _outputFilesetsForJob_

        Determine if the file should be placed in any other fileset.  Note that
        this will not return the JobGroup output fileset as all jobs will have
        their output placed there.
        """
        if moduleLabel not in outputMap:
            logging.info("Output module label missing from output map.")
            return []

        outputFilesets = []
        for outputFileset in outputMap[moduleLabel]:
            if merged == False and outputFileset["output_fileset"] != None:
                outputFilesets.append(outputFileset["output_fileset"])
            else:
                if outputFileset["merged_output_fileset"] != None:
                    outputFilesets.append(
                        outputFileset["merged_output_fileset"])

        return outputFilesets

    def addFileToDBS(self, jobReportFile, task, errorDataset=False):
        """
        _addFileToDBS_

        Add a file that was output from a job to the DBS buffer.
        """
        datasetInfo = jobReportFile["dataset"]

        dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"],
                                size=jobReportFile["size"],
                                events=jobReportFile["events"],
                                checksums=jobReportFile["checksums"],
                                status="NOTUPLOADED")
        dbsFile.setAlgorithm(appName=datasetInfo["applicationName"],
                             appVer=datasetInfo["applicationVersion"],
                             appFam=jobReportFile["module_label"],
                             psetHash="GIBBERISH",
                             configContent=jobReportFile.get('configURL'))

        if errorDataset:
            dbsFile.setDatasetPath(
                "/%s/%s/%s" %
                (datasetInfo["primaryDataset"] + "-Error",
                 datasetInfo["processedDataset"], datasetInfo["dataTier"]))
        else:
            dbsFile.setDatasetPath(
                "/%s/%s/%s" %
                (datasetInfo["primaryDataset"],
                 datasetInfo["processedDataset"], datasetInfo["dataTier"]))

        dbsFile.setValidStatus(
            validStatus=jobReportFile.get("validStatus", None))
        dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None))
        dbsFile.setAcquisitionEra(
            era=jobReportFile.get('acquisitionEra', None))
        dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None))
        #TODO need to find where to get the prep id
        dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None))
        dbsFile['task'] = task

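        # Transfer the run/lumi information from the job report file onto the DBSBuffer file.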
        for run in jobReportFile["runs"]:
            newRun = Run(runNumber=run.run)
            newRun.extend(run.lumis)
            dbsFile.addRun(newRun)

        dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0],
                            immediateSave=False)
        self.dbsFilesToCreate.append(dbsFile)
        return

    def findDBSParents(self, lfn):
        """
        _findDBSParents_

        Find the parents of the file in DBS.
        This is meant to be called recursively.
        """
        parentsInfo = self.getParentInfoAction.execute(
            [lfn],
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        newParents = set()
        for parentInfo in parentsInfo:
            # This will catch straight to merge files that do not have redneck
            # parents.  We will mark the straight to merge file from the job
            # as a child of the merged parent.
            if int(parentInfo["merged"]) == 1:
                newParents.add(parentInfo["lfn"])

            elif parentInfo['gpmerged'] == None:
                continue

            # Handle the files that result from merge jobs that aren't redneck
            # children.  We have to setup parentage and then check on whether or
            # not this file has any redneck children and update their parentage
            # information.
            elif int(parentInfo["gpmerged"]) == 1:
                newParents.add(parentInfo["gplfn"])

            # If that didn't work, we've reached the great-grandparents
            # And we have to work via recursion
            else:
                parentSet = self.findDBSParents(lfn=parentInfo['gplfn'])
                for parent in parentSet:
                    newParents.add(parent)

        return newParents

    def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None):
        """
        _addFileToWMBS_

        Add a file that was produced in a job to WMBS.
        """
        fwjrFile["first_event"] = jobMask["FirstEvent"]

        if fwjrFile["first_event"] == None:
            fwjrFile["first_event"] = 0

        if jobType == "Merge" and fwjrFile["module_label"] != "logArchive":
            setattr(fwjrFile["fileRef"], 'merged', True)
            fwjrFile["merged"] = True

        wmbsFile = self.createFileFromDataStructsFile(file=fwjrFile,
                                                      jobID=jobID)

        if jobType == "Merge":
            self.wmbsMergeFilesToBuild.append(wmbsFile)
        else:
            self.wmbsFilesToBuild.append(wmbsFile)

        if fwjrFile["merged"]:
            self.addFileToDBS(
                fwjrFile, task, jobType == "Repack"
                and fwjrFile["size"] > self.maxAllowedRepackOutputSize)

        return wmbsFile

    def _mapLocation(self, fwkJobReport):
        for file in fwkJobReport.getAllFileRefs():
            if file and hasattr(file, 'location'):
                file.location = self.phedex.getBestNodeName(
                    file.location, self.locLists)

    def handleJob(self, jobID, fwkJobReport):
        """
        _handleJob_

        Figure out if a job was successful or not, handle it appropriately
        (parse FWJR, update WMBS) and return the success status as a boolean

        """
        jobSuccess = fwkJobReport.taskSuccessful()

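        # Pull the expected output modules and the job type from WMBS; both are
        # used for the output consistency check below.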
        outputMap = self.getOutputMapAction.execute(
            jobID=jobID,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        jobType = self.getJobTypeAction.execute(
            jobID=jobID,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        if jobSuccess:
            fileList = fwkJobReport.getAllFiles()

            # consistency check comparing outputMap to fileList
            # they should match except for some limited special cases
            outputModules = set([])
            for fwjrFile in fileList:
                outputModules.add(fwjrFile['outputModule'])
            if set(outputMap.keys()) == outputModules:
                pass
            elif jobType == "LogCollect" and len(
                    outputMap.keys()) == 0 and outputModules == set(
                        ['LogCollect']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set([
                    'Merged', 'MergedError', 'logArchive'
            ]) and outputModules == set(['Merged', 'logArchive']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set([
                    'Merged', 'MergedError', 'logArchive'
            ]) and outputModules == set(['MergedError', 'logArchive']):
                pass
            elif jobType == "Express" and set(
                    outputMap.keys()).difference(outputModules) == set(
                        ['write_RAW']):
                pass
            else:
                failJob = True
                if jobType in ["Processing", "Production"]:
                    cmsRunSteps = 0
                    for step in fwkJobReport.listSteps():
                        if step.startswith("cmsRun"):
                            cmsRunSteps += 1
                    if cmsRunSteps > 1:
                        failJob = False

                if failJob:
                    jobSuccess = False
                    logging.error(
                        "Job %d , list of expected outputModules does not match job report, failing job",
                        jobID)
                    logging.debug("Job %d , expected outputModules %s", jobID,
                                  sorted(outputMap.keys()))
                    logging.debug("Job %d , fwjr outputModules %s", jobID,
                                  sorted(outputModules))
                    fileList = fwkJobReport.getAllFilesFromStep(
                        step='logArch1')
                else:
                    logging.debug(
                        "Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job",
                        jobID)
        else:
            fileList = fwkJobReport.getAllFilesFromStep(step='logArch1')

        if jobSuccess:
            logging.info("Job %d , handle successful job", jobID)
        else:
            logging.warning("Job %d , bad jobReport, failing job", jobID)

        # make sure the task name is present in FWJR (recover from WMBS if needed)
        if len(fileList) > 0:
            if jobSuccess:
                self.isTaskExistInFWJR(fwkJobReport, "success")
            else:
                self.isTaskExistInFWJR(fwkJobReport, "failed")

        # special check for LogCollect jobs
        skipLogCollect = False
        if jobSuccess and jobType == "LogCollect":
            for fwjrFile in fileList:
                try:
                    # this assumes there is only one file for LogCollect jobs, not sure what happens if that changes
                    self.associateLogCollectToParentJobsInWMStats(
                        fwkJobReport, fwjrFile["lfn"],
                        fwkJobReport.getTaskName())
                except Exception as ex:
                    skipLogCollect = True
                    logging.error(
                        "Error occurred while associating the log collect location, will try again\n %s"
                        % str(ex))
                    break

        # now handle the job (unless the special LogCollect check failed)
        if not skipLogCollect:

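            # Load the WMBS job, its output fileset ID and its mask, and attach
            # the framework job report before registering the output files.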
            wmbsJob = Job(id=jobID)
            wmbsJob.load()
            outputID = wmbsJob.loadOutputID()
            wmbsJob.getMask()

            wmbsJob["fwjr"] = fwkJobReport

            if jobSuccess:
                wmbsJob["outcome"] = "success"
            else:
                wmbsJob["outcome"] = "failure"

            for fwjrFile in fileList:

                logging.debug("Job %d , register output %s", jobID,
                              fwjrFile["lfn"])

                wmbsFile = self.addFileToWMBS(jobType,
                                              fwjrFile,
                                              wmbsJob["mask"],
                                              jobID=jobID,
                                              task=fwkJobReport.getTaskName())
                merged = fwjrFile['merged']
                moduleLabel = fwjrFile["module_label"]

                if merged:
                    self.mergedOutputFiles.append(wmbsFile)

                self.filesetAssoc.append({
                    "lfn": wmbsFile["lfn"],
                    "fileset": outputID
                })

                # LogCollect jobs have no output fileset
                if jobType == "LogCollect":
                    pass
                # Repack jobs that wrote too large merged output skip output filesets
                elif jobType == "Repack" and merged and wmbsFile[
                        "size"] > self.maxAllowedRepackOutputSize:
                    pass
                else:
                    outputFilesets = self.outputFilesetsForJob(
                        outputMap, merged, moduleLabel)
                    for outputFileset in outputFilesets:
                        self.filesetAssoc.append({
                            "lfn": wmbsFile["lfn"],
                            "fileset": outputFileset
                        })

            # Check if the job had any skipped files, put them in ACDC containers
            # We assume full file processing (no job masks)
            if jobSuccess:
                skippedFiles = fwkJobReport.getAllSkippedFiles()
                if skippedFiles and jobType not in ['LogCollect', 'Cleanup']:
                    self.jobsWithSkippedFiles[jobID] = skippedFiles

            # Only save once job is done, and we're sure we made it through okay
            self._mapLocation(wmbsJob['fwjr'])
            if jobSuccess:
                self.listOfJobsToSave.append(wmbsJob)
            else:
                self.listOfJobsToFail.append(wmbsJob)

        return jobSuccess

    def associateLogCollectToParentJobsInWMStats(self, fwkJobReport,
                                                 logAchiveLFN, task):
        """
        _associateLogCollectToParentJobsInWMStats_

        Associate a logArchive output to its parent job
        """
        inputFileList = fwkJobReport.getAllInputFiles()
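        # Task paths look like '/RequestName/TaskName/...', so element 1 of the
        # split is the request name.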
        requestName = task.split('/')[1]
        keys = []
        for inputFile in inputFileList:
            keys.append([requestName, inputFile["lfn"]])
        resultRows = self.fwjrCouchDB.loadView(
            "FWJRDump",
            'jobsByOutputLFN',
            options={"stale": "update_after"},
            keys=keys)['rows']
        if len(resultRows) > 0:
            #get data from wmbs
            parentWMBSJobIDs = []
            for row in resultRows:
                parentWMBSJobIDs.append({"jobid": row["value"]})
            #update Job doc in wmstats
            results = self.getJobInfoByID.execute(parentWMBSJobIDs)
            parentJobNames = []

            if isinstance(results, list):
                for jobInfo in results:
                    parentJobNames.append(jobInfo['name'])
            else:
                parentJobNames.append(results['name'])

            self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN)
        else:
            #TODO: if the couch DB were consistent with the relational DB this check (resultRows > 0) could be removed
            # Such a case needs to be failed and retried.
            logging.error(
                "Job report is missing, cannot update the log archive mapping\n Input file list\n %s"
                % inputFileList)

        return

    def createMissingFWKJR(self,
                           parameters,
                           errorCode=999,
                           errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a placeholder FWJR when the job report cannot be found at the
        expected path.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report

    def createFilesInDBSBuffer(self):
        """
        _createFilesInDBSBuffer_

        Do the actual work of creating the files in DBSBuffer.
        WARNING: This assumes all files in a job have the same final location.
        """
        if len(self.dbsFilesToCreate) == 0:
            # Whoops, nothing to do!
            return

        dbsFileTuples = []
        dbsFileLoc = []
        dbsCksumBinds = []
        runLumiBinds = []
        selfChecksums = None
        jobLocations = set()

        for dbsFile in self.dbsFilesToCreate:
            # Append a tuple in the format specified by DBSBufferFiles.Add
            # Also run insertDatasetAlgo

            assocID = None
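            # Cache key combining the dataset path with all algorithm fields, so
            # each unique (dataset, algo) association is inserted only once.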
            datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (
                dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"],
                dbsFile["appFam"], dbsFile["psetHash"],
                dbsFile['processingVer'], dbsFile['acquisitionEra'],
                dbsFile['globalTag'])
            # First, check if this is in the cache
            if datasetAlgoPath in self.datasetAlgoPaths:
                for da in self.datasetAlgoID:
                    if da['datasetAlgoPath'] == datasetAlgoPath:
                        assocID = da['assocID']
                        break

            if not assocID:
                # Then we have to get it ourselves
                try:
                    assocID = dbsFile.insertDatasetAlgo()
                    self.datasetAlgoPaths.append(datasetAlgoPath)
                    self.datasetAlgoID.append({
                        'datasetAlgoPath': datasetAlgoPath,
                        'assocID': assocID
                    })
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath
                    msg += str(ex)
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            # Associate the workflow to the file using the taskPath and the requestName
            # TODO: debug why it happens and then drop/recover these cases automatically
            taskPath = dbsFile.get('task')
            if not taskPath:
                msg = "Can't do workflow association, report this error to a developer.\n"
                msg += "DbsFile : %s" % str(dbsFile)
                raise AccountantWorkerException(msg)
            workflowName = taskPath.split('/')[1]
            workflowPath = '%s:%s' % (workflowName, taskPath)
            if workflowPath in self.workflowPaths:
                for wf in self.workflowIDs:
                    if wf['workflowPath'] == workflowPath:
                        workflowID = wf['workflowID']
                        break
            else:
                result = self.dbsGetWorkflow.execute(
                    workflowName,
                    taskPath,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
                workflowID = result['id']

            self.workflowPaths.append(workflowPath)
            self.workflowIDs.append({
                'workflowPath': workflowPath,
                'workflowID': workflowID
            })

            lfn = dbsFile['lfn']
            selfChecksums = dbsFile['checksums']
            jobLocation = dbsFile.getLocations()[0]
            jobLocations.add(jobLocation)
            dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'],
                                  assocID, dbsFile['status'], workflowID))

            dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation})
            if dbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    dbsCksumBinds.append({
                        'lfn': lfn,
                        'cksum': selfChecksums[entry],
                        'cktype': entry
                    })

        try:

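            # Register any storage locations (PNNs) not yet known to DBSBuffer
            # before inserting the files themselves.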
            diffLocation = jobLocations.difference(self.dbsLocations)

            for jobLocation in diffLocation:
                self.dbsInsertLocation.execute(
                    siteName=jobLocation,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
                self.dbsLocations.add(jobLocation)

            self.dbsCreateFiles.execute(files=dbsFileTuples,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetLocation.execute(binds=dbsFileLoc,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetChecksum.execute(bulkList=dbsCksumBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            if len(runLumiBinds) > 0:
                self.dbsSetRunLumi.execute(
                    file=runLumiBinds,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Got exception while inserting files into DBSBuffer!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Listing binds:")
            logging.debug("jobLocation: %s\n" % jobLocation)
            logging.debug("dbsFiles: %s\n" % dbsFileTuples)
            logging.debug("dbsFileLoc: %s\n" % dbsFileLoc)
            logging.debug("Checksum binds: %s\n" % dbsCksumBinds)
            logging.debug("RunLumi binds: %s\n" % runLumiBinds)
            raise AccountantWorkerException(msg)

        # Now that we've created those files, clear the list
        self.dbsFilesToCreate = []
        return

    def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds):
        """
        _handleWMBSFiles_

        Do whatever can be done in bulk, in bulk.
        """
        if len(wmbsFilesToBuild) == 0:
            # Nothing to do
            return

        runLumiBinds = []
        fileCksumBinds = []
        fileLocations = []
        fileCreate = []

        for wmbsFile in wmbsFilesToBuild:
            lfn = wmbsFile['lfn']
            if lfn == None:
                continue

            selfChecksums = wmbsFile['checksums']
            # by jobType add to different parentage relation
            # if it is the merge job, don't include the parentage on failed input files.
            # otherwise parentage is set for all input files.
            parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']})

            if wmbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']})

            if len(wmbsFile.getLocations()) > 0:
                outpnn = wmbsFile.getLocations()[0]
                if self.pnn_to_psn.get(outpnn, None):
                    fileLocations.append({'lfn': lfn, 'location': outpnn})
                else:
                    msg = "PNN doesn't exist in wmbs_location_sename table: %s (investigate)" % outpnn
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    fileCksumBinds.append({
                        'lfn': lfn,
                        'cksum': selfChecksums[entry],
                        'cktype': entry
                    })

            fileCreate.append([
                lfn, wmbsFile['size'], wmbsFile['events'], None,
                wmbsFile["first_event"], wmbsFile['merged']
            ])

        if len(fileCreate) == 0:
            return

        try:

            self.addFileAction.execute(files=fileCreate,
                                       conn=self.getDBConn(),
                                       transaction=self.existingTransaction())

            if runLumiBinds:
                self.setFileRunLumi.execute(
                    file=runLumiBinds,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())

            self.setFileAddChecksum.execute(
                bulkList=fileCksumBinds,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

            self.setFileLocation.execute(
                lfn=fileLocations,
                location=self.fileLocation,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        except WMException:
            raise
        except Exception as ex:
            msg = "Error while adding files to WMBS!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Printing binds: \n")
            logging.debug("FileCreate binds: %s\n" % fileCreate)
            logging.debug("Runlumi binds: %s\n" % runLumiBinds)
            logging.debug("Checksum binds: %s\n" % fileCksumBinds)
            logging.debug("FileLocation binds: %s\n" % fileLocations)
            raise AccountantWorkerException(msg)

        # Clear out finished files
        wmbsFilesToBuild = []
        return

    def createFileFromDataStructsFile(self, file, jobID):
        """
        _createFileFromDataStructsFile_

        This function will create a WMBS File given a DataStructs file
        """
        wmbsFile = File()
        wmbsFile.update(file)

        if isinstance(file["locations"], set):
            pnn = list(file["locations"])[0]
        elif isinstance(file["locations"], list):
            if len(file['locations']) > 1:
                logging.error(
                    "Have more than one location for a file in job %i" %
                    (jobID))
                logging.error("Choosing location %s" % (file['locations'][0]))
            pnn = file["locations"][0]
        else:
            pnn = file["locations"]

        wmbsFile["locations"] = set()

        if pnn != None:
            wmbsFile.setLocation(pnn=pnn, immediateSave=False)
        wmbsFile['jid'] = jobID

        return wmbsFile

    def handleDBSBufferParentage(self):
        """
        _handleDBSBufferParentage_

        Handle all the DBSBuffer Parentage in bulk if you can
        """
        outputLFNs = [f['lfn'] for f in self.mergedOutputFiles]
        bindList = []
        for lfn in outputLFNs:
            newParents = self.findDBSParents(lfn=lfn)
            for parentLFN in newParents:
                bindList.append({'child': lfn, 'parent': parentLFN})

        # Now all the parents should exist
        # Commit them to DBSBuffer
        logging.info("About to commit all DBSBuffer Heritage information")
        logging.info(len(bindList))

        if len(bindList) > 0:
            try:
                self.dbsLFNHeritage.execute(
                    binds=bindList,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
            except WMException:
                raise
            except Exception as ex:
                msg = "Error while trying to handle the DBS LFN heritage\n"
                msg += str(ex)
                msg += "BindList: %s" % bindList
                logging.error(msg)
                raise AccountantWorkerException(msg)
        return

    def handleSkippedFiles(self):
        """
        _handleSkippedFiles_

        Handle all the skipped files in bulk.
        The way skipped files are handled imposes an important restriction:
        each skipped file should have been processed by a single job
        in the task, with no job mask applied.
        This is suitable for jobs using the ParentlessMergeBySize/FileBased/MinFileBased
        splitting algorithms.
        Here ACDC records are created and the files are moved
        from completed to wmbs_sub_files_failed.
        """
        jobList = self.getFullJobInfo.execute(
            [{
                'jobid': x
            } for x in self.jobsWithSkippedFiles.keys()],
            fileSelection=self.jobsWithSkippedFiles,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        self.dataCollection.failedJobs(jobList, useMask=False)
        return
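
The fileset-routing rule implemented by outputFilesetsForJob() above is easy to miss inside the larger listing, so here is a minimal standalone sketch of just that decision. The dictionary keys ('output_fileset', 'merged_output_fileset') follow the snippet above; the function name and example values are illustrative and not part of the WMCore API:

def pick_output_filesets(output_map, merged, module_label):
    """Illustrative re-implementation of the merged/unmerged routing."""
    filesets = []
    for entry in output_map.get(module_label, []):
        if not merged and entry.get("output_fileset") is not None:
            # Unmerged files go to the per-module (unmerged) fileset.
            filesets.append(entry["output_fileset"])
        elif entry.get("merged_output_fileset") is not None:
            # Merged files, or modules without an unmerged fileset, go to the merged fileset.
            filesets.append(entry["merged_output_fileset"])
    return filesets

# Hypothetical output map with one module feeding two filesets.
example_map = {"outputRECO": [{"output_fileset": 11, "merged_output_fileset": 12}]}
assert pick_output_filesets(example_map, merged=False, module_label="outputRECO") == [11]
assert pick_output_filesets(example_map, merged=True, module_label="outputRECO") == [12]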
Beispiel #58
0
def updateRequestStatus(couchURL, requestList, status):
    ww = WMStatsWriter(couchURL)
    for request in requestList:
        ww.updateRequestStatus(request, status)
        print("%s is udated to %s" % (request, status))
Beispiel #59
0
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        self.pnn_to_psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for later committal
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
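        # Bounded caches: a deque with maxlen silently drops its oldest entries,
        # keeping the dataset/algo and workflow lookups at a fixed size.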
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return