Ejemplo n.º 1
0
    def setUpCouchDBReplication(self):
        """
        Register and start the continuous couch replications of this agent.

        The job summary database is always replicated to the central WMStats
        url.  A Tier0 agent (self.isT0agent) additionally replicates its
        request database; every other agent replicates the workqueue in both
        directions between the parent queue and the local inbox.  Existing
        replicator documents are deleted before the new replications are
        started.
        """
        self.replicatorDocs = []
        docs = self.replicatorDocs
        config = self.config

        # replication common to every agent flavour
        summarySource = config.JobStateMachine.jobSummaryDBName
        summaryTarget = config.General.centralWMStatsURL
        docs.append(dict(source=summarySource,
                         target=summaryTarget,
                         filter="WMStatsAgent/repfilter"))

        if not self.isT0agent:
            # workqueue replication, one document per direction
            queueFilter = 'WorkQueue/queueFilter'
            queueParams = config.WorkQueueManager.queueParams
            parentQueue = queueParams["ParentQueueCouchUrl"]
            childQueue = queueParams["QueueURL"]
            filterArgs = dict(childUrl=childQueue,
                              parentUrl=sanitizeURL(parentQueue)['url'])
            inboxURL = "%s_inbox" % config.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox
            docs.append(dict(source=sanitizeURL(parentQueue)['url'],
                             target=inboxURL,
                             filter=queueFilter,
                             query_params=filterArgs))
            # local inbox -> parent queue
            docs.append(dict(source=sanitizeURL(inboxURL)['url'],
                             target=parentQueue,
                             filter=queueFilter,
                             query_params=filterArgs))
        else:
            requestSource = config.Tier0Feeder.requestDBName
            requestTarget = config.AnalyticsDataCollector.centralRequestDBURL
            docs.append(dict(source=requestSource,
                             target=requestTarget,
                             filter="T0Request/repfilter"))

        # drop the old replicator docs before registering the new ones
        monitor = self.localCouchMonitor
        monitor.deleteReplicatorDocs()

        for doc in self.replicatorDocs:
            # documents without query_params pass False, as before
            filterParams = doc.get('query_params', False)
            monitor.couchServer.replicate(doc['source'],
                                          doc['target'],
                                          filter=doc['filter'],
                                          query_params=filterParams,
                                          continuous=True)
        # The first cycle needs to be skipped since the document is not
        # updated that fast
        self.skipReplicationCheck = True
Ejemplo n.º 2
0
 def __init__(self,
              db_url,
              db_name='workqueue',
              inbox_name='workqueue_inbox',
              parentQueue=None,
              queueUrl=None,
              logger=None):
     """
     Connect to the workqueue database and its inbox on one couch server.

     :param db_url: url handed to CouchServer; also kept as hostWithAuth
     :param db_name: name of the queue database
     :param inbox_name: name of the inbox database
     :param parentQueue: url of the parent queue, kept as given in
         parentCouchUrlWithAuth and sanitized in parentCouchUrl
     :param queueUrl: url of this queue; defaults to db_url + '/' + db_name
     :param logger: logger to use; the logging module is used when omitted
     """
     if not logger:
         import logging
         logger = logging
     self.logger = logger

     self.hostWithAuth = db_url
     self.server = CouchServer(db_url)

     # keep both the raw parent url and its sanitized form
     self.parentCouchUrlWithAuth = parentQueue
     self.parentCouchUrl = sanitizeURL(parentQueue)['url'] if parentQueue else None

     # neither database is created here (create=False)
     self.db = self.server.connectDatabase(db_name, create=False, size=10000)
     self.inbox = self.server.connectDatabase(inbox_name, create=False, size=10000)

     if queueUrl:
         ownUrl = queueUrl
     else:
         ownUrl = db_url + '/' + db_name
     self.queueUrl = sanitizeURL(ownUrl)['url']
Ejemplo n.º 3
0
def main():
    """
    _main_

    Load the agent configuration (WMAGENT_CONFIG, with a default path when
    the variable is unset), connect to ReqMgr2 and the local workqueue and
    collect in workflowsToUpdate every available workflow whose workqueue
    priority differs from the RequestPriority known to ReqMgr2.
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiating central reqmgr and local workqueue
    # (single-argument print() behaves the same on Python 2 and Python 3)
    print("ReqMgr2 URL  : %s" % sanitizeURL(config.JobUpdater.reqMgr2Url)['url'])
    print("WorkQueue URL: %s and dbname %s" % (sanitizeURL(config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname))

    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl, config.WorkQueueManager.dbname)

    print("\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue")
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = list(workqueue.getAvailableWorkflows())
    print("Retrieved %d workflows from workqueue" % len(workflowsToCheck))

    for workflow, priority in workflowsToCheck:
        if workflow not in priorityCache:
            # best effort: a request that cannot be looked up is reported
            # and skipped, it must not abort the whole run
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
            except Exception as ex:
                print("Couldn't retrieve the priority of request %s" % workflow)
                print("Error: %s" % ex)
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
Ejemplo n.º 4
0
    def __init__(self,
                 db_url,
                 db_name='workqueue',
                 inbox_name=None,
                 parentQueue=None,
                 queueUrl=None,
                 logger=None):
        """
        Connect to the workqueue database and its inbox on one couch server.

        :param db_url: url handed to CouchServer; also kept as hostWithAuth
        :param db_name: name of the queue database
        :param inbox_name: name of the inbox database; "<db_name>_inbox"
            when None
        :param parentQueue: url of the parent queue, kept as given in
            parentCouchUrlWithAuth and sanitized in parentCouchUrl
        :param queueUrl: url of this queue; defaults to db_url + '/' + db_name
        :param logger: logger to use; the logging module is used when omitted
        """
        if not logger:
            import logging
            logger = logging
        self.logger = logger

        inbox_name = ("%s_inbox" % db_name) if inbox_name is None else inbox_name

        self.hostWithAuth = db_url
        self.eleKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
        self.server = CouchServer(db_url)

        # keep both the raw parent url and its sanitized form
        self.parentCouchUrlWithAuth = parentQueue
        self.parentCouchUrl = sanitizeURL(parentQueue)['url'] if parentQueue else None

        # neither database is created here (create=False)
        self.db = self.server.connectDatabase(db_name, create=False, size=10000)
        self.inbox = self.server.connectDatabase(inbox_name, create=False, size=10000)

        if queueUrl:
            ownUrl = queueUrl
        else:
            ownUrl = db_url + '/' + db_name
        self.queueUrl = sanitizeURL(ownUrl)['url']
Ejemplo n.º 5
0
 def post(self, workload_pair_list, multi_update_flag = False):
     """
     Store new requests in couchDB.

     The arguments are passed from validation (validation converts the
     cherrypy.request.body data into arguments).

     :param workload_pair_list: iterable of (workload, request_args) pairs
     :param multi_update_flag: when true the whole call is delegated to
         self.put() and its result is returned unchanged
     :return: list with one {'request': <workload name>} dict per pair

     TODO:
     this method will have some parts factored out so that e.g. clone call
     can share functionality.

     NOTES:
     1) do not strip spaces, #4705 will fail upon injection with spaces;
         currently the chain relies on a number of things coming in #4705

     2) reqInputArgs = Utilities.unidecode(JsonWrapper.loads(body))
         (from ReqMgrRESTModel.putRequest)
     """
     if multi_update_flag:
         return self.put(workload_pair_list)

     created = []
     for workload, request_args in workload_pair_list:
         cherrypy.log("INFO: Create request, input args: %s ..." % request_args)
         couch_url = request_args["CouchURL"]
         workload_db = request_args["CouchWorkloadDBName"]
         spec_url = "%s/%s/%s/spec" % (couch_url, workload_db, workload.name())
         request_args['RequestWorkflow'] = sanitizeURL(spec_url)['url']
         # store the request document in couch, request_args as metadata
         workload.saveCouch(couch_url, workload_db, metadata=request_args)
         created.append({'request': workload.name()})
     return created
Ejemplo n.º 6
0
 def setup(self, parameters):
     """
     Called at startup

     Creates the WMStats writer, the request DB reader/writer and the
     connections to the jobs, fwjrs and summary stats databases.

     :param parameters: not used by this method
     """
     archiver = self.config.TaskArchiver
     # set the connection for local couchDB call
     self.useReqMgrForCompletionCheck = getattr(archiver, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(archiver.localWMStatsURL)

     #TODO: we might need to use local db for Tier0
     collector = self.config.AnalyticsDataCollector
     self.centralRequestDBReader = RequestDBReader(collector.centralRequestDBURL,
                                                   couchapp = collector.RequestCouchApp)

     if not self.useReqMgrForCompletionCheck:
         # Tier0 case
         self.deletableStates = ["completed"]
         # use local for update
         self.centralRequestDBWriter = RequestDBWriter(collector.localT0RequestDBURL,
                                                       couchapp = collector.RequestCouchApp)
     else:
         self.deletableStates = ["announced"]
         self.centralRequestDBWriter = RequestDBWriter(collector.centralRequestDBURL,
                                                       couchapp = collector.RequestCouchApp)
         #TODO: remove this for reqmgr2
         self.reqmgrSvc = RequestManager({'endpoint': archiver.ReqMgrServiceURL})

     stateMachine = self.config.JobStateMachine
     cleanJobURL = sanitizeURL(stateMachine.couchurl)['url']
     dbPrefix = stateMachine.couchDBName
     self.jobCouchdb = CouchServer(cleanJobURL)
     self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % dbPrefix)
     self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % dbPrefix)

     self.statsumdatabase = self.jobCouchdb.connectDatabase(stateMachine.summaryStatsDBName)
Ejemplo n.º 7
0
 def setup(self, parameters):
     """
     Called at startup

     Creates the local and central WMStats connections and the connections
     to the jobs, fwjrs and summary stats databases.

     :param parameters: not used by this method
     """
     archiver = self.config.TaskArchiver
     # set the connection for local couchDB call
     self.useReqMgrForCompletionCheck = getattr(archiver, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(archiver.localWMStatsURL)
     self.centralCouchDBReader = WMStatsReader(archiver.centralWMStatsURL)

     if not self.useReqMgrForCompletionCheck:
         # Tier0 case: the local WMStats writer is used as the central one
         self.deletableStates = ["completed"]
         self.centralCouchDBWriter = self.wmstatsCouchDB
     else:
         self.deletableStates = ["announced"]
         self.centralCouchDBWriter = WMStatsWriter(archiver.centralWMStatsURL)
         self.reqmgrSvc = RequestManager({'endpoint': archiver.ReqMgrServiceURL})

     stateMachine = self.config.JobStateMachine
     cleanJobURL = sanitizeURL(stateMachine.couchurl)['url']
     dbPrefix = stateMachine.couchDBName
     self.jobCouchdb = CouchServer(cleanJobURL)
     self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % dbPrefix)
     self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % dbPrefix)

     self.statsumdatabase = self.jobCouchdb.connectDatabase(stateMachine.summaryStatsDBName)
Ejemplo n.º 8
0
    def checkReplicationStatus(self):
        """
        _checkReplicationStatus_

        Check if the workqueue replication is ok, if not
        then delete the documents so that new replications can be triggered
        when appropriate.

        The check counts the active couch tasks of type 'Replication' whose
        task description mentions the inbox url; fewer than two of them
        (GQ -> LQ-Inbox and LQ-Inbox -> GQ), or a server status that cannot
        be interpreted, counts as an error.

        It returns True if there is no error, and False otherwise.
        """

        status = self.server.status()
        replicationError = False
        replicationCount = 0
        expectedReplicationCount = 2 # GQ -> LQ-Inbox & LQ-Inbox -> GQ
        # Remove the protocol from the sanitized url
        inboxUrl = sanitizeURL('%s/%s' % (self.server.url, self.inbox.name))['url'].split('/', 2)[2]
        try:
            for activeTasks in status['active_tasks']:
                if activeTasks['type'] == 'Replication':
                    if inboxUrl in activeTasks['task']:
                        replicationCount += 1
            if replicationCount < expectedReplicationCount:
                replicationError = True
        except Exception as ex:
            # A malformed status document is treated as a replication error.
            # Exception (instead of a bare except) lets KeyboardInterrupt
            # and SystemExit propagate, and the cause is no longer hidden.
            self.logger.error("Could not read the replication tasks from the couch status: %s" % str(ex))
            replicationError = True

        if replicationError:
            # Stop workqueue related replication
            self.logger.error("Stopping replication as it was in error state. It will be restarted.")
            self.pullFromParent(continuous = True, cancel = True)
            self.sendToParent(continuous = True, cancel = True)

        return not replicationError
Ejemplo n.º 9
0
    def setup(self, parameters):
        """
        Called at startup

        Creates the local and central WMStats connections and the
        connections to the jobs, fwjrs and summary stats databases.

        :param parameters: not used by this method
        """
        taskArchiver = self.config.TaskArchiver
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            taskArchiver, 'useReqMgrForCompletionCheck', True)
        self.wmstatsCouchDB = WMStatsWriter(taskArchiver.localWMStatsURL)
        self.centralCouchDBReader = WMStatsReader(taskArchiver.centralWMStatsURL)

        if self.useReqMgrForCompletionCheck:
            self.deletableStates = ["announced"]
            self.centralCouchDBWriter = WMStatsWriter(taskArchiver.centralWMStatsURL)
            endpoint = {'endpoint': taskArchiver.ReqMgrServiceURL}
            self.reqmgrSvc = RequestManager(endpoint)
        else:
            # Tier0 case: the local WMStats writer is used as the central one
            self.deletableStates = ["completed"]
            self.centralCouchDBWriter = self.wmstatsCouchDB

        jobStateMachine = self.config.JobStateMachine
        jobDBurl = sanitizeURL(jobStateMachine.couchurl)['url']
        jobDBName = jobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        connect = self.jobCouchdb.connectDatabase
        self.jobsdatabase = connect("%s/jobs" % jobDBName)
        self.fwjrdatabase = connect("%s/fwjrs" % jobDBName)

        self.statsumdatabase = connect(jobStateMachine.summaryStatsDBName)
Ejemplo n.º 10
0
def _getCouchACDCHtmlBase(acdcCouchURL):
    """
    Return the address of the ACDC collections page below the sanitized
    form of acdcCouchURL.

    TODO: currently this is hard coded to the front page of ACDC.
    When more information is available, it can be added here.
    """
    sanitizedURL = sanitizeURL(acdcCouchURL)['url']
    return '%s/_design/ACDC/collections.html' % sanitizedURL
Ejemplo n.º 11
0
 def __init__(self, couchURL, reqdbURL = None, reqdbCouchApp = "ReqMgr"):
     """
     Set up the couch connection and, optionally, a request DB reader.

     :param couchURL: couch url; it is sanitized before being handed to
         _commonInit
     :param reqdbURL: url of the request database; when empty no reader is
         created and self.reqDB is None
     :param reqdbCouchApp: couchapp name passed to RequestDBReader
     """
     couchURL = sanitizeURL(couchURL)['url']
     # set the connection for local couchDB call
     self._commonInit(couchURL)
     if reqdbURL:
         # reqdbCouchApp used to be accepted but never forwarded, so the
         # caller's choice of couchapp was silently ignored
         self.reqDB = RequestDBReader(reqdbURL, couchapp = reqdbCouchApp)
     else:
         self.reqDB = None
Ejemplo n.º 12
0
 def __init__(self, couchURL, reqdbURL=None, reqdbCouchApp="ReqMgr"):
     """
     Initialise the reader for the given couch url.

     :param couchURL: couch url, sanitized before use by _commonInit
     :param reqdbURL: optional url of the request database; self.reqDB is
         None when it is not given
     :param reqdbCouchApp: name of the couchapp the request DB reader
         should use
     """
     couchURL = sanitizeURL(couchURL)['url']
     # set the connection for local couchDB call
     self._commonInit(couchURL)
     # Forward reqdbCouchApp: previously the argument was dropped and the
     # reader always fell back to its own default couchapp.
     self.reqDB = RequestDBReader(reqdbURL, couchapp=reqdbCouchApp) if reqdbURL else None
Ejemplo n.º 13
0
def _getCouchACDCHtmlBase(acdcCouchURL):
    """
    Build the url of the ACDC collections html page.

    TODO: the result is hard coded to the front page of ACDC; once more
    information is available it can be added to the url.
    """
    pageTemplate = '%s/_design/ACDC/collections.html'
    urlParts = sanitizeURL(acdcCouchURL)
    return pageTemplate % urlParts['url']
Ejemplo n.º 14
0
 def __init__(self, db_url, db_name = 'workqueue',
              inbox_name = 'workqueue_inbox', parentQueue = None,
              queueUrl = None, logger = None):
     """
     Open the queue and inbox databases on the couch server at db_url.

     :param db_url: url handed to CouchServer; also kept as hostWithAuth
     :param db_name: name of the queue database
     :param inbox_name: name of the inbox database
     :param parentQueue: parent queue url; stored as given and, when set,
         also in sanitized form
     :param queueUrl: url of this queue; db_url + '/' + db_name when empty
     :param logger: logger object; the logging module when omitted
     """
     if not logger:
         import logging
         logger = logging
     self.logger = logger

     self.hostWithAuth = db_url
     self.server = CouchServer(db_url)

     self.parentCouchUrlWithAuth = parentQueue
     if not parentQueue:
         self.parentCouchUrl = None
     else:
         self.parentCouchUrl = sanitizeURL(parentQueue)['url']

     # both databases must already exist (create = False)
     connect = self.server.connectDatabase
     self.db = connect(db_name, create = False, size = 10000)
     self.inbox = connect(inbox_name, create = False, size = 10000)

     fullQueueUrl = queueUrl or (db_url + '/' + db_name)
     self.queueUrl = sanitizeURL(fullQueueUrl)['url']
Ejemplo n.º 15
0
 def __init__(self, couchURL, dbName = None):
     """
     Connect to a couch database.

     :param couchURL: couch url, sanitized before use; when dbName is not
         given it is split into server url and database name with
         splitCouchServiceURL
     :param dbName: database name; when given, couchURL is used as the
         server url
     """
     cleanURL = sanitizeURL(couchURL)['url']
     # set the connection for local couchDB call
     if not dbName:
         cleanURL, dbName = splitCouchServiceURL(cleanURL)
     self.couchURL = cleanURL
     self.dbName = dbName
     self.couchServer = CouchServer(cleanURL)
     # NOTE(review): a second CouchServer is built for the database
     # connection instead of reusing self.couchServer
     self.couchDB = CouchServer(cleanURL).connectDatabase(dbName, False)
Ejemplo n.º 16
0
def main():
    """
    _main_

    Read the agent configuration named by WMAGENT_CONFIG (a default path
    is set when the variable is missing), connect to ReqMgr2 and the local
    workqueue, and record in workflowsToUpdate each available workflow
    whose workqueue priority is not the RequestPriority stored in ReqMgr2.
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ[
            'WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiating central reqmgr and local workqueue.
    # print() with one argument gives the same output on Python 2 and 3.
    print("ReqMgr2 URL  : %s" % sanitizeURL(
        config.JobUpdater.reqMgr2Url)['url'])
    print("WorkQueue URL: %s and dbname %s" % (sanitizeURL(
        config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname))

    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl,
                          config.WorkQueueManager.dbname)

    print("\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue")
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = list(workqueue.getAvailableWorkflows())
    print("Retrieved %d workflows from workqueue" % len(workflowsToCheck))

    for workflow, priority in workflowsToCheck:
        if workflow not in priorityCache:
            # a failed lookup is reported and the workflow skipped; one bad
            # request must not stop the remaining ones from being checked
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(
                    workflow)[workflow]['RequestPriority']
            except Exception as ex:
                print("Couldn't retrieve the priority of request %s" % workflow)
                print("Error: %s" % ex)
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
Ejemplo n.º 17
0
 def setup(self, parameters):
     """
     Called at startup

     Creates the local and central WMStats connections and the connections
     to the jobs and fwjrs databases.

     :param parameters: not used by this method
     """
     archiver = self.config.TaskArchiver
     # set the connection for local couchDB call
     self.useReqMgrForCompletionCheck = getattr(archiver, 'useReqMgrForCompletionCheck', True)
     self.wmstatsCouchDB = WMStatsWriter(archiver.localWMStatsURL)
     self.centralCouchDBWriter = WMStatsWriter(archiver.centralWMStatsURL)
     self.centralCouchDBReader = WMStatsReader(archiver.centralWMStatsURL)

     stateMachine = self.config.JobStateMachine
     cleanJobURL = sanitizeURL(stateMachine.couchurl)['url']
     dbPrefix = stateMachine.couchDBName
     self.jobCouchdb = CouchServer(cleanJobURL)
     self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % dbPrefix)
     self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % dbPrefix)
Ejemplo n.º 18
0
    def __init__(self, db_url, db_name='workqueue',
                 inbox_name=None, parentQueue=None,
                 queueUrl=None, logger=None):
        """
        Connect to the workqueue database and its inbox on one couch server.

        :param db_url: url handed to CouchServer; also kept as hostWithAuth
        :param db_name: name of the queue database
        :param inbox_name: name of the inbox database; "<db_name>_inbox"
            when None
        :param parentQueue: url of the parent queue, kept as given in
            parentCouchUrlWithAuth and sanitized in parentCouchUrl
        :param queueUrl: url of this queue; defaults to db_url + '/' + db_name
        :param logger: logger to use; the logging module is used when omitted
        """
        if logger:
            self.logger = logger
        else:
            import logging
            self.logger = logging

        # identity test for the None singleton (was: inbox_name == None)
        if inbox_name is None:
            inbox_name = "%s_inbox" % db_name

        self.server = CouchServer(db_url)
        self.parentCouchUrlWithAuth = parentQueue
        if parentQueue:
            self.parentCouchUrl = sanitizeURL(parentQueue)['url']
        else:
            self.parentCouchUrl = None
        # neither database is created here (create=False)
        self.db = self.server.connectDatabase(db_name, create=False, size=10000)
        self.hostWithAuth = db_url
        self.inbox = self.server.connectDatabase(inbox_name, create=False, size=10000)
        self.queueUrl = sanitizeURL(queueUrl or (db_url + '/' + db_name))['url']
        self.eleKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
Ejemplo n.º 19
0
    def archiveCouchSummary(self, workflow, spec):
        """
        _archiveCouchSummary_

        For each workflow pull its information from couch and turn it into
        a summary for archiving

        :param workflow: object with a '/'-separated ``task`` attribute;
            the element at index 1 of the split is used as workflow name
        :param spec: workload spec, queried through getCampaign() and
            listAllTaskPathNames()
        """

        failedJobs = []
        # NOTE(review): jobErrors, outputLFNs and output are not consumed in
        # the part of the method visible here; they are kept untouched.
        jobErrors  = []
        outputLFNs = []

        workflowData = {}
        workflowName     = workflow.task.split('/')[1]

        # Set campaign
        workflowData['campaign'] = spec.getCampaign()

        # Get a list of failed job IDs
        # Make sure you get it for ALL tasks in the spec
        for taskName in spec.listAllTaskPathNames():
            failedTmp = self.jobsdatabase.loadView("JobDump", "failedJobsByWorkflowName",
                                                   options = {"startkey": [workflowName, taskName],
                                                              "endkey": [workflowName, taskName]})['rows']
            failedJobs.extend(entry['value'] for entry in failedTmp)

        output = self.fwjrdatabase.loadView("FWJRDump", "outputByWorkflowName",
                                            options = {"group_level": 2,
                                                       "startkey": [workflowName],
                                                       "endkey": [workflowName, {}],
                                                       "group": True})['rows']

        # copy the performance information attribute by attribute
        perf = self.handleCouchPerformance(workflowName = workflowName)
        workflowData['performance'] = {}
        for key in perf:
            workflowData['performance'][key] = {}
            for attr in perf[key].keys():
                workflowData['performance'][key][attr] = perf[key][attr]


        # the document id is the workflow name computed above
        workflowData["_id"]          = workflowName
        try:
            workflowData["ACDCServer"]   = sanitizeURL(self.config.ACDC.couchurl)['url']
            workflowData["ACDCDatabase"] = self.config.ACDC.database
        except AttributeError as ex:
            # We're missing the ACDC info.
            # Keep going
            logging.error("ACDC info missing from config.  Skipping this step in the workflow summary.")
            logging.debug("Error: %s" % str(ex))
Ejemplo n.º 20
0
    def testResetWork(self):
        """
        Reset work in global to different child queue

        Work is queued in the global queue, pulled and started by the first
        local queue, reset in the global queue and then pulled by the second
        local queue. The test checks after each step how many elements the
        queues report for the relevant status.
        """
        # TODO: This test sometimes fails - I suspect a race condition (maybe a conflict in couch)
        # Cancel code needs reworking so this will hopefully be fixed then
        totalBlocks = 2
        self.globalQueue.queueWork(self.processingSpec.specUrl())
        self.globalQueue.updateLocationInfo()
        # first local queue pulls every block and starts running it
        self.assertEqual(self.localQueue.pullWork({'T2_XX_SiteA' : 1000}),
                         totalBlocks)
        syncQueues(self.localQueue)
        work = self.localQueue.getWork({'T2_XX_SiteA' : 1000, 'T2_XX_SiteB' : 1000})
        self.assertEqual(len(work), totalBlocks)
        self.assertEqual(len(self.localQueue.status(status = 'Running')), 2)
        syncQueues(self.localQueue)
        self.assertEqual(len(self.globalQueue.status(status = 'Running')), 2)

        # Re-assign work in global
        self.globalQueue.resetWork([x.id for x in self.globalQueue.status(status = 'Running')])

        # work should be canceled in local
        # TODO: Note the work in local will be orphaned but not canceled
        # The assertion below looks at the global queue: no 'Running' element
        # may still carry the (sanitized) url of the first local queue.
        syncQueues(self.localQueue)
        work_at_local = [x for x in self.globalQueue.status(status = 'Running') \
                         if x['ChildQueueUrl'] == sanitizeURL(self.localQueue.params['QueueURL'])['url']]
        self.assertEqual(len(work_at_local), 0)

        # now 2nd queue calls and acquires work
        self.assertEqual(self.localQueue2.pullWork({'T2_XX_SiteA' : 1000}),
                         totalBlocks)
        syncQueues(self.localQueue2)

        # check work in global assigned to local2
        self.assertEqual(len(self.localQueue2.status(status = 'Available')),
                         2) # work in local2
        # the global queue must list both elements as 'Acquired' by local2
        work_at_local2 = [x for x in self.globalQueue.status(status = 'Acquired') \
                         if x['ChildQueueUrl'] == sanitizeURL(self.localQueue2.params['QueueURL'])['url']]
        self.assertEqual(len(work_at_local2), 2)
Ejemplo n.º 21
0
 def setUpCouchDBReplication(self):
     """
     Register and start the continuous couch replications of this agent.

     The job summary database is always replicated to the central WMStats
     url.  When the configuration has a Tier0Feeder section the request
     database is replicated as well; otherwise the workqueue is replicated
     in both directions between the parent queue and the local inbox.
     Existing replicator documents are deleted before the new replications
     are started.
     """
     self.replicatorDocs = []
     register = self.replicatorDocs.append

     # set up common replication code
     summaryDB = self.config.JobStateMachine.jobSummaryDBName
     analytics = self.config.AnalyticsDataCollector
     register({'source': summaryDB,
               'target': analytics.centralWMStatsURL,
               'filter': "WMStatsAgent/repfilter"})

     #TODO: tier0 specific code - need to make it generic
     isTier0 = hasattr(self.config, "Tier0Feeder")
     if isTier0:
         register({'source': self.config.Tier0Feeder.requestDBName,
                   'target': analytics.centralRequestDBURL,
                   'filter': "T0Request/repfilter"})
     else:
         # set up workqueue replication, one document per direction
         queueFilter = 'WorkQueue/queueFilter'
         queueParams = self.config.WorkQueueManager.queueParams
         parentQueue = queueParams["ParentQueueCouchUrl"]
         childQueue = queueParams["QueueURL"]
         filterArgs = {'childUrl' : childQueue,
                       'parentUrl' : sanitizeURL(parentQueue)['url']}
         inboxURL = "%s_inbox" % analytics.localQueueURL
         # parent queue -> local inbox
         register({'source': sanitizeURL(parentQueue)['url'],
                   'target': inboxURL,
                   'filter': queueFilter,
                   'query_params': filterArgs})
         # local inbox -> parent queue
         register({'source': sanitizeURL(inboxURL)['url'],
                   'target': parentQueue,
                   'filter': queueFilter,
                   'query_params': filterArgs})

     # delete old replicator docs before setting up
     self.localCouchMonitor.deleteReplicatorDocs()

     for doc in self.replicatorDocs:
         # documents without query_params pass False, as before
         options = {'filter': doc['filter'],
                    'query_params': doc.get('query_params', False),
                    'continuous': True,
                    'useReplicator': True}
         self.localCouchMonitor.couchServer.replicate(doc['source'], doc['target'], **options)
     # The first cycle needs to be skipped since the document is not
     # updated that fast
     self.skipReplicationCheck = True
Ejemplo n.º 22
0
    def setup(self, parameters):
        """
        Called at startup

        Creates the WMStats writer, the request DB reader/writer, the
        ReqMgr service client and the connections to the jobs, fwjrs and
        summary stats databases.

        :param parameters: not used by this method
        """
        archiver = self.config.TaskArchiver
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            archiver, 'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(archiver, 'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(archiver.localWMStatsURL,
                                            "WMStatsAgent")

        #TODO: we might need to use local db for Tier0
        collector = self.config.AnalyticsDataCollector
        self.centralRequestDBReader = RequestDBReader(
            collector.centralRequestDBURL,
            couchapp=collector.RequestCouchApp)

        if not self.useReqMgrForCompletionCheck:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(
                collector.localT0RequestDBURL,
                couchapp=collector.RequestCouchApp)
        else:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(
                collector.centralRequestDBURL,
                couchapp=collector.RequestCouchApp)
            if not archiver.reqmgr2Only:
                #TODO: remove this for reqmgr2
                self.reqmgrSvc = RequestManager(
                    {'endpoint': archiver.ReqMgrServiceURL})
            else:
                self.reqmgr2Svc = ReqMgr(archiver.ReqMgr2ServiceURL)

        stateMachine = self.config.JobStateMachine
        cleanJobURL = sanitizeURL(stateMachine.couchurl)['url']
        dbPrefix = stateMachine.couchDBName
        self.jobCouchdb = CouchServer(cleanJobURL)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % dbPrefix)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % dbPrefix)

        self.statsumdatabase = self.jobCouchdb.connectDatabase(
            stateMachine.summaryStatsDBName)
Ejemplo n.º 23
0
 def _update_additional_request_args(self, workload, request_args):
     """
     Add to request_args the properties that are not initially set by the
     user. request_args is modified in place; this data is put into couchdb.
     Update request_args here if additional information needs to be put
     into couchdb.

     :param workload: provides name() and listOutputDatasets()
     :param request_args: request dictionary; must already contain
         "CouchURL" and "CouchWorkloadDBName"
     """
     spec_url = "%s/%s/%s/spec" % (request_args["CouchURL"],
                                   request_args["CouchWorkloadDBName"],
                                   workload.name())
     request_args['RequestWorkflow'] = sanitizeURL(spec_url)['url']

     # Add the output datasets if necessary
     # (original author's note: for some bizarre reason OutputDatasets is
     # a list of lists)
     request_args['OutputDatasets'] = workload.listOutputDatasets()

     #TODO: remove this after reqmgr2 replaces reqmgr (reqmgr2Only)
     request_args['ReqMgr2Only'] = True
Ejemplo n.º 24
0
    def __init__(self, url='http://localhost', idict=None):
        """
        Initialise the request options from idict and the given url.

        url should really be host - TODO fix that when have sufficient code
        coverage and change _getURLOpener if needed

        :param url: server url; a username/password embedded in it is
            registered through addBasicAuth and removed from the host
        :param idict: optional dictionary of options; its entries take
            precedence over the defaults set here
        """
        idict = idict or {}
        dict.__init__(self, idict)
        self.pycurl = idict.get('pycurl')
        self.capath = idict.get('capath')
        if self.pycurl:
            self.reqmgr = RequestHandler()

        # set up defaults
        for option, fallback in (("accept_type", 'text/html'),
                                 ("content_type", 'application/x-www-form-urlencoded')):
            self.setdefault(option, fallback)
        self.additionalHeaders = {}

        # check for basic auth early, as if found this changes the url
        parsedUrl = sanitizeURL(url)
        user = parsedUrl['username']
        if user is not None:
            self.addBasicAuth(user, parsedUrl['password'])
            url = parsedUrl['url']  # remove user, password from url

        self.setdefault("host", url)

        # then update with the incoming dict
        self.update(idict)

        self['endpoint_components'] = urlparse.urlparse(self['host'])

        # an explicit cachepath of None disables caching
        cachingDisabled = 'cachepath' in idict and idict['cachepath'] is None
        if cachingDisabled:
            self["req_cache_path"] = None
        else:
            cache_dir = self.cachePath(idict.get('cachepath'),
                                       idict.get('service_name'))
            self["cachepath"] = cache_dir
            self["req_cache_path"] = os.path.join(cache_dir, '.cache')

        for option, fallback in (("cert", None),
                                 ("key", None),
                                 ('capath', None),
                                 ("timeout", 300),
                                 ("logger", logging)):
            self.setdefault(option, fallback)

        check_server_url(self['host'])
Ejemplo n.º 25
0
    def __init__(self, url = 'http://localhost', idict=None):
        """
        Initialise the request options from idict and the given url and
        create the URL opener.

        url should really be host - TODO fix that when have sufficient code
        coverage and change _getURLOpener if needed

        :param url: server url; a username/password embedded in it is
            registered through addBasicAuth and removed from the host
        :param idict: optional dictionary of options; its entries take
            precedence over the defaults set here
        """
        idict = idict or {}
        dict.__init__(self, idict)
        self.pycurl = idict.get('pycurl')
        self.capath = idict.get('capath')
        if self.pycurl:
            self.reqmgr = RequestHandler()

        # set up defaults
        self.setdefault("accept_type", 'text/html')
        self.setdefault("content_type", 'application/x-www-form-urlencoded')
        self.additionalHeaders = {}

        # check for basic auth early, as if found this changes the url
        urlParts = sanitizeURL(url)
        hasCredentials = urlParts['username'] is not None
        if hasCredentials:
            self.addBasicAuth(urlParts['username'], urlParts['password'])
            # remove user, password from url
            url = urlParts['url']

        self.setdefault("host", url)

        # then update with the incoming dict
        self.update(idict)

        self['endpoint_components'] = urlparse.urlparse(self['host'])

        # caching stays enabled unless cachepath is explicitly set to None
        if 'cachepath' not in idict or idict['cachepath'] is not None:
            cache_dir = self.cachePath(idict.get('cachepath'),
                                       idict.get('service_name'))
            self["cachepath"] = cache_dir
            self["req_cache_path"] = os.path.join(cache_dir, '.cache')
        else:
            self["req_cache_path"] = None
        self.setdefault("timeout", 300)
        self.setdefault("logger", logging)

        check_server_url(self['host'])
        # and then get the URL opener (built even when "conn" is already set)
        opener = self._getURLOpener()
        self.setdefault("conn", opener)
Ejemplo n.º 26
0
    def _update_additional_request_args(self, workload, request_args):
        """
        Add to request_args the properties that are not initially set by
        the user. request_args is modified in place and nothing is returned.

        This data will be put into couchdb, so update request_args here if
        additional information needs to be stored there.
        """
        specLocation = "%s/%s/%s/spec" % (request_args["CouchURL"],
                                          request_args["CouchWorkloadDBName"],
                                          workload.name())
        request_args['RequestWorkflow'] = sanitizeURL(specLocation)['url']

        # Add the output datasets if necessary
        # (for some bizarre reason OutputDatasets is a list of lists)
        outputDatasets = workload.listOutputDatasets()
        request_args['OutputDatasets'] = outputDatasets

        # TODO: remove this after reqmgr2 replaces reqmgr (reqmgr2Only)
        request_args['ReqMgr2Only'] = True
Ejemplo n.º 27
0
 def _commonInit(self, couchURL, couchapp):
     """
     Set up the common variables for inherited classes.
     Inherited classes should call this in their init function.

     couchURL is either a Database instance, which is reused as-is, or a
     URL that is sanitized and split into server URL and database name
     before connecting.
     """
     self.couchapp = couchapp
     self.defaultStale = {"stale": "update_after"}

     if isinstance(couchURL, Database):
         # Caller handed over a Database object: take host and name from it
         database = couchURL
         self.couchDB = database
         self.couchURL = database['host']
         self.dbName = database.name
         self.couchServer = CouchServer(self.couchURL)
         return

     # Otherwise couchURL is a URL: sanitize, split, then connect
     cleanURL = sanitizeURL(couchURL)['url']
     self.couchURL, self.dbName = splitCouchServiceURL(cleanURL)
     self.couchServer = CouchServer(self.couchURL)
     self.couchDB = self.couchServer.connectDatabase(self.dbName, False)
Ejemplo n.º 28
0
    def _update_additional_request_args(self, workload, request_args):
        """
        Add to request_args the properties that are not initially set by
        the user. request_args is modified in place and nothing is returned.

        This data will be put into couchdb, so update request_args here if
        additional information needs to be stored there.
        """
        couchURL = request_args["CouchURL"]
        workloadDBName = request_args["CouchWorkloadDBName"]
        specLocation = "%s/%s/%s/spec" % (couchURL, workloadDBName, workload.name())
        request_args['RequestWorkflow'] = sanitizeURL(specLocation)['url']

        # Add the output datasets if necessary
        # (for some bizarre reason OutputDatasets is a list of lists)
        request_args['OutputDatasets'] = workload.listOutputDatasets()

        # The initial priority is recorded only when the request is created
        requestPriority = request_args["RequestPriority"]
        request_args['InitialPriority'] = requestPriority
Ejemplo n.º 29
0
    def testProductionMultiQueue(self):
        """Test production with multiple queues"""
        unitCount = 1
        slotsPerUnit = [10 for _ in range(unitCount)]  # array of jobs per block
        totalSlots = sum(slotsPerUnit)

        self.globalQueue.queueWork(self.spec.specUrl())
        self.assertEqual(unitCount, len(self.globalQueue))

        # localQueue2 pulls first, so localQueue must not get any work
        pulledBySecond = self.localQueue2.pullWork({'T2_XX_SiteA': totalSlots})
        self.assertEqual(unitCount, pulledBySecond)
        pulledByFirst = self.localQueue.pullWork({'T2_XX_SiteA': totalSlots})
        self.assertEqual(0, pulledByFirst)

        for queue in (self.localQueue, self.localQueue2):
            syncQueues(queue)

        availableInSecond = self.localQueue2.status(status='Available')
        self.assertEqual(unitCount, len(availableInSecond))
        availableInFirst = self.localQueue.status(status='Available')
        self.assertEqual(0, len(availableInFirst))
        acquiredGlobally = self.globalQueue.status(status='Acquired')
        self.assertEqual(unitCount, len(acquiredGlobally))

        # The global element must name localQueue2 as its child queue
        expectedChildUrl = sanitizeURL(self.localQueue2.params['QueueURL'])['url']
        firstGlobalElement = self.globalQueue.status()[0]
        self.assertEqual(expectedChildUrl, firstGlobalElement['ChildQueueUrl'])
Ejemplo n.º 30
0
    def post(self, workload_pair_list, multi_update_flag=False):
        """
        Create new requests in couchDB, or update existing ones.

        The request arguments are passed in from validation
        (validation converts cherrypy.request.body data to arguments).

        :param workload_pair_list: iterable of (workload, request_args) pairs.
        :param multi_update_flag: when true the whole list is handed to
            self.put and its result is returned unchanged.
        :return: otherwise a list with one {'request': <workload name>} dict
            per stored workload.

        TODO:
        this method will have some parts factored out so that e.g. a clone
        call can share functionality.

        NOTES:
        1) do not strip spaces, #4705 will fail upon injection with spaces;
            currently the chain relies on a number of things coming in #4705

        2) reqInputArgs = Utilities.unidecode(JsonWrapper.loads(body))
            (from ReqMgrRESTModel.putRequest)
        """
        if multi_update_flag:
            return self.put(workload_pair_list)

        # store each request document in Couch
        created = []
        for workload, request_args in workload_pair_list:
            cherrypy.log("INFO: Create request, input args: %s ..." % request_args)
            couchURL = request_args["CouchURL"]
            workloadDBName = request_args["CouchWorkloadDBName"]
            specLocation = "%s/%s/%s/spec" % (couchURL, workloadDBName, workload.name())
            request_args['RequestWorkflow'] = sanitizeURL(specLocation)['url']
            workload.saveCouch(couchURL, workloadDBName, metadata=request_args)
            created.append({'request': workload.name()})
        return created
Ejemplo n.º 31
0
 def setSpecUrl(self, url):
     """Store the 'url' entry returned by sanitizeURL(url) as the persisted spec URL."""
     urlParts = sanitizeURL(url)
     self.data.persistency.specUrl = urlParts["url"]
Ejemplo n.º 32
0
    def testA_BasicFunctionTest(self):
        """
        _BasicFunctionTest_

        Tests the components, by seeing if they can process a simple set of closeouts.

        Two job groups are created for the same workload (the ReReco task and
        its LogCollect task) and TaskArchiverPoller is run once. The test then
        checks that the WMBS tables and the job cache directory were cleaned
        out, that the couch job/fwjr views are empty, and that the workload
        summary document holds the expected values.
        """

        myThread = threading.currentThread()

        config = self.getConfig()
        workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
        workload     = self.createWorkload(workloadName = workloadPath)
        testJobGroup = self.createTestJobGroup(config = config,
                                               name = workload.name(),
                                               specLocation = workloadPath,
                                               error = False)

        # Create second workload
        testJobGroup2 = self.createTestJobGroup(config = config,
                                                name = workload.name(),
                                                filesetName = "TestFileset_2",
                                                specLocation = workloadPath,
                                                task = "/TestWorkload/ReReco/LogCollect")

        cachePath = os.path.join(config.JobCreator.jobCacheDir,
                                 "TestWorkload", "ReReco")
        os.makedirs(cachePath)
        self.assertTrue(os.path.exists(cachePath))

        cachePath2 = os.path.join(config.JobCreator.jobCacheDir,
                                 "TestWorkload", "LogCollect")
        os.makedirs(cachePath2)
        self.assertTrue(os.path.exists(cachePath2))


        result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
        self.assertEqual(len(result), 2)

        workflowName = "TestWorkload"
        dbname       = config.TaskArchiver.workloadSummaryCouchDBName
        couchdb      = CouchServer(config.JobStateMachine.couchurl)
        workdatabase = couchdb.connectDatabase(dbname)
        jobdb        = couchdb.connectDatabase("%s/jobs" % self.databaseName)
        fwjrdb       = couchdb.connectDatabase("%s/fwjrs" % self.databaseName)
        jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                              options = {"startkey": [workflowName],
                                         "endkey": [workflowName, {}]})['rows']
        self.assertEqual(len(jobs), 2*self.nJobs)

        # NOTE(review): `tables` is never read afterwards; kept in case
        # constructing CreateWMBSBase matters for the test setup - confirm and remove.
        from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
        create = CreateWMBSBase()
        tables = []
        for x in create.requiredTables:
            tables.append(x[2:])

        testTaskArchiver = TaskArchiverPoller(config = config)
        testTaskArchiver.algorithm()

        # Every WMBS table touched by the workflow must now be empty
        result = myThread.dbi.processData("SELECT * FROM wmbs_job")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_jobgroup")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
        self.assertEqual(len(result), 0)

        # Make sure we deleted the directory
        self.assertFalse(os.path.exists(cachePath))
        self.assertFalse(os.path.exists(os.path.join(self.testDir, 'workloadTest/TestWorkload')))

        testWMBSFileset = Fileset(id = 1)
        self.assertEqual(testWMBSFileset.exists(), False)



        workloadSummary = workdatabase.document(id = "TestWorkload")
        # Check ACDC
        self.assertEqual(workloadSummary['ACDCServer'], sanitizeURL(config.ACDC.couchurl)['url'])

        # Check the output
        # Compare sorted keys: dict key order is not something the test should
        # depend on, and on Python 3 keys() is a view that never equals a list.
        self.assertEqual(sorted(workloadSummary['output'].keys()),
                         ['/Electron/MorePenguins-v0/ALCARECO',
                          '/Electron/MorePenguins-v0/RECO'])

        # Check performance
        # Check histograms
        # (assertAlmostEqual replaces the deprecated assertAlmostEquals alias)
        self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['average'],
                               0.062651899999999996, places = 2)
        self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['nEvents'],
                         5)

        # Check standard performance
        self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['average'], 9.4950600000000005,
                               places = 2)
        self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['stdDev'], 8.2912400000000002,
                               places = 2)

        # Check worstOffenders
        self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['worstOffenders'],
                         [{'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                          {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 2},
                          {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 3}])

        # Check retryData
        self.assertEqual(workloadSummary['retryData']['/TestWorkload/ReReco'], {'0': 10})

        # LogCollect task is made out of identical FWJRs
        # assert that it is identical
        for x in workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'].keys():
            if x in config.TaskArchiver.histogramKeys:
                continue
            for y in ['average', 'stdDev']:
                self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'][x][y],
                                       workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1'][x][y],
                                       places = 2)

        # The TestWorkload should have no jobs left
        jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                              options = {"startkey": [workflowName],
                                         "endkey": [workflowName, {}]})['rows']
        self.assertEqual(len(jobs), 0)
        jobs = fwjrdb.loadView("FWJRDump", "fwjrsByWorkflowName",
                               options = {"startkey": [workflowName],
                                          "endkey": [workflowName, {}]})['rows']
        self.assertEqual(len(jobs), 0)
        return
Ejemplo n.º 33
0
# Query destination DB for list of workflows
# URL templates for the per-workflow links. "%%" is a literal percent sign,
# so e.g. "%%2F" ends up as the URL-encoded "/" in the generated link.
summaryBase = "%s/%s%%2Ffwjrs/_design/FWJRDump/_show/workflowSummary/%s"  # dest host, dest db base, workflow name
successBase = "%s/%s%%2Fjobs/_design/JobDump/_list/successJobs/statusByWorkflowName?startkey=%%5B%%22%s%%22%%5D&endkey=%%5B%%22%s%%22%%2C%%7B%%7D%%5D&reduce=false"  # dest host, dest db base, workflow, workflow
failedBase = "%s/%s%%2Fjobs/_design/JobDump/_list/failedJobs/statusByWorkflowName?startkey=%%5B%%22%s%%22%%5D&endkey=%%5B%%22%s%%22%%2C%%7B%%7D%%5D&reduce=false"  # dest host, dest db base, workflow, workflow

srcJobsDb = srcCouchServer.connectDatabase(srcDbBase + "/jobs")
statusResult = srcJobsDb.loadView("JobDump",
                                  "statusByWorkflowName",
                                  options={"group_level": 1})

# NOTE(review): nothing in this section appends to workflowNames - confirm
# whether later code relies on it before removing.
workflowNames = []

# The with-block closes archived.html even if a row cannot be processed.
with open("archived.html", "w") as fileHandle:
    fileHandle.write("<html><head><title>Archived Workflows</title></head>\n")
    fileHandle.write("<body>\n")

    # One line per workflow: its name followed by summary/success/failure links
    for statusRow in statusResult["rows"]:
        wfName = statusRow["key"][0]
        summaryUrl = summaryBase % (destCouchHost, destDbBase, wfName)
        successUrl = successBase % (destCouchHost, destDbBase, wfName, wfName)
        # The failure link must use the failedJobs template; building it from
        # successBase made "(failure)" point at the success list.
        failedUrl = failedBase % (destCouchHost, destDbBase, wfName, wfName)
        fileHandle.write("%s " % wfName)
        fileHandle.write("<a href=%s>(summary)</a>" %
                         sanitizeURL(summaryUrl)["url"])
        fileHandle.write(" <a href=%s>(success)</a>" %
                         sanitizeURL(successUrl)["url"])
        fileHandle.write(" <a href=%s>(failure)</a><br>\n" %
                         sanitizeURL(failedUrl)["url"])

    fileHandle.write("</body></html>\n")
Ejemplo n.º 34
0
    def testA_BasicFunctionTest(self):
        """
        _BasicFunctionTest_

        Tests the components, by seeing if they can process a simple set of closeouts.

        Two job groups are created for the same workload (the ReReco task and
        its LogCollect task), the workflow is marked complete, and then
        TaskArchiverPoller and CleanCouchPoller are each run once. The test
        checks that the WMBS tables and the job cache directory were cleaned
        out and that the workload summary document holds the expected values.
        """

        myThread = threading.currentThread()

        config = self.getConfig()
        workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
        workload     = self.createWorkload(workloadName = workloadPath)
        testJobGroup = self.createTestJobGroup(config = config,
                                               name = workload.name(),
                                               specLocation = workloadPath,
                                               error = False)

        # Create second workload
        testJobGroup2 = self.createTestJobGroup(config = config,
                                                name = workload.name(),
                                                filesetName = "TestFileset_2",
                                                specLocation = workloadPath,
                                                task = "/TestWorkload/ReReco/LogCollect",
                                                type = "LogCollect")

        cachePath = os.path.join(config.JobCreator.jobCacheDir,
                                 "TestWorkload", "ReReco")
        os.makedirs(cachePath)
        self.assertTrue(os.path.exists(cachePath))

        cachePath2 = os.path.join(config.JobCreator.jobCacheDir,
                                 "TestWorkload", "LogCollect")
        os.makedirs(cachePath2)
        self.assertTrue(os.path.exists(cachePath2))

        result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
        self.assertEqual(len(result), 2)

        workflowName = "TestWorkload"
        dbname       = config.TaskArchiver.workloadSummaryCouchDBName
        couchdb      = CouchServer(config.JobStateMachine.couchurl)
        workdatabase = couchdb.connectDatabase(dbname)
        jobdb        = couchdb.connectDatabase("%s/jobs" % self.databaseName)
        fwjrdb       = couchdb.connectDatabase("%s/fwjrs" % self.databaseName)
        jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                              options = {"startkey": [workflowName],
                                         "endkey": [workflowName, {}]})['rows']
        # NOTE(review): the result of this view query is discarded; presumably
        # it is only issued to exercise the view - confirm.
        fwjrdb.loadView("FWJRDump", "fwjrsByWorkflowName",
                        options = {"startkey": [workflowName],
                                   "endkey": [workflowName, {}]})['rows']

        self.assertEqual(len(jobs), 2*self.nJobs)

        # NOTE(review): `tables` is never read afterwards; kept in case
        # constructing CreateWMBSBase matters for the test setup - confirm and remove.
        from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
        create = CreateWMBSBase()
        tables = []
        for x in create.requiredTables:
            tables.append(x[2:])

        self.populateWorkflowWithCompleteStatus()
        testTaskArchiver = TaskArchiverPoller(config = config)
        testTaskArchiver.algorithm()

        cleanCouch = CleanCouchPoller(config = config)
        cleanCouch.setup()
        cleanCouch.algorithm()

        # Every WMBS table touched by the workflow must now be empty
        result = myThread.dbi.processData("SELECT * FROM wmbs_job")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_jobgroup")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
        self.assertEqual(len(result), 0)
        result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
        self.assertEqual(len(result), 0)

        # Make sure we deleted the directory
        self.assertFalse(os.path.exists(cachePath))
        self.assertFalse(os.path.exists(os.path.join(self.testDir, 'workloadTest/TestWorkload')))

        testWMBSFileset = Fileset(id = 1)
        self.assertEqual(testWMBSFileset.exists(), False)



        workloadSummary = workdatabase.document(id = "TestWorkload")
        # Check ACDC
        self.assertEqual(workloadSummary['ACDCServer'], sanitizeURL(config.ACDC.couchurl)['url'])

        # Check the output
        # keys() is wrapped in sorted() so a list is compared with a list; on
        # Python 3 keys() is a view that never equals a list.
        self.assertEqual(sorted(workloadSummary['output'].keys()), ['/Electron/MorePenguins-v0/RECO'])
        self.assertEqual(sorted(workloadSummary['output']['/Electron/MorePenguins-v0/RECO']['tasks']),
                        ['/TestWorkload/ReReco', '/TestWorkload/ReReco/LogCollect'])
        # Check performance
        # Check histograms
        # (assertAlmostEqual replaces the deprecated assertAlmostEquals alias)
        self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['average'],
                               0.89405199999999996, places = 2)
        self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['histogram'][0]['nEvents'],
                         10)

        # Check standard performance
        self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['average'], 17.786300000000001,
                               places = 2)
        self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['TotalJobCPU']['stdDev'], 0.0,
                               places = 2)

        # Check worstOffenders
        self.assertEqual(workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']['AvgEventTime']['worstOffenders'],
                         [{'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                          {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
                          {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 2}])

        # Check retryData
        self.assertEqual(workloadSummary['retryData']['/TestWorkload/ReReco'], {'1': 10})
        logCollectPFN = 'srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms/store/logs/prod/2012/11/WMAgent/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8-AlcaSkimLogCollect-1-logs.tar'
        self.assertEqual(workloadSummary['logArchives'], {'/TestWorkload/ReReco/LogCollect' : [logCollectPFN for _ in range(10)]})

        # LogCollect task is made out of identical FWJRs
        # assert that it is identical
        for x in workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'].keys():
            if x in config.TaskArchiver.histogramKeys:
                continue
            for y in ['average', 'stdDev']:
                self.assertAlmostEqual(workloadSummary['performance']['/TestWorkload/ReReco/LogCollect']['cmsRun1'][x][y],
                                       workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1'][x][y],
                                       places = 2)

        return
Ejemplo n.º 35
0
 def _sanitizeURL(self, couchURL):
     """Return the 'url' entry of the result of sanitizeURL(couchURL)."""
     urlParts = sanitizeURL(couchURL)
     return urlParts['url']
Ejemplo n.º 36
0
    def recordInCouch(self, jobs, newstate, oldstate, updatesummary = False):
        """
        _recordInCouch_

        Record relevant job information in couch. If the job does not yet exist
        in couch it will be saved as a seperate document.  If the job has a FWJR
        attached that will be saved as a seperate document.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        timestamp = int(time.time())
        couchRecordsToUpdate = []

        for job in jobs:
            couchDocID = job.get("couch_record", None)

            if newstate == "new":
                oldstate = "none"

            if job.get("site_cms_name", None):
                if newstate == "executing":
                    jobLocation = job["site_cms_name"]
                else:
                    jobLocation = "Agent"
            else:
                jobLocation = "Agent"

            if couchDocID == None:
                jobDocument = {}
                jobDocument["_id"] = str(job["id"])
                job["couch_record"] = jobDocument["_id"]
                jobDocument["jobid"] = job["id"]
                jobDocument["workflow"] = job["workflow"]
                jobDocument["task"] = job["task"]
                jobDocument["owner"] = job["owner"]

                jobDocument["inputfiles"] = []
                for inputFile in job["input_files"]:
                    docInputFile = inputFile.json()

                    docInputFile["parents"] = []
                    for parent in inputFile["parents"]:
                        docInputFile["parents"].append({"lfn": parent["lfn"]})

                    jobDocument["inputfiles"].append(docInputFile)

                jobDocument["states"] = {"0": {"oldstate": oldstate,
                                               "newstate": newstate,
                                               "location": jobLocation,
                                               "timestamp": timestamp}}

                jobDocument["jobgroup"] = job["jobgroup"]
                jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                       "LastEvent": job["mask"]["LastEvent"],
                                       "FirstLumi": job["mask"]["FirstLumi"],
                                       "LastLumi": job["mask"]["LastLumi"],
                                       "FirstRun": job["mask"]["FirstRun"],
                                       "LastRun": job["mask"]["LastRun"]}

                if job['mask']['runAndLumis'] != {}:
                    # Then we have to save the mask runAndLumis
                    jobDocument['mask']['runAndLumis'] = {}
                    for key in job['mask']['runAndLumis'].keys():
                        jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

                jobDocument["name"] = job["name"]
                jobDocument["type"] = "job"
                jobDocument["user"] = job.get("user", None)
                jobDocument["group"] = job.get("group", None)
                jobDocument["taskType"] = job.get("taskType", "Unknown")
                jobDocument["jobType"] = job.get("jobType", "Unknown")

                couchRecordsToUpdate.append({"jobid": job["id"],
                                             "couchid": jobDocument["_id"]})
                self.jobsdatabase.queue(jobDocument, callback = discardConflictingDocument)
            else:
                # We send a PUT request to the stateTransition update handler.
                # Couch expects the parameters to be passed as arguments to in
                # the URI while the Requests class will only encode arguments
                # this way for GET requests.  Changing the Requests class to
                # encode PUT arguments as couch expects broke a bunch of code so
                # we'll just do our own encoding here.
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    newstate,
                                                                                    jobLocation,
                                                                                    timestamp)
                self.jobsdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)

            # updating the status of the summary doc only when it is explicitely requested
            # doc is already in couch
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStats/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
                logging.debug("Updated job summary status for job %s" % jobSummaryId)
                
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStats/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
                logging.debug("Updated job summary state history for job %s" % jobSummaryId)

            if job.get("fwjr", None):

                # If there are too many input files, strip them out
                # of the FWJR, as they should already
                # be in the database
                # This is not critical
                try:
                    if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                        job['fwjr'].stripInputFiles()
                except:
                    logging.error("Error while trying to strip input files from FWJR.  Ignoring.")
                    pass

                # complete fwjr document
                job["fwjr"].setTaskName(job["task"])
                fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                                "jobid": job["id"],
                                "retrycount": job["retry_count"],
                                "fwjr": job["fwjr"].__to_json__(None),
                                "type": "fwjr"}
                self.fwjrdatabase.queue(fwjrDocument, timestamp = True, callback = discardConflictingDocument)
                updateSummaryDB(self.statsumdatabase, job)

                #TODO: can add config switch to swich on and off
                # if self.config.JobSateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
                if (job["retry_count"] > 0) or (newstate != 'success'):
                    jobSummaryId = job["name"]
                    # building a summary of fwjr
                    logging.debug("Pushing job summary for job %s" % jobSummaryId)
                    errmsgs = {}
                    inputs = []
                    if "steps" in fwjrDocument["fwjr"]:
                        for step in fwjrDocument["fwjr"]["steps"]:
                            if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                                errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                            if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in fwjrDocument["fwjr"]["steps"][step]["input"]:
                                inputs.extend( [source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"] if "runs" in source] )
    
                    outputs = []
                    outputDataset = None
                    for singlestep in job["fwjr"].listSteps():
                        for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                            if singlefile:
                                outputs.append({'type': 'output' if CMSSTEP.match(singlestep) else singlefile.get('module_label', None),
                                                'lfn': singlefile.get('lfn', None),
                                                'location': list(singlefile.get('locations', set([]))) if len(singlefile.get('locations', set([]))) > 1
                                                                                                       else singlefile['locations'].pop(),
                                                'checksums': singlefile.get('checksums', {}),
                                                'size': singlefile.get('size', None) })
                                #it should have one output dataset for all the files
                                outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset
                    inputFiles = []
                    for inputFileStruct in job["fwjr"].getAllInputFiles():
                        # check if inputFileSummary needs to be extended
                        inputFileSummary = {}
                        inputFileSummary["lfn"] = inputFileStruct["lfn"]
                        inputFileSummary["input_type"] = inputFileStruct["input_type"]
                        inputFiles.append(inputFileSummary)
                    
                    # Don't record intermediate jobfailed status in the jobsummary
                    # change to jobcooloff which will be overwritten by error handler anyway
                    if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                        summarystate = 'jobcooloff'
                    else:
                        summarystate = newstate
                           
                    jobSummary = {"_id": jobSummaryId,
                                  "wmbsid": job["id"],
                                  "type": "jobsummary",
                                  "retrycount": job["retry_count"],
                                  "workflow": job["workflow"],
                                  "task": job["task"],
                                  "jobtype": job["jobType"],
                                  "state": summarystate,
                                  "site": job.get("location", None),
                                  "cms_location": job["fwjr"].getSiteName(),
                                  "exitcode": job["fwjr"].getExitCode(),
                                  "errors": errmsgs,
                                  "lumis": inputs,
                                  "outputdataset": outputDataset,
                                  "inputfiles": inputFiles,
                                  "acdc_url": "%s/%s" % (sanitizeURL(self.config.ACDC.couchurl)['url'], self.config.ACDC.database),
                                  "agent_name": self.config.Agent.hostName,
                                  "output": outputs }
                    if couchDocID is not None:
                        try:
                            currentJobDoc = self.jsumdatabase.document(id = jobSummaryId)
                            jobSummary['_rev'] = currentJobDoc['_rev']
                            jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                            # record final status transition
                            if newstate == 'success':
                                finalStateDict = {'oldstate': oldstate,
                                                  'newstate': newstate,
                                                  'location': job["location"],
                                                  'timestamp': timestamp}
                                jobSummary['state_history'].append(finalStateDict)
                                
                            noEmptyList = ["inputfiles", "lumis"]
                            for prop in noEmptyList:
                                jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                        except CouchNotFoundError:
                            pass
                    self.jsumdatabase.queue(jobSummary, timestamp = True)

        if len(couchRecordsToUpdate) > 0:
            self.setCouchDAO.execute(bulkList = couchRecordsToUpdate,
                                     conn = self.getDBConn(),
                                     transaction = self.existingTransaction())

        self.jobsdatabase.commit(callback = discardConflictingDocument)
        self.fwjrdatabase.commit(callback = discardConflictingDocument)
        self.jsumdatabase.commit()
        return
Ejemplo n.º 37
0
# Request a single row from each destination view; per the progress message
# below this is done to trigger view generation before the links are used.
destJobsDb.loadView("JobDump", "statusByWorkflowName", options = {"limit": 1})
print("  Triggering view generation for fwjrs database...")
destFwjrsDb.loadView("FWJRDump", "outputByWorkflowName", options = {"limit": 1})

print("")
# URL templates for the per-workflow links written to archived.html.
# The doubled %% are literal percent signs of URL-encoded characters.
summaryBase = "%s/%s%%2Ffwjrs/_design/FWJRDump/_show/workflowSummary/%s" # dest host, dest db base, workflow name
successBase = "%s/%s%%2Fjobs/_design/JobDump/_list/successJobs/statusByWorkflowName?startkey=%%5B%%22%s%%22%%5D&endkey=%%5B%%22%s%%22%%2C%%7B%%7D%%5D&reduce=false" # dest host, dest db base, workflow, workflow
failedBase = "%s/%s%%2Fjobs/_design/JobDump/_list/failedJobs/statusByWorkflowName?startkey=%%5B%%22%s%%22%%5D&endkey=%%5B%%22%s%%22%%2C%%7B%%7D%%5D&reduce=false" # dest host, dest db base, workflow, workflow

# Query the source DB for the list of workflows (one grouped row per name).
srcJobsDb = srcCouchServer.connectDatabase(srcDbBase + "/jobs")
statusResult = srcJobsDb.loadView("JobDump", "statusByWorkflowName", options = {"group_level": 1})

workflowNames = []

# The context manager closes the file even if a row cannot be processed.
with open("archived.html", "w") as fileHandle:
    fileHandle.write("<html><head><title>Archived Workflows</title></head>\n")
    fileHandle.write("<body>\n")

    for statusRow in statusResult["rows"]:
        wfName = statusRow["key"][0]
        summaryUrl = summaryBase % (destCouchHost, destDbBase, wfName)
        successUrl = successBase % (destCouchHost, destDbBase, wfName, wfName)
        # The failure link must use the failedJobs list template; it used
        # to be built from successBase and duplicated the success link.
        failedUrl = failedBase % (destCouchHost, destDbBase, wfName, wfName)
        fileHandle.write("%s " % wfName)
        fileHandle.write("<a href=%s>(summary)</a>" % sanitizeURL(summaryUrl)["url"])
        fileHandle.write(" <a href=%s>(success)</a>" % sanitizeURL(successUrl)["url"])
        fileHandle.write(" <a href=%s>(failure)</a><br>\n" % sanitizeURL(failedUrl)["url"])

    fileHandle.write("</body></html>\n")
Ejemplo n.º 38
0
 def __init__(self, couchURL, couchapp="ReqMgr"):
     """
     Set up the connection used for local CouchDB calls.

     couchURL is run through sanitizeURL and only the 'url' entry of the
     result is forwarded, together with couchapp, to _commonInit.
     """
     cleanURL = sanitizeURL(couchURL)['url']
     self._commonInit(cleanURL, couchapp)
Ejemplo n.º 39
0
 def __init__(self, couchURL, dbName = None):
     """
     Set up the connection used for local CouchDB calls.

     couchURL is run through sanitizeURL and only the 'url' entry of the
     result is forwarded, together with dbName, to _commonInit.
     """
     cleanURL = sanitizeURL(couchURL)['url']
     self._commonInit(cleanURL, dbName)
Ejemplo n.º 40
0
    def testA_BasicFunctionTest(self):
        """
        _BasicFunctionTest_

        Run one TaskArchiverPoller cycle over two test job groups, then
        check that the WMBS tables and the job cache directory are emptied
        and that the workload summary document holds the expected values.
        """
        thread = threading.currentThread()

        def fetchAll(sql):
            # Run a query through the thread's DB interface, return all rows
            return thread.dbi.processData(sql)[0].fetchall()

        config = self.getConfig()
        specPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
        workload = self.createWorkload(workloadName=specPath)

        # Two job groups for the same workload; the returned objects are
        # not inspected further in this test.
        firstGroup = self.createTestJobGroup(config=config,
                                             name=workload.name(),
                                             specLocation=specPath,
                                             error=False)
        secondGroup = self.createTestJobGroup(
            config=config,
            name=workload.name(),
            filesetName="TestFileset_2",
            specLocation=specPath,
            task="/TestWorkload/ReReco/LogCollect")

        # Create one job cache directory per task
        cacheDirs = []
        for taskDir in ("ReReco", "LogCollect"):
            taskCache = os.path.join(config.JobCreator.jobCacheDir,
                                     "TestWorkload", taskDir)
            os.makedirs(taskCache)
            self.assertTrue(os.path.exists(taskCache))
            cacheDirs.append(taskCache)
        rerecoCache = cacheDirs[0]

        self.assertEqual(len(fetchAll("SELECT * FROM wmbs_subscription")), 2)

        workflowName = "TestWorkload"
        summaryDbName = config.TaskArchiver.workloadSummaryCouchDBName
        couchServer = CouchServer(config.JobStateMachine.couchurl)
        summaryDb = couchServer.connectDatabase(summaryDbName)
        jobDb = couchServer.connectDatabase("%s/jobs" % self.databaseName)
        # The fwjrs handle is not used below; the connect call is kept so
        # the sequence of couch calls matches the established flow.
        fwjrDb = couchServer.connectDatabase("%s/fwjrs" % self.databaseName)

        viewOptions = {"startkey": [workflowName],
                       "endkey": [workflowName, {}]}
        jobRows = jobDb.loadView("JobDump", "jobsByWorkflowName",
                                 options=viewOptions)['rows']
        self.assertEqual(len(jobRows), 2 * self.nJobs)

        from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
        creator = CreateWMBSBase()
        # Table names with their first two characters dropped
        tables = [tableName[2:] for tableName in creator.requiredTables]

        poller = TaskArchiverPoller(config=config)
        poller.algorithm()

        # Every one of these WMBS tables must be empty after the poll cycle
        for sql in ("SELECT * FROM wmbs_job",
                    "SELECT * FROM wmbs_subscription",
                    "SELECT * FROM wmbs_jobgroup",
                    "SELECT * FROM wmbs_fileset",
                    "SELECT * FROM wmbs_file_details"):
            self.assertEqual(len(fetchAll(sql)), 0)

        # Make sure we deleted the directory
        self.assertFalse(os.path.exists(rerecoCache))
        sandboxDir = os.path.join(self.testDir, 'workloadTest/TestWorkload')
        self.assertFalse(os.path.exists(sandboxDir))

        firstFileset = Fileset(id=1)
        self.assertEqual(firstFileset.exists(), False)

        summary = summaryDb.document(id="TestWorkload")

        # ACDC server entry
        self.assertEqual(summary['ACDCServer'],
                         sanitizeURL(config.ACDC.couchurl)['url'])

        # Output datasets and the tasks that produced them
        self.assertEqual(summary['output'].keys(),
                         ['/Electron/MorePenguins-v0/RECO'])
        outputTasks = summary['output']['/Electron/MorePenguins-v0/RECO']['tasks']
        self.assertEqual(
            sorted(outputTasks),
            ['/TestWorkload/ReReco', '/TestWorkload/ReReco/LogCollect'])

        # Performance section of the ReReco task
        rerecoPerf = summary['performance']['/TestWorkload/ReReco']['cmsRun1']

        # Histograms
        firstBin = rerecoPerf['AvgEventTime']['histogram'][0]
        self.assertAlmostEquals(firstBin['average'],
                                0.89405199999999996,
                                places=2)
        self.assertEqual(firstBin['nEvents'], 10)

        # Standard performance
        self.assertAlmostEquals(rerecoPerf['TotalJobCPU']['average'],
                                17.786300000000001,
                                places=2)
        self.assertAlmostEquals(rerecoPerf['TotalJobCPU']['stdDev'],
                                0.0,
                                places=2)

        # worstOffenders: three entries, for job IDs 1, 1 and 2
        expectedOffenders = [{'logCollect': None,
                              'log': None,
                              'value': '0.894052',
                              'jobID': jobID} for jobID in (1, 1, 2)]
        self.assertEqual(rerecoPerf['AvgEventTime']['worstOffenders'],
                         expectedOffenders)

        # retryData
        self.assertEqual(summary['retryData']['/TestWorkload/ReReco'],
                         {'1': 10})
        logCollectPFN = 'srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms/store/logs/prod/2012/11/WMAgent/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8-AlcaSkimLogCollect-1-logs.tar'
        self.assertEqual(summary['logArchives'],
                         {'/TestWorkload/ReReco/LogCollect': [logCollectPFN] * 10})

        # LogCollect task is made out of identical FWJRs, so its
        # non-histogram statistics must match those of the ReReco task
        logCollectPerf = summary['performance'][
            '/TestWorkload/ReReco/LogCollect']['cmsRun1']
        for metric in logCollectPerf.keys():
            if metric in config.TaskArchiver.histogramKeys:
                continue
            for stat in ['average', 'stdDev']:
                self.assertAlmostEquals(logCollectPerf[metric][stat],
                                        rerecoPerf[metric][stat],
                                        places=2)

        return
Ejemplo n.º 41
0
    def archiveWorkflowSummary(self, spec):
        """
        _archiveWorkflowSummary_

        Gather the information couch holds about one workflow (failed jobs,
        retry counts, output, performance, ACDC location) into a summary
        dictionary keyed by the workflow name.

        Nothing is collected when a summary document with that name already
        exists in the workload summary database.
        """
        workflowName = spec.name()
        failedJobs = []
        workflowData = {'retryData': {}}

        # A summary may already be uploaded, e.g. when an earlier WMBS
        # deletion failed after the upload; in that case there is nothing
        # left to do here.
        #TODO: With multiple agents sharing workflows, we will need to differentiate and combine summaries for a request
        if self.workdatabase.documentExists(workflowName):
            logging.info(
                "Couch summary for %s already exists, proceeding only with cleanup"
                % workflowName)
            return

        # Campaign and input datasets come straight from the spec
        workflowData['campaign'] = spec.getCampaign()
        workflowData['inputdatasets'] = spec.listInputDatasets()

        # Histogram containers, grouped by aggregation level
        siteFailures = DiscreteSummaryHistogram('Failed jobs by site', 'Site')
        histograms = {
            'workflowLevel': {'failuresBySite': siteFailures},
            'taskLevel': {},
            'stepLevel': {}
        }

        # Collect failed job IDs for ALL tasks in the spec
        for taskPath in spec.listAllTaskPathNames():
            failedRows = self.jobsdatabase.loadView(
                "JobDump",
                "failedJobsByWorkflowName",
                options={
                    "startkey": [workflowName, taskPath],
                    "endkey": [workflowName, taskPath]
                })['rows']
            failedJobs.extend(failedRow['value'] for failedRow in failedRows)

        # Retry counts: row keys are indexed as [?, count, task] below
        retryRows = self.jobsdatabase.loadView(
            "JobDump",
            "retriesByTask",
            options={
                'group_level': 3,
                'startkey': [workflowName],
                'endkey': [workflowName, {}]
            })['rows']
        retriesByTask = workflowData['retryData']
        for retryRow in retryRows:
            taskPath = retryRow['key'][2]
            retryCount = str(retryRow['key'][1])
            taskRetries = retriesByTask.setdefault(taskPath, {})
            taskRetries[retryCount] = retryRow['value']

        output = self.fwjrdatabase.loadView(
            "FWJRDump",
            "outputByWorkflowName",
            options={
                "group_level": 2,
                "startkey": [workflowName],
                "endkey": [workflowName, {}],
                "group": True
            })['rows']
        outputListStr = self.fwjrdatabase.loadList(
            "FWJRDump",
            "workflowOutputTaskMapping",
            "outputByWorkflowName",
            options={
                "startkey": [workflowName],
                "endkey": [workflowName, {}],
                "reduce": False
            })
        outputList = json.loads(outputListStr)

        # Copy the performance data one level deep into the summary
        perf = self.handleCouchPerformance(workflowName=workflowName)
        perfCopy = {}
        for taskKey in perf:
            perfCopy[taskKey] = dict((attr, perf[taskKey][attr])
                                     for attr in perf[taskKey].keys())
        workflowData['performance'] = perfCopy

        workflowData["_id"] = workflowName
        try:
            acdcURL = sanitizeURL(self.config.ACDC.couchurl)['url']
            workflowData["ACDCServer"] = acdcURL
            workflowData["ACDCDatabase"] = self.config.ACDC.database
        except AttributeError as ex:
            # The ACDC section is missing from the config: log and carry on
            logging.error(
                "ACDC info missing from config.  Skipping this step in the workflow summary."
            )
            logging.error("Error: %s" % str(ex))
Ejemplo n.º 42
0
    def __init__(self, config):
        """
        Initialise class members

        Reads the TaskArchiver settings from config, optionally builds a
        work queue, creates the WMStats writer(s) and connects to the couch
        databases holding jobs, FWJRs and workload summaries.

        When the couch connection fails, jobsdatabase, fwjrdatabase and
        workdatabase are set to None. The error is re-raised as
        TaskArchiverPollerException only if TaskArchiver.requireCouch is set.
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.config = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        # NOTE: the explicit "!= False" is kept on purpose; any value other
        # than False (including None) enables the work queue.
        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
                self.workQueue = localQueue(
                    **self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.maxProcessSize = getattr(self.config.TaskArchiver,
                                      'maxProcessSize', 250)
        self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
        self.nOffenders = getattr(self.config.TaskArchiver, 'nOffenders', 3)
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.uploadPublishInfo = getattr(self.config.TaskArchiver,
                                         'uploadPublishInfo', False)
        self.uploadPublishDir = getattr(self.config.TaskArchiver,
                                        'uploadPublishDir', None)
        self.userFileCacheURL = getattr(self.config.TaskArchiver,
                                        'userFileCacheURL', None)

        # Set up optional histograms
        self.histogramKeys = getattr(self.config.TaskArchiver, "histogramKeys",
                                     [])
        self.histogramBins = getattr(self.config.TaskArchiver, "histogramBins",
                                     10)
        self.histogramLimit = getattr(self.config.TaskArchiver,
                                      "histogramLimit", 5.0)

        if not self.useReqMgrForCompletionCheck:
            #sets the local monitor summary couch db
            self.wmstatsCouchDB = WMStatsWriter(
                self.config.TaskArchiver.localWMStatsURL)
            self.centralCouchDBWriter = self.wmstatsCouchDB
        else:
            self.centralCouchDBWriter = WMStatsWriter(
                self.config.TaskArchiver.centralWMStatsURL)

        # Resolve requireCouch before touching couch so the attribute exists
        # even when the connection attempt below fails.
        self.requireCouch = getattr(self.config.TaskArchiver,
                                    'requireCouch', False)

        # Start a couch server for getting job info
        # from the FWJRs for committal to archive
        try:
            workDBName = getattr(self.config.TaskArchiver,
                                 'workloadSummaryCouchDBName',
                                 'workloadsummary')
            # No default here: a missing URL raises AttributeError, which
            # the handler below treats as a connection failure.
            workDBurl = getattr(self.config.TaskArchiver,
                                'workloadSummaryCouchURL')
            jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
            jobDBName = self.config.JobStateMachine.couchDBName
            self.jobCouchdb = CouchServer(jobDBurl)
            self.workCouchdb = CouchServer(workDBurl)

            self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                                jobDBName)
            self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                                jobDBName)
            self.workdatabase = self.workCouchdb.connectDatabase(workDBName)

            logging.debug("Using url %s/%s for job" % (jobDBurl, jobDBName))
            logging.debug("Writing to  %s/%s for workloadSummary" %
                          (sanitizeURL(workDBurl)['url'], workDBName))
        except Exception as ex:
            msg = "Error in connecting to couch.\n"
            msg += str(ex)
            logging.error(msg)
            # Leave every database handle defined so later code sees None
            # instead of a missing attribute.
            self.jobsdatabase = None
            self.fwjrdatabase = None
            self.workdatabase = None
            if self.requireCouch:
                raise TaskArchiverPollerException(msg)
Ejemplo n.º 43
0
    def availableWork(self, thresholds, siteJobCounts, teams=None, wfs=None):
        """
        Get work which is available to be run

        thresholds is taken to be a dictionary mapping a site name to the
        maximum number of running jobs at that site.

        siteJobCounts is taken to be a dictionary of dictionaries: site
        name, then task priority, with the number of jobs running at that
        priority as value. It is updated in place for every accepted element.

        teams and wfs, when given, are passed to the couch list function as
        restrictions; wfs is sent in slices of 20 workflows.

        Returns the tuple (elements, thresholds, siteJobCounts).
        """
        self.logger.info("Getting available work from %s/%s" %
                         (sanitizeURL(self.server.url)['url'], self.db.name))
        elements = []

        # Sites are no longer pre-filtered on idle job slots: higher
        # priority jobs may legitimately over-allocate a site.

        # Without any site there is nothing to match against.
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, thresholds, siteJobCounts

        options = {'include_docs': True,
                   'descending': True,
                   'resources': thresholds}
        if teams:
            options['teams'] = teams
            self.logger.info("setting teams %s" % teams)

        if wfs:
            result = []
            for offset in xrange(0, len(wfs), 20):
                options['wfs'] = wfs[offset:offset + 20]
                chunk = self.db.loadList('WorkQueue', 'workRestrictions',
                                         'availableByPriority', options)
                result.extend(json.loads(chunk))
            # sort the merged list
            wqeKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
            result.sort(key=lambda doc: doc[wqeKey]['Priority'])
        else:
            rawList = self.db.loadList('WorkQueue', 'workRestrictions',
                                       'availableByPriority', options)
            result = json.loads(rawList)
            if len(result) == 0:
                noWorkMsg = """No available work in WQ or didn't pass workqueue restriction 
                                    - check Pileup, site white list, etc"""
                self.logger.info(noWorkMsg)
            self.logger.debug("Available Work:\n %s \n for resources\n %s" %
                              (result, thresholds))

        # Walk the candidates and keep those for which some site passes the
        # element's site restriction and still has room at this priority.
        for doc in result:
            element = CouchWorkQueueElement.fromDocument(self.db, doc)
            prio = element['Priority']

            possibleSite = None
            sites = thresholds.keys()
            random.shuffle(sites)
            for site in sites:
                if not element.passesSiteRestriction(site):
                    continue
                # Jobs counted at this site at the same or a higher priority
                curJobCount = sum(
                    count if sitePrio >= prio else 0
                    for sitePrio, count in siteJobCounts.get(site, {}).items())
                self.logger.debug("Job Count: %s, site: %s threshods: %s" %
                                  (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break

            if not possibleSite:
                self.logger.info("No possible site for %s" % element)
                continue

            elements.append(element)
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            perSite = siteJobCounts[possibleSite]
            perSite[prio] = perSite.setdefault(prio, 0) + element['Jobs']

        # Two stable sorts: highest priority first, oldest first within a
        # priority.
        elements.sort(key=lambda wqe: wqe['CreationTime'])
        elements.sort(key=lambda wqe: wqe['Priority'], reverse=True)

        return elements, thresholds, siteJobCounts
Ejemplo n.º 44
0
 def __init__(self, couchURL, couchapp="ReqMgr"):
     """
     Set up the connection used for local CouchDB calls.

     couchURL is run through sanitizeURL and only the 'url' entry of the
     result is forwarded, together with couchapp, to _commonInit.
     """
     cleanURL = sanitizeURL(couchURL)['url']
     self._commonInit(cleanURL, couchapp)
Ejemplo n.º 45
0
    def __init__(self, config):
        """
        Initialise class members

        Reads the TaskArchiver settings from config, optionally builds a
        work queue and a local WMStats writer, and connects to the couch
        databases holding jobs, FWJRs and workload summaries.

        When the couch connection fails, jobsdatabase, fwjrdatabase and
        workdatabase are set to None. The error is re-raised as
        TaskArchiverPollerException only if TaskArchiver.requireCouch is set.
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)

        self.config      = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        # NOTE: the explicit "!= False" is kept on purpose; any value other
        # than False (including None) enables the work queue.
        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
                self.workQueue = localQueue(**self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.maxProcessSize    = getattr(self.config.TaskArchiver, 'maxProcessSize', 250)
        self.timeout           = getattr(self.config.TaskArchiver, "timeOut", None)
        self.nOffenders        = getattr(self.config.TaskArchiver, 'nOffenders', 3)
        self.useReqMgrForCompletionCheck   = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.uploadPublishInfo = getattr(self.config.TaskArchiver, 'uploadPublishInfo', False)
        self.uploadPublishDir  = getattr(self.config.TaskArchiver, 'uploadPublishDir', None)
        self.userFileCacheURL  = getattr(self.config.TaskArchiver, 'userFileCacheURL', None)

        # Set up optional histograms
        self.histogramKeys  = getattr(self.config.TaskArchiver, "histogramKeys", [])
        self.histogramBins  = getattr(self.config.TaskArchiver, "histogramBins", 10)
        self.histogramLimit = getattr(self.config.TaskArchiver, "histogramLimit", 5.0)

        if not self.useReqMgrForCompletionCheck:
            #sets the local monitor summary couch db
            self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL)

        # Resolve requireCouch before touching couch so the attribute exists
        # even when the connection attempt below fails.
        self.requireCouch = getattr(self.config.TaskArchiver, 'requireCouch', False)

        # Start a couch server for getting job info
        # from the FWJRs for committal to archive
        try:
            workDBName       = getattr(self.config.TaskArchiver, 'workloadSummaryCouchDBName',
                                        'workloadsummary')
            # No default here: a missing URL raises AttributeError, which
            # the handler below treats as a connection failure.
            workDBurl        = getattr(self.config.TaskArchiver, 'workloadSummaryCouchURL')
            jobDBurl         = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
            jobDBName        = self.config.JobStateMachine.couchDBName
            self.jobCouchdb  = CouchServer(jobDBurl)
            self.workCouchdb = CouchServer(workDBurl)

            self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
            self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
            self.workdatabase = self.workCouchdb.connectDatabase(workDBName)

            logging.debug("Using url %s/%s for job" % (jobDBurl, jobDBName))
            logging.debug("Writing to  %s/%s for workloadSummary" % (sanitizeURL(workDBurl)['url'], workDBName))
        except Exception as ex:
            msg =  "Error in connecting to couch.\n"
            msg += str(ex)
            logging.error(msg)
            # Leave every database handle defined so later code sees None
            # instead of a missing attribute.
            self.jobsdatabase = None
            self.fwjrdatabase = None
            self.workdatabase = None
            if self.requireCouch:
                raise TaskArchiverPollerException(msg)
Ejemplo n.º 46
0
    def availableWork(self, thresholds, siteJobCounts, teams = None, wfs = None):
        """
        Get work which is available to be run

        thresholds is taken to be a dictionary mapping a site name to the
        maximum number of running jobs at that site.

        siteJobCounts is taken to be a dictionary of dictionaries: site
        name, then task priority, with the number of jobs running at that
        priority as value. It is updated in place for every accepted
        element, weighting the element's job count by its 'blowupFactor'
        (1.0 when absent).

        teams and wfs, when given, are passed to the couch list function as
        restrictions; wfs is sent in slices of 20 workflows.

        Returns the tuple (elements, thresholds, siteJobCounts).
        """
        self.logger.info("Getting available work from %s/%s" %
                         (sanitizeURL(self.server.url)['url'], self.db.name))
        elements = []

        # Sites are no longer pre-filtered on idle job slots: higher
        # priority jobs may legitimately over-allocate a site.

        # Without any site there is nothing to match against.
        if not thresholds:
            self.logger.error("No thresholds is set: Please check")
            return elements, thresholds, siteJobCounts

        options = {'include_docs': True,
                   'descending': True,
                   'resources': thresholds}
        if teams:
            options['teams'] = teams
            self.logger.info("setting teams %s" % teams)

        if wfs:
            result = []
            for offset in xrange(0, len(wfs), 20):
                options['wfs'] = wfs[offset:offset + 20]
                chunk = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
                result.extend(json.loads(chunk))
            # sort the merged list
            wqeKey = 'WMCore.WorkQueue.DataStructs.WorkQueueElement.WorkQueueElement'
            result.sort(key = lambda doc: doc[wqeKey]['Priority'])
        else:
            rawList = self.db.loadList('WorkQueue', 'workRestrictions', 'availableByPriority', options)
            result = json.loads(rawList)
            if len(result) == 0:
                noWorkMsg = """No available work in WQ or didn't pass workqueue restriction 
                                    - check Pileup, site white list, etc"""
                self.logger.info(noWorkMsg)
            self.logger.debug("Available Work:\n %s \n for resources\n %s" % (result, thresholds))

        # Walk the candidates and keep those for which some site passes the
        # element's site restriction and still has room at this priority.
        for doc in result:
            element = CouchWorkQueueElement.fromDocument(self.db, doc)
            prio = element['Priority']

            possibleSite = None
            sites = thresholds.keys()
            random.shuffle(sites)
            for site in sites:
                if not element.passesSiteRestriction(site):
                    continue
                # Jobs counted at this site at the same or a higher priority
                curJobCount = sum(count if sitePrio >= prio else 0
                                  for sitePrio, count in siteJobCounts.get(site, {}).items())
                self.logger.debug("Job Count: %s, site: %s threshods: %s" % (curJobCount, site, thresholds[site]))
                if curJobCount < thresholds[site]:
                    possibleSite = site
                    break

            if not possibleSite:
                self.logger.info("No possible site for %s" % element['RequestName'])
                continue

            self.logger.debug("Possible site exists %s" % str(possibleSite))
            elements.append(element)
            if possibleSite not in siteJobCounts:
                siteJobCounts[possibleSite] = {}
            perSite = siteJobCounts[possibleSite]
            perSite[prio] = perSite.setdefault(prio, 0) + element['Jobs']*element.get('blowupFactor', 1.0)

        # Two stable sorts: highest priority first, oldest first within a
        # priority.
        elements.sort(key = lambda wqe: wqe['CreationTime'])
        elements.sort(key = lambda wqe: wqe['Priority'], reverse = True)

        return elements, thresholds, siteJobCounts
Ejemplo n.º 47
0
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.

        :param config: agent configuration object. The JobAccountant, ACDC,
            JobStateMachine and TaskArchiver sections are read here, and the
            whole object is passed on to ChangeState.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        # current_thread() is the supported spelling of the deprecated
        # currentThread() alias; it returns the same thread object, which is
        # expected to carry the logger and dbi attributes used below.
        myThread = threading.current_thread()
        self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer",
                                        logger = myThread.logger,
                                        dbinterface = myThread.dbi)

        # DAOs obtained from self.daofactory (presumably set up by
        # WMConnectionBase.__init__ for the "WMCore.WMBS" package -- confirm).
        self.getOutputMapAction      = self.daofactory(classname = "Jobs.GetOutputMap")
        self.bulkAddToFilesetAction  = self.daofactory(classname = "Fileset.BulkAddByLFN")
        self.bulkParentageAction     = self.daofactory(classname = "Files.AddBulkParentage")
        self.getJobTypeAction        = self.daofactory(classname = "Jobs.GetType")
        self.getParentInfoAction     = self.daofactory(classname = "Files.GetParentInfo")
        self.setParentageByJob       = self.daofactory(classname = "Files.SetParentageByJob")
        self.setFileRunLumi          = self.daofactory(classname = "Files.AddRunLumi")
        self.setFileLocation         = self.daofactory(classname = "Files.SetLocationByLFN")
        self.setFileAddChecksum      = self.daofactory(classname = "Files.AddChecksumByLFN")
        self.addFileAction           = self.daofactory(classname = "Files.Add")
        self.jobCompleteInput        = self.daofactory(classname = "Jobs.CompleteInput")
        self.setBulkOutcome          = self.daofactory(classname = "Jobs.SetOutcomeBulk")
        self.getWorkflowSpec         = self.daofactory(classname = "Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID          = self.daofactory(classname = "Jobs.LoadFromID")
        self.getFullJobInfo          = self.daofactory(classname = "Jobs.LoadForErrorHandler")

        # DAOs obtained from the DBS3Buffer factory created above.
        self.dbsStatusAction       = self.dbsDaoFactory(classname = "DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction     = self.dbsDaoFactory(classname = "DBSBufferFiles.GetChildren")
        self.dbsCreateFiles        = self.dbsDaoFactory(classname = "DBSBufferFiles.Add")
        self.dbsSetLocation        = self.dbsDaoFactory(classname = "DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation     = self.dbsDaoFactory(classname = "DBSBufferFiles.AddLocation")
        self.dbsSetChecksum        = self.dbsDaoFactory(classname = "DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi         = self.dbsDaoFactory(classname = "DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow        = self.dbsDaoFactory(classname = "ListWorkflow")

        self.dbsLFNHeritage        = self.dbsDaoFactory(classname = "DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # ACDC service
        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # The FWJR database is addressed as "<couchDBName>/fwjrs" on the
        # sanitized JobStateMachine couch URL.
        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL)

        # Hold data for later commital
        self.dbsFilesToCreate  = []
        self.wmbsFilesToBuild  = []
        self.fileLocation      = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave  = []
        self.listOfJobsToFail  = []
        self.filesetAssoc      = []
        self.parentageBinds    = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        # Bounded deques: once 1000 entries are held, appending a new one
        # discards the oldest.
        self.datasetAlgoID     = collections.deque(maxlen = 1000)
        self.datasetAlgoPaths  = collections.deque(maxlen = 1000)
        self.dbsLocations      = collections.deque(maxlen = 1000)
        self.workflowIDs       = collections.deque(maxlen = 1000)
        self.workflowPaths     = collections.deque(maxlen = 1000)

        # The PhEDEx node map is fetched once, at construction time.
        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()
Ejemplo n.º 48
0
 def __init__(self, couchURL, dbName=None):
     """
     Run the shared initialisation against a sanitized couch URL.

     :param couchURL: couch URL; only the 'url' entry of
         sanitizeURL(couchURL) is forwarded.
     :param dbName: database name forwarded unchanged (defaults to None).
     """
     # set the connection for local couchDB call
     self._commonInit(sanitizeURL(couchURL)['url'], dbName)
Ejemplo n.º 49
0
    def testA_BasicFunctionTest(self):
        """
        _BasicFunctionTest_

        Tests the components, by seeing if they can process a simple set of closeouts
        """

        # current_thread() is the supported spelling of the deprecated
        # currentThread() alias.
        myThread = threading.current_thread()

        config = self.getConfig()
        workloadPath = os.path.join(self.testDir, "specDir", "spec.pkl")
        workload = self.createWorkload(workloadName=workloadPath)
        # The returned job groups are not used afterwards; the calls are kept
        # for what they set up (presumably the two subscriptions asserted
        # below -- confirm).
        self.createTestJobGroup(
            config=config, name=workload.name(), specLocation=workloadPath, error=False
        )

        # Create second workload
        self.createTestJobGroup(
            config=config,
            name="%s_2" % workload.name(),
            specLocation=workloadPath,
            task="/TestWorkload/ReReco/LogCollect",
        )

        cachePath = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "ReReco")
        os.makedirs(cachePath)
        self.assertTrue(os.path.exists(cachePath))

        cachePath2 = os.path.join(config.JobCreator.jobCacheDir, "TestWorkload", "LogCollect")
        os.makedirs(cachePath2)
        self.assertTrue(os.path.exists(cachePath2))

        result = myThread.dbi.processData("SELECT * FROM wmbs_subscription")[0].fetchall()
        self.assertEqual(len(result), 2)

        testTaskArchiver = TaskArchiverPoller(config=config)
        testTaskArchiver.algorithm()

        # After archiving, these tables must be empty.
        for table in ("wmbs_job", "wmbs_subscription", "wmbs_jobgroup"):
            result = myThread.dbi.processData("SELECT * FROM %s" % table)[0].fetchall()
            self.assertEqual(len(result), 0)
        # NOTE(review): wmbs_fileset has always been queried here without any
        # assertion on the rows. The query is kept as-is; whether the table
        # is expected to be empty needs confirming before asserting on it.
        myThread.dbi.processData("SELECT * FROM wmbs_fileset")[0].fetchall()
        result = myThread.dbi.processData("SELECT * FROM wmbs_file_details")[0].fetchall()
        self.assertEqual(len(result), 0)

        # Make sure we deleted the directory
        self.assertFalse(os.path.exists(cachePath))
        self.assertFalse(os.path.exists(os.path.join(self.testDir, "workloadTest/TestWorkload")))

        testWMBSFileset = Fileset(id=1)
        self.assertEqual(testWMBSFileset.exists(), False)

        dbname = config.JobStateMachine.couchDBName
        couchdb = CouchServer(config.JobStateMachine.couchurl)
        workdatabase = couchdb.connectDatabase(dbname)

        workloadSummary = workdatabase.document(id="TestWorkload")
        # Check ACDC
        self.assertEqual(workloadSummary["ACDCServer"], sanitizeURL(config.ACDC.couchurl)["url"])

        # Check the output
        # Compare sorted key lists: dict key order is not part of the contract,
        # and on Python 3 keys() is a view that never equals a list.
        self.assertEqual(
            sorted(workloadSummary["output"].keys()),
            sorted(["/Electron/MorePenguins-v0/RECO", "/Electron/MorePenguins-v0/ALCARECO"]),
        )

        rerecoPerf = workloadSummary["performance"]["/TestWorkload/ReReco"]["cmsRun1"]

        # Check performance
        # Check histograms
        # Computed floats are compared with assertAlmostEqual (7 decimal
        # places by default) instead of exact equality.
        firstBin = rerecoPerf["AvgEventTime"]["histogram"][0]
        self.assertAlmostEqual(firstBin["average"], 0.062651899999999996)
        self.assertEqual(firstBin["nEvents"], 5)

        # Check standard performance
        self.assertAlmostEqual(rerecoPerf["TotalJobCPU"]["average"], 9.4950600000000005)
        self.assertAlmostEqual(rerecoPerf["TotalJobCPU"]["stdDev"], 8.2912400000000002)

        # Check worstOffenders
        self.assertEqual(
            rerecoPerf["AvgEventTime"]["worstOffenders"],
            [
                {"logCollect": None, "log": None, "value": "0.894052", "jobID": 1},
                {"logCollect": None, "log": None, "value": "0.894052", "jobID": 2},
                {"logCollect": None, "log": None, "value": "0.894052", "jobID": 3},
            ],
        )

        # LogCollect task is made out of identical FWJRs
        # assert that it is identical
        logCollectPerf = workloadSummary["performance"]["/TestWorkload/ReReco/LogCollect"]["cmsRun1"]
        for x in logCollectPerf.keys():
            if x in config.TaskArchiver.histogramKeys:
                continue
            for y in ["average", "stdDev"]:
                self.assertEqual(logCollectPerf[x][y], rerecoPerf[x][y])

        return
Ejemplo n.º 50
0
 def _sanitizeURL(self, couchURL):
     """Return the 'url' entry of sanitizeURL(couchURL)."""
     sanitized = sanitizeURL(couchURL)
     return sanitized['url']
Ejemplo n.º 51
0
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.

        :param config: agent configuration object. The JobAccountant, ACDC,
            JobStateMachine and TaskArchiver sections are read here, and the
            whole object is passed on to ChangeState.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        # current_thread() is the supported spelling of the deprecated
        # currentThread() alias; it returns the same thread object, which is
        # expected to carry the logger and dbi attributes used below.
        myThread = threading.current_thread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        # DAOs obtained from self.daofactory (presumably set up by
        # WMConnectionBase.__init__ for the "WMCore.WMBS" package -- confirm).
        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        # Unlike the DAOs above, this one is executed immediately and only its
        # result is stored.
        self.pnn_to_psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        # DAOs obtained from the DBS3Buffer factory created above.
        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco
        # Default is 12 * 1024^3 (12 GiB if the unit is bytes -- confirm).
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        # The FWJR database is addressed as "<couchDBName>/fwjrs" on the
        # sanitized JobStateMachine couch URL.
        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for later commital
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        # Bounded deques: once 1000 entries are held, appending a new one
        # discards the oldest. dbsLocations is an unbounded set instead.
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        # The PhEDEx node map is fetched once, at construction time.
        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()
Ejemplo n.º 52
0
    def archiveWorkflowSummary(self, spec):
        """
        _archiveWorkflowSummary_

        For each workflow pull its information from couch and WMBS and turn it into
        a summary for archiving

        :param spec: workload spec; name(), getCampaign() and
            listAllTaskPathNames() are called on it.
        :returns: None. Returns early, before any other query, when a document
            named after the workflow already exists in self.workdatabase.
        """

        failedJobs = []

        workflowData = {'retryData': {}}
        workflowName = spec.name()

        #First make sure that we didn't upload something already
        #Could be the that the WMBS deletion epic failed,
        #so we can skip this if there is a summary already up there
        #TODO: With multiple agents sharing workflows, we will need to differentiate and combine summaries for a request
        if self.workdatabase.documentExists(workflowName):
            logging.info("Couch summary for %s already exists, proceeding only with cleanup", workflowName)
            return

        # Set campaign
        workflowData['campaign'] = spec.getCampaign()

        # Get a list of failed job IDs
        # Make sure you get it for ALL tasks in the spec
        for taskName in spec.listAllTaskPathNames():
            failedTmp = self.jobsdatabase.loadView("JobDump", "failedJobsByWorkflowName",
                                                   options = {"startkey": [workflowName, taskName],
                                                              "endkey": [workflowName, taskName]})['rows']
            failedJobs.extend(entry['value'] for entry in failedTmp)

        # Retry counts: each row key is indexed as [?, count, taskName]
        # (positions 1 and 2 are the only ones read here).
        retryData = self.jobsdatabase.loadView("JobDump", "retriesByTask",
                                               options = {'group_level': 3,
                                                          'startkey': [workflowName],
                                                          'endkey': [workflowName, {}]})['rows']
        for row in retryData:
            taskName = row['key'][2]
            count    = str(row['key'][1])
            workflowData['retryData'].setdefault(taskName, {})[count] = row['value']

        # NOTE(review): within this block, failedJobs, output and the finished
        # workflowData are built but never consumed or returned; presumably
        # the code that uses them follows this excerpt -- confirm. The queries
        # are kept untouched.
        output = self.fwjrdatabase.loadView("FWJRDump", "outputByWorkflowName",
                                            options = {"group_level": 2,
                                                       "startkey": [workflowName],
                                                       "endkey": [workflowName, {}],
                                                       "group": True})['rows']

        # Copy one level deep so the summary does not alias the per-key
        # mappings returned by handleCouchPerformance.
        perf = self.handleCouchPerformance(workflowName = workflowName)
        workflowData['performance'] = {}
        for key in perf:
            workflowData['performance'][key] = dict(perf[key])

        workflowData["_id"] = workflowName
        try:
            workflowData["ACDCServer"]   = sanitizeURL(self.config.ACDC.couchurl)['url']
            workflowData["ACDCDatabase"] = self.config.ACDC.database
        except AttributeError as ex:
            # We're missing the ACDC info.
            # Keep going
            logging.error("ACDC info missing from config.  Skipping this step in the workflow summary.")
            logging.error("Error: %s", str(ex))
Ejemplo n.º 53
0
    def recordInCouch(self, jobs, newstate, oldstate, updatesummary=False):
        """
        _recordInCouch_

        Record relevant job information in couch. If the job does not yet exist
        in couch it will be saved as a seperate document.  If the job has a FWJR
        attached that will be saved as a seperate document.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        timestamp = int(time.time())
        couchRecordsToUpdate = []

        for job in jobs:
            couchDocID = job.get("couch_record", None)

            if newstate == "new":
                oldstate = "none"

            if job.get("site_cms_name", None):
                if newstate == "executing":
                    jobLocation = job["site_cms_name"]
                else:
                    jobLocation = "Agent"
            else:
                jobLocation = "Agent"

            if couchDocID is None:
                jobDocument = {}
                jobDocument["_id"] = str(job["id"])
                job["couch_record"] = jobDocument["_id"]
                jobDocument["jobid"] = job["id"]
                jobDocument["workflow"] = job["workflow"]
                jobDocument["task"] = job["task"]
                jobDocument["owner"] = job["owner"]

                jobDocument["inputfiles"] = []
                for inputFile in job["input_files"]:
                    docInputFile = inputFile.json()

                    docInputFile["parents"] = []
                    for parent in inputFile["parents"]:
                        docInputFile["parents"].append({"lfn": parent["lfn"]})

                    jobDocument["inputfiles"].append(docInputFile)

                jobDocument["states"] = {"0": {"oldstate": oldstate,
                                               "newstate": newstate,
                                               "location": jobLocation,
                                               "timestamp": timestamp}}

                jobDocument["jobgroup"] = job["jobgroup"]
                jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                       "LastEvent": job["mask"]["LastEvent"],
                                       "FirstLumi": job["mask"]["FirstLumi"],
                                       "LastLumi": job["mask"]["LastLumi"],
                                       "FirstRun": job["mask"]["FirstRun"],
                                       "LastRun": job["mask"]["LastRun"]}

                if job['mask']['runAndLumis'] != {}:
                    # Then we have to save the mask runAndLumis
                    jobDocument['mask']['runAndLumis'] = {}
                    for key in job['mask']['runAndLumis'].keys():
                        jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

                jobDocument["name"] = job["name"]
                jobDocument["type"] = "job"
                jobDocument["user"] = job.get("user", None)
                jobDocument["group"] = job.get("group", None)
                jobDocument["taskType"] = job.get("taskType", "Unknown")
                jobDocument["jobType"] = job.get("jobType", "Unknown")

                couchRecordsToUpdate.append({"jobid": job["id"],
                                             "couchid": jobDocument["_id"]})
                self.jobsdatabase.queue(jobDocument, callback=discardConflictingDocument)
            else:
                # We send a PUT request to the stateTransition update handler.
                # Couch expects the parameters to be passed as arguments to in
                # the URI while the Requests class will only encode arguments
                # this way for GET requests.  Changing the Requests class to
                # encode PUT arguments as couch expects broke a bunch of code so
                # we'll just do our own encoding here.
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    newstate,
                                                                                    jobLocation,
                                                                                    timestamp)
                self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)

            # updating the status of the summary doc only when it is explicitely requested
            # doc is already in couch
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary status for job %s", jobSummaryId)

                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary state history for job %s", jobSummaryId)

            if job.get("fwjr", None):

                cachedByWorkflow = self.workloadCache.setdefault(job['workflow'],
                                                                 getDataFromSpecFile(
                                                                     self.getWorkflowSpecDAO.execute(job['task'])[
                                                                         job['task']]['spec']))
                job['fwjr'].setCampaign(cachedByWorkflow.get('Campaign', ''))
                job['fwjr'].setPrepID(cachedByWorkflow.get(job['task'], ''))
                # If there are too many input files, strip them out
                # of the FWJR, as they should already
                # be in the database
                # This is not critical
                try:
                    if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                        job['fwjr'].stripInputFiles()
                except Exception as ex:
                    logging.error("Error while trying to strip input files from FWJR.  Ignoring. : %s", str(ex))

                if newstate == "retrydone":
                    jobState = "jobfailed"
                else:
                    jobState = newstate

                # there is race condition updating couch record location and job is completed.
                # for the fast fail job, it could miss the location update
                job["location"] = job["fwjr"].getSiteName() or job.get("location", "Unknown")
                # complete fwjr document
                job["fwjr"].setTaskName(job["task"])
                jsonFWJR = job["fwjr"].__to_json__(None)

                # Don't archive cleanup job report
                if job["jobType"] == "Cleanup":
                    archStatus = "skip"
                else:
                    archStatus = "ready"

                fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                                "jobid": job["id"],
                                "jobtype": job["jobType"],
                                "jobstate": jobState,
                                "retrycount": job["retry_count"],
                                "archivestatus": archStatus,
                                "fwjr": jsonFWJR,
                                "type": "fwjr"}
                self.fwjrdatabase.queue(fwjrDocument, timestamp=True, callback=discardConflictingDocument)

                updateSummaryDB(self.statsumdatabase, job)

                # TODO: can add config switch to swich on and off
                # if self.config.JobSateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
                if (job["retry_count"] > 0) or (newstate != 'success'):
                    jobSummaryId = job["name"]
                    # building a summary of fwjr
                    logging.debug("Pushing job summary for job %s", jobSummaryId)
                    errmsgs = {}
                    inputs = []
                    if "steps" in fwjrDocument["fwjr"]:
                        for step in fwjrDocument["fwjr"]["steps"]:
                            if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                                errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                            if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in \
                                    fwjrDocument["fwjr"]["steps"][step]["input"]:
                                inputs.extend(
                                    [source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"]
                                     if "runs" in source])

                    outputs = []
                    outputDataset = None
                    for singlestep in job["fwjr"].listSteps():
                        for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                            if singlefile:
                                if len(singlefile.get('locations', set())) > 1:
                                    locations = list(singlefile.get('locations'))
                                elif singlefile.get('locations'):
                                    locations = singlefile['locations'].pop()
                                else:
                                    locations = set()
                                if CMSSTEP.match(singlestep):
                                    outType = 'output'
                                else:
                                    outType = singlefile.get('module_label', None)
                                outputs.append({'type': outType,
                                                'lfn': singlefile.get('lfn', None),
                                                'location': locations,
                                                'checksums': singlefile.get('checksums', {}),
                                                'size': singlefile.get('size', None)})
                                # it should have one output dataset for all the files
                                outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset
                    inputFiles = []
                    for inputFileStruct in job["fwjr"].getAllInputFiles():
                        # check if inputFileSummary needs to be extended
                        inputFileSummary = {}
                        inputFileSummary["lfn"] = inputFileStruct["lfn"]
                        inputFileSummary["input_type"] = inputFileStruct["input_type"]
                        inputFiles.append(inputFileSummary)

                    # Don't record intermediate jobfailed status in the jobsummary
                    # change to jobcooloff which will be overwritten by error handler anyway
                    if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                        summarystate = 'jobcooloff'
                    else:
                        summarystate = newstate

                    jobSummary = {"_id": jobSummaryId,
                                  "wmbsid": job["id"],
                                  "type": "jobsummary",
                                  "retrycount": job["retry_count"],
                                  "workflow": job["workflow"],
                                  "task": job["task"],
                                  "jobtype": job["jobType"],
                                  "state": summarystate,
                                  "site": job.get("location", None),
                                  "cms_location": job["fwjr"].getSiteName(),
                                  "exitcode": job["fwjr"].getExitCode(),
                                  "eos_log_url": job["fwjr"].getLogURL(),
                                  "worker_node_info": job["fwjr"].getWorkerNodeInfo(),
                                  "errors": errmsgs,
                                  "lumis": inputs,
                                  "outputdataset": outputDataset,
                                  "inputfiles": inputFiles,
                                  "acdc_url": "%s/%s" % (
                                  sanitizeURL(self.config.ACDC.couchurl)['url'], self.config.ACDC.database),
                                  "agent_name": self.config.Agent.hostName,
                                  "output": outputs}
                    if couchDocID is not None:
                        try:
                            currentJobDoc = self.jsumdatabase.document(id=jobSummaryId)
                            jobSummary['_rev'] = currentJobDoc['_rev']
                            jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                            # record final status transition
                            if newstate == 'success':
                                finalStateDict = {'oldstate': oldstate,
                                                  'newstate': newstate,
                                                  'location': job["location"],
                                                  'timestamp': timestamp}
                                jobSummary['state_history'].append(finalStateDict)

                            noEmptyList = ["inputfiles", "lumis"]
                            for prop in noEmptyList:
                                jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                        except CouchNotFoundError:
                            pass
                    self.jsumdatabase.queue(jobSummary, timestamp=True)

        if len(couchRecordsToUpdate) > 0:
            self.setCouchDAO.execute(bulkList=couchRecordsToUpdate,
                                     conn=self.getDBConn(),
                                     transaction=self.existingTransaction())

        self.jobsdatabase.commit(callback=discardConflictingDocument)
        self.fwjrdatabase.commit(callback=discardConflictingDocument)
        self.jsumdatabase.commit()
        return