Example #1
    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s", self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0}, logger=self.logger)

        self.phedex = None
        self.rucio = None
        if not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={"logger": self.logger, "user_agent": "wmcore-microservices"})
Example #2
    def setUp(self):
        self.setConfig(config)
        self.setCouchDBs([(config.views.data.couch_reqmgr_db, "ReqMgr"),
                          (config.views.data.couch_reqmgr_aux_db, None)])
        self.setSchemaModules([])

        RESTBaseUnitTestWithDBBackend.setUp(self)

        self.setFakeDN()

        requestPath = os.path.join(getWMBASE(), "test", "data", "ReqMgr",
                                   "requests", "DMWM")
        with open(os.path.join(requestPath, "ReReco.json"), 'r') as rerecoFile:
            rerecoArgs = json.load(rerecoFile)
        self.rerecoCreateArgs = rerecoArgs["createRequest"]
        self.rerecoAssignArgs = rerecoArgs["assignRequest"]
        cmsswDoc = {"_id": "software"}
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]] = []
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]].append(
            self.rerecoCreateArgs["CMSSWVersion"])
        insertDataToCouch(os.getenv("COUCHURL"),
                          config.views.data.couch_reqmgr_aux_db, cmsswDoc)
        self.reqSvc = ReqMgr(self.jsonSender["host"])
        self.reqSvc._noStale = True
        self.reqSvc['requests'].additionalHeaders = self.create_header
Example #3
def main():
    """
    _main_
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiating central reqmgr and local workqueue
    print "ReqMgr2 URL  : %s" % sanitizeURL(config.JobUpdater.reqMgr2Url)['url']
    print "WorkQueue URL: %s and dbname %s" % (sanitizeURL(config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname)

    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl, config.WorkQueueManager.dbname)

    print "\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue"
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = [x for x in workqueue.getAvailableWorkflows()]
    print "Retrieved %d workflows from workqueue" % len(workflowsToCheck)

    for workflow, priority in workflowsToCheck:
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
            except Exception as ex:
                print("Couldn't retrieve the priority of request %s" % workflow)
                print("Error: %s" % ex)
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
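The snippet stops after collecting the new priorities; a hedged continuation (updatePriority() on the local workqueue helper is an assumption based on the JobUpdater component shown later on this page) could apply them:

    # Hedged continuation: push the new priorities to the local workqueue.
    # workqueue.updatePriority() is an assumption, not shown in the snippet.
    for workflow, priority in workflowsToUpdate.items():
        print("Updating %s to priority %s" % (workflow, priority))
        workqueue.updatePriority(workflow, priority)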
Example #4
    def __init__(self, msConfig, logger=None):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param logger: logger object (optional)
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), logger)
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                   httpDict={'cacheduration': 1.0},
                                   logger=self.logger)

        # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
        dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        if usingRucio():
            # FIXME: we cannot use Rucio in write mode yet
            # self.rucio = Rucio(self.msConfig['rucioAccount'], configDict={"logger": self.logger})
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
        else:
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
Example #5
    def setUp(self):
        self.setConfig(config)
        self.setCouchDBs([(config.views.data.couch_reqmgr_db, "ReqMgr"),
                          (config.views.data.couch_reqmgr_aux_db, None)])
        self.setSchemaModules([])

        RESTBaseUnitTestWithDBBackend.setUp(self)

        self.setFakeDN()

        normPath = os.path.normpath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '..'))
        rerecoPath = os.path.join(
            normPath, 'data/ReqMgr/requests/DMWM/ReReco_RunBlockWhite.json')
        with open(rerecoPath) as jObj:
            rerecoArgs = json.load(jObj)

        self.rerecoCreateArgs = rerecoArgs["createRequest"]
        self.rerecoAssignArgs = rerecoArgs["assignRequest"]
        cmsswDoc = {"_id": "software"}
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]] = []
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]].append(
            self.rerecoCreateArgs["CMSSWVersion"])
        insertDataToCouch(os.getenv("COUCHURL"),
                          config.views.data.couch_reqmgr_aux_db, cmsswDoc)
        self.reqSvc = ReqMgr(self.jsonSender["host"])
        self.reqSvc._noStale = True
        self.reqSvc['requests'].additionalHeaders = self.create_header
Example #6
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.config = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
                self.workQueue = localQueue(
                    **self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if not self.useReqMgrForCompletionCheck:
            #sets the local monitor summary couch db
            self.requestLocalCouchDB = RequestDBWriter(
                self.config.AnalyticsDataCollector.localT0RequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            self.centralCouchDBWriter = self.requestLocalCouchDB
        else:
            self.centralCouchDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.centralRequestDBURL)

            self.reqmgr2Svc = ReqMgr(
                self.config.TaskArchiver.ReqMgr2ServiceURL)
            #TODO: remove this when reqmgr2 replaces reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager(
                {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})

        #Load the cleanout state ID and save it
        stateIDDAO = self.daoFactory(classname="Jobs.GetStateID")
        self.stateID = stateIDDAO.execute("cleanout")

        return
Example #7
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
        # state lists which shouldn't be populated in wmbs. (To prevent creating work before WQE status updated)
        self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
Example #8
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.finishedWorflowCheck = daoFactory(
            classname="Subscriptions.CountFinishedSubscriptionsByWorkflow")
Example #9
    def setup(self, parameters):
        """
        Called at startup
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(self.config.TaskArchiver,
                                         'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(
            self.config.TaskArchiver.localWMStatsURL, "WMStatsAgent")

        #TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(
            self.config.AnalyticsDataCollector.centralRequestDBURL,
            couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        if self.useReqMgrForCompletionCheck:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.centralRequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            if self.config.TaskArchiver.reqmgr2Only:
                self.reqmgr2Svc = ReqMgr(
                    self.config.TaskArchiver.ReqMgr2ServiceURL)
            else:
                #TODO: remove this for reqmgr2
                self.reqmgrSvc = RequestManager(
                    {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.localT0RequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" %
                                                            jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                            jobDBName)

        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(
            statSummaryDBName)
Example #10
    def __init__(self, microConfig, status, logger=None):
        """
        Runs the basic setup and initialization for the MS Transferor module
        :param microConfig: microservice configuration
        :param status: request status this instance processes
        :param logger: logger object (optional)
        """
        self.msConfig = microConfig
        self.status = status
        self.uConfig = {}
        self.logger = getMSLogger(microConfig['verbose'], logger=logger)
        self.reqmgr2 = ReqMgr(microConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(microConfig['reqmgrUrl'],
                                   httpDict={'cacheduration': 60},
                                   logger=self.logger)
        # eventually will change it to Rucio
        self.phedex = PhEDEx(httpDict={'cacheduration': 10 * 60},
                             dbsUrl=microConfig['dbsUrl'],
                             logger=self.logger)
Example #11
class WorkQueueManagerCleaner(BaseWorkerThread):
    """
    Cleans expired items, updates element status.
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.finishedWorflowCheck = daoFactory(
            classname="Subscriptions.CountFinishedSubscriptionsByWorkflow")

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    @timeFunction
    def algorithm(self, parameters):
        """
        Check & expire negotiation failures
        """
        self.queue.logger.info("Start updating & cleaning...")
        try:
            self.queue.performQueueCleanupActions()
            # this will clean up whatever is left over from the above cleanup.
            # also, if the wq replication has a problem, it won't delay killing jobs in condor
            # and updating the wmbs status
            # state lists which shouldn't be populated in wmbs (to prevent creating work before the WQE status is updated)
            # completed status was added to the list due to a race condition
            requests = self.reqmgr2Svc.getRequestByStatusFromMemoryCache([
                "aborted", "aborted-completed", "force-complete", "completed"
            ]).getData()
            results = self.finishedWorflowCheck.execute(workflowNames=requests)

            requestsToKill = [
                reqInfo["workflow"] for reqInfo in results
                if reqInfo["open"] > 0
            ]

            for wf in requestsToKill:
                self.queue.killWMBSWorkflow(wf)

        except Exception as ex:
            self.queue.logger.exception("Error cleaning queue: %s" % str(ex))
        self.queue.logger.info("Finished updating & cleaning.")
Example #12
    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
            skipPhEDEx: boolean to skip PhEDEx initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False),
                                  kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'],
                                  logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0},
                                       logger=self.logger)

        self.phedex = None
        self.rucio = None
        if self.msConfig.get('useRucio',
                             False) and not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={
                                   "logger": self.logger,
                                   "user_agent": "wmcore-microservices"
                               })
        elif not kwargs.get("skipPhEDEx", False):
            # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
            dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
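For illustration, the two backend branches above could be selected with configurations along these lines (keys taken from the snippet, values are placeholders):

# Illustrative-only configuration dicts for the Rucio and PhEDEx code paths.
rucioModeConfig = {'reqmgr2Url': 'https://cmsweb.cern.ch/reqmgr2',
                   'useRucio': True,
                   'rucioAccount': 'wmcore_transferor',
                   'rucioUrl': 'http://cms-rucio.cern.ch',
                   'rucioAuthUrl': 'https://cms-rucio-auth.cern.ch'}
phedexModeConfig = {'reqmgr2Url': 'https://cmsweb.cern.ch/reqmgr2',
                    'useRucio': False}  # falls through to the PhEDEx branch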
Example #13
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
        # state lists which shouldn't be populated in wmbs. (To prevent creating work before WQE status updated)
        self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(expire=120)
Example #14
    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # this will break all in one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")

        # set the thread name before creating the log db;
        # only set it when it is not already set
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)

        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}
Example #15
class WorkQueueManagerWMBSFileFeeder(BaseWorkerThread):
    """
    Polls for Work
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        # state lists which shouldn't be populated in wmbs. (To prevent creating work before WQE status updated)
        self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()


    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    @timeFunction
    def algorithm(self, parameters):
        """
        Pull in work
        """
        try:
            self.getWorks()
        except Exception as ex:
            self.queue.logger.error("Error in wmbs inject loop: %s" % str(ex))

    def getWorks(self):
        """
        Inject work into wmbs for idle sites
        """
        self.queue.logger.info("Getting work and feeding WMBS files")

        # need to make sure jobs are created
        resources, jobCounts = freeSlots(minusRunning=True, allowedStates=['Normal', 'Draining'],
                                         knownCmsSites=cmsSiteNames())

        for site in resources:
            self.queue.logger.info("I need %d jobs on site %s" % (resources[site], site))

        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()

        previousWorkList = self.queue.getWork(resources, jobCounts, excludeWorkflows=abortedAndForceCompleteRequests)
        self.queue.logger.info("%s of units of work acquired for file creation"
                               % len(previousWorkList))
        return
Example #16
class WorkQueueManagerWMBSFileFeeder(BaseWorkerThread):
    """
    Polls for Work
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        # state lists which shouldn't be populated in wmbs. (To prevent creating work before WQE status updated)
        self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop', t)
        time.sleep(t)

    @timeFunction
    def algorithm(self, parameters):
        """
        Get work from local workqueue to be injected into WMBS/DBSBuffer
        """
        self.queue.logger.info("Getting work and feeding WMBS files...")
        try:
            # need to make sure jobs are created
            resources, jobCounts = freeSlots(
                minusRunning=True,
                allowedStates=['Normal', 'Draining'],
                knownCmsSites=cmsSiteNames())

            for site in resources:
                self.queue.logger.info("I need %d jobs on site %s" %
                                       (resources[site], site))

            abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()

            previousWorkList = self.queue.getWork(
                resources,
                jobCounts,
                excludeWorkflows=abortedAndForceCompleteRequests)
            self.queue.logger.info(
                "Acquired %s units of work for WMBS file creation",
                len(previousWorkList))
        except Exception as ex:
            self.queue.logger.error("Error in wmbs inject loop: %s" % str(ex))
Example #17
    def __init__(self, config=None, logger=None):
        """
        Setup a bunch of things, like:
         * logger for this service
         * initialize all the necessary service helpers
         * fetch the unified configuration from central couch
         * update the unified configuration with some deployment and default settings
         * start both transfer and monitor threads
        :param config: reqmgr2ms service configuration
        :param logger: logger object (optional)
        """
        self.uConfig = {}
        self.config = config
        self.logger = getMSLogger(getattr(config, 'verbose', False), logger)
        self._parseConfig(config)
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        self.reqmgr2 = ReqMgr(self.msConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgrUrl'],
                                   httpDict={'cacheduration': 60},
                                   logger=self.logger)

        # transferor has to look at workflows in assigned status
        self.msTransferor = MSTransferor(self.msConfig,
                                         "assigned",
                                         logger=self.logger)

        ### Last but not least, get the threads started
        thname = 'MSTransferor'
        self.transfThread = start_new_thread(
            thname, daemon, (self.transferor, 'assigned',
                             self.msConfig['interval'], self.logger))
        self.logger.debug("### Running %s thread %s", thname,
                          self.transfThread.running())

        thname = 'MSTransferorMonit'
        self.monitThread = start_new_thread(
            thname, daemon, (self.monitor, 'staging',
                             self.msConfig['interval'] * 2, self.logger))
        self.logger.debug("+++ Running %s thread %s", thname,
                          self.monitThread.running())
Example #18
    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config=self.config)
        self.reqmgr2 = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
Example #19
class WorkQueueManagerCleaner(BaseWorkerThread):
    """
    Cleans expired items, updates element status.
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.forbiddenStatus = ["aborted", "aborted-completed", "force-complete", "completed"]
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.finishedWorflowCheck = daoFactory(classname="Subscriptions.CountFinishedSubscriptionsByWorkflow")

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    @timeFunction
    def algorithm(self, parameters):
        """
        Check & expire negotiation failures
        """
        self.queue.logger.info("Start updating & cleaning...")
        try:
            self.queue.performQueueCleanupActions()
            # this will clean up whatever is left over from the above cleanup.
            # also, if the wq replication has a problem, it won't delay killing jobs in condor
            # and updating the wmbs status
            # state lists which shouldn't be populated in wmbs (to prevent creating work before the WQE status is updated)
            # completed status was added to the list due to a race condition
            requests = self.reqmgr2Svc.getRequestByStatusFromMemoryCache(self.forbiddenStatus).getData()
            results = self.finishedWorflowCheck.execute(workflowNames=requests)

            requestsToKill = [reqInfo["workflow"] for reqInfo in results if reqInfo["open"] > 0]

            self.queue.logger.info("Killing %d requests in WMBS ...", len(requestsToKill))
            self.queue.killWMBSWorkflows(requestsToKill)
        except Exception as ex:
            self.queue.logger.exception("Error cleaning queue: %s", str(ex))

        self.queue.logger.info("Finished updating & cleaning.")
Example #20
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.finishedWorflowCheck = daoFactory(classname="Subscriptions.CountFinishedSubscriptionsByWorkflow")
Example #21
def main():
    """
    _main_
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'

    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])

    # Instantiating central reqmgr and local workqueue
    print "ReqMgr2 URL  : %s" % sanitizeURL(
        config.JobUpdater.reqMgr2Url)['url']
    print "WorkQueue URL: %s and dbname %s" % (sanitizeURL(
        config.WorkQueueManager.couchurl)['url'],
                                               config.WorkQueueManager.dbname)

    reqmgr2 = ReqMgr(config.JobUpdater.reqMgr2Url)
    workqueue = WorkQueue(config.WorkQueueManager.couchurl,
                          config.WorkQueueManager.dbname)

    print "\nFirst attempt to update prio of wfs that are not in WMBS and only in local queue"
    priorityCache = {}
    workflowsToUpdate = {}
    workflowsToCheck = [x for x in workqueue.getAvailableWorkflows()]
    print "Retrieved %d workflows from workqueue" % len(workflowsToCheck)

    for workflow, priority in workflowsToCheck:
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = reqmgr2.getRequestByNames(
                    workflow)[workflow]['RequestPriority']
            except Exception as ex:
                print("Couldn't retrieve the priority of request %s" % workflow)
                print("Error: %s" % ex)
                continue
        if priority != priorityCache[workflow]:
            workflowsToUpdate[workflow] = priorityCache[workflow]
Example #22
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)

        self.dbsDaoFactory = DAOFactory(
            package="WMComponent.DBS3Buffer", logger=myThread.logger, dbinterface=myThread.dbi
        )

        self.config = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, "WorkQueueParams"):
                self.workQueue = localQueue(**self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig

                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, "useReqMgrForCompletionCheck", True)

        if not self.useReqMgrForCompletionCheck:
            # sets the local monitor summary couch db
            self.requestLocalCouchDB = RequestDBWriter(
                self.config.AnalyticsDataCollector.localT0RequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp,
            )
            self.centralCouchDBWriter = self.requestLocalCouchDB
        else:
            self.centralCouchDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL)

            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            # TODO: remove this when reqmgr2 replaces reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager({"endpoint": self.config.TaskArchiver.ReqMgrServiceURL})

        # Load the cleanout state ID and save it
        stateIDDAO = self.daoFactory(classname="Jobs.GetStateID")
        self.stateID = stateIDDAO.execute("cleanout")

        return
Example #23
class WorkQueueManagerCleaner(BaseWorkerThread):
    """
    Cleans expired items, updates element status.
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
        # state lists which shouldn't be populated in wmbs. (To prevent creating work before WQE status updated)
        self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    def algorithm(self, parameters):
        """
        Check & expire negotiation failures
        """
        self.queue.logger.info("Start updating & cleaning...")
        try:
            self.queue.performQueueCleanupActions()
            # this will clean up whatever is left over from the above cleanup.
            # also, if the wq replication has a problem, it won't delay killing jobs in condor
            # and updating the wmbs status
            abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()

            for wf in abortedAndForceCompleteRequests:
                self.queue.killWMBSWorkflow(wf)

        except Exception as ex:
            self.queue.logger.exception("Error cleaning queue: %s" % str(ex))
        self.queue.logger.info("Finished updating & cleaning.")
Example #24
    def advanceStatus(self, config):
        """
        Advance the request status based on the global workqueue elements status
        """
        reqmgrSvc = ReqMgr(config.reqmgr2_url, logger=self.logger)
        gqService = WorkQueue(config.workqueue_url)

        self.logger.info("Getting GQ data for status check")
        wfStatusDict = gqService.getWorkflowStatusFromWQE()

        self.logger.info("Advancing statuses")
        if getattr(config, "enableMSStatusTransition", False):
            moveTransferorStatus(reqmgrSvc, self.logger)
        moveForwardStatus(reqmgrSvc, wfStatusDict, self.logger)
        moveToCompletedForNoWQJobs(reqmgrSvc, wfStatusDict, self.logger)

        self.logger.info("Done advancing status")

        return
Example #25
    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # this will break all in one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")

        # set the thread name before creating the log db;
        # only set it when it is not already set
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)

        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}
Example #26
class WorkQueueManagerCleaner(BaseWorkerThread):
    """
    Cleans expired items, updates element status.
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
        # state lists which shouldn't be populated in wmbs. (To prevent creating work before WQE status updated)
        self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(expire=120)

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    def algorithm(self, parameters):
        """
        Check & expire negotiation failures
        """
        self.queue.logger.info("Start updating & cleaning...")
        try:
            self.queue.performQueueCleanupActions()
            # this will clean up whatever is left over from the above cleanup.
            # also, if the wq replication has a problem, it won't delay killing jobs in condor
            # and updating the wmbs status
            abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()

            for wf in abortedAndForceCompleteRequests:
                self.queue.killWMBSWorkflow(wf)

        except Exception as ex:
            self.queue.logger.exception("Error cleaning queue: %s" % str(ex))
        self.queue.logger.info("Finished updating & cleaning.")
Example #27
    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config=self.config)
        self.reqmgr2 = ReqMgr(self.config.JobUpdater.reqMgr2Url)
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
Example #28
    def advanceStatus(self, config):
        """
        Advance the request status based on the global workqueue elements status
        """

        reqmgrSvc = ReqMgr(config.reqmgr2_url, logger=self.logger)
        gqService = WorkQueue(config.workqueue_url)
        wmstatsSvc = WMStatsServer(config.wmstats_url, logger=self.logger)

        self.logger.info("Getting GQ data for status check")
        wfStatusDict = gqService.getWorkflowStatusFromWQE()

        self.logger.info("Advancing status")
        moveForwardStatus(reqmgrSvc, wfStatusDict, self.logger)
        moveToCompletedForNoWQJobs(reqmgrSvc, wfStatusDict, self.logger)
        moveToArchived(wmstatsSvc, reqmgrSvc, config.archiveDelayHours,
                       self.logger)

        self.logger.info("Done advancing status")

        return
Example #29
    def advanceStatus(self, config):
        """
        Advance the request status based on the global workqueue elements status
        """
        reqmgrSvc = ReqMgr(config.reqmgr2_url, logger=self.logger)
        gqService = WorkQueue(config.workqueue_url)
        wmstatsSvc = WMStatsServer(config.wmstats_url, logger=self.logger)
        logdb = LogDB(config.central_logdb_url, config.log_reporter)

        self.logger.info("Getting GQ data for status check")
        wfStatusDict = gqService.getWorkflowStatusFromWQE()

        self.logger.info("Advancing status")
        moveForwardStatus(reqmgrSvc, wfStatusDict, self.logger)
        moveToCompletedForNoWQJobs(reqmgrSvc, wfStatusDict, self.logger)
        moveToArchived(wmstatsSvc, reqmgrSvc, logdb, config.archiveDelayHours,
                       self.logger)

        self.logger.info("Done advancing status")

        return
Example #30
    def setup(self, parameters):
        """
        Called at startup
        """
        self.teamName = self.config.Agent.teamName
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL,
                                            "WMStatsAgent")

        #TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        if self.useReqMgrForCompletionCheck:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL,
                                                          couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            if self.config.TaskArchiver.reqmgr2Only:
                self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            else:
                #TODO: remove this for reqmgr2
                self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL,
                                                          couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)

        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)

        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)
Example #31
    def setUp(self):
        self.setConfig(config)
        self.setCouchDBs([(config.views.data.couch_reqmgr_db, "ReqMgr"), 
                          (config.views.data.couch_reqmgr_aux_db, None)])
        self.setSchemaModules([])
        
        RESTBaseUnitTestWithDBBackend.setUp(self)

        self.setFakeDN()
        
        requestPath = os.path.join(getWMBASE(), "test", "data", "ReqMgr", "requests", "DMWM")
        with open(os.path.join(requestPath, "ReReco.json"), 'r') as rerecoFile:
            rerecoArgs = JsonWrapper.load(rerecoFile)
        self.rerecoCreateArgs = rerecoArgs["createRequest"]
        self.rerecoAssignArgs = rerecoArgs["assignRequest"]
        cmsswDoc = {"_id": "software"}
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]] = []
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]].append(self.rerecoCreateArgs["CMSSWVersion"])
        insertDataToCouch(os.getenv("COUCHURL"), config.views.data.couch_reqmgr_aux_db, cmsswDoc)
        self.reqSvc = ReqMgr(self.jsonSender["host"])
        self.reqSvc._noStale = True
        self.reqSvc['requests'].additionalHeaders = self.create_header
Example #32
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """

    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        # Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config, insertStates=True)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsToCache = int(getattr(self.config.JobSubmitter, 'maxJobsToCache', 50000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')
        self.drainGracePeriod = getattr(self.config.JobSubmitter, 'drainGraceTime', 2 * 24 * 60 * 60)  # 2 days

        # Used for speed draining the agent
        self.enableAllSites = False

        # Additions for caching-based JobSubmitter
        self.jobsByPrio = {}  # key'ed by the final job priority, which contains a set of job ids
        self.jobDataCache = {}  # key'ed by the job id, containing the whole job info dict
        self.jobsToPackage = {}
        self.locationDict = {}
        self.drainSites = dict()
        self.drainSitesSet = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 case - just for clarity (this private variable shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a sandbox directory, figure out which PackageCollection
        the next package should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk once they reach the configured
        packageSize (500 jobs by default).  The
        flushJobPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)
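            # e.g. <sandboxDir>/PackageCollection_0/batch_1234-0, where "1234-0"
            # is "<id of the first job packaged>-<its retry count>" (illustrative values)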

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = list(self.jobsToPackage.keys())
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def hasToRefreshCache(self):
        """
        _hasToRefreshCache_

        Check whether we should update the job data cache (or update it
        with new jobs in the created state) or if we just skip it.
        """
        if self.cacheRefreshSize == -1 or len(self.jobDataCache) < self.cacheRefreshSize or\
                        self.refreshPollingCount >= self.skipRefreshCount:
            self.refreshPollingCount = 0
            return True
        else:
            self.refreshPollingCount += 1
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(self.jobDataCache))
        return False

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sandbox
          - Path to cache directory
        """
        # make a counter for jobs pending to sites in drain mode within the grace period
        countDrainingJobs = 0
        timeNow = int(time.time())
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        newJobIds = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.jobDataCache))

        newJobs = self.listJobsAction.execute(limitRows=self.maxJobsToCache)
        if self.useReqMgrForCompletionCheck:
            # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
            abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        else:
            abortedAndForceCompleteRequests = []

        logging.info("Found %s new jobs to be submitted.", len(newJobs))

        if self.enableAllSites:
            logging.info("Agent is in speed drain mode. Submitting jobs to all possible locations.")

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            # check whether newJob belongs to an aborted or force-complete workflow, and skip it if so
            if newJob['request_name'] in abortedAndForceCompleteRequests and \
                            newJob['task_type'] not in ['LogCollect', "Cleanup"]:
                continue

            jobID = newJob['id']
            newJobIds.add(jobID)
            if jobID in self.jobDataCache:
                continue

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                with open(pickledJobPath, 'rb') as jobHandle:
                    loadedJob = pickle.load(jobHandle)
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                logging.warning("Input data location doesn't pass the site restrictions for job id: %s", jobID)
                badJobs[71101].append(newJob)
                continue

            # if agent is in speed drain and has hit the threshold to submit to all sites, we can skip the logic below that exclude sites
            if not self.enableAllSites:
                # check for sites in aborted state and adjust the possible locations
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['possibleSites'] = possibleLocations
                    logging.warning("Job id %s can only run at a site in Aborted state", jobID)
                    badJobs[71102].append(newJob)
                    continue

                # try to remove draining sites if possible, this is needed to stop
                # jobs that could run anywhere blocking draining sites
                # if the job type is Merge, LogCollect or Cleanup this is skipped
                if newJob['task_type'] not in self.ioboundTypes:
                    nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                    if nonDrainingSites:  # if there is at least one viable non-draining site, remove the draining ones
                        possibleLocations = nonDrainingSites
                    elif self.failJobDrain(timeNow, possibleLocations):
                        newJob['possibleSites'] = possibleLocations
                        logging.warning("Job id %s can only run at a sites in Draining state", jobID)
                        badJobs[71104].append(newJob)
                        continue
                    else:
                        countDrainingJobs += 1
                        continue

            # Sigh...make sure the job added to the package has the proper retry_count
            loadedJob['retry_count'] = newJob['retry_count']
            batchDir = self.addJobsToPackage(loadedJob)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = newJob['task_prio'] * self.maxTaskPriority + newJob['wf_priority']
            self.jobsByPrio.setdefault(jobPrio, set())
            self.jobsByPrio[jobPrio].add(jobID)

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'taskPriority': newJob['task_prio'],
                       'custom': {'location': None},  # update later
                       'packageDir': batchDir,
                       'retry_count': newJob["retry_count"],
                       'sandbox': loadedJob["sandbox"],  # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),  # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),  # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", []),
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores"),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'inputPileup': loadedJob.get('inputPileup', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}
            # then update it with the info retrieved from the database
            jobInfo.update(newJob)

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # Persist remaining job packages to disk
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = set(self.jobDataCache.keys()) - newJobIds
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Found %d jobs pending to sites in drain within the grace period", countDrainingJobs)
        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def failJobDrain(self, timeNow, possibleLocations):
        """
        Check whether all of the job's possible sites have been in drain
        longer than the grace period, in which case the job has to be
        marked as failed.
        :param timeNow: timestamp for this cycle
        :param possibleLocations: list of possible locations where the job can run
        :return: a boolean saying whether the job has to fail or not
        """
        fail = True
        for siteName in set(possibleLocations).intersection(self.drainSitesSet):
            if timeNow - self.drainSites[siteName] < self.drainGracePeriod:
                # then let this job be, it's a fresh draining site
                fail = False
                break
        return fail
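    # A worked example of the check above, with hypothetical values: assume
    # drainGracePeriod = 7200 (2h) and drainSites = {'T2_XX_Site': 1000}, i.e.
    # T2_XX_Site entered drain at timestamp 1000. With timeNow = 5000 the site
    # has been draining for 4000s < 7200s, so failJobDrain() returns False and
    # the job is kept pending; once timeNow exceeds 8200 it returns True and
    # the job is failed with error code 71104.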

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['request_name'] in abortedAndForceCompleteRequests) and \
                    (jobInfo['task_type'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.jobsByPrio:
                if jobid in self.jobsByPrio[jobPrio]:
                    # then the jobid was found, go to the next one
                    self.jobsByPrio[jobPrio].discard(jobid)
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Form a default job report for the exitCode and register it in
        the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleSites']),
                                     ', '.join(job['possibleSites']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                         WM_JOB_ERROR_CODES[exitCode] + ', and empty fileLocations')

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])

            fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return
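    # E.g. a failed job with id=42 and retry_count=1 gets its report saved as
    # <cache_dir>/Report.1.pkl, its path recorded in WMBS via setFWJRPathAction,
    # and the job transitions from 'created' to 'submitfailed'.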

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve the submit thresholds, which take into account what is
        pending and running at each site.
        Also update the list of draining and aborted/down sites; a change
        in either list forces a full rebuild of the job cache.
        """
        # lets store also a timestamp for when a site joined the Drain state
        newDrainSites = dict()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.update({siteName: rcThresholds[siteName]["state_time"]})
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

        # When the list of drain/abort sites changes between iterations, a location
        # refresh is needed; for now this forces a full cache rebuild
        if set(newDrainSites.keys()) != self.drainSitesSet or newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.jobsByPrio = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites
        self.drainSitesSet = set(newDrainSites.keys())

        return
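    # Sketch of the thresholds structure consumed above and in
    # _getJobSubmitCondition() below; the site name and numbers are hypothetical:
    #
    #   self.currentRcThresholds = {
    #       'T1_XX_Fake': {'state': 'Normal', 'state_time': 1514764800,
    #                      'total_pending_slots': 1000, 'total_pending_jobs': 200,
    #                      'total_running_slots': 5000, 'total_running_jobs': 3000,
    #                      'thresholds': {'Processing': {'pending_slots': 500,
    #                                                    'task_pending_jobs': 100,
    #                                                    'max_slots': 2000,
    #                                                    'task_running_jobs': 1500,
    #                                                    'wf_highest_priority': 85000}}}}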

    def checkZeroTaskThresholds(self, jobType, siteList):
        """
        _checkZeroTaskThresholds_

        Given a job type and a list of sites, remove sites from the list
        if that site + task has 0 pending thresholds.
        Returns a new list of sites
        """
        newSiteList = []
        for site in siteList:
            try:
                taskPendingSlots = self.currentRcThresholds[site]['thresholds'][jobType]["pending_slots"]
            except KeyError as ex:
                msg = "Invalid key for site %s and job type %s. Error: %s" % (site, jobType, str(ex))
                logging.warning(msg)
            else:
                if taskPendingSlots > 0:
                    newSiteList.append(site)

        return newSiteList

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        Returns a string describing whether a job is ready to be submitted,
        or the reason it cannot be submitted.
        Only jobs with a "JobSubmitReady" return value will be added to the
        submit list. Other return values indicate the reason the job cannot
        be submitted, e.g. "NoPendingSlot" - the pending slots are already
        full with pending jobs.
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault("init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][jobType].setdefault("init_task_pending_jobs",
                                                                                 taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName]["init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
            msg += str(ex)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes):
            # there are no pending or running jobs in the system (None case), or
            # the job priority is lower than or equal to the highest priority in
            # the system, so don't allow overflow; overflow is also never
            # allowed for job types in ioboundTypes
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskThreshold = taskPendingSlots + taskRunningSlots
        else:
            # the priority of this job is higher than that of any currently
            # pending or running job, so increase the thresholds by
            # condorOverflowFraction of the original pending slots
            totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
                totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskThreshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{"Condition": "NoPendingSlot",
                     "Current": totalPendingJobs,
                     "Threshold": totalPendingThreshold},
                    {"Condition": "NoTaskPendingSlot",
                     "Current": taskPendingJobs,
                     "Threshold": taskPendingThreshold},
                    {"Condition": "NoRunningSlot",
                     "Current": totalPendingJobs + totalRunningJobs,
                     "Threshold": totalJobThreshold},
                    {"Condition": "NoTaskRunningSlot",
                     "Current": taskPendingJobs + taskRunningJobs,
                     "Threshold": totalTaskTheshold}]
        return jobSubmitCondition(jobStats)
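    # Worked example of the overflow branch above, with hypothetical numbers:
    # totalPendingSlots = 100, initTotalPending = 90 and
    # condorOverflowFraction = 0.2 give
    # totalPendingThreshold = max(100, 90) + 100 * 0.2 = 120, i.e. a job whose
    # priority beats every pending and running job may overflow the pending
    # threshold by 20%.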

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots. Returns a dictionary keyed by job package
        directory, where each value is the list of cached job dictionaries
        (each already assigned a site) to be submitted from that package.
        """
        jobsToSubmit = {}
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(lambda: defaultdict(Counter))
        jobSubmitLogByPriority = defaultdict(lambda: defaultdict(Counter))

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.jobsByPrio, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # can we assume jobid=1 is older than jobid=3? I think so...
            for jobid in sorted(self.jobsByPrio[jobPrio]):
                jobType = self.jobDataCache[jobid]['task_type']
                possibleSites = self.jobDataCache[jobid]['possibleSites']
                # remove sites with 0 task thresholds
                possibleSites = self.checkZeroTaskThresholds(jobType, possibleSites)
                jobSubmitLogByPriority[jobPrio][jobType]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    condition = self._getJobSubmitCondition(jobPrio, siteName, jobType)
                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][jobType][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName, condition)
                        continue

                    # pop the job dictionary object and update it
                    cachedJob = self.jobDataCache.pop(jobid)
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['possibleSites'] = possibleSites

                    # Sort jobs by jobPackage and get it in place to be submitted by the plugin
                    package = cachedJob['packageDir']
                    jobsToSubmit.setdefault(package, [])
                    jobsToSubmit[package].append(cachedJob)

                    # update site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1
                    jobsCount += 1
                    jobSubmitLogBySites[siteName][jobType]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio][jobType]['submitted'] += 1

                    # jobs that will be submitted must leave the job data cache
                    self.jobsByPrio[jobPrio].discard(jobid)

                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info("Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle)
                    exitLoop = True
                    break

        logging.info("Site submission report ...")
        for site in jobSubmitLogBySites:
            logging.info("    %s : %s", site, json.dumps(jobSubmitLogBySites[site]))
        logging.info("Priority submission report ...")
        for prio in jobSubmitLogByPriority:
            logging.info("    %s : %s", prio, json.dumps(jobSubmitLogByPriority[prio]))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')
        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if jobSite not in self.locationDict:
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
            if agentConfig.get("UserDrainMode") and agentConfig.get("SpeedDrainMode"):
                self.enableAllSites = agentConfig.get("SpeedDrainConfig")['EnableAllSites']['Enabled']
            else:
                self.enableAllSites = False
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get("CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True)
            self.getThresholds()
            if self.hasToRefreshCache():
                self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            # msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(dbi=myThread.dbi, logger=logging,
                                               condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
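
# A minimal standalone sketch (not part of the component above) of the order
# in which algorithm() drives one submission cycle; `config` is assumed to be
# a loaded WMAgent configuration object:
def runOneSubmitCycle(config):
    poller = JobSubmitterPoller(config)
    if not poller.passSubmitConditions():
        return  # no free schedd slots for this cycle
    poller.getThresholds()  # refresh site thresholds and drain/abort lists
    if poller.hasToRefreshCache():
        poller.refreshCache()  # pull new 'created' jobs from WMBS into the cache
    jobsToSubmit = poller.assignJobLocations()  # match cached jobs to open slots
    poller.submitJobs(jobsToSubmit=jobsToSubmit)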
Exemple #33
0
class ReqMgrService(TemplatedPage):
    """
    Request Manager web service class
    """

    def __init__(self, app, config, mount):
        self.base = config.base
        self.rootdir = '/'.join(WMCore.__file__.split('/')[:-1])
        if config and not isinstance(config, dict):
            web_config = config.dictionary_()
        elif config:  # a plain dictionary was passed in
            web_config = config
        else:
            web_config = {'base': self.base}
        TemplatedPage.__init__(self, web_config)
        imgdir = os.environ.get('RM_IMAGESPATH', os.getcwd() + '/images')
        self.imgdir = web_config.get('imgdir', imgdir)
        cssdir = os.environ.get('RM_CSSPATH', os.getcwd() + '/css')
        self.cssdir = web_config.get('cssdir', cssdir)
        jsdir = os.environ.get('RM_JSPATH', os.getcwd() + '/js')
        self.jsdir = web_config.get('jsdir', jsdir)
        spdir = os.environ.get('RM_SPECPATH', os.getcwd() + '/specs')
        self.spdir = web_config.get('spdir', spdir)
        # read scripts area and initialize data-ops scripts
        self.sdir = os.environ.get('RM_SCRIPTS', os.getcwd() + '/scripts')
        self.sdir = web_config.get('sdir', self.sdir)
        self.sdict_thr = web_config.get('sdict_thr', 600)  # a reasonable 10 min refresh interval
        self.sdict = {'ts': time.time()}  # placeholder for data-ops scripts
        self.update_scripts(force=True)

        # To be filled at run time
        self.cssmap = {}
        self.jsmap = {}
        self.imgmap = {}
        self.yuimap = {}

        std_specs_dir = os.path.join(self.rootdir, 'WMSpec/StdSpecs')
        self.std_specs = spec_list(std_specs_dir)
        self.std_specs.sort()

        # Update CherryPy configuration
        mime_types = ['text/css']
        mime_types += ['application/javascript', 'text/javascript',
                       'application/x-javascript', 'text/x-javascript']
        cherryconf.update({'tools.encode.on': True,
                           'tools.gzip.on': True,
                           'tools.gzip.mime_types': mime_types,
                           })
        self._cache = {}

        # initialize access to reqmgr2 APIs
        self.reqmgr_url = config.reqmgr.reqmgr2_url
        self.reqmgr = ReqMgr(self.reqmgr_url)
        # only fetch the current view; this might make response times much
        # longer, so if an up-to-date view is not needed, override with False
        self.reqmgr._noStale = True

        # get fields which we'll use in templates
        cdict = config.reqmgr.dictionary_()
        self.couch_url = cdict.get('couch_host', '')
        self.couch_dbname = cdict.get('couch_reqmgr_db', '')
        self.couch_wdbname = cdict.get('couch_workload_summary_db', '')
        self.acdc_url = cdict.get('acdc_host', '')
        self.acdc_dbname = cdict.get('acdc_db', '')
        self.configcache_url = cdict.get('couch_config_cache_url', self.couch_url)
        self.dbs_url = cdict.get('dbs_url', '')
        self.dqm_url = cdict.get('dqm_url', '')
        self.sw_ver = cdict.get('default_sw_version', 'CMSSW_7_6_1')
        self.sw_arch = cdict.get('default_sw_scramarch', 'slc6_amd64_gcc493')

        # LogDB holder
        centralurl = cdict.get("central_logdb_url", "")
        identifier = cdict.get("log_reporter", "reqmgr2")
        self.logdb = LogDB(centralurl, identifier)

        # local team cache which will request data from wmstats
        base, uri = self.reqmgr_url.split('://')
        base_url = '%s://%s' % (base, uri.split('/')[0])
        self.wmstatsurl = cdict.get('wmstats_url', '%s/wmstatsserver' % base_url)
        if not self.wmstatsurl:
            raise Exception('ReqMgr2 configuration file does not provide wmstats url')
        self.team_cache = []

        # fetch assignment arguments specification from StdBase
        self.assignArgs = StdBase().getWorkloadAssignArgs()
        self.assignArgs = {key: val['default'] for key, val in self.assignArgs.items()}

    def getTeams(self):
        "Helper function to get teams from wmstats or local cache"
        teams = self.team_cache
        url = '%s/data/teams' % self.wmstatsurl
        params = {}
        headers = {'Accept': 'application/json'}
        try:
            data = getdata(url, params, headers)
            if 'error' in data:
                print("WARNING: fail to get teams from %s" % url)
                print(data)
            teams = data.get('result', [])
            self.team_cache = teams
        except Exception as exp:
            print("WARNING: fail to get teams from %s" % url)
            print(str(exp))
        return teams

    def update_scripts(self, force=False):
        "Update scripts dict"
        if force or abs(time.time() - self.sdict['ts']) > self.sdict_thr:
            for item in os.listdir(self.sdir):
                with open(os.path.join(self.sdir, item), 'r') as istream:
                    self.sdict[item.split('.')[0]] = istream.read()
            self.sdict['ts'] = time.time()

    def abs_page(self, tmpl, content):
        """generate abstract page"""
        menu = self.templatepage('menu', menus=menus(), tmpl=tmpl)
        body = self.templatepage('generic', menu=menu, content=content)
        page = self.templatepage('main', content=body, user=user())
        return page

    def page(self, content):
        """
        Provide page wrapped with top/bottom templates.
        """
        return self.templatepage('main', content=content)

    def error(self, content):
        "Generate common error page"
        content = self.templatepage('error', content=content)
        return self.abs_page('error', content)

    @expose
    def index(self):
        """Main page"""
        content = self.templatepage('index', requests=ACTIVE_STATUS, rdict=REQUEST_STATE_TRANSITION)
        return self.abs_page('main', content)

    @expose
    def home(self, **kwds):
        """Main page"""
        return self.index(**kwds)

    ### Request actions ###

    @expose
    @checkargs(['status', 'sort'])
    def assign(self, **kwds):
        """assign page"""
        if not kwds:
            kwds = {}
        if 'status' not in kwds:
            kwds.update({'status': 'assignment-approved'})
        docs = []
        attrs = ['RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus']
        dataResult = self.reqmgr.getRequestByStatus(statusList=[kwds['status']])
        for data in dataResult:
            for val in data.values():
                docs.append(request_attr(val, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        assignDict = deepcopy(self.assignArgs)
        assignDict.update(getPropValueMap())
        assignDict['Team'] = self.getTeams()
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('assign', sort=sortby,
                                    filter_sort_table=filter_sort,
                                    sites=SITE_CACHE.getData(),
                                    site_white_list=site_white_list(),
                                    site_black_list=site_black_list(),
                                    user=user(), user_dn=user_dn(), requests=toString(docs),
                                    misc_table=json2table(assignDict, web_ui_names(), "all_attributes"),
                                    misc_json=json2form(assignDict, indent=2, keep_first_value=True))
        return self.abs_page('assign', content)

    @expose
    @checkargs(['status', 'sort'])
    def approve(self, **kwds):
        """
        Approve page: get the list of requests associated with the user DN,
        fetch their status from ReqMgr, and display whether the requests
        have been seen by data-ops.
        """
        if not kwds:
            kwds = {}
        if 'status' not in kwds:
            kwds.update({'status': 'new'})
        kwds.update({'_nostale': True})
        docs = []
        attrs = ['RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus', 'Campaign']
        dataResult = self.reqmgr.getRequestByStatus(statusList=[kwds['status']])
        for data in dataResult:
            for val in data.values():
                docs.append(request_attr(val, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('approve', requests=toString(docs), date=tstamp(),
                                    sort=sortby, filter_sort_table=filter_sort,
                                    gen_color=gen_color)
        return self.abs_page('approve', content)

    @expose
    def create(self, **kwds):
        """create page"""
        # get list of standard specs from WMCore and new ones from local area
        # loc_specs_dir = os.path.join(self.spdir, 'Specs') # local specs
        # loc_specs = spec_list(loc_specs_dir, 'Specs')
        # all_specs = list(set(self.std_specs + loc_specs))
        # all_specs.sort()
        all_specs = list(self.std_specs)
        spec = kwds.get('form', '')
        if not spec:
            spec = self.std_specs[0]
        # make spec first in all_specs list
        if spec in all_specs:
            all_specs.remove(spec)
        all_specs = [spec] + all_specs
        jsondata = get_request_template_from_type(spec)
        # create templatized page out of provided forms
        self.update_scripts()
        content = self.templatepage('create', table=json2table(jsondata, web_ui_names(), jsondata),
                                    jsondata=json2form(jsondata, indent=2, keep_first_value=True), name=spec,
                                    scripts=[s for s in self.sdict.keys() if s != 'ts'],
                                    specs=all_specs)
        return self.abs_page('create', content)

    def generate_objs(self, script, jsondict):
        """Generate objects from givem JSON template"""
        self.update_scripts()
        code = self.sdict.get(script, '')
        if code.find('def genobjs(jsondict)') == -1:
            return self.error(
                "Improper python snippet, your code should start with a <b>def genobjs(jsondict)</b> function")
        exec(code)  # the code snippet must start with a genobjs function
        return [r for r in genobjs(jsondict)]
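    # A valid (hypothetical) script body stored under self.sdir could be:
    #
    #   def genobjs(jsondict):
    #       for key, val in jsondict.items():
    #           yield {'name': key, 'value': val}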

    @expose
    def config(self, name):
        "Fetch config for given request name"
        result = self.reqmgr.getConfig(name)
        if len(result) == 1:
            result = result[0]
        else:
            result = 'Configuration not found for: %s' % name
        return result.replace('\n', '<br/>')

    @expose
    def fetch(self, rid):
        "Fetch document for given id"
        rid = rid.replace('request-', '')
        doc = self.reqmgr.getRequestByNames(rid)
        transitions = []
        tst = time.time()
        # get request tasks
        tasks = self.reqmgr.getRequestTasks(rid)
        if len(doc) == 1:
            try:
                doc = doc[0][rid]
            except (KeyError, IndexError):
                pass
            name = doc.get('RequestName', 'NA')
            title = 'Request %s' % name
            status = doc.get('RequestStatus', '')
            transitions = REQUEST_STATE_TRANSITION.get(status, [])
            if status in transitions:
                transitions.remove(status)
            visible_attrs = get_modifiable_properties(status)
            filterout_attrs = get_protected_properties()
            # extend filterout list with "RequestStatus" since it is passed separately
            filterout_attrs.append("RequestStatus")

            for key, val in self.assignArgs.items():
                if not doc.get(key):
                    doc[key] = val

            if visible_attrs == "all_attributes":
                filteredDoc = doc
                for prop in filterout_attrs:
                    if prop in filteredDoc:
                        del filteredDoc[prop]
            else:
                filteredDoc = {}
                for prop in visible_attrs:
                    filteredDoc[prop] = doc.get(prop, "")

            propValueMap = getPropValueMap()
            propValueMap['Team'] = self.getTeams()

            selected = {}
            for prop in propValueMap:
                if prop in filteredDoc:
                    filteredDoc[prop], selected[prop] = reorder_list(propValueMap[prop], filteredDoc[prop])

            content = self.templatepage('doc', title=title, status=status, name=name, rid=rid,
                                        tasks=json2form(tasks, indent=2, keep_first_value=False),
                                        table=json2table(filteredDoc, web_ui_names(), visible_attrs, selected),
                                        jsondata=json2form(doc, indent=2, keep_first_value=False),
                                        doc=json.dumps(doc), time=time,
                                        tasksConfigs=tasks_configs(doc, html=True),
                                        sTransition=state_transition(doc),
                                        pTransition=priority_transition(doc),
                                        transitions=transitions, humanStates=REQUEST_HUMAN_STATES,
                                        ts=tst, user=user(), userdn=user_dn())
        elif len(doc) > 1:
            jsondata = [pprint.pformat(d) for d in doc]
            content = self.templatepage('doc', title='Series of docs: %s' % rid,
                                        table="", jsondata=jsondata, time=time,
                                        tasksConfigs=tasks_configs(doc, html=True),
                                        sTransition=state_transition(doc),
                                        pTransition=priority_transition(doc),
                                        transitions=transitions, humanStates=REQUEST_HUMAN_STATES,
                                        ts=tst, user=user(), userdn=user_dn())
        else:
            content = 'No request found for name=%s' % rid
        return self.abs_page('request', content)

    @expose
    def record2logdb(self, **kwds):
        """LogDB submission page"""
        print(kwds)
        request = kwds['request']
        msg = kwds['message']
        self.logdb.post(request, msg)
        msg = '<h6>Confirmation</h6>Your request has been entered to LogDB.'
        return self.abs_page('generic', msg)

    @expose
    def requests(self, **kwds):
        """Page showing requests"""
        if not kwds:
            kwds = {}
        if 'status' not in kwds:
            kwds.update({'status': 'acquired'})
        dataResult = self.reqmgr.getRequestByStatus(kwds['status'])
        attrs = ['RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus', 'Campaign']
        docs = []
        for data in dataResult:
            for doc in data.values():
                docs.append(request_attr(doc, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('requests', requests=toString(docs), sort=sortby,
                                    status=kwds['status'], filter_sort_table=filter_sort)
        return self.abs_page('requests', content)

    @expose
    def request(self, **kwargs):
        "Get data example and expose it as json"
        dataset = kwargs.get('uinput', '')
        if not dataset:
            return {'error': 'no input dataset'}
        url = 'https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s' % dataset
        params = {}
        headers = {'Accept': 'application/json'}
        wdata = getdata(url, params, headers)
        wdict = dict(date=time.ctime(), team='Team-A', status='Running', ID=genid(wdata))
        winfo = self.templatepage('workflow', wdict=wdict,
                                  dataset=dataset, code=pprint.pformat(wdata))
        content = self.templatepage('search', content=winfo)
        return self.abs_page('request', content)

    @expose
    def batch(self, **kwds):
        """batch page"""
        # TODO: we need a template for batch attributes
        #       and read it from separate area, like DASMaps
        name = kwds.get('name', '')
        batch = {}
        if name:
            #            batch = self.reqmgr.getBatchesByName(name)
            batch = {'Name': 'Batch1', 'Description': 'Bla-bla', 'Creator': 'valya', 'Group': 'test',
                     'Workflows': ['workflow1', 'workflow2'],
                     'Attributes': {'HeavyIon': ['true', 'false']}}
        attributes = batch.get('Attributes', {})
        workflows = batch.get('Workflows', [])
        description = batch.get('Description', '')
        creator = batch.get('Creator', user_dn())
        content = self.templatepage('batch', name=name,
                                    attributes=json2table(attributes, web_ui_names()),
                                    workflows=workflows, creator=creator,
                                    description=description)
        return self.abs_page('batch', content)

    @expose
    def batches(self, **kwds):
        """Page showing batches"""
        if not kwds:
            kwds = {}
        if 'name' not in kwds:
            kwds.update({'name': ''})
        sortby = kwds.get('sort', 'name')
        #        results = self.reqmgr.getBatchesByName(kwds['name'])
        results = [
            {'Name': 'Batch1', 'Description': 'Bla-bla', 'Creator': 'valya', 'Group': 'test',
             'Workflows': ['workflow1', 'workflow2'],
             'Date': 'Fri Feb 13 10:36:41 EST 2015',
             'Attributes': {'HeavyIon': ['true', 'false']}},
            {'Name': 'Batch2', 'Description': 'lksdjflksjdf', 'Creator': 'valya', 'Group': 'test',
             'Workflows': ['workflow1', 'workflow2'],
             'Date': 'Fri Feb 10 10:36:41 EST 2015',
             'Attributes': {'HeavyIon': ['true', 'false']}},
        ]
        docs = [r for r in sort(results, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('batches', batches=docs, sort=sortby,
                                    filter_sort_table=filter_sort)
        return self.abs_page('batches', content)

    ### Aux methods ###

    @expose
    def put_request(self, **kwds):
        "PUT request callback to reqmgr server, should be used in AJAX"
        reqname = kwds.get('RequestName', '')
        status = kwds.get('RequestStatus', '')
        if not reqname:
            msg = 'Unable to update request status, empty request name'
            raise cherrypy.HTTPError(406, msg)
        if not status:
            msg = 'Unable to update request status, empty status value'
            raise cherrypy.HTTPError(406, msg)
        return self.reqmgr.updateRequestStatus(reqname, status)

    @expose
    def images(self, *args):
        """
        Serve static images.
        """
        args = list(args)
        check_scripts(args, self.imgmap, self.imgdir)
        mime_types = ['*/*', 'image/gif', 'image/png',
                      'image/jpg', 'image/jpeg']
        accepts = cherrypy.request.headers.elements('Accept')
        for accept in accepts:
            if accept.value in mime_types and len(args) == 1 \
                    and args[0] in self.imgmap:
                image = self.imgmap[args[0]]
                # use image extension to pass correct content type
                ctype = 'image/%s' % image.split('.')[-1]
                cherrypy.response.headers['Content-type'] = ctype
                return serve_file(image, content_type=ctype)

    def serve(self, kwds, imap, idir, datatype='', minimize=False):
        "Serve files for high level APIs (yui/css/js)"
        args = []
        for key, val in kwds.items():
            if key == 'f':  # we only look-up files from given kwds dict
                if isinstance(val, list):
                    args += val
                else:
                    args.append(val)
        scripts = check_scripts(args, imap, idir)
        return self.serve_files(args, scripts, imap, datatype, minimize)

    @exposecss
    @tools.gzip()
    def css(self, **kwargs):
        """
        Serve provided CSS files. They can be passed as
        f=file1.css&f=file2.css
        """
        resource = kwargs.get('resource', 'css')
        if resource == 'css':
            return self.serve(kwargs, self.cssmap, self.cssdir, 'css', True)

    @exposejs
    @tools.gzip()
    def js(self, **kwargs):
        """
        Serve provided JS scripts. They can be passed as
        f=file1.js&f=file2.js with an optional resource parameter
        to specify the type of JS files, e.g. resource=yui.
        """
        resource = kwargs.get('resource', 'js')
        if resource == 'js':
            return self.serve(kwargs, self.jsmap, self.jsdir)

    def serve_files(self, args, scripts, resource, datatype='', minimize=False):
        """
        Return asked set of files for JS, YUI, CSS.
        """
        idx = "-".join(scripts)
        if idx not in self._cache:
            data = ''
            if datatype == 'css':
                data = '@CHARSET "UTF-8";'
            for script in args:
                path = os.path.join(sys.path[0], resource[script])
                path = os.path.normpath(path)
                with open(path) as ifile:
                    data = "\n".join([data, ifile.read().replace('@CHARSET "UTF-8";', '')])
            if datatype == 'css':
                set_headers("text/css")
            if minimize:
                self._cache[idx] = minify(data)
            else:
                self._cache[idx] = data
        return self._cache[idx]
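    # For instance, a request like GET /css?f=a.css&f=b.css (hypothetical file
    # names) resolves both names through self.cssmap, concatenates them under
    # a single @CHARSET header, minifies the result and caches it under the
    # key "a.css-b.css", so subsequent identical requests are served from memory.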
Exemple #35
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 case - just for clarity (this private variable shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a sandbox directory, figure out which PackageCollection
        the next job package should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk once they contain packageSize
        jobs (500 by default).  The flushJobPackages() method must be called
        after all jobs have been added to the cache and before they are
        actually submitted, to make sure all the job packages have been
        written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any job packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return
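
    # Minimal usage sketch (the 'loadedJobs' iterable is hypothetical, not
    # from the original source): every job is funneled through
    # addJobsToPackage(), which persists a package each time packageSize jobs
    # accumulate, and a final flushJobPackages() writes out any partially
    # filled packages:
    #
    #   for loadedJob in loadedJobs:
    #       batchDir = self.addJobsToPackage(loadedJob)
    #   self.flushJobPackages()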

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a job dictionary carrying, among other
        things:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sandbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs",
                     len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info("Skipping cache update this cycle (%s jobs in cache)", len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # check whether newJob belongs to an aborted or force-completed workflow, and skip it if so
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount,
                             len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s",
                              pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if nonAbortSites:  # if there is at least one non-aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # this is skipped for IO-bound job types (Merge, LogCollect, Cleanup, Harvesting)
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if nonDrainingSites:  # if there is at least one viable non-draining site, remove the draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'],
                                               0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'id': jobID,
                       'requestName': newJob['request_name'],
                       'taskName': newJob['task_name'],
                       'taskType': newJob['type'],
                       'cache_dir': newJob["cache_dir"],
                       'priority': newJob['wf_priority'],
                       'taskID': newJob['task_id'],
                       'retry_count': newJob["retry_count"],
                       'taskPriority': None,                                # update from the thresholds
                       'custom': {'location': None},                        # update later
                       'packageDir': batchDir,
                       'sandbox': loadedJob["sandbox"],                     # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),       # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),     # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", None),
                       'name': loadedJob["name"],
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code : %d",
                    badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return
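
    # The four submit-failure codes collected above (descriptions paraphrased
    # from the checks in refreshCache(); the exact messages live in
    # WM_JOB_ERROR_CODES):
    #   71101 - job has no possible site at all (empty possiblePSN)
    #   71102 - every possible site is Aborted/Down
    #   71103 - pickled job object missing from the cache directory
    #   71104 - every possible site is Draining (non IO-bound job types only)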

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Create a default job report for the exitCode
        and register it in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WM_JOB_ERROR_CODES[exitCode] +
                    ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError(
                        "JobSubmit", exitCode, "SubmitFailed",
                        WM_JOB_ERROR_CODES[exitCode] + ": file locations: " +
                        ', '.join(job['fileLocations']) +
                        ": site white list: " +
                        ', '.join(job['siteWhitelist']) +
                        ": site black list: " +
                        ', '.join(job['siteBlacklist']))

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which consider what is pending and running
        at those sites.
        Also update the list of draining and aborted/down sites.
        Finally, create a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items(
                ):
                    self.taskTypePrioMap[task] = value.get(
                        'priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return
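
    # Priority-map sketch (illustrative numbers, not from the original
    # source): with maxTaskPriority = 1e7 and a 'Merge' threshold priority
    # of 5, taskTypePrioMap['Merge'] = 5e7.  refreshCache() later adds the
    # workflow priority on top, so the task type dominates the ordering of
    # cached jobs while wf_priority breaks ties within a task type (assuming
    # workflow priorities stay below maxTaskPriority).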

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        Returns a string describing whether a job is ready to be submitted,
        or the reason it cannot be submitted.
        Only jobs with a "JobSubmitReady" return value will be submitted.
        Other return values indicate the reason the job cannot be submitted,
        e.g. "NoPendingSlot" - the pending slots are already full of pending jobs
        try:
            totalPendingSlots = self.currentRcThresholds[siteName][
                "total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName][
                "total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName][
                "total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName][
                "total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName][
                'thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault(
                "init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][
                jobType].setdefault("init_task_pending_jobs", taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName][
                "init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName,
                                                                 jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or \
           (jobType in self.ioboundTypes):
            # there are no pending or running jobs in the system (None case),
            # or the job's priority is lower or equal, so don't allow overflow.
            # Also don't allow overflow if jobType is in ioboundTypes.
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskThreshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of currently pending or running jobs.
            # Then increase the threshold by condorOverflowFraction * original pending slot.
            totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
                totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskThreshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{
            "Condition": "NoPendingSlot",
            "Current": totalPendingJobs,
            "Threshold": totalPendingThreshold
        }, {
            "Condition": "NoTaskPendingSlot",
            "Current": taskPendingJobs,
            "Threshold": taskPendingThreshold
        }, {
            "Condition": "NoRunningSlot",
            "Current": totalPendingJobs + totalRunningJobs,
            "Threshold": totalJobThreshold
        }, {
            "Condition": "NoTaskRunningSlot",
            "Current": taskPendingJobs + taskRunningJobs,
            "Threshold": totalTaskTheshold
        }]
        return jobSubmitCondition(jobStats)
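
    # Worked example of the overflow branch above (illustrative numbers, not
    # from the original source): with totalPendingSlots = 100,
    # init_total_pending_jobs = 80 and condorOverflowFraction = 0.2, a job
    # whose priority beats everything pending/running at the site gets
    #   totalPendingThreshold = max(100, 80) + 100 * 0.2 = 120
    # i.e. up to 20% extra pending slots may be consumed by high-priority work.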

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  This returns a dictionary keyed by job
        package directory, where each value is the list of job dictionaries
        to be submitted from that package.
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(),
                              key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warning("Have a job for %s which is not in the resource control", siteName)
                        continue

                    condition = self._getJobSubmitCondition(
                        jobPrio, siteName, jobType)

                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName,
                                      condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName][
                        "total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType][
                        "task_pending_jobs"] += 1

                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit:
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[
                        siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info(
                        "Submitter reached limit of submit slots for this cycle: %i",
                        self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s",
                     dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit
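
    # Shape of the returned structure (paths are hypothetical):
    #   {'<sandboxDir>/PackageCollection_0/batch_1234-0': [cachedJob, ...]}
    # keyed by package directory, each value a list of job dictionaries that
    # submitJobs() hands to the BossAir plugin.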

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job[
                    'site_cms_name'] = self.getSiteInfo(
                        job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.",
                     len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return
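
    # State flow applied above via ChangeState.propagate():
    #   created -> executing      (job accepted by the BossAir submit)
    #   created -> submitfailed   (job rejected at submission)
    # Locations are recorded for both outcomes, since a bad site choice can
    # itself be part of the failure reason.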

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if jobSite not in self.locationDict:
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get(
                "CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg,
                                      "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork",
                                        "warning",
                                        this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(
            dbi=myThread.dbi,
            logger=logging,
            condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)
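
    # e.g. with 5000 free schedd slots and maxJobsPerPoll = 1000 (both
    # illustrative numbers), maxJobsThisCycle = min(5000, 1000) = 1000;
    # when availableScheddSlots() reports nothing free, algorithm() skips
    # the whole submit cycle.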

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)


        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
        else:
            # Tier0 case - kept just for clarity (this attribute shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a sandbox directory, figure out which PackageCollection
        the job should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch directory for
        the job.  Packages are only written out to disk once they contain
        packageSize jobs (500 by default).  The flushJobPackages() method
        must be called after all jobs have been added to the cache and before
        they are actually submitted, to make sure all job packages have been
        written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any job packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a job dictionary carrying, among other
        things:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sandbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # check whether newJob belongs to an aborted or force-completed workflow, and skip it if so
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites: # if there is at least one non-aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect, Cleanup or Harvesting this is skipped
            if newJob['type'] not in ('LogCollect', 'Merge', 'Cleanup', 'Harvesting'):
                nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                if nonDrainingSites: # if there is at least one viable non-draining site, remove the draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'id': jobID,
                       'requestName': newJob['request_name'],
                       'taskName': newJob['task_name'],
                       'taskType': newJob['type'],
                       'cache_dir': newJob["cache_dir"],
                       'priority': newJob['wf_priority'],
                       'taskID': newJob['task_id'],
                       'retry_count': newJob["retry_count"],
                       'taskPriority': None,                                # update from the thresholds
                       'custom': {'location': None},                        # update later
                       'packageDir': batchDir,
                       'sandbox': loadedJob["sandbox"],                     # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),       # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),     # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", None),
                       'name': loadedJob["name"],
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Create a default job report for the exitCode
        and register it in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    # This is a temporary addition in case this is patched onto an existing agent.
                    # If jobs were created before the patch was applied, fileLocations is not set.
                    # TODO: remove this later for new agents
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": Job is created before this patch. Please check this input for the jobs: %s " %
                                         job['fwjr'].getAllInputFiles())

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which consider what is pending and running
        at those sites.
        Also update the list of draining and aborted/down sites.
        Finally, create a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items():
                    self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return


    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  This returns a dictionary keyed by job
        package directory, where each value is the list of job dictionaries
        to be submitted from that package.
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warning("Have a job for %s which is not in the resource control", siteName)
                        continue

                    try:
                        totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
                        totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
                        totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
                        totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

                        taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
                        taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
                        taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
                        taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
                        taskPriority = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"]
                    except KeyError as ex:
                        msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
                        msg += str(ex)
                        logging.error(msg)
                        continue

                    # check if site has free pending slots AND free pending task slots
                    if totalPendingJobs >= totalPendingSlots or taskPendingJobs >= taskPendingSlots:
                        jobSubmitLogBySites[siteName]["NoPendingSlot"] += 1
                        logging.debug("Found a job for %s which has no free pending slots", siteName)
                        continue
                    # check if site overall thresholds have free slots
                    if totalPendingJobs + totalRunningJobs >= totalPendingSlots + totalRunningSlots:
                        jobSubmitLogBySites[siteName]["NoRunningSlot"] += 1
                        logging.debug("Found a job for %s which has no free overall slots", siteName)
                        continue
                    # finally, check whether task has free overall slots
                    if taskPendingJobs + taskRunningJobs >= taskPendingSlots + taskRunningSlots:
                        jobSubmitLogBySites[siteName]["NoTaskSlot"] += 1
                        logging.debug("Found a job for %s which has no free task slots", siteName)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1
                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit:
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = taskPriority

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsPerPoll:
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit
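
The loop above applies three successive gates before a job claims a slot: free pending capacity at the site, free overall site capacity, and free per-task capacity. A minimal standalone sketch of the same gates, assuming a thresholds dictionary shaped like self.currentRcThresholds (the numbers below are made up for illustration):

def siteHasFreeSlots(site, jobType):
    """Mirror the pending/overall/task checks above; return (ok, reason)."""
    task = site['thresholds'][jobType]
    if (site['total_pending_jobs'] >= site['total_pending_slots'] or
            task['task_pending_jobs'] >= task['pending_slots']):
        return False, "NoPendingSlot"
    if (site['total_pending_jobs'] + site['total_running_jobs'] >=
            site['total_pending_slots'] + site['total_running_slots']):
        return False, "NoRunningSlot"
    if (task['task_pending_jobs'] + task['task_running_jobs'] >=
            task['pending_slots'] + task['max_slots']):
        return False, "NoTaskSlot"
    return True, "submitted"

siteData = {'total_pending_slots': 100, 'total_pending_jobs': 80,
            'total_running_slots': 500, 'total_running_jobs': 450,
            'thresholds': {'Processing': {'pending_slots': 50,
                                          'task_pending_jobs': 10,
                                          'max_slots': 200,
                                          'task_running_jobs': 150}}}
print(siteHasFreeSlots(siteData, 'Processing'))  # (True, 'submitted')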


    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return


    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if jobSite not in self.locationDict:
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """

        try:
            myThread = threading.currentThread()
            self.getThresholds()
            self.refreshCache()

            if self.useReqMgrForCompletionCheck:
                # only runs when reqmgr is used (not Tier0)
                self.removeAbortedForceCompletedWorkflowFromCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return
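
The try/except structure in algorithm() follows a common transaction pattern: roll back on any failure, re-raise WMExceptions untouched, and wrap anything else in a component-specific exception. The rollback half of that pattern in isolation, as a runnable sketch where an in-memory sqlite connection stands in for the WMBS database:

import sqlite3

def runInTransaction(conn, work):
    """Run work(conn) and commit; roll back and re-raise on any failure."""
    try:
        result = work(conn)
        conn.commit()
        return result
    except Exception:
        conn.rollback()
        raise

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE jobs (id INTEGER)')
runInTransaction(conn, lambda c: c.execute('INSERT INTO jobs VALUES (1)'))
print(conn.execute('SELECT COUNT(*) FROM jobs').fetchone())  # (1,)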



    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)


class WorkQueueReqMgrInterface(object):
    """Helper class for ReqMgr interaction"""
    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # this will break the all-in-one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")

        # set the thread name before creating the log db;
        # only set it when it is not already set
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)

        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        msg = ''
        try:  # pull in new work
            self.logger.info("queueing new work")
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception as ex:
            errorMsg = "Error caught during RequestManager pull"
            self.logger.exception("%s: %s", errorMsg, str(ex))

        try:  # get additional open-running work
            self.logger.info("adding new element to open requests")
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception as ex:
            errorMsg = "Error caught during RequestManager split"
            self.logger.exception("%s: %s", errorMsg, str(ex))

        try:  # report back to ReqMgr
            self.logger.info("cancel aborted requests")
            count = self.cancelWork(queue)
            self.logger.info("finised canceling requests")
            msg += "Work canceled: %s " % count
        except Exception as ex:
            errorMsg = "Error caught during canceling the request"
            self.logger.exception("%s: %s", errorMsg, str(ex))

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        try:
            workLoads = self.getAvailableRequests()
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check it's not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None,
                            "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" %
                                 (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl,
                                        request=reqName,
                                        team=team)
                self.logdb.delete(reqName,
                                  "error",
                                  this_thread=True,
                                  agent=False)
            except TERMINAL_EXCEPTIONS as ex:
                # fatal error - report back to ReqMgr
                self.logger.error(
                    'Permanent failure processing request "%s": %s' %
                    (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" %
                                 reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError,
                    CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log the exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) obtained from RequestManager" % work)
        return work
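
queueNewRequests accepts a workload location that is either a Couch URL or a local file path. That validation, reduced to a standalone helper (a toy validator stands in for WMCore's Lexicon.couchurl used above):

import os

def validWorkloadUrl(url, validator):
    """Accept the URL if the validator passes, or if it is an existing
    local file (mirrors the Lexicon.couchurl/os.path.exists logic above)."""
    try:
        validator(url)
        return True
    except Exception:
        return os.path.exists(url)

def couchish(url):
    # toy stand-in for Lexicon.couchurl
    assert url.startswith('http'), 'not a couch url'

print(validWorkloadUrl('http://couch.example.com:5984/db/doc', couchish))  # True
print(validWorkloadUrl('/no/such/file', couchish))                         # False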

    def cancelWork(self, queue):
        requests = self.reqMgr2.getRequestByStatus(
            ['aborted', 'force-complete'], detail=False)
        count = 0
        for req in requests:
            try:
                queue.cancelWork(req)
                count += 1
            except Exception as ex:
                msg = 'Error to cancel the request "%s": %s' % (req, str(ex))
                self.logger.exception(msg)
        return count

    def report(self, queue):
        """Report queue status to ReqMgr."""
        new_state = {}
        uptodate_elements = []
        now = time.time()

        elements = queue.statusInbox(dictKey="RequestName")
        if not elements:
            return new_state

        for ele in elements:
            ele = elements[ele][0]  # 1 element tuple
            try:
                request = self.reqMgr2.getRequestByNames(ele['RequestName'])
                if not request:
                    msg = ('Failed to get request "%s" from ReqMgr2. '
                           'Will try again later.' % ele['RequestName'])
                    self.logger.warning(msg)
                    continue
                request = request[0][ele['RequestName']]
                if request['RequestStatus'] in ('failed', 'completed',
                                                'announced', 'closed-out',
                                                'rejected'):
                    # requests can be done in reqmgr but running in workqueue
                    # if request has been closed but agent cleanup actions
                    # haven't been run (or agent has been retired)
                    # Prune out obviously too old ones to avoid build up
                    if queue.params.get('reqmgrCompleteGraceTime', -1) > 0:
                        if (now - float(ele.updatetime)
                            ) > queue.params['reqmgrCompleteGraceTime']:
                            # have to check all elements are at least running and are old enough
                            request_elements = queue.statusInbox(
                                WorkflowName=request['RequestName'])
                            if not any([
                                    x for x in request_elements
                                    if x['Status'] != 'Running'
                                    and not x.inEndState()
                            ]):
                                last_update = max([
                                    float(x.updatetime)
                                    for x in request_elements
                                ])
                                if (
                                        now - last_update
                                ) > queue.params['reqmgrCompleteGraceTime']:
                                    self.logger.info(
                                        "Finishing request %s as it is done in reqmgr"
                                        % request['RequestName'])
                                    queue.doneWork(
                                        WorkflowName=request['RequestName'])
                                    continue
                    else:
                        pass  # assume workqueue status will catch up later
                elif request['RequestStatus'] in ['aborted', 'force-complete']:
                    queue.cancelWork(WorkflowName=request['RequestName'])
                # Check consistency of running-open/closed and the element closure status
                elif request['RequestStatus'] == 'running-open' and not ele.get(
                        'OpenForNewData', False):
                    self.reportRequestStatus(ele['RequestName'],
                                             'running-closed')
                elif request['RequestStatus'] == 'running-closed' and ele.get(
                        'OpenForNewData', False):
                    queue.closeWork(ele['RequestName'])
                # we do not want to move the request to 'failed' status
                elif ele['Status'] == 'Failed':
                    continue
                elif ele['Status'] not in self._reqMgrToWorkQueueStatus(
                        request['RequestStatus']):
                    self.reportElement(ele)

                uptodate_elements.append(ele)
            except Exception as ex:
                msg = 'Error talking to ReqMgr about request "%s": %s' % (
                    ele['RequestName'], str(ex))
                self.logger.exception(msg)

        return uptodate_elements

    def deleteFinishedWork(self, queue, elements):
        """Delete work from queue that is finished in ReqMgr"""
        finished = []
        for element in elements:
            if element.inEndState():
                finished.append(element['RequestName'])
        return queue.deleteWorkflows(*finished)
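
deleteFinishedWork simply filters for elements already in an end state before deleting. The same filtering in isolation, with a tiny stand-in for WorkQueue elements (the end-state statuses listed here are an assumption for illustration):

class FakeElement(dict):
    def inEndState(self):
        return self['Status'] in ('Done', 'Canceled', 'Failed')

elements = [FakeElement(RequestName='wf1', Status='Done'),
            FakeElement(RequestName='wf2', Status='Running')]
finished = [e['RequestName'] for e in elements if e.inEndState()]
print(finished)  # ['wf1']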

    def getAvailableRequests(self):
        """
        Get available requests and sort by team and priority
        returns [(team, request_name, request_spec_url)]
        """
        tempResults = self.reqMgr2.getRequestByStatus("staged")
        filteredResults = []
        for requests in tempResults:
            for request in requests.values():
                filteredResults.append(request)
        filteredResults.sort(key=itemgetter('RequestPriority'), reverse=True)
        filteredResults.sort(key=lambda r: r["Team"])

        results = [(x["Team"], x["RequestName"], x["RequestWorkflow"])
                   for x in filteredResults]

        return results
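
getAvailableRequests relies on Python's stable sort: sorting by priority (descending) first and by team second keeps requests priority-ordered within each team. A quick demonstration with made-up requests:

from operator import itemgetter

reqs = [{'Team': 'B', 'RequestName': 'r1', 'RequestPriority': 10},
        {'Team': 'A', 'RequestName': 'r2', 'RequestPriority': 50},
        {'Team': 'A', 'RequestName': 'r3', 'RequestPriority': 90},
        {'Team': 'B', 'RequestName': 'r4', 'RequestPriority': 70}]
reqs.sort(key=itemgetter('RequestPriority'), reverse=True)
reqs.sort(key=lambda r: r['Team'])
print([(r['Team'], r['RequestName']) for r in reqs])
# [('A', 'r3'), ('A', 'r2'), ('B', 'r4'), ('B', 'r1')]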

    def reportRequestStatus(self, request, status, message=None):
        """Change state in RequestManager
           Optionally, take a message to append to the request
        """
        if message:
            logType = "error" if status == "Failed" else "info"
            self.logdb.post(request, str(message), logType)
        reqmgrStatus = self._workQueueToReqMgrStatus(status)

        if reqmgrStatus:  # only send known states
            try:
                self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
            except Exception as ex:
                msg = "%s : fail to update status will try later: %s" % (
                    request, str(ex))
                msg += traceback.format_exc()
                self.logdb.post(request, msg, 'warning')
                raise
        return

    def _workQueueToReqMgrStatus(self, status):
        """Map WorkQueue Status to that reported to ReqMgr"""
        statusMapping = {
            'Acquired': 'acquired',
            'Running': 'running-open',
            'Failed': 'failed',
            'Canceled': 'aborted',
            'CancelRequested': 'aborted',
            'Done': 'completed'
        }
        if status in statusMapping:
            # if wq status passed convert to reqmgr status
            return statusMapping[status]
        elif status in REQUEST_STATE_LIST:
            # if reqmgr status passed return reqmgr status
            return status
        else:
            # unknown status
            return None

    def _reqMgrToWorkQueueStatus(self, status):
        """Map ReqMgr status to that in a WorkQueue element, it is not a 1-1 relation"""
        statusMapping = {
            'acquired': ['Acquired'],
            'running': ['Running'],
            'running-open': ['Running'],
            'running-closed': ['Running'],
            'failed': ['Failed'],
            'aborted': ['Canceled', 'CancelRequested'],
            'force-complete': ['Canceled', 'CancelRequested'],
            'completed': ['Done']
        }
        if status in statusMapping:
            return statusMapping[status]
        else:
            return []
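
The two mappings above are intentionally asymmetric: several WorkQueue states collapse onto a single ReqMgr state, while one ReqMgr state expands to a list of acceptable WorkQueue states. A quick illustration with a subset of the same tables:

wqToReqMgr = {'Canceled': 'aborted', 'CancelRequested': 'aborted', 'Done': 'completed'}
reqMgrToWq = {'aborted': ['Canceled', 'CancelRequested'], 'completed': ['Done']}

print(wqToReqMgr['CancelRequested'])           # 'aborted'
print(reqMgrToWq[wqToReqMgr['Canceled']])      # ['Canceled', 'CancelRequested']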

    def reportElement(self, element):
        """Report element to ReqMgr"""
        self.reportRequestStatus(element['RequestName'], element['Status'])

    def addNewElementsToOpenRequests(self, queue):
        """Add new elements to open requests which are in running-open state, only works adding new blocks from the input dataset"""
        self.logger.info(
            "Checking Request Manager for open requests and closing old ones")

        work = 0
        requests = []

        try:
            requests = self.reqMgr2.getRequestByStatus("running-open",
                                                       detail=False)
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for reqName in requests:
            try:
                self.logger.info("Processing request %s" % (reqName))
                units = queue.addWork(requestName=reqName)
                self.logdb.delete(reqName, 'error', True, agent=False)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - but at least it was split the first time. Log and skip.
                msg = 'Error adding further work to request "%s". Will try again later' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except (IOError, socket.error, CouchError,
                    CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log the exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                traceMsg = traceback.format_exc()
                msg = "%s\n%s" % (msg, traceMsg)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) added to open requests" % work)
        return work
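
Note how __call__ above wraps each synchronization phase in its own try/except, so a failure in one phase (say, pulling new work) does not stop the later phases (adding to open requests, canceling). That error-isolation pattern in isolation, as a runnable sketch:

import logging

def runPhases(phases):
    """Run each (name, func) phase independently; log failures and keep going."""
    summary = []
    for name, func in phases:
        try:
            summary.append('%s: %s' % (name, func()))
        except Exception:
            logging.exception('Error caught during %s', name)
    return '\n'.join(summary)

print(runPhases([('new work', lambda: 3),
                 ('open requests', lambda: 1 / 0),  # fails, cycle continues
                 ('cancel', lambda: 2)]))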
Exemple #38
0
class ReqMgrTest(RESTBaseUnitTestWithDBBackend):
    """
    Test WorkQueue Service client
    It will start WorkQueue RESTService
    Server DB sets from environment variable.
    Client DB sets from environment variable.

    This checks that the DS calls complete without error and return results,
    not the correctness of the functions; that is tested in a different module.
    """

    def setFakeDN(self):
        # put the scram/cmssw info into the ReqMgr auxiliary database under
        # the "software" document, which we'll need later for request injection
        # Warning: this assumes the same structure in jenkins wmcore_root/test
        self.admin_header = getAuthHeader(self.test_authz_key.data, ADMIN_PERMISSION)
        self.create_header = getAuthHeader(self.test_authz_key.data, CREATE_PERMISSION)
        self.default_header = getAuthHeader(self.test_authz_key.data, DEFAULT_PERMISSION)
        self.assign_header = getAuthHeader(self.test_authz_key.data, ASSIGN_PERMISSION)
        self.default_status_header = getAuthHeader(self.test_authz_key.data, DEFAULT_STATUS_PERMISSION)
        
    def setUp(self):
        self.setConfig(config)
        self.setCouchDBs([(config.views.data.couch_reqmgr_db, "ReqMgr"), 
                          (config.views.data.couch_reqmgr_aux_db, None)])
        self.setSchemaModules([])
        
        RESTBaseUnitTestWithDBBackend.setUp(self)

        self.setFakeDN()
        
        requestPath = os.path.join(getWMBASE(), "test", "data", "ReqMgr", "requests", "DMWM")
        rerecoFile = open(os.path.join(requestPath, "ReReco.json"), 'r')
        
        rerecoArgs = JsonWrapper.load(rerecoFile)
        self.rerecoCreateArgs = rerecoArgs["createRequest"]
        self.rerecoAssignArgs = rerecoArgs["assignRequest"]
        cmsswDoc = {"_id": "software"}
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]] =  []
        cmsswDoc[self.rerecoCreateArgs["ScramArch"]].append(self.rerecoCreateArgs["CMSSWVersion"])
        insertDataToCouch(os.getenv("COUCHURL"), config.views.data.couch_reqmgr_aux_db, cmsswDoc) 
        self.reqSvc = ReqMgr(self.jsonSender["host"]) 
        self.reqSvc._noStale = True      
        self.reqSvc['requests'].additionalHeaders = self.create_header
        
    def tearDown(self):
        RESTBaseUnitTestWithDBBackend.tearDown(self)


    def testRequestSimpleCycle(self):
        """
        Test the request cycle with a single request and no composite GET
        conditions: post, get, put.
        """
        
        # test post method
        response = self.reqSvc.insertRequests(self.rerecoCreateArgs)
        self.assertEqual(len(response), 1)
        requestName = response[0]['request']
        
        
        ## test get method
        # get by name
        response = self.reqSvc.getRequestByNames(requestName)
        self.assertEqual(response[requestName]['RequestPriority'], 10000)
        self.assertEqual(len(response), 1)
        
        # get by status
        response = self.reqSvc.getRequestByStatus('new')
        self.assertEqual(len(response), 1)
        print(response)
        

        self.reqSvc.updateRequestStatus(requestName, 'assignment-approved')
        response = self.reqSvc.getRequestByStatus('assignment-approved')
        self.assertEqual(len(response), 1)
        
        self.reqSvc.updateRequestProperty(requestName, {'RequestStatus': 'assigned', 
                                                        "AcquisitionEra": "TEST_ERA",
                                                        "Team": "unittest",
                                                        "SiteWhitelist": ["T1_US_CBS"], 
                                                        "SiteBlacklist": ["T1_US_FOX"]})
        response = self.reqSvc.getRequestByStatus('assignment-approved')
        self.assertEqual(len(response), 0)
        response = self.reqSvc.getRequestByStatus('assigned')
        self.assertEqual(len(response), 1)
        self.assertEqual(response.values()[0]["SiteWhitelist"], ["T1_US_CBS"])
        
        self.reqSvc.updateRequestStats(requestName, {'total_jobs': 100, 'input_lumis': 100,
                               'input_events': 100, 'input_num_files': 100})
Exemple #39
0
class ReqMgrService(TemplatedPage):
    """
    Request Manager web service class
    """
    def __init__(self, app, config, mount):
        self.base = config.base
        self.rootdir = '/'.join(WMCore.__file__.split('/')[:-1])
        if config and not isinstance(config, dict):
            web_config = config.dictionary_()
        if not config:
            web_config = {'base': self.base}
        TemplatedPage.__init__(self, web_config)
        imgdir = os.environ.get('RM_IMAGESPATH', os.getcwd() + '/images')
        self.imgdir = web_config.get('imgdir', imgdir)
        cssdir = os.environ.get('RM_CSSPATH', os.getcwd() + '/css')
        self.cssdir = web_config.get('cssdir', cssdir)
        jsdir = os.environ.get('RM_JSPATH', os.getcwd() + '/js')
        self.jsdir = web_config.get('jsdir', jsdir)
        spdir = os.environ.get('RM_SPECPATH', os.getcwd() + '/specs')
        self.spdir = web_config.get('spdir', spdir)
        # read scripts area and initialize data-ops scripts
        self.sdir = os.environ.get('RM_SCRIPTS', os.getcwd() + '/scripts')
        self.sdir = web_config.get('sdir', self.sdir)
        self.sdict_thr = web_config.get('sdict_thr',
                                        600)  # put reasonable 10 min interval
        self.sdict = {'ts': time.time()}  # placeholder for data-ops scripts
        self.update_scripts(force=True)

        # To be filled at run time
        self.cssmap = {}
        self.jsmap = {}
        self.imgmap = {}
        self.yuimap = {}

        std_specs_dir = os.path.join(self.rootdir, 'WMSpec/StdSpecs')
        self.std_specs = spec_list(std_specs_dir, 'WMSpec.StdSpecs')
        self.std_specs.sort()

        # Update CherryPy configuration
        mime_types = ['text/css']
        mime_types += [
            'application/javascript', 'text/javascript',
            'application/x-javascript', 'text/x-javascript'
        ]
        cherryconf.update({
            'tools.encode.on': True,
            'tools.gzip.on': True,
            'tools.gzip.mime_types': mime_types,
        })
        self._cache = {}

        # initialize rest API
        statedir = '/tmp'
        app = RESTMain(config, statedir)  # REST application
        mount = '/rest'  # mount point for cherrypy service
        api = RestApiHub(app, config.reqmgr, mount)

        # initialize access to reqmgr2 APIs
        self.reqmgr = ReqMgr(config.reqmgr.reqmgr2_url)
        # only get the current view (this might make the response time much
        # longer; if an up-to-date view is not needed, override to False)
        self.reqmgr._noStale = True

        # admin helpers
        self.admin_info = Info(app, api, config.reqmgr, mount=mount + '/info')
        self.admin_group = Group(app,
                                 api,
                                 config.reqmgr,
                                 mount=mount + '/group')
        self.admin_team = Team(app, api, config.reqmgr, mount=mount + '/team')

        # get fields which we'll use in templates
        cdict = config.reqmgr.dictionary_()
        self.couch_url = cdict.get('couch_host', '')
        self.couch_dbname = cdict.get('couch_reqmgr_db', '')
        self.couch_wdbname = cdict.get('couch_workload_summary_db', '')
        self.acdc_url = cdict.get('acdc_host', '')
        self.acdc_dbname = cdict.get('acdc_db', '')
        self.configcache_url = cdict.get('couch_config_cache_url',
                                         self.couch_url)
        self.dbs_url = cdict.get('dbs_url', '')
        self.dqm_url = cdict.get('dqm_url', '')
        self.sw_ver = cdict.get('default_sw_version', 'CMSSW_5_2_5')
        self.sw_arch = cdict.get('default_sw_scramarch', 'slc5_amd64_gcc434')

    def user(self):
        """
        Return user name associated with this instance.
        """
        try:
            return cherrypy.request.user['login']
        except Exception:
            return 'testuser'

    def user_dn(self):
        "Return user DN"
        try:
            return cherrypy.request.user['dn']
        except Exception:
            return '/CN/bla/foo'

    def update_scripts(self, force=False):
        "Update scripts dict"
        if force or abs(time.time() - self.sdict['ts']) > self.sdict_thr:
            for item in os.listdir(self.sdir):
                with open(os.path.join(self.sdir, item), 'r') as istream:
                    self.sdict[item.split('.')[0]] = istream.read()
            self.sdict['ts'] = time.time()

    def abs_page(self, tmpl, content):
        """generate abstract page"""
        menu = self.templatepage('menu', menus=menus(), tmpl=tmpl)
        body = self.templatepage('generic', menu=menu, content=content)
        page = self.templatepage('main', content=body, user=self.user())
        return page

    def page(self, content):
        """
        Provide page wrapped with top/bottom templates.
        """
        return self.templatepage('main', content=content)

    def error(self, content):
        "Generate common error page"
        content = self.templatepage('error', content=content)
        return self.abs_page('error', content)

    @expose
    def index(self, **kwds):
        """Main page"""
        content = self.templatepage('index',
                                    requests=ACTIVE_STATUS,
                                    rdict=REQUEST_STATE_TRANSITION)
        return self.abs_page('main', content)

    @expose
    def home(self, **kwds):
        """Main page"""
        return self.index(**kwds)

    ### Admin actions ###

    @expose
    def admin(self, **kwds):
        """admin page"""
        print "\n### ADMIN PAGE"
        rows = self.admin_info.get()
        print "rows", [r for r in rows]

        content = self.templatepage('admin')
        return self.abs_page('admin', content)

    @expose
    def add_user(self, **kwds):
        """add_user action"""
        rid = genid(kwds)
        status = "ok"  # chagne to whatever it would be
        content = self.templatepage('confirm',
                                    ticket=rid,
                                    user=self.user(),
                                    status=status)
        return self.abs_page('admin', content)

    @expose
    def add_group(self, **kwds):
        """add_group action"""
        rows = self.admin_group.get()
        print "\n### GROUPS", [r for r in rows]
        rid = genid(kwds)
        status = "ok"  # chagne to whatever it would be
        content = self.templatepage('confirm',
                                    ticket=rid,
                                    user=self.user(),
                                    status=status)
        return self.abs_page('admin', content)

    @expose
    def add_team(self, **kwds):
        """add_team action"""
        rows = self.admin_team.get()
        print "\n### TEAMS", kwds, [r for r in rows]
        print "request to add", kwds
        rid = genid(kwds)
        status = "ok"  # chagne to whatever it would be
        content = self.templatepage('confirm',
                                    ticket=rid,
                                    user=self.user(),
                                    status=status)
        return self.abs_page('admin', content)

    ### Request actions ###

    @expose
    @checkargs(['status', 'sort'])
    def assign(self, **kwds):
        """assign page"""
        if not kwds:
            kwds = {}
        if 'status' not in kwds:
            kwds.update({'status': 'assignment-approved'})
        docs = []
        attrs = [
            'RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus'
        ]
        data = self.reqmgr.getRequestByStatus(statusList=[kwds['status']])
        for key, val in data.items():
            docs.append(request_attr(val, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        misc_json = {
            'CMSSW Releases': releases(),
            'CMSSW architectures': architectures(),
            'SubscriptionPriority': ['Low', 'Normal', 'High'],
            'CustodialSubType': ['Move', 'Replica'],
            'NonCustodialSubType': ['Move', 'Replica'],
            'MinMergeSize': 2147483648,
            'MaxMergeSize': 4294967296,
            'MaxMergeEvents': 50000,
            'MaxRSS': 20411724,
            'MaxVSize': 20411724,
            'SoftTimeout': 129600,
            'GracePeriod': 300,
            'BlockCloseMaxWaitTime': 66400,
            'BlockCloseMaxFiles': 500,
            'BlockCloseMaxEvents': 250000000,
            'BlockCloseMaxSize': 5000000000000,
            'AcquisitionEra': '',
            'ProcessingVersion': 1,
            'ProcessingString': '',
            'MergedLFNBase': lfn_bases(),
            'UnmergedLFNBase': lfn_unmerged_bases(),
        }
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('assign',
                                    sort=sortby,
                                    filter_sort_table=filter_sort,
                                    sites=sites(),
                                    site_white_list=site_white_list(),
                                    site_black_list=site_black_list(),
                                    user=self.user(),
                                    user_dn=self.user_dn(),
                                    requests=docs,
                                    misc_table=json2table(
                                        misc_json, web_ui_names()),
                                    misc_json=json2form(misc_json,
                                                        indent=2,
                                                        keep_first_value=True))
        return self.abs_page('assign', content)

    @expose
    @checkargs(['status', 'sort'])
    def approve(self, **kwds):
        """
        Approve page: get the list of requests associated with the user DN.
        Fetch their status list from ReqMgr and display whether the requests
        were seen by data-ops.
        """
        if not kwds:
            kwds = {}
        if 'status' not in kwds:
            kwds.update({'status': 'new'})
        kwds.update({'_nostale': True})
        docs = []
        attrs = [
            'RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus'
        ]
        data = self.reqmgr.getRequestByStatus(statusList=[kwds['status']])
        for key, val in data.items():
            docs.append(request_attr(val, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('approve',
                                    requests=docs,
                                    date=tstamp(),
                                    sort=sortby,
                                    filter_sort_table=filter_sort)
        return self.abs_page('approve', content)

    @expose
    def create(self, **kwds):
        """create page"""
        # get list of standard specs from WMCore and new ones from local area
        #loc_specs_dir = os.path.join(self.spdir, 'Specs') # local specs
        #loc_specs = spec_list(loc_specs_dir, 'Specs')
        #all_specs = list(set(self.std_specs + loc_specs))
        #all_specs.sort()
        all_specs = self.std_specs
        spec = kwds.get('form', '')
        if not spec:
            spec = self.std_specs[0]
        # make spec first in all_specs list
        if spec in all_specs:
            all_specs.remove(spec)
        all_specs = [spec] + all_specs
        jsondata = get_request_template_from_type(spec)
        # create templatized page out of provided forms
        self.update_scripts()
        content = self.templatepage(
            'create',
            table=json2table(jsondata, web_ui_names()),
            jsondata=json2form(jsondata, indent=2, keep_first_value=True),
            name=spec,
            scripts=[s for s in self.sdict.keys() if s != 'ts'],
            specs=all_specs)
        return self.abs_page('create', content)

    def generate_objs(self, script, jsondict):
        """Generate objects from givem JSON template"""
        self.update_scripts()
        code = self.sdict.get(script, '')
        if code.find('def genobjs(jsondict)') == -1:
            return self.error(
                "Improper python snippet, your code should start with <b>def genobjs(jsondict)</b> function"
            )
        exec(code)  # code snippet must start with a genobjs function
        return [r for r in genobjs(jsondict)]
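
generate_objs expects every data-ops script to define a genobjs(jsondict) function, which it loads via exec. A hypothetical script satisfying that contract (the field names here are illustrative, not a real request template):

# contents of a hypothetical scripts/make_tasks.py data-ops snippet
def genobjs(jsondict):
    """Yield one object per task name found in the request template."""
    for task in jsondict.get('TaskChain', []):  # illustrative field name
        yield {'task': task, 'priority': jsondict.get('RequestPriority', 0)}

print(list(genobjs({'TaskChain': ['Digi', 'Reco'], 'RequestPriority': 10})))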

    @expose
    def fetch(self, rid, **kwds):
        "Fetch document for given id"
        rid = rid.replace('request-', '')
        doc = self.reqmgr.getRequestByNames(rid)
        transitions = []
        if len(doc) == 1:
            try:
                doc = doc[rid]
            except Exception:
                pass
            name = doc.get('RequestName', 'NA')
            title = 'Request %s' % name
            status = doc.get('RequestStatus', '')
            transitions = REQUEST_STATE_TRANSITION.get(status, [])
            if status in transitions:
                transitions.remove(status)
            content = self.templatepage('doc',
                                        title=title,
                                        status=status,
                                        name=name,
                                        table=json2table(doc, web_ui_names()),
                                        jsondata=json2form(
                                            doc,
                                            indent=2,
                                            keep_first_value=False),
                                        transitions=transitions)
        elif len(doc) > 1:
            jsondata = [pprint.pformat(d) for d in doc]
            content = self.templatepage('doc',
                                        title='Series of docs: %s' % rid,
                                        table="",
                                        jsondata=jsondata,
                                        transitions=transitions)
        else:
            content = 'No request found for name=%s' % rid
        return self.abs_page('request', content)

    @expose
    def requests(self, **kwds):
        """Page showing requests"""
        if not kwds:
            kwds = {}
        if 'status' not in kwds:
            kwds.update({'status': 'acquired'})
        results = self.reqmgr.getRequestByStatus(kwds['status'])
        docs = []
        for key, doc in results.items():
            docs.append(request_attr(doc))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('requests',
                                    requests=docs,
                                    sort=sortby,
                                    status=kwds['status'],
                                    filter_sort_table=filter_sort)
        return self.abs_page('requests', content)

    @expose
    def request(self, **kwargs):
        "Get data example and expose it as json"
        dataset = kwargs.get('uinput', '')
        if not dataset:
            return {'error': 'no input dataset'}
        url = 'https://cmsweb.cern.ch/reqmgr/rest/outputdataset/%s' % dataset
        params = {}
        headers = {'Accept': 'application/json;text/json'}
        wdata = getdata(url, params)
        wdict = dict(date=time.ctime(),
                     team='Team-A',
                     status='Running',
                     ID=genid(wdata))
        winfo = self.templatepage('workflow',
                                  wdict=wdict,
                                  dataset=dataset,
                                  code=pprint.pformat(wdata))
        content = self.templatepage('search', content=winfo)
        return self.abs_page('request', content)

    @expose
    def batch(self, **kwds):
        """batch page"""
        # TODO: we need a template for batch attributes
        #       and read it from separate area, like DASMaps
        name = kwds.get('name', '')
        batch = {}
        if name:
            #            batch = self.reqmgr.getBatchesByName(name)
            batch = {
                'Name': 'Batch1',
                'Description': 'Bla-bla',
                'Creator': 'valya',
                'Group': 'test',
                'Workflows': ['workflow1', 'workflow2'],
                'Attributes': {
                    'HeavyIon': ['true', 'false']
                }
            }
        attributes = batch.get('Attributes', {})
        workflows = batch.get('Workflows', [])
        description = batch.get('Description', '')
        creator = batch.get('Creator', self.user_dn())
        content = self.templatepage('batch',
                                    name=name,
                                    attributes=json2table(
                                        attributes, web_ui_names()),
                                    workflows=workflows,
                                    creator=creator,
                                    description=description)
        return self.abs_page('batch', content)

    @expose
    def batches(self, **kwds):
        """Page showing batches"""
        if not kwds:
            kwds = {}
        if 'name' not in kwds:
            kwds.update({'name': ''})
        sortby = kwds.get('sort', 'name')
        #        results = self.reqmgr.getBatchesByName(kwds['name'])
        results = [
            {
                'Name': 'Batch1',
                'Description': 'Bla-bla',
                'Creator': 'valya',
                'Group': 'test',
                'Workflows': ['workflow1', 'workflow2'],
                'Date': 'Fri Feb 13 10:36:41 EST 2015',
                'Attributes': {
                    'HeavyIon': ['true', 'false']
                }
            },
            {
                'Name': 'Batch2',
                'Description': 'lksdjflksjdf',
                'Creator': 'valya',
                'Group': 'test',
                'Workflows': ['workflow1', 'workflow2'],
                'Date': 'Fri Feb 10 10:36:41 EST 2015',
                'Attributes': {
                    'HeavyIon': ['true', 'false']
                }
            },
        ]
        docs = [r for r in sort(results, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('batches',
                                    batches=docs,
                                    sort=sortby,
                                    filter_sort_table=filter_sort)
        return self.abs_page('batches', content)

    ### Aux methods ###

    @expose
    def put_request(self, *args, **kwds):
        "PUT request callback to reqmgr server, should be used in AJAX"
        reqname = kwds.get('RequestName', '')
        status = kwds.get('RequestStatus', '')
        if not reqname:
            msg = 'Unable to update request status, empty request name'
            raise cherrypy.HTTPError(406, msg)
        if not status:
            msg = 'Unable to update request status, empty status value'
            raise cherrypy.HTTPError(406, msg)
        return self.reqmgr.updateRequestStatus(reqname, status)

    @expose
    def images(self, *args, **kwargs):
        """
        Serve static images.
        """
        args = list(args)
        self.check_scripts(args, self.imgmap, self.imgdir)
        mime_types = [
            '*/*', 'image/gif', 'image/png', 'image/jpg', 'image/jpeg'
        ]
        accepts = cherrypy.request.headers.elements('Accept')
        for accept in accepts:
            if  accept.value in mime_types and len(args) == 1 \
                and args[0] in self.imgmap:
                image = self.imgmap[args[0]]
                # use image extension to pass correct content type
                ctype = 'image/%s' % image.split('.')[-1]
                cherrypy.response.headers['Content-type'] = ctype
                return serve_file(image, content_type=ctype)

    def serve(self, kwds, imap, idir, datatype='', minimize=False):
        "Serve files for high level APIs (yui/css/js)"
        args = []
        for key, val in kwds.items():
            if key == 'f':  # we only look-up files from given kwds dict
                if isinstance(val, list):
                    args += val
                else:
                    args.append(val)
        scripts = self.check_scripts(args, imap, idir)
        return self.serve_files(args, scripts, imap, datatype, minimize)

    @exposecss
    @tools.gzip()
    def css(self, **kwargs):
        """
        Serve provided CSS files. They can be passed as
        f=file1.css&f=file2.css
        """
        resource = kwargs.get('resource', 'css')
        if resource == 'css':
            return self.serve(kwargs, self.cssmap, self.cssdir, 'css', True)

    @exposejs
    @tools.gzip()
    def js(self, **kwargs):
        """
        Serve provided JS scripts. They can be passed as
        f=file1.js&f=file2.js with optional resource parameter
        to specify the type of JS files, e.g. resource=yui.
        """
        resource = kwargs.get('resource', 'js')
        if resource == 'js':
            return self.serve(kwargs, self.jsmap, self.jsdir)

    def serve_files(self,
                    args,
                    scripts,
                    resource,
                    datatype='',
                    minimize=False):
        """
        Return asked set of files for JS, YUI, CSS.
        """
        idx = "-".join(scripts)
        if idx not in self._cache:
            data = ''
            if datatype == 'css':
                data = '@CHARSET "UTF-8";'
            for script in args:
                path = os.path.join(sys.path[0], resource[script])
                path = os.path.normpath(path)
                ifile = open(path)
                data = "\n".join ([data, ifile.read().\
                    replace('@CHARSET "UTF-8";', '')])
                ifile.close()
            if datatype == 'css':
                set_headers("text/css")
            if minimize:
                self._cache[idx] = minify(data)
            else:
                self._cache[idx] = data
        return self._cache[idx]

    def check_scripts(self, scripts, resource, path):
        """
        Check a script is known to the resource map
        and that the script actually exists
        """
        for script in scripts:
            if script not in resource.keys():
                spath = os.path.normpath(os.path.join(path, script))
                if os.path.isfile(spath):
                    resource.update({script: spath})
        return scripts
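
check_scripts lazily registers any requested file it finds on disk into the resource map, so later lookups are pure dictionary hits. The same lookup-and-register pattern in isolation (directory and file names are illustrative):

import os

def registerScripts(scripts, resource, path):
    """Learn unknown files that exist on disk, as check_scripts does above."""
    for script in scripts:
        if script not in resource:
            spath = os.path.normpath(os.path.join(path, script))
            if os.path.isfile(spath):
                resource[script] = spath
    return scripts

cssmap = {}
registerScripts(['main.css'], cssmap, '/tmp')  # registers /tmp/main.css only if it exists
print(cssmap)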
Exemple #40
0
class MSCore(object):
    """
    This class provides core functionality for the
    MSTransferor, MSMonitor, MSOutput and MSRuleCleaner classes.
    """

    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s", self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0}, logger=self.logger)

        self.phedex = None
        self.rucio = None
        if not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={"logger": self.logger, "user_agent": "wmcore-microservices"})

    def unifiedConfig(self):
        """
        Fetches the unified configuration
        :return: unified configuration content
        """
        res = self.reqmgrAux.getUnifiedConfig(docName="config")
        if res:
            if isinstance(res, list):
                return res[0]
            return res
        else:
            return {}

    def change(self, reqName, reqStatus, prefix='###'):
        """
        Update the request status in ReqMgr2
        """
        try:
            if self.msConfig['enableStatusTransition']:
                self.logger.info('%s updating %s status to: %s', prefix, reqName, reqStatus)
                self.reqmgr2.updateRequestStatus(reqName, reqStatus)
            else:
                self.logger.info('DRY-RUN:: %s updating %s status to: %s', prefix, reqName, reqStatus)
        except Exception as err:
            self.logger.exception("Failed to change request status. Error: %s", str(err))

    def updateReportDict(self, reportDict, keyName, value):
        """
        Provided a key name and value, validate the key name
        and update the report dictionary if it passes the validation
        :param reportDict: dictionary with a summary of the service
        :param keyName: string with the key name in the report
        :param value: string/integer value with the content of a metric
        :return: the updated dictionary
        """
        if keyName not in reportDict:
            self.logger.error("Report metric '%s' is not supported", keyName)
        else:
            reportDict[keyName] = value
        return reportDict
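
change() only contacts ReqMgr2 when enableStatusTransition is set in the configuration; otherwise it logs a DRY-RUN line and leaves the request untouched. The dry-run gate in isolation, as a runnable sketch (the updater callable stands in for reqmgr2.updateRequestStatus):

import logging
logging.basicConfig(level=logging.INFO)

def change(reqName, reqStatus, msConfig, updater):
    """Sketch of the dry-run gate used by MSCore.change above."""
    if msConfig.get('enableStatusTransition'):
        logging.info('updating %s status to: %s', reqName, reqStatus)
        updater(reqName, reqStatus)
    else:
        logging.info('DRY-RUN:: updating %s status to: %s', reqName, reqStatus)

# with the flag off, the updater is never called
change('test_workflow', 'completed', {'enableStatusTransition': False},
       lambda name, status: None)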
    def __init__(self, app, config, mount):
        self.base = config.base
        self.rootdir = '/'.join(WMCore.__file__.split('/')[:-1])
        if  config and not isinstance(config, dict):
            web_config = config.dictionary_()
        if  not config:
            web_config = {'base': self.base}
        TemplatedPage.__init__(self, web_config)
        imgdir = os.environ.get('RM_IMAGESPATH', os.getcwd()+'/images')
        self.imgdir = web_config.get('imgdir', imgdir)
        cssdir = os.environ.get('RM_CSSPATH', os.getcwd()+'/css')
        self.cssdir = web_config.get('cssdir', cssdir)
        jsdir  = os.environ.get('RM_JSPATH', os.getcwd()+'/js')
        self.jsdir = web_config.get('jsdir', jsdir)
        spdir  = os.environ.get('RM_SPECPATH', os.getcwd()+'/specs')
        self.spdir = web_config.get('spdir', spdir)
        # read scripts area and initialize data-ops scripts
        self.sdir = os.environ.get('RM_SCRIPTS', os.getcwd()+'/scripts')
        self.sdir = web_config.get('sdir', self.sdir)
        self.sdict_thr = web_config.get('sdict_thr', 600) # put reasonable 10 min interval
        self.sdict = {'ts':time.time()} # placeholder for data-ops scripts
        self.update_scripts(force=True)

        # To be filled at run time
        self.cssmap = {}
        self.jsmap  = {}
        self.imgmap = {}
        self.yuimap = {}

        std_specs_dir = os.path.join(self.rootdir, 'WMSpec/StdSpecs')
        self.std_specs = spec_list(std_specs_dir, 'WMSpec.StdSpecs')
        self.std_specs.sort()

        # Update CherryPy configuration
        mime_types  = ['text/css']
        mime_types += ['application/javascript', 'text/javascript',
                       'application/x-javascript', 'text/x-javascript']
        cherryconf.update({'tools.encode.on': True,
                           'tools.gzip.on': True,
                           'tools.gzip.mime_types': mime_types,
                          })
        self._cache    = {}

        # initialize rest API
        statedir = '/tmp'
        app = RESTMain(config, statedir) # REST application
        mount = '/rest' # mount point for cherrypy service
        api = RestApiHub(app, config.reqmgr, mount)

        # initialize access to reqmgr2 APIs
        self.reqmgr = ReqMgr(config.reqmgr.reqmgr2_url)
        # only gets current view (This might cause to reponse time much longer, 
        # If upto date view is not needed overwrite Fale)
        self.reqmgr._noStale = True

        # admin helpers
        self.admin_info = Info(app, api, config.reqmgr, mount=mount+'/info')
        self.admin_group = Group(app, api, config.reqmgr, mount=mount+'/group')
        self.admin_team = Team(app, api, config.reqmgr, mount=mount+'/team')

        # get fields which we'll use in templates
        cdict = config.reqmgr.dictionary_()
        self.couch_url = cdict.get('couch_host', '')
        self.couch_dbname = cdict.get('couch_reqmgr_db', '')
        self.couch_wdbname = cdict.get('couch_workload_summary_db', '')
        self.acdc_url = cdict.get('acdc_host', '')
        self.acdc_dbname = cdict.get('acdc_db', '')
        self.configcache_url = cdict.get('couch_config_cache_url', self.couch_url)
        self.dbs_url = cdict.get('dbs_url', '')
        self.dqm_url = cdict.get('dqm_url', '')
        self.sw_ver = cdict.get('default_sw_version', 'CMSSW_5_2_5')
        self.sw_arch = cdict.get('default_sw_scramarch', 'slc5_amd64_gcc434')
class ReqMgrService(TemplatedPage):
    """
    Request Manager web service class
    """
    def __init__(self, app, config, mount):
        self.base = config.base
        self.rootdir = '/'.join(WMCore.__file__.split('/')[:-1])
        if  config and not isinstance(config, dict):
            web_config = config.dictionary_()
        if  not config:
            web_config = {'base': self.base}
        TemplatedPage.__init__(self, web_config)
        imgdir = os.environ.get('RM_IMAGESPATH', os.getcwd()+'/images')
        self.imgdir = web_config.get('imgdir', imgdir)
        cssdir = os.environ.get('RM_CSSPATH', os.getcwd()+'/css')
        self.cssdir = web_config.get('cssdir', cssdir)
        jsdir  = os.environ.get('RM_JSPATH', os.getcwd()+'/js')
        self.jsdir = web_config.get('jsdir', jsdir)
        spdir  = os.environ.get('RM_SPECPATH', os.getcwd()+'/specs')
        self.spdir = web_config.get('spdir', spdir)
        # read scripts area and initialize data-ops scripts
        self.sdir = os.environ.get('RM_SCRIPTS', os.getcwd()+'/scripts')
        self.sdir = web_config.get('sdir', self.sdir)
        self.sdict_thr = web_config.get('sdict_thr', 600) # put reasonable 10 min interval
        self.sdict = {'ts':time.time()} # placeholder for data-ops scripts
        self.update_scripts(force=True)

        # To be filled at run time
        self.cssmap = {}
        self.jsmap  = {}
        self.imgmap = {}
        self.yuimap = {}

        std_specs_dir = os.path.join(self.rootdir, 'WMSpec/StdSpecs')
        self.std_specs = spec_list(std_specs_dir, 'WMSpec.StdSpecs')
        self.std_specs.sort()

        # Update CherryPy configuration
        mime_types  = ['text/css']
        mime_types += ['application/javascript', 'text/javascript',
                       'application/x-javascript', 'text/x-javascript']
        cherryconf.update({'tools.encode.on': True,
                           'tools.gzip.on': True,
                           'tools.gzip.mime_types': mime_types,
                          })
        self._cache    = {}

        # initialize rest API
        statedir = '/tmp'
        app = RESTMain(config, statedir) # REST application
        mount = '/rest' # mount point for cherrypy service
        api = RestApiHub(app, config.reqmgr, mount)

        # initialize access to reqmgr2 APIs
        self.reqmgr = ReqMgr(config.reqmgr.reqmgr2_url)
        # only get the current view (this might make the response time much
        # longer; if an up-to-date view is not needed, override _noStale with False)
        self.reqmgr._noStale = True

        # admin helpers
        self.admin_info = Info(app, api, config.reqmgr, mount=mount+'/info')
        self.admin_group = Group(app, api, config.reqmgr, mount=mount+'/group')
        self.admin_team = Team(app, api, config.reqmgr, mount=mount+'/team')

        # get fields which we'll use in templates
        cdict = config.reqmgr.dictionary_()
        self.couch_url = cdict.get('couch_host', '')
        self.couch_dbname = cdict.get('couch_reqmgr_db', '')
        self.couch_wdbname = cdict.get('couch_workload_summary_db', '')
        self.acdc_url = cdict.get('acdc_host', '')
        self.acdc_dbname = cdict.get('acdc_db', '')
        self.configcache_url = cdict.get('couch_config_cache_url', self.couch_url)
        self.dbs_url = cdict.get('dbs_url', '')
        self.dqm_url = cdict.get('dqm_url', '')
        self.sw_ver = cdict.get('default_sw_version', 'CMSSW_5_2_5')
        self.sw_arch = cdict.get('default_sw_scramarch', 'slc5_amd64_gcc434')

    def user(self):
        """
        Return user name associated with this instance.
        """
        try:
            return cherrypy.request.user['login']
        except Exception:
            return 'testuser'

    def user_dn(self):
        "Return user DN"
        try:
            return cherrypy.request.user['dn']
        except Exception:
            return '/CN/bla/foo'

    def update_scripts(self, force=False):
        "Update scripts dict"
        if  force or abs(time.time()-self.sdict['ts']) > self.sdict_thr:
            for item in os.listdir(self.sdir):
                with open(os.path.join(self.sdir, item), 'r') as istream:
                    self.sdict[item.split('.')[0]] = istream.read()
            self.sdict['ts'] = time.time()

    def abs_page(self, tmpl, content):
        """generate abstract page"""
        menu = self.templatepage('menu', menus=menus(), tmpl=tmpl)
        body = self.templatepage('generic', menu=menu, content=content)
        page = self.templatepage('main', content=body, user=self.user())
        return page

    def page(self, content):
        """
        Provide page wrapped with top/bottom templates.
        """
        return self.templatepage('main', content=content)

    def error(self, content):
        "Generate common error page"
        content = self.templatepage('error', content=content)
        return self.abs_page('error', content)

    @expose
    def index(self, **kwds):
        """Main page"""
        content = self.templatepage('index', requests=ACTIVE_STATUS, rdict=REQUEST_STATE_TRANSITION)
        return self.abs_page('main', content)

    @expose
    def home(self, **kwds):
        """Main page"""
        return self.index(**kwds)

    ### Admin actions ###

    @expose
    def admin(self, **kwds):
        """admin page"""
        print "\n### ADMIN PAGE"
        rows = self.admin_info.get()
        print "rows", [r for r in rows]

        content = self.templatepage('admin')
        return self.abs_page('admin', content)

    @expose
    def add_user(self, **kwds):
        """add_user action"""
        rid = genid(kwds)
        status = "ok" # chagne to whatever it would be
        content = self.templatepage('confirm', ticket=rid, user=self.user(), status=status)
        return self.abs_page('admin', content)

    @expose
    def add_group(self, **kwds):
        """add_group action"""
        rows = self.admin_group.get()
        print "\n### GROUPS", [r for r in rows]
        rid = genid(kwds)
        status = "ok" # chagne to whatever it would be
        content = self.templatepage('confirm', ticket=rid, user=self.user(), status=status)
        return self.abs_page('admin', content)

    @expose
    def add_team(self, **kwds):
        """add_team action"""
        rows = self.admin_team.get()
        print "\n### TEAMS", kwds, [r for r in rows]
        print "request to add", kwds
        rid = genid(kwds)
        status = "ok" # chagne to whatever it would be
        content = self.templatepage('confirm', ticket=rid, user=self.user(), status=status)
        return self.abs_page('admin', content)

    ### Request actions ###

    @expose
    @checkargs(['status', 'sort'])
    def assign(self, **kwds):
        """assign page"""
        if  not kwds:
            kwds = {}
        if  'status' not in kwds:
            kwds.update({'status': 'assignment-approved'})
        docs = []
        attrs = ['RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus']
        data = self.reqmgr.getRequestByStatus(statusList=[kwds['status']])
        for key, val in data.items():
            docs.append(request_attr(val, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        misc_json = {'CMSSW Releases':releases(),
                'CMSSW architectures':architectures(),
                'SubscriptionPriority':['Low', 'Normal', 'High'],
                'CustodialSubType':['Move', 'Replica'],
                'NonCustodialSubType':['Move', 'Replica'],
                'MinMergeSize':2147483648,
                'MaxMergeSize':4294967296,
                'MaxMergeEvents':50000,
                'MaxRSS':20411724,
                'MaxVSize':20411724,
                'SoftTimeout':129600,
                'GracePeriod':300,
                'BlockCloseMaxWaitTime':66400,
                'BlockCloseMaxFiles':500,
                'BlockCloseMaxEvents':250000000,
                'BlockCloseMaxSize':5000000000000,
                'AcquisitionEra':'',
                'ProcessingVersion':1,
                'ProcessingString':'',
                'MergedLFNBase':lfn_bases(),
                'UnmergedLFNBase':lfn_unmerged_bases(),}
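        # (sizes above appear to be in bytes: MinMergeSize = 2 GiB, MaxMergeSize
        #  = 4 GiB, BlockCloseMaxSize = 5 TB; SoftTimeout and GracePeriod are in
        #  seconds)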
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('assign', sort=sortby,
                filter_sort_table=filter_sort,
                sites=sites(),
                site_white_list=site_white_list(),
                site_black_list=site_black_list(),
                user=self.user(), user_dn=self.user_dn(), requests=docs,
                misc_table=json2table(misc_json, web_ui_names()),
                misc_json=json2form(misc_json, indent=2, keep_first_value=True))
        return self.abs_page('assign', content)

    @expose
    @checkargs(['status', 'sort'])
    def approve(self, **kwds):
        """
        Approve page: get list of request associated with user DN.
        Fetch their status list from ReqMgr and display if requests
        were seen by data-ops.
        """
        if  not kwds:
            kwds = {}
        if  'status' not in kwds:
            kwds.update({'status': 'new'})
        kwds.update({'_nostale':True})
        docs = []
        attrs = ['RequestName', 'RequestDate', 'Group', 'Requestor', 'RequestStatus']
        data = self.reqmgr.getRequestByStatus(statusList=[kwds['status']])
        for key, val in data.items():
            docs.append(request_attr(val, attrs))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('approve', requests=docs, date=tstamp(),
                sort=sortby, filter_sort_table=filter_sort)
        return self.abs_page('approve', content)

    @expose
    def create(self, **kwds):
        """create page"""
        # get list of standard specs from WMCore and new ones from local area
        #loc_specs_dir = os.path.join(self.spdir, 'Specs') # local specs
        #loc_specs = spec_list(loc_specs_dir, 'Specs')
        #all_specs = list(set(self.std_specs + loc_specs))
        #all_specs.sort()
        all_specs = self.std_specs
        spec = kwds.get('form', '')
        if  not spec:
            spec = self.std_specs[0]
        # make spec first in all_specs list
        if  spec in all_specs:
            all_specs.remove(spec)
        all_specs = [spec] + all_specs
        jsondata = get_request_template_from_type(spec)
        # create templatized page out of provided forms
        self.update_scripts()
        content = self.templatepage('create', table=json2table(jsondata, web_ui_names()),
                jsondata=json2form(jsondata, indent=2, keep_first_value=True), name=spec,
                scripts=[s for s in self.sdict.keys() if s!='ts'],
                specs=all_specs)
        return self.abs_page('create', content)

    def generate_objs(self, script, jsondict):
        """Generate objects from givem JSON template"""
        self.update_scripts()
        code = self.sdict.get(script, '')
        if  code.find('def genobjs(jsondict)') == -1:
            return self.error("Improper Python snippet, your code should define a <b>def genobjs(jsondict)</b> function")
        exec(code) # the code snippet must define the genobjs function
        return [r for r in genobjs(jsondict)]
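    # A minimal sketch (illustrative, not from the source) of a data-ops
    # script that generate_objs() accepts; the snippet must define
    # genobjs(jsondict) and may yield any objects built from the template:
    #
    #     def genobjs(jsondict):
    #         for key, value in jsondict.items():
    #             yield {'name': key, 'value': value}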

    @expose
    def fetch(self, rid, **kwds):
        "Fetch document for given id"
        rid = rid.replace('request-', '')
        doc = self.reqmgr.getRequestByNames(rid)
        transitions = []
        if len(doc) == 1:
            try:
                doc = doc[rid]
            except Exception:
                pass
            name = doc.get('RequestName', 'NA')
            title = 'Request %s' % name
            status = doc.get('RequestStatus', '')
            transitions = REQUEST_STATE_TRANSITION.get(status, [])
            if  status in transitions:
                transitions.remove(status)
            content = self.templatepage('doc', title=title, status=status, name=name,
                    table=json2table(doc, web_ui_names()),
                    jsondata=json2form(doc, indent=2, keep_first_value=False),
                    transitions=transitions)
        elif len(doc) > 1:
            jsondata = [pprint.pformat(d) for d in doc]
            content = self.templatepage('doc', title='Series of docs: %s' % rid,
                    table="", jsondata=jsondata,
                    transitions=transitions)
        else:
            content = 'No request found for name=%s' % rid
        return self.abs_page('request', content)

    @expose
    def requests(self, **kwds):
        """Page showing requests"""
        if  not kwds:
            kwds = {}
        if  'status' not in kwds:
            kwds.update({'status': 'acquired'})
        results = self.reqmgr.getRequestByStatus(kwds['status'])
        docs = []
        for key, doc in results.items():
            docs.append(request_attr(doc))
        sortby = kwds.get('sort', 'status')
        docs = [r for r in sort(docs, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('requests', requests=docs, sort=sortby,
                status=kwds['status'], filter_sort_table=filter_sort)
        return self.abs_page('requests', content)

    @expose
    def request(self, **kwargs):
        "Get data example and expose it as json"
        dataset = kwargs.get('uinput', '')
        if  not dataset:
            return {'error':'no input dataset'}
        url = 'https://cmsweb.cern.ch/reqmgr/rest/outputdataset/%s' % dataset
        params = {}
        headers = {'Accept': 'application/json;text/json'}
        wdata = getdata(url, params)
        wdict = dict(date=time.ctime(), team='Team-A', status='Running', ID=genid(wdata))
        winfo = self.templatepage('workflow', wdict=wdict,
                dataset=dataset, code=pprint.pformat(wdata))
        content = self.templatepage('search', content=winfo)
        return self.abs_page('request', content)

    @expose
    def batch(self, **kwds):
        """batch page"""
        # TODO: we need a template for batch attributes
        #       and read it from separate area, like DASMaps
        name = kwds.get('name', '')
        batch = {}
        if  name:
#            batch = self.reqmgr.getBatchesByName(name)
            batch = {'Name':'Batch1', 'Description': 'Bla-bla', 'Creator':'valya', 'Group':'test',
                    'Workflows':['workflow1', 'workflow2'],
                    'Attributes':{'HeavyIon':['true', 'false']}}
        attributes = batch.get('Attributes', {})
        workflows = batch.get('Workflows', [])
        description = batch.get('Description', '')
        creator = batch.get('Creator', self.user_dn())
        content = self.templatepage('batch', name=name,
                attributes=json2table(attributes, web_ui_names()),
                workflows=workflows, creator=creator,
                description=description)
        return self.abs_page('batch', content)

    @expose
    def batches(self, **kwds):
        """Page showing batches"""
        if  not kwds:
            kwds = {}
        if  'name' not in kwds:
            kwds.update({'name': ''})
        sortby = kwds.get('sort', 'name')
#        results = self.reqmgr.getBatchesByName(kwds['name'])
        results = [
                {'Name':'Batch1', 'Description': 'Bla-bla', 'Creator':'valya', 'Group':'test',
                    'Workflows':['workflow1', 'workflow2'],
                    'Date': 'Fri Feb 13 10:36:41 EST 2015',
                    'Attributes':{'HeavyIon':['true', 'false']}},
                {'Name':'Batch2', 'Description': 'lksdjflksjdf', 'Creator':'valya', 'Group':'test',
                    'Workflows':['workflow1', 'workflow2'],
                    'Date': 'Fri Feb 10 10:36:41 EST 2015',
                    'Attributes':{'HeavyIon':['true', 'false']}},
                   ]
        docs = [r for r in sort(results, sortby)]
        filter_sort = self.templatepage('filter_sort')
        content = self.templatepage('batches', batches=docs, sort=sortby,
                filter_sort_table=filter_sort)
        return self.abs_page('batches', content)

    ### Aux methods ###

    @expose
    def put_request(self, *args, **kwds):
        "PUT request callback to reqmgr server, should be used in AJAX"
        reqname = kwds.get('RequestName', '')
        status = kwds.get('RequestStatus', '')
        if  not reqname:
            msg = 'Unable to update request status, empty request name'
            raise cherrypy.HTTPError(406, msg)
        if  not status:
            msg = 'Unable to update request status, empty status value'
            raise cherrypy.HTTPError(406, msg)
        return self.reqmgr.updateRequestStatus(reqname, status)

    @expose
    def images(self, *args, **kwargs):
        """
        Serve static images.
        """
        args = list(args)
        self.check_scripts(args, self.imgmap, self.imgdir)
        mime_types = ['*/*', 'image/gif', 'image/png',
                      'image/jpg', 'image/jpeg']
        accepts = cherrypy.request.headers.elements('Accept')
        for accept in accepts:
            if  accept.value in mime_types and len(args) == 1 \
                and args[0] in self.imgmap:
                image = self.imgmap[args[0]]
                # use image extension to pass correct content type
                ctype = 'image/%s' % image.split('.')[-1]
                cherrypy.response.headers['Content-type'] = ctype
                return serve_file(image, content_type=ctype)

    def serve(self, kwds, imap, idir, datatype='', minimize=False):
        "Serve files for high level APIs (yui/css/js)"
        args = []
        for key, val in kwds.items():
            if  key == 'f': # we only look-up files from given kwds dict
                if  isinstance(val, list):
                    args += val
                else:
                    args.append(val)
        scripts = self.check_scripts(args, imap, idir)
        return self.serve_files(args, scripts, imap, datatype, minimize)

    @exposecss
    @tools.gzip()
    def css(self, **kwargs):
        """
        Serve provided CSS files. They can be passed as
        f=file1.css&f=file2.css
        """
        resource = kwargs.get('resource', 'css')
        if  resource == 'css':
            return self.serve(kwargs, self.cssmap, self.cssdir, 'css', True)

    @exposejs
    @tools.gzip()
    def js(self, **kwargs):
        """
        Serve provided JS scripts. They can be passed as
        f=file1.js&f=file2.js with optional resource parameter
        to specify the type of JS files, e.g. resource=yui.
        """
        resource = kwargs.get('resource', 'js')
        if  resource == 'js':
            return self.serve(kwargs, self.jsmap, self.jsdir)

    def serve_files(self, args, scripts, resource, datatype='', minimize=False):
        """
        Return asked set of files for JS, YUI, CSS.
        """
        idx = "-".join(scripts)
        if  idx not in self._cache.keys():
            data = ''
            if  datatype == 'css':
                data = '@CHARSET "UTF-8";'
            for script in args:
                path = os.path.join(sys.path[0], resource[script])
                path = os.path.normpath(path)
                ifile = open(path)
                data = "\n".join ([data, ifile.read().\
                    replace('@CHARSET "UTF-8";', '')])
                ifile.close()
            if  datatype == 'css':
                set_headers("text/css")
            if  minimize:
                self._cache[idx] = minify(data)
            else:
                self._cache[idx] = data
        return self._cache[idx]

    def check_scripts(self, scripts, resource, path):
        """
        Check a script is known to the resource map
        and that the script actually exists
        """
        for script in scripts:
            if  script not in resource.keys():
                spath = os.path.normpath(os.path.join(path, script))
                if  os.path.isfile(spath):
                    resource.update({script: spath})
        return scripts
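
# A minimal, hypothetical sketch (function name and mount point are
# illustrative, not from the source) of serving ReqMgrService with CherryPy;
# note that __init__ builds its own RESTMain application internally, so the
# app and mount arguments act as placeholders here.
def start_reqmgr_service(config):
    import cherrypy
    service = ReqMgrService(app=None, config=config, mount='/reqmgr2')
    cherrypy.quickstart(service, '/reqmgr2')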
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according the the given condition.
    1. Cleans local couch db when request is completed and reported to cental db.
       This will clean up local couchdb, local summary db, local queue
       
    2. Cleans old couchdoc which is created older than the time threshold
    
    """
    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        
    def setup(self, parameters):
        """
        Called at startup
        """
        self.teamName = self.config.Agent.teamName
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent")
        
        #TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL, 
                                                      couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        
        if self.useReqMgrForCompletionCheck:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL, 
                                                          couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            #TODO: remove this when reqmgr2 replace reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL, 
                                                          couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
        
        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb  = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        
        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch.
          - It deletes old wmstats docs
          - Archive workflows
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted" % report)
            
            logging.info("Cleaning up the archived request docs")
            report = self.cleanAlreadyArchivedWorkflows()
            logging.info("%s archived workflows deleted" % report)
            
            # archiving only workflows that I own (same team)
            logging.info("Getting requests in '%s' state for team '%s'", self.deletableState,
                                                                           self.teamName)
            endTime = int(time.time()) - self.archiveDelayHours * 3600
            wfs = self.centralRequestDBReader.getRequestByTeamAndStatus(self.teamName,
                                                                        self.deletableState)
            commonWfs = self.centralRequestDBReader.getRequestByStatusAndStartTime(self.deletableState, 
                                                                                   False, endTime)
            deletableWorkflows = list(set(wfs) & set(commonWfs))
            logging.info("Ready to archive normal %s workflows", len(deletableWorkflows))
            numUpdated = self.archiveWorkflows(deletableWorkflows, "normal-archived")
            logging.info("archive normal %s workflows", numUpdated)
            
            abortedWorkflows = self.centralRequestDBReader.getRequestByStatus(["aborted-completed"])
            logging.info("Ready to archive aborted %s workflows", len(abortedWorkflows))
            numUpdated = self.archiveWorkflows(abortedWorkflows, "aborted-archived")
            logging.info("archive aborted %s workflows", numUpdated)
            
            rejectedWorkflows = self.centralRequestDBReader.getRequestByStatus(["rejected"])
            logging.info("Ready to archive rejected %s workflows", len(rejectedWorkflows))
            numUpdated = self.archiveWorkflows(rejectedWorkflows, "rejected-archived")
            logging.info("archive rejected %s workflows", numUpdated)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")
    
    def archiveWorkflows(self, workflows, archiveState):
        updated = 0
        for workflowName in workflows:
            if self.cleanAllLocalCouchDB(workflowName):
                if self.useReqMgrForCompletionCheck:
                    try:
                        #TODO: try reqmgr1 call if it fails (reqmgr2Only - remove this line when reqmgr is replaced)
                        self.reqmgrSvc.updateRequestStatus(workflowName, archiveState)
                        # and replace with this - remove all the exception handling
                        #self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                    except HTTPException as ex:
                        # an HTTPException with status 404 means this is a reqmgr2 request
                        if ex.status == 404:
                            # try reqmgr2 call
                            msg = "%s : reqmgr2 request: %s" % (workflowName, str(ex))
                            logging.warning(msg)
                            self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                        else:
                            msg = "%s : fail to update status with HTTP error: %s" % (workflowName, str(ex))
                            logging.error(msg)
                            raise ex
                            
                    updated += 1 
                    logging.debug("status updated to %s %s",  archiveState, workflowName)
                else:
                    # tier0 update case
                    self.centralRequestDBWriter.updateRequestStatus(workflowName, archiveState)
        return updated
    
    def deleteWorkflowFromJobCouch(self, workflowName, db):
        """
        _deleteWorkflowFromCouch_

        If we are asked to delete the workflow from couch, delete it
        to clear up some space.

        Load the document IDs and revisions out of couch by workflowName,
        then order a delete on them.
        """
        options = {"startkey": [workflowName], "endkey": [workflowName, {}], "reduce": False}
        
        if db == "JobDump":
            couchDB = self.jobsdatabase
            view = "jobsByWorkflowName"
        elif db == "FWJRDump":
            couchDB = self.fwjrdatabase
            view = "fwjrsByWorkflowName"
        elif db == "SummaryStats":
            couchDB = self.statsumdatabase
            view = None
        elif db == "WMStatsAgent":
            couchDB = self.wmstatsCouchDB.getDBInstance()
            view = "allWorkflows"
            options = {"key": workflowName, "reduce": False}
            
        if view is None:
            try:
                committed = couchDB.delete_doc(workflowName)
            except CouchNotFoundError as ex:
                return {'status': 'warning', 'message': "%s: %s" % (workflowName, str(ex))}
        else:
            try:
                jobs = couchDB.loadView(db, view, options = options)['rows']
            except Exception as ex:
                errorMsg = "Error on loading jobs for %s" % workflowName
                logging.warning("%s/n%s" % (str(ex), errorMsg))
                return {'status': 'error', 'message': errorMsg}
            
            for j in jobs:
                doc = {}
                doc["_id"]  = j['value']['id']
                doc["_rev"] = j['value']['rev']
                couchDB.queueDelete(doc)
            committed = couchDB.commit()
        
        if committed:
            #create the error report
            errorReport = {}
            deleted = 0
            status = "ok"
            for data in committed:
                if 'error' in data:
                    errorReport.setdefault(data['error'], 0)
                    errorReport[data['error']] += 1
                    status = "error"
                else:
                    deleted += 1
            return {'status': status, 'delete': deleted, 'message': errorReport}
        else:
            return {'status': 'warning', 'message': "no %s exist" % workflowName}


    def cleanAllLocalCouchDB(self, workflowName):
        logging.info("Deleting %s from JobCouch" % workflowName)
        
        jobReport = self.deleteWorkflowFromJobCouch(workflowName, "JobDump")
        logging.debug("%s docs deleted from JobDump", jobReport)
        
        fwjrReport = self.deleteWorkflowFromJobCouch(workflowName, "FWJRDump")
        logging.debug("%s docs deleted from FWJRDump", fwjrReport)
        
        summaryReport = self.deleteWorkflowFromJobCouch(workflowName, "SummaryStats")
        logging.debug("%s docs deleted from SummaryStats", summaryReport)
        
        wmstatsReport = self.deleteWorkflowFromJobCouch(workflowName, "WMStatsAgent")
        logging.debug("%s docs deleted from wmagent_summary", wmstatsReport)
        
        # if one of the procedures fails, return False
        if (jobReport["status"] == "error" or fwjrReport["status"] == "error" or
            wmstatsReport["status"] == "error"):
            return False
        # otherwise return True
        return True
        
    def cleanAlreadyArchivedWorkflows(self):
        """
        loop through the workflows in couchdb, if archived delete all the data in couchdb
        """
        
        numDeletedRequests = 0
        try:
            localWMStats = self.wmstatsCouchDB.getDBInstance()
            options = {"group_level": 1, "reduce": True}
            
            results = localWMStats.loadView("WMStatsAgent", "allWorkflows", options = options)['rows']
            requestNames = [x['key'] for x in results]
            logging.info("There are %s workfows to check for archived status" % len(requestNames))
            
            workflowDict = self.centralRequestDBReader.getStatusAndTypeByRequest(requestNames)
            
            for request, value in workflowDict.items():
                if value[0].endswith("-archived"):
                    self.cleanAllLocalCouchDB(request)
                    numDeletedRequests += 1
        
        except Exception as ex:
            errorMsg = "Error on loading workflow list from wmagent_summary db"
            logging.warning("%s/n%s" % (errorMsg, str(ex)))
            
        return numDeletedRequests
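
# A minimal, hypothetical driver (illustrative only) for the poller above; in a
# running agent the worker-thread harness calls setup() once and then invokes
# algorithm() on its polling interval.
def run_clean_couch_cycle(config):
    poller = CleanCouchPoller(config)
    poller.setup(parameters=None)
    poller.algorithm(parameters=None)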
Exemple #44
0
    def __init__(self, app, config, mount):
        self.base = config.base
        self.rootdir = '/'.join(WMCore.__file__.split('/')[:-1])
        if config and not isinstance(config, dict):
            web_config = config.dictionary_()
        elif config:  # a plain dict was passed
            web_config = config
        else:
            web_config = {'base': self.base}
        TemplatedPage.__init__(self, web_config)
        imgdir = os.environ.get('RM_IMAGESPATH', os.getcwd() + '/images')
        self.imgdir = web_config.get('imgdir', imgdir)
        cssdir = os.environ.get('RM_CSSPATH', os.getcwd() + '/css')
        self.cssdir = web_config.get('cssdir', cssdir)
        jsdir = os.environ.get('RM_JSPATH', os.getcwd() + '/js')
        self.jsdir = web_config.get('jsdir', jsdir)
        spdir = os.environ.get('RM_SPECPATH', os.getcwd() + '/specs')
        self.spdir = web_config.get('spdir', spdir)
        # read scripts area and initialize data-ops scripts
        self.sdir = os.environ.get('RM_SCRIPTS', os.getcwd() + '/scripts')
        self.sdir = web_config.get('sdir', self.sdir)
        self.sdict_thr = web_config.get('sdict_thr',
                                        600)  # put reasonable 10 min interval
        self.sdict = {'ts': time.time()}  # placeholder for data-ops scripts
        self.update_scripts(force=True)

        # To be filled at run time
        self.cssmap = {}
        self.jsmap = {}
        self.imgmap = {}
        self.yuimap = {}

        std_specs_dir = os.path.join(self.rootdir, 'WMSpec/StdSpecs')
        self.std_specs = spec_list(std_specs_dir, 'WMSpec.StdSpecs')
        self.std_specs.sort()

        # Update CherryPy configuration
        mime_types = ['text/css']
        mime_types += [
            'application/javascript', 'text/javascript',
            'application/x-javascript', 'text/x-javascript'
        ]
        cherryconf.update({
            'tools.encode.on': True,
            'tools.gzip.on': True,
            'tools.gzip.mime_types': mime_types,
        })
        self._cache = {}

        # initialize rest API
        statedir = '/tmp'
        app = RESTMain(config, statedir)  # REST application
        mount = '/rest'  # mount point for cherrypy service
        api = RestApiHub(app, config.reqmgr, mount)

        # initialize access to reqmgr2 APIs
        self.reqmgr = ReqMgr(config.reqmgr.reqmgr2_url)
        # only get the current view (this might make the response time much
        # longer; if an up-to-date view is not needed, override _noStale with False)
        self.reqmgr._noStale = True

        # admin helpers
        self.admin_info = Info(app, api, config.reqmgr, mount=mount + '/info')
        self.admin_group = Group(app,
                                 api,
                                 config.reqmgr,
                                 mount=mount + '/group')
        self.admin_team = Team(app, api, config.reqmgr, mount=mount + '/team')

        # get fields which we'll use in templates
        cdict = config.reqmgr.dictionary_()
        self.couch_url = cdict.get('couch_host', '')
        self.couch_dbname = cdict.get('couch_reqmgr_db', '')
        self.couch_wdbname = cdict.get('couch_workload_summary_db', '')
        self.acdc_url = cdict.get('acdc_host', '')
        self.acdc_dbname = cdict.get('acdc_db', '')
        self.configcache_url = cdict.get('couch_config_cache_url',
                                         self.couch_url)
        self.dbs_url = cdict.get('dbs_url', '')
        self.dqm_url = cdict.get('dqm_url', '')
        self.sw_ver = cdict.get('default_sw_version', 'CMSSW_5_2_5')
        self.sw_arch = cdict.get('default_sw_scramarch', 'slc5_amd64_gcc434')
class WorkQueueReqMgrInterface(object):
    """Helper class for ReqMgr interaction"""
    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        #TODO: (reqmgr2Only - remove this line when reqmgr is replaced)
        self.reqMgr = RequestManager(kwargs)
        # note: this will break the all-in-one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")

        # set the thread name before creating the log db;
        # only set it when it is not already set
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)

        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join(
                [x['RequestName'] for x in uptodate_elements])
        except Exception:
            self.logger.exception("Error caught during RequestManager update")
        else:
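            # try/else: deletion runs only if the ReqMgr status report above
            # succeeded, so finished work is never dropped before ReqMgr is updated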
            try:  # Delete finished requests from WorkQueue
                self.deleteFinishedWork(queue, uptodate_elements)
            except Exception:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info(
                'Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests(queue.params['Teams'])
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            #            try:
            #                self.reportRequestStatus(reqName, "negotiating")
            #            except Exception, ex:
            #                self.logger.error("""
            #                    Unable to update ReqMgr state to negotiating: %s
            #                    Ignoring this request: %s""" % (str(ex), reqName))
            #                continue

            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check it's not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None,
                            "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" %
                                 (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl,
                                        request=reqName,
                                        team=team)
                self.logdb.delete(reqName, "error", this_thread=True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - report back to ReqMgr
                self.logger.info(
                    'Permanent failure processing request "%s": %s' %
                    (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" %
                                 reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError,
                    CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' \
                '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' \
                '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            try:
                self.reportRequestStatus(reqName, "acquired")
            except Exception as ex:
                self.logger.warning("Unable to update ReqMgr state: %s" %
                                    str(ex))
                self.logger.warning('Will try again later')

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

            self.logger.info("%s element(s) obtained from RequestManager" %
                             work)
        return work

    def report(self, queue):
        """Report queue status to ReqMgr."""
        new_state = {}
        uptodate_elements = []
        now = time.time()

        elements = queue.statusInbox(dictKey="RequestName")
        if not elements:
            return new_state

        for ele in elements:
            ele = elements[ele][0]  # 1 element tuple
            try:
                request = self.reqMgr2.getRequestByNames(ele['RequestName'])
                if not request:
                    msg = 'Failed to get request "%s" from ReqMgr2. Will try again later.' % ele[
                        'RequestName']
                    self.logger.warning(msg)
                    continue
                request = request[ele['RequestName']]
                if request['RequestStatus'] in ('failed', 'completed',
                                                'announced', 'epic-FAILED',
                                                'closed-out', 'rejected'):
                    # requests can be done in reqmgr but running in workqueue
                    # if request has been closed but agent cleanup actions
                    # haven't been run (or agent has been retired)
                    # Prune out obviously too old ones to avoid build up
                    if queue.params.get('reqmgrCompleteGraceTime', -1) > 0:
                        if (now - float(ele.updatetime)
                            ) > queue.params['reqmgrCompleteGraceTime']:
                            # have to check all elements are at least running and are old enough
                            request_elements = queue.statusInbox(
                                WorkflowName=request['RequestName'])
                            if not any([
                                    x for x in request_elements
                                    if x['Status'] != 'Running'
                                    and not x.inEndState()
                            ]):
                                last_update = max([
                                    float(x.updatetime)
                                    for x in request_elements
                                ])
                                if (
                                        now - last_update
                                ) > queue.params['reqmgrCompleteGraceTime']:
                                    self.logger.info(
                                        "Finishing request %s as it is done in reqmgr"
                                        % request['RequestName'])
                                    queue.doneWork(
                                        WorkflowName=request['RequestName'])
                                    continue
                    else:
                        pass  # assume workqueue status will catch up later
                elif request['RequestStatus'] == 'aborted' or request[
                        'RequestStatus'] == 'force-complete':
                    queue.cancelWork(WorkflowName=request['RequestName'])
                # Check consistency of running-open/closed and the element closure status
                elif request['RequestStatus'] == 'running-open' and not ele.get(
                        'OpenForNewData', False):
                    self.reportRequestStatus(ele['RequestName'],
                                             'running-closed')
                elif request['RequestStatus'] == 'running-closed' and ele.get(
                        'OpenForNewData', False):
                    queue.closeWork(ele['RequestName'])
                # update request status if necessary
                elif ele['Status'] not in self._reqMgrToWorkQueueStatus(
                        request['RequestStatus']):
                    self.reportElement(ele)

                uptodate_elements.append(ele)
            except Exception as ex:
                msg = 'Error talking to ReqMgr about request "%s": %s'
                traceMsg = traceback.format_exc()
                self.logger.error(msg % (ele['RequestName'], traceMsg))

        return uptodate_elements

    def deleteFinishedWork(self, queue, elements):
        """Delete work from queue that is finished in ReqMgr"""
        finished = []
        for element in elements:
            if self._workQueueToReqMgrStatus(element['Status']) in ('aborted', 'failed', 'completed', 'announced',
                                                         'epic-FAILED', 'closed-out', 'rejected') \
                                                         and element.inEndState():
                finished.append(element['RequestName'])
        return queue.deleteWorkflows(*finished)

    def _getRequestsByTeamsAndStatus(self, status, teams=None):
        """
        TODO: this assumes one team per request - check whether this assumption
        is correct, and whether we actually use the team for this.
        Also switch to the byteamandstatus couch call instead of 
        """
        requests = self.reqMgr2.getRequestByStatus(status)
        # then sort by team name, then by priority
        # https://docs.python.org/2/howto/sorting.html
        if teams:
            results = {}
            for reqName, value in requests.items():
                if value["Teams"][0] in teams:
                    results[reqName] = value
            return results
        else:
            return requests

    def getAvailableRequests(self, teams):
        """
        Get available requests for the given teams and sort by team and priority
        returns [(team, request_name, request_spec_url)]
        """

        tempResults = self._getRequestsByTeamsAndStatus("assigned",
                                                        teams).values()
        filteredResults = []
        for request in tempResults:
            if "Teams" in request and len(request["Teams"]) == 1:
                filteredResults.append(request)
                self.logdb.delete(request["RequestName"],
                                  "error",
                                  this_thread=True)
            else:
                msg = "no team or more than one team (%s) are assigined: %s" % (
                    request.get("Teams", None), request["RequestName"])
                self.logger.error(msg)
                self.logdb.post(request["RequestName"], msg, 'error')
        filteredResults.sort(key=itemgetter('RequestPriority'), reverse=True)
        filteredResults.sort(key=lambda r: r["Teams"][0])
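        # two stable sorts: priority (descending) first, then team name, leave
        # the results grouped by team and priority-ordered within each team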

        results = [(x["Teams"][0], x["RequestName"], x["RequestWorkflow"])
                   for x in filteredResults]

        return results

    def reportRequestStatus(self, request, status, message=None):
        """Change state in RequestManager
           Optionally, take a message to append to the request
        """
        if message:
            self.logdb.post(request, str(message), 'info')
        reqmgrStatus = self._workQueueToReqMgrStatus(status)
        if reqmgrStatus:  # only send known states
            try:
                #TODO: try reqmgr1 call if it fails (reqmgr2Only - remove this line when reqmgr is replaced)
                self.reqMgr.reportRequestStatus(request, reqmgrStatus)
                # and replace with this (remove all the exception handling)
                #self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
            except HTTPException as ex:
                # an HTTPException with status 404 means this is a reqmgr2 request
                if ex.status == 404:
                    # try reqmgr2 call
                    msg = "%s : reqmgr2 request: %s" % (request, str(ex))
                    self.logdb.post(request, msg, 'info')
                    self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
                else:
                    msg = "%s : fail to update status with HTTP error: %s" % (
                        request, str(ex))
                    self.logdb.post(request, msg, 'warning')
                    raise ex
            except Exception as ex:
                msg = "%s : fail to update status will try later: %s" % (
                    request, str(ex))
                self.logdb.post(request, msg, 'warning')
                raise ex

    def markAcquired(self, request, url=None):
        """Mark request acquired"""
        self.reqMgr.putWorkQueue(request, url)

    def _workQueueToReqMgrStatus(self, status):
        """Map WorkQueue Status to that reported to ReqMgr"""
        statusMapping = {
            'Acquired': 'acquired',
            'Running': 'running-open',
            'Failed': 'failed',
            'Canceled': 'aborted',
            'CancelRequested': 'aborted',
            'Done': 'completed'
        }
        if status in statusMapping:
            # if wq status passed convert to reqmgr status
            return statusMapping[status]
        elif status in REQUEST_STATE_LIST:
            # if reqmgr status passed return reqmgr status
            return status
        else:
            # unknown status
            return None

    def _reqMgrToWorkQueueStatus(self, status):
        """Map ReqMgr status to that in a WorkQueue element, it is not a 1-1 relation"""
        statusMapping = {
            'acquired': ['Acquired'],
            'running': ['Running'],
            'running-open': ['Running'],
            'running-closed': ['Running'],
            'failed': ['Failed'],
            'aborted': ['Canceled', 'CancelRequested'],
            'force-complete': ['Canceled', 'CancelRequested'],
            'completed': ['Done']
        }
        if status in statusMapping:
            return statusMapping[status]
        else:
            return []
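    # Example (illustrative): _workQueueToReqMgrStatus('Canceled') returns
    # 'aborted', while _reqMgrToWorkQueueStatus('aborted') returns
    # ['Canceled', 'CancelRequested'] - the reverse mapping is one-to-many,
    # so these helpers are not strict inverses of each other.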

    def reportElement(self, element):
        """Report element to ReqMgr"""
        self.reportRequestStatus(element['RequestName'], element['Status'])

    def addNewElementsToOpenRequests(self, queue):
        """Add new elements to open requests which are in running-open state, only works adding new blocks from the input dataset"""
        self.logger.info(
            "Checking Request Manager for open requests and closing old ones")

        # First close any open inbox element which hasn't found anything new in a while
        queue.closeWork()
        self.report(queue)

        work = 0
        requests = []

        # Drain mode, don't pull any work into open requests. They will be closed if the queue stays in drain long enough
        if queue.params['DrainMode']:
            self.logger.info(
                'Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            requests = self._getRequestsByTeamsAndStatus(
                "running-open", queue.params['Teams']).keys()
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for reqName in requests:
            try:
                self.logger.info("Processing request %s" % (reqName))
                units = queue.addWork(requestName=reqName)
                self.logdb.delete(reqName, 'error', this_thread=True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - but at least it was split the first time. Log and skip.
                msg = 'Error adding further work to request "%s". Will try again later' \
                '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except (IOError, socket.error, CouchError,
                    CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' \
                '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' \
                '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) added to open requests" % work)
        return work
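
# A minimal, hypothetical sketch (endpoint values are illustrative) of one full
# WorkQueue/ReqMgr synchronization pass driven by WorkQueueReqMgrInterface.__call__.
def sync_queue_with_reqmgr(queue):
    reqMgrInt = WorkQueueReqMgrInterface(
        reqmgr2_endpoint='https://cmsweb.cern.ch/reqmgr2',
        central_logdb_url='https://cmsweb.cern.ch/couchdb/wmstats_logdb',
        log_reporter='workqueue_sync')
    reqMgrInt(queue)  # pull new work, extend open requests, report, delete finished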
class CleanCouchPoller(BaseWorkerThread):
    """
    Cleans up local couch db according the the given condition.
    1. Cleans local couch db when request is completed and reported to cental db.
       This will clean up local couchdb, local summary db, local queue
       
    2. Cleans old couchdoc which is created older than the time threshold
    
    """
    def __init__(self, config):
        """
        Initialize config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        
    def setup(self, parameters):
        """
        Called at startup
        """
        # set the connection for local couchDB call
        self.useReqMgrForCompletionCheck   = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        self.archiveDelayHours   = getattr(self.config.TaskArchiver, 'archiveDelayHours', 0)
        self.wmstatsCouchDB = WMStatsWriter(self.config.TaskArchiver.localWMStatsURL, 
                                            "WMStatsAgent")
        
        #TODO: we might need to use local db for Tier0
        self.centralRequestDBReader = RequestDBReader(self.config.AnalyticsDataCollector.centralRequestDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
        
        if self.useReqMgrForCompletionCheck:
            self.deletableState = "announced"
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
            if self.config.TaskArchiver.reqmgr2Only:
                self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            else:
                #TODO: remove this for reqmgr2
                self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})
        else:
            # Tier0 case
            self.deletableState = "completed"
            # use local for update
            self.centralRequestDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
        
        jobDBurl = sanitizeURL(self.config.JobStateMachine.couchurl)['url']
        jobDBName = self.config.JobStateMachine.couchDBName
        self.jobCouchdb  = CouchServer(jobDBurl)
        self.jobsdatabase = self.jobCouchdb.connectDatabase("%s/jobs" % jobDBName)
        self.fwjrdatabase = self.jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        
        statSummaryDBName = self.config.JobStateMachine.summaryStatsDBName
        self.statsumdatabase = self.jobCouchdb.connectDatabase(statSummaryDBName)

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            logging.info("Cleaning up the old request docs")
            report = self.wmstatsCouchDB.deleteOldDocs(self.config.TaskArchiver.DataKeepDays)
            logging.info("%s docs deleted" % report)
            logging.info("getting complete and announced requests")
            
            endTime = int(time.time()) - self.archiveDelayHours * 3600
            deletableWorkflows = self.centralRequestDBReader.getRequestByStatusAndStartTime(self.deletableState, 
                                                                                            False, endTime)
            logging.info("Ready to archive normal %s workflows" % len(deletableWorkflows))
            numUpdated = self.archiveWorkflows(deletableWorkflows, "normal-archived")
            logging.info("archive normal %s workflows" % numUpdated)
            
            abortedWorkflows = self.centralRequestDBReader.getRequestByStatus(["aborted-completed"])
            logging.info("Ready to archive aborted %s workflows" % len(abortedWorkflows))
            numUpdated = self.archiveWorkflows(abortedWorkflows, "aborted-archived")
            logging.info("archive aborted %s workflows" % numUpdated)
            
            rejectedWorkflows = self.centralRequestDBReader.getRequestByStatus(["rejected"])
            logging.info("Ready to archive rejected %s workflows" % len(rejectedWorkflows))
            numUpdated = self.archiveWorkflows(rejectedWorkflows, "rejected-archived")
            logging.info("archive rejected %s workflows" % numUpdated)

        except Exception as ex:
            logging.error(str(ex))
            logging.error("Error occurred, will try again next cycle")
    
    def archiveWorkflows(self, workflows, archiveState):
        updated = 0
        for workflowName in workflows:
            if self.cleanAllLocalCouchDB(workflowName):
                if self.useReqMgrForCompletionCheck:
                    
                    if self.config.TaskArchiver.reqmgr2Only:
                        self.reqmgr2Svc.updateRequestStatus(workflowName, archiveState)
                    else:
                        self.reqmgrSvc.updateRequestStatus(workflowName, archiveState)
                    updated += 1 
                    logging.debug("status updated to %s %s" % (archiveState, workflowName))
                else:
                    self.centralRequestDBWriter.updateRequestStatus(workflowName, archiveState)
        return updated
    
    def deleteWorkflowFromJobCouch(self, workflowName, db):
        """
        _deleteWorkflowFromCouch_

        If we are asked to delete the workflow from couch, delete it
        to clear up some space.

        Load the document IDs and revisions out of couch by workflowName,
        then order a delete on them.
        """
        if (db == "JobDump"):
            couchDB = self.jobsdatabase
            view = "jobsByWorkflowName"
        elif (db == "FWJRDump"):
            couchDB = self.fwjrdatabase
            view = "fwjrsByWorkflowName"
        elif (db == "SummaryStats"):
            couchDB = self.statsumdatabase
            view = None
        elif (db == "WMStats"):
            couchDB = self.wmstatsCouchDB.getDBInstance()
            view = "jobsByStatusWorkflow"
        
        if view == None:
            try:
                committed = couchDB.delete_doc(workflowName)
            except CouchNotFoundError as ex:
                return {'status': 'warning', 'message': "%s: %s" % (workflowName, str(ex))}
        else:
            options = {"startkey": [workflowName], "endkey": [workflowName, {}], "reduce": False}
            try:
                jobs = couchDB.loadView(db, view, options = options)['rows']
            except Exception as ex:
                errorMsg = "Error on loading jobs for %s" % workflowName
                logging.warning("%s/n%s" % (str(ex), errorMsg))
                return {'status': 'error', 'message': errorMsg}
            
            for j in jobs:
                doc = {}
                doc["_id"]  = j['value']['id']
                doc["_rev"] = j['value']['rev']
                couchDB.queueDelete(doc)
            committed = couchDB.commit()
        
        if committed:
            #create the error report
            errorReport = {}
            deleted = 0
            status = "ok"
            for data in committed:
                if 'error' in data:
                    errorReport.setdefault(data['error'], 0)
                    errorReport[data['error']] += 1
                    status = "error"
                else:
                    deleted += 1
            return {'status': status, 'delete': deleted, 'message': errorReport}
        else:
            return {'status': 'warning', 'message': "no %s exist" % workflowName}


    def cleanAllLocalCouchDB(self, workflowName):
        logging.info("Deleting %s from JobCouch" % workflowName)
        
        jobReport = self.deleteWorkflowFromJobCouch(workflowName, "JobDump")
        logging.debug("JobDump deletion report: %s" % jobReport)

        fwjrReport = self.deleteWorkflowFromJobCouch(workflowName, "FWJRDump")
        logging.debug("FWJRDump deletion report: %s" % fwjrReport)

        summaryReport = self.deleteWorkflowFromJobCouch(workflowName, "SummaryStats")
        logging.debug("SummaryStats deletion report: %s" % summaryReport)

        wmstatsReport = self.deleteWorkflowFromJobCouch(workflowName, "WMStats")
        logging.debug("wmagent_summary deletion report: %s" % wmstatsReport)
        
        # if one of the procedures fails, return False
        if (jobReport["status"] == "error" or fwjrReport["status"] == "error" or
                wmstatsReport["status"] == "error"):
            return False
        # otherwise return True
        return True
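
A quick way to sanity-check the commit handling at the end of deleteWorkflowFromJobCouch() is to run the report aggregation on its own. Below is a minimal, self-contained sketch of that logic; the helper name summarizeCommit and the sample response list are illustrative, not part of the original class, and only assume that failed CouchDB bulk operations carry an 'error' key, as the code above does.

def summarizeCommit(committed):
    """Tally a CouchDB bulk-commit response into the status report shape
    returned by deleteWorkflowFromJobCouch()."""
    errorReport = {}
    deleted = 0
    status = "ok"
    for data in committed:
        if 'error' in data:
            errorReport.setdefault(data['error'], 0)
            errorReport[data['error']] += 1
            status = "error"
        else:
            deleted += 1
    return {'status': status, 'delete': deleted, 'message': errorReport}

# Two successful deletions and one conflict:
print(summarizeCommit([{'ok': True}, {'ok': True}, {'error': 'conflict'}]))
# -> {'status': 'error', 'delete': 2, 'message': {'conflict': 1}}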
        
Example #47
class TaskArchiverPoller(BaseWorkerThread):
    """
    Polls for Ended jobs

    List of attributes

    requireCouch:  raise an exception on couch failure instead of ignoring
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.config = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
                self.workQueue = localQueue(
                    **self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.timeout = getattr(self.config.TaskArchiver, "timeOut", None)
        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if not self.useReqMgrForCompletionCheck:
            #sets the local monitor summary couch db
            self.requestLocalCouchDB = RequestDBWriter(
                self.config.AnalyticsDataCollector.localT0RequestDBURL,
                couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
            self.centralCouchDBWriter = self.requestLocalCouchDB
        else:
            self.centralCouchDBWriter = RequestDBWriter(
                self.config.AnalyticsDataCollector.centralRequestDBURL)

            self.reqmgr2Svc = ReqMgr(
                self.config.TaskArchiver.ReqMgr2ServiceURL)
            #TODO: remove this when reqmgr2 replace reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager(
                {'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})

        #Load the cleanout state ID and save it
        stateIDDAO = self.daoFactory(classname="Jobs.GetStateID")
        self.stateID = stateIDDAO.execute("cleanout")

        return

    def terminate(self, params):
        """
        _terminate_

        This function terminates the job after a final pass
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Executes the two main methods of the poller:
        1. findAndMarkFinishedSubscriptions
        2. completeTasks
        Final result is that finished workflows get their summary built and uploaded to couch,
        and all traces of them are removed from the agent WMBS and couch (this last one on demand).
        """
        try:
            self.findAndMarkFinishedSubscriptions()
            (finishedwfs, finishedwfsWithLogCollectAndCleanUp
             ) = self.getFinishedWorkflows()
            # set the data cache which can be used by other threads (no other thread should set the data cache)
            DataCache.setFinishedWorkflows(finishedwfsWithLogCollectAndCleanUp)
            self.completeTasks(finishedwfs)
        except WMException:
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', False) \
                   and getattr(myThread.transaction, 'transaction', False):
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            myThread = threading.currentThread()
            msg = "Caught exception in TaskArchiver\n"
            msg += str(ex)
            if getattr(myThread, 'transaction', False) \
                   and getattr(myThread.transaction, 'transaction', False):
                myThread.transaction.rollback()
            raise TaskArchiverPollerException(msg)

        return

    def findAndMarkFinishedSubscriptions(self):
        """
        _findAndMarkFinishedSubscriptions_

        Find new finished subscriptions and mark as finished in WMBS.
        """
        myThread = threading.currentThread()

        myThread.transaction.begin()

        #Get the subscriptions that are now finished and mark them as such
        logging.info("Polling for finished subscriptions")
        finishedSubscriptions = self.daoFactory(
            classname="Subscriptions.MarkNewFinishedSubscriptions")
        finishedSubscriptions.execute(self.stateID, timeOut=self.timeout)
        logging.info("Finished subscriptions updated")

        myThread.transaction.commit()

        return

    def getFinishedWorkflows(self):
        """
        1. Get finished workflows (a finished workflow is defined in Workflow.GetFinishedWorkflows)
        2. Get finished workflows with logCollect and Cleanup only.
        3. Combine those and return:
           finishedwfs - without LogCollect and CleanUp tasks
           finishedwfsWithLogCollectAndCleanUp - including LogCollect and CleanUp tasks
        """

        finishedWorkflowsDAO = self.daoFactory(
            classname="Workflow.GetFinishedWorkflows")
        finishedwfs = finishedWorkflowsDAO.execute()
        finishedLogCollectAndCleanUpwfs = finishedWorkflowsDAO.execute(
            onlySecondary=True)
        finishedwfsWithLogCollectAndCleanUp = {}
        for wf in finishedLogCollectAndCleanUpwfs:
            if wf in finishedwfs:
                finishedwfsWithLogCollectAndCleanUp[wf] = finishedwfs[wf]
        return (finishedwfs, finishedwfsWithLogCollectAndCleanUp)

    def completeTasks(self, finishedwfs):
        """
        _completeTasks_

        This method will call several auxiliary methods to do the following:
        
        1. Notify the WorkQueue about finished subscriptions
        2. Update the dbsbuffer_workflow table with finished subscriptions
        """

        #Only delete those where the upload and notification succeeded
        logging.info("Found %d candidate workflows for completing: %s" %
                     (len(finishedwfs), finishedwfs.keys()))
        # update the completed flag in dbsbuffer_workflow table so blocks can be closed
        # create updateDBSBufferWorkflowComplete DAO
        if len(finishedwfs) == 0:
            return

        completedWorkflowsDAO = self.dbsDaoFactory(
            classname="UpdateWorkflowsToCompleted")

        centralCouchAlive = True
        try:
            #TODO: need to enable when reqmgr2 -wmstats is ready
            #abortedWorkflows = self.reqmgrCouchDBWriter.getRequestByStatus(["aborted"], format = "dict");
            abortedWorkflows = self.centralCouchDBWriter.getRequestByStatus(
                ["aborted"])
            logging.info(
                "There are %d requests in 'aborted' status in central couch." %
                len(abortedWorkflows))
            forceCompleteWorkflows = self.centralCouchDBWriter.getRequestByStatus(
                ["force-complete"])
            logging.info(
                "List of 'force-complete' workflows in central couch: %s" %
                forceCompleteWorkflows)

        except Exception as ex:
            centralCouchAlive = False
            logging.error(
                "we will try again when remote couch server comes back\n%s" %
                str(ex))

        if centralCouchAlive:
            for workflow in finishedwfs:
                try:
                    #Notify the WorkQueue, if there is one
                    if self.workQueue is not None:
                        subList = []
                        logging.info("Marking subscriptions as Done ...")
                        for l in finishedwfs[workflow]["workflows"].values():
                            subList.extend(l)
                        self.notifyWorkQueue(subList)

                    #Now we know the workflow as a whole is gone, we can delete the information from couch
                    if not self.useReqMgrForCompletionCheck:
                        self.requestLocalCouchDB.updateRequestStatus(
                            workflow, "completed")
                        logging.info("status updated to completed %s" %
                                     workflow)

                    if workflow in abortedWorkflows:
                        #TODO: remove when reqmgr2-wmstats deployed
                        newState = "aborted-completed"
                    elif workflow in forceCompleteWorkflows:
                        newState = "completed"
                    else:
                        newState = None

                    if newState is not None:
                        # update the reqmgr workload document only if ReqMgr is installed
                        if not self.useReqMgrForCompletionCheck:
                            # commented out until the whole agent is updated so every request has the new state
                            # TODO: the agent should be able to write to the reqmgr db directly, add the right group in
                            # reqmgr
                            self.requestLocalCouchDB.updateRequestStatus(
                                workflow, newState)
                        else:
                            try:
                                #TODO: try reqmgr1 call if it fails (reqmgr2Only - remove this line when reqmgr is replaced)
                                logging.info(
                                    "Updating status to '%s' in both oracle and couchdb ..."
                                    % newState)
                                self.reqmgrSvc.updateRequestStatus(
                                    workflow, newState)
                                # And replace with this - remove all the exception handling
                                #self.reqmgr2Svc.updateRequestStatus(workflow, newState)
                            except httplib.HTTPException as ex:
                                # An HTTPException with status 404 means this is a reqmgr2 request
                                if ex.status == 404:
                                    # try reqmgr2 call
                                    msg = "%s : reqmgr2 request: %s" % (
                                        workflow, str(ex))
                                    logging.warning(msg)
                                    self.reqmgr2Svc.updateRequestStatus(
                                        workflow, newState)
                                else:
                                    msg = "%s : fail to update status %s  with HTTP error: %s" % (
                                        workflow, newState, str(ex))
                                    logging.error(msg)
                                    raise ex

                        logging.info("status updated to '%s' : %s" %
                                     (newState, workflow))

                    completedWorkflowsDAO.execute([workflow])

                except TaskArchiverPollerException as ex:

                    #Something didn't go well when notifying the workqueue, abort!!!
                    logging.error(
                        "Something bad happened while archiving tasks.")
                    logging.error(str(ex))
                    continue
                except Exception as ex:
                    #Something didn't go well on couch, abort!!!
                    msg = "Problem while archiving tasks for workflow %s\n" % workflow
                    msg += "Exception message: %s" % str(ex)
                    msg += "\nTraceback: %s" % traceback.format_exc()
                    logging.error(msg)
                    continue
        return

    def notifyWorkQueue(self, subList):
        """
        _notifyWorkQueue_

        Tells the workQueue component that a particular subscription,
        or set of subscriptions, is done.  Receives confirmation
        """

        for sub in subList:
            try:
                self.workQueue.doneWork(SubscriptionId=sub)
            except WorkQueueNoMatchingElements:
                #Subscription wasn't known to WorkQueue, feel free to clean up
                logging.debug(
                    "Local WorkQueue knows nothing about this subscription: %s"
                    % sub)
                pass
            except Exception as ex:
                msg = "Error talking to workqueue: %s\n" % str(ex)
                msg += "Tried to complete the following: %s\n" % sub
                raise TaskArchiverPollerException(msg)

        return
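
The branching in completeTasks() that picks the next request state is easy to test in isolation. Below is a hypothetical helper, decideArchiveState(), mirroring that decision; the function name and the sample inputs are made up, while the state strings are the ones used above.

def decideArchiveState(workflow, abortedWorkflows, forceCompleteWorkflows):
    """Aborted requests move to aborted-completed, force-complete ones to
    completed, anything else is left untouched (None)."""
    if workflow in abortedWorkflows:
        return "aborted-completed"
    elif workflow in forceCompleteWorkflows:
        return "completed"
    return None

print(decideArchiveState("wf_x", {"wf_x"}, set()))  # -> aborted-completed
print(decideArchiveState("wf_y", set(), {"wf_y"}))  # -> completed
print(decideArchiveState("wf_z", set(), set()))     # -> None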
Example #48
class WorkQueueReqMgrInterface(object):
    """Helper class for ReqMgr interaction"""

    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # this will break the all-in-one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")

        # set the thread name before creating the log db;
        # only set it when it is not already set
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)

        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join([x['RequestName'] for x in uptodate_elements])
        except Exception:
            self.logger.exception("Error caught during RequestManager update")
        else:
            try:  # Delete finished requests from WorkQueue
                self.deleteFinishedWork(queue, uptodate_elements)
            except Exception:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests()
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check it's not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl, request=reqName, team=team)
                self.logdb.delete(reqName, "error", this_thread=True)
            except TERMINAL_EXCEPTIONS as ex:
                # fatal error - report back to ReqMgr
                self.logger.error('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log the exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            try:
                self.reportRequestStatus(reqName, "acquired")
            except Exception as ex:
                self.logger.warning("Unable to update ReqMgr state: %s" % str(ex))
                self.logger.warning('Will try again later')

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

            self.logger.info("%s element(s) obtained from RequestManager" % work)
        return work

    def report(self, queue):
        """Report queue status to ReqMgr."""
        new_state = {}
        uptodate_elements = []
        now = time.time()

        elements = queue.statusInbox(dictKey="RequestName")
        if not elements:
            return new_state

        for ele in elements:
            ele = elements[ele][0]  # 1 element tuple
            try:
                request = self.reqMgr2.getRequestByNames(ele['RequestName'])
                if not request:
                    msg = 'Failed to get request "%s" from ReqMgr2. Will try again later.' % ele['RequestName']
                    self.logger.warning(msg)
                    continue
                request = request[0][ele['RequestName']]
                if request['RequestStatus'] in ('failed', 'completed', 'announced',
                                                'epic-FAILED', 'closed-out', 'rejected'):
                    # requests can be done in reqmgr but running in workqueue
                    # if request has been closed but agent cleanup actions
                    # haven't been run (or agent has been retired)
                    # Prune out obviously too old ones to avoid build up
                    if queue.params.get('reqmgrCompleteGraceTime', -1) > 0:
                        if (now - float(ele.updatetime)) > queue.params['reqmgrCompleteGraceTime']:
                            # have to check all elements are at least running and are old enough
                            request_elements = queue.statusInbox(WorkflowName=request['RequestName'])
                            if not any(
                                    [x for x in request_elements if x['Status'] != 'Running' and not x.inEndState()]):
                                last_update = max([float(x.updatetime) for x in request_elements])
                                if (now - last_update) > queue.params['reqmgrCompleteGraceTime']:
                                    self.logger.info(
                                        "Finishing request %s as it is done in reqmgr" % request['RequestName'])
                                    queue.doneWork(WorkflowName=request['RequestName'])
                                    continue
                    else:
                        pass  # assume workqueue status will catch up later
                elif request['RequestStatus'] in ['aborted', 'force-complete']:
                    queue.cancelWork(WorkflowName=request['RequestName'])
                # Check consistency of running-open/closed and the element closure status
                elif request['RequestStatus'] == 'running-open' and not ele.get('OpenForNewData', False):
                    self.reportRequestStatus(ele['RequestName'], 'running-closed')
                elif request['RequestStatus'] == 'running-closed' and ele.get('OpenForNewData', False):
                    queue.closeWork(ele['RequestName'])
                # we do not want to move the request to 'failed' status
                elif ele['Status'] == 'Failed':
                    continue
                elif ele['Status'] not in self._reqMgrToWorkQueueStatus(request['RequestStatus']):
                    self.reportElement(ele)

                uptodate_elements.append(ele)
            except Exception as ex:
                msg = 'Error talking to ReqMgr about request "%s": %s' % (ele['RequestName'], str(ex))
                self.logger.exception(msg)

        return uptodate_elements

    def deleteFinishedWork(self, queue, elements):
        """Delete work from queue that is finished in ReqMgr"""
        finished = []
        for element in elements:
            if element.inEndState() and self._workQueueToReqMgrStatus(element['Status']) in ('aborted', 'failed', 'completed',
                                                                                             'announced', 'epic-FAILED',
                                                                                             'closed-out', 'rejected'):
                finished.append(element['RequestName'])
        return queue.deleteWorkflows(*finished)

    def getAvailableRequests(self):
        """
        Get available requests and sort by team and priority
        returns [(team, request_name, request_spec_url)]
        """

        tempResults = self.reqMgr2.getRequestByStatus("assigned")
        filteredResults = []
        for requests in tempResults:
            for request in requests.values():
                filteredResults.append(request)
        filteredResults.sort(key=itemgetter('RequestPriority'), reverse=True)
        filteredResults.sort(key=lambda r: r["Teams"][0])

        results = [(x["Teams"][0], x["RequestName"], x["RequestWorkflow"]) for x in filteredResults]

        return results

    def reportRequestStatus(self, request, status, message=None):
        """Change state in RequestManager
           Optionally, take a message to append to the request
        """
        if message:
            self.logdb.post(request, str(message), 'info')
        reqmgrStatus = self._workQueueToReqMgrStatus(status)
        if reqmgrStatus:  # only send known states
            try:
                self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
            except Exception as ex:
                msg = "%s : fail to update status will try later: %s" % (request, str(ex))
                msg += traceback.format_exc()
                self.logdb.post(request, msg, 'warning')
                raise ex
        return

    def _workQueueToReqMgrStatus(self, status):
        """Map WorkQueue Status to that reported to ReqMgr"""
        statusMapping = {'Acquired': 'acquired',
                         'Running': 'running-open',
                         'Failed': 'failed',
                         'Canceled': 'aborted',
                         'CancelRequested': 'aborted',
                         'Done': 'completed'
                        }
        if status in statusMapping:
            # if wq status passed convert to reqmgr status
            return statusMapping[status]
        elif status in REQUEST_STATE_LIST:
            # if reqmgr status passed return reqmgr status
            return status
        else:
            # unknown status
            return None

    def _reqMgrToWorkQueueStatus(self, status):
        """Map ReqMgr status to that in a WorkQueue element, it is not a 1-1 relation"""
        statusMapping = {'acquired': ['Acquired'],
                         'running': ['Running'],
                         'running-open': ['Running'],
                         'running-closed': ['Running'],
                         'failed': ['Failed'],
                         'aborted': ['Canceled', 'CancelRequested'],
                         'force-complete': ['Canceled', 'CancelRequested'],
                         'completed': ['Done']}
        if status in statusMapping:
            return statusMapping[status]
        else:
            return []

    def reportElement(self, element):
        """Report element to ReqMgr"""
        self.reportRequestStatus(element['RequestName'], element['Status'])

    def addNewElementsToOpenRequests(self, queue):
        """Add new elements to open requests which are in running-open state, only works adding new blocks from the input dataset"""
        self.logger.info("Checking Request Manager for open requests and closing old ones")

        # First close any open inbox element which hasn't found anything new in a while
        queue.closeWork()
        self.report(queue)

        work = 0
        requests = []

        # Drain mode, don't pull any work into open requests. They will be closed if the queue stays in drain long enough
        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            requests = self.reqMgr2.getRequestByStatus("running-open", detail=False)
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for reqName in requests:
            try:
                self.logger.info("Processing request %s" % (reqName))
                units = queue.addWork(requestName=reqName)
                self.logdb.delete(reqName, 'error', True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - but at least it was split the first time. Log and skip.
                msg = 'Error adding further work to request "%s". Will try again later' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                traceMsg = traceback.format_exc()
                msg = "%s\n%s" % (msg, traceMsg)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) added to open requests" % work)
        return work
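
The two consecutive sorts in getAvailableRequests() depend on Python's sort being stable: sorting by RequestPriority first (descending) and then by team leaves the requests grouped by team, highest priority first within each group. A small demonstration with made-up request dicts (team and workflow names are illustrative):

from operator import itemgetter

requests = [
    {"Teams": ["production"], "RequestName": "wf_a", "RequestPriority": 90000},
    {"Teams": ["relval"], "RequestName": "wf_b", "RequestPriority": 500000},
    {"Teams": ["production"], "RequestName": "wf_c", "RequestPriority": 250000},
]
requests.sort(key=itemgetter('RequestPriority'), reverse=True)  # priority, high to low
requests.sort(key=lambda r: r["Teams"][0])                      # stable: group by team
print([(r["Teams"][0], r["RequestName"]) for r in requests])
# -> [('production', 'wf_c'), ('production', 'wf_a'), ('relval', 'wf_b')]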
Example #49
class TaskArchiverPoller(BaseWorkerThread):
    """
    Polls for Ended jobs

    List of attributes

    requireCouch:  raise an exception on couch failure instead of ignoring
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        
        self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer",
                                     logger = myThread.logger, 
                                     dbinterface = myThread.dbi)

        self.config      = config
        self.jobCacheDir = self.config.JobCreator.jobCacheDir

        if getattr(self.config.TaskArchiver, "useWorkQueue", False) != False:
            # Get workqueue setup from config unless overridden
            if hasattr(self.config.TaskArchiver, 'WorkQueueParams'):
                self.workQueue = localQueue(**self.config.TaskArchiver.WorkQueueParams)
            else:
                from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
                self.workQueue = queueFromConfig(self.config)
        else:
            self.workQueue = None

        self.timeout           = getattr(self.config.TaskArchiver, "timeOut", None)
        self.useReqMgrForCompletionCheck   = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)
        
        if not self.useReqMgrForCompletionCheck:
            #sets the local monitor summary couch db
            self.requestLocalCouchDB = RequestDBWriter(self.config.AnalyticsDataCollector.localT0RequestDBURL, 
                                                   couchapp = self.config.AnalyticsDataCollector.RequestCouchApp)
            self.centralCouchDBWriter = self.requestLocalCouchDB
        else:
            self.centralCouchDBWriter = RequestDBWriter(self.config.AnalyticsDataCollector.centralRequestDBURL)
            
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            #TODO: remove this when reqmgr2 replace reqmgr completely (reqmgr2Only)
            self.reqmgrSvc = RequestManager({'endpoint': self.config.TaskArchiver.ReqMgrServiceURL})

        #Load the cleanout state ID and save it
        stateIDDAO = self.daoFactory(classname = "Jobs.GetStateID")
        self.stateID = stateIDDAO.execute("cleanout")

        return

    def terminate(self, params):
        """
        _terminate_

        This function terminates the job after a final pass
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    def algorithm(self, parameters = None):
        """
        _algorithm_

        Executes the two main methods of the poller:
        1. findAndMarkFinishedSubscriptions
        2. completeTasks
        Final result is that finished workflows get their summary built and uploaded to couch,
        and all traces of them are removed from the agent WMBS and couch (this last one on demand).
        """
        try:
            self.findAndMarkFinishedSubscriptions()
            (finishedwfs, finishedwfsWithLogCollectAndCleanUp) = self.getFinishedWorkflows()
            # set the data cache which can be used by other threads (no other thread should set the data cache)
            DataCache.setFinishedWorkflows(finishedwfsWithLogCollectAndCleanUp)
            self.completeTasks(finishedwfs)
        except WMException:
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', False) \
                   and getattr(myThread.transaction, 'transaction', False):
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            myThread = threading.currentThread()
            msg = "Caught exception in TaskArchiver\n"
            msg += str(ex)
            if getattr(myThread, 'transaction', False) \
                   and getattr(myThread.transaction, 'transaction', False):
                myThread.transaction.rollback()
            raise TaskArchiverPollerException(msg)

        return

    def findAndMarkFinishedSubscriptions(self):
        """
        _findAndMarkFinishedSubscriptions_

        Find new finished subscriptions and mark as finished in WMBS.
        """
        myThread = threading.currentThread()

        myThread.transaction.begin()

        #Get the subscriptions that are now finished and mark them as such
        logging.info("Polling for finished subscriptions")
        finishedSubscriptions = self.daoFactory(classname = "Subscriptions.MarkNewFinishedSubscriptions")
        finishedSubscriptions.execute(self.stateID, timeOut = self.timeout)
        logging.info("Finished subscriptions updated")

        myThread.transaction.commit()

        return
    
    
    def getFinishedWorkflows(self):
        """
        1. Get finished workflows (a finished workflow is defined in Workflow.GetFinishedWorkflows)
        2. Get finished workflows with logCollect and Cleanup only.
        3. Combine those and return:
           finishedwfs - without LogCollect and CleanUp tasks
           finishedwfsWithLogCollectAndCleanUp - including LogCollect and CleanUp tasks
        """
        
        finishedWorkflowsDAO = self.daoFactory(classname = "Workflow.GetFinishedWorkflows")
        finishedwfs = finishedWorkflowsDAO.execute()
        finishedLogCollectAndCleanUpwfs = finishedWorkflowsDAO.execute(onlySecondary=True)
        finishedwfsWithLogCollectAndCleanUp = {}
        for wf in finishedLogCollectAndCleanUpwfs:
            if wf in finishedwfs:
                finishedwfsWithLogCollectAndCleanUp[wf] = finishedwfs[wf]
        return (finishedwfs, finishedwfsWithLogCollectAndCleanUp)
        
    def completeTasks(self, finishedwfs):
        """
        _completeTasks_

        This method will call several auxiliary methods to do the following:
        
        1. Notify the WorkQueue about finished subscriptions
        2. Update the dbsbuffer_workflow table with finished subscriptions
        """


        #Only delete those where the upload and notification succeeded
        logging.info("Found %d candidate workflows for completing: %s" % (len(finishedwfs),finishedwfs.keys()))
        # update the completed flag in dbsbuffer_workflow table so blocks can be closed
        # create updateDBSBufferWorkflowComplete DAO
        if len(finishedwfs) == 0:
            return
        
        completedWorkflowsDAO = self.dbsDaoFactory(classname = "UpdateWorkflowsToCompleted")
        
        centralCouchAlive = True
        try:
            #TODO: need to enable when reqmgr2 -wmstats is ready
            #abortedWorkflows = self.reqmgrCouchDBWriter.getRequestByStatus(["aborted"], format = "dict");
            abortedWorkflows = self.centralCouchDBWriter.getRequestByStatus(["aborted"])
            logging.info("There are %d requests in 'aborted' status in central couch." % len(abortedWorkflows))
            forceCompleteWorkflows = self.centralCouchDBWriter.getRequestByStatus(["force-complete"])
            logging.info("List of 'force-complete' workflows in central couch: %s" % forceCompleteWorkflows)
            
        except Exception as ex:
            centralCouchAlive = False
            logging.error("we will try again when remote couch server comes back\n%s" % str(ex))
        
        if centralCouchAlive:
            for workflow in finishedwfs:
                try:
                    #Notify the WorkQueue, if there is one
                    if self.workQueue is not None:
                        subList = []
                        logging.info("Marking subscriptions as Done ...")
                        for l in finishedwfs[workflow]["workflows"].values():
                            subList.extend(l)
                        self.notifyWorkQueue(subList)
                    
                    #Now we know the workflow as a whole is gone, we can delete the information from couch
                    if not self.useReqMgrForCompletionCheck:
                        self.requestLocalCouchDB.updateRequestStatus(workflow, "completed")
                        logging.info("status updated to completed %s" % workflow)
    
                    if workflow in abortedWorkflows:
                        #TODO: remove when reqmgr2-wmstats deployed
                        newState = "aborted-completed"
                    elif workflow in forceCompleteWorkflows:
                        newState = "completed"
                    else:
                        newState = None
                        
                    if newState is not None:
                        # update the reqmgr workload document only if ReqMgr is installed
                        if not self.useReqMgrForCompletionCheck:
                            # commented out until the whole agent is updated so every request has the new state
                            # TODO: the agent should be able to write to the reqmgr db directly, add the right group in
                            # reqmgr
                            self.requestLocalCouchDB.updateRequestStatus(workflow, newState)
                        else:
                            try:
                                #TODO: try reqmgr1 call if it fails (reqmgr2Only - remove this line when reqmgr is replaced)
                                logging.info("Updating status to '%s' in both oracle and couchdb ..." % newState)
                                self.reqmgrSvc.updateRequestStatus(workflow, newState)
                                # And replace with this - remove all the exception handling
                                #self.reqmgr2Svc.updateRequestStatus(workflow, newState)
                            except httplib.HTTPException as ex:
                                # An HTTPException with status 404 means this is a reqmgr2 request
                                if ex.status == 404:
                                    # try reqmgr2 call
                                    msg = "%s : reqmgr2 request: %s" % (workflow, str(ex))
                                    logging.warning(msg)
                                    self.reqmgr2Svc.updateRequestStatus(workflow, newState)
                                else:
                                    msg = "%s : fail to update status %s  with HTTP error: %s" % (workflow, newState, str(ex))
                                    logging.error(msg)
                                    raise ex
                            
                        logging.info("status updated to '%s' : %s" % (newState, workflow))
                    
                    completedWorkflowsDAO.execute([workflow])
        
                except TaskArchiverPollerException as ex:

                    #Something didn't go well when notifying the workqueue, abort!!!
                    logging.error("Something bad happened while archiving tasks.")
                    logging.error(str(ex))
                    continue
                except Exception as ex:
                    #Something didn't go well on couch, abort!!!
                    msg = "Problem while archiving tasks for workflow %s\n" % workflow
                    msg += "Exception message: %s" % str(ex)
                    msg += "\nTraceback: %s" % traceback.format_exc()
                    logging.error(msg)
                    continue
        return
    
    def notifyWorkQueue(self, subList):
        """
        _notifyWorkQueue_

        Tells the workQueue component that a particular subscription,
        or set of subscriptions, is done.  Receives confirmation
        """

        for sub in subList:
            try:
                self.workQueue.doneWork(SubscriptionId = sub)
            except WorkQueueNoMatchingElements:
                #Subscription wasn't known to WorkQueue, feel free to clean up
                logging.info("Local WorkQueue knows nothing about this subscription: %s" % sub)
                pass
            except Exception as ex:
                msg = "Error talking to workqueue: %s\n" % str(ex)
                msg += "Tried to complete the following: %s\n" % sub
                raise TaskArchiverPollerException(msg)

        return
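
The filtering done by getFinishedWorkflows() is a plain dict intersection: keep only the workflows that are finished both with and without the secondary (LogCollect/Cleanup) tasks considered. A standalone sketch with made-up workflow names:

finishedwfs = {"wf_a": {"workflows": {}}, "wf_b": {"workflows": {}}}
finishedLogCollectAndCleanUpwfs = ["wf_b", "wf_c"]

finishedwfsWithLogCollectAndCleanUp = {}
for wf in finishedLogCollectAndCleanUpwfs:
    if wf in finishedwfs:
        finishedwfsWithLogCollectAndCleanUp[wf] = finishedwfs[wf]

print(finishedwfsWithLogCollectAndCleanUp)  # -> {'wf_b': {'workflows': {}}}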
Example #50
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set this up when ReqMgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = \
                self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 case - just for clarity (this variable shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return
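
Nearly every tunable in the constructor above is read with getattr(configSection, name, default), so absent attributes fall back silently and defaults can be chained. A minimal sketch of the pattern, using types.SimpleNamespace as a stand-in for the real WMCore config object:

from types import SimpleNamespace

config = SimpleNamespace(JobSubmitter=SimpleNamespace(packageSize=200))
packageSize = getattr(config.JobSubmitter, 'packageSize', 500)                  # set -> 200
collSize = getattr(config.JobSubmitter, 'collectionSize', packageSize * 1000)  # unset -> 200000
print(packageSize, collSize)  # -> 200 200000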
Example #51
class JobUpdaterPoller(BaseWorkerThread):
    """
    _JobUpdaterPoller_

    Poller class for the JobUpdater
    """

    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config=self.config)
        self.reqmgr2 = ReqMgr(self.config.JobUpdater.reqMgr2Url)
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")

    def setup(self, parameters=None):
        """
        _setup_
        """
        pass

    def terminate(self, parameters=None):
        """
        _terminate_

        Terminate gracefully.
        """
        pass

    def algorithm(self, parameters=None):
        """
        _algorithm_
        """
        try:
            logging.info("Synchronizing priorities with ReqMgr...")
            self.synchronizeJobPriority()
            logging.info("Priorities were synchronized, wait until the next cycle")
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except CouchConflictError as ex:
            msg = "Caught CouchConflictError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            if 'Connection refused' in str(ex):
                logging.warn("Failed to sync priorities. Trying in the next cycle")
            else:
                msg = "Caught unexpected exception in JobUpdater: %s\n" % str(ex)
                logging.exception(msg)
                raise JobUpdaterException(msg)

    def synchronizeJobPriority(self):
        """
        _synchronizeJobPriority_

        Check WMBS and WorkQueue for active workflows and compare with the
        ReqMgr for priority changes. If a priority change occurs
        then update the job priority in the batch system and
        the elements in the local queue that have not been injected yet.
        """
        # Update the priority of workflows that are not in WMBS and just in local queue
        priorityCache = {}
        workflowsToUpdate = {}
        workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()]
        for workflow, priority in workflowsToCheck:
            if workflow not in priorityCache:
                try:
                    priorityCache[workflow] = self.reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
                except Exception as ex:
                    logging.error("Couldn't retrieve the priority of request %s", workflow)
                    logging.error("Error: %s", str(ex))
                    continue
            if priority != priorityCache[workflow]:
                workflowsToUpdate[workflow] = priorityCache[workflow]
        logging.info("Found %d workflows to update in workqueue", len(workflowsToUpdate))
        for workflow in workflowsToUpdate:
            self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow])

        # Check the workflows in WMBS
        priorityCache = {}
        workflowsToUpdateWMBS = {}
        workflowsToCheck = self.listWorkflowsDAO.execute()
        for workflowEntry in workflowsToCheck:
            workflow = workflowEntry['name']
            if workflow not in priorityCache:
                try:
                    priorityCache[workflow] = self.reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority']
                except Exception as ex:
                    logging.error("Couldn't retrieve the priority of request %s", workflow)
                    logging.error("Error: %s", str(ex))
                    continue
            requestPriority = int(priorityCache[workflow])
            if requestPriority != int(workflowEntry['workflow_priority']):
                # Update the workqueue priority for the Available elements
                self.workqueue.updatePriority(workflow, requestPriority)
                # Check if there are executing jobs for this particular task
                if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0:
                    self.bossAir.updateJobInformation(workflow, workflowEntry['task'],
                                                      requestPriority=priorityCache[workflow],
                                                      taskPriority=workflowEntry['task_priority'])
                workflowsToUpdateWMBS[workflow] = priorityCache[workflow]
        if workflowsToUpdateWMBS:
            logging.info("Updating %d workflows in WMBS.", len(workflowsToUpdateWMBS))
            self.updateWorkflowPrioDAO.execute(workflowsToUpdateWMBS)
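
The priority cache in synchronizeJobPriority() exists to avoid one ReqMgr2 round trip per WMBS row: each workflow's priority is fetched once and reused. The same pattern in isolation, with a stubbed fetch function standing in for reqmgr2.getRequestByNames() (the helper and workflow names here are illustrative):

def syncPriorities(workflowsToCheck, fetchPriority):
    """Return {workflow: newPriority} for workflows whose current priority
    differs from the (memoized) value reported by fetchPriority."""
    priorityCache = {}
    toUpdate = {}
    for workflow, priority in workflowsToCheck:
        if workflow not in priorityCache:
            priorityCache[workflow] = fetchPriority(workflow)
        if priority != priorityCache[workflow]:
            toUpdate[workflow] = priorityCache[workflow]
    return toUpdate

print(syncPriorities([("wf_a", 100), ("wf_a", 100), ("wf_b", 50)], lambda wf: 200))
# -> {'wf_a': 200, 'wf_b': 200}, with fetchPriority called only once per workflow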
Example #52
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)


        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set this up when ReqMgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
        else:
            # Tier0 case - just for clarity (this variable shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return
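
The packageDir bootstrap shared by these JobSubmitter constructors (create the directory if missing, wrap any OSError in a component exception) can be sketched standalone; here tempfile provides a safe base path and RuntimeError stands in for JobSubmitterPollerException:

import os
import tempfile

submitDir = tempfile.mkdtemp()
packageDir = os.path.join(submitDir, 'packages')
try:
    if not os.path.exists(packageDir):
        os.makedirs(packageDir)
except OSError as ex:
    raise RuntimeError("Error while trying to create packageDir %s: %s" % (packageDir, ex))
print(os.path.isdir(packageDir))  # -> True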
Example #53
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        # Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config, insertStates=True)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsToCache = int(getattr(self.config.JobSubmitter, 'maxJobsToCache', 50000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
        self.condorFraction = 0.75  # updated on every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')
        self.drainGracePeriod = getattr(self.config.JobSubmitter, 'drainGraceTime', 2 * 24 * 60 * 60)  # 2 days

        # Used for speed draining the agent
        self.enableAllSites = False

        # Additions for caching-based JobSubmitter
        self.jobsByPrio = {}  # keyed by the final job priority; each value is a set of job ids
        self.jobDataCache = {}  # keyed by the job id; contains the whole job info dict
        self.jobsToPackage = {}
        self.locationDict = {}
        self.drainSites = dict()
        self.drainSitesSet = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s!\n" % self.packageDir
            msg += str(ex)
            logging.error(msg)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set this up when ReqMgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 case - just for clarity (this variable shouldn't be used)
            self.abortedAndForceCompleteWorkflowCache = None

        return
Example #54
class JobUpdaterPoller(BaseWorkerThread):
    """
    _JobUpdaterPoller_

    Poller class for the JobUpdater
    """
    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config=self.config)
        self.reqmgr2 = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")

    def setup(self, parameters=None):
        """
        _setup_
        """
        pass

    def terminate(self, parameters=None):
        """
        _terminate_

        Terminate gracefully.
        """
        pass

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_
        """
        try:
            logging.info("Synchronizing priorities with ReqMgr...")
            self.synchronizeJobPriority()
            logging.info("Priorities were synchronized, waiting until the next cycle")
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except CouchConflictError as ex:
            msg = "Caught CouchConflictError exception in JobUpdater\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            if 'Connection refused' in str(ex):
                logging.warning("Failed to sync priorities. Trying again in the next cycle")
            else:
                msg = "Caught unexpected exception in JobUpdater: %s\n" % str(ex)
                logging.exception(msg)
                raise JobUpdaterException(msg)

    def synchronizeJobPriority(self):
        """
        _synchronizeJobPriority_

        Check WMBS and WorkQueue for active workflows and compare with the
        ReqMgr for priority changes. If a priority change occurs
        then update the job priority in the batch system and
        the elements in the local queue that have not been injected yet.
        """
        # Update the priority of workflows that are not in WMBS and just in local queue
        priorityCache = {}
        workflowsToUpdate = {}
        workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()]
        for workflow, priority in workflowsToCheck:
            if workflow not in priorityCache:
                try:
                    result = self.reqmgr2.getRequestByNames(workflow)[0]
                    priorityCache[workflow] = result[workflow]['RequestPriority']
                except Exception as ex:
                    logging.error("Couldn't retrieve the priority of request %s", workflow)
                    logging.error("Error: %s", str(ex))
                    continue
            if priority != priorityCache[workflow]:
                workflowsToUpdate[workflow] = priorityCache[workflow]
        logging.info("Found %d workflows to update in workqueue", len(workflowsToUpdate))
        for workflow in workflowsToUpdate:
            self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow])

        # Check the workflows in WMBS
        priorityCache = {}
        workflowsToUpdateWMBS = {}
        workflowsToCheck = self.listWorkflowsDAO.execute()
        for workflowEntry in workflowsToCheck:
            workflow = workflowEntry['name']
            if workflow not in priorityCache:
                try:
                    result = self.reqmgr2.getRequestByNames(workflow)[0]
                    priorityCache[workflow] = result[workflow]['RequestPriority']
                except Exception as ex:
                    logging.error("Couldn't retrieve the priority of request %s", workflow)
                    logging.error("Error: %s", str(ex))
                    continue
            requestPriority = int(priorityCache[workflow])
            if requestPriority != int(workflowEntry['workflow_priority']):
                # Update the workqueue priority for the Available elements
                self.workqueue.updatePriority(workflow, requestPriority)
                # Check if there are executing jobs for this particular task
                if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0:
                    self.bossAir.updateJobInformation(workflow,
                                                      workflowEntry['task'],
                                                      requestPriority=priorityCache[workflow],
                                                      taskPriority=workflowEntry['task_priority'])
                workflowsToUpdateWMBS[workflow] = priorityCache[workflow]
        if workflowsToUpdateWMBS:
            logging.info("Updating %d workflows in WMBS.", len(workflowsToUpdateWMBS))
            self.updateWorkflowPrioDAO.execute(workflowsToUpdateWMBS)
Example #55
    def __init__(self, app, config, mount):
        self.base = config.base
        self.rootdir = '/'.join(WMCore.__file__.split('/')[:-1])
        # normalize the configuration into a plain dictionary, whatever its type
        if config and not isinstance(config, dict):
            web_config = config.dictionary_()
        elif isinstance(config, dict) and config:
            web_config = config
        else:
            web_config = {'base': self.base}
        TemplatedPage.__init__(self, web_config)
        imgdir = os.environ.get('RM_IMAGESPATH', os.getcwd() + '/images')
        self.imgdir = web_config.get('imgdir', imgdir)
        cssdir = os.environ.get('RM_CSSPATH', os.getcwd() + '/css')
        self.cssdir = web_config.get('cssdir', cssdir)
        jsdir = os.environ.get('RM_JSPATH', os.getcwd() + '/js')
        self.jsdir = web_config.get('jsdir', jsdir)
        spdir = os.environ.get('RM_SPECPATH', os.getcwd() + '/specs')
        self.spdir = web_config.get('spdir', spdir)
        # read scripts area and initialize data-ops scripts
        self.sdir = os.environ.get('RM_SCRIPTS', os.getcwd() + '/scripts')
        self.sdir = web_config.get('sdir', self.sdir)
        self.sdict_thr = web_config.get('sdict_thr', 600)  # a reasonable 10-minute refresh interval
        self.sdict = {'ts': time.time()}  # placeholder for data-ops scripts
        self.update_scripts(force=True)

        # To be filled at run time
        self.cssmap = {}
        self.jsmap = {}
        self.imgmap = {}
        self.yuimap = {}

        std_specs_dir = os.path.join(self.rootdir, 'WMSpec/StdSpecs')
        self.std_specs = spec_list(std_specs_dir)
        self.std_specs.sort()

        # Update CherryPy configuration
        mime_types = ['text/css']
        mime_types += ['application/javascript', 'text/javascript',
                       'application/x-javascript', 'text/x-javascript']
        cherryconf.update({'tools.encode.on': True,
                           'tools.gzip.on': True,
                           'tools.gzip.mime_types': mime_types,
                           })
        self._cache = {}

        # initialize access to reqmgr2 APIs
        self.reqmgr_url = config.reqmgr.reqmgr2_url
        self.reqmgr = ReqMgr(self.reqmgr_url)
        # only fetch the current view (this may make the response time much
        # longer; set to False if an up-to-date view is not needed)
        self.reqmgr._noStale = True

        # get fields which we'll use in templates
        cdict = config.reqmgr.dictionary_()
        self.couch_url = cdict.get('couch_host', '')
        self.couch_dbname = cdict.get('couch_reqmgr_db', '')
        self.couch_wdbname = cdict.get('couch_workload_summary_db', '')
        self.acdc_url = cdict.get('acdc_host', '')
        self.acdc_dbname = cdict.get('acdc_db', '')
        self.configcache_url = cdict.get('couch_config_cache_url', self.couch_url)
        self.dbs_url = cdict.get('dbs_url', '')
        self.dqm_url = cdict.get('dqm_url', '')
        self.sw_ver = cdict.get('default_sw_version', 'CMSSW_7_6_1')
        self.sw_arch = cdict.get('default_sw_scramarch', 'slc6_amd64_gcc493')

        # LogDB holder
        centralurl = cdict.get("central_logdb_url", "")
        identifier = cdict.get("log_reporter", "reqmgr2")
        self.logdb = LogDB(centralurl, identifier)

        # local team cache which will request data from wmstats
        base, uri = self.reqmgr_url.split('://')
        base_url = '%s://%s' % (base, uri.split('/')[0])
        self.wmstatsurl = cdict.get('wmstats_url', '%s/wmstatsserver' % base_url)
        if not self.wmstatsurl:
            raise Exception('ReqMgr2 configuration file does not provide wmstats url')
        self.team_cache = []

        # fetch assignment arguments specification from StdBase
        self.assignArgs = StdBase().getWorkloadAssignArgs()
        self.assignArgs = {key: val['default'] for key, val in self.assignArgs.items()}