예제 #1
0
    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Register `site` in ResourceControl and insert one threshold for
        every task type listed under options['tasks'].

        When no keyword options are given, a built-in set is used: state
        'Normal', 10 running / 5 pending site slots, and thresholds for the
        'Processing' and 'Merge' task types.
        """
        settings = options or {'state': 'Normal',
                               'runningSlots': 10,
                               'pendingSlots': 5,
                               'tasks': ['Processing', 'Merge'],
                               'Processing': {'pendingSlots': 5,
                                              'runningSlots': 10},
                               'Merge': {'pendingSlots': 2,
                                         'runningSlots': 5}}

        control = ResourceControl()
        storageName = 'se.%s' % site
        sitePending = settings['pendingSlots']
        siteRunning = settings['runningSlots']
        control.insertSite(siteName=site, seName=storageName, ceName=site,
                           plugin="MockPlugin", pendingSlots=sitePending,
                           runningSlots=siteRunning, cmsName=site)

        # Each task name doubles as the key of its own slot settings
        for taskName in settings['tasks']:
            control.insertThreshold(siteName=site, taskType=taskName,
                                    maxSlots=settings[taskName]['runningSlots'],
                                    pendingSlots=settings[taskName]['pendingSlots'])

        # A missing or empty state leaves the site state untouched
        siteState = settings.get('state')
        if siteState:
            control.changeSiteState(site, siteState)

        return
예제 #2
0
    def setUp(self):
        """
        _setUp_

        Install the WMBS, BossAir, ResourceControl and Agent.Database
        schemas, register the 'Xanadu' site with a Processing threshold
        and insert a test user.
        """
        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10; the snake_case name exists on old versions too.
        myThread = threading.current_thread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir",
                                               "WMCore.ResourceControl", "WMCore.Agent.Database"],
                                useDefault=False)

        # myThread.logger / myThread.dbi are presumably populated by the
        # TestInit calls above -- they are only read after them.
        self.daoFactory = DAOFactory(package="WMCore.BossAir",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='Xanadu', seName='se.Xanadu',
                                   ceName='Xanadu', plugin="TestPlugin")
        resourceControl.insertThreshold(siteName='Xanadu', taskType='Processing',
                                        maxSlots=10000, pendingSlots=10000)

        # Create user
        wmbsFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)
        newuser = wmbsFactory(classname="Users.New")
        newuser.execute(dn="mnorman", group_name="phgroup", role_name="cmsrole")
예제 #3
0
    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Insert `site` into ResourceControl together with a threshold for
        each task type named in options['tasks'].

        Calling it without keyword options applies the built-in values:
        state 'Normal', 10 running / 5 pending site slots, and thresholds
        for 'Processing' and 'Merge'.
        """
        fallback = {'state': 'Normal',
                    'runningSlots': 10,
                    'pendingSlots': 5,
                    'tasks': ['Processing', 'Merge'],
                    'Processing': {'pendingSlots': 5, 'runningSlots': 10},
                    'Merge': {'pendingSlots': 2, 'runningSlots': 5}}
        chosen = options if options else fallback

        control = ResourceControl()
        control.insertSite(siteName=site,
                           pnn='se.%s' % site,
                           ceName=site,
                           plugin="MockPlugin",
                           pendingSlots=chosen['pendingSlots'],
                           runningSlots=chosen['runningSlots'],
                           cmsName=site)

        # The per-task slot numbers live under a key equal to the task name
        for taskName in chosen['tasks']:
            control.insertThreshold(siteName=site,
                                    taskType=taskName,
                                    maxSlots=chosen[taskName]['runningSlots'],
                                    pendingSlots=chosen[taskName]['pendingSlots'])

        # Only touch the site state when a non-empty one was supplied
        requestedState = chosen.get('state')
        if requestedState:
            control.changeSiteState(site, requestedState)

        return
예제 #4
0
    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection, create the WMBS,
        ResourceControl and Agent.Database tables and the couch databases,
        then register the test sites in ResourceControl with a Processing
        threshold each.
        """
        # The former `myThread = threading.currentThread()` local was never
        # read (and currentThread() is a deprecated alias), so it is gone.
        self.sites = ['T2_US_Florida', 'T2_US_UCSD', 'T2_TW_Taiwan', 'T1_CH_CERN']

        self.testInit = TestInit(__file__)
        self.testInit.setLogging(logLevel=logging.DEBUG)
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=['WMCore.WMBS',
                                               'WMCore.ResourceControl',
                                               'WMCore.Agent.Database'], useDefault=False)
        self.testInit.setupCouch("dashboardreporter_t/jobs", "JobDump")
        self.testInit.setupCouch("dashboardreporter_t/fwjrs", "FWJRDump")

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, seName=site, ceName=site)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=10000)

        self.testDir = self.testInit.generateWorkDir()
        self.alertsReceiver = None
        return
    def setUp(self):
        """
        _setUp_

        Build the test fixture: database schema, couch databases, two
        sites with a Processing threshold each, a work directory and the
        WMAgent configuration file.
        """
        self.testInit = testInit = TestInit(__file__)
        testInit.setLogging()
        testInit.setDatabaseConnection()
        schemaModules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"]
        testInit.setSchema(customModules=schemaModules, useDefault=False)
        for couchDb, couchApp in (("jobsubmittercaching_t/jobs", "JobDump"),
                                  ("jobsubmittercaching_t/fwjrs", "FWJRDump")):
            testInit.setupCouch(couchDb, couchApp)

        control = ResourceControl()
        for name in ("T1_US_FNAL", "T1_UK_RAL"):
            # The site name is reused as CE name and CMS name
            control.insertSite(siteName=name, pnn="se.%s" % name,
                               ceName=name, plugin="CondorPlugin", cmsName=name)
            control.insertThreshold(siteName=name, taskType="Processing",
                                    maxSlots=10000, pendingSlots=10000)

        self.testDir = testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        return
예제 #6
0
    def testThresholdsForSite(self):
        """
        _testThresholdsForSite_

        Insert one site with Processing and Merge thresholds and verify
        the per-task rows that thresholdBySite() returns for it.
        """
        control = ResourceControl()
        control.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        control.insertThreshold("testSite1", "Processing", 10, 8)
        control.insertThreshold("testSite1", "Merge", 5, 3)

        # Remember the last row seen per task type of interest; a task
        # type that never shows up keeps its empty dict.
        rowsByTask = {'Processing': {}, 'Merge': {}}
        for row in control.thresholdBySite(siteName="testSite1"):
            taskType = row['task_type']
            if taskType in rowsByTask:
                rowsByTask[taskType] = row

        # (task type, column, expected value) -- checked in this order
        expectations = (('Processing', 'pending_slots', 20),
                        ('Processing', 'running_slots', 40),
                        ('Processing', 'max_slots', 10),
                        ('Processing', 'task_pending_slots', 8),
                        ('Merge', 'pending_slots', 20),
                        ('Merge', 'running_slots', 40),
                        ('Merge', 'max_slots', 5),
                        ('Merge', 'task_pending_slots', 3))
        for taskType, column, expected in expectations:
            self.assertEqual(rowsByTask[taskType].get(column, None), expected)

        return
예제 #7
0
    def testThresholdsForSite(self):
        """
        _testThresholdsForSite_

        Insert one site with Processing and Merge thresholds and check the
        job_slots / max_slots values reported by thresholdBySite().
        """
        control = ResourceControl()
        control.insertSite("testSite1", 20, "testSE1", "testCE1")
        control.insertThreshold("testSite1", "Processing", 10)
        control.insertThreshold("testSite1", "Merge", 5)

        # Last row wins per task type; an unseen task type stays empty
        rowsByTask = {"Processing": {}, "Merge": {}}
        for row in control.thresholdBySite(siteName="testSite1"):
            taskType = row["task_type"]
            if taskType in rowsByTask:
                rowsByTask[taskType] = row

        # (task type, column, expected value) -- checked in this order
        expectations = (("Processing", "job_slots", 20),
                        ("Processing", "max_slots", 10),
                        ("Merge", "job_slots", 20),
                        ("Merge", "max_slots", 5))
        for taskType, column, expected in expectations:
            self.assertEqual(rowsByTask[taskType].get(column, None), expected)

        return
예제 #8
0
    def testThresholdPriority(self):
        """
        _testThresholdPriority_

        Check the task priorities reported by listThresholdsForSubmit(),
        first with the defaults and then after each changeTaskPriority()
        round.
        """
        control = ResourceControl()
        control.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        control.insertThreshold("testSite1", "Processing", 10, 8)
        control.insertThreshold("testSite1", "Merge", 5, 3)

        def checkPriorities(mergeExpected, processingExpected):
            # Fetch a fresh listing and compare both task priorities
            thresholds = control.listThresholdsForSubmit()['testSite1']['thresholds']
            self.assertEqual(thresholds['Merge']['priority'], mergeExpected)
            self.assertEqual(thresholds['Processing']['priority'], processingExpected)

        # Default task priorities
        checkPriorities(4, 0)

        # Change both priorities, then swap them
        for mergePriority, processingPriority in ((3, 1), (1, 3)):
            control.changeTaskPriority("Merge", mergePriority)
            control.changeTaskPriority("Processing", processingPriority)
            checkPriorities(mergePriority, processingPriority)

        return
예제 #9
0
    def testF_OverloadTest(self):
        """
        _OverloadTest_

        Test and see what happens if you put in more jobs
        than the sites can handle.

        Every site gets a single-slot threshold for the "Silly" task type,
        then nSubs * nJobs jobs are submitted; the test expects one job per
        site in condor and in the "Executing" state, and the rest still in
        "Created".
        """

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertThreshold(siteName=site, taskType="Silly", maxSlots=1)

        # Refuse to run when the user already has condor jobs: the counts
        # below assume a clean queue.
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(
            nSubs=nSubs,
            nJobs=nJobs,
            task=workload.getTask("ReReco"),
            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
            type="Silly",
        )

        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter = JobSubmitterPoller(config=config)

        try:
            # Actually run it
            jobSubmitter.algorithm()

            # Should be one job for each site
            nSites = len(self.sites)
            nRunning = getCondorRunningJobs(self.user)
            self.assertEqual(nRunning, nSites)

            getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
            result = getJobsAction.execute(state="Executing", jobType="Silly")
            self.assertEqual(len(result), nSites)
            result = getJobsAction.execute(state="Created", jobType="Silly")
            self.assertEqual(len(result), nJobs * nSubs - nSites)
        finally:
            # Now clean-up.  Done in a finally block so that jobs submitted
            # above are removed even when one of the checks fails.
            command = ["condor_rm", self.user]
            pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
            pipe.communicate()

        del jobSubmitter

        return
예제 #10
0
    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.  Try to create all of the
        WMBS tables.  Also, create the locations and ResourceControl sites
        listed in self.sites, and register the JobCreator heartbeat.
        """
        super(JobCreatorTest, self).setUp()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()

        self.testInit.setSchema(customModules=['WMCore.WMBS',
                                               'WMCore.ResourceControl',
                                               'WMCore.Agent.Database'],
                                useDefault=False)
        self.couchdbname = "jobcreator_t"
        self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")
        self.configFile = EmulatorSetup.setupWMAgentConfig()

        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10.
        myThread = threading.current_thread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        locationAction = self.daoFactory(classname="Locations.New")
        for site in self.sites:
            locationAction.execute(siteName=site, pnn=site)

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, pnn=site, ceName=site)
            resourceControl.insertThreshold(siteName=site,
                                            taskType='Processing',
                                            maxSlots=10000,
                                            pendingSlots=10000)

        self.resourceControl = resourceControl

        self._setup = True
        self._teardown = False

        self.testDir = self.testInit.generateWorkDir()
        self.cwd = os.getcwd()

        # Set heartbeat
        self.componentName = 'JobCreator'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        # Under Python 3, point assertItemsEqual at assertCountEqual so the
        # tests can keep calling the former name.
        if PY3:
            self.assertItemsEqual = self.assertCountEqual

        return
예제 #11
0
    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.  Try to create all of the
        WMBS tables.  Also, create the locations and ResourceControl sites
        listed in self.sites, and register the JobCreator heartbeat.
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=['WMCore.WMBS',
                                               'WMCore.ResourceControl',
                                               'WMCore.Agent.Database'], useDefault=False)
        self.couchdbname = "jobcreator_t"
        self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")

        # Fetched once, right before it is needed (it used to be assigned
        # twice); current_thread() replaces the deprecated currentThread().
        myThread = threading.current_thread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        locationAction = self.daoFactory(classname="Locations.New")
        for site in self.sites:
            locationAction.execute(siteName=site, seName=site)

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, seName=site, ceName=site)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=10000, pendingSlots=10000)

        self.resourceControl = resourceControl

        self._setup = True
        self._teardown = False

        self.testDir = self.testInit.generateWorkDir()
        self.cwd = os.getcwd()

        # Set heartbeat
        self.componentName = 'JobCreator'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        return
예제 #12
0
파일: BossAir_t.py 프로젝트: vkuznet/WMCore
    def setUp(self):
        """
        setup for test.

        Creates the database schema and couch databases, registers every
        site in self.sites plus 'T3_US_Xanadu' in ResourceControl, inserts
        a test user and registers the 'test' and 'JobTracker' heartbeats.
        """
        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10.
        myThread = threading.current_thread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        # NOTE(review): tearDown() is invoked before the schema is created,
        # presumably to clear leftovers of an earlier run -- confirm.
        self.tearDown()
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False)
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, pnn='%s_PNN' % site, cmsName=site,
                                       ceName=site, plugin="SimpleCondorPlugin", pendingSlots=1000,
                                       runningSlots=2000)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=1000, pendingSlots=1000)

        # One extra site served by the TestPlugin instead of condor
        site = 'T3_US_Xanadu'
        resourceControl.insertSite(siteName=site, pnn='%s_PNN' % site, cmsName=site,
                                   ceName=site, plugin="TestPlugin")
        resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                        maxSlots=10000, pendingSlots=10000)

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        componentName = 'test'
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = 'JobTracker'
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return
예제 #13
0
파일: WMAgent_t.py 프로젝트: cinquo/WMCore
    def setUp(self):
        """
        _setUp_

        Set up vital components: database schema, WMBS locations with their
        pending slots, ResourceControl sites and thresholds, a work
        directory and one heartbeat per entry of self.components.
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", 'WMCore.MsgService',
                                               'WMCore.ResourceControl', 'WMCore.ThreadPool',
                                               'WMCore.Agent.Database'],
                                useDefault=False)

        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10.
        myThread = threading.current_thread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        locationAction = self.daoFactory(classname="Locations.New")
        pendingSlots = self.daoFactory(classname="Locations.SetPendingSlots")

        for site in self.sites:
            locationAction.execute(siteName=site, seName='se.%s' % (site), ceName=site)
            pendingSlots.execute(siteName=site, pendingSlots=1000)

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, seName='se.%s' % (site), ceName=site)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=10000, pendingSlots=10000)

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        for component in self.components:
            heartbeatAPI = HeartbeatAPI(component)
            heartbeatAPI.registerComponent()

        return
예제 #14
0
    def testAbortedState(self):
        """
        _testAbortedState_

        Set a site to "Aborted" and expect a job report with exit code
        71301 in the work directory.
        ### We no longer need this test as we are not killing jobs that are running
        """
        self.tempDir = self.testInit.generateWorkDir()
        control = ResourceControl(self.createConfig())

        # Positional arguments are handed to insertSite() unchanged
        siteRows = (("testSite1", 10, 20, "testSE1", "testCE1", "T1_US_FNAL", "MockPlugin"),
                    ("testSite2", 20, 40, "testSE2", "testCE2", "T1_IT_CNAF", "MockPlugin"))
        for siteRow in siteRows:
            control.insertSite(*siteRow)

        # Positional arguments are handed to insertThreshold() unchanged
        thresholdRows = (("testSite1", "Processing", 20, 10),
                         ("testSite1", "Merge", 200, 100),
                         ("testSite2", "Processing", 50, 25),
                         ("testSite2", "Merge", 135, 65))
        for thresholdRow in thresholdRows:
            control.insertThreshold(*thresholdRow)

        self.createJobs()

        control.changeSiteState("testSite1", "Aborted")

        # Load the FWJR expected in tempDir for the killed job
        killedReport = Report()
        killedReport.load(os.path.join(self.tempDir, "Report.0.pkl"))
        self.assertEqual(killedReport.getExitCode(), 71301)
        return
예제 #15
0
    def testDrain(self):
        """Drain a site, then re-enable it, checking the 'drain' flag each time."""
        control = ResourceControl()
        control.insertSite("testSite1", 20, "testSE1", "testCE1")
        control.insertThreshold("testSite1", "Processing", 10, priority=1)

        control.drainSite("testSite1")
        drainFlag = control.listThresholdsForCreate()["testSite1"]["drain"]
        self.assertTrue(drainFlag)

        # Re-enable the site
        control.drainSite("testSite1", drain=False)
        drainFlag = control.listThresholdsForCreate()["testSite1"]["drain"]
        self.assertFalse(drainFlag)
예제 #16
0
    def setUp(self):
        """
        Standard setup: Now with 100% more couch

        Creates the database schema and couch databases, registers every
        site in self.sites in ResourceControl with a Processing threshold,
        and registers the JobSubmitter heartbeat.
        """

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False,
        )
        self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")

        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10.
        myThread = threading.current_thread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)

        # The Locations.New / Locations.SetJobSlots DAOs that used to be
        # instantiated here were never used and have been dropped.

        # We actually need the user name
        self.user = getpass.getuser()

        self.ceName = "127.0.0.1"

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(
                siteName=site,
                seName="se.%s" % (site),
                ceName=site,
                plugin="CondorPlugin",
                pendingSlots=10000,
                runningSlots=20000,
                cmsName=site,
            )
            resourceControl.insertThreshold(siteName=site, taskType="Processing", maxSlots=10000)

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        self.componentName = "JobSubmitter"
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        return
예제 #17
0
    def testChangeState(self):
        """
        _testChangeState_

        Verify that the site state reported by listThresholdsForCreate()
        is 'Normal' after insertion and 'Down' after changeSiteState().
        """
        control = ResourceControl()
        control.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        control.insertThreshold("testSite1", "Processing", 10, 5)

        # State right after insertion
        siteState = control.listThresholdsForCreate()['testSite1']['state']
        self.assertEqual(siteState, 'Normal', 'Error: Wrong site state')

        # State after the explicit change
        control.changeSiteState("testSite1", "Down")
        siteState = control.listThresholdsForCreate()['testSite1']['state']
        self.assertEqual(siteState, 'Down', 'Error: Wrong site state')
예제 #18
0
    def setUp(self):
        """
        setup for test.

        Creates the database schema and couch databases, registers the
        'malpaquet' site in ResourceControl and as a WMBS location, inserts
        a test user and prepares the work directory and agent config file.
        """
        super(JobTrackerTest, self).setUp()
        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10.
        myThread = threading.current_thread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"],
                                useDefault=False)
        self.testInit.setupCouch("jobtracker_t/jobs", "JobDump")
        self.testInit.setupCouch("jobtracker_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='malpaquet', pnn='se.malpaquet',
                                   ceName='malpaquet', plugin="CondorPlugin")
        resourceControl.insertThreshold(siteName='malpaquet', taskType='Processing',
                                        maxSlots=10000, pendingSlots=10000)

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute(siteName="malpaquet", pnn="malpaquet",
                               ceName="malpaquet", plugin="CondorPlugin")

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="jchurchill")

        # We actually need the user name
        self.user = getpass.getuser()

        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
예제 #19
0
    def setUp(self):
        """
        _setUp_

        Install the WMBS, BossAir, ResourceControl and Agent.Database
        schemas and register the 'Xanadu' site with a Processing threshold.
        """
        # current_thread() replaces currentThread(), a deprecated alias
        # since Python 3.10.
        myThread = threading.current_thread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir",
                                               "WMCore.ResourceControl", "WMCore.Agent.Database"],
                                useDefault=False)

        self.daoFactory = DAOFactory(package="WMCore.BossAir",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='Xanadu', seName='se.Xanadu',
                                   ceName='Xanadu', plugin="TestPlugin")
        resourceControl.insertThreshold(siteName='Xanadu', taskType='Processing',
                                        maxSlots=10000)
예제 #20
0
    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Add `site` to ResourceControl and insert a threshold for every
        task type found in options["tasks"].

        With no keyword options the built-in values apply: state "Normal",
        10 running / 5 pending site slots, and thresholds for "Processing"
        and "Merge".
        """
        if options:
            limits = options
        else:
            limits = {
                "state": "Normal",
                "runningSlots": 10,
                "pendingSlots": 5,
                "tasks": ["Processing", "Merge"],
                "Processing": {"pendingSlots": 5, "runningSlots": 10},
                "Merge": {"pendingSlots": 2, "runningSlots": 5},
            }

        control = ResourceControl()
        siteArgs = {"siteName": site}
        siteArgs["pnn"] = "se.%s" % (site)
        siteArgs["ceName"] = site
        siteArgs["plugin"] = "MockPlugin"
        siteArgs["pendingSlots"] = limits["pendingSlots"]
        siteArgs["runningSlots"] = limits["runningSlots"]
        siteArgs["cmsName"] = site
        control.insertSite(**siteArgs)

        # Slot numbers for a task are stored under the task's own name
        for taskName in limits["tasks"]:
            control.insertThreshold(
                siteName=site,
                taskType=taskName,
                maxSlots=limits[taskName]["runningSlots"],
                pendingSlots=limits[taskName]["pendingSlots"],
            )

        # Skip the state change when no (or an empty) state was given
        wantedState = limits.get("state")
        if wantedState:
            control.changeSiteState(site, wantedState)

        return
예제 #21
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Store the agent configuration and load the component settings from it.
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.setVariables(self.config)

    def setVariables(self, config):
        """
        Load all the component settings from the config object.

        Called from the constructor and again at the start of every
        polling cycle (see algorithm).
        """
        watcher = config.AgentStatusWatcher

        # get dashboard url, set metric columns from config
        self.dashboard = watcher.dashboard
        self.siteStatusMetric = watcher.siteStatusMetric
        self.cpuBoundMetric = watcher.cpuBoundMetric
        self.ioBoundMetric = watcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = watcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcher.pendingSlotsTaskPercent
        self.runningExpressPercent = watcher.runningExpressPercent
        self.runningRepackPercent = watcher.runningRepackPercent

        # forced site list
        self.forcedSiteList = watcher.forcedSiteList

        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.queueParams = config.WorkQueueManager.queueParams
        self.agentsNumByTeam = getattr(watcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = watcher.onlySSB

        # tier mode: enabled when the config has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(watcher, 'enabled', True)

    def setup(self, parameters):
        """
        Create the resource control and WMStats helpers used by algorithm.

        ``parameters`` is accepted for interface compatibility and not used.
        """
        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

        # init variables
        self.agentsByTeam = {}

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site info about state and thresholds
            1. Get information from SSB
            2. Get information about teams and agents from WMStats
            3. Set site status and set thresholds for each valid site

        Any exception raised during the cycle is logged and swallowed so
        that the next polling cycle can retry.
        """
        # set variables every polling cycle
        self.setVariables(self.config)
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            # Get sites in Resource Control
            currentSites = self.resourceControl.listCurrentSites()

            logging.debug("Starting algorithm, getting site info from SSB")
            stateBySite, slotsCPU, slotsIO = self.getInfoFromSSB()

            if not stateBySite or not slotsCPU or not slotsIO:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Setting status and thresholds for all sites, site pending: %s%%, task pending: %s%%",
                          self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

            # get number of agents working in the same team (not in DrainMode)
            agentsByTeam = self.getAgentsByTeam()
            if not agentsByTeam:
                logging.debug("agentInfo couch view is not available, use previous agent count %s",
                              self.agentsNumByTeam)
            else:
                self.agentsByTeam = agentsByTeam
                # a team missing from the view is counted as one agent
                agentsCount = [self.agentsByTeam[team] if team in self.agentsByTeam else 1
                               for team in self.teamNames.split(',')]
                # If agent is in several teams, we choose the team with less agents
                self.agentsNumByTeam = min(agentsCount)
                logging.debug("Number of agents not in DrainMode running in the same team: %s",
                              self.agentsNumByTeam)

            # set site status and thresholds
            listSites = stateBySite.keys()
            if self.forcedSiteList:
                # the forced list always wins; only the log level depends on
                # whether every forced site is known to SSB
                forcedInSSB = set(self.forcedSiteList).issubset(set(listSites))
                listSites = self.forcedSiteList
                if forcedInSSB:
                    logging.info("Forcing site list: %s", ', '.join(self.forcedSiteList))
                else:
                    logging.warning("Forcing site list: %s. Some site(s) are not in SSB",
                                    ', '.join(self.forcedSiteList))

            for site in listSites:
                if site not in currentSites:
                    logging.debug("Site '%s' has not been added to Resource Control", site)
                    continue

                sitestate = stateBySite.get(site, 'Normal')
                if site not in slotsCPU or site not in slotsIO:
                    logging.warning("%s not available in SSB. Changing only site status to %s.", site, sitestate)
                    self.updateSiteInfo(site, sitestate, None, None, self.agentsNumByTeam)
                    continue

                if self.updateSiteInfo(site, sitestate, slotsCPU[site], slotsIO[site], self.agentsNumByTeam):
                    logging.info('Setting site %s to %s, CPUBound: %s, IOBound: %s',
                                 site, sitestate, slotsCPU[site], slotsIO[site])

            # if onlySSB sites or forcedSiteList, force to down all the sites not in SSB/forcedSiteList
            if self.onlySSB or self.forcedSiteList:
                for site in set(currentSites).difference(set(listSites)):
                    if self.updateSiteInfo(site, 'Down', 0, 0, self.agentsNumByTeam):
                        logging.info('Only SSBsites/forcedSiteList, forcing site %s to Down, CPUBound: 0, IOBound: 0',
                                     site)

            logging.info("Resource update is completed, waiting for the next cycle.\n")

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams.

        Returns whatever the WMStats reader provides, or an empty list when
        the call fails (the failure is logged, not raised).
        """
        try:
            return self.centralCouchDBReader.agentsByTeam()
        except Exception:
            logging.error("WMStats is not available or is unresponsive. Don't divide thresholds by team")
            return []

    def _fetchSSBMetric(self, metric):
        """
        Download one SSB metric column and return its 'csvdata' payload.

        The HTTP response is always closed, even when reading fails.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(metric)
        response = urllib2.urlopen(url)
        try:
            payload = response.read()
        finally:
            response.close()
        # parse from json format to dictionary, get only 'csvdata'
        return json.loads(payload)['csvdata']

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a tuple (stateBySite, slotsCPU, slotsIO) of dictionaries
        keyed by VOName.
        """
        site_state = self._fetchSSBMetric(self.siteStatusMetric)
        cpu_slots = self._fetchSSBMetric(self.cpuBoundMetric)
        io_slots = self._fetchSSBMetric(self.ioBoundMetric)

        # dictionaries with status/thresholds info by VOName
        stateBySite = self.siteStatusByVOName(site_state)
        slotsCPU = self.thresholdsByVOName(cpu_slots)
        slotsIO = self.thresholdsByVOName(io_slots)

        return stateBySite, slotsCPU, slotsIO

    def thresholdsByVOName(self, sites):
        """
        _thresholdsByVOName_

        Creates a dictionary with keys->VOName and values->threshold.

        A missing (None) value is stored as 0. When a VOName shows up more
        than once, the first entry is kept and an error is logged.
        """
        thresholdbyVOName = {}
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in thresholdbyVOName:
                logging.error('I have a duplicated threshold entry in SSB for %s', voname)
                continue
            if value is None:
                logging.warning('Site %s does not have threholds in SSB, assuming 0', voname)
                thresholdbyVOName[voname] = 0
            else:
                thresholdbyVOName[voname] = int(value)
        return thresholdbyVOName

    def siteStatusByVOName(self, sites):
        """
        _siteStatusByVOName_

        Creates a dictionary with keys->VOName and values->status.

        Entries with an empty or unknown status are logged and skipped, and
        so are duplicated entries for a VOName that was already stored.
        """
        statusBySite = {}
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname in statusBySite:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
                continue
            if not status:
                logging.error('Site %s does not have status in SSB', voname)
                continue
            new_status = self.getState(str(status))
            if not new_status:
                logging.error("Unkwown status '%s' for site %s, please check SSB", str(status), voname)
                continue
            statusBySite[voname] = new_status
        return statusBySite

    def getState(self, stateFromSSB):
        """
        _getState_

        Translate SSB states into resource control state.

        Returns None for a state that is not recognised. The 'tier0' state
        maps to 'Normal' when running in tier0 mode and 'Draining' otherwise.
        """
        if stateFromSSB == "on":
            return "Normal"
        elif stateFromSSB == "drain":
            return "Draining"
        elif stateFromSSB == "tier0":
            logging.debug('There is a site in tier0 status (Tier0Mode is %s)', self.tier0Mode)
            if self.tier0Mode:
                return "Normal"
            else:
                return "Draining"
        elif stateFromSSB == "down":
            return "Down"
        elif stateFromSSB == "skip":
            return "Down"
        else:
            return None

    def updateSiteInfo(self, siteName, state, CPUBound, IOBound, agentsNum):
        """
        _updateSiteInfo_

        Update information about a site in the database. Also set thresholds for a given site
        pending_jobs policy:
            sitePending is CPUBound*(pendingSlotsSitePercent/100)
            taskPending is (CPUBound or IOBound)*(pendingSlotsTaskPercent/100) depending on the task type
        This allows to maintain the right pressure in the queue, and keep the agent safe.
        The site threshold is higher than each task threshold. This allows to have different task type jobs
        in the queue.
        When there are several agents in the same team, we divide the pending threshold between the number
        of agents running.

        Returns False when the site is unknown to resource control, True
        otherwise. When either CPUBound or IOBound is None only the site
        state is changed.
        """
        if self.resourceControl.listSiteInfo(siteName) is None:
            logging.warning("Site %s has not been added to the resource control. Please check if the site was added by the condor plugin", siteName)
            return False

        # set site state:
        self.resourceControl.changeSiteState(siteName, state)
        if CPUBound is None or IOBound is None:
            return True

        # tier0 T1 cores utilization
        if self.tier0Mode and 'T1_' in siteName:
            CPUBound = CPUBound*self.t1SitesCores/100
            IOBound = IOBound*self.t1SitesCores/100

        # Thresholds:
        sitePending = int(CPUBound/agentsNum*self.pendingSlotsSitePercent/100)
        taskCPUPending = int(CPUBound/agentsNum*self.pendingSlotsTaskPercent/100)
        taskIOPending = int(IOBound/agentsNum*self.pendingSlotsTaskPercent/100)

        # min pending values for thresholds (a value of 0 is left untouched)
        if 0 < taskCPUPending < 10:
            taskCPUPending = 10
        if 0 < taskIOPending < 10:
            taskIOPending = 10

        # Set site main thresholds
        self.resourceControl.setJobSlotsForSite(siteName=siteName,
                                                pendingJobSlots=sitePending,
                                                runningJobSlots=CPUBound)

        # Set thresholds for CPU bound task types
        cpuTasks = ['Processing', 'Production', 'Analysis']
        for task in cpuTasks:
            self.resourceControl.insertThreshold(siteName=siteName, taskType=task,
                                                 maxSlots=CPUBound, pendingSlots=taskCPUPending)

        # Set thresholds for IO bound task types
        ioTasks = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        for task in ioTasks:
            self.resourceControl.insertThreshold(siteName=siteName, taskType=task,
                                                 maxSlots=IOBound, pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set thresholds for tier0 task types
            expressSlots = int(CPUBound*self.runningExpressPercent/100)
            pendingExpress = int(expressSlots*self.pendingSlotsTaskPercent/100)
            self.resourceControl.insertThreshold(siteName=siteName, taskType='Express',
                                                 maxSlots=expressSlots, pendingSlots=pendingExpress)
            repackSlots = int(CPUBound*self.runningRepackPercent/100)
            pendingRepack = int(repackSlots*self.pendingSlotsTaskPercent/100)
            self.resourceControl.insertThreshold(siteName=siteName, taskType='Repack',
                                                 maxSlots=repackSlots, pendingSlots=pendingRepack)
        return True
예제 #22
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Read the component settings from ``config`` and create the SSB,
        resource control and WMStats helpers used by the polling cycle.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types sharing the CPU-bound / IO-bound thresholds, and the
        # minimum number of slots applied to each family
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode: enabled when the config has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)

        Any exception raised during the cycle is logged and swallowed so
        that the next polling cycle can retry.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            # first, update site status
            ssbSiteStatus = self.getSiteStatus()
            self.checkStatusChanges(sitesRC, ssbSiteStatus)

            # now fetch site slots thresholds
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Info from SSB: %s", sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh self.agentsNumByTeam from the WMStats view for agents and teams.

        In drain mode the count is forced to 1. When WMStats fails or
        returns nothing, the current value is kept.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get CPU bound and IO bound slots from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)

        return self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Note that sites listed in forceSiteDown are removed from ``infoSSB``
        in place.
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site, infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        Task level thresholds are checked for every site present in both
        inputs. The slot values inside ``infoSSB`` are adjusted in place.
        """
        logging.debug("Settings for site and task pending slots: %s%% and %s%%", self.pendingSlotsSitePercent,
                      self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0. Multiply before dividing and
                # truncate, so the result is a whole number of slots whatever
                # the division semantics of the interpreter are.
                infoSSB[site]['slotsCPU'] = int(infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100)
                infoSSB[site]['slotsIO'] = int(infoSSB[site]['slotsIO'] * self.t1SitesCores / 100)
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'], self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'], self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsSitePercent / 100),
                              self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped, so every returned entry has both the
        'slotsCPU' and the 'slotsIO' keys.
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', entry['VOName'])
                continue
            ssbSiteSlots[entry['VOName']] = {'slotsCPU': int(entry['Value'])}

        # then iterate over the IO slots
        for entry in infoIo:
            voname = entry['VOName']
            if voname not in ssbSiteSlots:
                logging.warning('Site %s does not have CPU thresholds in SSB. Taking no action', voname)
                continue
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', voname)
                ssbSiteSlots.pop(voname, None)
                continue
            ssbSiteSlots[voname]['slotsIO'] = int(entry['Value'])

        # a site with CPU slots but no IO entry at all is incomplete too;
        # keeping it would break consumers reading 'slotsIO'
        incomplete = [voname for voname, slots in ssbSiteSlots.items() if 'slotsIO' not in slots]
        for voname in incomplete:
            logging.warning('Site %s does not have IO thresholds in SSB. Taking no action', voname)
            del ssbSiteSlots[voname]

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state.

        Returns a dict keyed by site name whose values are {'state': <agent state>}.
        Sites with an unknown status are logged and left out; for duplicated
        entries the first one wins.
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname in ssbSiteState:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)
                continue
            statusAgent = self.getState(str(status))
            if not statusAgent:
                logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)
            else:
                ssbSiteState[voname] = {'state': statusAgent}

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.

        Returns None for a state that is not recognised.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else 'Draining'}

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound task slots for a given site.

        The task thresholds are rewritten when the site has no task
        thresholds yet, or when any CPU/IO task has a pending value that
        differs from the newly computed one. In tier0 mode the Express and
        Repack thresholds are always rewritten.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                             self.minCPUSlots)
        taskIOPending = max(int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100), self.minIOSlots)

        # nothing stored yet means everything has to be inserted
        updateTasks = not siteTaskSlots
        for taskSlots in siteTaskSlots:
            if taskSlots['task_type'] in self.tasksCPU:
                expectedPending = taskCPUPending
            elif taskSlots['task_type'] in self.tasksIO:
                expectedPending = taskIOPending
            else:
                # task types outside both families are not managed here
                continue
            if taskSlots['task_pending_slots'] != expectedPending:
                updateTasks = True
                break

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d", siteName,
                         taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksCPU, maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d", siteName,
                         taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksIO, maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.", siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Express', expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Repack', repackSlots, pendingRepack)
예제 #23
0
    def setUp(self):
        """
        Prepare the test environment: logging, database schema, couch
        databases, resource control sites and thresholds, a test user, a
        working directory and two heartbeat components.
        """
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.tearDown()
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False,
        )
        for couchUrl, couchApp in (("bossair_t/jobs", "JobDump"), ("bossair_t/fwjrs", "FWJRDump")):
            self.testInit.setupCouch(couchUrl, couchApp)

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(
                siteName=site,
                pnn="se.%s" % (site),
                cmsName=site,
                ceName=site,
                plugin="CondorPlugin",
                pendingSlots=1000,
                runningSlots=2000,
            )
            resourceControl.insertThreshold(siteName=site, taskType="Processing", maxSlots=1000, pendingSlots=1000)

        # Extra sites as (site/CE name, pnn, plugin, processing slots).
        # Their cmsName is whatever `site` was left at by the loop above,
        # i.e. the last entry of self.sites.
        extraSites = (
            ("Xanadu", "se.Xanadu", "TestPlugin", 10000),
            ("jade-cms.hip.fi", "madhatter.csc.fi", "ARCPlugin", 100),
            # using this for glite submissions
            ("grid-ce-01.ba.infn.it", "storm-se-01.ba.infn.it", "gLitePlugin", 50),
        )
        for extraName, extraPnn, extraPlugin, extraSlots in extraSites:
            resourceControl.insertSite(
                siteName=extraName, pnn=extraPnn, cmsName=site, ceName=extraName, plugin=extraPlugin
            )
            resourceControl.insertThreshold(
                siteName=extraName, taskType="Processing", maxSlots=extraSlots, pendingSlots=extraSlots
            )

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        self.heartbeatAPI = HeartbeatAPI("test")
        self.heartbeatAPI.registerComponent()
        self.heartbeatAPI2 = HeartbeatAPI("JobTracker")
        self.heartbeatAPI2.registerComponent()

        return
예제 #24
0
     def createTestJob(self):
         """
         _createTestJob_

         Create a single WMBS job in the 'asopending' state and return it.

         The 'malpaquet' site is inserted in resource control and through the
         Locations.New DAO, then a workflow, fileset, subscription and job group
         are created to hold the job.  The job has one input file and its
         'fwjr_path' points at a private copy of the 'oneaso.pkl' job report
         stored in self.testDir.
         """
         # Create sites in resourceControl
         resourceControl = ResourceControl()
         resourceControl.insertSite(siteName = 'malpaquet', seName = 'se.malpaquet',
                                    ceName = 'malpaquet', plugin = "CondorPlugin")
         resourceControl.insertThreshold(siteName = 'malpaquet', taskType = 'Processing',
                                         maxSlots = 10000)

         locationAction = self.daoFactory(classname = "Locations.New")
         locationAction.execute(siteName = "malpaquet", seName = "malpaquet",
                                ceName = "malpaquet", plugin = "CondorPlugin")
         jobAction = self.daoFactory(classname = "Jobs.New")
         jobAction.execute()

         # Create user
         newuser = self.daoFactory(classname = "Users.New")
         newuser.execute(dn = "jchurchill")

         testWorkflow = Workflow(spec = "spec.xml", owner = "Simon",
                                 name = "meloam_ASYNCTEST1_120810_170823_8981", task="Test")
         testWorkflow.create()

         testWMBSFileset = Fileset(name = "TestFileset")
         testWMBSFileset.create()

         testSubscription = Subscription(fileset = testWMBSFileset,
                                         workflow = testWorkflow,
                                         type = "Processing",
                                         split_algo = "FileBased")
         testSubscription.create()

         testJobGroup = JobGroup(subscription = testSubscription)
         testJobGroup.create()

         # Create a file
         testFileA = File(lfn = "/this/is/a/lfnA", size = 1024, events = 10)
         testFileA.addRun(Run(10, *[12312]))
         testFileA.setLocation('malpaquet')
         testFileA.create()

         baseName = makeUUID()

         # make a copy of the FWJR since it will be modified
         fwjrSource = os.path.join(WMCore.WMBase.getTestBase(),
                                   'WMComponent_t',
                                   'AsyncStageoutTracker_t',
                                   'oneaso.pkl')
         fwjrCopy = os.path.join(self.testDir, "oneaso.pkl")
         shutil.copyfile(fwjrSource, fwjrCopy)

         # Now create a job
         testJob = Job(name = '%s-%i' % (baseName, 1))
         testJob.addFile(testFileA)
         testJob['location'] = 'malpaquet'
         testJob['retry_count'] = 1
         testJob['retry_max'] = 10
         testJob['fwjr_path'] = fwjrCopy
         testJob.create(testJobGroup)
         testJob.save()
         testJobGroup.add(testJob)

         testJobGroup.commit()

         # Move the job to the state the test starts from
         stateAction = self.daoFactory(classname = "Jobs.ChangeState")
         stateAction.execute( [{'id' : testJob['id'],
                                 'state' : 'asopending',
                                 'couch_record' : testJob['couch_record']}])
         return testJob
예제 #25
0
    def testList(self):
        """
        _testList_

        Test the functions that list thresholds for creating jobs and submitting
        jobs.

        Two sites, each with a Processing and a Merge threshold, are inserted,
        jobs are created through self.createJobs(), and the results of
        listThresholdsForCreate() and listThresholdsForSubmit() are compared
        with the expected slot and job counts.
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1", "T1_US_FNAL", "LsfPlugin")
        myResourceControl.insertSite("testSite2", 20, 40, "testSE2", "testCE2", "T3_US_FNAL", "LsfPlugin")

        myResourceControl.insertThreshold("testSite1", "Processing", 20, 10)
        myResourceControl.insertThreshold("testSite1", "Merge", 200, 100)
        myResourceControl.insertThreshold("testSite2", "Processing", 50, 25)
        myResourceControl.insertThreshold("testSite2", "Merge", 135, 65)

        self.createJobs()

        createThresholds = myResourceControl.listThresholdsForCreate()
        submitThresholds = myResourceControl.listThresholdsForSubmit()

        self.assertEqual(len(createThresholds.keys()), 2,
                         "Error: Wrong number of sites in create thresholds")

        self.assertEqual(createThresholds["testSite1"]["total_slots"], 10,
                         "Error: Wrong number of slots for site 1")

        self.assertEqual(createThresholds["testSite2"]["total_slots"], 20,
                         "Error: Wrong number of slots for site 2")

        # We should have two running jobs with locations at site one,
        # two running jobs without locations at site two, and one running
        # job without a location at site one and two.
        # NOTE(review): the comments talk about running jobs while the
        # assertions check 'pending_jobs'; the expected counts depend on
        # self.createJobs(), which is defined elsewhere - confirm there.
        self.assertEqual(createThresholds["testSite1"]["pending_jobs"], {0: 4},
                         "Error: Wrong number of pending jobs for site 1")

        # We should have one running job with a location at site 2 and
        # another running job without a location.
        self.assertEqual(createThresholds["testSite2"]["pending_jobs"], {0: 2},
                         "Error: Wrong number of pending jobs for site 2")

        # The CMS name given at insertSite time should be reported as cms_name
        self.assertEqual(createThresholds["testSite1"]["cms_name"], "T1_US_FNAL")
        self.assertEqual(createThresholds["testSite2"]["cms_name"], "T3_US_FNAL")

        self.assertEqual(set(submitThresholds.keys()), set(["testSite1", "testSite2"]))

        site1Tasks = submitThresholds["testSite1"]["thresholds"]
        site2Tasks = submitThresholds["testSite2"]["thresholds"]
        mergeThreshold1 = site1Tasks.get("Merge")
        procThreshold1 = site1Tasks.get("Processing")
        mergeThreshold2 = site2Tasks.get("Merge")
        procThreshold2 = site2Tasks.get("Processing")

        # Fail with a readable message instead of a TypeError on None[...]
        self.assertTrue(mergeThreshold1 is not None,
                        "Error: Merge threshold missing for site 1")
        self.assertTrue(procThreshold1 is not None,
                        "Error: Processing threshold missing for site 1")
        self.assertTrue(mergeThreshold2 is not None,
                        "Error: Merge threshold missing for site 2")
        self.assertTrue(procThreshold2 is not None,
                        "Error: Processing threshold missing for site 2")

        self.assertEqual(submitThresholds["testSite1"]["total_running_jobs"], 0,
                         "Error: Wrong number of running jobs for submit thresholds.")
        self.assertEqual(submitThresholds["testSite2"]["total_running_jobs"], 1,
                         "Error: Wrong number of running jobs for submit thresholds.")
        self.assertEqual(submitThresholds["testSite1"]["total_pending_jobs"], 1,
                         "Error: Wrong number of pending jobs for submit thresholds.")
        self.assertEqual(submitThresholds["testSite2"]["total_pending_jobs"], 0,
                         "Error: Wrong number of pending jobs for submit thresholds.")

        self.assertEqual(mergeThreshold1["task_running_jobs"], 0,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(mergeThreshold1["task_pending_jobs"], 0,
                         "Error: Wrong number of task pending jobs for submit thresholds.")
        self.assertEqual(procThreshold1["task_running_jobs"], 0,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(procThreshold1["task_pending_jobs"], 1,
                         "Error: Wrong number of task pending jobs for submit thresholds.")
        self.assertEqual(mergeThreshold2["task_running_jobs"], 0,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(mergeThreshold2["task_pending_jobs"], 0,
                         "Error: Wrong number of task pending jobs for submit thresholds.")
        self.assertEqual(procThreshold2["task_running_jobs"], 1,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(procThreshold2["task_pending_jobs"], 0,
                         "Error: Wrong number of task pending jobs for submit thresholds.")

        return
예제 #26
0
    def testInsert(self):
        """
        _testInsert_

        Verify that inserting sites and thresholds works correctly, even if the
        site or threshold already exists.

        testSite1 is inserted twice, and its Merge threshold is inserted twice
        with different values; the values of the second insert (250/150) are
        the ones expected back from listThresholdsForSubmit().
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1")
        myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1")
        myResourceControl.insertSite("testSite2", 100, 200, "testSE2", "testCE2")

        myResourceControl.insertThreshold("testSite1", "Processing", 20, 10)
        myResourceControl.insertThreshold("testSite1", "Merge", 200, 100)
        myResourceControl.insertThreshold("testSite1", "Merge", 250, 150)
        myResourceControl.insertThreshold("testSite2", "Processing", 50, 30)
        myResourceControl.insertThreshold("testSite2", "Merge", 135, 100)

        # Expected values per site:
        # (site, pending slots, running slots, [(task, max slots, pending slots)])
        expectedSites = [
            ("testSite1", 10, 20, [("Processing", 20, 10), ("Merge", 250, 150)]),
            ("testSite2", 100, 200, [("Processing", 50, 30), ("Merge", 135, 100)]),
        ]

        # --- thresholds used for job creation ---
        createThresholds = myResourceControl.listThresholdsForCreate()

        self.assertEqual(len(createThresholds.keys()), 2,
                         "Error: Wrong number of site in Resource Control DB")

        for siteName, pendingSlots, _, _ in expectedSites:
            self.assertTrue(siteName in createThresholds.keys(),
                            "Error: %s missing from create thresholds." % siteName)

        for siteName, pendingSlots, _, _ in expectedSites:
            siteCreate = createThresholds[siteName]
            # total_slots is expected to match the first slot count given to insertSite
            self.assertEqual(siteCreate["total_slots"], pendingSlots,
                             "Error: Wrong number of total slots for %s." % siteName)
            # No jobs were created in this test
            self.assertEqual(siteCreate["pending_jobs"], {0: 0},
                             "Error: Wrong number of pending jobs for %s: %s" %
                             (siteName, siteCreate["pending_jobs"]))

        # --- thresholds used for job submission ---
        thresholds = myResourceControl.listThresholdsForSubmit()

        self.assertEqual(len(thresholds.keys()), 2,
                         "Error: Wrong number of sites in Resource Control DB")

        for siteName, _, _, _ in expectedSites:
            self.assertTrue(siteName in thresholds.keys(),
                            "Error: %s missing from thresholds." % siteName)

        for siteName, pendingSlots, runningSlots, taskLimits in expectedSites:
            siteInfo = thresholds[siteName]
            siteThresholds = siteInfo["thresholds"]

            self.assertEqual(len(siteThresholds), 2,
                             "Error: Wrong number of task types for %s." % siteName)

            expectedSiteValues = [("total_pending_slots", pendingSlots),
                                  ("total_running_slots", runningSlots),
                                  ("total_running_jobs", 0),
                                  ("total_pending_jobs", 0)]
            for key, expected in expectedSiteValues:
                self.assertEqual(siteInfo[key], expected,
                                 "Error: Site thresholds wrong: %s of %s is %s, expected %s" %
                                 (key, siteName, siteInfo[key], expected))

            for taskType, maxSlots, taskPendingSlots in taskLimits:
                taskThreshold = siteThresholds.get(taskType)
                self.assertTrue(taskThreshold is not None,
                                "Error: %s threshold missing for %s." % (taskType, siteName))

                expectedTaskValues = [("task_running_jobs", 0),
                                      ("task_pending_jobs", 0),
                                      ("max_slots", maxSlots),
                                      ("pending_slots", taskPendingSlots)]
                for key, expected in expectedTaskValues:
                    self.assertEqual(taskThreshold[key], expected,
                                     "Error: Site thresholds wrong: %s of %s/%s is %s, expected %s" %
                                     (key, siteName, taskType, taskThreshold[key], expected))
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Keep the agent configuration and load the component settings from it.
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.setVariables(self.config)

    def setVariables(self, config):
        """
        load all the variables from the config file
        """
        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

    def setup(self, parameters):
        """
        Prepare the resource control interface and the WMStats reader
        """
        # set resource control
        self.resourceControl = ResourceControl(config = self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor classAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed, so the
        next polling cycle can retry.
        """
        # set variables every polling cycle
        self.setVariables(self.config)
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                # getInfoFromSSB() already logged which data is missing
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and store in
        self.agentsNumByTeam the number of agents of the smallest team this
        agent belongs to.  When the view is unavailable or empty, the value
        loaded from the configuration is kept.
        """
        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam()
        except Exception as ex:
            logging.error("WMStats is not available or is unresponsive.")
            logging.error(str(ex))

        if not agentsByTeam:
            logging.debug("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
            return

        self.agentsByTeam = agentsByTeam
        agentsCount = []
        for team in self.teamNames.split(','):
            if team not in self.agentsByTeam:
                # team unknown to WMStats: count at least this agent
                agentsCount.append(1)
            else:
                agentsCount.append(self.agentsByTeam[team])
        # If agent is in several teams, we choose the team with less agents.
        # The result is used as a divisor in checkSlotsChanges(), so never
        # let it drop below 1.
        self.agentsNumByTeam = max(min(agentsCount), 1)
        logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)
        return

    def _fetchMetric(self, columnId):
        """
        Download one SSB metric column from the dashboard and return the
        'csvdata' entry of its JSON reply.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(columnId)
        response = urllib2.urlopen(url)
        try:
            return json.loads(response.read())['csvdata']
        finally:
            response.close()

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name and the
        inner keys are 'state', 'slotsCPU' and 'slotsIO' (the slots are None
        for a site missing from the corresponding metric).  An empty dict is
        returned when any of the three metrics came back empty.
        """
        # get info from dashboard, keeping only 'csvdata'
        site_state = self._fetchMetric(self.siteStatusMetric)
        cpu_slots = self._fetchMetric(self.cpuBoundMetric)
        io_slots = self._fetchMetric(self.ioBoundMetric)

        # dictionaries with status/thresholds info by VOName
        stateBySite = self.siteStatusByVOName(site_state)
        slotsCPU = self.thresholdsByVOName(cpu_slots)
        slotsIO = self.thresholdsByVOName(io_slots)

        sitesSSB = {}
        if not stateBySite or not slotsCPU or not slotsIO:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
            return sitesSSB

        for siteName, state in stateBySite.items():
            sitesSSB[siteName] = {'state': state,
                                  'slotsCPU': slotsCPU.get(siteName),
                                  'slotsIO': slotsIO.get(siteName)}
        return sitesSSB

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            # the difference is a new set, so popping from infoRC is safe here
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site, infoRC[site]['state'],
                             infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.

        Sites whose slots are missing in SSB (None) are handled as having 0
        slots and therefore get the bare minimum.  The slots stored in
        infoSSB are adjusted in place.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            ssbSite = infoSSB[site]

            # getInfoFromSSB() stores None when a metric has no entry for the
            # site; arithmetic on None would abort the whole cycle
            for slotsKey in ('slotsCPU', 'slotsIO'):
                if ssbSite[slotsKey] is None:
                    ssbSite[slotsKey] = 0

            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                ssbSite['slotsCPU'] = ssbSite['slotsCPU'] * self.t1SitesCores/100
                ssbSite['slotsIO'] = ssbSite['slotsIO'] * self.t1SitesCores/100

            # round very small sites to the bare minimum
            ssbSite['slotsCPU'] = max(ssbSite['slotsCPU'], minCPUSlots)
            ssbSite['slotsIO'] = max(ssbSite['slotsIO'], minIOSlots)

            CPUBound = ssbSite['slotsCPU']
            IOBound = ssbSite['slotsIO']
            sitePending = max(int(CPUBound/agentsCount * self.pendingSlotsSitePercent/100), minCPUSlots)
            taskCPUPending = max(int(CPUBound/agentsCount * self.pendingSlotsTaskPercent/100), minCPUSlots)
            taskIOPending = max(int(IOBound/agentsCount * self.pendingSlotsTaskPercent/100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.debug("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots = sitePending,
                                                        runningJobSlots = CPUBound)
                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d", site, taskCPUPending,
                              CPUBound)
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType = task, maxSlots = CPUBound,
                                                         pendingSlots = taskCPUPending)
                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d", site, taskIOPending,
                              IOBound)
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType = task, maxSlots = IOBound,
                                                         pendingSlots = taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0 (done on every cycle, even
                # when the site slots did not change)
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent/100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent/100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent/100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent/100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites):
        """
        _thresholdsByVOName_

        Creates a dictionary with keys->VOName and values->threshold.

        Each entry of sites must provide 'VOName' and 'Value'.  A None value
        is stored as 0; when a VOName shows up more than once, the first entry
        is kept and an error is logged.
        """
        thresholdbyVOName = {}
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in thresholdbyVOName:
                logging.error('I have a duplicated threshold entry in SSB for %s', voname)
                continue
            if value is None:
                logging.warning('Site %s does not have thresholds in SSB, assuming 0', voname)
                thresholdbyVOName[voname] = 0
            else:
                thresholdbyVOName[voname] = int(value)
        return thresholdbyVOName

    def siteStatusByVOName(self, sites):
        """
        _siteStatusByVOName_

        Creates a dictionary with keys->VOName and values->status, where the
        status is already translated by getState().

        Each entry of sites must provide 'VOName' and 'Status'.  Entries with
        an empty or unknown status are skipped; when a VOName shows up more
        than once, the first entry is kept and an error is logged.
        """
        statusBySite = {}
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if not status:
                logging.error('Site %s does not have status in SSB', voname)
                continue
            if voname in statusBySite:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
                continue
            statusAgent = self.getState(str(status))
            if not statusAgent:
                logging.error("Unkwown status '%s' for site %s, please check SSB", status, voname)
                continue
            statusBySite[voname] = statusAgent
        return statusBySite

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.

        Returns None for a state that is not known.
        """
        ssb2agent = {'on':    'Normal',
                     'drain': 'Draining',
                     'down': 'Down',
                     'skip': 'Down'}

        if stateSSB in ssb2agent:
            return ssb2agent[stateSSB]

        if stateSSB == "tier0":
            # 'tier0' sites are usable only by an agent running in Tier0 mode
            logging.debug('There is a site in tier0 status (Tier0Mode is %s)', self.tier0Mode)
            if self.tier0Mode:
                return "Normal"
            return "Draining"

        return None

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        A failure is logged and not propagated to the caller.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return
예제 #28
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Store the configuration and read every setting this component
        uses from its AgentStatusWatcher and Agent sections.
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        watcher = config.AgentStatusWatcher

        # dashboard url and the metric columns to query
        self.dashboard = watcher.dashboard
        self.siteStatusMetric = watcher.siteStatusMetric
        self.cpuBoundMetric = watcher.cpuBoundMetric
        self.ioBoundMetric = watcher.ioBoundMetric

        # percentages used when computing pending and Tier0 task slots
        self.pendingSlotsSitePercent = watcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcher.pendingSlotsTaskPercent
        self.runningExpressPercent = watcher.runningExpressPercent
        self.runningRepackPercent = watcher.runningRepackPercent

        # optional list of sites that are always forced to Down
        self.forceSiteDown = getattr(watcher, 'forceSiteDown', [])

        # team of this agent and the fallback number of agents per team
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(watcher, 'defaultAgentsNumByTeam', 5)

        # whether sites unknown to SSB must be put in Down
        self.onlySSB = watcher.onlySSB

        # Tier0 mode is inferred from the presence of a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = watcher.t1SitesCores

        # on/off switch for the whole component (on unless configured)
        self.enabled = getattr(watcher, 'enabled', True)

        # resource control handle
        self.resourceControl = ResourceControl(config=self.config)

        # reader for the central WMStats
        self.centralCouchDBReader = WMStatsReader(watcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Run one update cycle; does nothing when the component is disabled.

        Steps of a cycle:
            1. read the current sites/slots from resource control
            2. read site info from SSB (the cycle stops if nothing comes back)
            3. apply site state changes
            4. refresh the number of agents working for this team
            5. apply site and task threshold changes
        Any exception raised along the way is logged and swallowed, so the
        next cycle can retry.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            rcInfo = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", rcInfo)

            ssbInfo = self.getInfoFromSSB()
            if not ssbInfo:
                return
            logging.debug("Info from SSB: %s", ssbInfo)

            # state changes first; this also trims sites needing no slot update
            rcInfo = self.checkStatusChanges(rcInfo, ssbInfo)

            # refreshes self.agentsNumByTeam
            self.getAgentsByTeam()

            # then the site/task thresholds
            self.checkSlotsChanges(rcInfo, ssbInfo, self.agentsNumByTeam)
        except Exception as err:
            logging.error("Error occurred, will retry later:")
            logging.error(str(err))
            logging.error("Trace back: \n%s", traceback.format_exc())

        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Refresh self.agentsNumByTeam from the WMStats agents-by-team view.

        In drain mode the value is pinned to 1. When WMStats fails or
        returns nothing, the current value is kept.
        """
        if isDrainMode(self.config):
            # a single agent maximizes the pending thresholds, so this
            # agent gets drained ASAP
            self.agentsNumByTeam = 1
            return

        try:
            teamCounts = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")
            teamCounts = {}

        if teamCounts:
            # teams missing from the view keep the current value
            self.agentsNumByTeam = teamCounts.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        else:
            logging.warning("agentInfo couch view is not available, use default value %s",
                            self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name and
        the inner dict holds 'state', 'slotsCPU' and 'slotsIO'. Sites that
        end up without all three entries are dropped. When nothing is left
        an error is logged and the empty dict is returned.

        Download or JSON parsing errors are not handled here; they
        propagate to the caller.
        """
        from contextlib import closing

        def fetchMetric(columnId):
            """Download one SSB metric column and return its 'csvdata' payload."""
            url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(columnId)
            # close the HTTP response deterministically instead of leaving
            # the underlying socket to the garbage collector
            with closing(urllib2.urlopen(url)) as response:
                return json.loads(response.read())['csvdata']

        ssbSiteState = fetchMetric(self.siteStatusMetric)
        ssbCpuSlots = fetchMetric(self.cpuBoundMetric)
        ssbIoSlots = fetchMetric(self.ioBoundMetric)

        # dict updated by these methods with status/thresholds info keyed by the site name
        ssbSiteSlots = {}
        self.siteStatusByVOName(ssbSiteState, ssbSiteSlots)
        self.thresholdsByVOName(ssbCpuSlots, ssbSiteSlots, slotsType='slotsCPU')
        self.thresholdsByVOName(ssbIoSlots, ssbSiteSlots, slotsType='slotsIO')

        # keep only sites with state, CPU slots and IO slots, such that no
        # updates are applied to partially described sites
        ssbSiteSlots = {site: info for site, info in ssbSiteSlots.items()
                        if len(info) == 3}

        if not ssbSiteSlots:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Work out which sites need a new state in resource control and
        apply it through updateSiteState(). Three passes:
          1. sites listed in self.forceSiteDown are put in Down
          2. with self.onlySSB, sites unknown to SSB are put in Down
          3. remaining sites known to both get the SSB state if it differs

        Sites handled by passes 1 and 2 are removed from infoRC, since
        their slots need no update. infoRC is modified in place and also
        returned.
        """
        # pass 1: forced sites (HLT @FNAL is an example)
        for siteName in self.forceSiteDown:
            if siteName in infoRC and infoRC[siteName]['state'] != 'Down':
                logging.info("Forcing site %s to Down", siteName)
                self.updateSiteState(siteName, 'Down')
            infoRC.pop(siteName, None)

        # pass 2: sites resource control knows about but SSB does not
        if self.onlySSB:
            for siteName in set(infoRC) - set(infoSSB):
                if infoRC[siteName]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', siteName)
                    self.updateSiteState(siteName, 'Down')
                infoRC.pop(siteName, None)

        # pass 3: infoRC is left alone here, slots still have to be checked
        for siteName in set(infoRC) & set(infoSSB):
            currentState = infoRC[siteName]['state']
            wantedState = infoSSB[siteName]['state']
            if currentState != wantedState:
                logging.info('Changing %s state from %s to %s', siteName,
                             currentState, wantedState)
                self.updateSiteState(siteName, wantedState)
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        For every site present in both infoRC and infoSSB, compare the
        running/pending slots stored in resource control with the ones
        derived from SSB and from the number of agents in the team.

        When the site values differ, the site slots and the thresholds of
        all its CPU and IO tasks are rewritten. In Tier0 mode the Express
        and Repack thresholds are written for every such site on every
        call, changed or not.

        The 'slotsCPU' and 'slotsIO' values in infoSSB are adjusted in place.
        """
        cpuTasks = ['Processing', 'Production']
        ioTasks = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots = 50
        minIOSlots = 25
        sitePercent = self.pendingSlotsSitePercent
        taskPercent = self.pendingSlotsTaskPercent

        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      sitePercent, taskPercent)

        for site in set(infoRC) & set(infoSSB):
            ssbSlots = infoSSB[site]
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                ssbSlots['slotsCPU'] = ssbSlots['slotsCPU'] * self.t1SitesCores / 100
                ssbSlots['slotsIO'] = ssbSlots['slotsIO'] * self.t1SitesCores / 100

            # very small sites are lifted to the bare minimum
            ssbSlots['slotsCPU'] = max(ssbSlots['slotsCPU'], minCPUSlots)
            ssbSlots['slotsIO'] = max(ssbSlots['slotsIO'], minIOSlots)

            cpuBound = ssbSlots['slotsCPU']
            ioBound = ssbSlots['slotsIO']
            sitePending = max(int(cpuBound / agentsCount * sitePercent / 100), minCPUSlots)
            cpuTaskPending = max(int(cpuBound / agentsCount * taskPercent / 100), minCPUSlots)
            ioTaskPending = max(int(ioBound / agentsCount * taskPercent / 100), minIOSlots)

            rcSlots = infoRC[site]
            if rcSlots['running_slots'] != cpuBound or rcSlots['pending_slots'] != sitePending:
                # site level running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d",
                             site, sitePending, cpuBound)
                self.resourceControl.setJobSlotsForSite(site,
                                                        pendingJobSlots=sitePending,
                                                        runningJobSlots=cpuBound)
                # task level slots: CPU tasks first (large running slots), then IO tasks
                taskGroups = (
                    ("Updating %s tasksCPU thresholds for pend/runn: %d/%d",
                     cpuTasks, cpuBound, cpuTaskPending),
                    ("Updating %s tasksIO thresholds for pend/runn: %d/%d",
                     ioTasks, ioBound, ioTaskPending),
                )
                for message, tasks, running, pending in taskGroups:
                    logging.debug(message, site, pending, running)
                    for task in tasks:
                        self.resourceControl.insertThreshold(site,
                                                             taskType=task,
                                                             maxSlots=running,
                                                             pendingSlots=pending)

            if self.tier0Mode:
                # Express and Repack thresholds are a share of the CPU slots
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(cpuBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots * taskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express',
                                                     expressSlots, pendingExpress)

                repackSlots = int(cpuBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots * taskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack',
                                                     repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites, ssbSiteSlots, slotsType):
        """
        _thresholdsByVOName_

        Add the CPU or IO slots to the per-site dict, only for sites that
        already have an entry in it (i.e. a valid state).

        :param sites: iterable of SSB entries; each entry is indexed with
            the 'VOName' and 'Value' keys
        :param ssbSiteSlots: dict keyed by site name, updated in place.
            A site whose value is None is removed from it; sites without
            an entry are only logged.
        :param slotsType: key under which int(value) is stored in the
            site entry
        """
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname not in ssbSiteSlots:
                logging.warning('Found %s thresholds for site %s which has no state in SSB',
                                slotsType, voname)
                continue
            if value is None:
                logging.warning('Site %s does not have thresholds in SSB. Taking no action',
                                voname)
                # then we better remove this site from our final dict
                ssbSiteSlots.pop(voname)
            else:
                ssbSiteSlots[voname][slotsType] = int(value)
        return

    def siteStatusByVOName(self, sites, ssbSiteSlots):
        """
        _siteStatusByVOName_

        Seed ssbSiteSlots with one inner dictionary per site, holding the
        site state translated by getState(). The slots are added to the
        same inner dictionary elsewhere.

        Sites whose status cannot be translated are logged and skipped.
        A site name already present in ssbSiteSlots is logged as a
        duplicate and left untouched.
        """
        for entry in sites:
            voname = entry['VOName']
            status = entry['Status']
            if voname in ssbSiteSlots:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
                continue
            agentState = self.getState(str(status))
            if agentState:
                ssbSiteSlots[voname] = {'state': agentState}
            else:
                logging.error("Unkwown status '%s' for site %s, please check SSB",
                              status, voname)
        return

    def getState(self, stateSSB):
        """
        _getState_

        Map an SSB state onto the matching resource control state.
        Returns None for a state that is not known.
        """
        # 'test' state behaviour varies between production and tier0 agents
        testState = 'Normal' if self.tier0Mode else "Draining"
        translation = {
            'enabled': 'Normal',
            'drain': 'Draining',
            'disabled': 'Down',
            'test': testState,
        }
        return translation.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Change the state of one site in the resource control database,
        leaving everything else as it is.

        A failure is logged (message, error text, traceback) and is not
        raised to the caller.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as error:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(error))
            logging.error("Traceback: \n%s", traceback.format_exc())
예제 #29
0
    def testList(self):
        """
        _testList_

        Test the functions that list thresholds for creating jobs and submitting
        jobs.

        Two sites with Processing and Merge thresholds are created, ten
        jobs in various states are injected through three subscriptions,
        and the output of listThresholdsForCreate() and
        listThresholdsForSubmit() is checked against the expected counts.
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1", "T1_US_FNAL", "LsfPlugin")
        myResourceControl.insertSite("testSite2", 20, 40, "testSE2", "testCE2")

        myResourceControl.insertThreshold("testSite1", "Processing", 20, 10)
        myResourceControl.insertThreshold("testSite1", "Merge", 200, 100)
        myResourceControl.insertThreshold("testSite2", "Processing", 50, 25)
        myResourceControl.insertThreshold("testSite2", "Merge", 135, 65)

        testWorkflow = Workflow(spec = makeUUID(), owner = "Steve",
                                name = makeUUID(), task = "Test")
        testWorkflow.create()

        testFilesetA = Fileset(name = "TestFilesetA")
        testFilesetA.create()
        testFilesetB = Fileset(name = "TestFilesetB")
        testFilesetB.create()
        testFilesetC = Fileset(name = "TestFilesetC")
        testFilesetC.create()

        # the same file, available at both storage elements, feeds all three filesets
        testFileA = File(lfn = "testFileA", locations = set(["testSE1", "testSE2"]))
        testFileA.create()
        testFilesetA.addFile(testFileA)
        testFilesetA.commit()
        testFilesetB.addFile(testFileA)
        testFilesetB.commit()
        testFilesetC.addFile(testFileA)
        testFilesetC.commit()

        # subscription A: Processing, testSite1 white listed
        testSubscriptionA = Subscription(fileset = testFilesetA,
                                        workflow = testWorkflow,
                                        type = "Processing")
        testSubscriptionA.create()
        testSubscriptionA.addWhiteBlackList([{"site_name": "testSite1", "valid": True}])
        # subscription B: Processing, testSite1 black listed
        testSubscriptionB = Subscription(fileset = testFilesetB,
                                        workflow = testWorkflow,
                                        type = "Processing")
        testSubscriptionB.create()
        testSubscriptionB.addWhiteBlackList([{"site_name": "testSite1", "valid": False}])
        # subscription C: Merge, no white/black list
        testSubscriptionC = Subscription(fileset = testFilesetC,
                                        workflow = testWorkflow,
                                        type = "Merge")
        testSubscriptionC.create()

        testJobGroupA = JobGroup(subscription = testSubscriptionA)
        testJobGroupA.create()
        testJobGroupB = JobGroup(subscription = testSubscriptionB)
        testJobGroupB.create()
        testJobGroupC = JobGroup(subscription = testSubscriptionC)
        testJobGroupC.create()

        # Site1, Has been assigned a location and is complete.
        testJobA = Job(name = "testJobA", files = [testFileA])
        testJobA["couch_record"] = makeUUID()
        testJobA.create(group = testJobGroupA)
        testJobA["state"] = "success"

        # Site 1, Has been assigned a location and is incomplete.
        testJobB = Job(name = "testJobB", files = [testFileA])
        testJobB["couch_record"] = makeUUID()
        testJobB.create(group = testJobGroupA)
        testJobB["state"] = "executing"
        runJobB = RunJob()
        runJobB.buildFromJob(testJobB)
        runJobB["status"] = "PEND"

        # Does not have a location, white listed to site 1
        testJobC = Job(name = "testJobC", files = [testFileA])
        testJobC["couch_record"] = makeUUID()
        testJobC.create(group = testJobGroupA)
        testJobC["state"] = "new"

        # Site 2, Has been assigned a location and is complete.
        testJobD = Job(name = "testJobD", files = [testFileA])
        testJobD["couch_record"] = makeUUID()
        testJobD.create(group = testJobGroupB)
        testJobD["state"] = "success"

        # Site 2, Has been assigned a location and is incomplete.
        testJobE = Job(name = "testJobE", files = [testFileA])
        testJobE["couch_record"] = makeUUID()
        testJobE.create(group = testJobGroupB)
        testJobE["state"] = "executing"
        runJobE = RunJob()
        runJobE.buildFromJob(testJobE)
        runJobE["status"] = "RUN"

        # Does not have a location, site 1 is blacklisted.
        testJobF = Job(name = "testJobF", files = [testFileA])
        testJobF["couch_record"] = makeUUID()
        testJobF.create(group = testJobGroupB)
        testJobF["state"] = "new"

        # Site 3, Has been assigned a location and is complete.
        testJobG = Job(name = "testJobG", files = [testFileA])
        testJobG["couch_record"] = makeUUID()
        testJobG.create(group = testJobGroupC)
        testJobG["state"] = "cleanout"

        # Site 3, Has been assigned a location and is incomplete.
        testJobH = Job(name = "testJobH", files = [testFileA])
        testJobH["couch_record"] = makeUUID()
        testJobH.create(group = testJobGroupC)
        testJobH["state"] = "new"

        # Site 3, Does not have a location.
        testJobI = Job(name = "testJobI", files = [testFileA])
        testJobI["couch_record"] = makeUUID()
        testJobI.create(group = testJobGroupC)
        testJobI["state"] = "new"

        # Site 3, Does not have a location and is in cleanout.
        testJobJ = Job(name = "testJobJ", files = [testFileA])
        testJobJ["couch_record"] = makeUUID()
        testJobJ.create(group = testJobGroupC)
        testJobJ["state"] = "cleanout"

        changeStateAction = self.daoFactory(classname = "Jobs.ChangeState")
        changeStateAction.execute(jobs = [testJobA, testJobB, testJobC, testJobD,
                                          testJobE, testJobF, testJobG, testJobH,
                                          testJobI, testJobJ])

        self.insertRunJob.execute([runJobB, runJobE])

        # NOTE(review): every location set below is testSite1, although the
        # job comments above mention sites 2 and 3 - confirm this is intended
        setLocationAction = self.daoFactory(classname = "Jobs.SetLocation")
        setLocationAction.execute(testJobA["id"], "testSite1")
        setLocationAction.execute(testJobB["id"], "testSite1")
        setLocationAction.execute(testJobD["id"], "testSite1")
        setLocationAction.execute(testJobE["id"], "testSite1")
        setLocationAction.execute(testJobG["id"], "testSite1")
        setLocationAction.execute(testJobH["id"], "testSite1")

        createThresholds = myResourceControl.listThresholdsForCreate()
        submitThresholds = myResourceControl.listThresholdsForSubmit()

        self.assertEqual( len(createThresholds), 2,
               "Error: Wrong number of sites in create thresholds" )

        self.assertEqual( createThresholds["testSite1"]["total_slots"], 10,
               "Error: Wrong number of slots for site 1" )

        self.assertEqual( createThresholds["testSite2"]["total_slots"], 20,
               "Error: Wrong number of slots for site 2" )

        # We should have two running jobs with locations at site one,
        # two running jobs without locations at site two, and one running
        # job without a location at site one and two.
        self.assertEqual( createThresholds["testSite1"]["pending_jobs"], 4,
               "Error: Wrong number of pending jobs for site 1" )

        # We should have one running job with a location at site 2 and
        # another running job without a location.
        self.assertEqual( createThresholds["testSite2"]["pending_jobs"], 2,
               "Error: Wrong number of pending jobs for site 2" )

        # The cms_name must be reported as well (None when it was not set)
        self.assertEqual(createThresholds["testSite1"]["cms_name"], "T1_US_FNAL")
        self.assertEqual(createThresholds["testSite2"]["cms_name"], None)

        # pick the Merge and Processing thresholds of each site out of the
        # submit thresholds
        mergeThreshold1 = None
        mergeThreshold2 = None
        procThreshold1  = None
        procThreshold2  = None
        self.assertEqual(submitThresholds["testSite1"]['cms_name'], 'T1_US_FNAL')
        for threshold in submitThresholds["testSite1"]["thresholds"]:
            if threshold['task_type'] == "Merge":
                mergeThreshold1 = threshold
            elif threshold['task_type'] == "Processing":
                procThreshold1 = threshold
        self.assertEqual(submitThresholds["testSite2"]['cms_name'], None)
        for threshold in submitThresholds["testSite2"]["thresholds"]:
            if threshold['task_type'] == "Merge":
                mergeThreshold2 = threshold
            elif threshold['task_type'] == "Processing":
                procThreshold2 = threshold

        self.assertEqual(submitThresholds["testSite1"]["total_running_jobs"], 1,
                         "Error: Wrong number of running jobs for submit thresholds.")
        self.assertEqual(submitThresholds["testSite2"]["total_running_jobs"], 0,
                         "Error: Wrong number of running jobs for submit thresholds.")
        self.assertEqual(submitThresholds["testSite1"]["total_pending_jobs"], 1,
                         "Error: Wrong number of pending jobs for submit thresholds.")
        self.assertEqual(submitThresholds["testSite2"]["total_pending_jobs"], 0,
                         "Error: Wrong number of pending jobs for submit thresholds.")

        # the failure messages name the counter that is actually checked, so
        # a failing pending check is not reported as a running one
        self.assertEqual(mergeThreshold1["task_running_jobs"], 0,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(mergeThreshold1["task_pending_jobs"], 0,
                         "Error: Wrong number of task pending jobs for submit thresholds.")
        self.assertEqual(procThreshold1["task_running_jobs"], 1,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(procThreshold1["task_pending_jobs"], 1,
                         "Error: Wrong number of task pending jobs for submit thresholds.")
        self.assertEqual(mergeThreshold2["task_running_jobs"], 0,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(mergeThreshold2["task_pending_jobs"], 0,
                         "Error: Wrong number of task pending jobs for submit thresholds.")
        self.assertEqual(procThreshold2["task_running_jobs"], 0,
                         "Error: Wrong number of task running jobs for submit thresholds.")
        self.assertEqual(procThreshold2["task_pending_jobs"], 0,
                         "Error: Wrong number of task pending jobs for submit thresholds.")

        return
예제 #30
0
    def testThresholdPriority(self):
        """
        _testThresholdPriority_

        Check that the thresholds listed for submission follow the task
        priorities, and that changing a priority is reflected in the
        listing of every site.
        """
        rc = ResourceControl()
        rc.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        rc.insertThreshold("testSite1", "Processing", 10, 8)
        rc.insertThreshold("testSite1", "Merge", 5, 3)
        rc.changeTaskPriority("Merge", 3)
        rc.changeTaskPriority("Processing", 1)

        # Merge has the larger priority value and is expected first
        site1 = rc.listThresholdsForSubmit()['testSite1']['thresholds']
        self.assertEqual(site1[0]['task_type'], 'Merge')
        self.assertEqual(site1[1]['task_type'], 'Processing')

        # insert the same thresholds again and swap the priorities
        rc.insertThreshold("testSite1", "Processing", 10, 8)
        rc.insertThreshold("testSite1", "Merge", 5, 3)
        rc.changeTaskPriority("Merge", 1)
        rc.changeTaskPriority("Processing", 3)

        # Should now be in reverse order
        site1 = rc.listThresholdsForSubmit()['testSite1']['thresholds']
        self.assertEqual(site1[1]['task_type'], 'Merge')
        self.assertEqual(site1[0]['task_type'], 'Processing')

        rc.insertSite("testSite2", 20, 40, "testSE2", "testCE2")
        rc.insertThreshold("testSite2", "Processing", 10, 8)
        rc.insertThreshold("testSite2", "Merge", 5, 3)

        # Should be in the same order for site 1 and 2
        listing = rc.listThresholdsForSubmit()
        for position in (0, 1):
            self.assertEqual(listing['testSite2']['thresholds'][position]['task_type'],
                             listing['testSite1']['thresholds'][position]['task_type'])

        # a new priority must show up on the first threshold of site 2
        rc.changeTaskPriority("Merge", 4)
        listing = rc.listThresholdsForSubmit()
        self.assertEqual(listing['testSite2']['thresholds'][0]['priority'], 4)

        return
예제 #31
0
    def setupForKillTest(self, baAPI = None):
        """
        _setupForKillTest_

        Inject a workflow into WMBS that has a processing task, a merge task and
        a cleanup task.  Inject files into the various tasks at various
        processing states (acquired, complete, available...).  Also create jobs
        for each subscription in various states.
        """
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package = "WMCore.WMBS",
                                logger = myThread.logger,
                                dbinterface = myThread.dbi)

        locationAction = daoFactory(classname = "Locations.New")
        changeStateAction = daoFactory(classname = "Jobs.ChangeState")
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName = 'site1', seName = 'goodse.cern.ch',
                                   ceName = 'site1', plugin = "TestPlugin")
        resourceControl.insertThreshold(siteName = 'site1', taskType = 'Processing', \
                                        maxSlots = 10000)

        inputFileset = Fileset("input")
        inputFileset.create()

        inputFileA = File("lfnA", locations = "goodse.cern.ch")
        inputFileB = File("lfnB", locations = "goodse.cern.ch")
        inputFileC = File("lfnC", locations = "goodse.cern.ch")
        inputFileA.create()
        inputFileB.create()
        inputFileC.create()

        inputFileset.addFile(inputFileA)
        inputFileset.addFile(inputFileB)
        inputFileset.addFile(inputFileC)
        inputFileset.commit()
        
        unmergedOutputFileset = Fileset("unmerged")        
        unmergedOutputFileset.create()

        unmergedFileA = File("ulfnA", locations = "goodse.cern.ch")
        unmergedFileB = File("ulfnB", locations = "goodse.cern.ch")
        unmergedFileC = File("ulfnC", locations = "goodse.cern.ch")
        unmergedFileA.create()
        unmergedFileB.create()
        unmergedFileC.create()        

        unmergedOutputFileset.addFile(unmergedFileA)
        unmergedOutputFileset.addFile(unmergedFileB)
        unmergedOutputFileset.addFile(unmergedFileC)
        unmergedOutputFileset.commit()

        mainProcWorkflow = Workflow(spec = "spec1", owner = "Steve",
                                    name = "Main", task = "Proc")
        mainProcWorkflow.create()
        mainProcMergeWorkflow = Workflow(spec = "spec1", owner = "Steve",
                                         name = "Main", task = "ProcMerge")
        mainProcMergeWorkflow.create()
        mainCleanupWorkflow = Workflow(spec = "spec1", owner = "Steve",
                                       name = "Main", task = "Cleanup")
        mainCleanupWorkflow.create()

        self.mainProcSub = Subscription(fileset = inputFileset,
                                        workflow = mainProcWorkflow,
                                        type = "Processing")
        self.mainProcSub.create()
        self.mainProcSub.acquireFiles(inputFileA)
        self.mainProcSub.completeFiles(inputFileB)

        procJobGroup = JobGroup(subscription = self.mainProcSub)
        procJobGroup.create()
        self.procJobA = Job(name = "ProcJobA")
        self.procJobA["state"] = "new"
        self.procJobA["location"] = "site1"
        self.procJobB = Job(name = "ProcJobB")
        self.procJobB["state"] = "executing"
        self.procJobB["location"] = "site1"
        self.procJobC = Job(name = "ProcJobC")
        self.procJobC["state"] = "complete"
        self.procJobC["location"] = "site1"
        self.procJobA.create(procJobGroup)
        self.procJobB.create(procJobGroup)
        self.procJobC.create(procJobGroup)

        self.mainMergeSub = Subscription(fileset = unmergedOutputFileset,
                                         workflow = mainProcMergeWorkflow,
                                         type = "Merge")
        self.mainMergeSub.create()
        self.mainMergeSub.acquireFiles(unmergedFileA)
        self.mainMergeSub.failFiles(unmergedFileB)

        mergeJobGroup = JobGroup(subscription = self.mainMergeSub)
        mergeJobGroup.create()
        self.mergeJobA = Job(name = "MergeJobA")
        self.mergeJobA["state"] = "exhausted"
        self.mergeJobA["location"] = "site1"
        self.mergeJobB = Job(name = "MergeJobB")
        self.mergeJobB["state"] = "cleanout"
        self.mergeJobB["location"] = "site1"
        self.mergeJobC = Job(name = "MergeJobC")
        self.mergeJobC["state"] = "new"
        self.mergeJobC["location"] = "site1"
        self.mergeJobA.create(mergeJobGroup)
        self.mergeJobB.create(mergeJobGroup)
        self.mergeJobC.create(mergeJobGroup)
        
        self.mainCleanupSub = Subscription(fileset = unmergedOutputFileset,
                                           workflow = mainCleanupWorkflow,
                                           type = "Cleanup")
        self.mainCleanupSub.create()
        self.mainCleanupSub.acquireFiles(unmergedFileA)
        self.mainCleanupSub.completeFiles(unmergedFileB)

        cleanupJobGroup = JobGroup(subscription = self.mainCleanupSub)
        cleanupJobGroup.create()
        self.cleanupJobA = Job(name = "CleanupJobA")
        self.cleanupJobA["state"] = "new"
        self.cleanupJobA["location"] = "site1"
        self.cleanupJobB = Job(name = "CleanupJobB")
        self.cleanupJobB["state"] = "executing"
        self.cleanupJobB["location"] = "site1"
        self.cleanupJobC = Job(name = "CleanupJobC")
        self.cleanupJobC["state"] = "complete"
        self.cleanupJobC["location"] = "site1"
        self.cleanupJobA.create(cleanupJobGroup)
        self.cleanupJobB.create(cleanupJobGroup)
        self.cleanupJobC.create(cleanupJobGroup)

        jobList = [self.procJobA, self.procJobB, self.procJobC,
                   self.mergeJobA, self.mergeJobB, self.mergeJobC,
                   self.cleanupJobA, self.cleanupJobB, self.cleanupJobC]

        changeStateAction.execute(jobList)

        if baAPI:
            for job in jobList:
                job['plugin'] = 'TestPlugin'
                job['userdn'] = 'Steve'
                job['custom']['location'] = 'site1'
            baAPI.createNewJobs(wmbsJobs = jobList)

        # We'll create an unrelated workflow to verify that it isn't affected
        # by the killing code.
        bogusFileset = Fileset("dontkillme")
        bogusFileset.create()

        bogusFileA = File("bogus/lfnA", locations = "goodse.cern.ch")
        bogusFileA.create()
        bogusFileset.addFile(bogusFileA)
        bogusFileset.commit()
        
        bogusWorkflow = Workflow(spec = "spec2", owner = "Steve",
                                 name = "Bogus", task = "Proc")
        bogusWorkflow.create()
        self.bogusSub = Subscription(fileset = bogusFileset,
                                     workflow = bogusWorkflow,
                                     type = "Processing")
        self.bogusSub.create()
        self.bogusSub.acquireFiles(bogusFileA)
        return
예제 #32
0
    def testThresholdPriority(self):
        """
        _testThresholdPriority_

        Check that listThresholdsForSubmit() returns the thresholds of each
        site ordered by priority: the test expects the threshold with the
        larger priority value to come first, the ordering of one site to be
        independent of the other, and a re-insert without an explicit
        priority to leave the listed priority at its previous value.
        """
        rc = ResourceControl()

        # Site 1: Merge carries the larger priority value, so it is expected first
        rc.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        rc.insertThreshold("testSite1", "Processing", 10, 8, priority = 1)
        rc.insertThreshold("testSite1", "Merge", 5, 3, priority = 2)

        listing = rc.listThresholdsForSubmit()
        siteOne = listing['testSite1']['thresholds']
        self.assertEqual(siteOne[0]['task_type'], 'Merge')
        siteOne = listing['testSite1']['thresholds']
        self.assertEqual(siteOne[1]['task_type'], 'Processing')

        # Swap the priorities of the two task types on site 1
        rc.insertThreshold("testSite1", "Processing", 10, 8, priority = 2)
        rc.insertThreshold("testSite1", "Merge", 5, 3, priority = 1)

        # Should now be in reverse order
        listing = rc.listThresholdsForSubmit()
        siteOne = listing['testSite1']['thresholds']
        self.assertEqual(siteOne[1]['task_type'], 'Merge')
        siteOne = listing['testSite1']['thresholds']
        self.assertEqual(siteOne[0]['task_type'], 'Processing')

        # Site 2 gets the initial priorities that site 1 started with
        rc.insertSite("testSite2", 20, 40, "testSE2", "testCE2")
        rc.insertThreshold("testSite2", "Processing", 10, 8, priority = 1)
        rc.insertThreshold("testSite2", "Merge", 5, 3, priority = 2)

        # Should be in proper order for site 2
        listing = rc.listThresholdsForSubmit()
        siteTwo = listing['testSite2']['thresholds']
        self.assertEqual(siteTwo[0]['task_type'], 'Merge')
        siteTwo = listing['testSite2']['thresholds']
        self.assertEqual(siteTwo[1]['task_type'], 'Processing')

        # Should still be in reverse order for site 1
        siteOne = listing['testSite1']['thresholds']
        self.assertEqual(siteOne[1]['task_type'], 'Merge')
        siteOne = listing['testSite1']['thresholds']
        self.assertEqual(siteOne[0]['task_type'], 'Processing')

        # Re-insert the Merge threshold without a priority: the first entry
        # for site 2 is still expected to report priority 2
        rc.insertThreshold("testSite2", "Merge", 20, 10)
        listing = rc.listThresholdsForSubmit()
        self.assertEqual(listing['testSite2']['thresholds'][0]['priority'], 2)
예제 #33
0
    def setUp(self):
        """
        Prepare the environment for a test.

        Sets up logging, the database connection and schema, two couch
        databases, a WMBS DAO factory, the resource control sites and their
        'Processing' thresholds, a user record, a working directory and two
        registered heartbeat components.
        """
        currentThread = threading.currentThread()

        # Logging, database connection, schema and couch databases
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        schemaModules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"]
        self.testInit.setSchema(customModules = schemaModules,
                                useDefault = False)
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        # DAO factory bound to the WMBS package
        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = currentThread.logger,
                                     dbinterface = currentThread.dbi)
        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

        # These two DAOs are instantiated but not used further in this method
        locationAction = self.daoFactory(classname = "Locations.New")
        locationSlots  = self.daoFactory(classname = "Locations.SetJobSlots")

        # Create sites in resourceControl; self.sites is defined outside this method
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site),
                                       ceName = site, plugin = "CondorPlugin", jobSlots = 1000)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing',
                                            maxSlots = 1000)

        # Additional sites as (site/CE name, SE name, plugin, max slots);
        # each one uses its site name as CE name as well
        extraSites = [('Xanadu', 'se.Xanadu', "TestPlugin", 10000),
                      ('jade-cms.hip.fi', 'madhatter.csc.fi', "ARCPlugin", 100),
                      # using this for glite submissions
                      ('grid-ce-01.ba.infn.it', 'storm-se-01.ba.infn.it', 'gLitePlugin', 50)]
        for siteName, seName, pluginName, slotLimit in extraSites:
            resourceControl.insertSite(siteName = siteName, seName = seName,
                                       ceName = siteName, plugin = pluginName)
            resourceControl.insertThreshold(siteName = siteName, taskType = 'Processing',
                                            maxSlots = slotLimit)

        # Create user
        userCreator = self.daoFactory(classname = "Users.New")
        userCreator.execute(dn = "mnorman", group_name = "phgroup", role_name = "cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat for the 'test' and 'JobTracker' components
        self.heartbeatAPI = HeartbeatAPI('test')
        self.heartbeatAPI.registerComponent()
        self.heartbeatAPI2 = HeartbeatAPI('JobTracker')
        self.heartbeatAPI2.registerComponent()
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB

    On every cycle (see algorithm()) the site states and slots stored in the
    resource control database are compared against the values published by
    the SSB dashboard, and resource control is updated where they differ.
    """

    def __init__(self, config):
        """
        Initialize

        Reads all the settings from the AgentStatusWatcher (and Agent)
        sections of `config` and creates the resource control and WMStats
        client objects used by the other methods.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode: the agent is considered a Tier0 agent when the
        # configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (per the original author's note
               this triggers a condor classAd fetch - not visible in this class)
            5. Change site thresholds when needed (and task thresholds)

        NOTE(review): the original notes stated that sites from SSB are
        validated with PhEDEx node names; no such validation is visible in
        this class - confirm where it happens.

        Any exception raised during the cycle is logged and swallowed, such
        that the next cycle can retry. `parameters` is not used.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                # nothing usable from SSB (already logged), do not touch resource control
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and store the number of
        agents of this agent's team in self.agentsNumByTeam. The previous
        value is kept when WMStats cannot be reached or has no data for the
        team. In drain mode the value is forced to 1.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception as ex:
            # best effort: fall back to the current value, but keep the reason in the log
            logging.error("WMStats is not available or is unresponsive: %s", str(ex))

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)

    def _fetchSSBMetric(self, metricId):
        """
        __fetchSSBMetric_

        Download the latest data of the given SSB metric column and return
        the content of its 'csvdata' key. The HTTP response is always closed,
        also when reading or parsing fails.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(metricId)
        response = urllib2.urlopen(url)
        try:
            # parse from json format to dictionary, get only 'csvdata'
            return json.loads(response.read())['csvdata']
        finally:
            response.close()

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name. Only
        sites having all of 'state', 'slotsCPU' and 'slotsIO' are returned,
        so the result is empty when any of the metrics provides no data.
        """
        # get info from dashboard, one request per metric column
        ssbSiteState = self._fetchSSBMetric(self.siteStatusMetric)
        ssbCpuSlots = self._fetchSSBMetric(self.cpuBoundMetric)
        ssbIoSlots = self._fetchSSBMetric(self.ioBoundMetric)

        # dict updated by these methods with status/thresholds info keyed by the site name
        ssbSiteSlots = {}
        self.siteStatusByVOName(ssbSiteState, ssbSiteSlots)
        self.thresholdsByVOName(ssbCpuSlots, ssbSiteSlots, slotsType='slotsCPU')
        self.thresholdsByVOName(ssbIoSlots, ssbSiteSlots, slotsType='slotsIO')

        # Now remove sites with incomplete information (e.g. state only),
        # such that no updates are applied to them
        ssbSiteSlots = {k: v for k, v in ssbSiteSlots.items() if len(v) == 3}

        if not ssbSiteSlots:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information). Note that infoRC
        is modified in place.
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            # iterate over a separate set, since infoRC is changed inside the loop
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            currentState = infoRC[site]['state']
            newState = infoSSB[site]['state']
            if currentState != newState:
                logging.info('Changing %s state from %s to %s', site, currentState, newState)
                self.updateSiteState(site, newState)
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        In tier0 mode, the Express and Repack task thresholds are set on
        every call for every site present in both infoRC and infoSSB.

        The slots in infoSSB are adjusted in place (T1 scaling in tier0 mode
        and the minimum number of slots).
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        # the pending slots are divided by the number of agents, protect against
        # a missing/zero count which would otherwise abort the whole update
        if not agentsCount or agentsCount < 1:
            logging.warning("Invalid number of agents for team %s: %s. Assuming 1 agent.",
                            self.teamName, agentsCount)
            agentsCount = 1

        logging.debug("Settings for site and task pending slots: %s%% and %s%%", self.pendingSlotsSitePercent,
                      self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] = infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100
                infoSSB[site]['slotsIO'] = infoSSB[site]['slotsIO'] * self.t1SitesCores / 100

            # round very small sites to the bare minimum
            if infoSSB[site]['slotsCPU'] < minCPUSlots:
                infoSSB[site]['slotsCPU'] = minCPUSlots
            if infoSSB[site]['slotsIO'] < minIOSlots:
                infoSSB[site]['slotsIO'] = minIOSlots

            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']
            # pending slots are a share of the running slots per agent, never below the minimum
            sitePending = max(int(CPUBound / agentsCount * self.pendingSlotsSitePercent / 100), minCPUSlots)
            taskCPUPending = max(int(CPUBound / agentsCount * self.pendingSlotsTaskPercent / 100), minCPUSlots)
            taskIOPending = max(int(IOBound / agentsCount * self.pendingSlotsTaskPercent / 100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)
                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d", site, taskCPUPending,
                              CPUBound)
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType=task, maxSlots=CPUBound,
                                                         pendingSlots=taskCPUPending)
                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d", site, taskIOPending,
                              IOBound)
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType=task, maxSlots=IOBound,
                                                         pendingSlots=taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites, ssbSiteSlots, slotsType):
        """
        _thresholdsByVOName_

        Updates the dict with CPU and IO slots, only for sites with a valid state

        `sites` is the list of SSB entries (each one with 'VOName' and 'Value'),
        `ssbSiteSlots` is the dict keyed by site name which is updated in place
        and `slotsType` is the key used to store the slots for each site.
        Sites without a value are removed from `ssbSiteSlots`.
        """
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in ssbSiteSlots:
                if value is None:
                    logging.warning('Site %s does not have thresholds in SSB. Taking no action', voname)
                    # then we better remove this site from our final dict
                    ssbSiteSlots.pop(voname)
                else:
                    ssbSiteSlots[voname][slotsType] = int(value)
            else:
                logging.warning('Found %s thresholds for site %s which has no state in SSB', slotsType, voname)

    def siteStatusByVOName(self, sites, ssbSiteSlots):
        """
        _siteStatusByVOName_

        Creates an inner dictionary for each site that will contain
        the site state and the number of slots

        `sites` is the list of SSB entries (each one with 'VOName' and 'Status')
        and `ssbSiteSlots` is the dict keyed by site name, updated in place.
        Sites with a status that getState() cannot translate are not added,
        and only the first entry of a duplicated site is considered.
        """
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteSlots:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)
                else:
                    ssbSiteSlots[voname] = {'state': statusAgent}
            else:
                logging.error('I have a duplicated status entry in SSB for %s', voname)

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state

        Returns None when the SSB state is not known.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else 'Draining'}

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated to the caller.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())