Example #1
0
    def testInsertAllSEs2(self):
        """
        _testInsertAllSEs2_
        
        Test to see if we can insert all SEs and Thresholds at once
        Depending on the WMCore.Services.SiteDB interface
        """
        myResourceControl = ResourceControl()
        taskList = [
            {"taskType": "Processing", "maxSlots": 100, "priority": 1},
            {"taskType": "Merge", "maxSlots": 50, "priority": 2},
        ]

        myResourceControl.insertAllSEs(
            siteName="test", jobSlots=200, ceName="glidein-ce.fnal.gov", plugin="CondorPlugin", taskList=taskList
        )
        result = myResourceControl.listThresholdsForSubmit()
        self.assertTrue("test_cmssrm.fnal.gov" in result.keys())
        self.assertEqual(result["test_cmssrm.fnal.gov"][0]["cms_name"], "T1_US_FNAL")
        for x in result.keys():
            self.assertEqual(len(result[x]), 2)
            for thresh in result[x]:
                if thresh["task_type"] == "Processing":
                    self.assertEqual(thresh["priority"], 1)
                    self.assertEqual(thresh["max_slots"], 100)
                    self.assertEqual(thresh["total_slots"], 200)
                else:
                    self.assertEqual(thresh["priority"], 2)
                    self.assertEqual(thresh["max_slots"], 50)
                    self.assertEqual(thresh["total_slots"], 200)
        return
Example #2
0
    def testInsertAllSEs(self):
        """
        _testInsertAllSEs_

        Test to see if we can insert all SEs and Thresholds at once
        Depending on the WMCore.Services.SiteDB interface
        """
        self.createConfig()

        resControlPath = os.path.join(WMCore.WMBase.getTestBase(), "../../bin/wmagent-resource-control")
        env = os.environ
        env["PYTHONPATH"] = ":".join(sys.path)
        cmdline = [resControlPath, "--add-all-sites", "--plugin=CondorPlugin", "--site-slots=100"]
        retval = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
        (output, _) = retval.communicate()

        myResourceControl = ResourceControl()
        result = myResourceControl.listThresholdsForSubmit()
        self.assertTrue("T1_US_FNAL" in result.keys())
        for x in result.keys():
            self.assertEqual(len(result[x]), 7)
            for thresh in result[x]:
                if thresh["task_type"] == "Processing":
                    self.assertEqual(thresh["priority"], 1)
                    self.assertEqual(thresh["max_slots"], 100)
                    self.assertEqual(thresh["total_slots"], 100)
Example #3
0
    def testInsertSite(self):
        """
        _testInsertSite_

        Test to see if we can insert a fake test site alone
        with a single option
        """
        self.createConfig()

        resControlPath = os.path.join(getTestBase(), "../../bin/wmagent-resource-control")
        env = os.environ
        env['PYTHONPATH'] = ":".join(sys.path)
        cmdline = [resControlPath, "--add-Test"]
        retval = subprocess.Popen(cmdline,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  env=env)
        (_, _) = retval.communicate()
        myResourceControl = ResourceControl()
        result = myResourceControl.listThresholdsForSubmit()
        self.assertEqual(len(result), 1)
        self.assertTrue('CERN' in result)
        for x in result:
            self.assertEqual(len(result[x]['thresholds']), 7)
            self.assertEqual(result[x]['total_pending_slots'], 500)
            self.assertEqual(result[x]['total_running_slots'], 1)
            for taskType, thresh in result[x]['thresholds'].items():
                if taskType == 'Processing':
                    self.assertEqual(thresh['priority'], 0)
                    self.assertEqual(thresh['max_slots'], 1)

        # Verify that sites with more than one SE were added correctly.
        cernInfo = myResourceControl.listSiteInfo("CERN")
        self.assertTrue(len(cernInfo["pnn"]) == 2)
        return
Example #4
0
def freeSlots(multiplier = 1.0, minusRunning = False, allowedStates = ['Normal'], knownCmsSites = None):
    """
    Get free resources from wmbs.

    Specify multiplier to apply a ratio to the actual numbers.
    minusRunning control if running jobs should be counted
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl
    rc_sites = ResourceControl().listThresholdsForCreate()
    sites = defaultdict(lambda: 0)
    for name, site in rc_sites.items():
        if not site.get('cms_name'):
            logging.warning("Not fetching work for %s, cms_name not defined" % name)
            continue
        if knownCmsSites and site['cms_name'] not in knownCmsSites:
            logging.warning("%s doesn't appear to be a known cms site, work may fail to be acquired for it" % site['cms_name'])
        if site['state'] not in allowedStates:
            continue
        slots = site['total_slots']
        if minusRunning:
            slots -= site['pending_jobs']
        sites[site['cms_name']] += (slots * multiplier)

    # At the end delete entries < 1
    # This allows us to combine multiple sites under the same CMS_Name
    # Without going nuts
    for site in sites.keys():
        if sites[site] < 1:
            del sites[site]
    return dict(sites)
Example #5
0
    def testInsertAllSEs2(self):
        """
        _testInsertAllSEs2_

        Test to see if we can insert all SEs and Thresholds at once
        Depending on the WMCore.Services.SiteDB interface
        """
        myResourceControl = ResourceControl()
        taskList = [{'taskType': 'Processing', 'maxSlots': 100, 'pendingSlots' : 80},
                    {'taskType': 'Merge', 'maxSlots': 50, 'pendingSlots' : 30}]

        myResourceControl.insertAllSEs(siteName = 'test', pendingSlots = 200,
                                       runningSlots = 400,
                                       ceName = 'glidein-ce.fnal.gov',
                                       plugin = 'CondorPlugin', taskList = taskList)
        result = myResourceControl.listThresholdsForSubmit()
        self.assertTrue('test_cmssrm.fnal.gov' in result.keys())
        self.assertEqual(result['test_cmssrm.fnal.gov']['cms_name'], 'T1_US_FNAL')
        for x in result.keys():
            self.assertEqual(len(result[x]['thresholds']), 2)
            self.assertEqual(result[x]['total_pending_slots'], 200)
            self.assertEqual(result[x]['total_running_slots'], 400)
            for thresh in result[x]['thresholds']:
                if thresh['task_type'] == 'Processing':
                    self.assertEqual(thresh['priority'], 0)
                    self.assertEqual(thresh['max_slots'], 100)
                    self.assertEqual(thresh['pending_slots'], 80)

                else:
                    self.assertEqual(thresh['priority'], 5)
                    self.assertEqual(thresh['max_slots'], 50)
                    self.assertEqual(thresh['pending_slots'], 30)
        return
    def setUp(self):
        """
        _setUp_

        Set everything up.
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules = ["WMCore.WMBS", "WMCore.BossAir",
                                                 "WMCore.ResourceControl"],
                                useDefault = False)
        self.testInit.setupCouch("jobsubmittercaching_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmittercaching_t/fwjrs", "FWJRDump")

        resourceControl = ResourceControl()
        for siteName in ["T1_US_FNAL", "T1_UK_RAL"]:
            resourceControl.insertSite(siteName = siteName, pnn = "se.%s" % (siteName),
                                       ceName = siteName, plugin = "CondorPlugin", cmsName = siteName)
            resourceControl.insertThreshold(siteName = siteName, taskType = "Processing",
                                            maxSlots = 10000, pendingSlots = 10000)

        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        return
Example #7
0
def freeSlots(multiplier = 1.0, minusRunning = False, onlyDrain = False, skipDrain = True, knownCmsSites = None):
    """
    Get free resources from wmbs.

    Specify multiplier to apply a ratio to the actual numbers.
    minusRunning control if running jobs should be counted
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl
    rc_sites = ResourceControl().listThresholdsForCreate()
    sites = defaultdict(lambda: 0)
    for name, site in rc_sites.items():
        if not site.get('cms_name'):
            logging.warning("Not fetching work for %s, cms_name not defined" % name)
            continue
        if knownCmsSites and site['cms_name'] not in knownCmsSites:
            logging.warning("%s doesn't appear to be a known cms site, work may fail to be acquired for it" % site['cms_name'])
        if onlyDrain and not site.get('drain'):
            continue
        if skipDrain and site.get('drain'):
            continue
        slots = site['total_slots']
        if minusRunning:
            slots -= site['running_jobs']
        if slots > 0:
            sites[site['cms_name']] += (slots * multiplier)
    return dict(sites)
Example #8
0
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setSchema(customModules=["WMCore.WMBS"])

        self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting")

        self.myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=self.myThread.dbi)

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("T1_US_FNAL", 1000, 2000, "T1_US_FNAL_Disk", "T1_US_FNAL")
        myResourceControl.insertSite("T2_CH_CERN", 1000, 2000, "T2_CH_CERN", "T2_CH_CERN")

        self.performanceParams = {'timePerEvent': 12,
                                  'memoryRequirement': 2300,
                                  'sizePerEvent': 400}
        # dummy workflow
        self.testWorkflow = Workflow(spec="spec.xml", owner="dmwm", name="testWorkflow", task="Test")
        self.testWorkflow.create()

        return
Example #9
0
    def testInsertAllSEs(self):
        """
        _testInsertAllSEs_

        Test to see if we can insert all SEs and Thresholds at once
        Depending on the WMCore.Services.SiteDB interface
        """
        self.createConfig()

        resControlPath = os.path.join(WMCore.WMBase.getTestBase(), "../../bin/wmagent-resource-control")
        env = os.environ
        env['PYTHONPATH'] = ":".join(sys.path)
        cmdline = [resControlPath, "--add-all-sites", "--plugin=CondorPlugin", "--pending-slots=100", "--running-slots=500" ]
        retval = subprocess.Popen( cmdline,
                                   stdout = subprocess.PIPE,
                                   stderr = subprocess.STDOUT,
                                   env = env)
        ( output, _ ) = retval.communicate()

        myResourceControl = ResourceControl()
        result = myResourceControl.listThresholdsForSubmit()
        self.assertTrue('T1_US_FNAL' in result.keys())
        for x in result.keys():
            self.assertEqual(len(result[x]['thresholds']), 8)
            self.assertEqual(result[x]['total_pending_slots'], 100)
            self.assertEqual(result[x]['total_running_slots'], 500)
            for thresh in result[x]['thresholds']:
                if thresh['task_type'] == 'Processing':
                    self.assertEqual(thresh['priority'], 1)
                    self.assertEqual(thresh['max_slots'], 500)

        # Verify that sites with more than one SE were added correctly.
        nebInfo = myResourceControl.listSiteInfo("T2_US_Nebraska")
        self.assertTrue(len(nebInfo["se_name"]) == 3)
        return
Example #10
0
def freeSlots(multiplier=1.0,
              minusRunning=False,
              allowedStates=['Normal'],
              knownCmsSites=None):
    """
    Get free resources from wmbs.

    Specify multiplier to apply a ratio to the actual numbers.
    minusRunning control if running jobs should be counted
    """
    rc_sites = ResourceControl().listThresholdsForCreate()
    thresholds = defaultdict(lambda: 0)
    jobCounts = defaultdict(dict)
    for name, site in rc_sites.items():
        if not site.get('cms_name'):
            logging.warning("Not fetching work for %s, cms_name not defined",
                            name)
            continue
        if knownCmsSites and site['cms_name'] not in knownCmsSites:
            logging.warning(
                "%s doesn't appear to be a known cms site, work may fail to be acquired for it",
                site['cms_name'])
        if site['state'] not in allowedStates:
            continue
        slots = site['total_slots']
        thresholds[site['cms_name']] += (slots * multiplier)
        if minusRunning:
            jobCounts[site['cms_name']] = dict(
                (k, jobCounts[site['cms_name']].get(k, 0) +
                 site['pending_jobs'].get(k, 0)) for k in site['pending_jobs'])

    return dict(thresholds), dict(jobCounts)
Example #11
0
    def testInsertAllSEs2(self):
        """
        _testInsertAllSEs2_

        Test to see if we can insert all SEs and Thresholds at once
        Depending on the WMCore.Services.SiteDB interface
        """
        myResourceControl = ResourceControl()
        taskList = [{'taskType': 'Processing', 'maxSlots': 100, 'pendingSlots': 80},
                    {'taskType': 'Merge', 'maxSlots': 50, 'pendingSlots': 30}]
        myResourceControl.insertAllSEs(siteName='test', pendingSlots=200, runningSlots=400,
                                       ceName='glidein-ce.fnal.gov', plugin='CondorPlugin', taskList=taskList)
        result = myResourceControl.listThresholdsForSubmit()
        self.assertTrue('test_T1_US_FNAL_Buffer' in result.keys())
        self.assertGreaterEqual(len(result), 1)
        for siteName in result.iterkeys():
            self.assertEqual(len(result[siteName]['thresholds']), 2)
            self.assertEqual(result[siteName]['total_pending_slots'], 200)
            self.assertEqual(result[siteName]['total_running_slots'], 400)
            for taskType, thresh in result[siteName]['thresholds'].iteritems():
                if taskType == 'Processing':
                    self.assertEqual(thresh['priority'], 0)
                    self.assertEqual(thresh['max_slots'], 100)
                    self.assertEqual(thresh['pending_slots'], 80)
                else:
                    self.assertEqual(thresh['priority'], 5)
                    self.assertEqual(thresh['max_slots'], 50)
                    self.assertEqual(thresh['pending_slots'], 30)
        return
Example #12
0
    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.  Try to create all of the
        WMBS tables.  Also, create some dummy locations.
        """
        
        myThread = threading.currentThread()

        self.sites = ['T2_US_Florida', 'T2_US_UCSD', 'T2_TW_Taiwan', 'T1_CH_CERN']
        
        self.testInit = TestInit(__file__)
        self.testInit.setLogging(logLevel = logging.DEBUG)
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules = ['WMCore.WMBS', 
                                                 'WMCore.ResourceControl',
                                                 'WMCore.Agent.Database'], useDefault = False)
        self.testInit.setupCouch("dashboardreporter_t/jobs", "JobDump")
        self.testInit.setupCouch("dashboardreporter_t/fwjrs", "FWJRDump")


        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName = site, seName = site, ceName = site)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \
                                            maxSlots = 10000)

        self.testDir = self.testInit.generateWorkDir()
        self.alertsReceiver = None
        return
Example #13
0
def freeSlots(multiplier = 1.0, minusRunning = False, allowedStates = ['Normal'], knownCmsSites = None):
    """
    Get free resources from wmbs.

    Specify multiplier to apply a ratio to the actual numbers.
    minusRunning control if running jobs should be counted
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl
    rc_sites = ResourceControl().listThresholdsForCreate()
    thresholds = defaultdict(lambda: 0)
    jobCounts = defaultdict(dict)
    for name, site in rc_sites.items():
        if not site.get('cms_name'):
            logging.warning("Not fetching work for %s, cms_name not defined" % name)
            continue
        if knownCmsSites and site['cms_name'] not in knownCmsSites:
            logging.warning("%s doesn't appear to be a known cms site, work may fail to be acquired for it" % site['cms_name'])
        if site['state'] not in allowedStates:
            continue
        slots = site['total_slots']
        thresholds[site['cms_name']] += (slots * multiplier)
        if minusRunning:
            jobCounts[site['cms_name']] = dict((k, jobCounts[site['cms_name']].get(k, 0) + site['pending_jobs'].get(k, 0))
                                               for k in site['pending_jobs'])

    return dict(thresholds), dict(jobCounts)
Example #14
0
    def testInsertT0(self):
        """
        _testInsertT0_

        Test to see if we can insert the Tier-0 alone
        with a single option
        """
        self.createConfig()

        resControlPath = os.path.join(getTestBase(), "../../bin/wmagent-resource-control")
        env = os.environ
        env['PYTHONPATH'] = ":".join(sys.path)
        cmdline = [resControlPath, "--add-T0" ]
        retval = subprocess.Popen( cmdline,
                                   stdout = subprocess.PIPE,
                                   stderr = subprocess.STDOUT,
                                   env = env)
        (_, _) = retval.communicate()
        myResourceControl = ResourceControl()
        result = myResourceControl.listThresholdsForSubmit()
        self.assertEqual(len(result), 1)
        self.assertTrue('CERN' in result)
        for x in result:
            self.assertEqual(len(result[x]['thresholds']), 9)
            self.assertEqual(result[x]['total_pending_slots'], 500)
            self.assertEqual(result[x]['total_running_slots'], -1)
            for taskType, thresh in result[x]['thresholds'].items():
                if taskType == 'Processing':
                    self.assertEqual(thresh['priority'], 0)
                    self.assertEqual(thresh['max_slots'], -1)

        # Verify that sites with more than one SE were added correctly.
        cernInfo = myResourceControl.listSiteInfo("CERN")
        self.assertTrue(len(cernInfo["pnn"]) == 2)
        return
Example #15
0
    def testF_OverloadTest(self):
        """
        _OverloadTest_
        
        Test and see what happens if you put in more jobs
        Then the sites can handle
        """

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertThreshold(siteName=site, taskType="Silly", maxSlots=1)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        workloadName = "basicWorkload"
        myThread = threading.currentThread()
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10
        cacheDir = os.path.join(self.testDir, "CacheDir")

        jobGroupList = self.createJobGroups(
            nSubs=nSubs,
            nJobs=nJobs,
            task=workload.getTask("ReReco"),
            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
            type="Silly",
        )

        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter = JobSubmitterPoller(config=config)

        # Actually run it
        jobSubmitter.algorithm()

        # Should be one job for each site
        nSites = len(self.sites)
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSites)

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Executing", jobType="Silly")
        self.assertEqual(len(result), nSites)
        result = getJobsAction.execute(state="Created", jobType="Silly")
        self.assertEqual(len(result), nJobs * nSubs - nSites)

        # Now clean-up
        command = ["condor_rm", self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        del jobSubmitter

        return
Example #16
0
    def setUp(self):

        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        #self.tearDown()
        self.testInit.setSchema(customModules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
                                useDefault = False)

        self.daoFactory = DAOFactory(package = "WMCore.BossAir",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName = 'Xanadu', seName = 'se.Xanadu',
                                   ceName = 'Xanadu', plugin = "TestPlugin")
        resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \
                                        maxSlots = 10000, pendingSlots = 10000)

        # Create user
        wmbsFactory = DAOFactory(package = "WMCore.WMBS",
                                 logger = myThread.logger,
                                 dbinterface = myThread.dbi)
        newuser = wmbsFactory(classname = "Users.New")
        newuser.execute(dn = "mnorman", group_name = "phgroup", role_name = "cmsrole")
Example #17
0
    def testJobSiteDrain(self):
        """
        _testJobSiteDrain_

        Test the behavior of jobs pending to a single site that is in drain mode
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        jobSubmitter = JobSubmitterPoller(config=config)
        myResourceControl = ResourceControl(config)
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")

        nSubs = 1
        nJobs = 30

        site = 'T2_US_Nebraska'
        self.setResourceThresholds(site, pendingSlots=100, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10, 'runningSlots': 10},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[site],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # submit first 10 jobs
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)

        myResourceControl.changeSiteState(site, 'Draining')

        # site is now in drain, so don't submit anything
        jobSubmitter.algorithm()

        # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 0)

        # make sure the drain grace period expires...
        time.sleep(3)
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        # the remaining jobs should have gone to submitfailed by now
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 0)
Example #18
0
    def testJobSiteDrain(self):
        """
        _testJobSiteDrain_

        Test the behavior of jobs pending to a single site that is in drain mode
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        jobSubmitter = JobSubmitterPoller(config=config)
        myResourceControl = ResourceControl(config)
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")

        nSubs = 1
        nJobs = 30

        site = 'T2_US_Nebraska'
        self.setResourceThresholds(site, pendingSlots=100, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10, 'runningSlots': 10},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[site],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # submit first 10 jobs
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)

        myResourceControl.changeSiteState(site, 'Draining')

        # site is now in drain, so don't submit anything
        jobSubmitter.algorithm()

        # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 0)

        # make sure the drain grace period expires...
        time.sleep(3)
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        # the remaining jobs should have gone to submitfailed by now
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 0)
Example #19
0
    def __init__(self, config):
        """
        Initialize
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.ssb2AgentStatus = {
            'enabled': 'Normal',
            'drain': 'Draining',
            'disabled': 'Down',
            'test': 'Draining',
            'unknown': None
        }
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        _token = config.AgentStatusWatcher.grafanaToken
        self.grafanaURL = config.AgentStatusWatcher.grafanaURL
        self.grafanaAPIName = config.AgentStatusWatcher.grafanaSSB
        self.grafana = Grafana(_token,
                               configDict={"endpoint": self.grafanaURL})

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher,
                                     'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher,
                                       'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(
            self.config.General.centralWMStatsURL)
Example #20
0
    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.  Try to create all of the
        WMBS tables.  Also, create some dummy locations.
        """

        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        #self.tearDown()
        self.testInit.setSchema(customModules = ['WMCore.WMBS',
                                                 'WMCore.ResourceControl',
                                                 'WMCore.Agent.Database'], useDefault = False)
        self.couchdbname = "jobcreator_t"
        self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")


        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)

        locationAction = self.daoFactory(classname = "Locations.New")
        for site in self.sites:
            locationAction.execute(siteName = site, seName = site)



        #Create sites in resourceControl

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName = site, seName = site, ceName = site)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \
                                            maxSlots = 10000, pendingSlots = 10000)

        self.resourceControl = resourceControl



        self._setup = True
        self._teardown = False

        self.testDir = self.testInit.generateWorkDir()
        self.cwd = os.getcwd()

        # Set heartbeat
        self.componentName = 'JobCreator'
        self.heartbeatAPI  = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        return
Example #21
0
    def testThresholdsForSite(self):
        """
        _testThresholdsForSite_

        Check that we can get the thresholds in intelligible form
        for each site
        """

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        myResourceControl.insertThreshold("testSite1", "Processing", 10, 8)
        myResourceControl.insertThreshold("testSite1", "Merge", 5, 3)

        result = myResourceControl.thresholdBySite(siteName="testSite1")
        procInfo = {}
        mergInfo = {}
        for res in result:
            if res['task_type'] == 'Processing':
                procInfo = res
            elif res['task_type'] == 'Merge':
                mergInfo = res
        self.assertEqual(procInfo.get('pending_slots', None), 20)
        self.assertEqual(procInfo.get('running_slots', None), 40)
        self.assertEqual(procInfo.get('max_slots', None), 10)
        self.assertEqual(procInfo.get('task_pending_slots', None), 8)
        self.assertEqual(mergInfo.get('pending_slots', None), 20)
        self.assertEqual(mergInfo.get('running_slots', None), 40)
        self.assertEqual(mergInfo.get('max_slots', None), 5)
        self.assertEqual(mergInfo.get('task_pending_slots', None), 3)

        return
Example #22
0
 def setup(self, parameters):
     """
     Set db connection and prepare resource control
     """
     # Interface to WMBS/BossAir db
     myThread = threading.currentThread()
     # set resource control
     self.resourceControl = ResourceControl(config = self.config)
     
     # wmstats connection 
     self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
Example #23
0
    def setUp(self):
        """
        setup for test.
        """

        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.tearDown()
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False)
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, pnn='%s_PNN' % site, cmsName=site,
                                       ceName=site, plugin="SimpleCondorPlugin", pendingSlots=1000,
                                       runningSlots=2000)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=1000, pendingSlots=1000)

        site = 'T3_US_Xanadu'
        resourceControl.insertSite(siteName=site, pnn='%s_PNN' % site, cmsName=site,
                                   ceName=site, plugin="TestPlugin")
        resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                        maxSlots=10000, pendingSlots=10000)

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        componentName = 'test'
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = 'JobTracker'
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return
Example #24
0
    def testThresholdsForSite(self):
        """
        _testThresholdsForSite_

        Check that we can get the thresholds in intelligible form
        for each site
        """

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 20, "testSE1", "testCE1")
        myResourceControl.insertThreshold("testSite1", "Processing", 10)
        myResourceControl.insertThreshold("testSite1", "Merge", 5)

        result = myResourceControl.thresholdBySite(siteName="testSite1")
        procInfo = {}
        mergInfo = {}
        for res in result:
            if res["task_type"] == "Processing":
                procInfo = res
            elif res["task_type"] == "Merge":
                mergInfo = res
        self.assertEqual(procInfo.get("job_slots", None), 20)
        self.assertEqual(procInfo.get("max_slots", None), 10)
        self.assertEqual(mergInfo.get("job_slots", None), 20)
        self.assertEqual(mergInfo.get("max_slots", None), 5)

        return
Example #25
0
    def testThresholdsForSite(self):
        """
        _testThresholdsForSite_

        Check that we can get the thresholds in intelligible form
        for each site
        """

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        myResourceControl.insertThreshold("testSite1", "Processing", 10, 8)
        myResourceControl.insertThreshold("testSite1", "Merge", 5, 3)

        result = myResourceControl.thresholdBySite(siteName="testSite1")
        procInfo = {}
        mergInfo = {}
        for res in result:
            if res['task_type'] == 'Processing':
                procInfo = res
            elif res['task_type'] == 'Merge':
                mergInfo = res
        self.assertEqual(procInfo.get('pending_slots', None), 20)
        self.assertEqual(procInfo.get('running_slots', None), 40)
        self.assertEqual(procInfo.get('max_slots', None), 10)
        self.assertEqual(procInfo.get('task_pending_slots', None), 8)
        self.assertEqual(mergInfo.get('pending_slots', None), 20)
        self.assertEqual(mergInfo.get('running_slots', None), 40)
        self.assertEqual(mergInfo.get('max_slots', None), 5)
        self.assertEqual(mergInfo.get('task_pending_slots', None), 3)

        return
Example #26
0
    def __init__(self, config):
        """
        Initialize
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher,
                                     'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher,
                                       'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(
            self.config.AgentStatusWatcher.centralWMStatsURL)
Example #27
0
    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Utility to set resource thresholds
        """
        if not options:
            options = {'state': 'Normal',
                       'runningSlots': 10,
                       'pendingSlots': 5,
                       'tasks': ['Processing', 'Merge'],
                       'Processing': {'pendingSlots': 5,
                                      'runningSlots': 10},
                       'Merge': {'pendingSlots': 2,
                                 'runningSlots': 5}}

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName=site, pnn='se.%s' % (site),
                                   ceName=site, plugin="MockPlugin", pendingSlots=options['pendingSlots'],
                                   runningSlots=options['runningSlots'], cmsName=site)
        for task in options['tasks']:
            resourceControl.insertThreshold(siteName=site, taskType=task,
                                            maxSlots=options[task]['runningSlots'],
                                            pendingSlots=options[task]['pendingSlots'])
        if options.get('state'):
            resourceControl.changeSiteState(site, options.get('state'))

        return
Example #28
0
def mapSitetoSE(sites):
    """
    Receives a list of site names, query resource control and return
    a list of SE names.
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl

    if not len(sites):
        return []

    fakeSEs = []
    rControl = ResourceControl()
    for site in sites:
        fakeSEs.extend(rControl.listSiteInfo(site)['se_name'])
    return fakeSEs
Example #29
0
def mapSitetoPNN(sites):
    """
    Receives a list of site names, query resource control and return
    a list of PNNs.
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl

    if not len(sites):
        return []

    fakePNNs = []
    rControl = ResourceControl()
    for site in sites:
        fakePNNs.extend(rControl.listSiteInfo(site)['pnn'])
    return fakePNNs
Example #30
0
    def setUp(self):
        """
        Standard setup: Now with 100% more couch
        """

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False,
        )
        self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)

        locationAction = self.daoFactory(classname="Locations.New")
        locationSlots = self.daoFactory(classname="Locations.SetJobSlots")

        # We actually need the user name
        self.user = getpass.getuser()

        self.ceName = "127.0.0.1"

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(
                siteName=site,
                seName="se.%s" % (site),
                ceName=site,
                plugin="CondorPlugin",
                pendingSlots=10000,
                runningSlots=20000,
                cmsName=site,
            )
            resourceControl.insertThreshold(siteName=site, taskType="Processing", maxSlots=10000)

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        self.componentName = "JobSubmitter"
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        return
Example #31
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package = "WMCore.WMBS", \
                                     logger = logging,
                                     dbinterface = myThread.dbi)

        self.config = config

        #Libraries
        self.resourceControl = ResourceControl()

        self.changeState = ChangeState(self.config)
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)

        # BossAir
        self.bossAir = BossAirAPI(config=self.config)

        # Additions for caching-based JobSubmitter
        self.workflowTimestamps = {}
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.siteKeys = {}
        self.locationDict = {}
        self.cmsNames = {}
        self.drainSites = []
        self.sortedSites = []
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobSubmitter")

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception, ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            try:
                logging.debug("PackageDir: %s" % self.packageDir)
                logging.debug("Config: %s" % config)
            except:
                pass
            raise JobSubmitterPollerException(msg)
Example #32
0
    def setUp(self):
        """
        _setUp_

        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()

        self.testInit.setSchema(customModules = ["WMCore.WMBS"])

        self.splitterFactory = SplitterFactory(package = "WMCore.JobSplitting")

        myThread = threading.currentThread()
        self.myThread = myThread
        daoFactory = DAOFactory(package = "WMCore.WMBS",
                                logger = logging,
                                dbinterface = myThread.dbi)
        self.WMBSFactory = daoFactory

        config = self.getConfig()
        self.changer = ChangeState(config)

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("SomeSite", 10, 20, "SomeSE", "SomeCE")
        myResourceControl.insertSite("SomeSite2", 10, 20, "SomeSE2", "SomeCE2")

        self.fileset1 = Fileset(name = "TestFileset1")
        for file in range(11):
            newFile = File("/some/file/name%d" % file, size = 1000, events = 100)
            newFile.addRun(Run(1,*[1]))
            newFile.setLocation('SomeSE')
            self.fileset1.addFile(newFile)

        self.fileset1.create()

        workflow1 = Workflow(spec = "spec.xml", owner = "hufnagel", name = "TestWorkflow1", task="Test")
        workflow1.create()

        self.subscription1  = Subscription(fileset = self.fileset1,
                                           workflow = workflow1,
                                           split_algo = "Harvest",
                                           type = "Harvesting")

        self.subscription1.create()

        return
Example #33
0
    def setUp(self):
        """
        _setUp_

        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()

        self.testInit.setSchema(customModules=["WMCore.WMBS"])

        self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting")

        myThread = threading.currentThread()
        self.myThread = myThread
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=logging,
                                dbinterface=myThread.dbi)
        self.WMBSFactory = daoFactory

        config = self.getConfig()
        self.changer = ChangeState(config)

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("T1_US_FNAL", 10, 20, "T1_US_FNAL_Disk",
                                     "T1_US_FNAL")
        myResourceControl.insertSite("T1_US_FNAL", 10, 20, "T3_US_FNALLPC",
                                     "T1_US_FNAL")
        myResourceControl.insertSite("T2_CH_CERN", 10, 20, "T2_CH_CERN",
                                     "T2_CH_CERN")

        self.fileset1 = Fileset(name="TestFileset1")
        for fileNum in range(11):
            newFile = File("/some/file/name%d" % fileNum,
                           size=1000,
                           events=100)
            newFile.addRun(Run(1, *[1]))
            newFile.setLocation('T1_US_FNAL_Disk')
            self.fileset1.addFile(newFile)

        self.fileset1.create()

        workflow1 = Workflow(spec="spec.xml",
                             owner="hufnagel",
                             name="TestWorkflow1",
                             task="Test")
        workflow1.create()

        self.subscription1 = Subscription(fileset=self.fileset1,
                                          workflow=workflow1,
                                          split_algo="Harvest",
                                          type="Harvesting")

        self.subscription1.create()
        self.configFile = EmulatorSetup.setupWMAgentConfig()

        return
Example #34
0
    def testInsertAllSEs(self):
        """
        _testInsertAllSEs_

        Test to see if we can insert all SEs and Thresholds at once
        Depending on the WMCore.Services.SiteDB interface
        """
        self.createConfig()

        resControlPath = os.path.join(getTestBase(),
                                      "../../bin/wmagent-resource-control")
        env = os.environ
        env['PYTHONPATH'] = ":".join(sys.path)
        cmdline = [
            resControlPath, "--add-all-sites", "--plugin=CondorPlugin",
            "--pending-slots=100", "--running-slots=500", "--emulator"
        ]
        retval = subprocess.Popen(cmdline,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  env=env)
        (_, _) = retval.communicate()

        cmdline = [resControlPath, "--priority=20", "--task-type=Processing"]
        retval = subprocess.Popen(cmdline,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  env=env)
        (_, _) = retval.communicate()

        myResourceControl = ResourceControl()
        result = myResourceControl.listThresholdsForSubmit()
        self.assertTrue('T1_US_FNAL' in result.keys())
        for x in result.keys():
            self.assertEqual(len(result[x]['thresholds']), 8)
            self.assertEqual(result[x]['total_pending_slots'], 100)
            self.assertEqual(result[x]['total_running_slots'], 500)
            for thresh in result[x]['thresholds']:
                if thresh['task_type'] == 'Processing':
                    self.assertEqual(thresh['priority'], 20)
                    self.assertEqual(thresh['max_slots'], 500)

        # Verify that sites with more than one SE were added correctly.
        nebInfo = myResourceControl.listSiteInfo("T2_US_Nebraska")
        self.assertTrue(len(nebInfo["pnn"]) == 1)
        return
Example #35
0
    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Utility to set resource thresholds
        """
        if not options:
            options = {'state'        : 'Normal',
                       'runningSlots' : 10,
                       'pendingSlots' : 5,
                       'tasks' : ['Processing', 'Merge'],
                       'Processing' : {'pendingSlots' : 5,
                                       'runningSlots' : 10},
                       'Merge' : {'pendingSlots' : 2,
                                  'runningSlots' : 5}}

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site),
                                   ceName = site, plugin = "MockPlugin", pendingSlots = options['pendingSlots'],
                                   runningSlots = options['runningSlots'], cmsName = site)
        for task in options['tasks']:
            resourceControl.insertThreshold(siteName = site, taskType = task,
                                            maxSlots = options[task]['runningSlots'],
                                            pendingSlots = options[task]['pendingSlots'])
        if options.get('state'):
            resourceControl.changeSiteState(site, options.get('state'))

        return
Example #36
0
    def testListSiteInfo(self):
        """
        _testListSiteInfo_

        Verify that the listSiteInfo() methods works properly.
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1")
        myResourceControl.insertSite("testSite2", 100, 200, "testSE2",
                                     "testCE2")

        siteInfo = myResourceControl.listSiteInfo("testSite1")

        self.assertEqual(siteInfo["site_name"], "testSite1",
                         "Error: Site name is wrong.")

        self.assertEqual(siteInfo["se_name"], ["testSE1"],
                         "Error: SE name is wrong.")

        self.assertEqual(siteInfo["ce_name"], "testCE1",
                         "Error: CE name is wrong.")

        self.assertEqual(siteInfo["pending_slots"], 10,
                         "Error: Pending slots is wrong.")

        self.assertEqual(siteInfo["running_slots"], 20,
                         "Error: Pending slots is wrong.")

        return
Example #37
0
    def testChangeSiteState(self):
        """
        _testNewState_

        Check that we can change the state between different values and
        retrieve it through the threshold methods
        """
        self.tempDir = self.testInit.generateWorkDir()
        config = self.createConfig()
        myResourceControl = ResourceControl(config)
        myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        myResourceControl.insertThreshold("testSite1", "Processing", 10, 5)

        result = myResourceControl.listThresholdsForCreate()
        self.assertEqual(result['testSite1']['state'], 'Normal', 'Error: Wrong site state')

        myResourceControl.changeSiteState("testSite1", "Down")
        result = myResourceControl.listThresholdsForCreate()
        self.assertEqual(result['testSite1']['state'], 'Down', 'Error: Wrong site state')
Example #38
0
    def setUp(self):
        """
        setup for test.
        """
        super(JobTrackerTest, self).setUp()
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        # self.testInit.clearDatabase(modules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"])
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"],
                                useDefault=False)
        self.testInit.setupCouch("jobtracker_t/jobs", "JobDump")
        self.testInit.setupCouch("jobtracker_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")


        # Create sites in resourceControl
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='malpaquet', pnn='se.malpaquet',
                                   ceName='malpaquet', plugin="CondorPlugin")
        resourceControl.insertThreshold(siteName='malpaquet', taskType='Processing', \
                                        maxSlots=10000, pendingSlots=10000)

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute(siteName="malpaquet", pnn="malpaquet",
                               ceName="malpaquet", plugin="CondorPlugin")

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="jchurchill")

        # We actually need the user name
        self.user = getpass.getuser()

        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
Example #39
0
    def setUp(self):
        
        myThread = threading.currentThread()
        
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        #self.tearDown()
        self.testInit.setSchema(customModules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
                                useDefault = False)

        self.daoFactory = DAOFactory(package = "WMCore.BossAir",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName = 'Xanadu', seName = 'se.Xanadu',
                                   ceName = 'Xanadu', plugin = "TestPlugin")
        resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \
                                        maxSlots = 10000)
Example #40
0
def freeSlots(multiplier = 1.0, minusRunning = False):
    """
    Get free resources from wmbs.

    Specify multiplier to apply a ratio to the actual numbers.
    minusRunning control if running jobs should be counted
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl
    rc_sites = ResourceControl().listThresholdsForCreate()
    sites = defaultdict(lambda: 0)
    for name, site in rc_sites.items():
        if not site.get('cms_name'):
            logging.warning("Not fetching work for %s, cms_name not defined" % name)
            continue
        slots = site['total_slots']
        if minusRunning:
            slots -= site['running_jobs']
        if slots > 0:
            sites[site['cms_name']] += (slots * multiplier)
    return dict(sites)
Example #41
0
    def testChangeState(self):
        """
        _testChangeState_

        Check that we can change the state between different values and
        retrieve it through the threshold methods
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        myResourceControl.insertThreshold("testSite1", "Processing", 10, 5)

        result = myResourceControl.listThresholdsForCreate()
        self.assertEqual(result['testSite1']['state'], 'Normal', 'Error: Wrong site state')

        myResourceControl.changeSiteState("testSite1", "Down")
        result = myResourceControl.listThresholdsForCreate()
        self.assertEqual(result['testSite1']['state'], 'Down', 'Error: Wrong site state')
Example #42
0
    def setUp(self):

        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        #self.tearDown()
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl",
            "WMCore.Agent.Database"
        ],
                                useDefault=False)

        self.daoFactory = DAOFactory(package="WMCore.BossAir",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='Xanadu',
                                   pnn='se.Xanadu',
                                   ceName='Xanadu',
                                   plugin="TestPlugin")
        resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \
                                        maxSlots = 10000, pendingSlots = 10000)

        # Create user
        wmbsFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)
        newuser = wmbsFactory(classname="Users.New")
        newuser.execute(dn="mnorman",
                        group_name="phgroup",
                        role_name="cmsrole")
Example #43
0
    def setUp(self):
        """
        _setUp_

        Set everything up.
        """

        super(JobSubmitterCachingTest, self).setUp()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"
        ],
                                useDefault=False)
        self.testInit.setupCouch("jobsubmittercaching_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmittercaching_t/fwjrs", "FWJRDump")

        resourceControl = ResourceControl()
        for siteName in ["T1_US_FNAL", "T1_UK_RAL"]:
            resourceControl.insertSite(siteName=siteName,
                                       pnn="%s_Disk" % (siteName),
                                       ceName=siteName,
                                       plugin="SimpleCondorPlugin",
                                       cmsName=siteName)
            resourceControl.insertThreshold(siteName=siteName,
                                            taskType="Processing",
                                            maxSlots=10000,
                                            pendingSlots=10000)

        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        return
Example #44
0
    def testListSiteInfo(self):
        """
        _testListSiteInfo_

        Verify that the listSiteInfo() methods works properly.
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1")
        myResourceControl.insertSite("testSite2", 100, 200, "testSE2", "testCE2")

        siteInfo = myResourceControl.listSiteInfo("testSite1")

        self.assertEqual(siteInfo["site_name"], "testSite1",
                         "Error: Site name is wrong.")

        self.assertEqual(siteInfo["pnn"], ["testSE1"],
                         "Error: SE name is wrong.")

        self.assertEqual(siteInfo["ce_name"], "testCE1",
                         "Error: CE name is wrong.")

        self.assertEqual(siteInfo["pending_slots"], 10,
                         "Error: Pending slots is wrong.")

        self.assertEqual(siteInfo["running_slots"], 20,
                         "Error: Pending slots is wrong.")

        return
Example #45
0
    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.  Try to create all of the
        WMBS tables.  Also, create some dummy locations.
        """
        super(JobCreatorTest, self).setUp()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()

        self.testInit.setSchema(customModules=[
            'WMCore.WMBS', 'WMCore.ResourceControl', 'WMCore.Agent.Database'
        ],
                                useDefault=False)
        self.couchdbname = "jobcreator_t"
        self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")
        self.configFile = EmulatorSetup.setupWMAgentConfig()

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        locationAction = self.daoFactory(classname="Locations.New")
        for site in self.sites:
            locationAction.execute(siteName=site, pnn=site)

        # Create sites in resourceControl

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, pnn=site, ceName=site)
            resourceControl.insertThreshold(siteName=site,
                                            taskType='Processing',
                                            maxSlots=10000,
                                            pendingSlots=10000)

        self.resourceControl = resourceControl

        self._setup = True
        self._teardown = False

        self.testDir = self.testInit.generateWorkDir()
        self.cwd = os.getcwd()

        # Set heartbeat
        self.componentName = 'JobCreator'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        if PY3:
            self.assertItemsEqual = self.assertCountEqual

        return
Example #46
0
    def testChangeState(self):
        """
        _testChangeState_

        Check that we can change the state between different values and
        retrieve it through the threshold methods
        """
        myResourceControl = ResourceControl()
        myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1")
        myResourceControl.insertThreshold("testSite1",
                                          "Processing",
                                          10,
                                          5,
                                          priority=1)

        result = myResourceControl.listThresholdsForCreate()
        self.assertEqual(result['testSite1']['state'], 'Normal',
                         'Error: Wrong site state')

        myResourceControl.changeSiteState("testSite1", "Down")
        result = myResourceControl.listThresholdsForCreate()
        self.assertEqual(result['testSite1']['state'], 'Down',
                         'Error: Wrong site state')
Example #47
0
def freeSlots(multiplier=1.0,
              minusRunning=False,
              allowedStates=['Normal'],
              knownCmsSites=None):
    """
    Get free resources from wmbs.

    Specify multiplier to apply a ratio to the actual numbers.
    minusRunning control if running jobs should be counted
    """
    from WMCore.ResourceControl.ResourceControl import ResourceControl
    rc_sites = ResourceControl().listThresholdsForCreate()
    sites = defaultdict(lambda: 0)
    for name, site in rc_sites.items():
        if not site.get('cms_name'):
            logging.warning("Not fetching work for %s, cms_name not defined" %
                            name)
            continue
        if knownCmsSites and site['cms_name'] not in knownCmsSites:
            logging.warning(
                "%s doesn't appear to be a known cms site, work may fail to be acquired for it"
                % site['cms_name'])
        if site['state'] not in allowedStates:
            continue
        slots = site['total_slots']
        if minusRunning:
            slots -= site['pending_jobs']
        sites[site['cms_name']] += (slots * multiplier)

    # At the end delete entries < 1
    # This allows us to combine multiple sites under the same CMS_Name
    # Without going nuts
    for site in sites.keys():
        if sites[site] < 1:
            del sites[site]
    return dict(sites)
Example #48
0
    def __init__(self, config):
        """
        Initialize
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
Example #49
0
    def setUp(self):
        """
        _setUp_

        Set up vital components
        """

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules = ["WMCore.WMBS",'WMCore.MsgService',
                                                 'WMCore.ResourceControl', 'WMCore.ThreadPool',
                                                 'WMCore.Agent.Database'],
                                useDefault = False)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)



        locationAction = self.daoFactory(classname = "Locations.New")
        pendingSlots  = self.daoFactory(classname = "Locations.SetPendingSlots")


        for site in self.sites:
            locationAction.execute(siteName = site, seName = 'se.%s' % (site), ceName = site)
            pendingSlots.execute(siteName = site, pendingSlots = 1000)


        #Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site), ceName = site)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \
                                            maxSlots = 10000, pendingSlots = 10000)


        self.testDir = self.testInit.generateWorkDir()


        # Set heartbeat
        for component in self.components:
            heartbeatAPI = HeartbeatAPI(component)
            heartbeatAPI.registerComponent()




        return
Example #50
0
    def setUp(self):
        """
        setup for test.
        """
        super(JobTrackerTest, self).setUp()
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        # self.testInit.clearDatabase(modules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"])
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"
        ],
                                useDefault=False)
        self.testInit.setupCouch("jobtracker_t/jobs", "JobDump")
        self.testInit.setupCouch("jobtracker_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='malpaquet',
                                   pnn='se.malpaquet',
                                   ceName='malpaquet',
                                   plugin="CondorPlugin")
        resourceControl.insertThreshold(siteName='malpaquet', taskType='Processing', \
                                        maxSlots=10000, pendingSlots=10000)

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute(siteName="malpaquet",
                               pnn="malpaquet",
                               ceName="malpaquet",
                               plugin="CondorPlugin")

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="jchurchill")

        # We actually need the user name
        self.user = getpass.getuser()

        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setSchema(customModules=["WMCore.WMBS"])

        self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting")

        self.myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=self.myThread.dbi)

        myResourceControl = ResourceControl()
        myResourceControl.insertSite("T1_US_FNAL", 1000, 2000,
                                     "T1_US_FNAL_Disk", "T1_US_FNAL")
        myResourceControl.insertSite("T2_CH_CERN", 1000, 2000, "T2_CH_CERN",
                                     "T2_CH_CERN")

        self.performanceParams = {
            'timePerEvent': 12,
            'memoryRequirement': 2300,
            'sizePerEvent': 400
        }
        # dummy workflow
        self.testWorkflow = Workflow(spec="spec.xml",
                                     owner="dmwm",
                                     name="testWorkflow",
                                     task="Test")
        self.testWorkflow.create()

        if PY3:
            self.assertItemsEqual = self.assertCountEqual

        return
Example #52
0
    def setUp(self):
        """
        setup for test.
        """

        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.tearDown()
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl",
            "WMCore.Agent.Database"
        ],
                                useDefault=False)
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        #Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site,
                                       pnn='se.%s' % (site),
                                       cmsName=site,
                                       ceName=site,
                                       plugin="CondorPlugin",
                                       pendingSlots=1000,
                                       runningSlots=2000)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \
                                            maxSlots = 1000, pendingSlots = 1000)
        resourceControl.insertSite(siteName='Xanadu',
                                   pnn='se.Xanadu',
                                   cmsName=site,
                                   ceName='Xanadu',
                                   plugin="TestPlugin")
        resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \
                                        maxSlots = 10000, pendingSlots = 10000)

        resourceControl.insertSite(siteName='jade-cms.hip.fi',
                                   pnn='madhatter.csc.fi',
                                   cmsName=site,
                                   ceName='jade-cms.hip.fi',
                                   plugin="ARCPlugin")
        resourceControl.insertThreshold(siteName = 'jade-cms.hip.fi', taskType = 'Processing', \
                                        maxSlots = 100, pendingSlots = 100)
        # using this for glite submissions
        resourceControl.insertSite(siteName='grid-ce-01.ba.infn.it',
                                   pnn='storm-se-01.ba.infn.it',
                                   cmsName=site,
                                   ceName='grid-ce-01.ba.infn.it',
                                   plugin='gLitePlugin')
        resourceControl.insertThreshold(siteName = 'grid-ce-01.ba.infn.it', taskType = 'Processing', \
                                        maxSlots = 50, pendingSlots = 50)

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        componentName = 'test'
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = 'JobTracker'
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return
Example #53
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(
            )
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return
Example #54
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(
            )
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs",
                     len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
                )
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info(
                "Skipping cache update to be submitted. (%s job in cache)",
                len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount,
                             len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s",
                              pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if nonAbortSites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if nonDrainingSites:  # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'],
                                               0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {
                'id':
                jobID,
                'requestName':
                newJob['request_name'],
                'taskName':
                newJob['task_name'],
                'taskType':
                newJob['type'],
                'cache_dir':
                newJob["cache_dir"],
                'priority':
                newJob['wf_priority'],
                'taskID':
                newJob['task_id'],
                'retry_count':
                newJob["retry_count"],
                'taskPriority':
                None,  # update from the thresholds
                'custom': {
                    'location': None
                },  # update later
                'packageDir':
                batchDir,
                'sandbox':
                loadedJob["sandbox"],  # remove before submit
                'userdn':
                loadedJob.get("ownerDN", None),
                'usergroup':
                loadedJob.get("ownerGroup", ''),
                'userrole':
                loadedJob.get("ownerRole", ''),
                'possibleSites':
                frozenset(
                    possibleLocations),  # abort and drain sites filtered out
                'potentialSites':
                frozenset(potentialLocations),  # original list of sites
                'scramArch':
                loadedJob.get("scramArch", None),
                'swVersion':
                loadedJob.get("swVersion", None),
                'name':
                loadedJob["name"],
                'proxyPath':
                loadedJob.get("proxyPath", None),
                'estimatedJobTime':
                loadedJob.get("estimatedJobTime", None),
                'estimatedDiskUsage':
                loadedJob.get("estimatedDiskUsage", None),
                'estimatedMemoryUsage':
                loadedJob.get("estimatedMemoryUsage", None),
                'numberOfCores':
                loadedJob.get("numberOfCores", 1),  # may update it later
                'inputDataset':
                loadedJob.get('inputDataset', None),
                'inputDatasetLocations':
                loadedJob.get('inputDatasetLocations', None),
                'allowOpportunistic':
                loadedJob.get('allowOpportunistic', False)
            }

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code : %d",
                    badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
        )
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WM_JOB_ERROR_CODES[exitCode] +
                    ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError(
                        "JobSubmit", exitCode, "SubmitFailed",
                        WM_JOB_ERROR_CODES[exitCode] + ": file locations: " +
                        ', '.join(job['fileLocations']) +
                        ": site white list: " +
                        ', '.join(job['siteWhitelist']) +
                        ": site black list: " +
                        ', '.join(job['siteBlacklist']))

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items(
                ):
                    self.taskTypePrioMap[task] = value.get(
                        'priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        returns the string describing whether a job is ready to be submitted or the reason can't be submitted
        Only jobs with "JobSubmitReady" return value will be added to submit job.
        Other return values will indicate the reason jobs cannot be submitted.
        i.e. "NoPendingSlot"  - pending slot is full with pending job
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName][
                "total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName][
                "total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName][
                "total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName][
                "total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName][
                'thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault(
                "init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][
                jobType].setdefault("init_task_pending_jobs", taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName][
                "init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName,
                                                                 jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (
                jobPrio <= highestPriorityInJobs) or (jobType
                                                      in self.ioboundTypes):
            # there is no pending or running jobs in the system (None case) or
            # priority of the job is lower or equal don't allow overflow
            # Also if jobType is in ioboundTypes don't allow overflow
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskTheshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of currently pending or running jobs.
            # Then increase the threshold by condorOverflowFraction * original pending slot.
            totalPendingThreshold = max(
                totalPendingSlots, initTotalPending) + (
                    totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskTheshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{
            "Condition": "NoPendingSlot",
            "Current": totalPendingJobs,
            "Threshold": totalPendingThreshold
        }, {
            "Condition": "NoTaskPendingSlot",
            "Current": taskPendingJobs,
            "Threshold": taskPendingThreshold
        }, {
            "Condition": "NoRunningSlot",
            "Current": totalPendingJobs + totalRunningJobs,
            "Threshold": totalJobThreshold
        }, {
            "Condition": "NoTaskRunningSlot",
            "Current": taskPendingJobs + taskRunningJobs,
            "Threshold": totalTaskTheshold
        }]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(),
                              key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn(
                            "Have a job for %s which is not in the resource control",
                            siteName)
                        continue

                    condition = self._getJobSubmitCondition(
                        jobPrio, siteName, jobType)

                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName,
                                      condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName][
                        "total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType][
                        "task_pending_jobs"] += 1

                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[
                        siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info(
                        "Submitter reached limit of submit slots for this cycle: %i",
                        self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s",
                     dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job[
                    'site_cms_name'] = self.getSiteInfo(
                        job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.",
                     len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get(
                "CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg,
                                      "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork",
                                        "warning",
                                        this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(
            dbi=myThread.dbi,
            logger=logging,
            condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Example #55
0
    def testE_SiteModesTest(self):
        """
        _testE_SiteModesTest_

        Test the behavior of the submitter in response to the different
        states of the sites
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        nSubs = 1
        nJobs = 20

        sites = [
            'T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL'
        ]
        for site in sites:
            self.setResourceThresholds(site,
                                       pendingSlots=10,
                                       runningSlots=-1,
                                       tasks=['Processing', 'Merge'],
                                       Processing={
                                           'pendingSlots': 10,
                                           'runningSlots': -1
                                       },
                                       Merge={
                                           'pendingSlots': 10,
                                           'runningSlots': -1,
                                           'priority': 5
                                       })

        myResourceControl = ResourceControl(config)
        myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
        # First test that we prefer Normal over drain, and T1 over T2/T3
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        # Actually run it
        jobSubmitter.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection
        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        locationDict = getLocationAction.execute([{
            'jobid': x
        } for x in result])
        for entry in locationDict:
            loc = entry['site_name']
            self.assertNotEqual(loc, 'T2_US_Florida')

        # Now set everything to down, check we don't submit anything
        for site in sites:
            myResourceControl.changeSiteState(site, 'Down')
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter.algorithm()
        # Nothing is submitted despite the empty slots at Uniandes and Florida
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Now set everything to Drain and create Merge jobs. Those should be submitted
        for site in sites:
            myResourceControl.changeSiteState(site, 'Draining')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge,
                                            nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)

        # Now set everything to Aborted, and create Merge jobs. Those should fail
        # since the can only run at one place
        for site in sites:
            myResourceControl.changeSiteState(site, 'Aborted')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge,
                                            nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='SubmitFailed', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)
        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #56
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Initialize
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher,
                                     'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher,
                                       'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(
            self.config.AgentStatusWatcher.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names
        """
        if not self.enabled:
            logging.info(
                "This component is not enabled in the configuration. Doing nothing."
            )
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            # first, update site status
            ssbSiteStatus = self.getSiteStatus()
            self.checkStatusChanges(sitesRC, ssbSiteStatus)

            # now fetch site slots thresholds
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                logging.error(
                    "One or more of the SSB metrics is down. Please contact the Dashboard team."
                )
                return

            logging.debug("Info from SSB: %s", sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info(
            "Resource control cycle finished updating site state and thresholds."
        )

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view for agents and teams
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(
                filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning(
                "agentInfo couch view is not available, use default value %s",
                self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName,
                                                    self.agentsNumByTeam)
            logging.debug(
                "Agents connected to the same team (not in DrainMode): %d",
                self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)

        ssbSiteSlots = self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down',
                                 site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then updates the task level too.
        """
        logging.debug(
            "Settings for site and task pending slots: %s%% and %s%%",
            self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] *= self.t1SitesCores / 100
                infoSSB[site]['slotsIO'] *= self.t1SitesCores / 100
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'],
                                                self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'],
                                               self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(
                int(CPUBound / self.agentsNumByTeam *
                    self.pendingSlotsSitePercent / 100), self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site][
                    'pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info(
                    "Updating %s site thresholds for pend/runn: %d/%d", site,
                    sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(
                    site,
                    pendingJobSlots=sitePending,
                    runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warn(
                    'Site %s has invalid CPU thresholds in SSB. Taking no action',
                    entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']] = {
                    'slotsCPU': int(entry['Value'])
                }

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['Value'] is None:
                logging.warn(
                    'Site %s has invalid IO thresholds in SSB. Taking no action',
                    entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']]['slotsIO'] = int(entry['Value'])

        # Before proceeding, remove sites without both metrics
        for site in ssbSiteSlots.keys():
            if len(ssbSiteSlots[site]) != 2:
                logging.warn("Site: %s has incomplete SSB metrics, see %s",
                             site, ssbSiteSlots[site])
                ssbSiteSlots.pop(site)

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteState:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error(
                        "Unknown status '%s' for site %s, please check SSB",
                        status, voname)
                else:
                    ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.warning(
                    'I have a duplicated status entry in SSB for %s', voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state
        """
        ssb2agent = {
            'enabled': 'Normal',
            'drain': 'Draining',
            'disabled': 'Down',
            'test': 'Draining'
        }
        # 'test' state behaviour varies between production and tier0 agents
        ssb2agent['test'] = 'Normal' if self.tier0Mode else "Draining"

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound slots for a given site.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(
            int(CPUBound / self.agentsNumByTeam *
                self.pendingSlotsTaskPercent / 100), self.minCPUSlots)
        taskIOPending = max(
            int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent /
                100), self.minIOSlots)

        updateTasks = False
        if siteTaskSlots[0]['task_type'] in self.tasksCPU and siteTaskSlots[0][
                'task_pending_slots'] != taskCPUPending:
            updateTasks = True
        elif siteTaskSlots[0]['task_type'] in self.tasksIO and siteTaskSlots[
                0]['task_pending_slots'] != taskIOPending:
            updateTasks = True

        if updateTasks:
            logging.info(
                "Updating %s CPU tasks thresholds for pend/runn: %d/%d",
                siteName, taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksCPU,
                                                 maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info(
                "Updating %s IO tasks thresholds for pend/runn: %d/%d",
                siteName, taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName,
                                                 taskType=self.tasksIO,
                                                 maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.",
                          siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent /
                                 100)
            self.resourceControl.insertThreshold(siteName, 'Express',
                                                 expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent /
                                100)
            self.resourceControl.insertThreshold(siteName, 'Repack',
                                                 repackSlots, pendingRepack)
Example #57
0
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """
    def __init__(self, config):
        """
        Initialize 
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        self.setVariables(self.config)
        
    def setVariables(self, config):
        """
        load all the variables from the config file
        """
        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        
        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent
        
        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])
        
        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)
                
        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB
        
        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)
       
        
    def setup(self, parameters):
        """
        Set db connection and prepare resource control
        """
        # Interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set resource control
        self.resourceControl = ResourceControl(config = self.config)
        
        # wmstats connection 
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
        
    def algorithm(self, parameters):
        """
        _algorithm_
        
        Update site state and thresholds, based on differences between resource
        control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names
        """
        # set variables every polling cycle
        self.setVariables(self.config)
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s" % sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                return
            logging.debug("Info from SSB: %s" % sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")


    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_
        
        Get the WMStats view about agents and teams
        """
        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam()
        except Exception as ex:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.debug("agentInfo couch view is not available, use default value %s" % self.agentsNumByTeam)
        else:
            self.agentsByTeam = agentsByTeam
            agentsCount = []
            for team in self.teamNames.split(','):
                if team not in self.agentsByTeam:
                    agentsCount.append(1)
                else:
                    agentsCount.append(self.agentsByTeam[team])
            # If agent is in several teams, we choose the team with less agents
            self.agentsNumByTeam = min(agentsCount, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d" % self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_
        
        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        # urls from site status board
        url_site_state = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(self.siteStatusMetric)
        url_cpu_bound = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(self.cpuBoundMetric)
        url_io_bound = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(self.ioBoundMetric)

        # get info from dashboard
        sites = urllib2.urlopen(url_site_state).read()
        cpu_bound = urllib2.urlopen(url_cpu_bound).read()
        io_bound = urllib2.urlopen(url_io_bound).read()

        # parse from json format to dictionary, get only 'csvdata'
        site_state = json.loads(sites)['csvdata']
        cpu_slots = json.loads(cpu_bound)['csvdata']
        io_slots = json.loads(io_bound)['csvdata']

        # dictionaries with status/thresholds info by VOName
        stateBySite = self.siteStatusByVOName(site_state)
        slotsCPU = self.thresholdsByVOName(cpu_slots)
        slotsIO = self.thresholdsByVOName(io_slots)

        sitesSSB = {}
        if not stateBySite or not slotsCPU or not slotsIO:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
            return sitesSSB

        for k,v in stateBySite.iteritems():
            sitesSSB[k] = {'state': v}
            sitesSSB[k]['slotsCPU'] = slotsCPU[k] if k in slotsCPU else None
            sitesSSB[k]['slotsIO'] = slotsIO[k] if k in slotsIO else None
        return sitesSSB

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down" % site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down' % site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s' % (site, infoRC[site]['state'],
                                                                  infoSSB[site]['state']))
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug("Settings for site and task pending slots: %s%% and %s%%" % 
                      (self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)) 

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0
                infoSSB[site]['slotsCPU'] = infoSSB[site]['slotsCPU'] * self.t1SitesCores/100
                infoSSB[site]['slotsIO'] = infoSSB[site]['slotsIO'] * self.t1SitesCores/100

            # round very small sites to the bare minimum
            if infoSSB[site]['slotsCPU'] < minCPUSlots:
                infoSSB[site]['slotsCPU'] = minCPUSlots
            if infoSSB[site]['slotsIO'] < minIOSlots:
                infoSSB[site]['slotsIO'] = minIOSlots

            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']
            sitePending = max(int(CPUBound/agentsCount * self.pendingSlotsSitePercent/100), minCPUSlots)
            taskCPUPending = max(int(CPUBound/agentsCount * self.pendingSlotsTaskPercent/100), minCPUSlots)
            taskIOPending = max(int(IOBound/agentsCount * self.pendingSlotsTaskPercent/100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.debug("Updating %s site thresholds for pend/runn: %d/%d" % (site, sitePending, CPUBound))
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots = sitePending,
                                                        runningJobSlots = CPUBound)
                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d" % (site, taskCPUPending,
                                                                                        CPUBound))
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType = task, maxSlots = CPUBound,
                                                         pendingSlots = taskCPUPending)
                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d" % (site, taskIOPending,
                                                                                       IOBound))
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType = task, maxSlots = IOBound,
                                                         pendingSlots = taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug("Updating %s Express and Repack task thresholds." % site)
                expressSlots = int(CPUBound * self.runningExpressPercent/100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent/100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent/100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent/100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)


    def thresholdsByVOName(self, sites):
        """
        _thresholdsByVOName_
        
        Creates a dictionary with keys->VOName and values->threshold: 
        """
        thresholdbyVOName = {}
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname not in thresholdbyVOName:
                if value is None: 
                    logging.warn('Site %s does not have thresholds in SSB, assuming 0' % voname) 
                    thresholdbyVOName[voname] = 0
                else:
                    thresholdbyVOName[voname] = int(value)
            else:
                logging.error('I have a duplicated threshold entry in SSB for %s' % voname) 
        return thresholdbyVOName
    
    def siteStatusByVOName(self, sites):
        """
        _siteStatusByVOName_
        
        Creates a dictionary with keys->VOName and values->status:
        """
        statusBySite = {}
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if not status: 
                logging.error('Site %s does not have status in SSB' % voname)
                continue
            if voname not in statusBySite:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unkwown status '%s' for site %s, please check SSB" % (status, voname))
                    continue
                statusBySite[voname] = statusAgent
            else:
                logging.error('I have a duplicated status entry in SSB for %s' % voname) 
        return statusBySite

    def getState(self, stateSSB):
        """
        _getState_
        
        Translates SSB states into resource control state
        """
        ssb2agent = {'on':    'Normal',
                     'drain': 'Draining',
                     'down': 'Down',
                     'skip': 'Down'}

        if stateSSB in ssb2agent:
            return ssb2agent[stateSSB]
        elif stateSSB == "tier0":
            logging.debug('There is a site in tier0 status (Tier0Mode is %s)' % self.tier0Mode )
            if self.tier0Mode: 
                return "Normal"
            else:
                return "Draining"
        else:
            return None

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_
    
        Update only the site state in the resource control database.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:" % (siteName, state))
            logging.error(str(ex))
            logging.error("Traceback: \n%s" % traceback.format_exc())
        return
Example #58
0
    def setupForKillTest(self, baAPI=None):
        """
        _setupForKillTest_

        Inject a workflow into WMBS that has a processing task, a merge task and
        a cleanup task.  Inject files into the various tasks at various
        processing states (acquired, complete, available...).  Also create jobs
        for each subscription in various states.
        """
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)

        locationAction = daoFactory(classname="Locations.New")
        changeStateAction = daoFactory(classname="Jobs.ChangeState")
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   seName='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertThreshold(siteName = 'site1', taskType = 'Processing', \
                                        maxSlots = 10000, pendingSlots = 10000)

        userDN = 'someDN'
        userAction = daoFactory(classname="Users.New")
        userAction.execute(dn=userDN,
                           group_name='DEFAULT',
                           role_name='DEFAULT')

        inputFileset = Fileset("input")
        inputFileset.create()

        inputFileA = File("lfnA", locations="goodse.cern.ch")
        inputFileB = File("lfnB", locations="goodse.cern.ch")
        inputFileC = File("lfnC", locations="goodse.cern.ch")
        inputFileA.create()
        inputFileB.create()
        inputFileC.create()

        inputFileset.addFile(inputFileA)
        inputFileset.addFile(inputFileB)
        inputFileset.addFile(inputFileC)
        inputFileset.commit()

        unmergedOutputFileset = Fileset("unmerged")
        unmergedOutputFileset.create()

        unmergedFileA = File("ulfnA", locations="goodse.cern.ch")
        unmergedFileB = File("ulfnB", locations="goodse.cern.ch")
        unmergedFileC = File("ulfnC", locations="goodse.cern.ch")
        unmergedFileA.create()
        unmergedFileB.create()
        unmergedFileC.create()

        unmergedOutputFileset.addFile(unmergedFileA)
        unmergedOutputFileset.addFile(unmergedFileB)
        unmergedOutputFileset.addFile(unmergedFileC)
        unmergedOutputFileset.commit()

        mainProcWorkflow = Workflow(spec="spec1",
                                    owner="Steve",
                                    name="Main",
                                    task="Proc")
        mainProcWorkflow.create()
        mainProcMergeWorkflow = Workflow(spec="spec1",
                                         owner="Steve",
                                         name="Main",
                                         task="ProcMerge")
        mainProcMergeWorkflow.create()
        mainCleanupWorkflow = Workflow(spec="spec1",
                                       owner="Steve",
                                       name="Main",
                                       task="Cleanup")
        mainCleanupWorkflow.create()

        self.mainProcSub = Subscription(fileset=inputFileset,
                                        workflow=mainProcWorkflow,
                                        type="Processing")
        self.mainProcSub.create()
        self.mainProcSub.acquireFiles(inputFileA)
        self.mainProcSub.completeFiles(inputFileB)

        procJobGroup = JobGroup(subscription=self.mainProcSub)
        procJobGroup.create()
        self.procJobA = Job(name="ProcJobA")
        self.procJobA["state"] = "new"
        self.procJobA["location"] = "site1"
        self.procJobB = Job(name="ProcJobB")
        self.procJobB["state"] = "executing"
        self.procJobB["location"] = "site1"
        self.procJobC = Job(name="ProcJobC")
        self.procJobC["state"] = "complete"
        self.procJobC["location"] = "site1"
        self.procJobA.create(procJobGroup)
        self.procJobB.create(procJobGroup)
        self.procJobC.create(procJobGroup)

        self.mainMergeSub = Subscription(fileset=unmergedOutputFileset,
                                         workflow=mainProcMergeWorkflow,
                                         type="Merge")
        self.mainMergeSub.create()
        self.mainMergeSub.acquireFiles(unmergedFileA)
        self.mainMergeSub.failFiles(unmergedFileB)

        mergeJobGroup = JobGroup(subscription=self.mainMergeSub)
        mergeJobGroup.create()
        self.mergeJobA = Job(name="MergeJobA")
        self.mergeJobA["state"] = "exhausted"
        self.mergeJobA["location"] = "site1"
        self.mergeJobB = Job(name="MergeJobB")
        self.mergeJobB["state"] = "cleanout"
        self.mergeJobB["location"] = "site1"
        self.mergeJobC = Job(name="MergeJobC")
        self.mergeJobC["state"] = "new"
        self.mergeJobC["location"] = "site1"
        self.mergeJobA.create(mergeJobGroup)
        self.mergeJobB.create(mergeJobGroup)
        self.mergeJobC.create(mergeJobGroup)

        self.mainCleanupSub = Subscription(fileset=unmergedOutputFileset,
                                           workflow=mainCleanupWorkflow,
                                           type="Cleanup")
        self.mainCleanupSub.create()
        self.mainCleanupSub.acquireFiles(unmergedFileA)
        self.mainCleanupSub.completeFiles(unmergedFileB)

        cleanupJobGroup = JobGroup(subscription=self.mainCleanupSub)
        cleanupJobGroup.create()
        self.cleanupJobA = Job(name="CleanupJobA")
        self.cleanupJobA["state"] = "new"
        self.cleanupJobA["location"] = "site1"
        self.cleanupJobB = Job(name="CleanupJobB")
        self.cleanupJobB["state"] = "executing"
        self.cleanupJobB["location"] = "site1"
        self.cleanupJobC = Job(name="CleanupJobC")
        self.cleanupJobC["state"] = "complete"
        self.cleanupJobC["location"] = "site1"
        self.cleanupJobA.create(cleanupJobGroup)
        self.cleanupJobB.create(cleanupJobGroup)
        self.cleanupJobC.create(cleanupJobGroup)

        jobList = [
            self.procJobA, self.procJobB, self.procJobC, self.mergeJobA,
            self.mergeJobB, self.mergeJobC, self.cleanupJobA, self.cleanupJobB,
            self.cleanupJobC
        ]

        changeStateAction.execute(jobList)

        if baAPI:
            for job in jobList:
                job['plugin'] = 'TestPlugin'
                job['userdn'] = userDN
                job['usergroup'] = 'DEFAULT'
                job['userrole'] = 'DEFAULT'
                job['custom']['location'] = 'site1'
            baAPI.createNewJobs(wmbsJobs=jobList)

        # We'll create an unrelated workflow to verify that it isn't affected
        # by the killing code.
        bogusFileset = Fileset("dontkillme")
        bogusFileset.create()

        bogusFileA = File("bogus/lfnA", locations="goodse.cern.ch")
        bogusFileA.create()
        bogusFileset.addFile(bogusFileA)
        bogusFileset.commit()

        bogusWorkflow = Workflow(spec="spec2",
                                 owner="Steve",
                                 name="Bogus",
                                 task="Proc")
        bogusWorkflow.create()
        self.bogusSub = Subscription(fileset=bogusFileset,
                                     workflow=bogusWorkflow,
                                     type="Processing")
        self.bogusSub.create()
        self.bogusSub.acquireFiles(bogusFileA)
        return