def testUpdateJobSlots(self): """ _testUpdateJobSlots_ Verify that it is possible to update the number of job slots at a site. """ myResourceControl = ResourceControl() myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1") siteInfo = myResourceControl.listSiteInfo("testSite1") self.assertEqual(siteInfo["pending_slots"], 10, "Error: Pending slots is wrong.") self.assertEqual(siteInfo["running_slots"], 20, "Error: Running slots is wrong.") myResourceControl.setJobSlotsForSite("testSite1", pendingJobSlots = 20) siteInfo = myResourceControl.listSiteInfo("testSite1") self.assertEqual(siteInfo["pending_slots"], 20, "Error: Pending slots is wrong.") myResourceControl.setJobSlotsForSite("testSite1", runningJobSlots = 40) siteInfo = myResourceControl.listSiteInfo("testSite1") self.assertEqual(siteInfo["running_slots"], 40, "Error: Running slots is wrong.") myResourceControl.setJobSlotsForSite("testSite1", 5, 10) siteInfo = myResourceControl.listSiteInfo("testSite1") self.assertEqual(siteInfo["pending_slots"], 5, "Error: Pending slots is wrong.") self.assertEqual(siteInfo["running_slots"], 10, "Error: Running slots is wrong.") return
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() #DAO factory for WMBS objects self.daoFactory = DAOFactory(package = "WMCore.WMBS", \ logger = logging, dbinterface = myThread.dbi) self.config = config #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) # BossAir self.bossAir = BossAirAPI(config=self.config) # Additions for caching-based JobSubmitter self.workflowTimestamps = {} self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.siteKeys = {} self.locationDict = {} self.cmsNames = {} self.drainSites = [] self.sortedSites = [] self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) # initialize the alert framework (if available) self.initAlerts(compName="JobSubmitter") try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except Exception, ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) self.sendAlert(6, msg=msg) try: logging.debug("PackageDir: %s" % self.packageDir) logging.debug("Config: %s" % config) except: pass raise JobSubmitterPollerException(msg)
def setResourceThresholds(self, site, **options): """ _setResourceThresholds_ Utility to set resource thresholds """ if not options: options = {'state': 'Normal', 'runningSlots': 10, 'pendingSlots': 5, 'tasks': ['Processing', 'Merge'], 'Processing': {'pendingSlots': 5, 'runningSlots': 10}, 'Merge': {'pendingSlots': 2, 'runningSlots': 5}} resourceControl = ResourceControl() resourceControl.insertSite(siteName=site, pnn='se.%s' % (site), ceName=site, plugin="MockPlugin", pendingSlots=options['pendingSlots'], runningSlots=options['runningSlots'], cmsName=site) for task in options['tasks']: resourceControl.insertThreshold(siteName=site, taskType=task, maxSlots=options[task]['runningSlots'], pendingSlots=options[task]['pendingSlots']) if options.get('state'): resourceControl.changeSiteState(site, options.get('state')) return
def freeSlots(multiplier=1.0, minusRunning=False, allowedStates=['Normal'], knownCmsSites=None): """ Get free resources from wmbs. Specify multiplier to apply a ratio to the actual numbers. minusRunning control if running jobs should be counted """ from WMCore.ResourceControl.ResourceControl import ResourceControl rc_sites = ResourceControl().listThresholdsForCreate() thresholds = defaultdict(lambda: 0) jobCounts = defaultdict(dict) for name, site in rc_sites.items(): if not site.get('cms_name'): logging.warning("Not fetching work for %s, cms_name not defined" % name) continue if knownCmsSites and site['cms_name'] not in knownCmsSites: logging.warning( "%s doesn't appear to be a known cms site, work may fail to be acquired for it" % site['cms_name']) if site['state'] not in allowedStates: continue slots = site['total_slots'] thresholds[site['cms_name']] += (slots * multiplier) if minusRunning: jobCounts[site['cms_name']] = dict( (k, jobCounts[site['cms_name']].get(k, 0) + site['pending_jobs'].get(k, 0)) for k in site['pending_jobs']) return dict(thresholds), dict(jobCounts)
def testInsertAllSEs2(self): """ _testInsertAllSEs2_ Test to see if we can insert all SEs and Thresholds at once Depending on the WMCore.Services.SiteDB interface """ myResourceControl = ResourceControl() taskList = [{'taskType': 'Processing', 'maxSlots': 100, 'pendingSlots' : 80}, {'taskType': 'Merge', 'maxSlots': 50, 'pendingSlots' : 30}] myResourceControl.insertAllSEs(siteName = 'test', pendingSlots = 200, runningSlots = 400, ceName = 'glidein-ce.fnal.gov', plugin = 'CondorPlugin', taskList = taskList) result = myResourceControl.listThresholdsForSubmit() self.assertTrue('test_cmssrm.fnal.gov' in result.keys()) self.assertEqual(result['test_cmssrm.fnal.gov']['cms_name'], 'T1_US_FNAL') for x in result.keys(): self.assertEqual(len(result[x]['thresholds']), 2) self.assertEqual(result[x]['total_pending_slots'], 200) self.assertEqual(result[x]['total_running_slots'], 400) for thresh in result[x]['thresholds']: if thresh['task_type'] == 'Processing': self.assertEqual(thresh['priority'], 0) self.assertEqual(thresh['max_slots'], 100) self.assertEqual(thresh['pending_slots'], 80) else: self.assertEqual(thresh['priority'], 5) self.assertEqual(thresh['max_slots'], 50) self.assertEqual(thresh['pending_slots'], 30) return
def testInsertT0(self): """ _testInsertT0_ Test to see if we can insert the Tier-0 alone with a single option """ self.createConfig() resControlPath = os.path.join(getTestBase(), "../../bin/wmagent-resource-control") env = os.environ env['PYTHONPATH'] = ":".join(sys.path) cmdline = [resControlPath, "--add-T0" ] retval = subprocess.Popen( cmdline, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, env = env) (_, _) = retval.communicate() myResourceControl = ResourceControl() result = myResourceControl.listThresholdsForSubmit() self.assertEquals(len(result), 1) self.assertTrue('CERN' in result) for x in result: self.assertEqual(len(result[x]['thresholds']), 10) self.assertEqual(result[x]['total_pending_slots'], 500) self.assertEqual(result[x]['total_running_slots'], -1) for thresh in result[x]['thresholds']: if thresh['task_type'] == 'Processing': self.assertEqual(thresh['priority'], 0) self.assertEqual(thresh['max_slots'], -1) # Verify that sites with more than one SE were added correctly. cernInfo = myResourceControl.listSiteInfo("CERN") self.assertTrue(len(cernInfo["se_name"]) == 2) return
def testThresholdsForSite(self): """ _testThresholdsForSite_ Check that we can get the thresholds in intelligible form for each site """ myResourceControl = ResourceControl() myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1") myResourceControl.insertThreshold("testSite1", "Processing", 10, 8) myResourceControl.insertThreshold("testSite1", "Merge", 5, 3) result = myResourceControl.thresholdBySite(siteName = "testSite1") procInfo = {} mergInfo = {} for res in result: if res['task_type'] == 'Processing': procInfo = res elif res['task_type'] == 'Merge': mergInfo = res self.assertEqual(procInfo.get('pending_slots', None), 20) self.assertEqual(procInfo.get('running_slots', None), 40) self.assertEqual(procInfo.get('max_slots', None), 10) self.assertEqual(procInfo.get('task_pending_slots', None), 8) self.assertEqual(mergInfo.get('pending_slots', None), 20) self.assertEqual(mergInfo.get('running_slots', None), 40) self.assertEqual(mergInfo.get('max_slots', None), 5) self.assertEqual(mergInfo.get('task_pending_slots', None), 3) return
def testAbortedState(self): """ _testAbortedState_ Check that we can kill jobs when a site is set to aborted ### We no longer need this test as we are not killing jobs that are running """ self.tempDir = self.testInit.generateWorkDir() config = self.createConfig() myResourceControl = ResourceControl(config) myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1", "T1_US_FNAL", "MockPlugin") myResourceControl.insertSite("testSite2", 20, 40, "testSE2", "testCE2", "T1_IT_CNAF", "MockPlugin") myResourceControl.insertThreshold("testSite1", "Processing", 20, 10) myResourceControl.insertThreshold("testSite1", "Merge", 200, 100) myResourceControl.insertThreshold("testSite2", "Processing", 50, 25) myResourceControl.insertThreshold("testSite2", "Merge", 135, 65) self.createJobs() myResourceControl.changeSiteState("testSite1", "Aborted") ## Now check the tempDir for a FWJR for the killed job reportPath = os.path.join(self.tempDir, "Report.0.pkl") report = Report() report.load(reportPath) self.assertEqual(report.getExitCode(), 61301) return
def testListSiteInfo(self): """ _testListSiteInfo_ Verify that the listSiteInfo() methods works properly. """ myResourceControl = ResourceControl() myResourceControl.insertSite("testSite1", 10, 20, "testSE1", "testCE1") myResourceControl.insertSite("testSite2", 100, 200, "testSE2", "testCE2") siteInfo = myResourceControl.listSiteInfo("testSite1") self.assertEqual( siteInfo["site_name"], "testSite1", "Error: Site name is wrong." ) self.assertEqual( siteInfo["se_name"], ["testSE1"], "Error: SE name is wrong." ) self.assertEqual( siteInfo["ce_name"], "testCE1", "Error: CE name is wrong." ) self.assertEqual( siteInfo["pending_slots"], 10, "Error: Pending slots is wrong." ) self.assertEqual( siteInfo["running_slots"], 20, "Error: Pending slots is wrong." ) return
def setUp(self): """ _setUp_ Set everything up. """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl" ], useDefault=False) self.testInit.setupCouch("jobsubmittercaching_t/jobs", "JobDump") self.testInit.setupCouch("jobsubmittercaching_t/fwjrs", "FWJRDump") resourceControl = ResourceControl() for siteName in ["T1_US_FNAL", "T1_UK_RAL"]: resourceControl.insertSite(siteName=siteName, seName="se.%s" % (siteName), ceName=siteName, plugin="CondorPlugin", cmsName=siteName) resourceControl.insertThreshold(siteName=siteName, taskType="Processing", maxSlots=10000, pendingSlots=10000) self.testDir = self.testInit.generateWorkDir() #os.environ["COUCHDB"] = "jobsubmittercaching_t" self.configFile = EmulatorSetup.setupWMAgentConfig() return
def setUp(self): myThread = threading.currentThread() self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() #self.tearDown() self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database" ], useDefault=False) self.daoFactory = DAOFactory(package="WMCore.BossAir", logger=myThread.logger, dbinterface=myThread.dbi) resourceControl = ResourceControl() resourceControl.insertSite(siteName='Xanadu', pnn='se.Xanadu', ceName='Xanadu', plugin="TestPlugin") resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \ maxSlots = 10000, pendingSlots = 10000) # Create user wmbsFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) newuser = wmbsFactory(classname="Users.New") newuser.execute(dn="mnorman", group_name="phgroup", role_name="cmsrole")
def testThresholdPriority(self): """ _testThresholdPriority_ Test that we get things back in priority order """ myResourceControl = ResourceControl() myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1") myResourceControl.insertThreshold("testSite1", "Processing", 10, 8) myResourceControl.insertThreshold("testSite1", "Merge", 5, 3) # test default task priorities result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite1']['thresholds']['Merge']['priority'], 4) self.assertEqual(result['testSite1']['thresholds']['Processing']['priority'], 0) myResourceControl.changeTaskPriority("Merge", 3) myResourceControl.changeTaskPriority("Processing", 1) result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite1']['thresholds']['Merge']['priority'], 3) self.assertEqual(result['testSite1']['thresholds']['Processing']['priority'], 1) myResourceControl.changeTaskPriority("Merge", 1) myResourceControl.changeTaskPriority("Processing", 3) result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite1']['thresholds']['Merge']['priority'], 1) self.assertEqual(result['testSite1']['thresholds']['Processing']['priority'], 3) return
def testJobSiteDrain(self): """ _testJobSiteDrain_ Test the behavior of jobs pending to a single site that is in drain mode """ workload = self.createTestWorkload() config = self.getConfig() jobSubmitter = JobSubmitterPoller(config=config) myResourceControl = ResourceControl(config) changeState = ChangeState(config) getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") nSubs = 1 nJobs = 30 site = 'T2_US_Nebraska' self.setResourceThresholds(site, pendingSlots=100, runningSlots=100, tasks=['Processing', 'Merge'], Processing={'pendingSlots': 10, 'runningSlots': 10}, Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5}) jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, site=[site], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # submit first 10 jobs jobSubmitter.algorithm() result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 10) myResourceControl.changeSiteState(site, 'Draining') # site is now in drain, so don't submit anything jobSubmitter.algorithm() # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 10) result = getJobsAction.execute(state='created', jobType="Processing") self.assertEqual(len(result), 20) result = getJobsAction.execute(state='submitfailed', jobType="Processing") self.assertEqual(len(result), 0) # make sure the drain grace period expires... time.sleep(3) jobSubmitter.algorithm() result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 10) # the remaining jobs should have gone to submitfailed by now result = getJobsAction.execute(state='submitfailed', jobType="Processing") self.assertEqual(len(result), 20) result = getJobsAction.execute(state='created', jobType="Processing") self.assertEqual(len(result), 0)
def setUp(self): """ _setUp_ Setup the database and logging connection. Try to create all of the WMBS tables. Also, create some dummy locations. """ super(JobCreatorTest, self).setUp() self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.testInit.setSchema(customModules=[ 'WMCore.WMBS', 'WMCore.ResourceControl', 'WMCore.Agent.Database' ], useDefault=False) self.couchdbname = "jobcreator_t" self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump") self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump") self.configFile = EmulatorSetup.setupWMAgentConfig() myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) locationAction = self.daoFactory(classname="Locations.New") for site in self.sites: locationAction.execute(siteName=site, pnn=site) # Create sites in resourceControl resourceControl = ResourceControl() for site in self.sites: resourceControl.insertSite(siteName=site, pnn=site, ceName=site) resourceControl.insertThreshold(siteName=site, taskType='Processing', maxSlots=10000, pendingSlots=10000) self.resourceControl = resourceControl self._setup = True self._teardown = False self.testDir = self.testInit.generateWorkDir() self.cwd = os.getcwd() # Set heartbeat self.componentName = 'JobCreator' self.heartbeatAPI = HeartbeatAPI(self.componentName) self.heartbeatAPI.registerComponent() if PY3: self.assertItemsEqual = self.assertCountEqual return
def setUp(self): """ _setUp_ """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.testInit.setSchema(customModules=["WMCore.WMBS"]) self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting") myThread = threading.currentThread() self.myThread = myThread daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) self.WMBSFactory = daoFactory config = self.getConfig() self.changer = ChangeState(config) myResourceControl = ResourceControl() myResourceControl.insertSite("T1_US_FNAL", 10, 20, "T1_US_FNAL_Disk", "T1_US_FNAL") myResourceControl.insertSite("T1_US_FNAL", 10, 20, "T3_US_FNALLPC", "T1_US_FNAL") myResourceControl.insertSite("T2_CH_CERN", 10, 20, "T2_CH_CERN", "T2_CH_CERN") self.fileset1 = Fileset(name="TestFileset1") for fileNum in range(11): newFile = File("/some/file/name%d" % fileNum, size=1000, events=100) newFile.addRun(Run(1, *[1])) newFile.setLocation('T1_US_FNAL_Disk') self.fileset1.addFile(newFile) self.fileset1.create() workflow1 = Workflow(spec="spec.xml", owner="hufnagel", name="TestWorkflow1", task="Test") workflow1.create() self.subscription1 = Subscription(fileset=self.fileset1, workflow=workflow1, split_algo="Harvest", type="Harvesting") self.subscription1.create() self.configFile = EmulatorSetup.setupWMAgentConfig() return
def __init__(self, config): """ Initialize """ BaseWorkerThread.__init__(self) self.config = config self.ssb2AgentStatus = { 'enabled': 'Normal', 'drain': 'Draining', 'disabled': 'Down', 'test': 'Draining', 'unknown': None } self.tasksCPU = ['Processing', 'Production'] self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim'] self.minCPUSlots = 50 self.minIOSlots = 25 # get dashboard url, set metric columns from config _token = config.AgentStatusWatcher.grafanaToken self.grafanaURL = config.AgentStatusWatcher.grafanaURL self.grafanaAPIName = config.AgentStatusWatcher.grafanaSSB self.grafana = Grafana(_token, configDict={"endpoint": self.grafanaURL}) # set pending percentages from config self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent # sites forced to down self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', []) # agent team (for dynamic threshold) and queueParams (drain mode) self.teamName = config.Agent.teamName self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5) # only SSB sites self.onlySSB = config.AgentStatusWatcher.onlySSB # tier mode self.tier0Mode = hasattr(config, "Tier0Feeder") self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores # switch this component on/off self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True) # set resource control self.resourceControl = ResourceControl(config=self.config) # wmstats connection self.centralCouchDBReader = WMStatsReader( self.config.General.centralWMStatsURL)
def setup(self, parameters): """ Set db connection and prepare resource control """ # Interface to WMBS/BossAir db myThread = threading.currentThread() # set resource control self.resourceControl = ResourceControl(config = self.config) # wmstats connection self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)
def setUp(self): """ _setUp_ Set up vital components """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.testInit.setSchema(customModules = ["WMCore.WMBS",'WMCore.MsgService', 'WMCore.ResourceControl', 'WMCore.ThreadPool', 'WMCore.Agent.Database'], useDefault = False) myThread = threading.currentThread() self.daoFactory = DAOFactory(package = "WMCore.WMBS", logger = myThread.logger, dbinterface = myThread.dbi) locationAction = self.daoFactory(classname = "Locations.New") pendingSlots = self.daoFactory(classname = "Locations.SetPendingSlots") for site in self.sites: locationAction.execute(siteName = site, seName = 'se.%s' % (site), ceName = site) pendingSlots.execute(siteName = site, pendingSlots = 1000) #Create sites in resourceControl resourceControl = ResourceControl() for site in self.sites: resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site), ceName = site) resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \ maxSlots = 10000, pendingSlots = 10000) self.testDir = self.testInit.generateWorkDir() # Set heartbeat for component in self.components: heartbeatAPI = HeartbeatAPI(component) heartbeatAPI.registerComponent() return
def __init__(self, config): """ Initialize """ BaseWorkerThread.__init__(self) self.config = config self.tasksCPU = ['Processing', 'Production'] self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim'] self.minCPUSlots = 50 self.minIOSlots = 25 # get dashboard url, set metric columns from config self.dashboard = config.AgentStatusWatcher.dashboard self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric self.ssb = Dashboard(self.dashboard) # set pending percentages from config self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent # sites forced to down self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', []) # agent team (for dynamic threshold) and queueParams (drain mode) self.teamName = config.Agent.teamName self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5) # only SSB sites self.onlySSB = config.AgentStatusWatcher.onlySSB # tier mode self.tier0Mode = hasattr(config, "Tier0Feeder") self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores # switch this component on/off self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True) # set resource control self.resourceControl = ResourceControl(config=self.config) # wmstats connection self.centralCouchDBReader = WMStatsReader( self.config.AgentStatusWatcher.centralWMStatsURL)
def testThresholdPriority(self): """ _testThresholdPriority_ Test that we get things back in priority order """ myResourceControl = ResourceControl() myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1") myResourceControl.insertThreshold("testSite1", "Processing", 10, 8) myResourceControl.insertThreshold("testSite1", "Merge", 5, 3) myResourceControl.changeTaskPriority("Merge", 3) myResourceControl.changeTaskPriority("Processing", 1) result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite1']['thresholds'][0]['task_type'], 'Merge') self.assertEqual(result['testSite1']['thresholds'][1]['task_type'], 'Processing') myResourceControl.insertThreshold("testSite1", "Processing", 10, 8) myResourceControl.insertThreshold("testSite1", "Merge", 5, 3) myResourceControl.changeTaskPriority("Merge", 1) myResourceControl.changeTaskPriority("Processing", 3) # Should now be in reverse order result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite1']['thresholds'][1]['task_type'], 'Merge') self.assertEqual(result['testSite1']['thresholds'][0]['task_type'], 'Processing') myResourceControl.insertSite("testSite2", 20, 40, "testSE2", "testCE2") myResourceControl.insertThreshold("testSite2", "Processing", 10, 8) myResourceControl.insertThreshold("testSite2", "Merge", 5, 3) # Should be in the same order for site 1 and 2 result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite2']['thresholds'][0]['task_type'], result['testSite1']['thresholds'][0]['task_type']) self.assertEqual(result['testSite2']['thresholds'][1]['task_type'], result['testSite1']['thresholds'][1]['task_type']) myResourceControl.changeTaskPriority("Merge", 4) result = myResourceControl.listThresholdsForSubmit() self.assertEqual(result['testSite2']['thresholds'][0]['priority'], 4) return
def mapSitetoPNN(sites): """ Receives a list of site names, query resource control and return a list of PNNs. """ from WMCore.ResourceControl.ResourceControl import ResourceControl if not len(sites): return [] fakePNNs = [] rControl = ResourceControl() for site in sites: fakePNNs.extend(rControl.listSiteInfo(site)['pnn']) return fakePNNs
def setUp(self): """ setup for test. """ super(JobTrackerTest, self).setUp() myThread = threading.currentThread() self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() # self.testInit.clearDatabase(modules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl"]) self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl" ], useDefault=False) self.testInit.setupCouch("jobtracker_t/jobs", "JobDump") self.testInit.setupCouch("jobtracker_t/fwjrs", "FWJRDump") self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs") # Create sites in resourceControl resourceControl = ResourceControl() resourceControl.insertSite(siteName='malpaquet', pnn='se.malpaquet', ceName='malpaquet', plugin="CondorPlugin") resourceControl.insertThreshold(siteName='malpaquet', taskType='Processing', \ maxSlots=10000, pendingSlots=10000) locationAction = self.daoFactory(classname="Locations.New") locationAction.execute(siteName="malpaquet", pnn="malpaquet", ceName="malpaquet", plugin="CondorPlugin") # Create user newuser = self.daoFactory(classname="Users.New") newuser.execute(dn="jchurchill") # We actually need the user name self.user = getpass.getuser() self.testDir = self.testInit.generateWorkDir() self.configFile = EmulatorSetup.setupWMAgentConfig()
def testInsertAllSEs(self): """ _testInsertAllSEs_ Test to see if we can insert all SEs and Thresholds at once Depending on the WMCore.Services.SiteDB interface """ self.createConfig() resControlPath = os.path.join(getTestBase(), "../../bin/wmagent-resource-control") env = os.environ env['PYTHONPATH'] = ":".join(sys.path) cmdline = [ resControlPath, "--add-all-sites", "--plugin=CondorPlugin", "--pending-slots=100", "--running-slots=500", "--emulator" ] retval = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) (_, _) = retval.communicate() cmdline = [resControlPath, "--priority=20", "--task-type=Processing"] retval = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) (_, _) = retval.communicate() myResourceControl = ResourceControl() result = myResourceControl.listThresholdsForSubmit() self.assertTrue('T1_US_FNAL' in result.keys()) for x in result.keys(): self.assertEqual(len(result[x]['thresholds']), 8) self.assertEqual(result[x]['total_pending_slots'], 100) self.assertEqual(result[x]['total_running_slots'], 500) for thresh in result[x]['thresholds']: if thresh['task_type'] == 'Processing': self.assertEqual(thresh['priority'], 20) self.assertEqual(thresh['max_slots'], 500) # Verify that sites with more than one SE were added correctly. nebInfo = myResourceControl.listSiteInfo("T2_US_Nebraska") self.assertTrue(len(nebInfo["pnn"]) == 1) return
def testChangeSiteState(self): """ _testNewState_ Check that we can change the state between different values and retrieve it through the threshold methods """ self.tempDir = self.testInit.generateWorkDir() config = self.createConfig() myResourceControl = ResourceControl(config) myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1") myResourceControl.insertThreshold("testSite1", "Processing", 10, 5) result = myResourceControl.listThresholdsForCreate() self.assertEqual(result['testSite1']['state'], 'Normal', 'Error: Wrong site state') myResourceControl.changeSiteState("testSite1", "Down") result = myResourceControl.listThresholdsForCreate() self.assertEqual(result['testSite1']['state'], 'Down', 'Error: Wrong site state')
def setUp(self): """ _setUp_ Create two subscriptions: One that contains a single file and one that contains multiple files. """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection(destroyAllDatabase=True) self.testInit.setSchema(customModules=["WMCore.WMBS"]) self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting") self.myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=self.myThread.dbi) myResourceControl = ResourceControl() myResourceControl.insertSite("T1_US_FNAL", 1000, 2000, "T1_US_FNAL_Disk", "T1_US_FNAL") myResourceControl.insertSite("T2_CH_CERN", 1000, 2000, "T2_CH_CERN", "T2_CH_CERN") self.performanceParams = { 'timePerEvent': 12, 'memoryRequirement': 2300, 'sizePerEvent': 400 } # dummy workflow self.testWorkflow = Workflow(spec="spec.xml", owner="dmwm", name="testWorkflow", task="Test") self.testWorkflow.create() if PY3: self.assertItemsEqual = self.assertCountEqual return
def testChangeState(self): """ _testChangeState_ Check that we can change the state between different values and retrieve it through the threshold methods """ myResourceControl = ResourceControl() myResourceControl.insertSite("testSite1", 20, 40, "testSE1", "testCE1") myResourceControl.insertThreshold("testSite1", "Processing", 10, 5, priority=1) result = myResourceControl.listThresholdsForCreate() self.assertEqual(result['testSite1']['state'], 'Normal', 'Error: Wrong site state') myResourceControl.changeSiteState("testSite1", "Down") result = myResourceControl.listThresholdsForCreate() self.assertEqual(result['testSite1']['state'], 'Down', 'Error: Wrong site state')
def freeSlots(multiplier=1.0, minusRunning=False, allowedStates=['Normal'], knownCmsSites=None): """ Get free resources from wmbs. Specify multiplier to apply a ratio to the actual numbers. minusRunning control if running jobs should be counted """ from WMCore.ResourceControl.ResourceControl import ResourceControl rc_sites = ResourceControl().listThresholdsForCreate() sites = defaultdict(lambda: 0) for name, site in rc_sites.items(): if not site.get('cms_name'): logging.warning("Not fetching work for %s, cms_name not defined" % name) continue if knownCmsSites and site['cms_name'] not in knownCmsSites: logging.warning( "%s doesn't appear to be a known cms site, work may fail to be acquired for it" % site['cms_name']) if site['state'] not in allowedStates: continue slots = site['total_slots'] if minusRunning: slots -= site['pending_jobs'] sites[site['cms_name']] += (slots * multiplier) # At the end delete entries < 1 # This allows us to combine multiple sites under the same CMS_Name # Without going nuts for site in sites.keys(): if sites[site] < 1: del sites[site] return dict(sites)
def setUp(self): """ setup for test. """ myThread = threading.currentThread() self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.tearDown() self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database" ], useDefault=False) self.testInit.setupCouch("bossair_t/jobs", "JobDump") self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump") self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs") #Create sites in resourceControl resourceControl = ResourceControl() for site in self.sites: resourceControl.insertSite(siteName=site, pnn='se.%s' % (site), cmsName=site, ceName=site, plugin="CondorPlugin", pendingSlots=1000, runningSlots=2000) resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \ maxSlots = 1000, pendingSlots = 1000) resourceControl.insertSite(siteName='Xanadu', pnn='se.Xanadu', cmsName=site, ceName='Xanadu', plugin="TestPlugin") resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \ maxSlots = 10000, pendingSlots = 10000) resourceControl.insertSite(siteName='jade-cms.hip.fi', pnn='madhatter.csc.fi', cmsName=site, ceName='jade-cms.hip.fi', plugin="ARCPlugin") resourceControl.insertThreshold(siteName = 'jade-cms.hip.fi', taskType = 'Processing', \ maxSlots = 100, pendingSlots = 100) # using this for glite submissions resourceControl.insertSite(siteName='grid-ce-01.ba.infn.it', pnn='storm-se-01.ba.infn.it', cmsName=site, ceName='grid-ce-01.ba.infn.it', plugin='gLitePlugin') resourceControl.insertThreshold(siteName = 'grid-ce-01.ba.infn.it', taskType = 'Processing', \ maxSlots = 50, pendingSlots = 50) # Create user newuser = self.daoFactory(classname="Users.New") newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole") # We actually need the user name self.user = getpass.getuser() # Change this to the working dir to keep track of error and log files from condor self.testDir = self.testInit.generateWorkDir() # Set heartbeat componentName = 'test' self.heartbeatAPI = HeartbeatAPI(componentName) self.heartbeatAPI.registerComponent() componentName = 'JobTracker' self.heartbeatAPI2 = HeartbeatAPI(componentName) self.heartbeatAPI2.registerComponent() return
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.hostName = self.config.Agent.hostName self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int( getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.maxJobsThisCycle = self.maxJobsPerPoll # changes as per schedd limit self.cacheRefreshSize = int( getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int( getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) self.condorFraction = 0.75 # update during every algorithm cycle self.condorOverflowFraction = 0.2 self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting') # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory( classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory( classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory( classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr( self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache( ) self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return
def testE_SiteModesTest(self): """ _testE_SiteModesTest_ Test the behavior of the submitter in response to the different states of the sites """ workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 1 nJobs = 20 sites = [ 'T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL' ] for site in sites: self.setResourceThresholds(site, pendingSlots=10, runningSlots=-1, tasks=['Processing', 'Merge'], Processing={ 'pendingSlots': 10, 'runningSlots': -1 }, Merge={ 'pendingSlots': 10, 'runningSlots': -1, 'priority': 5 }) myResourceControl = ResourceControl(config) myResourceControl.changeSiteState('T2_US_Florida', 'Draining') # First test that we prefer Normal over drain, and T1 over T2/T3 jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, site=[x for x in sites], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) # Actually run it jobSubmitter.algorithm() getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection # Check assigned locations getLocationAction = self.daoFactory(classname="Jobs.GetLocation") locationDict = getLocationAction.execute([{ 'jobid': x } for x in result]) for entry in locationDict: loc = entry['site_name'] self.assertNotEqual(loc, 'T2_US_Florida') # Now set everything to down, check we don't submit anything for site in sites: myResourceControl.changeSiteState(site, 'Down') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, site=[x for x in sites], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() # Nothing is submitted despite the empty slots at Uniandes and Florida result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # Now set everything to Drain and create Merge jobs. Those should be submitted for site in sites: myResourceControl.changeSiteState(site, 'Draining') nSubsMerge = 1 nJobsMerge = 5 jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge, site=[x for x in sites], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, taskType='Merge') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() result = getJobsAction.execute(state='Executing', jobType='Merge') self.assertEqual(len(result), nSubsMerge * nJobsMerge) # Now set everything to Aborted, and create Merge jobs. Those should fail # since the can only run at one place for site in sites: myResourceControl.changeSiteState(site, 'Aborted') nSubsMerge = 1 nJobsMerge = 5 jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge, site=[x for x in sites], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, taskType='Merge') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() result = getJobsAction.execute(state='SubmitFailed', jobType='Merge') self.assertEqual(len(result), nSubsMerge * nJobsMerge) result = getJobsAction.execute(state='Executing', jobType='Processing') self.assertEqual(len(result), nSubs * nJobs) return