def testD_SubmitFailed(self):
    """
    _testD_SubmitFailed_

    Check if jobs without a possible site to run at go to SubmitFailed
    """
    specName = "basicWorkload"
    testWorkload = self.createTestWorkload()
    config = self.getConfig()
    stateChanger = ChangeState(config)

    nSubs = 2
    nJobs = 10
    specPath = os.path.join(self.testDir, 'workloadTest', specName)

    # An empty site list means there is nowhere for these jobs to run
    groups = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                  task=testWorkload.getTask("ReReco"),
                                  site=[],
                                  workloadSpec=specPath)
    for jobGroup in groups:
        stateChanger.propagate(jobGroup.jobs, 'created', 'new')

    poller = JobSubmitterPoller(config=config)
    poller.algorithm()

    # Jobs should go to submit failed
    getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    failed = getJobs.execute(state='SubmitFailed', jobType="Processing")
    self.assertEqual(len(failed), nSubs * nJobs)
    return
def testF_OverloadTest(self):
    """
    _OverloadTest_

    Test and see what happens if you put in more jobs
    than the sites can handle.

    Sets a one-slot "Silly" threshold per site, submits nSubs * nJobs
    jobs, and checks exactly one job per site ends up Executing while
    the rest stay Created.
    """
    resourceControl = ResourceControl()
    for site in self.sites:
        resourceControl.insertThreshold(siteName=site, taskType="Silly", maxSlots=1)

    # Require a clean condor queue so the running-job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 2
    nJobs = 10

    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        type="Silly",
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    # Should be one job for each site
    nSites = len(self.sites)
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSites)

    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Executing", jobType="Silly")
    self.assertEqual(len(result), nSites)
    result = getJobsAction.execute(state="Created", jobType="Silly")
    self.assertEqual(len(result), nJobs * nSubs - nSites)

    # Now clean-up: remove everything this user submitted to condor
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testD_SubmitFailed(self):
    """
    _testD_SubmitFailed_

    Check if jobs without a possible site to run at go to SubmitFailed
    """
    testWorkload = self.createTestWorkload()
    config = self.getConfig()
    stateChanger = ChangeState(config)

    nSubs = 2
    nJobs = 10

    # With an empty site list these jobs have no possible destination
    groups = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                  task=testWorkload.getTask("ReReco"),
                                  site=[],
                                  workloadSpec=self.workloadSpecPath)
    for jobGroup in groups:
        stateChanger.propagate(jobGroup.jobs, 'created', 'new')

    poller = JobSubmitterPoller(config=config)
    poller.algorithm()

    # Jobs should go to submit failed
    getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    failed = getJobs.execute(state='SubmitFailed', jobType="Processing")
    self.assertEqual(len(failed), nSubs * nJobs)
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.

    Submits jobs through the CondorPlugin with submitWMSMode enabled,
    then exercises BossAirAPI.updateSiteInformation() against the
    tracked Idle jobs.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')
    sn = "T2_US_UCSD"

    # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist
    # in BossAir_t.py
    baAPI.updateSiteInformation(idleJobs, sn, True)

    del jobSubmitter
    return
def testCaching(self):
    """
    _testCaching_

    Verify that JobSubmitter caching works.
    """
    poller = JobSubmitterPoller(self.createConfig())

    # Nothing has been injected yet, so the cache starts out empty
    poller.refreshCache()
    self.assertEqual(len(poller.cachedJobIDs), 0,
                     "Error: The job cache should be empty.")

    self.injectJobs()
    poller.refreshCache()
    # Verify the cache is full
    self.assertEqual(len(poller.cachedJobIDs), 20,
                     "Error: The job cache should contain 20 jobs.")

    killWorkflow("wf001")
    poller.refreshCache()
    # Verify that the workflow is gone from the cache
    self.assertEqual(len(poller.cachedJobIDs), 10,
                     "Error: The job cache should contain 10 jobs.")

    killWorkflow("wf002")
    poller.refreshCache()
    # Verify that the workflow is gone from the cache
    self.assertEqual(len(poller.cachedJobIDs), 0,
                     "Error: The job cache should be empty.")
    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from PyCondorPlugin.py

    Submits two jobs bound for T2_US_UCSD, marks the site as not
    usable, and verifies the plugin returns the affected jobs so they
    can be killed with a site-state error code.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(
        nRunning, 0,
        "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True
    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(
                                            self.testDir, 'workloadTest', workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    ##
    # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
    # updateSiteInformation() method should edit the classAd for all the jobs
    # that are bound for the site
    # Check the Q manually using condor_q -l <job id>
    #
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    if jtok is not None:
        # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)
        baAPI.kill(jtok, errorCode=61301)
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.

    Submits jobs through the CondorPlugin with submitWMSMode enabled,
    verifies the condor queue contains them all, then removes them.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(
        nRunning, 0,
        "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True
    # NOTE(review): baAPI is not referenced again below, but constructing it may
    # initialize BossAir plugin/DB state — kept deliberately; confirm before removing.
    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(
                                            self.testDir, 'workloadTest', workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.

    Submits jobs through the PyCondorPlugin with submitWMSMode enabled,
    verifies the condor queue contains them all, then kills the Idle
    jobs through BossAir.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(
        nRunning, 0,
        "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True
    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(
                                            self.testDir, 'workloadTest', workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    # Clean up by killing the submitted jobs through BossAir
    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')
    baAPI.kill(jobs=idleJobs)

    del jobSubmitter
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.

    Submits jobs through the PyCondorPlugin with submitWMSMode enabled,
    verifies the condor queue contains them all, then kills the Idle
    jobs through BossAir.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True
    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    # Clean up by killing the submitted jobs through BossAir
    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')
    baAPI.kill(jobs=idleJobs)

    del jobSubmitter
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.

    Submits jobs through the CondorPlugin with submitWMSMode enabled,
    verifies the condor queue contains them all, then removes them
    with condor_rm.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True
    # NOTE(review): baAPI is not referenced again below, but constructing it may
    # initialize BossAir plugin/DB state — kept deliberately; confirm before removing.
    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from CondorPlugin.py

    Submits two jobs bound for T2_US_UCSD, marks the site as not
    usable, and verifies the plugin returns the affected jobs so they
    can be killed with a site-state error code.
    """
    # Require a clean condor queue so job counts below are meaningful
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True
    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    ##
    # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
    # updateSiteInformation() method should edit the classAd for all the jobs
    # that are bound for the site
    # Check the Q manually using condor_q -l <job id>
    #
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    if jtok is not None:
        # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)
        baAPI.kill(jtok, errorCode=71301)
    return
def testJobSiteDrain(self):
    """
    _testJobSiteDrain_

    Test the behavior of jobs pending to a single site that is in drain mode

    Flow: submit 30 jobs to one site whose Processing threshold allows 10;
    10 go to Executing.  Put the site in Draining and re-run: nothing new
    is submitted.  After the drain grace period, the remaining 20 created
    jobs move to submitfailed.
    """
    workload = self.createTestWorkload()
    config = self.getConfig()
    jobSubmitter = JobSubmitterPoller(config=config)
    myResourceControl = ResourceControl(config)
    changeState = ChangeState(config)
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")

    nSubs = 1
    nJobs = 30
    site = 'T2_US_Nebraska'

    # Processing pending slots capped at 10, so only 10 of the 30 jobs can go out
    self.setResourceThresholds(site, pendingSlots=100, runningSlots=100,
                               tasks=['Processing', 'Merge'],
                               Processing={'pendingSlots': 10, 'runningSlots': 10},
                               Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5})

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        site=[site],
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # submit first 10 jobs
    jobSubmitter.algorithm()

    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 10)

    myResourceControl.changeSiteState(site, 'Draining')

    # site is now in drain, so don't submit anything
    jobSubmitter.algorithm()

    # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state='created', jobType="Processing")
    self.assertEqual(len(result), 20)
    result = getJobsAction.execute(state='submitfailed', jobType="Processing")
    self.assertEqual(len(result), 0)

    # make sure the drain grace period expires...
    # NOTE(review): assumes the configured grace period is shorter than 3s — confirm
    # against the test's config setup
    time.sleep(3)
    jobSubmitter.algorithm()

    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 10)
    # the remaining jobs should have gone to submitfailed by now
    result = getJobsAction.execute(state='submitfailed', jobType="Processing")
    self.assertEqual(len(result), 20)
    result = getJobsAction.execute(state='created', jobType="Processing")
    self.assertEqual(len(result), 0)
def testF_PollerProfileTest(self):
    """
    _testF_PollerProfileTest_

    Submit a lot of jobs and test how long it takes for them to
    actually be submitted.  Profiles one JobSubmitterPoller cycle
    with cProfile and dumps cumulative stats.
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 100
    nJobs = 100
    sites = ['T1_US_FNAL']

    for site in sites:
        self.setResourceThresholds(site, pendingSlots=20000, runningSlots=-1,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10000, 'runningSlots': -1},
                                   Merge={'pendingSlots': 10000, 'runningSlots': -1, 'priority': 5})

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config=config)

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site='se.%s' % 'T1_US_FNAL')

    jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                             task=workload.getTask("ReReco"),
                                             workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                             site='se.%s' % 'T1_US_FNAL',
                                             taskType='Merge'))

    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Actually run it, profiled
    startTime = time.time()
    cProfile.runctx("jobSubmitter.algorithm()", globals(), locals(), filename="testStats.stat")
    stopTime = time.time()

    # print() call form is valid on both Python 2 and 3; the original
    # py2-only print statement is a SyntaxError under py3
    print("Job took %f seconds to complete" % (stopTime - startTime))

    p = pstats.Stats('testStats.stat')
    p.sort_stats('cumulative')
    p.print_stats()
    return
def testCaching(self):
    """
    _testCaching_

    Verify that JobSubmitter caching works.
    """
    config = self.createConfig()
    poller = JobSubmitterPoller(config)

    # Nothing has been injected yet, so the cache starts out empty
    poller.refreshCache()
    self.assertEqual(
        len(poller.cachedJobIDs), 0,
        "Error: The job cache should be empty.")

    self.injectJobs()
    poller.refreshCache()
    # Verify the cache is full
    self.assertEqual(
        len(poller.cachedJobIDs), 20,
        "Error: The job cache should contain 20 jobs. Contains: %i" % len(poller.cachedJobIDs))

    killWorkflow("wf001", jobCouchConfig=config)
    poller.refreshCache()
    # Verify that the workflow is gone from the cache
    self.assertEqual(
        len(poller.cachedJobIDs), 10,
        "Error: The job cache should contain 10 jobs. Contains: %i" % len(poller.cachedJobIDs))

    killWorkflow("wf002", jobCouchConfig=config)
    poller.refreshCache()
    # Verify that the workflow is gone from the cache
    self.assertEqual(
        len(poller.cachedJobIDs), 0,
        "Error: The job cache should be empty. Contains: %i" % len(poller.cachedJobIDs))
    return
def preInitialization(self):
    """
    Setup the worker thread for jobSubmitter
    """
    logging.info("JobSubmitter.preInitialization")

    interval = self.config.JobSubmitter.pollInterval
    logging.info("Setting poll interval to %s seconds", interval)

    # Add event loop to worker manager
    poller = JobSubmitterPoller(self.config)
    thread = threading.currentThread()
    thread.workerThreadManager.addWorker(poller, interval)
    return
def testE_SiteModesTest(self):
    """
    _testE_SiteModesTest_

    Test the behavior of the submitter in response to the different
    states of the sites

    Flow: (1) with one site Draining, all jobs avoid it; (2) with all
    sites Down, nothing new is submitted; (3) with all sites Aborted,
    new Merge jobs go straight to SubmitFailed.
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)
    nSubs = 1
    nJobs = 20

    sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL']
    for site in sites:
        self.setResourceThresholds(site, pendingSlots = 10, runningSlots = -1,
                                   tasks = ['Processing', 'Merge'],
                                   Processing = {'pendingSlots' : 10, 'runningSlots' :-1},
                                   Merge = {'pendingSlots' : 10, 'runningSlots' :-1, 'priority' : 5})

    myResourceControl = ResourceControl()
    myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
    # First test that we prefer Normal over drain, and T1 over T2/T3
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        site = ['se.%s' % x for x in sites],
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobSubmitter = JobSubmitterPoller(config = config)
    # Actually run it
    jobSubmitter.algorithm()

    getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # All jobs should be at either FNAL, Taiwan or Uniandes.
    # It's a random selection
    # Check assigned locations
    getLocationAction = self.daoFactory(classname = "Jobs.GetLocation")
    locationDict = getLocationAction.execute([{'jobid' : x} for x in result])
    for entry in locationDict:
        loc = entry['site_name']
        # The draining site must never be chosen
        self.assertNotEqual(loc, 'T2_US_Florida')

    # Now set everything to down, check we don't submit anything
    for site in sites:
        myResourceControl.changeSiteState(site, 'Down')
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        site = ['se.%s' % x for x in sites],
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobSubmitter.algorithm()
    # Nothing is submitted despite the empty slots at Uniandes and Florida
    # (the Executing count is unchanged from the first batch)
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Now set everything to Aborted, and create Merge jobs. Those should fail
    # since the can only run at one place
    for site in sites:
        myResourceControl.changeSiteState(site, 'Aborted')

    nSubsMerge = 1
    nJobsMerge = 5
    jobGroupList = self.createJobGroups(nSubs = nSubsMerge, nJobs = nJobsMerge,
                                        site = ['se.%s' % x for x in sites],
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        taskType = 'Merge')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    result = getJobsAction.execute(state = 'SubmitFailed', jobType = 'Merge')
    self.assertEqual(len(result), nSubsMerge * nJobsMerge)
    result = getJobsAction.execute(state = 'Executing', jobType = 'Processing')
    self.assertEqual(len(result), nSubs * nJobs)
    return
def testD_WhiteListBlackList(self):
    """
    _testD_WhiteListBlackList_

    Test the whitelist/blacklist implementation
    Trust the jobCreator to get this in the job right

    Flow: (1) blacklist all sites but one -> jobs land on the remaining
    site; (2) whitelist a single site -> jobs land there; (3) whitelist
    a non-existent site -> jobs go to SubmitFailed; (4) blacklist every
    site -> jobs go to SubmitFailed.
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 2
    nJobs = 10

    sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T2_CH_CERN', 'T3_CO_Uniandes']
    for site in sites:
        self.setResourceThresholds(site, pendingSlots = 1000, runningSlots = -1,
                                   tasks = ['Processing', 'Merge'],
                                   Processing = {'pendingSlots' : 5000, 'runningSlots' :-1},
                                   Merge = {'pendingSlots' : 1000, 'runningSlots' :-1, 'priority' : 5})

    # Blacklist everything except the last site (T3_CO_Uniandes)
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        site = 'se.%s' % sites[-1],
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        bl = sites[:-1])
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config = config)

    # Actually run it
    jobSubmitter.algorithm()

    getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # All jobs should be at T3_CO_Uniandes
    # Check assigned locations
    getLocationAction = self.daoFactory(classname = "Jobs.GetLocation")
    locationDict = getLocationAction.execute([{'jobid' : x} for x in result])
    for entry in locationDict:
        loc = entry['site_name']
        self.assertEqual(loc, 'T3_CO_Uniandes')

    # Run again and test the whiteList
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        site = 'se.%s' % 'T2_CH_CERN',
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        wl = ['T2_CH_CERN'])
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Run it
    jobSubmitter.algorithm()

    # You'll have jobs from the previous run still in the database
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)

    # All jobs should be at CERN or Uniandes
    # (only the second batch — entries past the first nSubs*nJobs — is checked)
    locationDict = getLocationAction.execute([{'jobid' : x} for x in result])
    for entry in locationDict[nSubs * nJobs:]:
        loc = entry['site_name']
        self.assertEqual(loc, 'T2_CH_CERN')

    # Run again with an invalid whitelist
    # After this point, the original two sets of jobs will be executing
    # The rest of the jobs should move to submitFailed
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        site = 'se.%s' % 'T2_CH_CERN',
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        wl = ['T2_US_Namibia'])
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    # Jobs should be gone
    getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    result = getJobsAction.execute(state = 'SubmitFailed', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Run again with all sites blacklisted
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        site = ['se.%s' % x for x in sites],
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        bl = sites)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    # Jobs should go to submit failed (cumulative: two failed batches now)
    getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    result = getJobsAction.execute(state = 'SubmitFailed', jobType = "Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    return
def testC_prioritization(self):
    """
    _testC_prioritization_

    Check that jobs are prioritized by job type and by oldest workflow

    Flow: with 10 pending slots, Merge jobs are submitted before
    Processing; then, as slots free up, Processing jobs from the
    oldest workflow go before those of a newer workflow.
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 10

    sites = ['T1_US_FNAL']
    for site in sites:
        self.setResourceThresholds(site, pendingSlots = 10, runningSlots = -1,
                                   tasks = ['Processing', 'Merge'],
                                   Processing = {'pendingSlots' : 50, 'runningSlots' :-1},
                                   Merge = {'pendingSlots' : 10, 'runningSlots' :-1, 'priority' : 5})

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config = config)

    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        site = 'se.%s' % 'T1_US_FNAL',
                                        name = 'OldestWorkflow')
    jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                             task = workload.getTask("ReReco"),
                                             workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                         workloadName),
                                             site = 'se.%s' % 'T1_US_FNAL',
                                             taskType = 'Merge'))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    # Merge goes first: the 10 pending slots are consumed entirely by Merge
    getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
    result = getJobsAction.execute(state = 'Created', jobType = "Merge")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state = 'Executing', jobType = "Merge")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state = 'Created', jobType = "Processing")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
    self.assertEqual(len(result), 0)

    # Create a newer workflow processing, and after some new jobs for an old workflow
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName),
                                        site = 'se.%s' % 'T1_US_FNAL',
                                        name = 'NewestWorkflow')
    jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                             task = workload.getTask("ReReco"),
                                             workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                         workloadName),
                                             site = 'se.%s' % 'T1_US_FNAL',
                                             name = 'OldestWorkflow'))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Move pending jobs to running, then resubmit; repeated twice so that
    # 10 more Processing jobs are released each iteration
    getRunJobID = self.baDaoFactory(classname = "LoadByWMBSID")
    setRunJobStatus = self.baDaoFactory(classname = "SetStatus")

    for idx in range(2):
        result = getJobsAction.execute(state = 'Executing')
        binds = []
        for jobId in result:
            binds.append({'id' : jobId, 'retry_count' : 0})
        runJobIds = getRunJobID.execute(binds)
        setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')

        # Run again on created workflows
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state = 'Created', jobType = "Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state = 'Executing', jobType = "Merge")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state = 'Created', jobType = "Processing")
        self.assertEqual(len(result), 30 - (idx + 1) * 10)
        result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
        self.assertEqual(len(result), (idx + 1) * 10)

        # Check that older workflow goes first even with newer jobs
        getWorkflowAction = self.daoFactory(classname = "Jobs.GetWorkflowTask")
        workflows = getWorkflowAction.execute(result)
        for workflow in workflows:
            self.assertEqual(workflow['name'], 'OldestWorkflow')
    return
def testCaching(self):
    """
    _testCaching_

    Verify that JobSubmitter caching works.
    """
    config = self.createConfig()
    poller = JobSubmitterPoller(config)
    poller.getThresholds()

    # Nothing has been injected yet, so the cache starts out empty
    poller.refreshCache()
    self.assertEqual(
        len(poller.jobDataCache), 0,
        "Error: The job cache should be empty.")

    self.injectJobs()
    poller.refreshCache()
    # Verify the cache is full
    self.assertEqual(
        len(poller.jobDataCache), 20,
        "Error: The job cache should contain 20 jobs. Contains: %i" % len(poller.jobDataCache))

    killWorkflow("wf001", jobCouchConfig=config)
    poller.refreshCache()
    # Verify that the workflow is gone from the cache
    self.assertEqual(
        len(poller.jobDataCache), 10,
        "Error: The job cache should contain 10 jobs. Contains: %i" % len(poller.jobDataCache))

    killWorkflow("wf002", jobCouchConfig=config)
    poller.refreshCache()
    # Verify that the workflow is gone from the cache
    self.assertEqual(
        len(poller.jobDataCache), 0,
        "Error: The job cache should be empty. Contains: %i" % len(poller.jobDataCache))
    return
def testMemoryProfile(self):
    """
    _testMemoryProfile_

    Creates 20k jobs and keep refreshing the cache and submitting
    them between the components cycle

    Example using memory_profiler library, unfortunately the source
    code has to be updated with decorators.
    NOTE: Never run it on jenkins
    """
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)
    # myResourceControl = ResourceControl(config)

    nSubs = 20
    nJobs = 100

    sites = ['T2_US_Florida', 'T2_RU_INR', 'T3_CO_Uniandes', 'T1_US_FNAL']
    # Thresholds are set for every PNN known to CRIC, not just the four above
    allSites = CRIC().PSNtoPNNMap('*')

    for site in allSites:
        self.setResourceThresholds(site, pendingSlots=20000, runningSlots=999999,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10000, 'runningSlots': 999999},
                                   Merge={'pendingSlots': 10000, 'runningSlots': 999999, 'priority': 5})

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config=config)
    self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                         task=workload.getTask("ReReco"),
                         workloadSpec=self.workloadSpecPath,
                         site=[x for x in sites], changeState=changeState)

    # Actually run it
    jobSubmitter.algorithm()  # cycle 1

    self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                         task=workload.getTask("ReReco"),
                         workloadSpec=self.workloadSpecPath,
                         site=[x for x in sites], changeState=changeState)
    # myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
    jobSubmitter.algorithm()  # cycle 2

    self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                         task=workload.getTask("ReReco"),
                         workloadSpec=self.workloadSpecPath,
                         site=[x for x in sites], changeState=changeState)
    # myResourceControl.changeSiteState('T2_RU_INR', 'Draining')
    jobSubmitter.algorithm()  # cycle 3

    self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                         task=workload.getTask("ReReco"),
                         workloadSpec=self.workloadSpecPath,
                         site=[x for x in sites], changeState=changeState)
    # myResourceControl.changeSiteState('T3_CO_Uniandes', 'Draining')
    jobSubmitter.algorithm()  # cycle 4

    # The commented-out site-state toggles are left in place as the knobs to
    # flip when profiling drain/normal transitions by hand
    # myResourceControl.changeSiteState('T2_RU_INR', 'Normal')
    jobSubmitter.algorithm()  # cycle 5

    # myResourceControl.changeSiteState('T2_US_Florida', 'Normal')
    jobSubmitter.algorithm()  # cycle 6

    # myResourceControl.changeSiteState('T2_RU_INR', 'Normal')
    jobSubmitter.algorithm()  # cycle 7

    # myResourceControl.changeSiteState('T3_CO_Uniandes', 'Normal')
    jobSubmitter.algorithm()  # cycle 8

    jobSubmitter.algorithm()  # cycle 9, nothing to submit

    return
def testE_SiteModesTest(self):
    """
    _testE_SiteModesTest_

    Test the behavior of the submitter in response to the different
    states of the sites: Normal, Down, Draining and Aborted.
    """
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)
    nSubs = 1
    nJobs = 20

    sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL']
    for site in sites:
        self.setResourceThresholds(site, pendingSlots=10, runningSlots=-1, tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10, 'runningSlots': -1},
                                   Merge={'pendingSlots': 10, 'runningSlots': -1, 'priority': 5})

    myResourceControl = ResourceControl(config)
    myResourceControl.changeSiteState('T2_US_Florida', 'Draining')

    # First test that we prefer Normal over drain, and T1 over T2/T3
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, site=[x for x in sites],
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection
    # Check assigned locations: nothing may land on the draining site
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    locationDict = getLocationAction.execute([{'jobid': x} for x in result])
    for entry in locationDict:
        loc = entry['site_name']
        self.assertNotEqual(loc, 'T2_US_Florida')

    # Now set everything to down, check we don't submit anything
    for site in sites:
        myResourceControl.changeSiteState(site, 'Down')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, site=[x for x in sites],
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobSubmitter.algorithm()

    # Nothing is submitted despite the empty slots at Uniandes and Florida;
    # the Executing count is unchanged from the first cycle
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Now set everything to Drain and create Merge jobs. Those should be submitted
    for site in sites:
        myResourceControl.changeSiteState(site, 'Draining')
    nSubsMerge = 1
    nJobsMerge = 5
    jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge, site=[x for x in sites],
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        taskType='Merge')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobSubmitter.algorithm()
    result = getJobsAction.execute(state='Executing', jobType='Merge')
    self.assertEqual(len(result), nSubsMerge * nJobsMerge)

    # Now set everything to Aborted, and create Merge jobs. Those should fail
    # since they can only run at one place
    for site in sites:
        myResourceControl.changeSiteState(site, 'Aborted')
    nSubsMerge = 1
    nJobsMerge = 5
    jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge, site=[x for x in sites],
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        taskType='Merge')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobSubmitter.algorithm()
    result = getJobsAction.execute(state='SubmitFailed', jobType='Merge')
    self.assertEqual(len(result), nSubsMerge * nJobsMerge)
    # The Processing jobs submitted in the first cycle are still executing
    result = getJobsAction.execute(state='Executing', jobType='Processing')
    self.assertEqual(len(result), nSubs * nJobs)

    return
def testC_prioritization(self):
    """
    _testC_prioritization_

    Check that jobs are prioritized by job type and by oldest workflow
    """
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 10
    site = "T1_US_FNAL"

    # Pending slots limit submission to 10 per cycle at the site level
    self.setResourceThresholds(site, pendingSlots=10, runningSlots=10000, tasks=['Processing', 'Merge'],
                               Processing={'pendingSlots': 50, 'runningSlots': 10000},
                               Merge={'pendingSlots': 10, 'runningSlots': 10000, 'priority': 5})

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config=config)

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, name='OldestWorkflow')
    jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                             task=workload.getTask("ReReco"),
                                             workloadSpec=self.workloadSpecPath,
                                             site=site, taskType='Merge'))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    # Merge goes first: it fills the 10 pending slots before Processing
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Created', jobType="Merge")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Executing', jobType="Merge")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 0)

    # Create a newer workflow processing, and after some new jobs for an old workflow
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, name='OldestWorkflow')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, name='NewestWorkflow')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Move pending jobs to running so each cycle frees the pending slots
    getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
    setRunJobStatus = self.baDaoFactory(classname="SetStatus")
    for idx in range(2):
        result = getJobsAction.execute(state='Executing')
        binds = []
        for jobId in result:
            binds.append({'id': jobId, 'retry_count': 0})
        runJobIds = getRunJobID.execute(binds)
        setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')

        # Run again on created workflows
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 10)
        # Each cycle drains 10 Processing jobs out of the 30 created
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 30 - (idx + 1) * 10)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), (idx + 1) * 10)

        # Check that older workflow goes first even with newer jobs
        getWorkflowAction = self.daoFactory(classname="Jobs.GetWorkflowTask")
        workflows = getWorkflowAction.execute(result)
        for workflow in workflows:
            self.assertEqual(workflow['name'], 'OldestWorkflow')

    return
def testD_CreamCETest(self):
    """
    _CreamCETest_

    This is for submitting to Cream CEs. Don't use it.
    """
    # NOTE: deliberately disabled by this early return — everything below
    # is unreachable and kept only as reference for Cream CE submission.
    return

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    workloadName = "basicWorkload"
    myThread = threading.currentThread()
    workload = self.createTestWorkload()
    config = self.getConfig()
    config.JobSubmitter.pluginName = "CreamPlugin"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 10
    cacheDir = os.path.join(self.testDir, "CacheDir")

    # Add a new site
    siteName = "creamSite"
    ceName = "https://cream-1-fzk.gridka.de:8443/ce-cream/services/CREAM2 pbs cmsXS"
    # ceName = "127.0.0.1"
    locationAction = self.daoFactory(classname="Locations.New")
    pendingSlots = self.daoFactory(classname="Locations.SetPendingSlots")
    locationAction.execute(siteName=siteName, seName=siteName, ceName=ceName)
    pendingSlots.execute(siteName=siteName, pendingSlots=1000)

    resourceControl = ResourceControl()
    resourceControl.insertSite(siteName=siteName, seName=siteName, ceName=ceName)
    resourceControl.insertThreshold(siteName=siteName, taskType="Processing", maxSlots=10000)

    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        site=siteName,
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    # Check that jobs are in the right state
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Created", jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Now clean-up
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # Keep a copy of the submit cache for manual inspection
    if os.path.exists("CacheDir"):
        shutil.rmtree("CacheDir")
    shutil.copytree(self.testDir, "CacheDir")

    return
def testE_WhiteListBlackList(self):
    """
    _WhiteListBlackList_

    Test the whitelist/blacklist implementation.
    Trust the jobCreator to get this in the job right.

    Scenarios exercised in order:
      1. blacklist every site except T2_US_UCSD -> jobs run at UCSD
      2. whitelist T2_US_UCSD only             -> jobs run at UCSD
      3. whitelist an unknown site             -> jobs go to SubmitFailed
      4. blacklist every site                  -> jobs go to SubmitFailed
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 2
    nJobs = 10
    cacheDir = os.path.join(self.testDir, "CacheDir")

    # Scenario 1: blacklist everything but UCSD
    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        bl=["T2_US_Florida", "T2_TW_Taiwan", "T1_CH_CERN"],
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    # Preserve the submit cache for inspection
    if os.path.isdir("CacheDir"):
        shutil.rmtree("CacheDir")
    shutil.copytree("%s" % self.testDir, os.path.join(os.getcwd(), "CacheDir"))

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs * nSubs)

    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # All jobs should be at UCSD
    submitFile = None
    for jdlFile in os.listdir(config.JobSubmitter.submitDir):
        if re.search("submit", jdlFile):
            submitFile = jdlFile
    self.assertIsNotNone(submitFile)
    self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD")

    # Now clean-up
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # Scenario 2: run again and test the whiteList
    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        wl=["T2_US_UCSD"],
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    if os.path.isdir("CacheDir"):
        shutil.rmtree("CacheDir")
    shutil.copytree("%s" % self.testDir, os.path.join(os.getcwd(), "CacheDir"))

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs * nSubs)

    # You'll have jobs from the previous run still in the database
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)

    # All jobs should be at UCSD
    submitFile = None
    for jdlFile in os.listdir(config.JobSubmitter.submitDir):
        if re.search("submit", jdlFile):
            submitFile = jdlFile
    self.assertIsNotNone(submitFile)
    self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD", noIndex=True)

    # Now clean-up
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # Scenario 3: run again with an invalid whitelist
    # NOTE: After this point, the original two sets of jobs will be executing
    # The rest of the jobs should move to submitFailed
    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        wl=["T2_US_Namibia"],
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    # Check to make sure we have running jobs
    # nRunning = getCondorRunningJobs(self.user)
    # self.assertEqual(nRunning, 0)

    # Jobs should be gone
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    result = getJobsAction.execute(state="SubmitFailed", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Now clean-up
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # Scenario 4: run again with all sites blacklisted
    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        bl=self.sites,
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Jobs should be gone
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    result = getJobsAction.execute(state="SubmitFailed", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)

    # Now clean-up
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testG_IndexErrorTest(self):
    """
    _IndexErrorTest_

    Check to see you get proper indexes for the jobPackages if you have
    more jobs than you normally run at once (jobsPerWorker and
    collectionSize are both forced to 1).
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    # Force one job per worker and per collection so indexing is exercised
    config.JobSubmitter.jobsPerWorker = 1
    config.JobSubmitter.collectionSize = 1
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 10
    cacheDir = os.path.join(self.testDir, "CacheDir")

    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        site="se.T2_US_UCSD",
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    # Do pre-submit check
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Created", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    # Preserve the submit cache for inspection
    if os.path.exists("CacheDir"):
        shutil.rmtree("CacheDir")
    shutil.copytree(self.testDir, "CacheDir")

    # Check that jobs are in the right state
    result = getJobsAction.execute(state="Created", jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Check on the JDL
    submitFile = None
    for jdlFile in os.listdir(config.JobSubmitter.submitDir):
        if re.search("submit", jdlFile):
            submitFile = jdlFile
    self.assertIsNotNone(submitFile)
    self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD", indexFlag=True)

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs * nSubs)

    # Now clean-up
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testF_WMSMode(self): """ _WMSMode_ Try running things in WMS Mode. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'CondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config=config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 dummycacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site=None) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status='Idle') sn = "T2_US_UCSD" # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist # in BossAir_t.py baAPI.updateSiteInformation(idleJobs, sn, True) # Now kill 'em manually # command = ['condor_rm', self.user] # pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False) # pipe.communicate() del jobSubmitter return
def testC_prioritization(self):
    """
    _testC_prioritization_

    Check that jobs are prioritized by job type and by oldest workflow
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 10
    site = "T1_US_FNAL"

    # Site pending slots (10) are the limiting factor per cycle
    self.setResourceThresholds(
        site,
        pendingSlots=10,
        runningSlots=-1,
        tasks=["Processing", "Merge"],
        Processing={"pendingSlots": 50, "runningSlots": -1},
        Merge={"pendingSlots": 10, "runningSlots": -1, "priority": 5},
    )

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config=config)

    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        site=site,
        name="OldestWorkflow",
    )
    jobGroupList.extend(
        self.createJobGroups(
            nSubs=nSubs,
            nJobs=nJobs,
            task=workload.getTask("ReReco"),
            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
            site=site,
            taskType="Merge",
        )
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    jobSubmitter.algorithm()

    # Merge goes first: it takes the 10 available pending slots
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Created", jobType="Merge")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state="Executing", jobType="Merge")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state="Created", jobType="Processing")
    self.assertEqual(len(result), 10)
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), 0)

    # Create a newer workflow processing, and after some new jobs for an old workflow
    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        site=site,
        name="NewestWorkflow",
    )
    jobGroupList.extend(
        self.createJobGroups(
            nSubs=nSubs,
            nJobs=nJobs,
            task=workload.getTask("ReReco"),
            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
            site=site,
            name="OldestWorkflow",
        )
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    # Move pending jobs to running so the next cycle has free pending slots
    getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
    setRunJobStatus = self.baDaoFactory(classname="SetStatus")
    for idx in range(2):
        result = getJobsAction.execute(state="Executing")
        binds = []
        for jobId in result:
            binds.append({"id": jobId, "retry_count": 0})
        runJobIds = getRunJobID.execute(binds)
        setRunJobStatus.execute([x["id"] for x in runJobIds], "Running")

        # Run again on created workflows
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state="Created", jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state="Executing", jobType="Merge")
        self.assertEqual(len(result), 10)
        # Each cycle drains 10 of the 30 created Processing jobs
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 30 - (idx + 1) * 10)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), (idx + 1) * 10)

        # Check that older workflow goes first even with newer jobs
        getWorkflowAction = self.daoFactory(classname="Jobs.GetWorkflowTask")
        workflows = getWorkflowAction.execute(result)
        for workflow in workflows:
            self.assertEqual(workflow["name"], "OldestWorkflow")

    return
def testA_BasicTest(self):
    """
    Submit two Processing subscriptions followed by a Merge one through
    the MockPlugin and verify every job ends up in 'Executing'.

    Thresholds are set high enough that they never limit submission;
    this test only cares that all created jobs get "submitted" and that
    an idle polling cycle submits nothing new.
    """
    testWorkload = self.createTestWorkload()
    config = self.getConfig()
    stateChanger = ChangeState(config)

    nSubs = 2
    nJobs = 20
    site = "T2_US_UCSD"

    # Generous site/task thresholds: submission should never be throttled
    self.setResourceThresholds(site, pendingSlots=50, runningSlots=100,
                               tasks=['Processing', 'Merge'],
                               Processing={'pendingSlots': 50, 'runningSlots': 100},
                               Merge={'pendingSlots': 50, 'runningSlots': 100})

    for jobGroup in self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                         task=testWorkload.getTask("ReReco"),
                                         workloadSpec=self.workloadSpecPath,
                                         site=site):
        stateChanger.propagate(jobGroup.jobs, 'created', 'new')

    # Sanity check: all jobs are waiting in 'Created' before the cycle runs
    jobListDAO = self.daoFactory(classname="Jobs.GetAllJobs")
    createdJobs = jobListDAO.execute(state='Created', jobType="Processing")
    self.assertEqual(len(createdJobs), nSubs * nJobs)

    poller = JobSubmitterPoller(config=config)
    poller.algorithm()

    # After one cycle every Processing job must have moved to 'Executing'
    self.assertEqual(len(jobListDAO.execute(state='Created', jobType="Processing")), 0)
    executingJobs = jobListDAO.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(executingJobs), nSubs * nJobs)

    # Every job must have been assigned to the only available site
    locationDAO = self.daoFactory(classname="Jobs.GetLocation")
    for wmbsJobId in executingJobs:
        self.assertEqual(locationDAO.execute(jobid=wmbsJobId), [['T2_US_UCSD']])

    # An idle cycle has nothing to submit and must not change any state
    poller.algorithm()
    self.assertEqual(len(jobListDAO.execute(state='Created', jobType="Processing")), 0)
    self.assertEqual(len(jobListDAO.execute(state='Executing', jobType="Processing")),
                     nSubs * nJobs)

    nSubs = 1
    nJobs = 10

    # Feed in a batch of Merge jobs and run one more cycle
    for jobGroup in self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                         task=testWorkload.getTask("ReReco"),
                                         workloadSpec=self.workloadSpecPath,
                                         site=site, taskType="Merge"):
        stateChanger.propagate(jobGroup.jobs, 'created', 'new')

    # The Merge jobs are available for submission before the cycle...
    self.assertEqual(len(jobListDAO.execute(state='Created', jobType="Merge")),
                     nSubs * nJobs)
    poller.algorithm()

    # ...and fully submitted after it
    self.assertEqual(len(jobListDAO.execute(state='Created', jobType="Merge")), 0)
    self.assertEqual(len(jobListDAO.execute(state='Executing', jobType="Merge")),
                     nSubs * nJobs)

    return
def testB_thresholdTest(self):
    """
    _testB_thresholdTest_

    Check that the threshold management is working,
    this requires checks on pending/running jobs globally
    at a site and per task/site
    """
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10
    site = "T1_US_FNAL"

    # Site: 50 pending / 220 running; Processing task: 45/200; Merge: 10/20
    self.setResourceThresholds(site, pendingSlots=50, runningSlots=220, tasks=['Processing', 'Merge'],
                               Processing={'pendingSlots': 45, 'runningSlots': 200},
                               Merge={'pendingSlots': 10, 'runningSlots': 20, 'priority': 5})

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config=config)

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Do pre-submit check
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    jobSubmitter.algorithm()

    # Check that jobs are in the right state,
    # here we are limited by the pending threshold for the Processing task (45)
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 45)

    # Check assigned locations
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    for jobId in result:
        loc = getLocationAction.execute(jobid=jobId)
        self.assertEqual(loc, [['T1_US_FNAL']])

    # Run another cycle, it shouldn't submit anything. Jobs are still in pending
    jobSubmitter.algorithm()
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 45)

    # Now put 10 Merge jobs, only 5 can be submitted, there we hit the global pending threshold for the site
    nSubs = 1
    nJobs = 10
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Merge')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()
    result = getJobsAction.execute(state='Created', jobType="Merge")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Merge")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 45)

    # Now let's test running thresholds
    # The scenario will be setup as follows: Move all current jobs as running
    # Create 300 Processing jobs and 300 merge jobs
    # Run 5 polling cycles, moving all pending jobs to running in between
    # Result is, merge is left at 30 running 0 pending and processing is left at 240 running 0 pending
    # Processing has 110 jobs in queue and Merge 280
    # This tests all threshold dynamics including the prioritization of merge over processing
    nSubs = 1
    nJobs = 300
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site)
    jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                             task=workload.getTask("ReReco"),
                                             workloadSpec=self.workloadSpecPath,
                                             site=site, taskType='Merge'))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
    setRunJobStatus = self.baDaoFactory(classname="SetStatus")

    for i in range(5):
        # Promote everything currently pending to Running in BossAir
        result = getJobsAction.execute(state='Executing')
        binds = []
        for jobId in result:
            binds.append({'id': jobId, 'retry_count': 0})
        runJobIds = getRunJobID.execute(binds)
        setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')
        jobSubmitter.algorithm()

    # Final state after 5 cycles: running thresholds are saturated
    result = getJobsAction.execute(state='Executing', jobType='Processing')
    self.assertEqual(len(result), 240)
    result = getJobsAction.execute(state='Created', jobType='Processing')
    self.assertEqual(len(result), 110)
    result = getJobsAction.execute(state='Executing', jobType='Merge')
    self.assertEqual(len(result), 30)
    result = getJobsAction.execute(state='Created', jobType='Merge')
    self.assertEqual(len(result), 280)

    return
def testD_PrototypeChain(self):
    """
    _PrototypeChain_

    Prototype the BossAir workflow: submit through JobSubmitterPoller,
    track state transitions via StatusPoller, and verify that jobs which
    time out in Pending end up failed through JobTrackerPoller.
    """
    dummymyThread = threading.currentThread()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'SimpleCondorPlugin'

    baAPI = BossAirAPI(config=config, insertStates=True)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10
    dummycacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site='se.T2_US_UCSD')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobTracker = JobTrackerPoller(config=config)
    statusPoller = StatusPoller(config=config)

    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    # Fresh submissions sit in BossAir state 'New'
    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nSubs * nJobs)

    # Check WMBS
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    statusPoller.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    # The status poller moves everything from 'New' to 'Idle'
    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)
    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nSubs * nJobs)

    # Tracker should do nothing
    jobTracker.algorithm()
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Wait for jobs to timeout due to short Pending wait period
    time.sleep(12)

    statusPoller.algorithm()

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), 0)
    newJobs = baAPI._loadByStatus(status='Timeout', complete='0')
    self.assertEqual(len(newJobs), nSubs * nJobs)

    # Jobs should be gone
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Check if they're complete
    completeJobs = baAPI.getComplete()
    self.assertEqual(len(completeJobs), nSubs * nJobs)

    # Because they timed out, they all should have failed
    jobTracker.algorithm()
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='JobFailed', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    return
def testA_BasicTest(self):
    """
    Use the CondorGlobusPlugin to create a very simple test.
    Check to see that all the jobs were submitted,
    parse and test the JDL files, and see what condor says.
    """
    # Removed unused myThread local; renamed loop vars that shadowed
    # the builtins 'file' and 'id'.
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 10
    cacheDir = os.path.join(self.testDir, "CacheDir")

    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        site="se.T2_US_UCSD",
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")

    # Do pre-submit check
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state="Created", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    # Check that jobs are in the right state
    result = getJobsAction.execute(state="Created", jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state="Executing", jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Check assigned locations
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    for jobId in result:
        loc = getLocationAction.execute(jobid=jobId)
        self.assertEqual(loc, [["T2_US_UCSD"]])

    # Check on the JDL: the submitter must have written a submit file
    submitFile = None
    for fileName in os.listdir(config.JobSubmitter.submitDir):
        if re.search("submit", fileName):
            submitFile = fileName
    self.assertIsNotNone(submitFile)
    self.checkJDL(config=config, cacheDir=cacheDir,
                  submitFile=submitFile, site="T2_US_UCSD")

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs * nSubs)

    # This should do nothing: identical jobs are already pending
    jobGroupList = self.createJobGroups(
        nSubs=nSubs,
        nJobs=nJobs,
        task=workload.getTask("ReReco"),
        workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
        site="se.T2_US_UCSD",
    )
    for group in jobGroupList:
        changeState.propagate(group.jobs, "created", "new")
    jobSubmitter.algorithm()

    # Now clean-up: remove this user's condor jobs
    command = ["condor_rm", self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testA_BasicTest(self):
    """
    Use the MockPlugin to create a simple test
    Check to see that all the jobs were "submitted", don't care about thresholds
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 2
    nJobs = 20
    site = 'T2_US_UCSD'

    self.setResourceThresholds(site, pendingSlots=50, runningSlots=100,
                               tasks=['Processing', 'Merge'],
                               Processing={'pendingSlots': 50, 'runningSlots': 100},
                               Merge={'pendingSlots': 50, 'runningSlots': 100})

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        site='se.%s' % site)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Pre-submit sanity: every created job must be visible in WMBS
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    # Every Processing job should have moved from Created to Executing
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # And each should have landed at the single configured site
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    for jobId in result:
        loc = getLocationAction.execute(jobid=jobId)
        self.assertEqual(loc, [['T2_US_UCSD']])

    # Run another cycle, it shouldn't submit anything. There isn't anything to submit
    jobSubmitter.algorithm()
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    nSubs = 1
    nJobs = 10

    # Submit another 10 jobs
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        site='se.%s' % site,
                                        taskType="Merge")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Check that the jobs are available for submission and run another cycle
    result = getJobsAction.execute(state='Created', jobType="Merge")
    self.assertEqual(len(result), nSubs * nJobs)

    jobSubmitter.algorithm()

    # Check that the last 10 jobs were submitted as well.
    result = getJobsAction.execute(state='Created', jobType="Merge")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Executing', jobType="Merge")
    self.assertEqual(len(result), nSubs * nJobs)

    return
def testD_PrototypeChain(self):
    """
    _PrototypeChain_

    Prototype the full BossAir workflow with CondorPlugin:
    submit jobs, poll their status, let them exceed the (short)
    Pending timeout, and verify they all end up as failed jobs.
    """
    # Removed unused myThread/cacheDir locals (dead code).
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest',
                                                                  workloadName),
                                        site='se.T2_US_UCSD')

    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobTracker = JobTrackerPoller(config=config)
    statusPoller = StatusPoller(config=config)

    jobSubmitter.algorithm()

    # Everything should have been submitted and registered as 'New' in BossAir
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nSubs * nJobs)

    # Check WMBS
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    statusPoller.algorithm()

    # Status polling should move every job from 'New' to 'Idle'
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nSubs * nJobs)

    # Tracker should do nothing
    jobTracker.algorithm()

    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Wait for jobs to timeout due to short Pending wait period
    time.sleep(12)

    statusPoller.algorithm()

    # Timed-out jobs move from 'Idle' to 'Timeout' and leave the pool
    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Timeout', complete='0')
    self.assertEqual(len(newJobs), nSubs * nJobs)

    # Jobs should be gone
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Check if they're complete
    completeJobs = baAPI.getComplete()
    self.assertEqual(len(completeJobs), nSubs * nJobs)

    # Because they timed out, they all should have failed
    jobTracker.algorithm()

    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='JobFailed', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    return
def testB_thresholdTest(self):
    """
    _testB_thresholdTest_

    Check that the threshold management is working,
    this requires checks on pending/running jobs globally
    at a site and per task/site
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    sites = ['T1_US_FNAL']
    for site in sites:
        self.setResourceThresholds(site, pendingSlots=50, runningSlots=200,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 45,
                                               'runningSlots': -1},
                                   Merge={'pendingSlots': 10,
                                          'runningSlots': 20,
                                          'priority': 5})

    # Always initialize the submitter after setting the sites, flaky!
    jobSubmitter = JobSubmitterPoller(config=config)

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        site='se.%s' % 'T1_US_FNAL')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Do pre-submit check
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    jobSubmitter.algorithm()

    # Check that jobs are in the right state,
    # here we are limited by the pending threshold for the Processing task (45)
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 45)

    # Check assigned locations
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    for jobId in result:
        loc = getLocationAction.execute(jobid=jobId)
        self.assertEqual(loc, [['T1_US_FNAL']])

    # Run another cycle, it shouldn't submit anything. Jobs are still in pending
    jobSubmitter.algorithm()
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 45)

    # Now put 10 Merge jobs, only 5 can be submitted,
    # there we hit the global pending threshold for the site
    nSubs = 1
    nJobs = 10
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        site='se.%s' % 'T1_US_FNAL',
                                        taskType='Merge')
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    result = getJobsAction.execute(state='Created', jobType="Merge")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Merge")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 5)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 45)

    # Now let's test running thresholds
    # The scenario will be setup as follows: Move all current jobs as running
    # Create 300 Processing jobs and 300 merge jobs
    # Run 5 polling cycles, moving all pending jobs to running in between
    # Result is, merge is left at 25 running 0 pending and processing
    # is left at 215 running 0 pending
    # Processing has 135 jobs in queue and Merge 285
    # This tests all threshold dynamics including the prioritization
    # of merge over processing
    nSubs = 1
    nJobs = 300
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        site='se.%s' % 'T1_US_FNAL')
    jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                             task=workload.getTask("ReReco"),
                                             workloadSpec=os.path.join(self.testDir,
                                                                       'workloadTest', workloadName),
                                             site='se.%s' % 'T1_US_FNAL',
                                             taskType='Merge'))
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
    setRunJobStatus = self.baDaoFactory(classname="SetStatus")

    for _ in range(5):
        result = getJobsAction.execute(state='Executing')
        binds = [{'id': jobId, 'retry_count': 0} for jobId in result]
        runJobIds = getRunJobID.execute(binds)
        setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')
        jobSubmitter.algorithm()

    result = getJobsAction.execute(state='Executing', jobType='Processing')
    self.assertEqual(len(result), 215)
    result = getJobsAction.execute(state='Created', jobType='Processing')
    self.assertEqual(len(result), 135)
    result = getJobsAction.execute(state='Executing', jobType='Merge')
    self.assertEqual(len(result), 25)
    result = getJobsAction.execute(state='Created', jobType='Merge')
    self.assertEqual(len(result), 285)

    return
def testA_StraightThrough(self):
    """
    _StraightThrough_

    Just run everything straight through without any variations:
    JobCreator -> JobSubmitter -> JobTracker -> JobAccountant ->
    JobArchiver -> TaskArchiver, checking job states after each step.
    """
    # Do pre-submit job check
    nRunning = getCondorRunningJobs()
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    myThread = threading.currentThread()
    workload = self.createTestWorkload()
    config = self.getConfig()

    name = 'WMAgent_Test1'
    site = self.sites[0]
    nSubs = 5
    nFiles = 10
    workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                'WMSandbox', 'WMWorkload.pkl')

    # Create a collection of files
    self.createFileCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                              workflowURL=workloadPath, site=site)

    ############################################################
    # Test the JobCreator

    config.Agent.componentName = 'JobCreator'
    testJobCreator = JobCreatorPoller(config=config)

    testJobCreator.algorithm()
    time.sleep(5)

    # Did all jobs get created?
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    # Count database objects
    result = myThread.dbi.processData('SELECT * FROM wmbs_sub_files_acquired')[0].fetchall()
    self.assertEqual(len(result), nSubs * nFiles)

    # Find the test directory
    testDirectory = os.path.join(self.testDir, 'TestWorkload', 'ReReco')
    self.assertTrue('JobCollection_1_0' in os.listdir(testDirectory))
    self.assertTrue(len(os.listdir(testDirectory)) <= 20)

    groupDirectory = os.path.join(testDirectory, 'JobCollection_1_0')

    # First job should be in here
    self.assertTrue('job_1' in os.listdir(groupDirectory))
    jobFile = os.path.join(groupDirectory, 'job_1', 'job.pkl')
    self.assertTrue(os.path.isfile(jobFile))
    # Context manager guarantees the handle is closed even if an assert fails
    with open(jobFile, 'r') as f:
        job = cPickle.load(f)
    self.assertEqual(job['workflow'], name)
    self.assertEqual(len(job['input_files']), 1)
    self.assertEqual(os.path.basename(job['sandbox']),
                     'TestWorkload-Sandbox.tar.bz2')

    ###############################################################
    # Now test the JobSubmitter

    config.Agent.componentName = 'JobSubmitter'
    testJobSubmitter = JobSubmitterPoller(config=config)

    testJobSubmitter.algorithm()

    # Check that jobs are in the right state
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    # Check assigned locations (jobId renamed from 'id': builtin shadowing)
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    for jobId in result:
        loc = getLocationAction.execute(jobid=jobId)
        self.assertEqual(loc, [[site]])

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs()
    self.assertEqual(nRunning, nFiles * nSubs)

    #################################################################
    # Now the JobTracker

    config.Agent.componentName = 'JobTracker'
    testJobTracker = JobTrackerPoller(config=config)
    testJobTracker.setup()

    testJobTracker.algorithm()

    # Running the algo without removing the jobs should do nothing
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    condorRM()
    time.sleep(1)

    # All jobs gone?
    nRunning = getCondorRunningJobs()
    self.assertEqual(nRunning, 0)

    testJobTracker.algorithm()
    time.sleep(5)

    # With the condor jobs removed, the tracker should complete everything
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Complete', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    #################################################################
    # Now the JobAccountant

    # First you need to load all jobs
    self.getFWJRAction = self.daoFactory(classname="Jobs.GetFWJRByState")
    completeJobs = self.getFWJRAction.execute(state="complete")

    # Create reports for all jobs
    self.createReports(jobs=completeJobs, retryCount=0)

    config.Agent.componentName = 'JobAccountant'
    testJobAccountant = JobAccountantPoller(config=config)
    testJobAccountant.setup()

    # It should do something with the jobs
    testJobAccountant.algorithm()

    # All the jobs should be done now
    result = getJobsAction.execute(state='Complete', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Success', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    #######################################################################
    # Now the JobArchiver

    config.Agent.componentName = 'JobArchiver'
    testJobArchiver = JobArchiverPoller(config=config)

    testJobArchiver.algorithm()

    # All the jobs should be cleaned up
    result = getJobsAction.execute(state='Success', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Cleanout', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    logDir = os.path.join(self.testDir, 'logs')

    # FWJRs are gone; each job log is tarred into its cluster folder
    for job in completeJobs:
        self.assertFalse(os.path.exists(job['fwjr_path']))
        jobFolder = 'JobCluster_%i' \
            % (int(job['id'] / config.JobArchiver.numberOfJobsToCluster))
        jobPath = os.path.join(logDir, jobFolder, 'Job_%i.tar' % (job['id']))
        self.assertTrue(os.path.isfile(jobPath))
        self.assertTrue(os.path.getsize(jobPath) > 0)

    ###########################################################################
    # Now the TaskArchiver

    config.Agent.componentName = 'TaskArchiver'
    testTaskArchiver = TaskArchiverPoller(config=config)

    testTaskArchiver.algorithm()

    result = getJobsAction.execute(state='Cleanout', jobType="Processing")
    self.assertEqual(len(result), 0)

    for jdict in completeJobs:
        job = Job(id=jdict['id'])
        self.assertFalse(job.exists())

    # Keep a copy of the working area for post-mortem inspection
    if os.path.isdir('testDir'):
        shutil.rmtree('testDir')
    shutil.copytree(self.testDir, os.path.join(os.getcwd(), 'testDir'))

    return
def testD_WhiteListBlackList(self):
    """
    _testD_WhiteListBlackList_

    Test the whitelist/blacklist implementation
    Trust the jobCreator to get this in the job right
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 2
    nJobs = 10

    sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T2_CH_CERN', 'T3_CO_Uniandes']
    for site in sites:
        self.setResourceThresholds(site, pendingSlots=1000, runningSlots=-1,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 5000,
                                               'runningSlots': -1},
                                   Merge={'pendingSlots': 1000,
                                          'runningSlots': -1,
                                          'priority': 5})

    # Blacklist every site except the last one
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        site='se.%s' % sites[-1],
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        bl=sites[:-1])
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)

    # Actually run it
    jobSubmitter.algorithm()

    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # All jobs should be at T3_CO_Uniandes
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    locationDict = getLocationAction.execute([{'jobid': x} for x in result])
    for entry in locationDict:
        self.assertEqual(entry['site_name'], 'T3_CO_Uniandes')

    # Run again and test the whiteList
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        site='se.%s' % 'T2_CH_CERN',
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        wl=['T2_CH_CERN'])
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Run it
    jobSubmitter.algorithm()

    # You'll have jobs from the previous run still in the database
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)

    # All jobs should be at CERN or Uniandes
    locationDict = getLocationAction.execute([{'jobid': x} for x in result])
    for entry in locationDict[nSubs * nJobs:]:
        self.assertEqual(entry['site_name'], 'T2_CH_CERN')

    # Run again with an invalid whitelist
    # After this point, the original two sets of jobs will be executing
    # The rest of the jobs should move to submitFailed
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        site='se.%s' % 'T2_CH_CERN',
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        wl=['T2_US_Namibia'])
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    # Jobs should be gone
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    result = getJobsAction.execute(state='SubmitFailed', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs)

    # Run again with all sites blacklisted
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        site=['se.%s' % x for x in sites],
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest', workloadName),
                                        bl=sites)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter.algorithm()

    # Jobs should go to submit failed
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)
    result = getJobsAction.execute(state='SubmitFailed', jobType="Processing")
    self.assertEqual(len(result), nSubs * nJobs * 2)

    return