def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"

    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10
    cacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest',
                                                                  workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    sn = "T2_US_UCSD"

    # Test that the site info has been updated. Make sure T2_US_UCSD is not
    # in the sitelist in BossAir_t.py.
    baAPI.updateSiteInformation(idleJobs, sn, True)

    # Now kill 'em manually
    # command = ['condor_rm', self.user]
    # pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    # pipe.communicate()

    del jobSubmitter
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"

    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10
    cacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest',
                                                                  workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    baAPI.kill(jobs=idleJobs)

    del jobSubmitter
    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from PyCondorPlugin.py.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"

    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    cacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest',
                                                                  workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    # Mark one of the sites in the sitelist as ABORTED/DRAINING/DOWN.
    # The updateSiteInformation() method should edit the classAd for all
    # the jobs that are bound for the site.
    # Check the queue manually using: condor_q -l <job id>
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    if jtok is not None:
        # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)
        baAPI.kill(jtok, errorCode=61301)

    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from CondorPlugin.py.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"

    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    dummycacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest',
                                                                  workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    # Mark one of the sites in the sitelist as ABORTED/DRAINING/DOWN.
    # The updateSiteInformation() method should edit the classAd for all
    # the jobs that are bound for the site.
    # Check the queue manually using: condor_q -l <job id>
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    if jtok is not None:
        # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)
        baAPI.kill(jtok, errorCode=71301)

    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from CondorPlugin.py.
    """
    nRunning = getCondorRunningJobs(self.user)

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'

    baAPI = BossAirAPI(config=config)
    baAPI.track()

    idleJobs = baAPI._loadByStatus(status='Idle')
    print(idleJobs)
    for job in idleJobs:
        print(job['id'])

    baAPI.updateSiteInformation(idleJobs, info=None)
    return
def testC_CondorTest(self):
    """
    _CondorTest_

    This test works on the CondorPlugin, checking all of its functions
    with a single set of jobs.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    config = self.getConfig()
    config.BossAir.removeTime = -10.0

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs=nJobs)

    baAPI = BossAirAPI(config=config)

    print(self.testDir)
    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'CondorPlugin'
        tmpJob['owner'] = 'tapas'
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        tmpJob['usergroup'] = "wheel"
        tmpJob['userrole'] = 'cmsuser'
        jobList.append(tmpJob)

    info = {}
    # info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Do a second time to make sure that the cache
    # doesn't die on us
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.kill(jobs=jobList)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # See what happened
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), 0)

    # newJobs = baAPI._loadByStatus(status='Removed')
    # self.assertEqual(len(newJobs), nJobs)

    # Because removal time is -10.0, jobs should remove immediately
    baAPI.track()

    # Assert that jobs were listed as completed
    myThread = threading.currentThread()
    newJobs = baAPI._loadByStatus(status='Removed', complete='0')
    self.assertEqual(len(newJobs), nJobs)

    return
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens.
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """
        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info("Killing job %i because it has exceeded timeout for status '%s'",
                                 job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill.append(job)

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[71304], errorCode=71304)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
def testG_gLiteTest(self):
    """
    _gLiteTest_

    This test works on the gLitePlugin, checking all of its functions
    with a single set of jobs.
    """
    config = self.getConfig()
    config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
    config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
    config.BossAir.gLiteProcesses = 2
    config.BossAir.gLitePrefixEnv = "/lib64/"
    config.BossAir.pluginNames.append("gLitePlugin")
    config.BossAir.manualProxyPath = environ['X509_USER_PROXY']
    config.Agent.serverDN = "/we/bypass/myproxy/logon"
    # config.BossAir.pluginNames = ["gLitePlugin"]

    baAPI = BossAirAPI(config=config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs=nJobs, location='grid-ce-01.ba.infn.it')

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []

    userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
    newuser = self.daoFactory(classname="Users.New")
    newuser.execute(dn=userdn)

    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
        job['location'] = 'grid-ce-01.ba.infn.it'
        job['plugin'] = 'gLitePlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = userdn
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs=jobList)

    # Should be new jobs
    newJobs = baAPI._loadByStatus(status='New')
    self.assertNotEqual(len(newJobs), nJobs)

    time.sleep(2)
    baAPI.track()

    # Should no longer be marked as new
    newJobs = baAPI._loadByStatus('New', 0)
    self.assertNotEqual(len(newJobs), nJobs)

    # Killing all the jobs
    baAPI.kill(jobList)
    # time.sleep(15)
    baAPI.track()

    # Issues running the tests below due to the gLite delay in marking jobs as killed

    # Should be just running jobs
    # killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
    # self.assertEqual(len(killedJobs), 0)

    # Check if they're complete
    # completeJobs = baAPI.getComplete()
    # self.assertEqual(len(completeJobs), nJobs)

    return
def testB_PluginTest(self):
    """
    _PluginTest_

    Now check that these functions work when called through plugins
    instead of directly. There are only three plugins.
    """
    # return
    myThread = threading.currentThread()

    config = self.getConfig()

    baAPI = BossAirAPI(config=config)

    # Create some jobs
    nJobs = 10

    jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')
    changeState = ChangeState(config)
    changeState.propagate(jobDummies, 'created', 'new')
    changeState.propagate(jobDummies, 'executing', 'created')

    # Prior to building the job, each job must have a plugin
    # and user assigned
    for job in jobDummies:
        job['plugin'] = 'TestPlugin'
        job['owner'] = 'tapas'

    baAPI.submit(jobs=jobDummies)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    # Should be running jobs for all of them
    runningJobs = baAPI._listRunJobs()
    self.assertEqual(len(runningJobs), nJobs)

    # Test Plugin should complete all jobs
    baAPI.track()

    # Should be no more running jobs
    runningJobs = baAPI._listRunJobs()
    self.assertEqual(len(runningJobs), 0)

    # Check if they're complete
    completeJobs = baAPI.getComplete()
    self.assertEqual(len(completeJobs), nJobs)

    # Do this test because BossAir is specifically built
    # to keep it from finding completed jobs
    result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
    self.assertEqual(len(result), nJobs)

    baAPI.removeComplete(jobs=jobDummies)

    result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
    self.assertEqual(len(result), 0)

    return
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens.
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')
        return

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """
        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = defaultdict(list)

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info("Killing job %i because it has exceeded timeout for status '%s'",
                                 job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill[globalState].append(job)

        timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306}

        # We need to show that the jobs are in state timeout
        # and then kill them.
        jobsToKillList = flattenList(jobsToKill.values())
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKillList)
        for preJobStatus in jobsToKill:
            # it shouldn't have 71307 (states should be among Running, Pending, Error)
            eCode = timeOutCodeMap.get(preJobStatus, 71307)
            self.bossAir.kill(jobs=jobsToKill[preJobStatus],
                              killMsg=WM_JOB_ERROR_CODES[eCode], errorCode=eCode)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
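A minimal sketch of the flattenList helper that checkStatus() calls above. In WMCore it is presumably imported from a shared utility module; this standalone version is an assumption for illustration, not the project's actual implementation:

# Hypothetical stand-in for the flattenList helper used by checkStatus().
def flattenList(listOfLists):
    """Flatten one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3]."""
    return [item for sublist in listOfLists for item in sublist]

checkStatus() uses it to collapse the per-state defaultdict of timed-out jobs into a single list for bossAir.update(), before killing each state group with its own error code.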
def testG_gLiteTest(self):
    """
    _gLiteTest_

    This test works on the gLitePlugin, checking all of its functions
    with a single set of jobs.
    """
    config = self.getConfig()
    config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'
    config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
    config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
    config.BossAir.gLiteProcesses = 2
    config.BossAir.gLitePrefixEnv = "/lib64/"
    config.BossAir.pluginNames.append("gLitePlugin")
    config.BossAir.manualProxyPath = environ['X509_USER_PROXY']
    config.Agent.serverDN = "/we/bypass/myproxy/logon"
    # config.BossAir.pluginNames = ["gLitePlugin"]

    baAPI = BossAirAPI(config=config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs=nJobs, location='grid-ce-01.ba.infn.it')

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []

    userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
    newuser = self.daoFactory(classname="Users.New")
    newuser.execute(dn=userdn)

    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
        job['location'] = 'grid-ce-01.ba.infn.it'
        job['plugin'] = 'gLitePlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = userdn
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs=jobList)

    # Should be new jobs
    newJobs = baAPI._loadByStatus(status='New')
    self.assertNotEqual(len(newJobs), nJobs)

    time.sleep(2)
    baAPI.track()

    # Should no longer be marked as new
    newJobs = baAPI._loadByStatus('New', 0)
    self.assertNotEqual(len(newJobs), nJobs)

    # Killing all the jobs
    baAPI.kill(jobList)
    # time.sleep(15)
    baAPI.track()

    # Issues running the tests below due to the gLite delay in marking jobs as killed

    # Should be just running jobs
    # killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
    # self.assertEqual(len(killedJobs), 0)

    # Check if they're complete
    # completeJobs = baAPI.getComplete()
    # self.assertEqual(len(completeJobs), nJobs)

    return
def testH_ARCTest(self):
    """
    _ARCTest_

    This test works on the ARCPlugin, checking all of its functions
    with a single set of jobs.
    """
    nRunning = getNArcJobs()
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginNames.append("ARCPlugin")
    # config.BossAir.pluginNames = ["ARCPlugin"]

    baAPI = BossAirAPI(config=config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs=nJobs, location='jade-cms.hip.fi')
    # baAPI.createNewJobs(wmbsJobs=jobDummies)
    # changeState = ChangeState(config)
    # changeState.propagate(jobDummies, 'created', 'new')
    # changeState.propagate(jobDummies, 'executing', 'created')

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'jade-cms.hip.fi'}
        job['location'] = 'jade-cms.hip.fi'
        job['plugin'] = 'ARCPlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = 'edelmann'
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs=jobList)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    rJobs = baAPI._listRunJobs()
    nOldJobs = 0
    for j in rJobs:
        if j['status'] != "New":
            nOldJobs += 1
    self.assertEqual(nOldJobs, nJobs)
    # if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in ["Pending", "Running"]:

    baAPI.kill(jobs=jobList)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    succ, fail = baAPI.submit(jobs=jobList)

    time.sleep(30)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    rJobs = baAPI._listRunJobs()
    nOldJobs = 0
    idStr = ""
    for j in rJobs:
        idStr += " " + j['gridid']
        if j['status'] != "New":
            nOldJobs += 1
    self.assertEqual(nOldJobs, nJobs)

    # Now kill 'em manually
    no_jobs = True
    while no_jobs:
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
        pipe = Popen(command, stdout=PIPE, stderr=STDOUT, shell=True)
        output = pipe.communicate()[0]

        if output.find("Job information not found") >= 0:
            # It seems the jobs haven't reached the ARC info.sys yet.
            # Sleep a while and try again.
            time.sleep(20)
            continue
        else:
            no_jobs = False

        # Just to be sure, if the jobs were already finished, do an
        # 'ngclean' too.
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
        pipe = Popen(command, stdout=PIPE, stderr=STDOUT, shell=True)
        output = pipe.communicate()[0]

    # Make sure the killing of the jobs reaches the info.sys.
    still_jobs = True
    while still_jobs:
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
        pipe = Popen(command, stdout=PIPE, stderr=STDOUT, shell=True)
        output = pipe.communicate()[0]

        if output.find("Job information not found") < 0:
            # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
            # Sleep a while and try again.
            time.sleep(20)
            continue
        else:
            still_jobs = False

    # See what happened
    baAPI.track()

    idJobs = baAPI._loadByID(rJobs)
    nActiveJobs = 0
    nRemovedJobs = 0
    for j in idJobs:
        if j['status'] not in ["New", "KILLING", "KILLED", "LOST"]:
            nActiveJobs += 1
        if j['status'] in ["KILLING", "KILLED", "LOST"]:
            nRemovedJobs += 1

    self.assertEqual(nActiveJobs, 0)
    self.assertEqual(nRemovedJobs, nJobs)

    return
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens.
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts', {})

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """
        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if self.timeouts == {}:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero" % job['id'])
                continue
            if timeout is not None and statusTime is not None:
                if time.time() - float(statusTime) > float(timeout):
                    # Then the job needs to be killed.
                    logging.info("Killing job %i because it has exceeded timeout for status %s"
                                 % (job['id'], globalState))
                    job['status'] = 'Timeout'
                    jobsToKill.append(job)

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[61304], errorCode=61304)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
def testC_CondorTest(self):
    """
    _CondorTest_

    This test works on the SimpleCondorPlugin, checking all of its functions
    with a single set of jobs.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    config = self.getConfig()
    config.BossAir.removeTime = -10.0

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs=nJobs)

    baAPI = BossAirAPI(config=config, insertStates=True)

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'SimpleCondorPlugin'
        tmpJob['owner'] = 'tapas'
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        tmpJob['usergroup'] = "wheel"
        tmpJob['userrole'] = 'cmsuser'
        jobList.append(tmpJob)

    info = {}
    # info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Do a second time to make sure that the cache
    # doesn't die on us
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.kill(jobs=jobList)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # See what happened
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), 0)

    # newJobs = baAPI._loadByStatus(status='Removed')
    # self.assertEqual(len(newJobs), nJobs)

    # Because removal time is -10.0, jobs should remove immediately
    baAPI.track()

    # Assert that jobs were listed as completed
    myThread = threading.currentThread()
    newJobs = baAPI._loadByStatus(status='Removed', complete='0')
    self.assertEqual(len(newJobs), nJobs)

    return
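The Condor-backed tests above all gate on getCondorRunningJobs(self.user). A hedged sketch of such a helper, assuming it simply counts the user's jobs from the condor_q summary line; the real helper in the test module may parse the output differently:

# Hypothetical stand-in for the getCondorRunningJobs helper used above.
from subprocess import PIPE, Popen

def getCondorRunningJobs(user):
    """Return the number of jobs condor_q currently reports for `user`."""
    pipe = Popen(['condor_q', user], stdout=PIPE, stderr=PIPE, shell=False)
    stdout, _ = pipe.communicate()
    # condor_q ends its listing with a summary line such as:
    #   "10 jobs; 0 completed, 0 removed, 8 idle, 2 running, 0 held, 0 suspended"
    for line in reversed(stdout.decode('utf-8', 'replace').splitlines()):
        if 'jobs;' in line:
            return int(line.split()[0])
    return 0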