def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states and perform
    proper actions with the jobs, according to the state
    """
    timeNow = int(time.time())
    state2ExitCode = {"Aborted": 71301,
                      "Draining": 71302,
                      "Down": 71303}
    executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
    jobInfo = executingJobs.execute(state='executing')
    if jobInfo:
        bossAir = BossAirAPI(self.config)
        jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state in state2ExitCode)
        ercode = state2ExitCode.get(state, 71300)
        bossAir.kill(jobtokill, errorCode=ercode)

    # only now that jobs were updated by the plugin, we flip the site state
    setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
    setStateAction.execute(siteName=siteName, state=state, stateTime=timeNow,
                           conn=self.getDBConn(),
                           transaction=self.existingTransaction())
    return

def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states and perform
    proper actions with the jobs, according to the state
    """
    state2ExitCode = {"Aborted": 71301,
                      "Draining": 71302,
                      "Down": 71303}
    executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
    jobInfo = executingJobs.execute(state='executing')
    if jobInfo:
        bossAir = BossAirAPI(self.config, noSetup=True)
        jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state in state2ExitCode)
        ercode = state2ExitCode.get(state, 71300)
        bossAir.kill(jobtokill, errorCode=ercode)

    # only now that jobs were updated by the plugin, we flip the site state
    setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
    setStateAction.execute(siteName=siteName, state=state,
                           conn=self.getDBConn(),
                           transaction=self.existingTransaction())
    return

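# Illustrative sketch (not part of the original source): how the state-to-exit-code
# mapping above plays out when an operator drains a site. The caller name
# "resourceControl" is hypothetical; the error codes come from the state2ExitCode
# map in changeSiteState.
#
#   resourceControl.changeSiteState("T2_US_UCSD", "Draining")
#
# With state="Draining", `state in state2ExitCode` is True, so the BossAir plugin
# is asked to update the jobs bound for that site, any jobs it reports back are
# killed with errorCode=71302, and only then is Locations.SetState executed to
# flip the site state.
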
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states,
    if the state is Aborted we must do extra actions.
    """
    setStateAction = self.wmbsDAOFactory(classname = "Locations.SetState")
    setStateAction.execute(siteName = siteName, state = state,
                           conn = self.getDBConn(),
                           transaction = self.existingTransaction())

    executingJobs = self.wmbsDAOFactory(classname = "Jobs.ListByState")
    jobInfo = executingJobs.execute(state = 'executing')
    if not jobInfo:
        # then no jobs to look at
        return

    bossAir = BossAirAPI(self.config, noSetup = True)
    jobtokill = bossAir.updateSiteInformation(jobInfo, siteName,
                                              state in ("Aborted", "Draining", "Down"))
    if state == "Aborted":
        ercode = 71301
    elif state == "Draining":
        ercode = 71302
    elif state == "Down":
        ercode = 71303
    else:
        ercode = 71300
    bossAir.kill(jobtokill, errorCode=ercode)
    return

def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states,
    if the state is Aborted we must do extra actions.
    """
    setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
    setStateAction.execute(siteName=siteName, state=state,
                           conn=self.getDBConn(),
                           transaction=self.existingTransaction())

    executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
    jobInfo = executingJobs.execute(state='executing')
    if not jobInfo:
        # then no jobs to look at
        return

    bossAir = BossAirAPI(self.config, noSetup=True)
    jobtokill = bossAir.updateSiteInformation(
        jobInfo, siteName, state in ("Aborted", "Draining", "Down"))
    if state == "Aborted":
        ercode = 71301
    elif state == "Draining":
        ercode = 71302
    elif state == "Down":
        ercode = 71303
    else:
        ercode = 71300
    bossAir.kill(jobtokill, errorCode=ercode)
    return

def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn,
                            transaction=True)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs=killableJobs)
        except BossAirException, ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass

def killWorkflow(workflowName, jobCouchConfig, bossAirConfig = None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package = "WMCore.WMBS", logger = myThread.logger,
                            dbinterface = myThread.dbi)
    killFilesAction = daoFactory(classname = "Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname = "Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName = workflowName,
                            conn = myThread.transaction.conn,
                            transaction = True)

    liveJobs = killJobsAction.execute(workflowName = workflowName,
                                      conn = myThread.transaction.conn,
                                      transaction = True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config = bossAirConfig, noSetup = True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id = liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs = killableJobs)
        except BossAirException, ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass

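# Illustrative sketch (not in the original source): how killWorkflow is typically
# invoked from an agent component. The workflow name is a placeholder, and the
# assumption that a single agent Configuration object carries both the
# JobStateMachine (couch) and BossAir sections is mine; the function itself only
# requires the couch config for ChangeState and, optionally, a BossAir config so
# jobs still in the batch system can be killed as well.
#
#   # `config` is assumed to be the WMAgent Configuration object
#   killWorkflow("SomeWorkflowName",
#                jobCouchConfig=config,
#                bossAirConfig=config)
#
# Passing bossAirConfig=None skips the batch-system kill and only performs the
# WMBS/couch state transitions.
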
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from PyCondorPlugin.py
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    cacheDir = os.path.join(self.testDir, 'CacheDir')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    ##
    # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
    # updateSiteInformation() method should edit the classAd for all the jobs
    # that are bound for the site
    # Check the Q manually using condor_q -l <job id>
    #
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    if jtok != None:
        baAPI.kill(jtok, errorCode=61301)  # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)

    return

def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10
    cacheDir = os.path.join(self.testDir, 'CacheDir')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    baAPI.kill(jobs=idleJobs)

    del jobSubmitter

    return

def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config = config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10
    cacheDir = os.path.join(self.testDir, 'CacheDir')
    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site = None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config = config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status = 'Idle')

    baAPI.kill(jobs = idleJobs)

    del jobSubmitter

    return

def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from CondorPlugin.py
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    dummycacheDir = os.path.join(self.testDir, 'CacheDir')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    ##
    # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
    # updateSiteInformation() method should edit the classAd for all the jobs
    # that are bound for the site
    # Check the Q manually using condor_q -l <job id>
    #
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    if jtok != None:
        baAPI.kill(jtok, errorCode=71301)  # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)

    return

def changeSiteState(self, siteName, state):
    """
    _changeSiteState_

    Set a site to some of the possible states,
    if the state is Aborted we must do extra actions.
    """
    setStateAction = self.wmbsDAOFactory(classname = "Locations.SetState")
    setStateAction.execute(siteName = siteName, state = state,
                           conn = self.getDBConn(),
                           transaction = self.existingTransaction())

    if state == "Aborted" and self.config:
        # Kill all jobs in the batch system assigned to this site
        executingJobs = self.wmbsDAOFactory(classname = "Jobs.ListByStateAndLocation")
        jobIds = executingJobs.execute(state = 'executing', location = siteName)
        bossAir = BossAirAPI(self.config, noSetup = True)
        bossAir.kill(jobIds, errorCode = 61301)

    return

def testH_ARCTest(self):
    """
    _ARCTest_

    This test works on the ARCPlugin, checking all of
    its functions with a single set of jobs
    """
    nRunning = getNArcJobs()
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginNames.append("ARCPlugin")
    #config.BossAir.pluginNames = ["ARCPlugin"]

    baAPI = BossAirAPI(config = config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi')
    #baAPI.createNewJobs(wmbsJobs = jobDummies)
    #changeState = ChangeState(config)
    #changeState.propagate(jobDummies, 'created', 'new')
    #changeState.propagate(jobDummies, 'executing', 'created')

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'jade-cms.hip.fi'}
        job['location'] = 'jade-cms.hip.fi'
        job['plugin'] = 'ARCPlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = 'edelmann'
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs = jobList)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    rJobs = baAPI._listRunJobs()
    nOldJobs = 0
    for j in rJobs:
        if j['status'] != "New":
            nOldJobs += 1
    self.assertEqual(nOldJobs, nJobs)
    #if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in [ "Pending", "Running" ]:

    baAPI.kill(jobs = jobList)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    succ, fail = baAPI.submit(jobs = jobList)

    time.sleep(30)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    rJobs = baAPI._listRunJobs()
    nOldJobs = 0
    idStr = ""
    for j in rJobs:
        idStr += " " + j['gridid']
        if j['status'] != "New":
            nOldJobs += 1
    self.assertEqual(nOldJobs, nJobs)

    # Now kill 'em manually
    no_jobs = True
    while no_jobs:
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
        pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
        output = pipe.communicate()[0]
        if output.find("Job information not found") >= 0:
            # It seems the jobs haven't reached the ARC info.sys yet.
            # Sleep a while and try again
            time.sleep(20)
            continue
        else:
            no_jobs = False

    # Just to be sure, if the jobs were already finished, do a
    # 'ngclean' too.
    command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
    pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
    output = pipe.communicate()[0]

    # Make sure the killing of the jobs reaches the info.sys.
    still_jobs = True
    while still_jobs:
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
        pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
        output = pipe.communicate()[0]
        if output.find("Job information not found") < 0:
            # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
            # Sleep a while and try again
            time.sleep(20)
            continue
        else:
            still_jobs = False

    # See what happened
    baAPI.track()

    idJobs = baAPI._loadByID(rJobs)
    nActiveJobs = 0
    nRemovedJobs = 0
    for j in idJobs:
        if j['status'] not in [ "New", "KILLING", "KILLED", "LOST" ]:
            nActiveJobs += 1
        if j['status'] in [ "KILLING", "KILLED", "LOST" ]:
            nRemovedJobs += 1

    self.assertEqual(nActiveJobs, 0)
    self.assertEqual(nRemovedJobs, nJobs)

    return

def testG_gLiteTest(self):
    """
    _gLiteTest_

    This test works on the gLitePlugin, checking all of
    its functions with a single set of jobs
    """
    config = self.getConfig()
    config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'
    config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
    config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
    config.BossAir.gLiteProcesses = 2
    config.BossAir.gLitePrefixEnv = "/lib64/"
    config.BossAir.pluginNames.append("gLitePlugin")
    config.BossAir.manualProxyPath = environ['X509_USER_PROXY']
    config.Agent.serverDN = "/we/bypass/myproxy/logon"
    #config.BossAir.pluginNames = ["gLitePlugin"]

    baAPI = BossAirAPI(config = config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'grid-ce-01.ba.infn.it')

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
    newuser = self.daoFactory(classname = "Users.New")
    newuser.execute(dn = userdn)
    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
        job['location'] = 'grid-ce-01.ba.infn.it'
        job['plugin'] = 'gLitePlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = userdn
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs = jobList)

    # Should be new jobs
    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertNotEqual(len(newJobs), nJobs)

    time.sleep(2)
    baAPI.track()

    # Should be not anymore marked as new
    newJobs = baAPI._loadByStatus('New', 0)
    self.assertNotEqual(len(newJobs), nJobs)

    # Killing all the jobs
    baAPI.kill( jobList )
    #time.sleep(15)
    baAPI.track()

    ## Issues running tests below due to glite delay on marking job as killed
    # Should be just running jobs
    #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
    #self.assertEqual(len(killedJobs), 0)

    # Check if they're complete
    #completeJobs = baAPI.getComplete()
    #self.assertEqual(len(completeJobs), nJobs)

    return

def testC_CondorTest(self):
    """
    _CondorTest_

    This test works on the CondorPlugin, checking all of
    its functions with a single set of jobs
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    config = self.getConfig()
    config.BossAir.removeTime = -10.0

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs = nJobs)

    baAPI = BossAirAPI(config = config)

    print self.testDir

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'CondorPlugin'
        tmpJob['owner'] = 'tapas'
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        tmpJob['usergroup'] = "wheel"
        tmpJob['userrole'] = 'cmsuser'
        jobList.append(tmpJob)

    info = {}
    #info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs = jobList, info = info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status = 'Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Do a second time to make sure that the cache
    # doesn't die on us
    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status = 'Idle')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.kill(jobs = jobList)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    baAPI.submit(jobs = jobList, info = info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status = 'Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
    pipe.communicate()

    # See what happened
    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'Idle')
    self.assertEqual(len(newJobs), 0)

    #newJobs = baAPI._loadByStatus(status = 'Removed')
    #self.assertEqual(len(newJobs), nJobs)

    # Because removal time is -10.0, jobs should remove immediately
    baAPI.track()

    # Assert that jobs were listed as completed
    myThread = threading.currentThread()
    newJobs = baAPI._loadByStatus(status = 'Removed', complete = '0')
    self.assertEqual(len(newJobs), nJobs)

    return

def testG_gLiteTest(self):
    """
    _gLiteTest_

    This test works on the gLitePlugin, checking all of
    its functions with a single set of jobs
    """
    config = self.getConfig()
    config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
    config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
    config.BossAir.gLiteProcesses = 2
    config.BossAir.gLitePrefixEnv = "/lib64/"
    config.BossAir.pluginNames.append("gLitePlugin")
    config.BossAir.manualProxyPath = environ['X509_USER_PROXY']
    config.Agent.serverDN = "/we/bypass/myproxy/logon"
    #config.BossAir.pluginNames = ["gLitePlugin"]

    baAPI = BossAirAPI(config=config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs=nJobs, location='grid-ce-01.ba.infn.it')

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
    newuser = self.daoFactory(classname="Users.New")
    newuser.execute(dn=userdn)
    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
        job['location'] = 'grid-ce-01.ba.infn.it'
        job['plugin'] = 'gLitePlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = userdn
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs=jobList)

    # Should be new jobs
    newJobs = baAPI._loadByStatus(status='New')
    self.assertNotEqual(len(newJobs), nJobs)

    time.sleep(2)
    baAPI.track()

    # Should be not anymore marked as new
    newJobs = baAPI._loadByStatus('New', 0)
    self.assertNotEqual(len(newJobs), nJobs)

    # Killing all the jobs
    baAPI.kill(jobList)
    #time.sleep(15)
    baAPI.track()

    ## Issues running tests below due to glite delay on marking job as killed
    # Should be just running jobs
    #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
    #self.assertEqual(len(killedJobs), 0)

    # Check if they're complete
    #completeJobs = baAPI.getComplete()
    #self.assertEqual(len(completeJobs), nJobs)

    return

class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """
        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info("Killing job %i because it has exceeded timeout for status '%s'",
                                 job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill.append(job)

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[71304], errorCode=71304)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs), workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are too many jobs skip the couch and dashboard update
            # TODO: couch and dashboard need to be updated or parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return

class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')
        return

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """
        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = defaultdict(list)

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info("Killing job %i because it has exceeded timeout for status '%s'",
                                 job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill[globalState].append(job)

        timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306}

        # We need to show that the jobs are in state timeout
        # and then kill them.
        jobsToKillList = flattenList(jobsToKill.values())
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKillList)
        for preJobStatus in jobsToKill:
            eCode = timeOutCodeMap.get(preJobStatus, 71307)  # it shouldn't have 71307 (states should be among Running, Pending, Error)
            self.bossAir.kill(jobs=jobsToKill[preJobStatus], killMsg=WM_JOB_ERROR_CODES[eCode], errorCode=eCode)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

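# Illustrative sketch (not in the original source): the shape of the dictionary
# StatusPoller reads from config.JobStatusLite.stateTimeouts. Keys are the global
# job states the poller checks, values are timeouts in seconds; the numbers below
# are placeholder values, not recommended production settings. The error codes in
# the comments follow the timeOutCodeMap used in checkStatus above.
#
#   config.JobStatusLite.stateTimeouts = {'Running': 48 * 3600,  # -> errorCode 71304
#                                         'Pending': 72 * 3600,  # -> errorCode 71305
#                                         'Error': 1800}         # -> errorCode 71306
#
# An empty or missing dictionary disables the timeout checks entirely, since
# checkStatus() returns early when self.timeouts is falsy.
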
def testC_CondorTest(self):
    """
    _CondorTest_

    This test works on the SimpleCondorPlugin, checking all of
    its functions with a single set of jobs
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    config = self.getConfig()
    config.BossAir.removeTime = -10.0

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs=nJobs)

    baAPI = BossAirAPI(config=config, insertStates=True)

    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'SimpleCondorPlugin'
        tmpJob['owner'] = 'tapas'
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        tmpJob['usergroup'] = "wheel"
        tmpJob['userrole'] = 'cmsuser'
        jobList.append(tmpJob)

    info = {}
    # info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Do a second time to make sure that the cache
    # doesn't die on us
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.kill(jobs=jobList)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # See what happened
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), 0)

    # newJobs = baAPI._loadByStatus(status = 'Removed')
    # self.assertEqual(len(newJobs), nJobs)

    # Because removal time is -10.0, jobs should remove immediately
    baAPI.track()

    # Assert that jobs were listed as completed
    myThread = threading.currentThread()
    newJobs = baAPI._loadByStatus(status='Removed', complete='0')
    self.assertEqual(len(newJobs), nJobs)

    return

def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs), workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are too many jobs skip the couch and dashboard update
            # TODO: couch and dashboard need to be updated or parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return

class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts', {})

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """
        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if self.timeouts == {}:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero" % job['id'])
                continue
            if timeout != None and statusTime != None:
                if time.time() - float(statusTime) > float(timeout):
                    # Then the job needs to be killed.
                    logging.info("Killing job %i because it has exceeded timeout for status %s"
                                 % (job['id'], globalState))
                    job['status'] = 'Timeout'
                    jobsToKill.append(job)

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[61304], errorCode=61304)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)