def testG_monitoringDAO(self):
    """
    _monitoringDAO_

    Exercise the monitoring DAO: submit a batch of jobs through the
    TestPlugin and verify that monitor() reports them all as Pending.
    """
    _ = threading.currentThread()

    config = self.getConfig()

    baAPI = BossAirAPI(config=config, insertStates=True)

    # Build a batch of dummy jobs; each one needs a plugin, owner and
    # location assigned before it can be submitted.
    jobCount = 10
    dummyJobs = self.createDummyJobs(nJobs=jobCount)
    for dummyJob in dummyJobs:
        dummyJob['plugin'] = 'TestPlugin'
        dummyJob['owner'] = 'tapas'
        dummyJob['location'] = 'T2_US_UCSD'
        dummyJob.save()

    baAPI.submit(jobs=dummyJobs)

    # Every submitted job should show up as Pending in the monitor output.
    results = baAPI.monitor()
    self.assertEqual(results[0]['Pending'], jobCount)

    return
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_

    Set a site to one of the possible states and perform the proper
    actions on its jobs: jobs executing at a site entering
    Aborted/Draining/Down are killed with a state-specific exit code
    before the site state is flipped in the database.
    """
    now = int(time.time())
    exitCodeMap = {"Aborted": 71301, "Draining": 71302, "Down": 71303}

    listExecuting = self.wmbsDAOFactory(classname="Jobs.ListByState")
    executing = listExecuting.execute(state='executing')
    if executing:
        bossAir = BossAirAPI(self.config)
        killable = bossAir.updateSiteInformation(executing, siteName,
                                                 state in exitCodeMap)
        bossAir.kill(killable, errorCode=exitCodeMap.get(state, 71300))

    # Only after the plugin has updated the jobs do we flip the site
    # state in the database.
    setState = self.wmbsDAOFactory(classname="Locations.SetState")
    setState.execute(siteName=siteName, state=state,
                     stateTime=now,
                     conn=self.getDBConn(),
                     transaction=self.existingTransaction())
    return
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_

    Set a site to one of the possible states; if the state is one of
    Aborted/Draining/Down we must additionally kill the jobs executing
    there, with a state-specific error code.
    """
    setState = self.wmbsDAOFactory(classname="Locations.SetState")
    setState.execute(siteName=siteName, state=state,
                     conn=self.getDBConn(),
                     transaction=self.existingTransaction())

    listExecuting = self.wmbsDAOFactory(classname="Jobs.ListByState")
    executing = listExecuting.execute(state='executing')
    if not executing:
        # then no jobs to look at
        return

    bossAir = BossAirAPI(self.config, noSetup=True)
    # Same membership and codes as the original if/elif chain, expressed
    # as a mapping with 71300 as the fallback code.
    badStates = {"Aborted": 71301, "Draining": 71302, "Down": 71303}
    killable = bossAir.updateSiteInformation(executing, siteName,
                                             state in badStates)
    bossAir.kill(killable, errorCode=badStates.get(state, 71300))
    return
def __init__(self, config):
    """
    __init__

    Set up the DAO factory, BossAir API, caching structures and the
    on-disk package directory used by the JobSubmitter component.

    :param config: WMAgent configuration object; must provide a
        JobSubmitter section (componentDir plus optional tunables).
    :raises JobSubmitterPollerException: if the package directory
        cannot be created.
    """
    BaseWorkerThread.__init__(self)
    myThread = threading.currentThread()

    # DAO factory for WMBS objects
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=logging,
                                 dbinterface=myThread.dbi)
    self.config = config

    # Libraries
    self.resourceControl = ResourceControl()

    self.changeState = ChangeState(self.config)
    self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)

    # BossAir
    self.bossAir = BossAirAPI(config=self.config)

    # Additions for caching-based JobSubmitter
    self.workflowTimestamps = {}
    self.cachedJobIDs = set()
    self.cachedJobs = {}
    self.jobDataCache = {}
    self.jobsToPackage = {}
    self.sandboxPackage = {}
    self.siteKeys = {}
    self.locationDict = {}
    self.cmsNames = {}
    self.drainSites = []
    self.sortedSites = []
    self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
    self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                            self.packageSize * 1000)

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobSubmitter")

    try:
        if not getattr(self.config.JobSubmitter, 'submitDir', None):
            self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
        self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

        if not os.path.exists(self.packageDir):
            os.makedirs(self.packageDir)
    except Exception as ex:
        # Fix: the original message left the %s placeholder unformatted;
        # substitute whatever directory information is available
        # (self.packageDir may be unset if the failure happened early).
        msg = "Error while trying to create packageDir %s\n!" % \
              getattr(self, 'packageDir', '(not set)')
        msg += str(ex)
        logging.error(msg)
        self.sendAlert(6, msg=msg)
        try:
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
        except Exception:
            # best-effort debug output only; self.packageDir may not exist
            pass
        raise JobSubmitterPollerException(msg)
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This
    will mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of
    the JSM couch database and the URL to the database must be passed in
    as well so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    # Reuse an already-open transaction if there is one, otherwise open
    # our own.
    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn,
                            transaction=True)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs=killableJobs)
        except BossAirException as ex:  # modernized from legacy "except X, ex"
            # Something's gone wrong.  Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(
        runningCount, 0,
        "User currently has %i running jobs. Test will not continue" %
        (runningCount))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    subCount = 5
    jobCount = 10
    dummyCacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroups = self.createJobGroups(nSubs=subCount, nJobs=jobCount,
                                     task=workload.getTask("ReReco"),
                                     workloadSpec=os.path.join(self.testDir,
                                                               'workloadTest',
                                                               workloadName),
                                     site=None)
    for jobGroup in jobGroups:
        changeState.propagate(jobGroup.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    # Every job of every subscription should now be running in condor.
    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, subCount * jobCount)

    # Track, then kill everything that is sitting Idle.
    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')
    baAPI.kill(jobs=idleJobs)

    del jobSubmitter
    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(
        runningCount, 0,
        "User currently has %i running jobs. Test will not continue" %
        (runningCount))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    subCount = 5
    jobCount = 10
    dummyCacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroups = self.createJobGroups(nSubs=subCount, nJobs=jobCount,
                                     task=workload.getTask("ReReco"),
                                     workloadSpec=os.path.join(self.testDir,
                                                               'workloadTest',
                                                               workloadName),
                                     site=None)
    for jobGroup in jobGroups:
        changeState.propagate(jobGroup.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, subCount * jobCount)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    del jobSubmitter
    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from PyCondorPlugin.py
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(
        nRunning, 0,
        "User currently has %i running jobs. Test will not continue" %
        (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    cacheDir = os.path.join(self.testDir, 'CacheDir')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(
                                            self.testDir, 'workloadTest',
                                            workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    ##
    # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
    # updateSiteInformation() method should edit the classAd for all the jobs
    # that are bound for the site
    # Check the Q manually using condor_q -l <job id>
    #
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    # Idiom fix: compare against None with "is not" rather than "!=".
    if jtok is not None:
        # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)
        baAPI.kill(jtok, errorCode=61301)

    return
def __init__(self, config):
    """
    Initialise class members
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    thisThread = threading.currentThread()
    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=config)
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)

    # DAO handles used by the poll cycle
    self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
    self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")
def testKillWorkflow(self):
    """
    _testKillWorkflow_

    Verify that workflow killing works correctly.
    """
    bossAir = BossAirAPI(config=self.config, insertStates=True)

    # Create nine jobs
    self.setupForKillTest(baAPI=bossAir)
    self.assertEqual(len(bossAir._listRunJobs()), 9)

    killWorkflow("Main", self.config, self.config)

    self.verifyFileKillStatus()
    self.verifyJobKillStatus()
    # NOTE(review): one run job is expected to remain after the kill —
    # confirm against setupForKillTest's fixture.
    self.assertEqual(len(bossAir._listRunJobs()), 8)

    return
def __init__(self, config):
    """
    __init__

    Set up the caching and other objects.

    :param config: component configuration; its JobStatusLite section
        must define ``stateTimeouts``.
    """
    self.config = config
    BaseWorkerThread.__init__(self)

    self.cachedJobs = []
    self.bossAir = BossAirAPI(config=config)

    # With no timeouts, nothing ever happens
    # Otherwise we expect a dictionary with the keys representing
    # the states and the values the timeouts.
    # Idiom fix: a two-argument getattr() with no default is just an
    # attribute access; it fails with AttributeError either way.
    self.timeouts = config.JobStatusLite.stateTimeouts

    return
def __init__(self, config):
    """
    Initialise class members
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    thisThread = threading.currentThread()
    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=config)
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)

    self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobTracker")
def __init__(self, config):
    """
    __init__
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.bossAir = BossAirAPI(config=self.config)
    self.reqmgr = RequestManager({'endpoint': self.config.JobUpdater.reqMgrUrl})
    self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                               self.config.WorkQueueManager.dbname)

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)

    # DAO handles for workflow-priority bookkeeping
    self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
    self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
    self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
def testG_monitoringDAO(self):
    """
    _monitoringDAO_

    Because I need a test for the monitoring DAO

    NOTE(review): this test is disabled — the bare ``return`` below
    short-circuits the method, so everything after it is dead code.
    Before re-enabling, note the trailing assertion expects plugin
    'CondorPlugin' while the loop above assigns 'TestPlugin'; confirm
    which is intended.
    """
    return

    myThread = threading.currentThread()

    config = self.getConfig()
    changeState = ChangeState(config)

    baAPI = BossAirAPI(config=config)

    # Create some jobs
    nJobs = 10

    jobDummies = self.createDummyJobs(nJobs=nJobs)

    # Prior to building the job, each job must have a plugin
    # and user assigned
    for job in jobDummies:
        job['plugin'] = 'TestPlugin'
        job['owner'] = 'mnorman'
        job['location'] = 'T2_US_UCSD'
        job.save()

    baAPI.submit(jobs=jobDummies)

    results = baAPI.monitor()

    self.assertEqual(len(results), nJobs)
    for job in results:
        self.assertEqual(job['plugin'], 'CondorPlugin')

    return
def testKillWorkflow(self):
    """
    _testKillWorkflow_

    Verify that workflow killing works correctly.
    """
    cfgPath = EmulatorSetup.setupWMAgentConfig()
    cfg = loadConfigurationFile(cfgPath)
    bossAir = BossAirAPI(config=cfg)

    # Create nine jobs
    self.setupForKillTest(baAPI=bossAir)
    self.assertEqual(len(bossAir._listRunJobs()), 9)

    killWorkflow("Main", cfg, cfg)

    self.verifyFileKillStatus()
    self.verifyJobKillStatus()
    # NOTE(review): one run job is expected to survive the kill —
    # confirm against setupForKillTest's fixture.
    self.assertEqual(len(bossAir._listRunJobs()), 8)

    EmulatorSetup.deleteConfig(cfgPath)
    return
def testB_PluginTest(self):
    """
    _PluginTest_

    Check that these functions work when driven through the plugins
    instead of being called directly.
    """
    thisThread = threading.currentThread()

    config = self.getConfig()
    bossAir = BossAirAPI(config=config)

    # Create some jobs
    jobCount = 10
    dummyJobs = self.createDummyJobs(nJobs=jobCount, location='Xanadu')
    changeState = ChangeState(config)
    changeState.propagate(dummyJobs, 'created', 'new')
    changeState.propagate(dummyJobs, 'executing', 'created')

    # Prior to building the job, each job must have a plugin
    # and user assigned
    for dummyJob in dummyJobs:
        dummyJob['plugin'] = 'TestPlugin'
        dummyJob['owner'] = 'tapas'

    bossAir.submit(jobs=dummyJobs)

    newJobs = bossAir._loadByStatus(status='New')
    self.assertEqual(len(newJobs), jobCount)

    # Each submitted job should have a corresponding run job
    runningJobs = bossAir._listRunJobs()
    self.assertEqual(len(runningJobs), jobCount)

    # Test Plugin should complete all jobs
    bossAir.track()

    # Should be no more running jobs
    runningJobs = bossAir._listRunJobs()
    self.assertEqual(len(runningJobs), 0)

    # Check if they're complete
    completeJobs = bossAir.getComplete()
    self.assertEqual(len(completeJobs), jobCount)

    # Do this test because BossAir is specifically built
    # to keep it from finding completed jobs
    result = thisThread.dbi.processData(
        "SELECT id FROM bl_runjob")[0].fetchall()
    self.assertEqual(len(result), jobCount)

    # removeComplete() should purge the bl_runjob rows
    bossAir.removeComplete(jobs=dummyJobs)
    result = thisThread.dbi.processData(
        "SELECT id FROM bl_runjob")[0].fetchall()
    self.assertEqual(len(result), 0)

    return
def testA_APITest(self):
    """
    _APITest_

    This is a commissioning test that has very little to do
    with anything except loading the code.
    """
    myThread = threading.currentThread()

    config = self.getConfig()

    baAPI = BossAirAPI(config=config)

    # We should have loaded a plugin
    self.assertTrue('TestPlugin' in baAPI.plugins.keys())

    result = myThread.dbi.processData(
        "SELECT name FROM bl_status")[0].fetchall()
    statusList = []
    for i in result:
        # list() keeps this working on both py2 and py3 dict views
        statusList.append(list(i.values())[0])

    # We should have the plugin states in the database.
    # BUG FIX: list.sort() returns None, so the original
    # assertEqual(statusList.sort(), [...].sort()) compared None to None
    # and always passed.  Assert the plugin states are present instead.
    self.assertTrue(set(['New', 'Dead', 'Gone']).issubset(set(statusList)))

    # Create some jobs
    nJobs = 10

    jobDummies = self.createDummyJobs(nJobs=nJobs)
    print(jobDummies)

    baAPI.createNewJobs(wmbsJobs=jobDummies)

    runningJobs = baAPI._listRunJobs()
    self.assertEqual(len(runningJobs), nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    deadJobs = baAPI._loadByStatus(status='Dead')
    self.assertEqual(len(deadJobs), 0)

    # Loading by a non-existent status must raise
    # (removed the unused raisesException local)
    self.assertRaises(BossAirException,
                      baAPI._loadByStatus, status='FalseStatus')

    # Change the job status and update it
    for job in newJobs:
        job['status'] = 'Dead'
    baAPI._updateJobs(jobs=newJobs)

    # Test whether we see the job status as updated
    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    deadJobs = baAPI._loadByStatus(status='Dead')
    self.assertEqual(len(deadJobs), nJobs)

    # Can we load by BossAir ID?
    loadedJobs = baAPI._loadByID(jobs=deadJobs)
    self.assertEqual(len(loadedJobs), nJobs)

    # Can we load via WMBS?
    loadedJobs = baAPI.loadByWMBS(wmbsJobs=jobDummies)
    self.assertEqual(len(loadedJobs), nJobs)

    # See if we can delete jobs
    baAPI._deleteJobs(jobs=deadJobs)

    # Confirm that they're gone
    deadJobs = baAPI._loadByStatus(status='Dead')
    self.assertEqual(len(deadJobs), 0)

    self.assertEqual(len(baAPI.jobs), 0)

    return
def testD_MyProxyDelegation(self):
    """
    _MyProxyDelegation_

    Test whether we can delegate a proxy via myproxy to this job

    IMPORTANT: If you are going to run this test you will have to set
    the serverCert/Key config options to point to your local server
    cert.  You will also have to run this job with your DN.  I don't
    recommend figuring out how to do this without knowing what you're
    doing in regards to proxy stuff.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    proxyDir = os.path.join(self.testDir, 'proxyDir')
    os.mkdir(proxyDir)
    config = self.getConfig()
    config.BossAir.removeTime = -10.0
    config.BossAir.pluginNames.append('VanillaCondorPlugin')
    # NOTE(review): hard-coded developer credentials and paths below —
    # this test only runs in that developer's environment.
    config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
    config.BossAir.delegatedServerKey = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
    config.BossAir.myproxyServer = 'myproxy.cern.ch'
    config.BossAir.proxyDir = proxyDir
    config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

    userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs=nJobs)

    baAPI = BossAirAPI(config=config)

    # Dummy (near-empty) job package and sandbox files for submission
    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    # Build the minimal job dictionaries BossAir submission expects
    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'VanillaCondorPlugin'
        tmpJob['owner'] = userDN
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        jobList.append(tmpJob)

    info = {}
    #info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs=jobList, info=info)

    # The plugin should have placed a delegated proxy in proxyDir;
    # inspect it with voms-proxy-info and verify the subject chain.
    proxyFile = os.listdir(proxyDir)[0]
    stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \
                                                % os.path.join(proxyDir, proxyFile))
    self.assertEqual(stdout.split('\n')[0],
                     'subject : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    SubprocessAlgos.runCommand(cmd=command, shell=False)

    return
def __init__(self, config):
    """
    __init__

    Initialise the JobSubmitterPoller: DAOs, BossAir, caching
    structures, submission tuning parameters and the on-disk package
    directory.

    :param config: WMAgent configuration; must provide JobSubmitter,
        BossAir, Agent and (optionally) TaskArchiver/General sections.
    :raises JobSubmitterPollerException: if the package directory
        cannot be created.
    """
    BaseWorkerThread.__init__(self)
    myThread = threading.currentThread()
    self.config = config

    # DAO factory for WMBS objects
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=logging,
                                 dbinterface=myThread.dbi)

    # Libraries
    self.resourceControl = ResourceControl()
    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=self.config)

    self.hostName = self.config.Agent.hostName
    self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
    self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
    self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
    self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
    self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
    self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
    self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                            self.packageSize * 1000)
    self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
    self.condorFraction = 0.75  # update during every algorithm cycle
    self.condorOverflowFraction = 0.2
    self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

    # Additions for caching-based JobSubmitter
    self.cachedJobIDs = set()
    self.cachedJobs = {}
    self.jobDataCache = {}
    self.jobsToPackage = {}
    self.sandboxPackage = {}
    self.locationDict = {}
    self.taskTypePrioMap = {}
    self.drainSites = set()
    self.abortSites = set()
    self.refreshPollingCount = 0

    try:
        if not getattr(self.config.JobSubmitter, 'submitDir', None):
            self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
        self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

        if not os.path.exists(self.packageDir):
            os.makedirs(self.packageDir)
    except OSError as ex:
        # Fix: the original message left the %s placeholder unformatted;
        # substitute whatever directory information is available.
        msg = "Error while trying to create packageDir %s\n!" % \
              getattr(self, 'packageDir', '(not set)')
        msg += str(ex)
        logging.error(msg)
        logging.debug("PackageDir: %s", self.packageDir)
        logging.debug("Config: %s", config)
        raise JobSubmitterPollerException(msg)

    # Now the DAOs
    self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
    self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
    self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
    self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
    self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

    # Keep a record of the thresholds in memory
    self.currentRcThresholds = {}

    self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver,
                                               'useReqMgrForCompletionCheck',
                                               True)

    if self.useReqMgrForCompletionCheck:
        # only set up this when reqmgr is used (not Tier0)
        self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
        self.abortedAndForceCompleteWorkflowCache = \
            self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
    else:
        # Tier0 case - just for clarity (this private variable shouldn't be used)
        self.abortedAndForceCompleteWorkflowCache = None

    return
def testG_gLiteTest(self):
    """
    _gLiteTest_

    This test works on the gLitePlugin, checking all of its functions
    with a single set of jobs

    NOTE(review): needs a grid UI, a valid proxy in X509_USER_PROXY and
    CERN AFS paths — it only runs in a suitably configured environment.
    """

    config = self.getConfig()
    config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
    config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
    config.BossAir.gLiteProcesses = 2
    config.BossAir.gLitePrefixEnv = "/lib64/"
    config.BossAir.pluginNames.append("gLitePlugin")
    config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

    config.Agent.serverDN = "/we/bypass/myproxy/logon"

    #config.BossAir.pluginNames = ["gLitePlugin"]
    baAPI = BossAirAPI(config=config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs=nJobs,
                                      location='grid-ce-01.ba.infn.it')

    # Dummy (near-empty) package and sandbox files needed for submission
    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    jobList = []

    # Register the proxy owner's DN as a user in the database
    userdn = executeCommand('grid-cert-info -subject -file %s'
                            % config.BossAir.manualProxyPath)
    newuser = self.daoFactory(classname="Users.New")
    newuser.execute(dn=userdn)

    for j in jobDummies:
        job = j  # {'id': j['id']}
        job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
        job['location'] = 'grid-ce-01.ba.infn.it'
        job['plugin'] = 'gLitePlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = userdn
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs=jobList)

    # Should be new jobs
    # NOTE(review): assertNotEqual here looks suspicious given the
    # comment above — confirm the intended expectation.
    newJobs = baAPI._loadByStatus(status='New')
    self.assertNotEqual(len(newJobs), nJobs)

    time.sleep(2)
    baAPI.track()

    # Should be not anymore marked as new
    newJobs = baAPI._loadByStatus('New', 0)
    self.assertNotEqual(len(newJobs), nJobs)

    # Killing all the jobs
    baAPI.kill(jobList)
    #time.sleep(15)
    baAPI.track()

    ## Issues running tests below due to glite delay on marking job as killed
    # Should be just running jobs
    #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
    #self.assertEqual(len(killedJobs), 0)

    # Check if they're complete
    #completeJobs = baAPI.getComplete()
    #self.assertEqual(len(completeJobs), nJobs)

    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.
    """
    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(
        runningCount, 0,
        "User currently has %i running jobs. Test will not continue" %
        (runningCount))

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'
    config.BossAir.submitWMSMode = True

    bossAir = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    subCount = 5
    jobCount = 10
    dummycacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroups = self.createJobGroups(nSubs=subCount, nJobs=jobCount,
                                     task=workload.getTask("ReReco"),
                                     workloadSpec=os.path.join(self.testDir,
                                                               'workloadTest',
                                                               workloadName),
                                     site=None)
    for jobGroup in jobGroups:
        changeState.propagate(jobGroup.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()

    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, subCount * jobCount)

    bossAir.track()
    idleJobs = bossAir._loadByStatus(status='Idle')

    sn = "T2_US_UCSD"

    # Test the Site Info has been updated. Make Sure T2_US_UCSD is not
    # in the sitelist in BossAir_t.py
    bossAir.updateSiteInformation(idleJobs, sn, True)

    # Now kill 'em manually
    # command = ['condor_rm', self.user]
    # pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
    # pipe.communicate()

    del jobSubmitter
    return
def testC_CondorTest(self):
    """
    _CondorTest_

    This test works on the SimpleCondorPlugin, checking all of its
    functions with a single set of jobs

    NOTE(review): requires a working condor pool for self.user — this
    is an integration test, not a unit test.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    # Get the config and set the removal time to -10 for testing
    config = self.getConfig()
    config.BossAir.removeTime = -10.0

    nJobs = 10
    jobDummies = self.createDummyJobs(nJobs=nJobs)

    baAPI = BossAirAPI(config=config, insertStates=True)

    # Dummy (near-empty) package and sandbox files needed for submission
    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    # Build the minimal job dictionaries BossAir submission expects
    jobList = []
    for j in jobDummies:
        tmpJob = {'id': j['id']}
        tmpJob['custom'] = {'location': 'malpaquet'}
        tmpJob['name'] = j['name']
        tmpJob['cache_dir'] = self.testDir
        tmpJob['retry_count'] = 0
        tmpJob['plugin'] = 'SimpleCondorPlugin'
        tmpJob['owner'] = 'tapas'
        tmpJob['packageDir'] = self.testDir
        tmpJob['sandbox'] = sandbox
        tmpJob['priority'] = None
        tmpJob['usergroup'] = "wheel"
        tmpJob['userrole'] = 'cmsuser'
        jobList.append(tmpJob)

    info = {}
    # info['packageDir'] = self.testDir
    info['index'] = 0
    info['sandbox'] = sandbox

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    # After tracking, jobs should have moved from New to Idle
    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Do a second time to make sure that the cache
    # doesn't die on us
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.kill(jobs=jobList)
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    baAPI.submit(jobs=jobList, info=info)

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nJobs)

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='New')
    self.assertEqual(len(newJobs), 0)

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), nJobs)

    # Now kill 'em manually
    command = ['condor_rm', self.user]
    pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
    pipe.communicate()

    # See what happened
    baAPI.track()

    newJobs = baAPI._loadByStatus(status='Idle')
    self.assertEqual(len(newJobs), 0)

    # newJobs = baAPI._loadByStatus(status = 'Removed')
    # self.assertEqual(len(newJobs), nJobs)

    # Because removal time is -10.0, jobs should remove immediately
    baAPI.track()

    # Assert that jobs were listed as completed
    myThread = threading.currentThread()
    newJobs = baAPI._loadByStatus(status='Removed', complete='0')
    self.assertEqual(len(newJobs), nJobs)

    return
def testE_FullChain(self):
    """
    _FullChain_

    Full test going through the chain; using polling cycles and
    everything

    NOTE(review): starts real component worker threads and relies on a
    fixed 20s sleep for them to cycle — timing-sensitive integration
    test.
    """
    # Function-local imports: the components are only loaded when this
    # test actually runs.
    from WMComponent.JobSubmitter.JobSubmitter import JobSubmitter
    from WMComponent.JobStatusLite.JobStatusLite import JobStatusLite
    from WMComponent.JobTracker.JobTracker import JobTracker

    myThread = threading.currentThread()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'SimpleCondorPlugin'

    baAPI = BossAirAPI(config=config, insertStates=True)

    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    cacheDir = os.path.join(self.testDir, 'CacheDir')

    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(
                                            self.testDir, 'workloadTest',
                                            workloadName),
                                        site='se.T2_US_UCSD')

    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitter(config=config)
    jobTracker = JobTracker(config=config)
    jobStatus = JobStatusLite(config=config)

    jobSubmitter.prepareToStart()
    jobTracker.prepareToStart()
    jobStatus.prepareToStart()

    # What should happen here:
    # 1) The JobSubmitter should submit the jobs
    # 2) Because of the ridiculously short time on pending jobs
    #    the JobStatus poller should mark the jobs as done
    #    and kill them.
    # 3) The JobTracker should realize there are finished jobs
    #
    # So at the end of several polling cycles, the jobs should all
    # be done, but be in the failed status (they timed out)

    time.sleep(20)

    myThread.workerThreadManager.terminateWorkers()

    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 0)

    result = getJobsAction.execute(state='JobFailed', jobType="Processing")
    self.assertEqual(len(result), nJobs * nSubs)
    return
def testD_PrototypeChain(self):
    """
    _PrototypeChain_

    Drive the BossAir workflow by hand: run the JobSubmitterPoller,
    watch the StatusPoller move jobs from New to Idle, let them time
    out on the short pending period, and confirm the JobTrackerPoller
    fails them all in WMBS.
    """
    dummyThreadRef = threading.currentThread()

    # Refuse to run against a schedd that already has jobs for this user.
    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, 0,
                     "User currently has %i running jobs. Test will not continue" % (runningCount))

    testConfig = self.getConfig()
    testConfig.BossAir.pluginName = 'SimpleCondorPlugin'

    bossAir = BossAirAPI(config=testConfig, insertStates=True)

    testWorkload = self.createTestWorkload()
    specName = "basicWorkload"
    stateChanger = ChangeState(testConfig)

    subCount = 5
    jobCount = 10
    dummyCache = os.path.join(self.testDir, 'CacheDir')

    groups = self.createJobGroups(nSubs=subCount, nJobs=jobCount,
                                  task=testWorkload.getTask("ReReco"),
                                  workloadSpec=os.path.join(self.testDir,
                                                            'workloadTest',
                                                            specName),
                                  site='se.T2_US_UCSD')
    for jobGroup in groups:
        stateChanger.propagate(jobGroup.jobs, 'created', 'new')

    submitterPoller = JobSubmitterPoller(config=testConfig)
    trackerPoller = JobTrackerPoller(config=testConfig)
    statePoller = StatusPoller(config=testConfig)

    # Submit: every job should hit condor and show up 'New' in BossAir.
    submitterPoller.algorithm()

    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, subCount * jobCount)

    loaded = bossAir._loadByStatus(status='New')
    self.assertEqual(len(loaded), subCount * jobCount)

    # Check WMBS: all jobs should be Executing.
    getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    executing = getJobs.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(executing), subCount * jobCount)

    # One status cycle moves everything from New to Idle.
    statePoller.algorithm()

    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, subCount * jobCount)

    loaded = bossAir._loadByStatus(status='New')
    self.assertEqual(len(loaded), 0)
    loaded = bossAir._loadByStatus(status='Idle')
    self.assertEqual(len(loaded), subCount * jobCount)

    # Tracker should do nothing yet - nothing has finished.
    trackerPoller.algorithm()
    executing = getJobs.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(executing), subCount * jobCount)

    # Wait for jobs to time out due to the short Pending wait period.
    time.sleep(12)
    statePoller.algorithm()

    loaded = bossAir._loadByStatus(status='Idle')
    self.assertEqual(len(loaded), 0)
    loaded = bossAir._loadByStatus(status='Timeout', complete='0')
    self.assertEqual(len(loaded), subCount * jobCount)

    # Jobs should be gone from condor.
    runningCount = getCondorRunningJobs(self.user)
    self.assertEqual(runningCount, 0)

    # They should all be flagged complete in BossAir.
    finished = bossAir.getComplete()
    self.assertEqual(len(finished), subCount * jobCount)

    # Because they timed out, the tracker must fail every one of them.
    trackerPoller.algorithm()
    executing = getJobs.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(executing), 0)
    failed = getJobs.execute(state='JobFailed', jobType="Processing")
    self.assertEqual(len(failed), subCount * jobCount)
    return
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This
    will mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of
    the JSM couch database and the URL to the database must be passed in
    as well so the state transitions are logged.

    :param workflowName: name of the workflow to kill
    :param jobCouchConfig: config used to build the ChangeState object so
        that job state transitions are logged to couch
    :param bossAirConfig: optional config for BossAirAPI; when provided,
        jobs still executing in the batch system are killed there first
    :returns: None
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        # Fix: the previous version also built an unused WMBS Job object
        # per killable job; BossAir.kill only needs the raw job dicts.
        killableJobs = [liveJob for liveJob in liveJobs
                        if liveJob["state"].lower() == 'executing']
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs), workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    # Group the still-live WMBS jobs by their current state so each batch
    # can be transitioned to 'killed'.
    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are to many jobs skip the couch and dashboard update
            # TODO: couch and dashboard need to be updated or parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
def __init__(self, config):
    """
    Initialise the submitter poller: WMBS DAOs, helper libraries
    (ResourceControl, ChangeState, BossAirAPI), tunable submission
    limits read from config, and the in-memory caches used by the
    caching-based JobSubmitter.

    :param config: WMAgent configuration object; must provide the
        JobSubmitter and BossAir sections used below
    :raises JobSubmitterPollerException: if the package directory
        cannot be created
    """
    BaseWorkerThread.__init__(self)
    myThread = threading.currentThread()
    self.config = config

    # DAO factory for WMBS objects
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=logging,
                                 dbinterface=myThread.dbi)

    # Libraries
    self.resourceControl = ResourceControl()
    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=self.config)

    # Tunables, all read from config with defaults.
    self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
    self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
    self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
    self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
    self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
    self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                            self.packageSize * 1000)
    self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)

    # Additions for caching-based JobSubmitter
    self.cachedJobIDs = set()
    self.cachedJobs = {}
    self.jobDataCache = {}
    self.jobsToPackage = {}
    self.sandboxPackage = {}
    self.locationDict = {}
    self.taskTypePrioMap = {}
    self.drainSites = set()
    self.abortSites = set()
    self.refreshPollingCount = 0

    try:
        if not getattr(self.config.JobSubmitter, 'submitDir', None):
            self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
        self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

        if not os.path.exists(self.packageDir):
            os.makedirs(self.packageDir)
    except Exception as ex:
        # Fix: the original message left a literal, unfilled '%s'
        # placeholder; fill it with the directory we tried to create
        # (packageDir may be unset if the failure happened earlier).
        packageDir = getattr(self, 'packageDir', '(unset)')
        msg = "Error while trying to create packageDir %s\n!" % packageDir
        msg += str(ex)
        logging.error(msg)
        # Best-effort debug dump; never let logging mask the real error.
        try:
            logging.debug("PackageDir: %s", packageDir)
            logging.debug("Config: %s", config)
        except Exception:
            pass
        raise JobSubmitterPollerException(msg)

    # Now the DAOs
    self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
    self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
    self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
    self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
    self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

    # Keep a record of the thresholds in memory
    self.currentRcThresholds = {}
    return
def testH_ARCTest(self):
    """
    _ARCTest_

    This test works on the ARCPlugin, checking all of its
    functions with a single set of jobs.

    Exercises submit/track/kill against a real ARC grid endpoint
    (jade-cms.hip.fi) and shells out to the ng* command-line tools,
    so it needs a working ARC environment to run.
    """
    # Refuse to run if the user already has ARC jobs out there.
    nRunning = getNArcJobs()
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginNames.append("ARCPlugin")
    #config.BossAir.pluginNames = ["ARCPlugin"]

    baAPI = BossAirAPI(config = config)

    nJobs = 2
    jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi')
    #baAPI.createNewJobs(wmbsJobs = jobDummies)
    #changeState = ChangeState(config)
    #changeState.propagate(jobDummies, 'created', 'new')
    #changeState.propagate(jobDummies, 'executing', 'created')

    # Placeholder package/sandbox files: the plugin only needs the paths
    # to exist, not real content.
    jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
    f = open(jobPackage, 'w')
    f.write(' ')
    f.close()

    sandbox = os.path.join(self.testDir, 'sandbox.box')
    f = open(sandbox, 'w')
    f.write(' ')
    f.close()

    # Decorate every dummy job with the fields BossAir/ARCPlugin expect.
    jobList = []
    for j in jobDummies:
        job = j # {'id': j['id']}
        job['custom'] = {'location': 'jade-cms.hip.fi'}
        job['location'] = 'jade-cms.hip.fi'
        job['plugin'] = 'ARCPlugin'
        job['name'] = j['name']
        job['cache_dir'] = self.testDir
        job['retry_count'] = 0
        job['owner'] = 'edelmann'
        job['packageDir'] = self.testDir
        job['sandbox'] = sandbox
        job['priority'] = None
        jobList.append(job)

    baAPI.submit(jobs = jobList)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, nJobs)

    # Freshly submitted jobs sit in 'New' until the first track() cycle.
    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), nJobs)

    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    # After tracking, every run-job should have left the 'New' status.
    rJobs = baAPI._listRunJobs()
    nOldJobs = 0
    for j in rJobs:
        if j['status'] != "New":
            nOldJobs += 1
    self.assertEqual(nOldJobs, nJobs)

        #if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in [ "Pending", "Running" ]:

    baAPI.kill(jobs = jobList)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, 0)

    # Try resubmission
    for j in jobList:
        j['retry_count'] = 1

    succ, fail = baAPI.submit(jobs = jobList)

    time.sleep(30)

    nRunning = getNArcJobs()
    self.assertEqual(nRunning, nJobs)

    # Resubmitted jobs (retry_count=1) are new BossAir records again.
    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), nJobs)

    # See where they are
    baAPI.track()

    newJobs = baAPI._loadByStatus(status = 'New')
    self.assertEqual(len(newJobs), 0)

    # Collect grid IDs so we can query the ARC info system directly below.
    rJobs = baAPI._listRunJobs()
    nOldJobs = 0
    idStr = ""
    for j in rJobs:
        idStr += " " + j['gridid']
        if j['status'] != "New":
            nOldJobs += 1
    self.assertEqual(nOldJobs, nJobs)

    # Now kill 'em manually
    no_jobs = True
    while no_jobs:
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
        pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
        output = pipe.communicate()[0]

        if output.find("Job information not found") >= 0:
            # It seems the jobs hasn't reached the ARC info.sys yet.
            # Sleep a while and try again
            time.sleep(20)
            continue
        else:
            no_jobs = False

            # Just to be sure, if the jobs were already finished, do a
            # 'ngclean' too.
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]

    # Make sure the killing of the jobs reaches the info.sys.
    still_jobs = True
    while still_jobs:
        command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
        pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
        output = pipe.communicate()[0]

        if output.find("Job information not found") < 0:
            # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
            # Sleep a while and try again
            time.sleep(20)
            continue
        else:
            still_jobs = False

    # See what happened
    baAPI.track()

    # Every job should now be in one of the killed/lost statuses and
    # none should remain active.
    idJobs = baAPI._loadByID(rJobs)
    nActiveJobs = 0
    nRemovedJobs = 0
    for j in idJobs:
        if j['status'] not in [ "New", "KILLING", "KILLED", "LOST" ]:
            nActiveJobs += 1
        if j['status'] in [ "KILLING", "KILLED", "LOST" ]:
            nRemovedJobs += 1

    self.assertEqual(nActiveJobs, 0)
    self.assertEqual(nRemovedJobs, nJobs)

    return