def testF_LinearAlgo(self):
    """
    _testLinearAlgo_

    Test the linear algorithm to make sure it loads and works.

    Configures the RetryManager so Processing jobs use LinearAlgo with a
    10-second cooloff, drives jobs into SubmitCooloff twice, and checks
    they are released only once the cooloff has elapsed.
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    # Point the Processing job type at LinearAlgo with a 10s cooloff
    config = self.getConfig()
    config.RetryManager.plugins = {'Processing': 'LinearAlgo'}
    config.RetryManager.section_("LinearAlgo")
    config.RetryManager.LinearAlgo.section_("Processing")
    config.RetryManager.LinearAlgo.Processing.coolOffTime = {'create': 10, 'submit': 10, 'job': 10}

    # Run the jobs through two submit-failure cycles ending in SubmitCooloff
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
    changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')
    changer.propagate(testJobGroup.jobs, 'created', 'submitcooloff')
    changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
    changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), self.nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # 5s in cooloff < 10s threshold: nothing should be released yet
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 5)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), self.nJobs)

    # 12s in cooloff > 10s threshold: all jobs should move back to Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 12)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs)
    return
def __init__(self, config):
    """
    init jobCreator

    Set up the DAOs, configuration values, alert framework, job cache
    directory and the ChangeState helper used by the JobCreator poller.

    :param config: WMAgent configuration; reads the JobCreator and Agent
                   sections.
    :raises JobCreatorException: if the job cache directory cannot be set up.
    """
    BaseWorkerThread.__init__(self)

    myThread = threading.currentThread()

    # DAO factory for WMBS objects
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=logging,
                                 dbinterface=myThread.dbi)

    self.setBulkCache = self.daoFactory(classname="Jobs.SetCache")
    self.countJobs = self.daoFactory(
        classname="Jobs.GetNumberOfJobsPerWorkflow")
    self.subscriptionList = self.daoFactory(
        classname="Subscriptions.ListIncomplete")
    self.setFWJRPath = self.daoFactory(classname="Jobs.SetFWJRPath")

    # information
    self.config = config

    # Variables
    self.defaultJobType = config.JobCreator.defaultJobType
    self.limit = getattr(config.JobCreator, 'fileLoadLimit', 500)
    self.agentNumber = int(getattr(config.Agent, 'agentNumber', 0))
    self.agentName = getattr(config.Agent, 'hostName', '')
    self.glideinLimits = getattr(config.JobCreator, 'GlideInRestriction', None)

    # initialize the alert framework (if available - config.Alert present)
    # self.sendAlert will be then be available
    self.initAlerts(compName="JobCreator")

    try:
        # Job cache defaults to <componentDir>/jobCacheDir unless configured
        self.jobCacheDir = getattr(
            config.JobCreator, 'jobCacheDir',
            os.path.join(config.JobCreator.componentDir, 'jobCacheDir'))
        self.check()
    except WMException:
        # WMExceptions already carry context; re-raise untouched
        raise
    except Exception as ex:
        msg = "Unhandled exception while setting up jobCacheDir!\n"
        msg += str(ex)
        logging.error(msg)
        self.sendAlert(6, msg=msg)
        raise JobCreatorException(msg)

    self.changeState = ChangeState(self.config)

    return
def testD_Exhausted(self):
    """
    _testExhausted_

    Test that the system can exhaust jobs correctly.

    Creates jobs whose retry_count (5) already exceeds maxRetries (1) so a
    single ErrorHandler pass should send them straight to Exhausted and
    fail their input files.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    # retry_count=5 with maxRetries=1 means no further retries are allowed
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, retry_count=5,
                                           workloadPath=workloadPath)
    config = self.getConfig()
    config.ErrorHandler.maxRetries = 1
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

    testSubscription = Subscription(id=1)  # You should only have one
    testSubscription.load()
    testSubscription.loadData()

    # Do we have files to start with?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test,
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    # No failed or cooling-off jobs should remain: all go to Exhausted
    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # Did we fail the files?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
    self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
def setUp(self):
    """
    _setUp_

    Initialise the test schema, resource control sites, a fileset with
    eleven files, and a Harvest-split subscription for the tests to use.
    """
    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(customModules=["WMCore.WMBS"])

    self.splitterFactory = SplitterFactory(package="WMCore.JobSplitting")

    myThread = threading.currentThread()
    self.myThread = myThread
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=logging,
                            dbinterface=myThread.dbi)
    self.WMBSFactory = daoFactory

    config = self.getConfig()
    self.changer = ChangeState(config)

    myResourceControl = ResourceControl()
    myResourceControl.insertSite("SomeSite", 10, 20, "SomeSE", "SomeCE")
    myResourceControl.insertSite("SomeSite", 10, 20, "SomeSE2", "SomeCE")
    myResourceControl.insertSite("SomeSite2", 10, 20, "SomeSE3", "SomeCE2")

    self.fileset1 = Fileset(name="TestFileset1")
    # NOTE: loop variable renamed from "file" -- it shadowed the builtin
    for fileIndex in range(11):
        newFile = File("/some/file/name%d" % fileIndex, size=1000, events=100)
        newFile.addRun(Run(1, *[1]))
        newFile.setLocation('SomeSE')
        self.fileset1.addFile(newFile)

    self.fileset1.create()

    workflow1 = Workflow(spec="spec.xml", owner="hufnagel",
                         name="TestWorkflow1", task="Test")
    workflow1.create()

    self.subscription1 = Subscription(fileset=self.fileset1,
                                      workflow=workflow1,
                                      split_algo="Harvest",
                                      type="Harvesting")
    self.subscription1.create()
    self.configFile = EmulatorSetup.setupWMAgentConfig()

    return
def testF_WMSMode(self):
    """
    _WMSMode_

    Try running things in WMS Mode.

    Submits 5 subscriptions x 10 jobs through the JobSubmitterPoller with
    PyCondorPlugin in WMS mode, verifies they reach condor, then kills
    the idle jobs. Requires a working local condor pool.
    """
    # Refuse to run against a user who already has jobs in the pool
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)

    workload = self.createTestWorkload()

    workloadName = "basicWorkload"

    changeState = ChangeState(config)

    nSubs = 5
    nJobs = 10

    dummycacheDir = os.path.join(self.testDir, 'CacheDir')

    # site=None: in WMS mode the plugin decides the destination
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest',
                                                                  workloadName),
                                        site=None)
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    jobSubmitter = JobSubmitterPoller(config=config)

    jobSubmitter.algorithm()

    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)

    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    baAPI.kill(jobs=idleJobs)

    del jobSubmitter

    return
def testF_PollerProfileTest(self):
    """
    _testF_PollerProfileTest_

    Submit a lot of jobs and test how long it takes for them to actually
    be submitted.

    Profiles JobSubmitterPoller.algorithm() with cProfile over
    100 subscriptions x 100 jobs of both Processing and Merge type, and
    prints the cumulative stats.
    """
    workloadName = "basicWorkload"
    workload = self.createTestWorkload()
    config = self.getConfig()
    changeState = ChangeState(config)

    nSubs = 100
    nJobs = 100
    site = "T1_US_FNAL"

    self.setResourceThresholds(site, pendingSlots = 20000, runningSlots = -1, tasks = ['Processing', 'Merge'],
                               Processing = {'pendingSlots' : 10000, 'runningSlots' :-1},
                               Merge = {'pendingSlots' : 10000, 'runningSlots' :-1, 'priority' : 5})

    # Always initialize the submitter after setting the sites, flaky!
    JobSubmitterPoller(config = config)

    jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                        task = workload.getTask("ReReco"),
                                        workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                    workloadName), site = site)

    jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                             task = workload.getTask("ReReco"),
                                             workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                         workloadName),
                                             site = site, taskType = 'Merge'))

    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')

    # Actually run it
    startTime = time.time()
    cProfile.runctx("JobSubmitterPoller(config=config).algorithm()", globals(), locals(), filename="testStats.stat")
    stopTime = time.time()

    print("Job took %f seconds to complete" % (stopTime - startTime))

    p = pstats.Stats('testStats.stat')
    p.sort_stats('cumulative')
    p.print_stats()

    return
def testZ_Profile(self):
    """
    _testProfile_

    Do a full profile of the poller.

    Fails 100 jobs, profiles ErrorHandlerPoller.algorithm() with cProfile,
    checks every job ends up in JobCooloff, and prints the top 20% of the
    cumulative-time stats.
    """
    nJobs = 100

    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=nJobs,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)

    # Profile a single polling pass
    startTime = time.time()
    cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(),
                    filename="profStats.stat")
    stopTime = time.time()

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), nJobs)

    print("Took %f seconds to run polling algo" % (stopTime - startTime))

    p = pstats.Stats('profStats.stat')
    p.sort_stats('cumulative')
    p.print_stats(0.2)

    return
def __init__(self, config, insertStates=False):
    """
    __init__

    BossAir should work with the standard config structure of WMAgent.

    Sets up the BossAir DAOs, the plugin factory, and the job state
    machine.

    :param config: WMAgent configuration; reads the BossAir section.
    :param insertStates: passed through to loadPlugin(); when True the
                         plugin states are inserted.
    """
    WMConnectionBase.__init__(self, daoPackage="WMCore.BossAir")

    myThread = threading.currentThread()

    self.config = config
    self.plugins = {}
    self.states = []

    self.jobs = []

    self.pluginDir = config.BossAir.pluginDir
    # This is the default state jobs are created in
    self.newState = getattr(config.BossAir, 'newState', 'New')

    # Get any proxy info
    self.checkProxy = getattr(config.BossAir, 'checkProxy', False)
    self.cert = getattr(config.BossAir, 'cert', None)

    self.stateMachine = ChangeState(self.config)

    # Create a factory to load plugins
    self.pluginFactory = WMFactory("plugins", self.pluginDir)

    self.daoFactory = DAOFactory(package="WMCore.BossAir",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.deleteDAO = self.daoFactory(classname="DeleteJobs")
    self.stateDAO = self.daoFactory(classname="NewState")
    self.loadByWMBSDAO = self.daoFactory(classname="LoadByWMBSID")
    self.updateDAO = self.daoFactory(classname="UpdateJobs")
    self.newJobDAO = self.daoFactory(classname="NewJobs")
    self.runningJobDAO = self.daoFactory(classname="LoadRunning")
    self.completeJobDAO = self.daoFactory(classname="LoadComplete")
    self.loadJobsDAO = self.daoFactory(classname="LoadByStatus")
    self.completeDAO = self.daoFactory(classname="CompleteJob")
    self.monitorDAO = self.daoFactory(classname="JobStatusForMonitoring")

    # NOTE(review): self.states was initialised to [] above and is reset to
    # None here before loadPlugin() runs -- the earlier assignment looks
    # redundant; confirm loadPlugin() repopulates it.
    self.states = None
    self.loadPlugin(insertStates)

    return
def testUpdateFailedDoc(self):
    """
    _testUpdateFailedDoc_

    Verify that the update function will work correctly and not throw a 500
    error if the doc didn't make it into the database for some reason.
    """
    change = ChangeState(self.config, "changestate_t")

    locationAction = self.daoFactory(classname="Locations.New")
    locationAction.execute("site1", seName="somese.cern.ch")

    testWorkflow = Workflow(spec="spec.xml", owner="Steve",
                            name="wf001", task=self.taskName)
    testWorkflow.create()
    testFileset = Fileset(name="TestFileset")
    testFileset.create()
    testSubscription = Subscription(fileset=testFileset,
                                    workflow=testWorkflow,
                                    split_algo="FileBased")
    testSubscription.create()

    testFileA = File(lfn="SomeLFNA", events=1024, size=2048,
                     locations=set(["somese.cern.ch"]))
    testFileA.create()
    testFileset.addFile(testFileA)
    testFileset.commit()

    splitter = SplitterFactory()
    jobFactory = splitter(package="WMCore.WMBS",
                          subscription=testSubscription)
    jobGroup = jobFactory(files_per_job=1)[0]

    testJobA = jobGroup.jobs[0]
    testJobA["user"] = "******"
    testJobA["group"] = "DMWM"
    testJobA["taskType"] = "Merge"
    # Fake the couch record id so propagate() tries to update a doc that
    # was never written to couch
    testJobA["couch_record"] = str(testJobA["id"])

    change.propagate([testJobA], "new", "none")
    testJobADoc = change.jobsdatabase.document(testJobA["couch_record"])

    # FIX: dict.has_key() was removed in Python 3 -- use the "in" operator
    self.assertTrue("states" in testJobADoc)
    self.assertTrue("1" in testJobADoc["states"])

    return
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from PyCondorPlugin.py.

    Submits two jobs to condor, then marks their site as down and checks
    that the affected jobs are returned and can be killed. Requires a
    working local condor pool.
    """
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning))

    config = self.getConfig()
    config.BossAir.pluginName = 'PyCondorPlugin'
    config.BossAir.submitWMSMode = True

    baAPI = BossAirAPI(config=config)
    workload = self.createTestWorkload()
    workloadName = "basicWorkload"
    changeState = ChangeState(config)

    nSubs = 1
    nJobs = 2
    dummycacheDir = os.path.join(self.testDir, 'CacheDir')
    jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload.getTask("ReReco"),
                                        workloadSpec=os.path.join(self.testDir,
                                                                  'workloadTest',
                                                                  workloadName),
                                        site="se.T2_US_UCSD")
    for group in jobGroupList:
        changeState.propagate(group.jobs, 'created', 'new')
    jobSubmitter = JobSubmitterPoller(config=config)
    jobSubmitter.algorithm()
    nRunning = getCondorRunningJobs(self.user)
    self.assertEqual(nRunning, nSubs * nJobs)
    baAPI.track()
    idleJobs = baAPI._loadByStatus(status='Idle')

    ##
    # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
    # updateSiteInformation() method should edit the classAd for all the jobs
    # that are bound for the site
    # Check the Q manually using condor_q -l <job id>
    #
    jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
    # FIX: compare against None with "is not", not "!=" (PEP 8)
    if jtok is not None:
        baAPI.kill(jtok, errorCode=71301)  # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)

    return
def __init__(self, config):
    """
    Initialise class members.

    Sets up the ChangeState helper, WMBS DAOs, alert framework, log
    directory and the local WorkQueue for the JobArchiver poller.

    :param config: WMAgent configuration; reads the JobArchiver section.
    :raises JobArchiverPollerException: if the log directory cannot be
                                        created.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.changeState = ChangeState(self.config)

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.loadAction = self.daoFactory(
        classname="Jobs.LoadFromIDWithWorkflow")

    # Variables
    self.numberOfJobsToCluster = getattr(self.config.JobArchiver,
                                         "numberOfJobsToCluster", 1000)
    self.numberOfJobsToArchive = getattr(self.config.JobArchiver,
                                         "numberOfJobsToArchive", 10000)

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobArchiver")

    try:
        # Log dir defaults to <componentDir>/logDir unless configured
        self.logDir = getattr(
            config.JobArchiver, 'logDir',
            os.path.join(config.JobArchiver.componentDir, 'logDir'))
        if not os.path.isdir(self.logDir):
            os.makedirs(self.logDir)
    except Exception as ex:
        msg = "Unhandled exception while setting up logDir!\n"
        msg += str(ex)
        logging.exception(msg)
        raise JobArchiverPollerException(msg)

    try:
        self.workQueue = queueFromConfig(self.config)
    except Exception as ex:
        # Best effort: archive can proceed without a workQueue
        msg = "Could not load workQueue"
        msg += str(ex)
        logging.error(msg)
        # raise JobArchiverPollerException(msg)

    self.handleWorkflowInjection = getattr(self.config.JobArchiver,
                                           'handleInjected', True)

    return
def testB_SpeedTest(self):
    """
    _SpeedTest_

    Tests the components, as in sees if they load.
    Otherwise does nothing.

    NOTE: this test is deliberately disabled by the early return below;
    the remaining body profiles JobArchiverPoller.algorithm() over 2000
    successful jobs when re-enabled.
    """
    return
    myThread = threading.currentThread()

    config = self.getConfig()

    self.nJobs = 2000

    testJobGroup = self.createTestJobGroup()

    changer = ChangeState(config)

    cacheDir = os.path.join(self.testDir, 'test')

    # Mark each job successful and give it an on-disk cache area
    for job in testJobGroup.jobs:
        job["outcome"] = "success"
        job.save()
        path = os.path.join(cacheDir, job['name'])
        os.makedirs(path)
        f = open('%s/%s.out' %(path, job['name']),'w')
        f.write(job['name'])
        f.close()
        job.setCache(path)

    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'success', 'complete')

    testJobArchiver = JobArchiverPoller(config = config)
    cProfile.runctx("testJobArchiver.algorithm()", globals(), locals(), filename = "testStats.stat")

    p = pstats.Stats('testStats.stat')
    p.sort_stats('cumulative')
    p.print_stats(.2)

    return
def __init__(self, config):
    """
    Initialise class members.

    Wires up the job state machine, the BossAir API and the WMBS DAOs
    this poller needs.

    :param config: WMAgent configuration object.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    currentThread = threading.currentThread()

    # State machine and batch-system access
    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=config)

    # WMBS data-access objects
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=currentThread.logger,
                                 dbinterface=currentThread.dbi)
    self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
    self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")
def testCheck(self):
    """
    This is the test class for function Check from module ChangeState.
    """
    change = ChangeState(self.config, "changestate_t")

    # Every legal transition in the table must pass check()
    for oldstate, destinations in self.transitions.items():
        for newstate in destinations:
            change.check(newstate, oldstate)

    # Any transition into an unknown state must raise AssertionError
    dummystates = ['dummy1', 'dummy2', 'dummy3', 'dummy4']
    for oldstate in self.transitions.keys():
        for bogus in dummystates:
            self.assertRaises(AssertionError, change.check, bogus, oldstate)

    return
def __init__(self, config):
    """
    Initialise class members.

    Sets up the WMBS DAOs, retry policy, exit-code handling, ACDC data
    collection and alert framework for the ErrorHandler poller.

    :param config: WMAgent configuration; reads the ErrorHandler and ACDC
                   sections.
    :raises ErrorHandlerException: if no default maxRetries is configured.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)
    self.changeState = ChangeState(self.config)

    # Normalise maxRetries to a per-job-type dict with a 'default' entry.
    # FIX: use isinstance() instead of type() comparison, consistent with
    # the other ErrorHandler variants in this file.
    self.maxRetries = self.config.ErrorHandler.maxRetries
    if not isinstance(self.maxRetries, dict):
        self.maxRetries = {'default': self.maxRetries}
    if 'default' not in self.maxRetries:
        raise ErrorHandlerException(
            'Max retries for the default job type must be specified')

    self.maxProcessSize = getattr(self.config.ErrorHandler,
                                  'maxProcessSize', 250)
    self.exitCodes = getattr(self.config.ErrorHandler, 'failureExitCodes',
                             [])
    self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                               32 * 3600)
    self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
    self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(
        url=config.ACDC.couchurl, database=config.ACDC.database)

    # initialize the alert framework (if available - config.Alert present)
    # self.sendAlert will be then be available
    self.initAlerts(compName="ErrorHandler")

    # Some exit codes imply an immediate failure, non-configurable
    self.exitCodes.extend(WMJobPermanentSystemErrors)

    return
def __init__(self, config):
    """
    Initialise class members.

    Sets up the WMBS DAOs and retry policy for the ErrorHandler poller.
    In Tier0 mode maxRetries comes from the local config; otherwise it is
    fetched from the ReqMgr auxiliary database.

    :param config: WMAgent configuration; reads the ErrorHandler, Agent,
                   General and ACDC sections.
    :raises ErrorHandlerException: if no default maxRetries is configured.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.changeState = ChangeState(self.config)

    if hasattr(self.config, "Tier0Feeder"):
        # Tier0 agents have no ReqMgr aux DB; use the local config value
        self.reqAuxDB = None
        self.maxRetries = self.config.ErrorHandler.maxRetries
    else:
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        self.maxRetries = self.reqAuxDB.getWMAgentConfig(
            self.config.Agent.hostName).get("MaxRetries")

    # Normalise maxRetries to a per-job-type dict with a 'default' entry
    if not isinstance(self.maxRetries, dict):
        self.maxRetries = {'default': self.maxRetries}
    if 'default' not in self.maxRetries:
        raise ErrorHandlerException(
            'Max retries for the default job type must be specified')

    self.exitCodesNoRetry = []
    self.maxProcessSize = getattr(self.config.ErrorHandler,
                                  'maxProcessSize', 250)
    self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                               32 * 3600)
    self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
    self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(
        url=config.ACDC.couchurl, database=config.ACDC.database)

    return
def __init__(self, config):
    """
    Initialise class members.

    Sets up the ChangeState helper, WMBS DAOs, alert framework and the
    log/publish directories for the poller.

    :param config: WMAgent configuration; reads the JobArchiver section.
    :raises JobArchiverPollerException: if the log or publish directory
                                        cannot be created.
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.changeState = ChangeState(self.config)

    myThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.loadAction = self.daoFactory(
        classname="Jobs.LoadFromIDWithWorkflow")

    # Variables
    self.numberOfJobsToCluster = getattr(self.config.JobArchiver,
                                         "numberOfJobsToCluster", 1000)

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobArchiver")

    try:
        # Both directories default to paths under componentDir
        self.uploadPublishDir = getattr(
            self.config.JobArchiver, 'uploadPublishDir',
            os.path.join(config.JobArchiver.componentDir, 'publishDir'))
        self.logDir = getattr(
            config.JobArchiver, 'logDir',
            os.path.join(config.JobArchiver.componentDir, 'logDir'))

        if not os.path.isdir(self.logDir):
            os.makedirs(self.logDir)
        if not os.path.isdir(self.uploadPublishDir):
            os.makedirs(self.uploadPublishDir)
    # FIX: "except Exception, ex" is Python 2-only syntax; use "as ex"
    except Exception as ex:
        msg = "Unhandled exception while setting up logDir and/or uploadPublishDir!\n"
        msg += str(ex)
        logging.error(msg)
        self.sendAlert(6, msg=msg)
        try:
            logging.debug("Directory: %s" % self.logDir)
            logging.debug("Config: %s" % config)
        # FIX: bare "except:" would also swallow SystemExit/KeyboardInterrupt
        except Exception:
            # Best-effort debug logging only; never mask the original error
            pass
        raise JobArchiverPollerException(msg)
def __init__(self, config):
    """
    Initialise class members.

    Loads the retry plugins configured for each job type and sets up the
    WMBS DAOs and alert framework for the RetryManager poller.

    :param config: WMAgent configuration; reads the RetryManager section.
    :raises RetryManagerException: if a configured plugin cannot be loaded.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    pluginPath = getattr(self.config.RetryManager, "pluginPath",
                         "WMComponent.RetryManager.PlugIns")
    self.pluginFactory = WMFactory("plugins", pluginPath)

    self.changeState = ChangeState(self.config)
    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

    # initialize the alert framework (if available) (self.sendAlert())
    self.initAlerts(compName="RetryManager")

    # get needed plugins
    self.plugins = {}

    # Map of job type -> plugin name; unknown types fall back to 'default'
    self.typePluginsAssoc = getattr(self.config.RetryManager, 'plugins',
                                    {})
    self.typePluginsAssoc.setdefault('default', 'DefaultRetryAlgo')

    for pluginName in self.typePluginsAssoc.values():
        try:
            plugin = self.pluginFactory.loadObject(classname=pluginName,
                                                   args=config)
            self.plugins[pluginName] = plugin
        except Exception as ex:
            msg = "Error loading plugin %s on path %s\n" % (pluginName,
                                                            pluginPath)
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            raise RetryManagerException(msg)

    return
def testC_Job(self):
    """
    WMComponent_t.RetryManager_t.RetryManager_t:testJob()

    Mimics creation of component and test jobs failed in create stage.

    Drives jobs into JobCooloff and checks the retry algorithm releases
    them only after the configured cooloff has passed (50s: still held,
    150s: released back to Created).
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), self.nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # 50s in cooloff: below threshold, nothing released
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 50)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), self.nJobs)

    # 150s in cooloff: above threshold, all jobs return to Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 150)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs)

    return
def testZ_Profile(self):
    """
    _testProfile_

    Do a full profile of the poller.

    NOTE: deliberately disabled by the early return below.
    NOTE(review): with the cProfile.runctx call commented out, the
    pstats.Stats('profStats.stat') line would fail unless a stats file
    already exists -- confirm before re-enabling.
    """
    return
    import cProfile, pstats

    nJobs = 1000

    testJobGroup = self.createTestJobGroup(nJobs=nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)

    startTime = time.time()
    #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
    testErrorHandler.algorithm()
    stopTime = time.time()

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='CreateCooloff')
    self.assertEqual(len(idList), nJobs)

    print("Took %f seconds to run polling algo" % (stopTime - startTime))

    p = pstats.Stats('profStats.stat')
    p.sort_stats('cumulative')
    p.print_stats(0.2)

    return
def __init__(self, config):
    """
    Initialise class members.

    Sets up the ChangeState helper, BossAir API, WMBS job-listing DAO and
    alert framework for the JobTracker poller.

    :param config: WMAgent configuration object.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=config)
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobTracker")
def thrashCouch():
    """
    _thrashCouch_

    Stress-test couch by recording 500 rounds of job state transitions
    (new -> created -> executing -> complete -> success -> cleanout),
    attaching a pickled framework job report for the success transition.
    """
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    changeState = ChangeState(config)

    # Load a canned framework job report to attach to the success records
    myReport = Report()
    myReport.unpersist(
        os.path.join(
            getWMBASE(),
            "test/python/WMComponent_t/JobAccountant_t/fwjrs/LoadTest00.pkl"))

    # FIX: the original pre-initialised a "jobs" dict of state sets that was
    # dead code -- it was overwritten by createJobs() on the first iteration.
    for _ in range(500):
        jobs = createJobs()
        changeState.recordInCouch(jobs, "created", "new")
        changeState.recordInCouch(jobs, "executing", "created")
        changeState.recordInCouch(jobs, "complete", "executing")

        for job in jobs:
            job["fwjr"] = myReport

        changeState.recordInCouch(jobs, "success", "complete")

        for job in jobs:
            job["fwjr"] = None

        changeState.recordInCouch(jobs, "cleanout", "success")
        #time.sleep(10)
    return
def testG_monitoringDAO(self):
    """
    _monitoringDAO_

    Because I need a test for the monitoring DAO.

    NOTE: deliberately disabled by the early return below; when enabled it
    submits ten TestPlugin jobs and checks baAPI.monitor() reports them.
    """
    return

    myThread = threading.currentThread()

    config = self.getConfig()

    changeState = ChangeState(config)

    baAPI = BossAirAPI(config=config)

    # Create some jobs
    nJobs = 10

    jobDummies = self.createDummyJobs(nJobs=nJobs)

    # Prior to building the job, each job must have a plugin
    # and user assigned
    for job in jobDummies:
        job['plugin'] = 'TestPlugin'
        job['owner'] = 'tapas'
        job['location'] = 'T2_US_UCSD'
        job.save()

    baAPI.submit(jobs=jobDummies)

    results = baAPI.monitor()

    self.assertEqual(len(results), nJobs)
    for job in results:
        self.assertEqual(job['plugin'], 'CondorPlugin')

    return
def __init__(self, config):
    """
    Initialise class members.

    Sets up the WMBS DAOs and component parameters for the ErrorHandler
    poller; the retry policy source depends on whether the agent runs in
    Tier0 mode.

    :param config: WMAgent configuration; reads the ErrorHandler, General
                   and ACDC sections.
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.changeState = ChangeState(self.config)

    if hasattr(self.config, "Tier0Feeder"):
        # Tier0 agents have no ReqMgr aux DB
        self.reqAuxDB = None
        self.maxRetries = self.config.ErrorHandler.maxRetries
    else:
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    self.exitCodesNoRetry = []
    self.maxProcessSize = getattr(self.config.ErrorHandler,
                                  'maxProcessSize', 250)
    self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                               32 * 3600)
    self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
    self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
    self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
    self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

    self.dataCollection = DataCollectionService(
        url=config.ACDC.couchurl, database=config.ACDC.database)

    self.setupComponentParam()

    return
def setUp(self):
    """
    Set up the couchapp test environment.

    Installs the WMBS schema, creates the workload-summary, jobs, fwjrs
    and summary-stats couch databases, connects to them, and builds the
    ChangeState helper plus a working directory.
    """
    myThread = threading.currentThread()

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(customModules=["WMCore.WMBS"],
                            useDefault=False)
    self.databaseName = "couchapp_t_0"

    # Setup config for couch connections
    config = self.testInit.getConfiguration()
    self.testInit.setupCouch(self.databaseName, "WorkloadSummary")
    self.testInit.setupCouch(
        "%s/jobs" % config.JobStateMachine.couchDBName, "JobDump")
    self.testInit.setupCouch(
        "%s/fwjrs" % config.JobStateMachine.couchDBName, "FWJRDump")
    self.testInit.setupCouch(config.JobStateMachine.summaryStatsDBName,
                             "SummaryStats")

    # Create couch server and connect to databases
    self.couchdb = CouchServer(config.JobStateMachine.couchurl)
    self.jobsdatabase = self.couchdb.connectDatabase(
        "%s/jobs" % config.JobStateMachine.couchDBName)
    self.fwjrdatabase = self.couchdb.connectDatabase(
        "%s/fwjrs" % config.JobStateMachine.couchDBName)
    self.statsumdatabase = self.couchdb.connectDatabase(
        config.JobStateMachine.summaryStatsDBName)

    # Create changeState
    self.changeState = ChangeState(config)
    self.config = config

    # Create testDir
    self.testDir = self.testInit.generateWorkDir()

    return
def testC_Jobs(self):
    """
    WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

    Mimics creation of component and test jobs failed in execute stage.

    One ErrorHandler pass should move every JobFailed job into JobCooloff.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), self.nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test,
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), self.nJobs)
    return
def __init__(self, **configDict):
    """
    init jobCreator

    Build a minimal JobStateMachine configuration from keyword arguments
    and set up the splitter factory, work area and ChangeState helper.

    :param configDict: must contain couchURL, defaultRetries, couchDBName,
                       jobCacheDir and defaultJobType; fileLoadLimit is
                       optional (default 500).
    """
    myThread = threading.currentThread()

    self.transaction = myThread.transaction

    #DAO factory for WMBS objects
    self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                 logger = logging,
                                 dbinterface = myThread.dbi)

    # WMCore splitter factory for splitting up jobs.
    self.splitterFactory = SplitterFactory()

    # Build an in-memory config carrying only the JobStateMachine section
    config = Configuration()
    config.section_("JobStateMachine")
    config.JobStateMachine.couchurl = configDict["couchURL"]
    config.JobStateMachine.couch_retries = configDict["defaultRetries"]
    config.JobStateMachine.couchDBName = configDict["couchDBName"]

    self.config = config

    #Variables
    self.jobCacheDir = configDict['jobCacheDir']
    self.defaultJobType = configDict['defaultJobType']
    self.limit = configDict.get('fileLoadLimit', 500)

    self.createWorkArea = CreateWorkArea()

    self.changeState = ChangeState(self.config)

    return
def testB_PluginTest(self): """ _PluginTest_ Now check that these functions worked if called through plugins Instead of directly. There are only three plugin """ #return myThread = threading.currentThread() config = self.getConfig() baAPI = BossAirAPI(config=config) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu') changeState = ChangeState(config) changeState.propagate(jobDummies, 'created', 'new') changeState.propagate(jobDummies, 'executing', 'created') # Prior to building the job, each job must have a plugin # and user assigned for job in jobDummies: job['plugin'] = 'TestPlugin' job['owner'] = 'tapas' baAPI.submit(jobs=jobDummies) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), nJobs) # Should be no more running jobs runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), nJobs) # Test Plugin should complete all jobs baAPI.track() # Should be no more running jobs runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), 0) # Check if they're complete completeJobs = baAPI.getComplete() self.assertEqual(len(completeJobs), nJobs) # Do this test because BossAir is specifically built # to keep it from finding completed jobs result = myThread.dbi.processData( "SELECT id FROM bl_runjob")[0].fetchall() self.assertEqual(len(result), nJobs) baAPI.removeComplete(jobs=jobDummies) result = myThread.dbi.processData( "SELECT id FROM bl_runjob")[0].fetchall() self.assertEqual(len(result), 0) return
def testY_MultipleIterations(self):
    """
    _MultipleIterations_

    Paranoia based check to see if I'm saving class instances correctly.
    Runs the RetryManager over two successive job groups with the same
    poller instance.
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    changer = ChangeState(config)

    # Walk the jobs through the state machine.  State names are
    # lowercase ('created', not 'Created') to match the transition
    # table, and the jobs pass through 'created' before failing —
    # same sequence the other tests in this suite use.
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
    changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), self.nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Below the cooloff threshold: nothing should move
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 50)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), self.nJobs)

    # Beyond the threshold: every job should be resubmitted (Created)
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 150)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs)

    # Make a new jobGroup for a second run
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    # Set job state (again via 'created' first)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
    changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

    # Set them to go off
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"],
                                stateTime=int(time.time()) - 200)

    testRetryManager.algorithm(None)

    idList = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(idList), 0)

    # Both groups' jobs are now in Created, hence nJobs * 2
    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs * 2)

    return
def testI_MultipleJobTypes(self):
    """
    _testI_MultipleJobTypes_

    Check that we can configure different retry algorithms for different
    job types, including a default for nonspecified types.
    Also check that two job types can share the same retry algorithm
    but with different parameters
    """
    # Let's create 4 job groups
    processingJobGroup = self.createTestJobGroup(nJobs=10, retryOnce=True)
    productionJobGroup = self.createTestJobGroup(nJobs=15,
                                                 subType="Production",
                                                 retryOnce=True)
    mergeJobGroup = self.createTestJobGroup(nJobs=20, subType="Merge",
                                            retryOnce=True)
    skimJobGroup = self.createTestJobGroup(nJobs=5, subType="Skim",
                                           retryOnce=True)

    # Set an adequate config
    # Processing jobs get the PauseAlgo with pauseCount 4
    # Production jobs get the ExponentialAlgo
    # Merge jobs get the PauseAlgo but with pauseCount 2 which is the default
    # Skim jobs are not configured, so they get the default SquaredAlgo
    config = self.getConfig()
    config.RetryManager.plugins = {'Processing': 'PauseAlgo',
                                   'Production': 'ExponentialAlgo',
                                   'Merge': 'PauseAlgo',
                                   'default': 'SquaredAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 30,
                                                            'submit': 30,
                                                            'job': 30}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 4
    config.RetryManager.PauseAlgo.section_("default")
    config.RetryManager.PauseAlgo.default.coolOffTime = {'create': 60,
                                                         'submit': 60,
                                                         'job': 60}
    config.RetryManager.PauseAlgo.default.pauseCount = 2
    config.RetryManager.section_("ExponentialAlgo")
    config.RetryManager.ExponentialAlgo.section_("Production")
    config.RetryManager.ExponentialAlgo.Production.coolOffTime = {'create': 30,
                                                                  'submit': 30,
                                                                  'job': 30}
    config.RetryManager.ExponentialAlgo.section_("default")
    config.RetryManager.ExponentialAlgo.default.coolOffTime = {'create': 60,
                                                               'submit': 60,
                                                               'job': 60}
    config.RetryManager.section_("SquaredAlgo")
    config.RetryManager.SquaredAlgo.section_("Skim")
    config.RetryManager.SquaredAlgo.Skim.coolOffTime = {'create': 30,
                                                        'submit': 30,
                                                        'job': 30}
    config.RetryManager.SquaredAlgo.section_("default")
    config.RetryManager.SquaredAlgo.default.coolOffTime = {'create': 60,
                                                           'submit': 60,
                                                           'job': 60}

    # Start the state changer and RetryManager
    changer = ChangeState(config)
    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Create the jobs for the first time
    changer.propagate(processingJobGroup.jobs, 'created', 'new')

    # Let's start with the processing jobs and the pauseAlgo
    # NOTE(review): the 30 * count**2 state-time offsets below suggest a
    # quadratic cooloff with base 30s per retry — confirm against PauseAlgo
    for count in range(1, 5):
        # Fail the jobs
        changer.propagate(processingJobGroup.jobs, 'executing', 'created')
        changer.propagate(processingJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(processingJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Check that the cooloff time is strictly enforced
        # First a job time just below the cooloff time
        for job in processingJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    30 * pow(count, 2) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(processingJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # Now above the cooloff time
        for job in processingJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    30 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)

        # Make sure the jobs get created again or go to paused
        # (pauseCount is 4 for Processing, so the 4th retry pauses)
        if count < 4:
            idList = self.getJobs.execute(state='created')
        else:
            idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), len(processingJobGroup.jobs),
                         "Jobs didn't change state correctly")

    # Unpause them so they don't interfere with subsequent tests
    changer.propagate(processingJobGroup.jobs, 'created', 'jobpaused')
    changer.propagate(processingJobGroup.jobs, 'executing', 'created')

    # Now the production jobs and the exponential algo
    # (offsets use pow(30, count): exponential growth per retry)
    changer.propagate(productionJobGroup.jobs, 'created', 'new')
    for count in range(1, 3):
        changer.propagate(productionJobGroup.jobs, 'executing', 'created')
        changer.propagate(productionJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(productionJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Just below the exponential cooloff: jobs must stay in cooloff
        for job in productionJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    pow(30, count) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(productionJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # Just above it: jobs must be recreated
        for job in productionJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    pow(30, count) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), len(productionJobGroup.jobs),
                         "Jobs didn't change state correctly")

    # Send them to executing
    changer.propagate(productionJobGroup.jobs, 'executing', 'created')

    # Now the merge jobs and the paused algo with different parameters
    # (default PauseAlgo section: 60s base cooloff, pauseCount 2)
    changer.propagate(mergeJobGroup.jobs, 'created', 'new')
    for count in range(1, 3):
        changer.propagate(mergeJobGroup.jobs, 'executing', 'created')
        changer.propagate(mergeJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(mergeJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # 30s-based offset is still below the 60s-based cooloff
        for job in mergeJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    30 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # 60s-based offset clears the cooloff
        for job in mergeJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    60 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)

        # With pauseCount 2, the 2nd retry pauses instead of recreating
        if count < 2:
            idList = self.getJobs.execute(state='created')
        else:
            idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                         "Jobs didn't change state correctly")

    # Send them to executing
    changer.propagate(mergeJobGroup.jobs, 'created', 'jobpaused')
    changer.propagate(mergeJobGroup.jobs, 'executing', 'created')

    # Now the skim jobs and the squared algo
    # (Skim is unlisted in plugins, so it falls through to SquaredAlgo)
    changer.propagate(skimJobGroup.jobs, 'created', 'new')
    for count in range(1, 3):
        changer.propagate(skimJobGroup.jobs, 'executing', 'created')
        changer.propagate(skimJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(skimJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Just below the squared cooloff: jobs must stay in cooloff
        for job in skimJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    30 * pow(count, 2) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(skimJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # Just above it: jobs must be recreated
        for job in skimJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) -
                                    30 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), len(skimJobGroup.jobs),
                         "Jobs didn't change state correctly")