def testY_MultipleIterations(self):
    """
    _MultipleIterations_

    Paranoia based check to see if I'm saving class instances correctly.
    Runs two consecutive cooloff/retry cycles through the same
    RetryManagerPoller instance and checks the job counts after each pass.
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    # Fix: previous-state names passed to propagate() are lowercase
    # everywhere else in this suite (the second iteration below already
    # used 'created'); the original passed "Created" here.
    changer.propagate(testJobGroup.jobs, "submitfailed", "created")
    changer.propagate(testJobGroup.jobs, "submitcooloff", "submitfailed")

    idList = self.getJobs.execute(state="SubmitCooloff")
    self.assertEqual(len(idList), self.nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Below the cooloff threshold: nothing should be retried yet
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 50)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state="SubmitCooloff")
    self.assertEqual(len(idList), self.nJobs)

    # Past the cooloff threshold: all jobs should go back to Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 150)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state="SubmitCooloff")
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state="Created")
    self.assertEqual(len(idList), self.nJobs)

    # Make a new jobGroup for a second run
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    # Set job state
    changer.propagate(testJobGroup.jobs, "submitfailed", "created")
    changer.propagate(testJobGroup.jobs, "submitcooloff", "submitfailed")

    # Set them to go off
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 200)

    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state="SubmitCooloff")
    self.assertEqual(len(idList), 0)

    # Both job groups should now be in Created
    idList = self.getJobs.execute(state="Created")
    self.assertEqual(len(idList), self.nJobs * 2)

    return
def testZ_Profile(self):
    """
    _Profile_

    Do a basic profiling of the algo
    """
    # NOTE(review): this early return disables the test; everything below
    # is dead code kept for manual profiling runs. Also note that the
    # cProfile.runctx call further down is commented out, so
    # 'profStats.stat' is never written and pstats.Stats() would raise if
    # this were re-enabled as-is.
    return

    import cProfile, pstats

    nJobs = 1000

    testJobGroup = self.createTestJobGroup(nJobs = nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
    changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

    idList = self.getJobs.execute(state = 'CreateCooloff')
    self.assertEqual(len(idList), nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Below the cooloff threshold: nothing should move
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 50)

    testRetryManager.algorithm(None)

    idList = self.getJobs.execute(state = 'CreateCooloff')
    self.assertEqual(len(idList), nJobs)

    # Past the cooloff threshold: time the polling pass
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 150)

    startTime = time.time()
    #cProfile.runctx("testRetryManager.algorithm()", globals(), locals(), filename = "profStats.stat")
    testRetryManager.algorithm(None)
    stopTime = time.time()

    idList = self.getJobs.execute(state = 'CreateCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state = 'New')
    self.assertEqual(len(idList), nJobs)

    print("Took %f seconds to run polling algo" % (stopTime - startTime))

    p = pstats.Stats('profStats.stat')
    p.sort_stats('cumulative')
    p.print_stats(0.2)

    return
def preInitialization(self):
    """
    Initializes plugins for different messages
    """
    # Register the RetryManager poller with the current thread's worker
    # manager so it runs on the configured polling cadence.
    pollInterval = self.config.RetryManager.pollInterval
    logging.info("Setting poll interval to %s seconds", pollInterval)
    poller = RetryManagerPoller(self.config)
    threading.currentThread().workerThreadManager.addWorker(poller, pollInterval)
def testF_LinearAlgo(self):
    """
    _testLinearAlgo_

    Test the linear algorithm to make sure it loads and works
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    config.RetryManager.plugins = {'Processing': 'LinearAlgo'}
    config.RetryManager.section_("LinearAlgo")
    config.RetryManager.LinearAlgo.section_("Processing")
    config.RetryManager.LinearAlgo.Processing.coolOffTime = {'create': 10, 'submit': 10, 'job': 10}

    stateChanger = ChangeState(config)
    # Drive the jobs through two consecutive submit-failure cycles
    for newState, oldState in [('created', 'new'),
                               ('submitfailed', 'created'),
                               ('submitcooloff', 'submitfailed'),
                               ('created', 'submitcooloff'),
                               ('submitfailed', 'created'),
                               ('submitcooloff', 'submitfailed')]:
        stateChanger.propagate(testJobGroup.jobs, newState, oldState)

    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), self.nJobs)

    retryPoller = RetryManagerPoller(config)
    retryPoller.setup(None)

    # Not enough time has elapsed: jobs must stay in cooloff
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 5)
    retryPoller.algorithm(None)

    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), self.nJobs)

    # Past the linear cooloff: jobs must be released back to Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 12)
    retryPoller.algorithm(None)

    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), 0)

    jobIds = self.getJobs.execute(state='Created')
    self.assertEqual(len(jobIds), self.nJobs)
    return
def testZ_Profile(self):
    """
    _Profile_

    Do a basic profiling of the algo
    """
    # NOTE(review): duplicate of the other testZ_Profile in this file.
    # The early return disables the test; the body below is dead code kept
    # for manual profiling. The cProfile.runctx call is commented out, so
    # 'profStats.stat' is never written and pstats.Stats() would raise if
    # this were re-enabled as-is.
    return

    import cProfile, pstats

    nJobs = 1000

    testJobGroup = self.createTestJobGroup(nJobs = nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
    changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

    idList = self.getJobs.execute(state = 'CreateCooloff')
    self.assertEqual(len(idList), nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Below the cooloff threshold: nothing should move
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 50)

    testRetryManager.algorithm(None)

    idList = self.getJobs.execute(state = 'CreateCooloff')
    self.assertEqual(len(idList), nJobs)

    # Past the cooloff threshold: time the polling pass
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 150)

    startTime = time.time()
    #cProfile.runctx("testRetryManager.algorithm()", globals(), locals(), filename = "profStats.stat")
    testRetryManager.algorithm(None)
    stopTime = time.time()

    idList = self.getJobs.execute(state = 'CreateCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state = 'New')
    self.assertEqual(len(idList), nJobs)

    print("Took %f seconds to run polling algo" % (stopTime - startTime))

    p = pstats.Stats('profStats.stat')
    p.sort_stats('cumulative')
    p.print_stats(0.2)

    return
def testF_LinearAlgo(self):
    """
    _testLinearAlgo_

    Test the linear algorithm to make sure it loads and works
    """
    jobGroup = self.createTestJobGroup(nJobs = self.nJobs)

    config = self.getConfig()
    config.RetryManager.plugins = {'Processing' : 'LinearAlgo'}
    config.RetryManager.section_("LinearAlgo")
    config.RetryManager.LinearAlgo.section_("Processing")
    config.RetryManager.LinearAlgo.Processing.coolOffTime = {'create': 10, 'submit': 10, 'job': 10}

    changer = ChangeState(config)
    changer.propagate(jobGroup.jobs, 'created', 'new')
    # First submit failure cycle
    changer.propagate(jobGroup.jobs, 'submitfailed', 'created')
    changer.propagate(jobGroup.jobs, 'submitcooloff', 'submitfailed')
    # Retry once, then fail the submission again
    changer.propagate(jobGroup.jobs, 'created', 'submitcooloff')
    changer.propagate(jobGroup.jobs, 'submitfailed', 'created')
    changer.propagate(jobGroup.jobs, 'submitcooloff', 'submitfailed')

    self.assertEqual(len(self.getJobs.execute(state = 'SubmitCooloff')), self.nJobs)

    retryManager = RetryManagerPoller(config)
    retryManager.setup(None)

    def ageJobs(seconds):
        # Backdate every job's state timestamp by the given number of seconds
        cutoff = int(time.time()) - seconds
        for job in jobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = cutoff)

    # Too early: the linear cooloff has not expired yet
    ageJobs(5)
    retryManager.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state = 'SubmitCooloff')), self.nJobs)

    # Cooloff expired: the jobs go back to Created
    ageJobs(12)
    retryManager.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state = 'SubmitCooloff')), 0)
    self.assertEqual(len(self.getJobs.execute(state = 'Created')), self.nJobs)
    return
def testC_Job(self):
    """
    WMComponent_t.RetryManager_t.RetryManager_t:testJob()

    Mimics creation of component and test jobs failed in create stage.
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    # Walk the jobs through a full run that ends in a job-stage failure
    transitions = [('created', 'new'),
                   ('executing', 'created'),
                   ('complete', 'executing'),
                   ('jobfailed', 'complete'),
                   ('jobcooloff', 'jobfailed')]
    for newState, oldState in transitions:
        changer.propagate(testJobGroup.jobs, newState, oldState)

    self.assertEqual(len(self.getJobs.execute(state='JobCooloff')), self.nJobs)

    poller = RetryManagerPoller(config)
    poller.setup(None)

    # Still inside the cooloff window: nothing moves
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 50)
    poller.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state='JobCooloff')), self.nJobs)

    # Cooloff expired: every job is retried back into Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 150)
    poller.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state='JobCooloff')), 0)
    self.assertEqual(len(self.getJobs.execute(state='Created')), self.nJobs)
    return
def testD_SquaredAlgo(self):
    """
    _testSquaredAlgo_

    Test the squared algorithm to make sure it loads and works
    """
    jobGroup = self.createTestJobGroup(nJobs = self.nJobs)

    config = self.getConfig()
    config.RetryManager.pluginName = 'SquaredAlgo'
    config.RetryManager.coolOffTime = {'create': 10, 'submit': 10, 'job': 10}

    changer = ChangeState(config)
    # Two consecutive submit failures so the squared backoff kicks in
    for newState, oldState in (('created', 'new'),
                               ('submitfailed', 'created'),
                               ('submitcooloff', 'submitfailed'),
                               ('created', 'submitcooloff'),
                               ('submitfailed', 'created'),
                               ('submitcooloff', 'submitfailed')):
        changer.propagate(jobGroup.jobs, newState, oldState)

    idList = self.getJobs.execute(state = 'SubmitCooloff')
    self.assertEqual(len(idList), self.nJobs)

    retryManager = RetryManagerPoller(config)
    retryManager.setup(None)

    # Below the squared cooloff threshold: jobs stay put
    for job in jobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 5)
    retryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'SubmitCooloff')
    self.assertEqual(len(idList), self.nJobs)

    # Past the threshold: jobs return to Created
    for job in jobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 12)
    retryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'SubmitCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state = 'Created')
    self.assertEqual(len(idList), self.nJobs)
def testE_ExponentialAlgo(self):
    """
    _testExponentialAlgo_

    Test the exponential algorithm to make sure it loads and works
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    config.RetryManager.plugins = {"Processing": "ExponentialAlgo"}
    config.RetryManager.section_("ExponentialAlgo")
    config.RetryManager.ExponentialAlgo.section_("Processing")
    config.RetryManager.ExponentialAlgo.Processing.coolOffTime = {"create": 10, "submit": 10, "job": 10}

    changer = ChangeState(config)
    # Two submit-failure cycles so the exponential backoff is exercised
    for pair in (("created", "new"),
                 ("submitfailed", "created"),
                 ("submitcooloff", "submitfailed"),
                 ("created", "submitcooloff"),
                 ("submitfailed", "created"),
                 ("submitcooloff", "submitfailed")):
        changer.propagate(testJobGroup.jobs, pair[0], pair[1])

    self.assertEqual(len(self.getJobs.execute(state="SubmitCooloff")), self.nJobs)

    retryPoller = RetryManagerPoller(config)
    retryPoller.setup(None)

    # Not yet past the cooloff: jobs remain in SubmitCooloff
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 5)
    retryPoller.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state="SubmitCooloff")), self.nJobs)

    # Past the cooloff: all jobs retried back to Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 12)
    retryPoller.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state="SubmitCooloff")), 0)
    self.assertEqual(len(self.getJobs.execute(state="Created")), self.nJobs)
def testC_Job(self):
    """
    WMComponent_t.RetryManager_t.RetryManager_t:testJob()

    Mimics creation of component and test jobs failed in create stage.
    """
    jobGroup = self.createTestJobGroup(nJobs = self.nJobs)

    config = self.getConfig()
    config.RetryManager.pluginName = 'DefaultRetryAlgo'

    changer = ChangeState(config)
    # Run the jobs through a complete lifecycle ending in a job failure
    changer.propagate(jobGroup.jobs, 'created', 'new')
    changer.propagate(jobGroup.jobs, 'executing', 'created')
    changer.propagate(jobGroup.jobs, 'complete', 'executing')
    changer.propagate(jobGroup.jobs, 'jobfailed', 'complete')
    changer.propagate(jobGroup.jobs, 'jobcooloff', 'jobfailed')

    cooloffIds = self.getJobs.execute(state = 'JobCooloff')
    self.assertEqual(len(cooloffIds), self.nJobs)

    retryManager = RetryManagerPoller(config)
    retryManager.setup(None)

    # Still cooling off: nothing should be retried
    for job in jobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 50)
    retryManager.algorithm(None)
    cooloffIds = self.getJobs.execute(state = 'JobCooloff')
    self.assertEqual(len(cooloffIds), self.nJobs)

    # Cooloff over: every job is handed back to Created
    for job in jobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 150)
    retryManager.algorithm(None)
    cooloffIds = self.getJobs.execute(state = 'JobCooloff')
    self.assertEqual(len(cooloffIds), 0)

    createdIds = self.getJobs.execute(state = 'Created')
    self.assertEqual(len(createdIds), self.nJobs)
    return
def testC_Job(self):
    """
    WMComponent_t.RetryManager_t.RetryManager_t:testJob()

    Mimics creation of component and test jobs failed in create stage.
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    changer = ChangeState(config)
    # Full lifecycle ending with a job-stage failure and a cooloff
    lifecycle = (("created", "new"),
                 ("executing", "created"),
                 ("complete", "executing"),
                 ("jobfailed", "complete"),
                 ("jobcooloff", "jobfailed"))
    for newState, oldState in lifecycle:
        changer.propagate(testJobGroup.jobs, newState, oldState)

    self.assertEqual(len(self.getJobs.execute(state="JobCooloff")), self.nJobs)

    retryPoller = RetryManagerPoller(config)
    retryPoller.setup(None)

    # Inside the cooloff window: jobs stay in JobCooloff
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 50)
    retryPoller.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state="JobCooloff")), self.nJobs)

    # Beyond the cooloff window: jobs are retried back into Created
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 150)
    retryPoller.algorithm(None)
    self.assertEqual(len(self.getJobs.execute(state="JobCooloff")), 0)
    self.assertEqual(len(self.getJobs.execute(state="Created")), self.nJobs)
    return
def testH_PauseAlgo(self):
    """
    _testH_PauseAlgo_

    Exercise the PauseAlgo plugin (legacy config style): jobs are retried
    through cooloff until the configured pauseCount is exhausted, after
    which they are paused and must be resumed manually.
    """
    testJobGroup = self.createTestJobGroup(nJobs = self.nJobs)

    config = self.getConfig()
    config.RetryManager.pluginName = 'PauseAlgo'
    config.RetryManager.coolOffTime = {'create': 20, 'submit': 20, 'job': 20}
    config.RetryManager.pauseCount = 2

    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Making sure that jobs are in jobcooloff state (the original comment
    # said "submitcooloff", which did not match the check below)
    idList = self.getJobs.execute(state = 'JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 20)

    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Make sure that no change happens after timeout
    idList = self.getJobs.execute(state = 'JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 400)

    # The pause count is now exhausted: the jobs must be paused
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # emulating ops retrying the job
    # Fix: the jobs were just asserted to be in 'jobpaused', so the
    # previous state given to propagate() must be 'jobpaused', not the
    # original 'createpaused' (a copy/paste slip from the create-stage
    # variant of this test; the newer version of this test uses
    # 'jobpaused' here).
    changer.propagate(testJobGroup.jobs, 'new', 'jobpaused')

    # Making sure it did the right thing
    idList = self.getJobs.execute(state = 'new')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 8000)

    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 160000)

    # After the pause count is exhausted again the jobs must be paused
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'jobpaused')
    self.assertEqual(len(idList), self.nJobs)
    return
def testH_PauseAlgo(self):
    """
    _testH_PauseAlgo_

    Test the pause algorithm, note that given pauseCount = n, the
    job will run first n + 1 times before being paused.
    After that it will be paused each n times
    """
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    # PauseAlgo for Processing jobs: 20 s cooloff per stage, pause every
    # 2 retries
    config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 20, 'submit': 20, 'job': 20}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 2

    changer = ChangeState(config)

    # One full failure/retry cycle plus a second failure, so the jobs sit
    # in jobcooloff with one retry already on record.
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
    changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Making sure that jobs are not created ahead of time
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 15)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 25)

    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Make sure that no change happens before timeout
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 75)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be paused
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)

    # Make sure that the plugin pauses them
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # Emulating ops retrying the job
    changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

    # Making sure it did the right thing
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Not yet past the (longer) cooloff for this retry count
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 175)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 185)

    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Still inside the cooloff window for this retry count
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 315)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobcooloff')
    self.assertEqual(len(idList), self.nJobs)

    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 325)

    # Past the cooloff and the pause count: the jobs are paused again
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    return
def testI_MultipleJobTypes(self):
    """
    _testI_MultipleJobTypes_

    Check that we can configure different retry algorithms for different
    job types, including a default for nonspecified types.
    Also check that two job types can share the same retry algorithm but
    with different parameters
    """
    #Let's create 4 job groups
    processingJobGroup = self.createTestJobGroup(nJobs = 10, retryOnce = True)
    productionJobGroup = self.createTestJobGroup(nJobs = 15, subType = "Production", retryOnce = True)
    mergeJobGroup = self.createTestJobGroup(nJobs = 20, subType = "Merge", retryOnce = True)
    skimJobGroup = self.createTestJobGroup(nJobs = 5, subType = "Skim", retryOnce = True)

    #Set an adequate config
    #Processing jobs get the PauseAlgo with pauseCount 4
    #Production jobs get the ExponentialAlgo
    #Merge jobs get the PauseAlgo but with pauseCount 2 which is the default
    #Skim jobs are not configured, so they get the default SquaredAlgo
    config = self.getConfig()
    config.RetryManager.plugins = {'Processing' : 'PauseAlgo', 'Production' : 'ExponentialAlgo', 'Merge' : 'PauseAlgo', 'default' : 'SquaredAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 30, 'submit': 30, 'job': 30}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 4
    config.RetryManager.PauseAlgo.section_("default")
    config.RetryManager.PauseAlgo.default.coolOffTime = {'create': 60, 'submit': 60, 'job': 60}
    config.RetryManager.PauseAlgo.default.pauseCount = 2
    config.RetryManager.section_("ExponentialAlgo")
    config.RetryManager.ExponentialAlgo.section_("Production")
    config.RetryManager.ExponentialAlgo.Production.coolOffTime = {'create': 30, 'submit': 30, 'job': 30}
    config.RetryManager.ExponentialAlgo.section_("default")
    config.RetryManager.ExponentialAlgo.default.coolOffTime = {'create': 60, 'submit': 60, 'job': 60}
    config.RetryManager.section_("SquaredAlgo")
    config.RetryManager.SquaredAlgo.section_("Skim")
    config.RetryManager.SquaredAlgo.Skim.coolOffTime = {'create': 30, 'submit': 30, 'job': 30}
    config.RetryManager.SquaredAlgo.section_("default")
    config.RetryManager.SquaredAlgo.default.coolOffTime = {'create': 60, 'submit': 60, 'job': 60}

    #Start the state changer and RetryManager
    changer = ChangeState(config)
    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    #Create the jobs for the first time
    changer.propagate(processingJobGroup.jobs, 'created', 'new')

    # Let's start with the processing jobs and the pauseAlgo
    # (PauseAlgo cooloff grows as 30 * count^2 with the retry count)
    for count in range(1,5):
        #Fail the jobs
        changer.propagate(processingJobGroup.jobs, 'executing', 'created')
        changer.propagate(processingJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(processingJobGroup.jobs, 'jobcooloff', 'jobfailed')

        #Check that the cooloff time is strictly enforced
        #First a job time just below the cooloff time
        for job in processingJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 30*pow(count,2) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'JobCoolOff')
        self.assertEqual(len(idList), len(processingJobGroup.jobs), "Jobs went into cooloff without the proper timing")

        #Now above the cooloff time
        for job in processingJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 30*pow(count,2) - 5)
        testRetryManager.algorithm(None)

        #Make sure the jobs get created again or go to paused
        if count < 4:
            idList = self.getJobs.execute(state = 'created')
        else:
            idList = self.getJobs.execute(state = 'jobpaused')
        self.assertEqual(len(idList), len(processingJobGroup.jobs), "Jobs didn't change state correctly")

    #Unpause them so they don't interfere with subsequent tests
    changer.propagate(processingJobGroup.jobs, 'created', 'jobpaused')
    changer.propagate(processingJobGroup.jobs, 'executing', 'created')

    #Now the production jobs and the exponential algo
    # (ExponentialAlgo cooloff grows as 30^count with the retry count)
    changer.propagate(productionJobGroup.jobs, 'created', 'new')
    for count in range(1,3):
        changer.propagate(productionJobGroup.jobs, 'executing', 'created')
        changer.propagate(productionJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(productionJobGroup.jobs, 'jobcooloff', 'jobfailed')
        # Just below the exponential cooloff: no retry yet
        for job in productionJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - pow(30,count) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'JobCoolOff')
        self.assertEqual(len(idList), len(productionJobGroup.jobs), "Jobs went into cooloff without the proper timing")
        # Just above the exponential cooloff: the jobs are retried
        for job in productionJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - pow(30,count) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'created')
        self.assertEqual(len(idList), len(productionJobGroup.jobs), "Jobs didn't change state correctly")

    #Send them to executing
    changer.propagate(productionJobGroup.jobs, 'executing', 'created')

    #Now the merge jobs and the paused algo with different parameters
    # (Merge uses the PauseAlgo "default" section: 60 s base, pauseCount 2)
    changer.propagate(mergeJobGroup.jobs, 'created', 'new')
    for count in range(1,3):
        changer.propagate(mergeJobGroup.jobs, 'executing', 'created')
        changer.propagate(mergeJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(mergeJobGroup.jobs, 'jobcooloff', 'jobfailed')
        # Below the 60 s-based cooloff (only the 30 s-based age): no change
        for job in mergeJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 30*pow(count,2) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'JobCoolOff')
        self.assertEqual(len(idList), len(mergeJobGroup.jobs), "Jobs went into cooloff without the proper timing")
        # Above the 60 s-based cooloff: retried, or paused on the 2nd pass
        for job in mergeJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 60*pow(count,2) - 5)
        testRetryManager.algorithm(None)
        if count < 2:
            idList = self.getJobs.execute(state = 'created')
        else:
            idList = self.getJobs.execute(state = 'jobpaused')
        self.assertEqual(len(idList), len(mergeJobGroup.jobs), "Jobs didn't change state correctly")

    #Send them to executing
    changer.propagate(mergeJobGroup.jobs, 'created', 'jobpaused')
    changer.propagate(mergeJobGroup.jobs, 'executing', 'created')

    #Now the skim jobs and the squared algo
    # (Skim uses SquaredAlgo.Skim: cooloff grows as 30 * count^2)
    changer.propagate(skimJobGroup.jobs, 'created', 'new')
    for count in range(1,3):
        changer.propagate(skimJobGroup.jobs, 'executing', 'created')
        changer.propagate(skimJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(skimJobGroup.jobs, 'jobcooloff', 'jobfailed')
        # Just below the squared cooloff: no retry yet
        for job in skimJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 30*pow(count,2) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'JobCoolOff')
        self.assertEqual(len(idList), len(skimJobGroup.jobs), "Jobs went into cooloff without the proper timing")
        # Just above the squared cooloff: the jobs are retried
        for job in skimJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 30*pow(count,2) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'created')
        self.assertEqual(len(idList), len(skimJobGroup.jobs), "Jobs didn't change state correctly")
def testH_PauseAlgo(self):
    """
    _testH_PauseAlgo_

    Test the pause algorithm, note that given pauseCount = n, the
    job will run first n + 1 times before being paused.
    After that it will be paused each n times
    """
    testJobGroup = self.createTestJobGroup(nJobs = self.nJobs)

    config = self.getConfig()
    # PauseAlgo for Processing jobs: 20 s cooloff per stage, pause every
    # 2 retries
    config.RetryManager.plugins = {'Processing' : 'PauseAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 20, 'submit': 20, 'job': 20}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 2

    changer = ChangeState(config)

    # One full failure/retry cycle plus a second failure, so the jobs sit
    # in jobcooloff with one retry already on record.
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
    changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Making sure that jobs are not created ahead of time
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 15)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 25)

    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Make sure that no change happens before timeout
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 75)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be paused
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 85)

    # Make sure that the plugin pauses them
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # Emulating ops retrying the job
    changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

    # Making sure it did the right thing
    idList = self.getJobs.execute(state = 'created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Not yet past the (longer) cooloff for this retry count
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 175)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 185)

    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Still inside the cooloff window for this retry count
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 315)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'jobcooloff')
    self.assertEqual(len(idList), self.nJobs)

    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID = job["id"], stateTime = int(time.time()) - 325)

    # Past the cooloff and the pause count: the jobs are paused again
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state = 'jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    return
def testI_MultipleJobTypes(self):
    """
    _testI_MultipleJobTypes_

    Check that we can configure different retry algorithms for different
    job types, including a default for nonspecified types.
    Also check that two job types can share the same retry algorithm but
    with different parameters.
    """
    # Let's create 4 job groups, one per job sub-type under test
    processingJobGroup = self.createTestJobGroup(nJobs=10, retryOnce=True)
    productionJobGroup = self.createTestJobGroup(nJobs=15, subType="Production", retryOnce=True)
    mergeJobGroup = self.createTestJobGroup(nJobs=20, subType="Merge", retryOnce=True)
    skimJobGroup = self.createTestJobGroup(nJobs=5, subType="Skim", retryOnce=True)

    # Set an adequate config:
    # - Processing jobs get the PauseAlgo with pauseCount 4
    # - Production jobs get the ExponentialAlgo
    # - Merge jobs get the PauseAlgo but with pauseCount 2, which is the default
    # - Skim jobs are not listed in the plugins map, so they fall back to the
    #   default plugin, SquaredAlgo
    config = self.getConfig()
    config.RetryManager.plugins = {'Processing': 'PauseAlgo',
                                   'Production': 'ExponentialAlgo',
                                   'Merge': 'PauseAlgo',
                                   'default': 'SquaredAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 30, 'submit': 30, 'job': 30}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 4
    config.RetryManager.PauseAlgo.section_("default")
    config.RetryManager.PauseAlgo.default.coolOffTime = {'create': 60, 'submit': 60, 'job': 60}
    config.RetryManager.PauseAlgo.default.pauseCount = 2
    config.RetryManager.section_("ExponentialAlgo")
    config.RetryManager.ExponentialAlgo.section_("Production")
    config.RetryManager.ExponentialAlgo.Production.coolOffTime = {'create': 30, 'submit': 30, 'job': 30}
    config.RetryManager.ExponentialAlgo.section_("default")
    config.RetryManager.ExponentialAlgo.default.coolOffTime = {'create': 60, 'submit': 60, 'job': 60}
    config.RetryManager.section_("SquaredAlgo")
    config.RetryManager.SquaredAlgo.section_("Skim")
    config.RetryManager.SquaredAlgo.Skim.coolOffTime = {'create': 30, 'submit': 30, 'job': 30}
    config.RetryManager.SquaredAlgo.section_("default")
    config.RetryManager.SquaredAlgo.default.coolOffTime = {'create': 60, 'submit': 60, 'job': 60}

    # Start the state changer and RetryManager
    changer = ChangeState(config)
    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    # Create the jobs for the first time
    changer.propagate(processingJobGroup.jobs, 'created', 'new')

    # Let's start with the processing jobs and the pauseAlgo.
    # With pauseCount 4, retries 1-3 go back to 'created' and the 4th pauses.
    for count in range(1, 5):
        # Fail the jobs
        changer.propagate(processingJobGroup.jobs, 'executing', 'created')
        changer.propagate(processingJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(processingJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Check that the cooloff time is strictly enforced.
        # First a job time just below the cooloff time (5 s short of 30*count^2)
        for job in processingJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 30 * pow(count, 2) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(processingJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # Now above the cooloff time
        for job in processingJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 30 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)

        # Make sure the jobs get created again, or go to paused on the 4th pass
        if count < 4:
            idList = self.getJobs.execute(state='created')
        else:
            idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), len(processingJobGroup.jobs),
                         "Jobs didn't change state correctly")

    # Unpause them so they don't interfere with subsequent tests
    changer.propagate(processingJobGroup.jobs, 'created', 'jobpaused')
    changer.propagate(processingJobGroup.jobs, 'executing', 'created')

    # Now the production jobs and the exponential algo (cooloff grows as 30^count)
    changer.propagate(productionJobGroup.jobs, 'created', 'new')
    for count in range(1, 3):
        changer.propagate(productionJobGroup.jobs, 'executing', 'created')
        changer.propagate(productionJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(productionJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Just below the exponential cooloff: nothing may move
        for job in productionJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - pow(30, count) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(productionJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # Just above: jobs are released back to created
        for job in productionJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - pow(30, count) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), len(productionJobGroup.jobs),
                         "Jobs didn't change state correctly")

    # Send them to executing
    changer.propagate(productionJobGroup.jobs, 'executing', 'created')

    # Now the merge jobs and the paused algo with different parameters:
    # they use the PauseAlgo 'default' section (60 s base, pauseCount 2),
    # so the 2nd pass pauses them
    changer.propagate(mergeJobGroup.jobs, 'created', 'new')
    for count in range(1, 3):
        changer.propagate(mergeJobGroup.jobs, 'executing', 'created')
        changer.propagate(mergeJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(mergeJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # 30*count^2 is still below the 60-second-based cooloff: nothing moves
        for job in mergeJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 30 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # 60*count^2 is past the cooloff: released, or paused on the 2nd pass
        for job in mergeJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 60 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)
        if count < 2:
            idList = self.getJobs.execute(state='created')
        else:
            idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                         "Jobs didn't change state correctly")

    # Send them to executing
    changer.propagate(mergeJobGroup.jobs, 'created', 'jobpaused')
    changer.propagate(mergeJobGroup.jobs, 'executing', 'created')

    # Now the skim jobs and the squared algo (default plugin; its Skim
    # section sets a 30 s base, cooloff 30*count^2)
    changer.propagate(skimJobGroup.jobs, 'created', 'new')
    for count in range(1, 3):
        changer.propagate(skimJobGroup.jobs, 'executing', 'created')
        changer.propagate(skimJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(skimJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Just below the squared cooloff: nothing moves
        for job in skimJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 30 * pow(count, 2) + 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), len(skimJobGroup.jobs),
                         "Jobs went into cooloff without the proper timing")

        # Just above: released back to created
        for job in skimJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 30 * pow(count, 2) - 5)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), len(skimJobGroup.jobs),
                         "Jobs didn't change state correctly")
def testG_ProcessingAlgo(self):
    """
    _ProcessingAlgo_

    Test for the ProcessingAlgo Prototype
    """
    # NOTE(review): this method is defined a second time later in this file;
    # the later definition shadows this one at class-creation time.
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)
    config = self.getConfig()
    # Route Processing jobs to the ProcessingAlgo plugin with a short cooloff
    config.RetryManager.plugins = {'Processing': 'ProcessingAlgo'}
    config.RetryManager.section_("ProcessingAlgo")
    config.RetryManager.ProcessingAlgo.section_("default")
    config.RetryManager.ProcessingAlgo.default.coolOffTime = {'create': 10, 'submit': 10, 'job': 10}
    changer = ChangeState(config)

    # Attach a canned framework job report to every job and persist it in
    # the job cache dir under the per-retry naming scheme
    fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                            "WMComponent_t/JobAccountant_t",
                            "fwjrs/badBackfillJobReport.pkl")
    report = Report()
    report.load(fwjrPath)
    for job in testJobGroup.jobs:
        job['fwjr'] = report
        job['retry_count'] = 0
        report.save(os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count']))

    # First failure cycle into cooloff
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.algorithm()

    # The poller releases all jobs back to Created
    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them a second time
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Each job should now carry retry_count 1; save a report for that retry
    for job in testJobGroup.jobs:
        j = Job(id=job['id'])
        j.load()
        self.assertEqual(j['retry_count'], 1)
        report.save(os.path.join(j['cache_dir'], "Report.%i.pkl" % j['retry_count']))

    # With exit code 8020 listed in OneMoreErrorCodes, the algo should bump
    # the jobs to retry_count 5 in one pass
    config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = [8020]
    testRetryManager2 = RetryManagerPoller(config)
    testRetryManager2.algorithm()

    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs)
    for job in testJobGroup.jobs:
        j = Job(id=job['id'])
        j.load()
        # NOTE(review): 5 appears to be the plugin's terminal retry count --
        # confirm against the ProcessingAlgo implementation
        self.assertEqual(j['retry_count'], 5)

    # Now test timeout
    testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

    # Cycle jobs
    for job in testJobGroup2.jobs:
        job['fwjr'] = report
        job['retry_count'] = 0
        report.save(os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count']))
    changer.propagate(testJobGroup2.jobs, 'created', 'new')
    changer.propagate(testJobGroup2.jobs, 'executing', 'created')
    changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')
    for job in testJobGroup2.jobs:
        j = Job(id=job['id'])
        j.load()
        self.assertEqual(j['retry_count'], 0)

    # With MaxRunTime exceeded, the algo should also push the second group
    # to retry_count 5
    config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = []
    config.RetryManager.ProcessingAlgo.default.MaxRunTime = 1
    testRetryManager3 = RetryManagerPoller(config)
    testRetryManager3.algorithm()

    # Both job groups are now back in Created
    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs * 2)
    for job in testJobGroup2.jobs:
        j = Job(id=job['id'])
        j.load()
        self.assertEqual(j['retry_count'], 5)
    return
def testSubmitPaused(self):
    """
    _testSubmitPaused_

    Exercise the PauseAlgo on the submit cool-off path: once the pause
    count is reached the jobs must land in 'submitpaused', an operator
    retry sends them back to 'created', and the cycle repeats.
    """
    jobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    config.RetryManager.pluginName = 'PauseAlgo'
    config.RetryManager.coolOffTime = {'create': 20, 'submit': 20, 'job': 20}
    config.RetryManager.pauseCount = 2

    stateChanger = ChangeState(config)

    def ageJobs(seconds):
        # Backdate every job's state timestamp so the poller sees the
        # cool-off window as expired (or not, if `seconds` is too small).
        for testJob in jobGroup.jobs:
            self.setJobTime.execute(jobID=testJob["id"],
                                    stateTime=int(time.time()) - seconds)

    def failSubmission():
        # Drive the whole group through a submit failure into cool-off.
        stateChanger.propagate(jobGroup.jobs, 'submitfailed', 'created')
        stateChanger.propagate(jobGroup.jobs, 'submitcooloff', 'submitfailed')

    stateChanger.propagate(jobGroup.jobs, 'created', 'new')
    failSubmission()

    # Making sure that jobs are in submitcooloff state
    jobIds = self.getJobs.execute(state='SubmitCoolOff')
    self.assertEqual(len(jobIds), self.nJobs)

    retryPoller = RetryManagerPoller(config)
    retryPoller.setup(None)

    # First failure: the cool-off has expired, so the plugin must release
    # the jobs back to created
    ageJobs(20)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='Created')
    self.assertEqual(len(jobIds), self.nJobs)

    # Second failure: pauseCount reached, so the jobs end up in submitpaused
    failSubmission()
    ageJobs(400)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='submitpaused')
    self.assertEqual(len(jobIds), self.nJobs)

    # Emulating ops retrying the paused jobs
    stateChanger.propagate(jobGroup.jobs, 'created', 'submitpaused')

    # Next failure after the manual resume: released back to created again
    failSubmission()
    ageJobs(8000)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='Created')
    self.assertEqual(len(jobIds), self.nJobs)

    # One more failure cycle ends in the paused state once more
    failSubmission()
    ageJobs(160000)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='submitpaused')
    self.assertEqual(len(jobIds), self.nJobs)
    return
def testH_PauseAlgo(self):
    """
    _testH_PauseAlgo_

    Test the pause algorithm, note that given pauseCount = n, the job
    will run first n + 1 times before being paused.
    After that it will be paused each n times
    """
    # NOTE(review): this method is defined a second time later in this file;
    # the later definition shadows this one at class-creation time.
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    # adding a 2nd job group
    testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 20, 'submit': 20, 'job': 20}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 2

    changer = ChangeState(config)

    # Run the first group through one failure/retry cycle and into a
    # second cooloff before the poller is started
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
    changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    report = Report()

    # Making sure that jobs are not created ahead of time
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 15)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 25)
    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Make sure that no change happens before timeout
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 75)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be paused
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)
    # Make sure that the plugin pauses them
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # Emulating ops retrying the job
    changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')
    # Making sure it did the right thing
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    # NOTE(review): the 'jobfailed' transition is issued twice back-to-back
    # here in the original code; verify whether that is intentional
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Still inside the cooloff window: nothing may move
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 175)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 185)
    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # At -315 seconds they must still be in cooloff
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 315)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobcooloff')
    self.assertEqual(len(idList), self.nJobs)

    # At -325 seconds the second pause cycle kicks in
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 325)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # a configurable retry count per job exit code {jobExitCodeA: pauseCountB}
    config.RetryManager.PauseAlgo.Processing.retryErrorCodes = {8020: 1, 12345: 1, 5555: 2}

    testRetryManager2 = RetryManagerPoller(config)
    testRetryManager2.algorithm()

    # Attach the canned framework job report to the 2nd group and persist
    # it under the per-retry naming scheme
    fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                            "WMComponent_t/JobAccountant_t",
                            "fwjrs/badBackfillJobReport.pkl")
    report.load(fwjrPath)
    for job in testJobGroup2.jobs:
        job['fwjr'] = report
        job['retry_count'] = 0
        report.save(os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count']))

    # fail the jobs
    changer.propagate(testJobGroup2.jobs, 'created', 'new')
    changer.propagate(testJobGroup2.jobs, 'executing', 'created')
    changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

    # Giving time so they can be paused
    for job in testJobGroup2.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)
    # Make sure that the plugin sent those jobs to the next state:
    testRetryManager2.algorithm()

    # job exit code is 8020, so it is supposed to be retried one time.
    # Meaning, that here we should have 10 jobs (from the first part of
    # the test) in jobpaused and 10 jobs in created state
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)
    idList2 = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList2), self.nJobs)

    # save a second job report - with a retry count = 1
    for job in testJobGroup2.jobs:
        j = Job(id=job['id'])
        j.load()
        # NOTE(review): retry_count is assigned immediately before being
        # asserted, so this assertEqual is a tautology -- it does not check
        # the value loaded from the database
        j['retry_count'] = 1
        self.assertEqual(j['retry_count'], 1)
        report.save(os.path.join(j['cache_dir'], "Report.%i.pkl" % j['retry_count']))

    # Fail them out again
    changer.propagate(testJobGroup2.jobs, 'executing', 'created')
    changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')
    for job in testJobGroup2.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 175)

    # not sure if this check is needed:
    idList = self.getJobs.execute(state='jobcooloff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be paused
    for job in testJobGroup2.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)
    # Make sure that the plugin sent those jobs to paused state:
    testRetryManager2.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    # And again, in total, there should be 10+10=20 jobs in jobpaused
    self.assertEqual(len(idList), self.nJobs * 2)
    return
def testG_ProcessingAlgo(self):
    """
    _ProcessingAlgo_

    Test for the ProcessingAlgo Prototype
    """
    # NOTE(review): this method is defined twice in this file; this later
    # definition shadows the earlier copy -- consider removing one of them.
    testJobGroup = self.createTestJobGroup(nJobs = self.nJobs)
    config = self.getConfig()
    # Route Processing jobs to the ProcessingAlgo plugin with a short cooloff
    config.RetryManager.plugins = {'Processing' : 'ProcessingAlgo'}
    config.RetryManager.section_("ProcessingAlgo")
    config.RetryManager.ProcessingAlgo.section_("default")
    config.RetryManager.ProcessingAlgo.default.coolOffTime = {'create': 10, 'submit': 10, 'job': 10}
    changer = ChangeState(config)

    # Attach a canned framework job report to every job and persist it in
    # the job cache dir under the per-retry naming scheme
    fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                            "WMComponent_t/JobAccountant_t",
                            "fwjrs/badBackfillJobReport.pkl")
    report = Report()
    report.load(fwjrPath)
    for job in testJobGroup.jobs:
        job['fwjr'] = report
        job['retry_count'] = 0
        report.save(os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count']))

    # First failure cycle into cooloff
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.algorithm()

    # The poller releases all jobs back to Created
    idList = self.getJobs.execute(state = 'Created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them a second time
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Each job should now carry retry_count 1; save a report for that retry
    for job in testJobGroup.jobs:
        j = Job(id = job['id'])
        j.load()
        self.assertEqual(j['retry_count'], 1)
        report.save(os.path.join(j['cache_dir'], "Report.%i.pkl" % j['retry_count']))

    # With exit code 8020 listed in OneMoreErrorCodes, the algo should bump
    # the jobs to retry_count 5 in one pass
    config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = [8020]
    testRetryManager2 = RetryManagerPoller(config)
    testRetryManager2.algorithm()

    idList = self.getJobs.execute(state = 'Created')
    self.assertEqual(len(idList), self.nJobs)
    for job in testJobGroup.jobs:
        j = Job(id = job['id'])
        j.load()
        # NOTE(review): 5 appears to be the plugin's terminal retry count --
        # confirm against the ProcessingAlgo implementation
        self.assertEqual(j['retry_count'], 5)

    # Now test timeout
    testJobGroup2 = self.createTestJobGroup(nJobs = self.nJobs)

    # Cycle jobs
    for job in testJobGroup2.jobs:
        job['fwjr'] = report
        job['retry_count'] = 0
        report.save(os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count']))
    changer.propagate(testJobGroup2.jobs, 'created', 'new')
    changer.propagate(testJobGroup2.jobs, 'executing', 'created')
    changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')
    for job in testJobGroup2.jobs:
        j = Job(id = job['id'])
        j.load()
        self.assertEqual(j['retry_count'], 0)

    # With MaxRunTime exceeded, the algo should also push the second group
    # to retry_count 5
    config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = []
    config.RetryManager.ProcessingAlgo.default.MaxRunTime = 1
    testRetryManager3 = RetryManagerPoller(config)
    testRetryManager3.algorithm()

    # Both job groups are now back in Created
    idList = self.getJobs.execute(state = 'Created')
    self.assertEqual(len(idList), self.nJobs * 2)
    for job in testJobGroup2.jobs:
        j = Job(id = job['id'])
        j.load()
        self.assertEqual(j['retry_count'], 5)
    return
def testY_MultipleIterations(self):
    """
    _MultipleIterations_

    Paranoia based check to see if I'm saving class instances correctly
    """
    # NOTE: this test is defined more than once in this file; the last
    # definition is the one unittest actually runs.
    jobGroup = self.createTestJobGroup(nJobs=self.nJobs)
    config = self.getConfig()
    stateChanger = ChangeState(config)

    def ageJobs(group, seconds):
        # Backdate each job's state-change time by `seconds`
        for testJob in group.jobs:
            self.setJobTime.execute(jobID=testJob["id"],
                                    stateTime=int(time.time()) - seconds)

    stateChanger.propagate(jobGroup.jobs, 'submitfailed', 'Created')
    stateChanger.propagate(jobGroup.jobs, 'submitcooloff', 'submitfailed')

    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), self.nJobs)

    retryPoller = RetryManagerPoller(config)
    retryPoller.setup(None)

    # Below the cool-off threshold: nothing may move
    ageJobs(jobGroup, 50)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), self.nJobs)

    # Past the threshold: everything goes back to Created
    ageJobs(jobGroup, 150)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), 0)
    jobIds = self.getJobs.execute(state='Created')
    self.assertEqual(len(jobIds), self.nJobs)

    # Second iteration with a brand-new job group, reusing the SAME poller
    # instance to verify its internal state survives across runs
    jobGroup = self.createTestJobGroup(nJobs=self.nJobs)
    stateChanger.propagate(jobGroup.jobs, 'submitfailed', 'created')
    stateChanger.propagate(jobGroup.jobs, 'submitcooloff', 'submitfailed')

    ageJobs(jobGroup, 200)
    retryPoller.algorithm(None)
    jobIds = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(jobIds), 0)
    jobIds = self.getJobs.execute(state='Created')
    self.assertEqual(len(jobIds), self.nJobs * 2)
    return
def testH_PauseAlgo(self):
    """
    _testH_PauseAlgo_

    Test the pause algorithm, note that given pauseCount = n, the job
    will run first n + 1 times before being paused.
    After that it will be paused each n times
    """
    # NOTE(review): this is the second of two definitions of this method in
    # this file; it shadows the earlier copy -- consider removing one.
    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

    # adding a 2nd job group
    testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

    config = self.getConfig()
    config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
    config.RetryManager.section_("PauseAlgo")
    config.RetryManager.PauseAlgo.section_("Processing")
    config.RetryManager.PauseAlgo.Processing.coolOffTime = {'create': 20, 'submit': 20, 'job': 20}
    config.RetryManager.PauseAlgo.Processing.pauseCount = 2

    changer = ChangeState(config)

    # Run the first group through one failure/retry cycle and into a
    # second cooloff before the poller is started
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
    changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    testRetryManager = RetryManagerPoller(config)
    testRetryManager.setup(None)

    report = Report()

    # Making sure that jobs are not created ahead of time
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 15)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 25)
    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Make sure that no change happens before timeout
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 75)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be paused
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)
    # Make sure that the plugin pauses them
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # Emulating ops retrying the job
    changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')
    # Making sure it did the right thing
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    # NOTE(review): the 'jobfailed' transition is issued twice back-to-back
    # here in the original code; verify whether that is intentional
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # Still inside the cooloff window: nothing may move
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 175)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='JobCoolOff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be retried
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 185)
    # Make sure that the plugin allowed them to go back to created state
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)

    # Fail them out again
    changer.propagate(testJobGroup.jobs, 'executing', 'created')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

    # At -315 seconds they must still be in cooloff
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 315)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobcooloff')
    self.assertEqual(len(idList), self.nJobs)

    # At -325 seconds the second pause cycle kicks in
    for job in testJobGroup.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 325)
    testRetryManager.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList), self.nJobs)

    # a configurable retry count per job exit code {jobExitCodeA: pauseCountB}
    config.RetryManager.PauseAlgo.Processing.retryErrorCodes = {8020: 1, 12345: 1, 5555: 2}

    testRetryManager2 = RetryManagerPoller(config)
    testRetryManager2.algorithm()

    # Attach the canned framework job report to the 2nd group and persist
    # it under the per-retry naming scheme
    fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                            "WMComponent_t/JobAccountant_t",
                            "fwjrs/badBackfillJobReport.pkl")
    report.load(fwjrPath)
    for job in testJobGroup2.jobs:
        job['fwjr'] = report
        job['retry_count'] = 0
        report.save(os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count']))

    # fail the jobs
    changer.propagate(testJobGroup2.jobs, 'created', 'new')
    changer.propagate(testJobGroup2.jobs, 'executing', 'created')
    changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

    # Giving time so they can be paused
    for job in testJobGroup2.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)
    # Make sure that the plugin sent those jobs to the next state:
    testRetryManager2.algorithm()

    # job exit code is 8020, so it is supposed to be retried one time.
    # Meaning, that here we should have 10 jobs (from the first part of
    # the test) in jobpaused and 10 jobs in created state
    idList = self.getJobs.execute(state='created')
    self.assertEqual(len(idList), self.nJobs)
    idList2 = self.getJobs.execute(state='jobpaused')
    self.assertEqual(len(idList2), self.nJobs)

    # save a second job report - with a retry count = 1
    for job in testJobGroup2.jobs:
        j = Job(id=job['id'])
        j.load()
        # NOTE(review): retry_count is assigned immediately before being
        # asserted, so this assertEqual is a tautology -- it does not check
        # the value loaded from the database
        j['retry_count'] = 1
        self.assertEqual(j['retry_count'], 1)
        report.save(os.path.join(j['cache_dir'], "Report.%i.pkl" % j['retry_count']))

    # Fail them out again
    changer.propagate(testJobGroup2.jobs, 'executing', 'created')
    changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
    changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
    changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')
    for job in testJobGroup2.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 175)

    # not sure if this check is needed:
    idList = self.getJobs.execute(state='jobcooloff')
    self.assertEqual(len(idList), self.nJobs)

    # Giving time so they can be paused
    for job in testJobGroup2.jobs:
        self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 85)
    # Make sure that the plugin sent those jobs to paused state:
    testRetryManager2.algorithm(None)
    idList = self.getJobs.execute(state='jobpaused')
    # And again, in total, there should be 10+10=20 jobs in jobpaused
    self.assertEqual(len(idList), self.nJobs * 2)
    return