def testC_Jobs(self):
    """
    _testC_Jobs_

    Drive a group of jobs through created -> executing -> complete ->
    jobfailed, run one ErrorHandlerPoller cycle, and check that no job is
    left in JobFailed and that all of them are reported as JobCooloff.
    """
    workloadName = "TestWorkload"
    # The object returned by createWorkload is not needed by this test.
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, "workloadTest", workloadName,
                                "WMSandbox", "WMWorkload.pkl")

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    changer = ChangeState(config)
    # (target state, previous state) pairs, applied in this order.
    transitions = (("created", "new"),
                   ("executing", "created"),
                   ("complete", "executing"),
                   ("jobfailed", "complete"))
    for newState, oldState in transitions:
        changer.propagate(testJobGroup.jobs, newState, oldState)

    # Precondition: every job is in JobFailed before the poller runs.
    idList = self.getJobs.execute(state="JobFailed")
    self.assertEqual(len(idList), self.nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state="JobFailed")
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state="JobCooloff")
    self.assertEqual(len(idList), self.nJobs)
    return
def testC_Jobs(self):
    """
    _testC_Jobs_

    Jobs that failed in the execute stage: the job group is moved step by
    step into 'jobfailed', the ErrorHandlerPoller is run once, and the test
    expects JobFailed to be empty and JobCooloff to hold every job.
    """
    workloadName = 'TestWorkload'
    # Return value intentionally ignored; nothing below uses it.
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    changer = ChangeState(config)
    # Each entry is (target state, previous state); order matters.
    for newState, oldState in (('created', 'new'),
                               ('executing', 'created'),
                               ('complete', 'executing'),
                               ('jobfailed', 'complete')):
        changer.propagate(testJobGroup.jobs, newState, oldState)

    # All jobs must be visible as JobFailed before the poller is run.
    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), self.nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), self.nJobs)
    return
def testA_Create(self):
    """
    _testA_Create_

    Jobs that failed in the create stage: the job group is moved to
    'createfailed', the ErrorHandlerPoller is run once, and the test expects
    the jobs to leave CreateFailed and land in Exhausted. It then inspects
    the ACDC data collection for the workload and checks the recorded files.
    """
    workloadName = 'TestWorkload'
    # The object returned by createWorkload is not needed by this test.
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           workloadPath=workloadPath,
                                           workloadName=workloadName)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), self.nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), 0)

    # These should go directly to exhausted
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # Check that it showed up in ACDC
    collection = self.dataCS.getDataCollection(workloadName)

    # Now look at what's inside
    self.assertTrue(len(collection['filesets']) > 0)
    for fileset in collection["filesets"]:
        counter = 0
        for counter, f in enumerate(fileset.listFiles(), 1):
            self.assertIn(f['lfn'], ["/this/is/a/lfnA", "/this/is/a/lfnB"])
            self.assertEqual(f['events'], 10)
            self.assertEqual(f['size'], 1024)
            self.assertEqual(f['parents'], [u'/this/is/a/parent'])
            self.assertIn(f['runs'][0]['lumis'],
                          [[12312], [12314, 12315, 12316]],
                          "Unknown lumi %s" % f['runs'][0]['lumis'])
            # NOTE(review): these two are truthiness checks only. The
            # second positional argument of assertTrue is the failure
            # message, so passing 1 / 88 there never compared anything.
            # The intended values look like merged == 1 and
            # first_event == 88; switch to assertEqual once confirmed
            # against the fixture data.
            self.assertTrue(f['merged'],
                            "File %s has a false 'merged' value" % f['lfn'])
            self.assertTrue(f['first_event'],
                            "File %s has a false 'first_event' value" % f['lfn'])
        self.assertEqual(counter, 20)
    return
def testZ_Profile(self):
    """
    _testZ_Profile_

    Profile one ErrorHandlerPoller cycle over 100 jobs that failed in the
    execute stage. The cycle is run under cProfile, the resulting job states
    are checked, and the wall-clock time plus the top of the cumulative
    profile are printed.
    """
    nJobs = 100
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    pathParts = ('workloadTest', workloadName, 'WMSandbox', 'WMWorkload.pkl')
    workloadPath = os.path.join(self.testDir, *pathParts)

    jobGroup = self.createTestJobGroup(nJobs=nJobs, workloadPath=workloadPath)

    config = self.getConfig()
    stateChanger = ChangeState(config)
    # (target state, previous state), applied top to bottom
    for target, previous in [('created', 'new'),
                             ('executing', 'created'),
                             ('complete', 'executing'),
                             ('jobfailed', 'complete')]:
        stateChanger.propagate(jobGroup.jobs, target, previous)

    self.assertEqual(len(self.getJobs.execute(state='JobFailed')), nJobs)

    # The local must keep the name 'testErrorHandler': the statement string
    # handed to cProfile.runctx below resolves it through locals().
    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)

    began = time.time()
    cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(),
                    filename="profStats.stat")
    ended = time.time()

    # Nothing may remain in either failed state; everything is in cooloff.
    for state, expected in (('CreateFailed', 0),
                            ('JobFailed', 0),
                            ('JobCooloff', nJobs)):
        self.assertEqual(len(self.getJobs.execute(state=state)), expected)

    print("Took %f seconds to run polling algo" % (ended - began))

    stats = pstats.Stats('profStats.stat')
    stats.sort_stats('cumulative')
    stats.print_stats(0.2)

    return
def testD_Exhausted(self):
    """
    _testD_Exhausted_

    Test that the system can exhaust jobs correctly: jobs created with
    retry_count=5 are failed while config.ErrorHandler.maxRetries is 1, so
    after one poller cycle they are expected in Exhausted, and the
    subscription's two Acquired files are expected to be Failed.
    """
    workloadName = 'TestWorkload'
    # The object returned by createWorkload is not needed by this test.
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           retry_count=5,
                                           workloadPath=workloadPath)

    config = self.getConfig()
    config.ErrorHandler.maxRetries = 1
    changer = ChangeState(config)
    # (target state, previous state) pairs, applied in this order.
    for newState, oldState in (('created', 'new'),
                               ('executing', 'created'),
                               ('complete', 'executing'),
                               ('jobfailed', 'complete')):
        changer.propagate(testJobGroup.jobs, newState, oldState)

    testSubscription = Subscription(id=1)  # You should only have one
    testSubscription.load()
    testSubscription.loadData()

    # Do we have files to start with?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)

    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # Did we fail the files?
    self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
    self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
def testB_Submit(self):
    """
    _testB_Submit_

    Jobs that failed in the submit stage: the job group goes
    'new' -> 'created' -> 'submitfailed', one ErrorHandlerPoller cycle is
    run, and the jobs are then expected in SubmitCooloff with none left in
    SubmitFailed.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    pathParts = ('workloadTest', workloadName, 'WMSandbox', 'WMWorkload.pkl')
    workloadPath = os.path.join(self.testDir, *pathParts)

    jobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                       workloadPath=workloadPath)

    config = self.getConfig()
    stateChanger = ChangeState(config)
    # (target state, previous state), applied top to bottom
    for target, previous in [('created', 'new'),
                             ('submitfailed', 'created')]:
        stateChanger.propagate(jobGroup.jobs, target, previous)

    # Before polling, every job is visible as SubmitFailed.
    submitFailed = self.getJobs.execute(state='SubmitFailed')
    self.assertEqual(len(submitFailed), self.nJobs)

    poller = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    poller.reqAuxDB = None
    poller.setup(None)
    poller.algorithm(None)

    submitFailed = self.getJobs.execute(state='SubmitFailed')
    self.assertEqual(len(submitFailed), 0)

    coolingOff = self.getJobs.execute(state='SubmitCooloff')
    self.assertEqual(len(coolingOff), self.nJobs)
    return
def preInitialization(self):
    """
    _preInitialization_

    Register an ErrorHandlerPoller, built from this component's config, with
    the worker thread manager attached to the current thread. The poll
    interval is read from config.ErrorHandler.pollInterval.
    """
    pollInterval = self.config.ErrorHandler.pollInterval
    # Let logging do the interpolation; the emitted message is unchanged.
    logging.info("Setting poll interval to %s seconds", pollInterval)

    # Add event loop to worker manager.
    # current_thread() replaces the deprecated currentThread() alias.
    myThread = threading.current_thread()
    myThread.workerThreadManager.addWorker(ErrorHandlerPoller(self.config),
                                           pollInterval)
def testA_Create(self):
    """
    _testA_Create_

    Jobs that failed in the create stage: after being moved to
    'createfailed' and processed by one ErrorHandlerPoller cycle, the jobs
    are expected to leave CreateFailed and be reported as Exhausted. The
    ACDC data collection for the workload is then checked file by file.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           workloadPath=workloadPath,
                                           workloadName=workloadName)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), self.nJobs)

    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), 0)

    # These should go directly to exhausted
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # Check that it showed up in ACDC
    collection = self.dataCS.getDataCollection(workloadName)

    # Now look at what's inside
    self.assertTrue(len(collection['filesets']) > 0)
    for fileset in collection["filesets"]:
        counter = 0
        for counter, f in enumerate(fileset.listFiles(), 1):
            self.assertIn(f['lfn'], ["/this/is/a/lfnA", "/this/is/a/lfnB"])
            self.assertEqual(f['events'], 10)
            self.assertEqual(f['size'], 1024)
            self.assertEqual(f['parents'], [u'/this/is/a/parent'])
            self.assertIn(f['runs'][0]['lumis'],
                          [[12312], [12314, 12315, 12316]],
                          "Unknown lumi %s" % f['runs'][0]['lumis'])
            # NOTE(review): truthiness checks only. assertTrue takes the
            # failure message as its second positional argument, so the
            # former 1 / 88 arguments were never compared with anything.
            # If merged == 1 and first_event == 88 are the intended
            # expectations, replace these with assertEqual after
            # confirming the fixture values.
            self.assertTrue(f['merged'],
                            "File %s has a false 'merged' value" % f['lfn'])
            self.assertTrue(f['first_event'],
                            "File %s has a false 'first_event' value" % f['lfn'])
        self.assertEqual(counter, 20)
    return
def testZ_Profile(self): """ _testProfile_ Do a full profile of the poller """ return import cProfile, pstats nJobs = 1000 testJobGroup = self.createTestJobGroup(nJobs = nJobs) config = self.getConfig() changer = ChangeState(config) changer.propagate(testJobGroup.jobs, 'createfailed', 'new') idList = self.getJobs.execute(state = 'CreateFailed') self.assertEqual(len(idList), nJobs) testErrorHandler = ErrorHandlerPoller(config) testErrorHandler.setup(None) startTime = time.time() #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat") testErrorHandler.algorithm() stopTime = time.time() idList = self.getJobs.execute(state = 'CreateFailed') self.assertEqual(len(idList), 0) idList = self.getJobs.execute(state = 'CreateCooloff') self.assertEqual(len(idList), nJobs) print("Took %f seconds to run polling algo" % (stopTime - startTime)) p = pstats.Stats('profStats.stat') p.sort_stats('cumulative') p.print_stats(0.2) return
def testC_Jobs(self):
    """
    _testC_Jobs_

    Jobs that failed in the execute stage: the job group is walked through
    created, executing, complete and jobfailed, one ErrorHandlerPoller cycle
    is run, and the jobs are then expected in JobCooloff with none left in
    JobFailed.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    pathParts = ('workloadTest', workloadName, 'WMSandbox', 'WMWorkload.pkl')
    workloadPath = os.path.join(self.testDir, *pathParts)

    jobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                       workloadPath=workloadPath)

    config = self.getConfig()
    stateChanger = ChangeState(config)
    # (target state, previous state), applied top to bottom
    for target, previous in [('created', 'new'),
                             ('executing', 'created'),
                             ('complete', 'executing'),
                             ('jobfailed', 'complete')]:
        stateChanger.propagate(jobGroup.jobs, target, previous)

    # Before polling, every job is visible as JobFailed.
    failedJobs = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(failedJobs), self.nJobs)

    poller = ErrorHandlerPoller(config)
    poller.setup(None)
    poller.algorithm(None)

    failedJobs = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(failedJobs), 0)

    coolingOff = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(coolingOff), self.nJobs)
    return
def testD_Exhausted(self):
    """
    _testD_Exhausted_

    Check job exhaustion: jobs are created with retry_count=5 while
    config.ErrorHandler.maxRetries is set to 1, then failed in the execute
    stage. After one poller cycle they are expected in Exhausted (not in
    JobFailed or JobCooloff), and the two files of subscription 1 are
    expected to have moved from Acquired to Failed.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    pathParts = ('workloadTest', workloadName, 'WMSandbox', 'WMWorkload.pkl')
    workloadPath = os.path.join(self.testDir, *pathParts)

    jobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                       retry_count=5,
                                       workloadPath=workloadPath)

    config = self.getConfig()
    config.ErrorHandler.maxRetries = 1
    stateChanger = ChangeState(config)
    # (target state, previous state), applied top to bottom
    for target, previous in [('created', 'new'),
                             ('executing', 'created'),
                             ('complete', 'executing'),
                             ('jobfailed', 'complete')]:
        stateChanger.propagate(jobGroup.jobs, target, previous)

    # Only one subscription is expected to exist
    subscription = Subscription(id=1)
    subscription.load()
    subscription.loadData()

    # The subscription starts out with files in Acquired
    acquired = subscription.filesOfStatus("Acquired")
    self.assertEqual(len(acquired), 2)

    poller = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    poller.reqAuxDB = None
    poller.setup(None)
    poller.algorithm(None)

    stillFailed = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(stillFailed), 0)

    coolingOff = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(coolingOff), 0)

    exhausted = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(exhausted), self.nJobs)

    # The files must now be Failed instead of Acquired
    acquired = subscription.filesOfStatus("Acquired")
    self.assertEqual(len(acquired), 0)
    failedFiles = subscription.filesOfStatus("Failed")
    self.assertEqual(len(failedFiles), 2)
def testZ_Profile(self): """ _testProfile_ Do a full profile of the poller """ return import cProfile, pstats nJobs = 1000 testJobGroup = self.createTestJobGroup(nJobs=nJobs) config = self.getConfig() changer = ChangeState(config) changer.propagate(testJobGroup.jobs, 'createfailed', 'new') idList = self.getJobs.execute(state='CreateFailed') self.assertEqual(len(idList), nJobs) testErrorHandler = ErrorHandlerPoller(config) testErrorHandler.setup(None) startTime = time.time() #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat") testErrorHandler.algorithm() stopTime = time.time() idList = self.getJobs.execute(state='CreateFailed') self.assertEqual(len(idList), 0) idList = self.getJobs.execute(state='CreateCooloff') self.assertEqual(len(idList), nJobs) print("Took %f seconds to run polling algo" % (stopTime - startTime)) p = pstats.Stats('profStats.stat') p.sort_stats('cumulative') p.print_stats(0.2) return
def testE_FailJobs(self):
    """
    _testE_FailJobs_

    Test our ability to fail jobs based on the information in the FWJR.

    The same job group is failed in the execute stage three times, each time
    with a different ErrorHandler configuration and a fresh poller:
      1. failureExitCodes = [8020]  -> jobs expected in Exhausted
      2. maxFailTime = -10          -> jobs expected in Exhausted
      3. passExitCodes = [8020]     -> jobs expected in Created
    """
    workloadName = 'TestWorkload'
    # The object returned by createWorkload is not needed by this test.
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')
    fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                            "WMComponent_t/JobAccountant_t",
                            "fwjrs/badBackfillJobReport.pkl")

    testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                           workloadPath=workloadPath,
                                           fwjrPath=fwjrPath)

    config = self.getConfig()
    config.ErrorHandler.readFWJR = True
    config.ErrorHandler.failureExitCodes = [8020]
    changer = ChangeState(config)

    def failInExecution():
        """Move the test jobs step by step into the 'jobfailed' state."""
        # (target state, previous state) pairs, applied in this order.
        for newState, oldState in (('created', 'new'),
                                   ('executing', 'created'),
                                   ('complete', 'executing'),
                                   ('jobfailed', 'complete')):
            changer.propagate(testJobGroup.jobs, newState, oldState)

    failInExecution()

    testErrorHandler = ErrorHandlerPoller(config)
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    # This should exhaust all jobs due to exit code
    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)
    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    # The second poller is built after the config change, so it is
    # constructed with the new settings.
    config.ErrorHandler.failureExitCodes = []
    config.ErrorHandler.maxFailTime = -10
    testErrorHandler2 = ErrorHandlerPoller(config)

    failInExecution()
    testErrorHandler2.algorithm(None)

    # This should exhaust all jobs due to timeout
    idList = self.getJobs.execute(state='JobFailed')
    self.assertEqual(len(idList), 0)
    idList = self.getJobs.execute(state='JobCooloff')
    self.assertEqual(len(idList), 0)
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), self.nJobs)

    config.ErrorHandler.maxFailTime = 24 * 3600
    config.ErrorHandler.passExitCodes = [8020]
    testErrorHandler3 = ErrorHandlerPoller(config)

    failInExecution()
    testErrorHandler3.algorithm(None)

    # With 8020 listed in passExitCodes the jobs are expected in Created
    idList = self.getJobs.execute(state='Created')
    self.assertEqual(len(idList), self.nJobs)

    return
def testE_FailJobs(self):
    """
    _testE_FailJobs_

    Fail jobs according to what their FWJR says.

    Two job groups are used: one whose jobs carry the
    badBackfillJobReport.pkl report, and a 'bad' one created with
    fwjrPath=None. Three pollers are run in sequence against different
    settings (exitCodesNoRetry = [8020], maxFailTime = -10,
    passExitCodes = [8020]) and the resulting job states are checked after
    each one.
    """
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    pathParts = ('workloadTest', workloadName, 'WMSandbox', 'WMWorkload.pkl')
    workloadPath = os.path.join(self.testDir, *pathParts)
    fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                            "WMComponent_t/JobAccountant_t",
                            "fwjrs/badBackfillJobReport.pkl")

    reportedGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                            workloadPath=workloadPath,
                                            fwjrPath=fwjrPath)
    unreportedGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                              workloadPath=workloadPath,
                                              fwjrPath=None,
                                              fileModifier='bad')

    config = self.getConfig()
    config.ErrorHandler.readFWJR = True
    stateChanger = ChangeState(config)

    # (target state, previous state), applied top to bottom
    failurePath = [('created', 'new'),
                   ('executing', 'created'),
                   ('complete', 'executing'),
                   ('jobfailed', 'complete')]

    def failInExecution(jobs):
        """Walk the given jobs along failurePath into 'jobfailed'."""
        for target, previous in failurePath:
            stateChanger.propagate(jobs, target, previous)

    def countIn(state):
        """Return how many job ids getJobs reports for the given state."""
        return len(self.getJobs.execute(state=state))

    failInExecution(reportedGroup.jobs)
    failInExecution(unreportedGroup.jobs)

    firstPoller = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    firstPoller.reqAuxDB = None
    firstPoller.setup(None)
    # assigned after setup(), same order as the checks rely on
    firstPoller.exitCodesNoRetry = [8020]
    firstPoller.algorithm(None)

    # Jobs with the report are expected to be exhausted because of the exit
    # code; the ones without a FWJR are expected in cooloff.
    self.assertEqual(countIn('JobFailed'), 0)
    self.assertEqual(countIn('JobCooloff'), self.nJobs)
    self.assertEqual(countIn('Exhausted'), self.nJobs)

    config.ErrorHandler.maxFailTime = -10
    secondPoller = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    secondPoller.reqAuxDB = None
    failInExecution(reportedGroup.jobs)
    secondPoller.algorithm(None)

    # The re-failed jobs are expected to be exhausted because of the
    # timeout; the cooloff count is unchanged.
    self.assertEqual(countIn('JobFailed'), 0)
    self.assertEqual(countIn('JobCooloff'), self.nJobs)
    self.assertEqual(countIn('Exhausted'), self.nJobs)

    config.ErrorHandler.maxFailTime = 24 * 3600
    config.ErrorHandler.passExitCodes = [8020]
    thirdPoller = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    thirdPoller.reqAuxDB = None
    failInExecution(reportedGroup.jobs)
    thirdPoller.algorithm(None)

    # This should pass all jobs due to exit code
    self.assertEqual(countIn('Created'), self.nJobs)

    return
def testA_Create(self):
    """
    _testA_Create_

    Jobs that failed in the create stage: four jobs are moved to
    'createfailed', one ErrorHandlerPoller cycle is run, and the jobs are
    expected to leave CreateFailed and be reported as Exhausted. The ACDC
    data collection for the workload is then checked file by file,
    including the run number and the lumis expected for each LFN.
    """
    njobs = 4
    workloadName = 'TestWorkload'
    self.createWorkload(workloadName=workloadName)
    workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                'WMSandbox', 'WMWorkload.pkl')

    testJobGroup = self.createTestJobGroup(nJobs=njobs,
                                           workloadPath=workloadPath,
                                           workloadName=workloadName)

    config = self.getConfig()
    changer = ChangeState(config)
    changer.propagate(testJobGroup.jobs, 'created', 'new')
    changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), njobs)

    testErrorHandler = ErrorHandlerPoller(config)
    # set reqAuxDB None for the test
    testErrorHandler.reqAuxDB = None
    testErrorHandler.setup(None)
    testErrorHandler.algorithm(None)

    idList = self.getJobs.execute(state='CreateFailed')
    self.assertEqual(len(idList), 0)

    # These should go directly to exhausted
    idList = self.getJobs.execute(state='Exhausted')
    self.assertEqual(len(idList), njobs)

    # unittest calls the order-insensitive comparison assertCountEqual on
    # Python 3 and assertItemsEqual on Python 2.7; use whichever exists.
    assertSameItems = getattr(self, 'assertCountEqual', None) or self.assertItemsEqual

    # Check that it showed up in ACDC
    collection = self.dataCS.getDataCollection(workloadName)

    # Now look at what's inside
    self.assertTrue(len(collection['filesets']) > 0)
    for fileset in collection["filesets"]:
        counter = 0
        for f in fileset.listFiles():
            counter += 1
            self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
            self.assertEqual(f['events'], 10)
            self.assertEqual(f['size'], 1024)
            self.assertEqual(f['parents'], [u'/this/is/a/parent'])
            # assertEqual reports both values on failure
            self.assertEqual(f['runs'][0]['run_number'], 10)
            if f['lfn'] == "/this/is/a/lfnA":
                assertSameItems(f['runs'][0]['lumis'], [12312])
            elif f['lfn'] == "/this/is/a/lfnB":
                assertSameItems(f['runs'][0]['lumis'], [12314, 12315, 12316])
            else:
                # TestCase has no assertFail(); fail() is the unittest API
                # for an unconditional failure.
                self.fail("File name is not known: %s" % f['lfn'])
            self.assertEqual(f['merged'], 0)
            self.assertEqual(f['first_event'], 88)

        # each job has 2 files (thus 4 times duplicate)
        self.assertEqual(counter, njobs * 2)
    return