Beispiel #1
0
    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Beispiel #2
0
    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = "TestWorkload"

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload", "WMSandbox", "WMWorkload.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "executing", "created")
        changer.propagate(testJobGroup.jobs, "complete", "executing")
        changer.propagate(testJobGroup.jobs, "jobfailed", "complete")

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), self.nJobs)
        return
Beispiel #3
0
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """

        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath,
                                               workloadName = workloadName)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        #These should go directly to exhausted
        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['lumis'] in [[12312], [12314, 12315, 12316]],
                                "Unknown lumi %s" % f['runs'][0]['lumis'])
                self.assertTrue(f['merged'], 1)
                self.assertTrue(f['first_event'], 88)
            self.assertEqual(counter, 20)
        return
Beispiel #4
0
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        nJobs = 100
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=nJobs, workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)

        startTime = time.time()
        cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename="profStats.stat")
        stopTime = time.time()

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
Beispiel #5
0
    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs, retry_count = 5,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id = 1) # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)


        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
Beispiel #6
0
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        startTime = time.time()
        #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
        testErrorHandler.algorithm()
        stopTime = time.time()

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)
        
        return
Beispiel #7
0
    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()

        Mimics creation of component and test jobs failed in submit stage.
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Beispiel #8
0
    def testE_FailJobs(self):
        """
        _FailJobs_

        Test our ability to fail jobs based on the information in the FWJR
        """
        workloadName = 'TestWorkload'
        
        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath,
                                               fwjrPath = fwjrPath)
        
        config = self.getConfig()
        config.ErrorHandler.readFWJR         = True
        config.ErrorHandler.failureExitCodes = [8020]
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        # This should exhaust all jobs due to exit code
        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.failureExitCodes = []
        config.ErrorHandler.maxFailTime      = -10
        testErrorHandler2 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler2.algorithm(None)

        # This should exhaust all jobs due to timeout
        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime      = 24 * 3600
        config.ErrorHandler.passExitCodes    = [8020]
        testErrorHandler3 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler3.algorithm(None)

        idList = self.getJobs.execute(state = 'Created')
        self.assertEqual(len(idList), self.nJobs)

        return
Beispiel #9
0
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """
        njobs = 4
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        #        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
        testJobGroup = self.createTestJobGroup(nJobs=njobs,
                                               workloadPath=workloadPath,
                                               workloadName=workloadName)
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), njobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), njobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['run_number'] == 10)
                if f['lfn'] == "/this/is/a/lfnA":
                    self.assertItemsEqual(f['runs'][0]['lumis'], [12312])
                elif f['lfn'] == "/this/is/a/lfnB":
                    self.assertItemsEqual(f['runs'][0]['lumis'], [12314, 12315, 12316])
                else:
                    self.assertFail("File name is not known: %s" % f['lfn'])
                self.assertEqual(f['merged'], 0)
                self.assertEqual(f['first_event'], 88)
            self.assertEqual(counter, njobs * 2)  # each job has 2 files (thus 4 times duplicate)
        return