예제 #1
0
    def testZ_Profile(self):
        """
        _Profile_

        Do a basic profiling of the algo
        """

        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 150)

        startTime = time.time()
        #cProfile.runctx("testRetryManager.algorithm()", globals(), locals(), filename = "profStats.stat")
        testRetryManager.algorithm(None)
        stopTime  = time.time()

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'New')
        self.assertEqual(len(idList), nJobs)


        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
예제 #2
0
    def preInitialization(self):
        """
        Initializes plugins for different messages
        """

        # Add event loop to worker manager
        myThread = threading.currentThread()
        pollInterval = self.config.RetryManager.pollInterval
        logging.info("Setting poll interval to %s seconds", pollInterval)
        myThread.workerThreadManager.addWorker(RetryManagerPoller(self.config), pollInterval)
예제 #3
0
    def testF_LinearAlgo(self):
        """
        _testLinearAlgo_

        Test the linear algorithm to make sure it loads and works
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'LinearAlgo'}
        config.RetryManager.section_("LinearAlgo")
        config.RetryManager.LinearAlgo.section_("Processing")
        config.RetryManager.LinearAlgo.Processing.coolOffTime = {
            'create': 10,
            'submit': 10,
            'job': 10
        }
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'submitcooloff')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 5)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 12)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        return
예제 #4
0
    def testC_Job(self):
        """
        WMComponent_t.RetryManager_t.RetryManager_t:testJob()

        Mimics creation of component and test jobs failed in create stage.
        """
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)
        return
예제 #5
0
    def testY_MultipleIterations(self):
        """
        _MultipleIterations_

        Paranoia based check to see if I'm saving class instances correctly
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'Created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        # Make a new jobGroup for a second run
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # Set job state
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        # Set them to go off
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 200)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs * 2)

        return
예제 #6
0
    def testI_MultipleJobTypes(self):
        """
        _testI_MultipleJobTypes_

        Check that we can configure different retry algorithms for different
        job types, including a default for nonspecified types.
        Also check that two job types can share the same retry algorithm
        but with different parameters
        """

        # Let's create 4 job groups
        processingJobGroup = self.createTestJobGroup(nJobs=10, retryOnce=True)
        productionJobGroup = self.createTestJobGroup(nJobs=15,
                                                     subType="Production",
                                                     retryOnce=True)
        mergeJobGroup = self.createTestJobGroup(nJobs=20,
                                                subType="Merge",
                                                retryOnce=True)
        skimJobGroup = self.createTestJobGroup(nJobs=5,
                                               subType="Skim",
                                               retryOnce=True)

        # Set an adequate config
        # Processing jobs get the PauseAlgo with pauseCount 4
        # Production jobs get the ExponentialAlgo
        # Merge jobs get the PauseAlgo but with pauseCount 2 which is the default
        # Skim jobs are not configured, so they get the default SquaredAlgo
        config = self.getConfig()
        config.RetryManager.plugins = {
            'Processing': 'PauseAlgo',
            'Production': 'ExponentialAlgo',
            'Merge': 'PauseAlgo',
            'default': 'SquaredAlgo'
        }
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 4
        config.RetryManager.PauseAlgo.section_("default")
        config.RetryManager.PauseAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }
        config.RetryManager.PauseAlgo.default.pauseCount = 2
        config.RetryManager.section_("ExponentialAlgo")
        config.RetryManager.ExponentialAlgo.section_("Production")
        config.RetryManager.ExponentialAlgo.Production.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.ExponentialAlgo.section_("default")
        config.RetryManager.ExponentialAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }
        config.RetryManager.section_("SquaredAlgo")
        config.RetryManager.SquaredAlgo.section_("Skim")
        config.RetryManager.SquaredAlgo.Skim.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.SquaredAlgo.section_("default")
        config.RetryManager.SquaredAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }

        # Start the state changer and RetryManager
        changer = ChangeState(config)
        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        # Create the jobs for the first time
        changer.propagate(processingJobGroup.jobs, 'created', 'new')

        # Let's start with the processing jobs and the pauseAlgo
        for count in range(1, 5):
            # Fail the jobs
            changer.propagate(processingJobGroup.jobs, 'executing', 'created')
            changer.propagate(processingJobGroup.jobs, 'jobfailed',
                              'executing')
            changer.propagate(processingJobGroup.jobs, 'jobcooloff',
                              'jobfailed')

            # Check  that the cooloff time is strictly enforced
            # First a job time just below the cooloff time
            for job in processingJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(processingJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")

            # Now above the cooloff time
            for job in processingJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            # Make sure the jobs get created again or go to paused
            if count < 4:
                idList = self.getJobs.execute(state='created')
            else:
                idList = self.getJobs.execute(state='jobpaused')
            self.assertEqual(len(idList), len(processingJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Unpause them so they don't interfere with subsequent tests
        changer.propagate(processingJobGroup.jobs, 'created', 'jobpaused')
        changer.propagate(processingJobGroup.jobs, 'executing', 'created')

        # Now the production jobs and the exponential algo
        changer.propagate(productionJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(productionJobGroup.jobs, 'executing', 'created')
            changer.propagate(productionJobGroup.jobs, 'jobfailed',
                              'executing')
            changer.propagate(productionJobGroup.jobs, 'jobcooloff',
                              'jobfailed')

            for job in productionJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        pow(30, count) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(productionJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")
            for job in productionJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        pow(30, count) - 5)
            testRetryManager.algorithm(None)

            idList = self.getJobs.execute(state='created')
            self.assertEqual(len(idList), len(productionJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Send them to executing
        changer.propagate(productionJobGroup.jobs, 'executing', 'created')

        # Now the merge jobs and the paused algo with different parameters
        changer.propagate(mergeJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(mergeJobGroup.jobs, 'executing', 'created')
            changer.propagate(mergeJobGroup.jobs, 'jobfailed', 'executing')
            changer.propagate(mergeJobGroup.jobs, 'jobcooloff', 'jobfailed')

            for job in mergeJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(mergeJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")

            for job in mergeJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        60 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            if count < 2:
                idList = self.getJobs.execute(state='created')
            else:
                idList = self.getJobs.execute(state='jobpaused')
            self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Send them to executing
        changer.propagate(mergeJobGroup.jobs, 'created', 'jobpaused')
        changer.propagate(mergeJobGroup.jobs, 'executing', 'created')

        # Now the skim jobs and the squared algo
        changer.propagate(skimJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(skimJobGroup.jobs, 'executing', 'created')
            changer.propagate(skimJobGroup.jobs, 'jobfailed', 'executing')
            changer.propagate(skimJobGroup.jobs, 'jobcooloff', 'jobfailed')

            for job in skimJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(skimJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")
            for job in skimJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            idList = self.getJobs.execute(state='created')
            self.assertEqual(len(idList), len(skimJobGroup.jobs),
                             "Jobs didn't change state correctly")
예제 #7
0
    def testH_PauseAlgo(self):
        """
        _testH_PauseAlgo_

        Test the pause algorithm, note that given pauseCount = n, the
        job will run first n + 1 times before being paused.
        After that it will be paused each n times
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # adding a 2nd job group
        testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 20,
            'submit': 20,
            'job': 20
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 2
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        report = Report()

        # Making sure that jobs are not created ahead of time
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 15)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 25)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Make sure that no change happens before timeout
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 75)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin pauses them
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # Emulating ops retrying the job
        changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

        # Making sure it did the right thing
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 185)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 315)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 325)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # a configurable retry count per job type {jobExitCodeA: pauseCountB}
        config.RetryManager.PauseAlgo.Processing.retryErrorCodes = {
            8020: 1,
            12345: 1,
            5555: 2
        }

        testRetryManager2 = RetryManagerPoller(config)
        testRetryManager2.algorithm()

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        report.load(fwjrPath)
        for job in testJobGroup2.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))

        # fail the jobs
        changer.propagate(testJobGroup2.jobs, 'created', 'new')
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        # Giving time so they can be paused
        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin sent those jobs to the next state:
        testRetryManager2.algorithm()
        # job exit code is 8020, so it is supposed to be retried one time.
        # Meaning, that here we should have 10 jobs (from the first part of the test) in jobpaused
        # and 10 jobs in created state

        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        idList2 = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList2), self.nJobs)

        # save a second job report - with a retry count = 1
        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            j['retry_count'] = 1
            self.assertEqual(j['retry_count'], 1)
            report.save(
                os.path.join(j['cache_dir'],
                             "Report.%i.pkl" % j['retry_count']))

        # Fail them out again
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)

        # not sure if this check is needed:
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin sent those jobs to paused state:
        testRetryManager2.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        # And again, in total, there should be 10+10=20 jobs in jobpaused
        self.assertEqual(len(idList), self.nJobs * 2)

        return
예제 #8
0
    def testG_ProcessingAlgo(self):
        """
        _ProcessingAlgo_

        Test for the ProcessingAlgo Prototype
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'ProcessingAlgo'}
        config.RetryManager.section_("ProcessingAlgo")
        config.RetryManager.ProcessingAlgo.section_("default")
        config.RetryManager.ProcessingAlgo.default.coolOffTime = {
            'create': 10,
            'submit': 10,
            'job': 10
        }
        changer = ChangeState(config)
        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")
        report = Report()
        report.load(fwjrPath)
        for job in testJobGroup.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 1)
            report.save(
                os.path.join(j['cache_dir'],
                             "Report.%i.pkl" % j['retry_count']))

        config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = [8020]
        testRetryManager2 = RetryManagerPoller(config)
        testRetryManager2.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 5)

        # Now test timeout
        testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

        # Cycle jobs
        for job in testJobGroup2.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))
        changer.propagate(testJobGroup2.jobs, 'created', 'new')
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 0)

        config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = []
        config.RetryManager.ProcessingAlgo.default.MaxRunTime = 1
        testRetryManager3 = RetryManagerPoller(config)
        testRetryManager3.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs * 2)

        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 5)

        return
예제 #9
0
    def testH_PauseAlgo(self):
        """
        _testH_PauseAlgo_

        Test the pause algorithm, note that given pauseCount = n, the
        job will run first n + 1 times before being paused.
        After that it will be paused each n times
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 20,
            'submit': 20,
            'job': 20
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 2
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        # Making sure that jobs are not created ahead of time
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 15)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 25)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Make sure that no change happens before timeout
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 75)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin pauses them
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # Emulating ops retrying the job
        changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

        # Making sure it did the right thing
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 185)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 315)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 325)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        return