Example #1
0
    def testD_SubmitFailed(self):
        """
        _testD_SubmitFailed_

        Check if jobs without a possible site to run at go to SubmitFailed
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            site = [],
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config = config)
        jobSubmitter.algorithm()

        # Jobs should go to submit failed
        getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
        result = getJobsAction.execute(state = 'SubmitFailed', jobType = "Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #2
0
    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()
        
        Mimics creation of component and test jobs failed in submit stage.
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')
        
        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #3
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')
        sn = "T2_US_UCSD"

        # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist
        # in BossAir_t.py
        baAPI.updateSiteInformation(idleJobs, sn, True)

        # Now kill 'em manually
        #        command = ['condor_rm', self.user]
        #        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        #        pipe.communicate()
        
        del jobSubmitter

        return
Example #4
0
    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()

        Mimics creation of component and test jobs failed in submit stage.
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #5
0
    def testD_SubmitFailed(self):
        """
        _testD_SubmitFailed_

        Check if jobs without a possible site to run at go to SubmitFailed
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            site=[],
                                            workloadSpec=self.workloadSpecPath)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Jobs should go to submit failed
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='SubmitFailed',
                                       jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #6
0
    def testF_OverloadTest(self):
        """
        _OverloadTest_
        
        Test and see what happens if you put in more jobs
        Then the sites can handle
        """

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertThreshold(siteName=site, taskType="Silly", maxSlots=1)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        workloadName = "basicWorkload"
        myThread = threading.currentThread()
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10
        cacheDir = os.path.join(self.testDir, "CacheDir")

        jobGroupList = self.createJobGroups(
            nSubs=nSubs,
            nJobs=nJobs,
            task=workload.getTask("ReReco"),
            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
            type="Silly",
        )

        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter = JobSubmitterPoller(config=config)

        # Actually run it
        jobSubmitter.algorithm()

        # Should be one job for each site
        nSites = len(self.sites)
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSites)

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Executing", jobType="Silly")
        self.assertEqual(len(result), nSites)
        result = getJobsAction.execute(state="Created", jobType="Silly")
        self.assertEqual(len(result), nJobs * nSubs - nSites)

        # Now clean-up
        command = ["condor_rm", self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        del jobSubmitter

        return
Example #7
0
    def testJobSiteDrain(self):
        """
        _testJobSiteDrain_

        Test the behavior of jobs pending to a single site that is in drain mode
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        jobSubmitter = JobSubmitterPoller(config=config)
        myResourceControl = ResourceControl(config)
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")

        nSubs = 1
        nJobs = 30

        site = 'T2_US_Nebraska'
        self.setResourceThresholds(site, pendingSlots=100, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10, 'runningSlots': 10},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[site],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # submit first 10 jobs
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)

        myResourceControl.changeSiteState(site, 'Draining')

        # site is now in drain, so don't submit anything
        jobSubmitter.algorithm()

        # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 0)

        # make sure the drain grace period expires...
        time.sleep(3)
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        # the remaining jobs should have gone to submitfailed by now
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 0)
Example #8
0
    def testJobSiteDrain(self):
        """
        _testJobSiteDrain_

        Test the behavior of jobs pending to a single site that is in drain mode
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        jobSubmitter = JobSubmitterPoller(config=config)
        myResourceControl = ResourceControl(config)
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")

        nSubs = 1
        nJobs = 30

        site = 'T2_US_Nebraska'
        self.setResourceThresholds(site, pendingSlots=100, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10, 'runningSlots': 10},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[site],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # submit first 10 jobs
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)

        myResourceControl.changeSiteState(site, 'Draining')

        # site is now in drain, so don't submit anything
        jobSubmitter.algorithm()

        # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 0)

        # make sure the drain grace period expires...
        time.sleep(3)
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        # the remaining jobs should have gone to submitfailed by now
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 0)
Example #9
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn,
                            transaction=True)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs=killableJobs)
        except BossAirException, ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
Example #10
0
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """

        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath,
                                               workloadName=workloadName)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        #These should go directly to exhausted
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(
                    f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(
                    f['runs'][0]['lumis'] in [[12312], [12314, 12315, 12316]],
                    "Unknown lumi %s" % f['runs'][0]['lumis'])
                self.assertTrue(f['merged'], 1)
                self.assertTrue(f['first_event'], 88)
            self.assertEqual(counter, 20)
        return
Example #11
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig = None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package = "WMCore.WMBS",
                            logger = myThread.logger,
                            dbinterface = myThread.dbi)
    killFilesAction = daoFactory(classname = "Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname = "Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName = workflowName,
                            conn = myThread.transaction.conn,
                            transaction = True)

    liveJobs = killJobsAction.execute(workflowName = workflowName,
                                      conn = myThread.transaction.conn,
                                      transaction = True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config = bossAirConfig, noSetup = True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id = liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs = killableJobs)
        except BossAirException, ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
Example #12
0
    def testZ_Profile(self):
        """
        _Profile_

        Do a basic profiling of the algo
        """

        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 150)

        startTime = time.time()
        #cProfile.runctx("testRetryManager.algorithm()", globals(), locals(), filename = "profStats.stat")
        testRetryManager.algorithm(None)
        stopTime  = time.time()

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'New')
        self.assertEqual(len(idList), nJobs)


        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
    def testZ_Profile(self):
        """
        _Profile_

        Do a basic profiling of the algo
        """

        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 150)

        startTime = time.time()
        #cProfile.runctx("testRetryManager.algorithm()", globals(), locals(), filename = "profStats.stat")
        testRetryManager.algorithm(None)
        stopTime  = time.time()

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'New')
        self.assertEqual(len(idList), nJobs)


        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
Example #14
0
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """

        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath,
                                               workloadName=workloadName)
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['lumis'] in [[12312], [12314, 12315, 12316]],
                                "Unknown lumi %s" % f['runs'][0]['lumis'])
                self.assertEqual(f['merged'], 0)
                self.assertEqual(f['first_event'], 88)
            self.assertEqual(counter, 20)
        return
Example #15
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        del jobSubmitter

        return
Example #16
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        baAPI.kill(jobs=idleJobs)

        del jobSubmitter

        return
Example #17
0
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from PyCondorPlugin.py
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        cacheDir = os.path.join(self.testDir, 'CacheDir')
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
        # updateSiteInformation() method should edit the classAd for all the jobs
        # that are bound for the site
        # Check the Q manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        if jtok != None:
            baAPI.kill(
                jtok, errorCode=61301
            )  # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)

        return
Example #18
0
    def testF_PollerProfileTest(self):
        """
        _testF_PollerProfileTest_

        Submit a lot of jobs and test how long it takes for
        them to actually be submitted
        """

        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 100
        nJobs = 100
        sites = ['T1_US_FNAL']

        for site in sites:
            self.setResourceThresholds(site, pendingSlots = 20000, runningSlots = -1, tasks = ['Processing', 'Merge'],
                                       Processing = {'pendingSlots' : 10000, 'runningSlots' :-1},
                                       Merge = {'pendingSlots' : 10000, 'runningSlots' :-1, 'priority' : 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config = config)

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                        workloadName),
                                            site = 'se.%s' % 'T1_US_FNAL')

        jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                        workloadName),
                                            site = 'se.%s' % 'T1_US_FNAL',
                                            taskType = 'Merge'))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Actually run it
        startTime = time.time()
        cProfile.runctx("jobSubmitter.algorithm()", globals(), locals(), filename = "testStats.stat")
        stopTime = time.time()


        print "Job took %f seconds to complete" % (stopTime - startTime)

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats()

        return
Example #19
0
    def testF_PollerProfileTest(self):
        """
        _testF_PollerProfileTest_

        Submit a lot of jobs and test how long it takes for
        them to actually be submitted
        """

        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 100
        nJobs = 100
        sites = ['T1_US_FNAL']

        for site in sites:
            self.setResourceThresholds(site, pendingSlots = 20000, runningSlots = -1, tasks = ['Processing', 'Merge'],
                                       Processing = {'pendingSlots' : 10000, 'runningSlots' :-1},
                                       Merge = {'pendingSlots' : 10000, 'runningSlots' :-1, 'priority' : 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config = config)

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                        workloadName),
                                            site = 'se.%s' % 'T1_US_FNAL')

        jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                        workloadName),
                                            site = 'se.%s' % 'T1_US_FNAL',
                                            taskType = 'Merge'))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Actually run it
        startTime = time.time()
        cProfile.runctx("jobSubmitter.algorithm()", globals(), locals(), filename = "testStats.stat")
        stopTime = time.time()


        print "Job took %f seconds to complete" % (stopTime - startTime)

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats()

        return
Example #20
0
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()
        
        Mimics creation of component and test jobs failed in create stage.
        """

        workloadName = "TestWorkload"

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload", "WMSandbox", "WMWorkload.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, workloadPath=workloadPath, workloadName=workloadName)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "createfailed", "created")

        idList = self.getJobs.execute(state="CreateFailed")
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state="CreateFailed")
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state="Exhausted")
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection["filesets"]) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f["lfn"] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f["events"], 10)
                self.assertEqual(f["size"], 1024)
                self.assertEqual(f["parents"], [u"/this/is/a/parent"])
                self.assertTrue(
                    f["runs"][0]["lumis"] in [[12312], [12314, 12315, 12316]], "Unknown lumi %s" % f["runs"][0]["lumis"]
                )
                self.assertTrue(f["merged"], 1)
                self.assertTrue(f["first_event"], 88)
            self.assertEqual(counter, 20)
        return
Example #21
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        pipe.communicate()

        del jobSubmitter

        return
Example #22
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')

        baAPI.kill(jobs = idleJobs)

        del jobSubmitter

        return
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from CondorPlugin.py
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        dummycacheDir = os.path.join(self.testDir, 'CacheDir')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
        # updateSiteInformation() method should edit the classAd for all the jobs
        # that are bound for the site
        # Check the Q manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        if jtok != None:
            baAPI.kill(jtok, errorCode=71301)  # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)

        return
Example #24
0
    def testUpdateFailedDoc(self):
        """
        _testUpdateFailedDoc_

        Verify that the update function will work correctly and not throw a 500
        error if the doc didn't make it into the database for some reason.
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute("site1", seName="somese.cern.ch")

        testWorkflow = Workflow(spec="spec.xml",
                                owner="Steve",
                                name="wf001",
                                task=self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name="TestFileset")
        testFileset.create()
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased")
        testSubscription.create()

        testFileA = File(lfn="SomeLFNA",
                         events=1024,
                         size=2048,
                         locations=set(["somese.cern.ch"]))
        testFileA.create()
        testFileset.addFile(testFileA)
        testFileset.commit()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)
        jobGroup = jobFactory(files_per_job=1)[0]

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Merge"
        testJobA["couch_record"] = str(testJobA["id"])

        change.propagate([testJobA], "new", "none")
        testJobADoc = change.jobsdatabase.document(testJobA["couch_record"])

        self.assertTrue(testJobADoc.has_key("states"))
        self.assertTrue(testJobADoc["states"].has_key("1"))
        return
Example #25
0
    def testY_MultipleIterations(self):
        """
        _MultipleIterations_

        Paranoia based check to see if I'm saving class instances correctly
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "submitfailed", "Created")
        changer.propagate(testJobGroup.jobs, "submitcooloff", "submitfailed")

        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="Created")
        self.assertEqual(len(idList), self.nJobs)

        # Make a new jobGroup for a second run
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # Set job state
        changer.propagate(testJobGroup.jobs, "submitfailed", "created")
        changer.propagate(testJobGroup.jobs, "submitcooloff", "submitfailed")

        # Set them to go off
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 200)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="Created")
        self.assertEqual(len(idList), self.nJobs * 2)

        return
Example #26
0
class PauseAlgo(RetryAlgoBase):
    """
    _PauseAlgo_

        This implements the Paused job algorithm, explanation of the concept in #3114
    """
    def __init__ (self, config):
        RetryAlgoBase.__init__(self, config)
        self.changer = ChangeState(config)

    def isReady(self, job, cooloffType):
        """
        Actual function that does the work
        """
        #This should come from configuration, pause_count

        pauseCount = self.getAlgoParam(job['jobType'], param ='pauseCount', defaultReturn = 3)

        pauseMap = {
            'createcooloff' :    'createpaused',
            'submitcooloff' :    'submitpaused',
            'jobcooloff'    :    'jobpaused'
        }

        # Here introduces the SquaredAlgo logic :
        baseTimeoutDict = self.getAlgoParam(job['jobType'])
        baseTimeout = baseTimeoutDict.get(cooloffType.lower(), 10)
        cooloffTime = baseTimeout * pow(job['retry_count'], 2)
        currentTime = self.timestamp()
        if currentTime - job['state_time'] > cooloffTime:
            retryByTimeOut = True
        else:
            retryByTimeOut = False

        if retryByTimeOut :
            # If reached the pauseCount, we want the job to pause instead of retrying
            if pauseCount == 0:
                self.changer.propagate(job, pauseMap[job['state']], job['state'],  updatesummary=True)
                return False
            elif job['retry_count'] > 0 and not (job['retry_count'] % pauseCount):
                self.changer.propagate(job, pauseMap[job['state']], job['state'],  updatesummary=True)
                return False
            else:
                return True
        else:
            return False
Example #27
0
    def testUpdateFailedDoc(self):
        """
        _testUpdateFailedDoc_

        Verify that the update function will work correctly and not throw a 500
        error if the doc didn't make it into the database for some reason.
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname = "Locations.New")
        locationAction.execute("site1", seName = "somese.cern.ch")

        testWorkflow = Workflow(spec = "spec.xml", owner = "Steve",
                                name = "wf001", task = self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name = "TestFileset")
        testFileset.create()
        testSubscription = Subscription(fileset = testFileset,
                                        workflow = testWorkflow,
                                        split_algo = "FileBased")
        testSubscription.create()

        testFileA = File(lfn = "SomeLFNA", events = 1024, size = 2048,
                         locations = set(["somese.cern.ch"]))
        testFileA.create()
        testFileset.addFile(testFileA)
        testFileset.commit()

        splitter = SplitterFactory()
        jobFactory = splitter(package = "WMCore.WMBS",
                              subscription = testSubscription)
        jobGroup = jobFactory(files_per_job = 1)[0]

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Merge"
        testJobA["couch_record"] = str(testJobA["id"])

        change.propagate([testJobA], "new", "none")
        testJobADoc = change.jobsdatabase.document(testJobA["couch_record"])

        self.assertTrue("states" in testJobADoc)
        self.assertTrue("1" in testJobADoc["states"])
        return
Example #28
0
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        startTime = time.time()
        #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
        testErrorHandler.algorithm()
        stopTime = time.time()

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)
        
        return
Example #29
0
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs=nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        startTime = time.time()
        #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
        testErrorHandler.algorithm()
        stopTime = time.time()

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
Example #30
0
    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = "TestWorkload"

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload", "WMSandbox", "WMWorkload.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "executing", "created")
        changer.propagate(testJobGroup.jobs, "complete", "executing")
        changer.propagate(testJobGroup.jobs, "jobfailed", "complete")

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), self.nJobs)
        return
Example #31
0
    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #32
0
    def testA_testSubmit(self):
        """
        _testSubmit_

        Test whether we pick up submitted jobs
        """

        #workload = self.createWorkload()
        jobGroup = self.createTestJobGroup()
        config   = self.getConfig()

        xmlPath = os.path.join(WMCore.WMBase.getTestBase(),
                               "WMCore_t/FwkJobReport_t/PerformanceReport.xml")
        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        changer = ChangeState(config)
        for job in jobGroup.jobs:
            job['fwjr'] = myReport
        changer.propagate(jobGroup.jobs, "complete", "executing")
        changer.propagate(jobGroup.jobs, "success", "complete")

        dashboardReporter = DashboardReporterPoller(config = config)

        dashboardReporter.algorithm()

        # What the hell am I supposed to check?
        changer.propagate(jobGroup.jobs, 'jobfailed', 'executing')

        dashboardReporter.algorithm()

        return
Example #33
0
    def testA_Create(self):
        """
        WMComponent_t.RetryManager_t.RetryManager_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state='CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)
        return
    def testA_Create(self):
        """
        WMComponent_t.RetryManager_t.RetryManager_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """
        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Created')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #35
0
    def testB_CheckExecutingJobsAndProfile(self):
        """
        _CheckExecutingJobsAndProfile_
        
        Pull up some executing jobs and profile them.
        """
        return
        jobGroup = self.createTestJobGroup()
        config   = self.getConfig()

        changer = ChangeState(config)
        changer.propagate(jobGroup.jobs, "executing", "created")

        dashboardReporter = DashboardReporterPoller(config = config)
        import cProfile, pstats
        cProfile.runctx("dashboardReporter.algorithm()", globals(), locals(), filename = "testStats.stat")

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)
        #dashboardReporter.algorithm()

        return
Example #36
0
    def testB_SpeedTest(self):
        """
        _SpeedTest_

        Tests the components, as in sees if they load.
        Otherwise does nothing.
        """
        return
        myThread = threading.currentThread()

        config = self.getConfig()

        self.nJobs = 2000

        testJobGroup = self.createTestJobGroup()

        changer = ChangeState(config)

        cacheDir = os.path.join(self.testDir, 'test')

        for job in testJobGroup.jobs:
            job["outcome"] = "success"
            job.save()
            path = os.path.join(cacheDir, job['name'])
            os.makedirs(path)
            f = open('%s/%s.out' %(path, job['name']),'w')
            f.write(job['name'])
            f.close()
            job.setCache(path)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'success', 'complete')




        testJobArchiver = JobArchiverPoller(config = config)
        cProfile.runctx("testJobArchiver.algorithm()", globals(), locals(), filename = "testStats.stat")


        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)

        return
Example #37
0
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        nJobs = 100
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)

        startTime = time.time()
        cProfile.runctx("testErrorHandler.algorithm()",
                        globals(),
                        locals(),
                        filename="profStats.stat")
        stopTime = time.time()

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
Example #38
0
    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, retry_count=5,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id=1)  # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)


        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)


        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)



        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
Example #39
0
    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, retry_count=5,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id=1)  # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)


        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)


        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)



        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
Example #40
0
    def testC_Job(self):
        """
        WMComponent_t.RetryManager_t.RetryManager_t:testJob()

        Mimics creation of component and test jobs failed in create stage.
        """
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "executing", "created")
        changer.propagate(testJobGroup.jobs, "complete", "executing")
        changer.propagate(testJobGroup.jobs, "jobfailed", "complete")
        changer.propagate(testJobGroup.jobs, "jobcooloff", "jobfailed")

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="Created")
        self.assertEqual(len(idList), self.nJobs)
        return
Example #41
0
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        nJobs = 100
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=nJobs, workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)

        startTime = time.time()
        cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename="profStats.stat")
        stopTime = time.time()

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
Example #42
0
    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = "TestWorkload"

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload", "WMSandbox", "WMWorkload.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, retry_count=5, workloadPath=workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "executing", "created")
        changer.propagate(testJobGroup.jobs, "complete", "executing")
        changer.propagate(testJobGroup.jobs, "jobfailed", "complete")

        testSubscription = Subscription(id=1)  # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="Exhausted")
        self.assertEqual(len(idList), self.nJobs)

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
Example #43
0
    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #44
0
    def testY_MultipleIterations(self):
        """
        _MultipleIterations_

        Paranoia based check to see if I'm saving class instances correctly
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'Created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        # Make a new jobGroup for a second run
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # Set job state
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        # Set them to go off
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 200)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs * 2)

        return
Example #45
0
    def testI_MultipleJobTypes(self):
        """
        _testI_MultipleJobTypes_

        Check that we can configure different retry algorithms for different
        job types, including a default for nonspecified types.
        Also check that two job types can share the same retry algorithm
        but with different parameters
        """

        # Let's create 4 job groups
        processingJobGroup = self.createTestJobGroup(nJobs=10, retryOnce=True)
        productionJobGroup = self.createTestJobGroup(nJobs=15,
                                                     subType="Production",
                                                     retryOnce=True)
        mergeJobGroup = self.createTestJobGroup(nJobs=20,
                                                subType="Merge",
                                                retryOnce=True)
        skimJobGroup = self.createTestJobGroup(nJobs=5,
                                               subType="Skim",
                                               retryOnce=True)

        # Set an adequate config
        # Processing jobs get the PauseAlgo with pauseCount 4
        # Production jobs get the ExponentialAlgo
        # Merge jobs get the PauseAlgo but with pauseCount 2 which is the default
        # Skim jobs are not configured, so they get the default SquaredAlgo
        config = self.getConfig()
        config.RetryManager.plugins = {
            'Processing': 'PauseAlgo',
            'Production': 'ExponentialAlgo',
            'Merge': 'PauseAlgo',
            'default': 'SquaredAlgo'
        }
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 4
        config.RetryManager.PauseAlgo.section_("default")
        config.RetryManager.PauseAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }
        config.RetryManager.PauseAlgo.default.pauseCount = 2
        config.RetryManager.section_("ExponentialAlgo")
        config.RetryManager.ExponentialAlgo.section_("Production")
        config.RetryManager.ExponentialAlgo.Production.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.ExponentialAlgo.section_("default")
        config.RetryManager.ExponentialAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }
        config.RetryManager.section_("SquaredAlgo")
        config.RetryManager.SquaredAlgo.section_("Skim")
        config.RetryManager.SquaredAlgo.Skim.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.SquaredAlgo.section_("default")
        config.RetryManager.SquaredAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }

        # Start the state changer and RetryManager
        changer = ChangeState(config)
        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        # Create the jobs for the first time
        changer.propagate(processingJobGroup.jobs, 'created', 'new')

        # Let's start with the processing jobs and the pauseAlgo
        for count in range(1, 5):
            # Fail the jobs
            changer.propagate(processingJobGroup.jobs, 'executing', 'created')
            changer.propagate(processingJobGroup.jobs, 'jobfailed',
                              'executing')
            changer.propagate(processingJobGroup.jobs, 'jobcooloff',
                              'jobfailed')

            # Check  that the cooloff time is strictly enforced
            # First a job time just below the cooloff time
            for job in processingJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(processingJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")

            # Now above the cooloff time
            for job in processingJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            # Make sure the jobs get created again or go to paused
            if count < 4:
                idList = self.getJobs.execute(state='created')
            else:
                idList = self.getJobs.execute(state='jobpaused')
            self.assertEqual(len(idList), len(processingJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Unpause them so they don't interfere with subsequent tests
        changer.propagate(processingJobGroup.jobs, 'created', 'jobpaused')
        changer.propagate(processingJobGroup.jobs, 'executing', 'created')

        # Now the production jobs and the exponential algo
        changer.propagate(productionJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(productionJobGroup.jobs, 'executing', 'created')
            changer.propagate(productionJobGroup.jobs, 'jobfailed',
                              'executing')
            changer.propagate(productionJobGroup.jobs, 'jobcooloff',
                              'jobfailed')

            for job in productionJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        pow(30, count) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(productionJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")
            for job in productionJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        pow(30, count) - 5)
            testRetryManager.algorithm(None)

            idList = self.getJobs.execute(state='created')
            self.assertEqual(len(idList), len(productionJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Send them to executing
        changer.propagate(productionJobGroup.jobs, 'executing', 'created')

        # Now the merge jobs and the paused algo with different parameters
        changer.propagate(mergeJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(mergeJobGroup.jobs, 'executing', 'created')
            changer.propagate(mergeJobGroup.jobs, 'jobfailed', 'executing')
            changer.propagate(mergeJobGroup.jobs, 'jobcooloff', 'jobfailed')

            for job in mergeJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(mergeJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")

            for job in mergeJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        60 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            if count < 2:
                idList = self.getJobs.execute(state='created')
            else:
                idList = self.getJobs.execute(state='jobpaused')
            self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Send them to executing
        changer.propagate(mergeJobGroup.jobs, 'created', 'jobpaused')
        changer.propagate(mergeJobGroup.jobs, 'executing', 'created')

        # Now the skim jobs and the squared algo
        changer.propagate(skimJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(skimJobGroup.jobs, 'executing', 'created')
            changer.propagate(skimJobGroup.jobs, 'jobfailed', 'executing')
            changer.propagate(skimJobGroup.jobs, 'jobcooloff', 'jobfailed')

            for job in skimJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(skimJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")
            for job in skimJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            idList = self.getJobs.execute(state='created')
            self.assertEqual(len(idList), len(skimJobGroup.jobs),
                             "Jobs didn't change state correctly")
Example #46
0
    def testH_PauseAlgo(self):
        """
        _testH_PauseAlgo_

        Test the pause algorithm, note that given pauseCount = n, the
        job will run first n + 1 times before being paused.
        After that it will be paused each n times
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # adding a 2nd job group
        testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 20,
            'submit': 20,
            'job': 20
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 2
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        report = Report()

        # Making sure that jobs are not created ahead of time
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 15)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 25)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Make sure that no change happens before timeout
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 75)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin pauses them
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # Emulating ops retrying the job
        changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

        # Making sure it did the right thing
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 185)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 315)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 325)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # a configurable retry count per job type {jobExitCodeA: pauseCountB}
        config.RetryManager.PauseAlgo.Processing.retryErrorCodes = {
            8020: 1,
            12345: 1,
            5555: 2
        }

        testRetryManager2 = RetryManagerPoller(config)
        testRetryManager2.algorithm()

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        report.load(fwjrPath)
        for job in testJobGroup2.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))

        # fail the jobs
        changer.propagate(testJobGroup2.jobs, 'created', 'new')
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        # Giving time so they can be paused
        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin sent those jobs to the next state:
        testRetryManager2.algorithm()
        # job exit code is 8020, so it is supposed to be retried one time.
        # Meaning, that here we should have 10 jobs (from the first part of the test) in jobpaused
        # and 10 jobs in created state

        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        idList2 = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList2), self.nJobs)

        # save a second job report - with a retry count = 1
        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            j['retry_count'] = 1
            self.assertEqual(j['retry_count'], 1)
            report.save(
                os.path.join(j['cache_dir'],
                             "Report.%i.pkl" % j['retry_count']))

        # Fail them out again
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)

        # not sure if this check is needed:
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin sent those jobs to paused state:
        testRetryManager2.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        # And again, in total, there should be 10+10=20 jobs in jobpaused
        self.assertEqual(len(idList), self.nJobs * 2)

        return
Example #47
0
    def testG_ProcessingAlgo(self):
        """
        _ProcessingAlgo_

        Test for the ProcessingAlgo Prototype
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'ProcessingAlgo'}
        config.RetryManager.section_("ProcessingAlgo")
        config.RetryManager.ProcessingAlgo.section_("default")
        config.RetryManager.ProcessingAlgo.default.coolOffTime = {
            'create': 10,
            'submit': 10,
            'job': 10
        }
        changer = ChangeState(config)
        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")
        report = Report()
        report.load(fwjrPath)
        for job in testJobGroup.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 1)
            report.save(
                os.path.join(j['cache_dir'],
                             "Report.%i.pkl" % j['retry_count']))

        config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = [8020]
        testRetryManager2 = RetryManagerPoller(config)
        testRetryManager2.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 5)

        # Now test timeout
        testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

        # Cycle jobs
        for job in testJobGroup2.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))
        changer.propagate(testJobGroup2.jobs, 'created', 'new')
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 0)

        config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = []
        config.RetryManager.ProcessingAlgo.default.MaxRunTime = 1
        testRetryManager3 = RetryManagerPoller(config)
        testRetryManager3.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs * 2)

        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 5)

        return
Example #48
0
    def testF_LinearAlgo(self):
        """
        _testLinearAlgo_

        Test the linear algorithm to make sure it loads and works
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'LinearAlgo'}
        config.RetryManager.section_("LinearAlgo")
        config.RetryManager.LinearAlgo.section_("Processing")
        config.RetryManager.LinearAlgo.Processing.coolOffTime = {
            'create': 10,
            'submit': 10,
            'job': 10
        }
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'submitcooloff')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 5)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 12)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        return
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)


        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in ('LogCollect', 'Merge', 'Cleanup', 'Harvesting'):
                nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                if nonDrainingSites: # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'id': jobID,
                       'requestName': newJob['request_name'],
                       'taskName': newJob['task_name'],
                       'taskType': newJob['type'],
                       'cache_dir': newJob["cache_dir"],
                       'priority': newJob['wf_priority'],
                       'taskID': newJob['task_id'],
                       'retry_count': newJob["retry_count"],
                       'taskPriority': None,                                # update from the thresholds
                       'custom': {'location': None},                        # update later
                       'packageDir': batchDir,
                       'sandbox': loadedJob["sandbox"],                     # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),       # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),     # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", None),
                       'name': loadedJob["name"],
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    # This is temporary addition if this is patched for existing agent.
                    # If jobs are created before the patch is applied fileLocations is not set.
                    # TODO. remove this later for new agent
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": Job is created before this patch. Please check this input for the jobs: %s " %
                                         job['fwjr'].getAllInputFiles())

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items():
                    self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or  newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return


    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn("Have a job for %s which is not in the resource control", siteName)
                        continue

                    try:
                        totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
                        totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
                        totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
                        totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

                        taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
                        taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
                        taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
                        taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
                        taskPriority = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"]
                    except KeyError as ex:
                        msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
                        msg += str(ex)
                        logging.error(msg)
                        continue

                    # check if site has free pending slots AND free pending task slots
                    if totalPendingJobs >= totalPendingSlots or taskPendingJobs >= taskPendingSlots:
                        jobSubmitLogBySites[siteName]["NoPendingSlot"] += 1
                        logging.debug("Found a job for %s which has no free pending slots", siteName)
                        continue
                    # check if site overall thresholds have free slots
                    if totalPendingJobs + totalRunningJobs >= totalPendingSlots + totalRunningSlots:
                        jobSubmitLogBySites[siteName]["NoRunningSlot"] += 1
                        logging.debug("Found a job for %s which has no free overall slots", siteName)
                        continue
                    # finally, check whether task has free overall slots
                    if taskPendingJobs + taskRunningJobs >= taskPendingSlots + taskRunningSlots:
                        jobSubmitLogBySites[siteName]["NoTaskSlot"] += 1
                        logging.debug("Found a job for %s which has no free task slots", siteName)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1
                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = taskPriority

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsPerPoll:
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit


    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return


    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """

        try:
            myThread = threading.currentThread()
            self.getThresholds()
            self.refreshCache()

            if self.useReqMgrForCompletionCheck:
                # only runs when reqmgr is used (not Tier0)
                self.removeAbortedForceCompletedWorkflowFromCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return



    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Example #50
0
    def testA_BasicTest(self):
        """
        Use the MockPlugin to create a simple test
        Check to see that all the jobs were "submitted",
        don't care about thresholds
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 20
        site = "T2_US_UCSD"

        self.setResourceThresholds(site,
                                   pendingSlots=50,
                                   runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={
                                       'pendingSlots': 50,
                                       'runningSlots': 100
                                   },
                                   Merge={
                                       'pendingSlots': 50,
                                       'runningSlots': 100
                                   })

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Check that jobs are in the right state
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T2_US_UCSD']])

        # Run another cycle, it shouldn't submit anything. There isn't anything to submit
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        nSubs = 1
        nJobs = 10

        # Submit another 10 jobs
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Check that the jobs are available for submission and run another cycle
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)
        jobSubmitter.algorithm()

        # Check that the last 10 jobs were submitted as well.
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #51
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(
            )
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs",
                     len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
                )
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info(
                "Skipping cache update to be submitted. (%s job in cache)",
                len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount,
                             len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s",
                              pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if nonAbortSites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if nonDrainingSites:  # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'],
                                               0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {
                'id':
                jobID,
                'requestName':
                newJob['request_name'],
                'taskName':
                newJob['task_name'],
                'taskType':
                newJob['type'],
                'cache_dir':
                newJob["cache_dir"],
                'priority':
                newJob['wf_priority'],
                'taskID':
                newJob['task_id'],
                'retry_count':
                newJob["retry_count"],
                'taskPriority':
                None,  # update from the thresholds
                'custom': {
                    'location': None
                },  # update later
                'packageDir':
                batchDir,
                'sandbox':
                loadedJob["sandbox"],  # remove before submit
                'userdn':
                loadedJob.get("ownerDN", None),
                'usergroup':
                loadedJob.get("ownerGroup", ''),
                'userrole':
                loadedJob.get("ownerRole", ''),
                'possibleSites':
                frozenset(
                    possibleLocations),  # abort and drain sites filtered out
                'potentialSites':
                frozenset(potentialLocations),  # original list of sites
                'scramArch':
                loadedJob.get("scramArch", None),
                'swVersion':
                loadedJob.get("swVersion", None),
                'name':
                loadedJob["name"],
                'proxyPath':
                loadedJob.get("proxyPath", None),
                'estimatedJobTime':
                loadedJob.get("estimatedJobTime", None),
                'estimatedDiskUsage':
                loadedJob.get("estimatedDiskUsage", None),
                'estimatedMemoryUsage':
                loadedJob.get("estimatedMemoryUsage", None),
                'numberOfCores':
                loadedJob.get("numberOfCores", 1),  # may update it later
                'inputDataset':
                loadedJob.get('inputDataset', None),
                'inputDatasetLocations':
                loadedJob.get('inputDatasetLocations', None),
                'allowOpportunistic':
                loadedJob.get('allowOpportunistic', False)
            }

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code : %d",
                    badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
        )
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WM_JOB_ERROR_CODES[exitCode] +
                    ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError(
                        "JobSubmit", exitCode, "SubmitFailed",
                        WM_JOB_ERROR_CODES[exitCode] + ": file locations: " +
                        ', '.join(job['fileLocations']) +
                        ": site white list: " +
                        ', '.join(job['siteWhitelist']) +
                        ": site black list: " +
                        ', '.join(job['siteBlacklist']))

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items(
                ):
                    self.taskTypePrioMap[task] = value.get(
                        'priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        returns the string describing whether a job is ready to be submitted or the reason can't be submitted
        Only jobs with "JobSubmitReady" return value will be added to submit job.
        Other return values will indicate the reason jobs cannot be submitted.
        i.e. "NoPendingSlot"  - pending slot is full with pending job
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName][
                "total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName][
                "total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName][
                "total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName][
                "total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName][
                'thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault(
                "init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][
                jobType].setdefault("init_task_pending_jobs", taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName][
                "init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName,
                                                                 jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (
                jobPrio <= highestPriorityInJobs) or (jobType
                                                      in self.ioboundTypes):
            # there is no pending or running jobs in the system (None case) or
            # priority of the job is lower or equal don't allow overflow
            # Also if jobType is in ioboundTypes don't allow overflow
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskTheshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of currently pending or running jobs.
            # Then increase the threshold by condorOverflowFraction * original pending slot.
            totalPendingThreshold = max(
                totalPendingSlots, initTotalPending) + (
                    totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskTheshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{
            "Condition": "NoPendingSlot",
            "Current": totalPendingJobs,
            "Threshold": totalPendingThreshold
        }, {
            "Condition": "NoTaskPendingSlot",
            "Current": taskPendingJobs,
            "Threshold": taskPendingThreshold
        }, {
            "Condition": "NoRunningSlot",
            "Current": totalPendingJobs + totalRunningJobs,
            "Threshold": totalJobThreshold
        }, {
            "Condition": "NoTaskRunningSlot",
            "Current": taskPendingJobs + taskRunningJobs,
            "Threshold": totalTaskTheshold
        }]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(),
                              key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn(
                            "Have a job for %s which is not in the resource control",
                            siteName)
                        continue

                    condition = self._getJobSubmitCondition(
                        jobPrio, siteName, jobType)

                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName,
                                      condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName][
                        "total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType][
                        "task_pending_jobs"] += 1

                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[
                        siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info(
                        "Submitter reached limit of submit slots for this cycle: %i",
                        self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s",
                     dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job[
                    'site_cms_name'] = self.getSiteInfo(
                        job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.",
                     len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get(
                "CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg,
                                      "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork",
                                        "warning",
                                        this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(
            dbi=myThread.dbi,
            logger=logging,
            condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Example #52
0
    def testE_FullChain(self):
        """
        _FullChain_

        Full test going through the chain; using polling cycles and everything
        """

        return

        from WMComponent.JobSubmitter.JobSubmitter   import JobSubmitter
        from WMComponent.JobStatusLite.JobStatusLite import JobStatusLite
        from WMComponent.JobTracker.JobTracker       import JobTracker


        myThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))


        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = 'se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitter(config = config)
        jobTracker   = JobTracker(config = config)
        jobStatus    = JobStatusLite(config = config)


        jobSubmitter.prepareToStart()
        jobTracker.prepareToStart()
        jobStatus.prepareToStart()

        # What should happen here:
        # 1) The JobSubmitter should submit the jobs
        # 2) Because of the ridiculously short time on pending jobs
        #     the JobStatus poller should mark the jobs as done
        #     and kill them.
        # 3) The JobTracker should realize there are finished jobs
        #
        # So at the end of several polling cycles, the jobs should all
        # be done, but be in the failed status (they timed out)

        time.sleep(20)


        myThread.workerThreadManager.terminateWorkers()


        getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
        result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state = 'JobFailed', jobType = "Processing")
        self.assertEqual(len(result), nJobs * nSubs)
        return
Example #53
0
    def testD_PrototypeChain(self):
        """
        _PrototypeChain_

        Prototype the BossAir workflow
        """
        myThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))


        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = 'se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)
        jobTracker   = JobTrackerPoller(config = config)
        statusPoller = StatusPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Check WMBS
        getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
        result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        statusPoller.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nSubs * nJobs)


        # Tracker should do nothing
        jobTracker.algorithm()

        result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
        self.assertEqual(len(result), nSubs * nJobs)


        # Wait for jobs to timeout due to short Pending wait period
        time.sleep(12)


        statusPoller.algorithm()

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Timeout', complete = '0')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Jobs should be gone
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)


        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nSubs * nJobs)


        # Because they timed out, they all should have failed
        jobTracker.algorithm()

        result = getJobsAction.execute(state = 'Executing', jobType = "Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state = 'JobFailed', jobType = "Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #54
0
    def testB_thresholdTest(self):
        """
        _testB_thresholdTest_

        Check that the threshold management is working,
        this requires checks on pending/running jobs globally
        at a site and per task/site
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site,
                                   pendingSlots=50,
                                   runningSlots=220,
                                   tasks=['Processing', 'Merge'],
                                   Processing={
                                       'pendingSlots': 45,
                                       'runningSlots': 200
                                   },
                                   Merge={
                                       'pendingSlots': 10,
                                       'runningSlots': 20,
                                       'priority': 5
                                   })

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter.algorithm()

        # Check that jobs are in the right state,
        # here we are limited by the pending threshold for the Processing task (45)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T1_US_FNAL']])

        # Run another cycle, it shouldn't submit anything. Jobs are still in pending
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now put 10 Merge jobs, only 5 can be submitted, there we hit the global pending threshold for the site
        nSubs = 1
        nJobs = 10
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            taskType='Merge')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now let's test running thresholds
        # The scenario will be setup as follows: Move all current jobs as running
        # Create 300 Processing jobs and 300 merge jobs
        # Run 5 polling cycles, moving all pending jobs to running in between
        # Result is, merge is left at 30 running 0 pending and processing is left at 240 running 0 pending
        # Processing has 110 jobs in queue and Merge 280
        # This tests all threshold dynamics including the prioritization of merge over processing
        nSubs = 1
        nJobs = 300
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        jobGroupList.extend(
            self.createJobGroups(nSubs=nSubs,
                                 nJobs=nJobs,
                                 task=workload.getTask("ReReco"),
                                 workloadSpec=self.workloadSpecPath,
                                 site=site,
                                 taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for i in range(5):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')
            jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), 240)
        result = getJobsAction.execute(state='Created', jobType='Processing')
        self.assertEqual(len(result), 110)
        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), 30)
        result = getJobsAction.execute(state='Created', jobType='Merge')
        self.assertEqual(len(result), 280)

        return
Example #55
0
    def testB_PluginTest(self):
        """
        _PluginTest_


        Now check that these functions worked if called through plugins
        Instead of directly.

        There are only three plugin
        """
        #return

        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return
Example #56
0
    def testC_prioritization(self):
        """
        _testC_prioritization_

        Check that jobs are prioritized by job type and by oldest workflow
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site,
                                   pendingSlots=10,
                                   runningSlots=10000,
                                   tasks=['Processing', 'Merge'],
                                   Processing={
                                       'pendingSlots': 50,
                                       'runningSlots': 10000
                                   },
                                   Merge={
                                       'pendingSlots': 10,
                                       'runningSlots': 10000,
                                       'priority': 5
                                   })

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            name='OldestWorkflow')
        jobGroupList.extend(
            self.createJobGroups(nSubs=nSubs,
                                 nJobs=nJobs,
                                 task=workload.getTask("ReReco"),
                                 workloadSpec=self.workloadSpecPath,
                                 site=site,
                                 taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        # Merge goes first
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        # Create a newer workflow processing, and after some new jobs for an old workflow

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            name='OldestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            name='NewestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Move pending jobs to running

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for idx in range(2):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')

            # Run again on created workflows
            jobSubmitter.algorithm()

            result = getJobsAction.execute(state='Created', jobType="Merge")
            self.assertEqual(len(result), 0)
            result = getJobsAction.execute(state='Executing', jobType="Merge")
            self.assertEqual(len(result), 10)
            result = getJobsAction.execute(state='Created',
                                           jobType="Processing")
            self.assertEqual(len(result), 30 - (idx + 1) * 10)
            result = getJobsAction.execute(state='Executing',
                                           jobType="Processing")
            self.assertEqual(len(result), (idx + 1) * 10)

            # Check that older workflow goes first even with newer jobs
            getWorkflowAction = self.daoFactory(
                classname="Jobs.GetWorkflowTask")
            workflows = getWorkflowAction.execute(result)
            for workflow in workflows:
                self.assertEqual(workflow['name'], 'OldestWorkflow')

        return
Example #57
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)
        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [
            job for job in jobList
            if job['type'] not in ['LogCollect', 'Cleanup']
        ]

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed",
                     len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'],
                                                 self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(
                cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs,
                                       'retrydone',
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs,
                                       '%scooloff' % state,
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        if self.reqAuxDB:
            self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName).get("NoRetryExitCodes", [])

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error(
                    "No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.",
                    job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error(
                    "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.",
                    job['id'], reportPath)
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                # correct the location if the original location is different from recorded in wmbs
                # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this.
                # If location in wmbs needs to be updated, it should happen in JobAccountant.
                locationFromFWJR = report.getSiteName()
                if locationFromFWJR:
                    job["location"] = locationFromFWJR
                    job["site_cms_name"] = locationFromFWJR

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i",
                                  job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (
                        job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([
                        x for x in report.getExitCodes()
                        if x in self.exitCodesNoRetry
                ]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    exhaustJobs.append(job)
                    continue

                if len(
                    [x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning(
                    "Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList),
                         state)
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk.
        Include the full metadata.
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Performs the handleErrors method, looking for each type of failure
        And deal with it as desired.
        """
        logging.debug("Running error handling algorithm")
        try:
            myThread = threading.currentThread()
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. "
            msg += "Transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.error(msg)
        except Exception as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught unexpected exception in ErrorHandler:\n"
            msg += str(ex)
            logging.exception(msg)
            raise ErrorHandlerException(msg)
Example #58
0
    def testE_SiteModesTest(self):
        """
        _testE_SiteModesTest_

        Test the behavior of the submitter in response to the different
        states of the sites
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        nSubs = 1
        nJobs = 20

        sites = [
            'T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL'
        ]
        for site in sites:
            self.setResourceThresholds(site,
                                       pendingSlots=10,
                                       runningSlots=-1,
                                       tasks=['Processing', 'Merge'],
                                       Processing={
                                           'pendingSlots': 10,
                                           'runningSlots': -1
                                       },
                                       Merge={
                                           'pendingSlots': 10,
                                           'runningSlots': -1,
                                           'priority': 5
                                       })

        myResourceControl = ResourceControl(config)
        myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
        # First test that we prefer Normal over drain, and T1 over T2/T3
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        # Actually run it
        jobSubmitter.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection
        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        locationDict = getLocationAction.execute([{
            'jobid': x
        } for x in result])
        for entry in locationDict:
            loc = entry['site_name']
            self.assertNotEqual(loc, 'T2_US_Florida')

        # Now set everything to down, check we don't submit anything
        for site in sites:
            myResourceControl.changeSiteState(site, 'Down')
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter.algorithm()
        # Nothing is submitted despite the empty slots at Uniandes and Florida
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Now set everything to Drain and create Merge jobs. Those should be submitted
        for site in sites:
            myResourceControl.changeSiteState(site, 'Draining')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge,
                                            nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)

        # Now set everything to Aborted, and create Merge jobs. Those should fail
        # since the can only run at one place
        for site in sites:
            myResourceControl.changeSiteState(site, 'Aborted')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge,
                                            nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='SubmitFailed', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)
        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), nSubs * nJobs)

        return