Example #1
    def testD_SubmitFailed(self):
        """
        _testD_SubmitFailed_

        Check if jobs without a possible site to run at go to SubmitFailed
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            site = [],
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config = config)
        jobSubmitter.algorithm()

        # Jobs should go to submit failed
        getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs")
        result = getJobsAction.execute(state = 'SubmitFailed', jobType = "Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #2
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')
        sn = "T2_US_UCSD"

        # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist
        # in BossAir_t.py
        baAPI.updateSiteInformation(idleJobs, sn, True)

        # Now kill 'em manually
        #        command = ['condor_rm', self.user]
        #        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        #        pipe.communicate()
        
        del jobSubmitter

        return
Example #3
    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()
        
        Mimics creation of the component and tests jobs that failed in the submit stage.
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')
        
        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #4
    def testF_OverloadTest(self):
        """
        _OverloadTest_
        
        Test what happens if you put in more jobs
        than the sites can handle
        """

        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertThreshold(siteName=site, taskType="Silly", maxSlots=1)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        workloadName = "basicWorkload"
        myThread = threading.currentThread()
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10
        cacheDir = os.path.join(self.testDir, "CacheDir")

        jobGroupList = self.createJobGroups(
            nSubs=nSubs,
            nJobs=nJobs,
            task=workload.getTask("ReReco"),
            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
            type="Silly",
        )

        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter = JobSubmitterPoller(config=config)

        # Actually run it
        jobSubmitter.algorithm()

        # Should be one job for each site
        nSites = len(self.sites)
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSites)

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Executing", jobType="Silly")
        self.assertEqual(len(result), nSites)
        result = getJobsAction.execute(state="Created", jobType="Silly")
        self.assertEqual(len(result), nJobs * nSubs - nSites)

        # Now clean-up
        command = ["condor_rm", self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        del jobSubmitter

        return
Example #5
    def testJobSiteDrain(self):
        """
        _testJobSiteDrain_

        Test the behavior of jobs pending to a single site that is in drain mode
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        jobSubmitter = JobSubmitterPoller(config=config)
        myResourceControl = ResourceControl(config)
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")

        nSubs = 1
        nJobs = 30

        site = 'T2_US_Nebraska'
        self.setResourceThresholds(site, pendingSlots=100, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10, 'runningSlots': 10},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[site],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # submit first 10 jobs
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)

        myResourceControl.changeSiteState(site, 'Draining')

        # site is now in drain, so don't submit anything
        jobSubmitter.algorithm()

        # jobs were supposed to be killed, but the MockPlugin doesn't actually do anything
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 0)

        # make sure the drain grace period expires...
        time.sleep(3)
        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 10)
        # the remaining jobs should have gone to submitfailed by now
        result = getJobsAction.execute(state='submitfailed', jobType="Processing")
        self.assertEqual(len(result), 20)
        result = getJobsAction.execute(state='created', jobType="Processing")
        self.assertEqual(len(result), 0)
Example #6
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig = None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package = "WMCore.WMBS",
                            logger = myThread.logger,
                            dbinterface = myThread.dbi)
    killFilesAction = daoFactory(classname = "Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname = "Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName = workflowName,
                            conn = myThread.transaction.conn,
                            transaction = True)

    liveJobs = killJobsAction.execute(workflowName = workflowName,
                                      conn = myThread.transaction.conn,
                                      transaction = True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config = bossAirConfig, noSetup = True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id = liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs = killableJobs)
        except BossAirException as ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master.
            # The batch system will have to take care of itself.
            pass
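
A minimal invocation sketch for killWorkflow follows; the workflow name and the config object are placeholders, and a fully initialized agent thread (myThread.dbi, myThread.logger) plus the JSM couch settings are assumed to exist:

# Hypothetical call; "agentConfig" is a placeholder for a WMAgent Configuration
# object carrying both the JobStateMachine and BossAir sections.
killWorkflow(workflowName="SomeWorkflow",
             jobCouchConfig=agentConfig,
             bossAirConfig=agentConfig)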
Example #7
    def testZ_Profile(self):
        """
        _Profile_

        Do a basic profiling of the algo
        """

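        # NOTE: this test is disabled on purpose; the early return below skips
        # the profiling code that follows.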
        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 150)

        startTime = time.time()
        #cProfile.runctx("testRetryManager.algorithm()", globals(), locals(), filename = "profStats.stat")
        testRetryManager.algorithm(None)
        stopTime  = time.time()

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'New')
        self.assertEqual(len(idList), nJobs)


        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
Example #8
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of the component and tests jobs that failed in the create stage.
        """

        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath,
                                               workloadName=workloadName)
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['lumis'] in [[12312], [12314, 12315, 12316]],
                                "Unknown lumi %s" % f['runs'][0]['lumis'])
                self.assertEqual(f['merged'], 0)
                self.assertEqual(f['first_event'], 88)
            self.assertEqual(counter, 20)
        return
Example #9
    def testF_PollerProfileTest(self):
        """
        _testF_PollerProfileTest_

        Submit a lot of jobs and test how long it takes for
        them to actually be submitted
        """

        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 100
        nJobs = 100
        sites = ['T1_US_FNAL']

        for site in sites:
            self.setResourceThresholds(site, pendingSlots = 20000, runningSlots = -1, tasks = ['Processing', 'Merge'],
                                       Processing = {'pendingSlots' : 10000, 'runningSlots' :-1},
                                       Merge = {'pendingSlots' : 10000, 'runningSlots' :-1, 'priority' : 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config = config)

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                        workloadName),
                                            site = 'se.%s' % 'T1_US_FNAL')

        jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir, 'workloadTest',
                                                                        workloadName),
                                            site = 'se.%s' % 'T1_US_FNAL',
                                            taskType = 'Merge'))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Actually run it
        startTime = time.time()
        cProfile.runctx("jobSubmitter.algorithm()", globals(), locals(), filename = "testStats.stat")
        stopTime = time.time()


        print "Job took %f seconds to complete" % (stopTime - startTime)

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats()

        return
Example #10
    def testY_MultipleIterations(self):
        """
        _MultipleIterations_

        Paranoia based check to see if I'm saving class instances correctly
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "submitfailed", "Created")
        changer.propagate(testJobGroup.jobs, "submitcooloff", "submitfailed")

        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="Created")
        self.assertEqual(len(idList), self.nJobs)

        # Make a new jobGroup for a second run
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # Set job state
        changer.propagate(testJobGroup.jobs, "submitfailed", "created")
        changer.propagate(testJobGroup.jobs, "submitcooloff", "submitfailed")

        # Set them to go off
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"], stateTime=int(time.time()) - 200)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state="SubmitCooloff")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="Created")
        self.assertEqual(len(idList), self.nJobs * 2)

        return
Example #11
    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()
        
        Mimics creation of the component and tests jobs that failed in the create stage.
        """

        workloadName = "TestWorkload"

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload", "WMSandbox", "WMWorkload.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, workloadPath=workloadPath, workloadName=workloadName)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "createfailed", "created")

        idList = self.getJobs.execute(state="CreateFailed")
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state="CreateFailed")
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state="Exhausted")
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection["filesets"]) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f["lfn"] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f["events"], 10)
                self.assertEqual(f["size"], 1024)
                self.assertEqual(f["parents"], [u"/this/is/a/parent"])
                self.assertTrue(
                    f["runs"][0]["lumis"] in [[12312], [12314, 12315, 12316]], "Unknown lumi %s" % f["runs"][0]["lumis"]
                )
                self.assertTrue(f["merged"], 1)
                self.assertTrue(f["first_event"], 88)
            self.assertEqual(counter, 20)
        return
Example #12
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')

        baAPI.kill(jobs = idleJobs)

        del jobSubmitter

        return
Example #13
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        pipe.communicate()

        del jobSubmitter

        return
Example #14
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from CondorPlugin.py
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        dummycacheDir = os.path.join(self.testDir, 'CacheDir')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Mark one of the sites in the site list as ABORTED/DRAINING/DOWN;
        # the updateSiteInformation() method should edit the classAd for all
        # jobs that are bound for that site.
        # Check the queue manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        if jtok is not None:
            baAPI.kill(jtok, errorCode=71301)  # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)

        return
Example #15
    def __init__(self, **configDict):
        """
        init jobCreator
        """

        myThread = threading.currentThread()

        self.transaction = myThread.transaction

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        # WMCore splitter factory for splitting up jobs.
        self.splitterFactory = SplitterFactory()

        config = Configuration()
        config.section_("JobStateMachine")
        config.JobStateMachine.couchurl = configDict["couchURL"]
        config.JobStateMachine.couch_retries = configDict["defaultRetries"]
        config.JobStateMachine.couchDBName = configDict["couchDBName"]

        self.config = config

        # Variables
        self.jobCacheDir = configDict["jobCacheDir"]
        self.defaultJobType = configDict["defaultJobType"]
        self.limit = configDict.get("fileLoadLimit", 500)

        self.createWorkArea = CreateWorkArea()

        self.changeState = ChangeState(self.config)

        return
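
For reference, a sketch of the configDict this constructor expects; the keys mirror the lookups in __init__ above, while the values are placeholders:

# Hypothetical configDict for the jobCreator __init__ above; values are
# illustrative only.
configDict = {
    "couchURL": "http://localhost:5984",   # JobStateMachine couch URL
    "defaultRetries": 5,                   # couch retry count
    "couchDBName": "jsm_job_dump",         # JSM database name
    "jobCacheDir": "/data/jobCache",       # where job work areas are created
    "defaultJobType": "Processing",
    "fileLoadLimit": 500,                  # optional, defaults to 500
}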
Example #16
    def setUp(self):
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules = ["WMCore.WMBS"],
                                useDefault = False)
        self.databaseName = "couchapp_t_0"
        self.testInit.setupCouch(self.databaseName, "WorkloadSummary")
        self.testInit.setupCouch("%s/jobs" % self.databaseName, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.databaseName, "FWJRDump")

        # Setup config for couch connections
        config = self.testInit.getConfiguration()
        config.section_("JobStateMachine")
        config.JobStateMachine.couchDBName  = self.databaseName

        # Create couch server and connect to databases
        self.couchdb      = CouchServer(config.JobStateMachine.couchurl)
        self.jobsdatabase = self.couchdb.connectDatabase("%s/jobs" % config.JobStateMachine.couchDBName)
        self.fwjrdatabase = self.couchdb.connectDatabase("%s/fwjrs" % config.JobStateMachine.couchDBName)

        # Create changeState
        self.changeState = ChangeState(config)
        self.config      = config

        # Create testDir
        self.testDir = self.testInit.generateWorkDir()

        return
Example #17
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)

        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will then be available
        self.initAlerts(compName = "ErrorHandler")

        return
Example #18
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {"default": self.maxRetries}
        if "default" not in self.maxRetries:
            raise ErrorHandlerException("Max retries for the default job type must be specified")

        self.maxProcessSize = getattr(self.config.ErrorHandler, "maxProcessSize", 250)
        self.exitCodes = getattr(self.config.ErrorHandler, "failureExitCodes", [])
        self.maxFailTime = getattr(self.config.ErrorHandler, "maxFailTime", 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, "readFWJR", False)
        self.passCodes = getattr(self.config.ErrorHandler, "passExitCodes", [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl, database=config.ACDC.database)

        return
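
For reference, a sketch of the config.ErrorHandler section this constructor reads; the attribute names mirror the lookups above, while the values are placeholders:

# Hypothetical ErrorHandler config section; values are illustrative only.
config.section_("ErrorHandler")
config.ErrorHandler.maxRetries = {"default": 3, "Merge": 4}  # a plain int also works
config.ErrorHandler.maxProcessSize = 250
config.ErrorHandler.failureExitCodes = [8020]
config.ErrorHandler.maxFailTime = 32 * 3600
config.ErrorHandler.readFWJR = False
config.ErrorHandler.passExitCodes = []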
Example #19
class PauseAlgo(RetryAlgoBase):
    """
    _PauseAlgo_

        This implements the Paused job algorithm, explanation of the concept in #3114
    """
    def __init__ (self, config):
        RetryAlgoBase.__init__(self, config)
        self.changer = ChangeState(config)

    def isReady(self, job, cooloffType):
        """
        Actual function that does the work
        """
        # This should come from configuration: pause_count

        pauseCount = self.getAlgoParam(job['jobType'], param ='pauseCount', defaultReturn = 3)

        pauseMap = {
            'createcooloff': 'createpaused',
            'submitcooloff': 'submitpaused',
            'jobcooloff': 'jobpaused'
        }

        # Apply the SquaredAlgo logic: the cooloff time grows with the square of the retry count
        baseTimeoutDict = self.getAlgoParam(job['jobType'])
        baseTimeout = baseTimeoutDict.get(cooloffType.lower(), 10)
        cooloffTime = baseTimeout * pow(job['retry_count'], 2)
        currentTime = self.timestamp()
        if currentTime - job['state_time'] > cooloffTime:
            retryByTimeOut = True
        else:
            retryByTimeOut = False

        if retryByTimeOut :
            # If reached the pauseCount, we want the job to pause instead of retrying
            if pauseCount == 0:
                self.changer.propagate(job, pauseMap[job['state']], job['state'],  updatesummary=True)
                return False
            elif job['retry_count'] > 0 and not (job['retry_count'] % pauseCount):
                self.changer.propagate(job, pauseMap[job['state']], job['state'],  updatesummary=True)
                return False
            else:
                return True
        else:
            return False
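
The timeout in isReady() grows with the square of the retry count; a standalone sketch of the resulting schedule, assuming the default base timeout of 10 seconds used above when no per-type value is configured:

# Standalone sketch of the SquaredAlgo-style timeout used by PauseAlgo.isReady().
baseTimeout = 10  # seconds; the real value comes from getAlgoParam()
for retryCount in range(1, 5):
    cooloffTime = baseTimeout * pow(retryCount, 2)
    print("retry %d -> job must sit in cooloff for at least %d s" % (retryCount, cooloffTime))
# retry 1 -> 10 s, retry 2 -> 40 s, retry 3 -> 90 s, retry 4 -> 160 s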
Example #20
    def testCheck(self):
        """
        This is the test class for function Check from module ChangeState
        """
        change = ChangeState(self.config, "changestate_t")

        # Run through all good state transitions and assert that they work
        for state in self.transitions.keys():
            for dest in self.transitions[state]:
                change.check(dest, state)
        dummystates = ['dummy1', 'dummy2', 'dummy3', 'dummy4']

        # Then run through some bad state transistions and assertRaises(AssertionError)
        for state in self.transitions.keys():
            for dest in dummystates:
                self.assertRaises(AssertionError, change.check, dest, state)
        return
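
The transitions map iterated by testCheck maps each lowercase state to its allowed destination states; a partial, illustrative sketch built only from the propagate() calls seen in these examples (the real map comes from WMCore's JobStateMachine and is larger):

# Partial transitions map, reconstructed from the propagate() calls in this
# file; illustrative only.
transitions = {
    'new': ['created', 'createfailed'],
    'created': ['executing', 'createfailed', 'submitfailed'],
    'executing': ['complete', 'jobfailed'],
    'complete': ['success', 'jobfailed'],
}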
Example #21
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.changeState = ChangeState(self.config)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.loadAction = self.daoFactory(classname="Jobs.LoadFromIDWithWorkflow")


        # Variables
        self.numberOfJobsToCluster = getattr(self.config.JobArchiver,
                                             "numberOfJobsToCluster", 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobArchiver")

        try:
            self.uploadPublishDir = getattr(self.config.JobArchiver, 'uploadPublishDir',
                                            os.path.join(config.JobArchiver.componentDir, 'publishDir'))
            self.logDir = getattr(config.JobArchiver, 'logDir',
                                  os.path.join(config.JobArchiver.componentDir, 'logDir'))
            if not os.path.isdir(self.logDir):
                os.makedirs(self.logDir)
            if not os.path.isdir(self.uploadPublishDir):
                os.makedirs(self.uploadPublishDir)
        except Exception as ex:
            msg = "Unhandled exception while setting up logDir and/or uploadPublishDir!\n"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            try:
                logging.debug("Directory: %s", self.logDir)
                logging.debug("Config: %s", config)
            except Exception:
                pass
            raise JobArchiverPollerException(msg)

        try:
            from WMCore.WorkQueue.WorkQueueUtils import queueFromConfig
            self.workQueue = queueFromConfig(self.config)
        except Exception as ex:
            msg = "Could not load workQueue"
            msg += str(ex)
            logging.error(msg)
            # raise JobArchiverPollerException(msg)

        self.uploadPublishInfo = getattr(self.config.JobArchiver, 'uploadPublishInfo', False)
        self.userFileCacheURL = getattr(self.config.JobArchiver, 'userFileCacheURL', None)
        self.handleWorkflowInjection = getattr(self.config.JobArchiver,
                                               'handleInjected', True)

        return
Example #22
    def testUpdateFailedDoc(self):
        """
        _testUpdateFailedDoc_

        Verify that the update function will work correctly and not throw a 500
        error if the doc didn't make it into the database for some reason.
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname = "Locations.New")
        locationAction.execute("site1", seName = "somese.cern.ch")

        testWorkflow = Workflow(spec = "spec.xml", owner = "Steve",
                                name = "wf001", task = self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name = "TestFileset")
        testFileset.create()
        testSubscription = Subscription(fileset = testFileset,
                                        workflow = testWorkflow,
                                        split_algo = "FileBased")
        testSubscription.create()

        testFileA = File(lfn = "SomeLFNA", events = 1024, size = 2048,
                         locations = set(["somese.cern.ch"]))
        testFileA.create()
        testFileset.addFile(testFileA)
        testFileset.commit()

        splitter = SplitterFactory()
        jobFactory = splitter(package = "WMCore.WMBS",
                              subscription = testSubscription)
        jobGroup = jobFactory(files_per_job = 1)[0]

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Merge"
        testJobA["couch_record"] = str(testJobA["id"])

        change.propagate([testJobA], "new", "none")
        testJobADoc = change.jobsdatabase.document(testJobA["couch_record"])

        self.assertTrue("states" in testJobADoc)
        self.assertTrue("1" in testJobADoc["states"])
        return
Example #23
    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of the component and tests jobs that failed in the execute stage.
        """
        workloadName = "TestWorkload"

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, "workloadTest", "TestWorkload", "WMSandbox", "WMWorkload.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, "created", "new")
        changer.propagate(testJobGroup.jobs, "executing", "created")
        changer.propagate(testJobGroup.jobs, "complete", "executing")
        changer.propagate(testJobGroup.jobs, "jobfailed", "complete")

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state="JobFailed")
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state="JobCooloff")
        self.assertEqual(len(idList), self.nJobs)
        return
Example #24
    def testA_testSubmit(self):
        """
        _testSubmit_

        Test whether we pick up submitted jobs
        """

        #workload = self.createWorkload()
        jobGroup = self.createTestJobGroup()
        config   = self.getConfig()

        xmlPath = os.path.join(WMCore.WMBase.getTestBase(),
                               "WMCore_t/FwkJobReport_t/PerformanceReport.xml")
        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        changer = ChangeState(config)
        for job in jobGroup.jobs:
            job['fwjr'] = myReport
        changer.propagate(jobGroup.jobs, "complete", "executing")
        changer.propagate(jobGroup.jobs, "success", "complete")

        dashboardReporter = DashboardReporterPoller(config = config)

        dashboardReporter.algorithm()

        # Not obvious what to assert here; at least exercise the failed-job path too
        changer.propagate(jobGroup.jobs, 'jobfailed', 'executing')

        dashboardReporter.algorithm()

        return
Example #25
    def testB_SpeedTest(self):
        """
        _SpeedTest_

        Tests the components, as in sees if they load.
        Otherwise does nothing.
        """
        return
        myThread = threading.currentThread()

        config = self.getConfig()

        self.nJobs = 2000

        testJobGroup = self.createTestJobGroup()

        changer = ChangeState(config)

        cacheDir = os.path.join(self.testDir, 'test')

        for job in testJobGroup.jobs:
            job["outcome"] = "success"
            job.save()
            path = os.path.join(cacheDir, job['name'])
            os.makedirs(path)
            f = open('%s/%s.out' %(path, job['name']),'w')
            f.write(job['name'])
            f.close()
            job.setCache(path)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'success', 'complete')

        testJobArchiver = JobArchiverPoller(config = config)
        cProfile.runctx("testJobArchiver.algorithm()", globals(), locals(), filename = "testStats.stat")


        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)

        return
Example #26
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

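        # NOTE: this test is disabled on purpose; the early return below skips
        # the profiling code that follows.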
        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        startTime = time.time()
        #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
        testErrorHandler.algorithm()
        stopTime = time.time()

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)
        
        return
Example #27
    def testA_Create(self):
        """
        WMComponent_t.RetryManager_t.RetryManager_t:testCreate()

        Mimics creation of the component and tests jobs that failed in the create stage.
        """
        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')
        changer.propagate(testJobGroup.jobs, 'createcooloff', 'createfailed')

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID = job["id"],
                                    stateTime = int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Created')
        self.assertEqual(len(idList), self.nJobs)
        return
Example #28
    def __init__(self, config, insertStates=False):
        """
        __init__

        BossAir should work with the standard config
        structure of WMAgent
        """

        WMConnectionBase.__init__(self, daoPackage="WMCore.BossAir")

        myThread = threading.currentThread()

        self.config = config
        self.plugins = {}
        self.states = []

        self.jobs = []

        self.pluginDir = config.BossAir.pluginDir
        # This is the default state jobs are created in
        self.newState = getattr(config.BossAir, 'newState', 'New')

        # Get any proxy info
        self.checkProxy = getattr(config.BossAir, 'checkProxy', False)
        self.cert = getattr(config.BossAir, 'cert', None)

        self.stateMachine = ChangeState(self.config)

        # Create a factory to load plugins
        self.pluginFactory = WMFactory("plugins", self.pluginDir)

        self.daoFactory = DAOFactory(package="WMCore.BossAir",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.deleteDAO = self.daoFactory(classname="DeleteJobs")
        self.stateDAO = self.daoFactory(classname="NewState")
        self.loadByWMBSDAO = self.daoFactory(classname="LoadByWMBSID")
        self.updateDAO = self.daoFactory(classname="UpdateJobs")
        self.newJobDAO = self.daoFactory(classname="NewJobs")
        self.runningJobDAO = self.daoFactory(classname="LoadRunning")
        self.completeJobDAO = self.daoFactory(classname="LoadComplete")
        self.loadJobsDAO = self.daoFactory(classname="LoadByStatus")
        self.completeDAO = self.daoFactory(classname="CompleteJob")
        self.monitorDAO = self.daoFactory(classname="JobStatusForMonitoring")

        self.states = None
        self.loadPlugin(insertStates)

        return
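
For reference, a sketch of the config.BossAir section this constructor reads, together with the two attributes the tests above set (pluginName, submitWMSMode); the values are placeholders:

# Hypothetical BossAir config section; attribute names are the ones read in
# __init__ above and set in the tests, values are illustrative only.
config.section_("BossAir")
config.BossAir.pluginDir = "WMCore.BossAir.Plugins"  # where plugins are loaded from
config.BossAir.pluginName = "CondorPlugin"
config.BossAir.submitWMSMode = True
config.BossAir.newState = "New"     # default state for newly created jobs
config.BossAir.checkProxy = False
config.BossAir.cert = None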
Example #29
    def testB_CheckExecutingJobsAndProfile(self):
        """
        _CheckExecutingJobsAndProfile_
        
        Pull up some executing jobs and profile them.
        """
        return
        jobGroup = self.createTestJobGroup()
        config   = self.getConfig()

        changer = ChangeState(config)
        changer.propagate(jobGroup.jobs, "executing", "created")

        dashboardReporter = DashboardReporterPoller(config = config)
        import cProfile, pstats
        cProfile.runctx("dashboardReporter.algorithm()", globals(), locals(), filename = "testStats.stat")

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)
        #dashboardReporter.algorithm()

        return
Example #30
    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs, retry_count=5,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id=1)  # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)


        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB None for the test,
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)


        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)
Example #31
    def testE_FailJobs(self):
        """
        _FailJobs_

        Test our ability to fail jobs based on the information in the FWJR
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath,
                                               fwjrPath=fwjrPath)

        config = self.getConfig()
        config.ErrorHandler.readFWJR = True
        config.ErrorHandler.failureExitCodes = [8020]
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        # This should exhaust all jobs due to exit code
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.failureExitCodes = []
        config.ErrorHandler.maxFailTime = -10
        testErrorHandler2 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler2.algorithm(None)

        # This should exhaust all jobs due to timeout
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime = 24 * 3600
        config.ErrorHandler.passExitCodes = [8020]
        testErrorHandler3 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler3.algorithm(None)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        return
Example #32
    def testH_PauseAlgo(self):
        """
        _testH_PauseAlgo_

        Test the pause algorithm; note that given pauseCount = n, the
        job will first run n + 1 times before being paused.
        After that it will be paused every n retries.
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 20,
            'submit': 20,
            'job': 20
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 2
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        # Making sure that jobs are not created ahead of time
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 15)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 25)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Make sure that no change happens before timeout
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 75)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin pauses them
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # Emulating ops retrying the job
        changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

        # Making sure it did the right thing
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 185)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 315)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 325)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        return
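
A quick aside on what the backdated timestamps above are probing. The sketch below is a minimal stand-in for the pause/cooloff decision, not the real WMCore PauseAlgo: the quadratic backoff (coolOffTime * retryCount ** 2) and the pause-every-pauseCount rule are inferred from this test's 20/80/180/320-second thresholds, and pauseAlgoStep is a hypothetical helper.

import time

def pauseAlgoStep(retryCount, stateTime, coolOffTime=20, pauseCount=2, now=None):
    """Decide the next state for a failed job (sketch only)."""
    if now is None:
        now = time.time()
    if now - stateTime < coolOffTime * retryCount ** 2:
        return 'jobcooloff'                # still cooling off
    if retryCount and retryCount % pauseCount == 0:
        return 'jobpaused'                 # wait for operator intervention
    return 'created'                       # cooloff expired, retry the job

now = time.time()
assert pauseAlgoStep(1, now - 15, now=now) == 'jobcooloff'
assert pauseAlgoStep(1, now - 25, now=now) == 'created'
assert pauseAlgoStep(2, now - 85, now=now) == 'jobpaused'
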
Example #33
    def testF_PollerProfileTest(self):
        """
        _testF_PollerProfileTest_

        Submit a lot of jobs and test how long it takes for
        them to actually be submitted
        """

        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 100
        nJobs = 100
        site = "T1_US_FNAL"

        self.setResourceThresholds(site,
                                   pendingSlots=20000,
                                   runningSlots=-1,
                                   tasks=['Processing', 'Merge'],
                                   Processing={
                                       'pendingSlots': 10000,
                                       'runningSlots': -1
                                   },
                                   Merge={
                                       'pendingSlots': 10000,
                                       'runningSlots': -1,
                                       'priority': 5
                                   })

        # Always initialize the submitter after setting the sites, flaky!
        JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)

        jobGroupList.extend(
            self.createJobGroups(nSubs=nSubs,
                                 nJobs=nJobs,
                                 task=workload.getTask("ReReco"),
                                 workloadSpec=self.workloadSpecPath,
                                 site=site,
                                 taskType='Merge'))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Actually run it
        startTime = time.time()
        cProfile.runctx("JobSubmitterPoller(config=config).algorithm()",
                        globals(),
                        locals(),
                        filename="testStats.stat")
        stopTime = time.time()

        print("Job took %f seconds to complete" % (stopTime - startTime))

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats()

        return
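
The profiling idiom in this test is pure standard library and works for any statement; a self-contained version looks like this (profileStatement is a hypothetical wrapper):

import cProfile
import pstats

def profileStatement(statement, globalsDict, localsDict, statFile='testStats.stat'):
    """Profile a statement and print a cumulative-time report (stdlib only)."""
    cProfile.runctx(statement, globalsDict, localsDict, filename=statFile)
    stats = pstats.Stats(statFile)
    stats.sort_stats('cumulative')
    stats.print_stats(20)   # the top ~20 entries usually expose the hotspots

# Example: profileStatement("sorted(range(10 ** 6))", globals(), locals())
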
Example #34
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)
        self.config = config

        #Libraries
        self.resourceControl = ResourceControl()

        self.changeState = ChangeState(self.config)
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))

        # BossAir
        self.bossAir = BossAirAPI(config=self.config)

        # Additions for caching-based JobSubmitter
        self.workflowTimestamps = {}
        self.workflowPrios = {}
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.siteKeys = {}
        self.locationDict = {}
        self.cmsNames = {}
        self.drainSites = set()
        self.abortSites = set()
        self.sortedSites = []
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobSubmitter")

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            try:
                logging.debug("PackageDir: %s" % self.packageDir)
                logging.debug("Config: %s" % config)
            except Exception:
                pass
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        return
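
Much of this constructor is the getattr-with-default idiom, so optional settings (repollCount, maxJobsPerPoll, packageSize, collectionSize, submitDir) never need to appear in the config file. A minimal illustration, with _Section as a hypothetical stand-in for a WMAgent config section:

class _Section(object):
    """Hypothetical stand-in for a WMAgent configuration section."""
    pass

config = _Section()
config.JobSubmitter = _Section()
config.JobSubmitter.componentDir = '/tmp/JobSubmitter'

# Optional settings fall back to hard-coded defaults when absent
packageSize = getattr(config.JobSubmitter, 'packageSize', 500)
collSize = getattr(config.JobSubmitter, 'collectionSize', packageSize * 1000)

# submitDir defaults to the component directory when missing or empty
if not getattr(config.JobSubmitter, 'submitDir', None):
    config.JobSubmitter.submitDir = config.JobSubmitter.componentDir

assert (packageSize, collSize) == (500, 500000)
assert config.JobSubmitter.submitDir == '/tmp/JobSubmitter'
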
Example #35
    def testB_thresholdTest(self):
        """
        _testB_thresholdTest_

        Check that the threshold management is working,
        this requires checks on pending/running jobs globally
        at a site and per task/site
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=220, tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 45, 'runningSlots': 200},
                                   Merge={'pendingSlots': 10, 'runningSlots': 20, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter.algorithm()

        # Check that jobs are in the right state,
        # here we are limited by the pending threshold for the Processing task (45)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T1_US_FNAL']])

        # Run another cycle, it shouldn't submit anything. Jobs are still in pending
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now add 10 Merge jobs; only 5 can be submitted before we hit the global pending threshold for the site
        nSubs = 1
        nJobs = 10
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            taskType='Merge')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now let's test the running thresholds. The scenario:
        #  - move all currently pending jobs to running
        #  - create 300 Processing jobs and 300 Merge jobs
        #  - run 5 polling cycles, moving all pending jobs to running in between
        # Expected end state: Merge at 30 running / 0 pending with 280 in queue,
        # Processing at 240 running / 0 pending with 110 in queue.
        # This exercises all threshold dynamics, including the prioritization
        # of Merge over Processing.
        nSubs = 1
        nJobs = 300
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site,
                                                 taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for i in range(5):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')
            jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), 240)
        result = getJobsAction.execute(state='Created', jobType='Processing')
        self.assertEqual(len(result), 110)
        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), 30)
        result = getJobsAction.execute(state='Created', jobType='Merge')
        self.assertEqual(len(result), 280)

        return
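
The assertions above follow from simple slot bookkeeping: each cycle a task may submit at most min(jobs in Created, free site pending slots, free task pending slots). A sketch with a hypothetical helper, checked against the first two cycles of this test:

def submittable(created, sitePendingSlots, sitePending, taskPendingSlots, taskPending):
    """How many jobs one task may submit this cycle (bookkeeping sketch only)."""
    siteRoom = sitePendingSlots - sitePending
    taskRoom = taskPendingSlots - taskPending
    return max(0, min(created, siteRoom, taskRoom))

# First cycle: 50 Processing jobs created, 50 site pending slots, 45 for
# the Processing task -> 45 submitted, 5 left in Created.
assert submittable(50, 50, 0, 45, 0) == 45

# Merge cycle: 10 Merge jobs, the site already has 45 of its 50 pending
# slots used, Merge allows 10 -> only 5 can go.
assert submittable(10, 50, 45, 10, 0) == 5
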
Example #36
    def testE_SiteModesTest(self):
        """
        _testE_SiteModesTest_

        Test the behavior of the submitter in response to the different
        states of the sites
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        nSubs = 1
        nJobs = 20

        sites = ['T2_US_Florida', 'T2_RU_INR', 'T3_CO_Uniandes', 'T1_US_FNAL']
        for site in sites:
            self.setResourceThresholds(site, pendingSlots=10, runningSlots=999999, tasks=['Processing', 'Merge'],
                                       Processing={'pendingSlots': 10, 'runningSlots': 999999},
                                       Merge={'pendingSlots': 10, 'runningSlots': 999999, 'priority': 5})

        myResourceControl = ResourceControl(config)
        myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
        # First test that we prefer Normal over drain, and T1 over T2/T3
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        # Actually run it
        jobSubmitter.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # All jobs should be at FNAL, INR or Uniandes (a random pick among the non-draining sites)
        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        locationDict = getLocationAction.execute([{'jobid': x} for x in result])
        for entry in locationDict:
            loc = entry['site_name']
            self.assertNotEqual(loc, 'T2_US_Florida')

        # Now set everything to down, check we don't submit anything
        for site in sites:
            myResourceControl.changeSiteState(site, 'Down')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter.algorithm()
        # Nothing is submitted despite the empty slots at Uniandes and Florida
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Now set everything to Drain and create Merge jobs. Those should be submitted
        for site in sites:
            myResourceControl.changeSiteState(site, 'Draining')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)

        # Now set everything to Aborted and create Merge jobs. Those should go
        # to SubmitFailed, since every site they could run at is Aborted
        for site in sites:
            myResourceControl.changeSiteState(site, 'Aborted')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='SubmitFailed', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)
        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), nSubs * nJobs)

        return
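
The site-state rules this test exercises reduce to a small predicate: Normal sites accept any job, Draining sites accept only work that helps empty the site, and Down or Aborted sites accept nothing. In the sketch below the state names come from the test, while siteAcceptsJob and the exact list of drain-eligible task types are assumptions:

def siteAcceptsJob(siteState, taskType):
    """Sketch of the per-site submission gate exercised above."""
    if siteState == 'Normal':
        return True
    if siteState == 'Draining':
        # only work that helps drain the site gets through
        return taskType in ('Merge', 'Cleanup', 'LogCollect')
    return False     # 'Down' and 'Aborted' accept nothing

assert siteAcceptsJob('Draining', 'Merge')
assert not siteAcceptsJob('Draining', 'Processing')
assert not siteAcceptsJob('Aborted', 'Merge')
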
Example #37
    def testY_MultipleIterations(self):
        """
        _MultipleIterations_

        Paranoia based check to see if I'm saving class instances correctly
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'Created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 50)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 150)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        # Make a new jobGroup for a second run
        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # Set job state
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        # Set them to go off
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 200)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs * 2)

        return
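
A recurring idiom in these RetryManager tests is backdating the job state timestamp so the poller believes the cooloff window has already elapsed. Extracted into a hypothetical helper (setJobTime is the DAO used throughout this file):

import time

def backdate(setJobTime, jobs, secondsAgo):
    """Push each job's last state-change time secondsAgo into the past."""
    cutoff = int(time.time()) - secondsAgo
    for job in jobs:
        setJobTime.execute(jobID=job["id"], stateTime=cutoff)

# backdate(self.setJobTime, testJobGroup.jobs, 150)   # 120s cooloff now expired
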
Example #38
class JobCreatorWorker:
    """
    This is the ProcessPool worker class that actually
    runs the jobCreator
    """
    def __init__(self, **configDict):
        """
        init jobCreator
        """

        myThread = threading.currentThread()

        self.transaction = myThread.transaction

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        # WMCore splitter factory for splitting up jobs.
        self.splitterFactory = SplitterFactory()

        config = Configuration()
        config.section_("JobStateMachine")
        config.JobStateMachine.couchurl = configDict["couchURL"]
        config.JobStateMachine.couch_retries = configDict["defaultRetries"]
        config.JobStateMachine.couchDBName = configDict["couchDBName"]

        self.config = config

        # Variables
        self.jobCacheDir = configDict['jobCacheDir']
        self.defaultJobType = configDict['defaultJobType']
        self.limit = configDict.get('fileLoadLimit', 500)

        self.createWorkArea = CreateWorkArea()

        self.changeState = ChangeState(self.config)

        return

    def __call__(self, parameters):
        """
        Poller for looking in all active subscriptions for jobs that need to be made.

        """

        logging.info("In JobCreatorWorker.__call__")

        myThread = threading.currentThread()

        for entry in parameters:
            # This retrieves a single subscription
            subscriptionID = entry.get('subscription')

            if subscriptionID < 0:
                logging.error("Got non-existant subscription")
                logging.error("Assuming parameters in error: returning")
                return subscriptionID

            myThread.transaction.begin()

            logging.info("About to call subscription %i", subscriptionID)

            wmbsSubscription = Subscription(id=subscriptionID)
            wmbsSubscription.load()
            wmbsSubscription["workflow"].load()
            workflow = wmbsSubscription["workflow"]

            wmWorkload = retrieveWMSpec(wmbsSubscription)

            if not workflow.task or not wmWorkload:
                # Then we have a problem
                # We have no sandbox
                # We NEED a sandbox
                # Abort this subscription!
                # But do NOT fail
                # We have no way of marking a subscription as bad per se
                # We'll have to just keep skipping it
                wmTask = None
                seederList = []
                logging.error("Have no task for workflow %i", workflow.id)
                logging.error("Aborting Subscription %i", subscriptionID)
                continue

            else:
                wmTask = wmWorkload.getTaskByPath(workflow.task)
                if hasattr(wmTask.data, 'seeders'):
                    manager = GeneratorManager(wmTask)
                    seederList = manager.getGeneratorList()
                else:
                    seederList = []

            logging.info("About to enter JobFactory")
            logging.debug("Going to call wmbsJobFactory with limit %i",
                          self.limit)

            # My hope is that the job factory is smart enough only to split un-split jobs
            wmbsJobFactory = self.splitterFactory(
                package="WMCore.WMBS",
                subscription=wmbsSubscription,
                generators=seederList,
                limit=self.limit)
            splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
            logging.debug("Split Params: %s", splitParams)

            continueSubscription = True
            myThread.transaction.commit()

            # Turn on the jobFactory
            myThread.transaction.begin()
            wmbsJobFactory.open()

            # Create a function to hold it
            jobSplittingFunction = runSplitter(jobFactory=wmbsJobFactory,
                                               splitParams=splitParams)
            while continueSubscription:
                # This loop runs over the jobFactory,
                # using yield statements and a pre-existing proxy to
                # generate and process new jobs

                # First we need the jobs.

                try:
                    wmbsJobGroups = next(jobSplittingFunction)
                    logging.info("Retrieved %i jobGroups from jobSplitter",
                                 len(wmbsJobGroups))
                except StopIteration:
                    # If you receive a stopIteration, we're done
                    logging.info("Completed iteration over subscription %i",
                                 subscriptionID)
                    continueSubscription = False
                    continue

                # Now we get to find out what job they are.
                countJobs = self.daoFactory(
                    classname="Jobs.GetNumberOfJobsPerWorkflow")
                jobNumber = countJobs.execute(workflow=workflow.id,
                                              conn=myThread.transaction.conn,
                                              transaction=True)
                logging.debug("Have %i jobs for this workflow already",
                              jobNumber)

                for wmbsJobGroup in wmbsJobGroups:

                    logging.debug("Processing jobGroup %i",
                                  wmbsJobGroup.exists())
                    logging.debug("Processing %i jobs", len(wmbsJobGroup.jobs))

                    # Create a directory
                    self.createWorkArea.processJobs(
                        jobGroup=wmbsJobGroup,
                        startDir=self.jobCacheDir,
                        workflow=workflow,
                        wmWorkload=wmWorkload,
                        transaction=myThread.transaction,
                        conn=myThread.transaction.conn)

                    for job in wmbsJobGroup.jobs:
                        jobNumber += 1
                        self.saveJob(job=job,
                                     workflow=workflow,
                                     wmTask=wmTask,
                                     jobNumber=jobNumber)

                    self.advanceJobGroup(wmbsJobGroup)

                    logging.debug("Finished call for jobGroup %i",
                                  wmbsJobGroup.exists())

            # END: while loop over jobSplitter
            myThread.transaction.commit()

            # About to reset everything
            wmbsJobGroups = None
            wmTask = None
            wmWorkload = None
            splitParams = None
            wmbsJobFactory = None
            gc.collect()

            # About to check memory
            doMemoryCheck(
                "About to get memory references: End of subscription loop")

        # Final memory check
        doMemoryCheck("About to get memory references: End of __call__()")

        logging.debug("About to return from JobCreatorWorker.__call__()")

        return parameters

    def saveJob(self, job, workflow, wmTask=None, jobNumber=0):
        """
        _saveJob_

        Actually do the mechanics of saving the job to a pickle file
        """
        priority = None

        if wmTask:
            # If we managed to load the task,
            # the spec URL should be valid
            job['spec'] = workflow.spec
            job['task'] = wmTask.getPathName()
            priority = wmTask.getTaskPriority()
            if job.get('sandbox', None) is None:
                job['sandbox'] = wmTask.data.input.sandbox

        job['priority'] = priority
        job['counter'] = jobNumber
        cacheDir = job.getCache()
        job['cache_dir'] = cacheDir
        output = open(os.path.join(cacheDir, 'job.pkl'), 'wb')  # binary mode: pickle writes bytes
        pickle.dump(job, output, pickle.HIGHEST_PROTOCOL)
        output.flush()
        os.fsync(output.fileno())
        output.close()

        return

    def advanceJobGroup(self, wmbsJobGroup):
        """
        Pass this on to the jobCreator, which actually does the work

        """

        # Create the job
        self.changeState.propagate(wmbsJobGroup.jobs, 'created', 'new')

        logging.info("JobCreator has created jobGroup %i and is ending",
                     wmbsJobGroup.id)

        return
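
For completeness, reading a job back out of the cache written by saveJob() above is the mirror-image idiom; a minimal sketch (loadJob is hypothetical, the file name comes from the code above):

import os
import pickle

def loadJob(cacheDir):
    """Read back a job written by saveJob(); binary mode must match the writer."""
    with open(os.path.join(cacheDir, 'job.pkl'), 'rb') as handle:
        return pickle.load(handle)
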
Example #39
class JobCreatorPoller(BaseWorkerThread):
    """
    Poller that does the work of job creation.
    Polls active subscriptions, asks for more work, and checks with local sites.

    """
    def __init__(self, config):
        """
        init jobCreator
        """

        BaseWorkerThread.__init__(self)

        myThread = threading.currentThread()

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        self.setBulkCache = self.daoFactory(classname="Jobs.SetCache")
        self.countJobs = self.daoFactory(
            classname="Jobs.GetNumberOfJobsPerWorkflow")
        self.subscriptionList = self.daoFactory(
            classname="Subscriptions.ListIncomplete")
        self.setFWJRPath = self.daoFactory(classname="Jobs.SetFWJRPath")

        #information
        self.config = config

        #Variables
        self.defaultJobType = config.JobCreator.defaultJobType
        self.limit = getattr(config.JobCreator, 'fileLoadLimit', 500)
        self.agentNumber = int(getattr(config.Agent, 'agentNumber', 0))
        self.glideinLimits = getattr(config.JobCreator, 'GlideInRestriction',
                                     None)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will be then be available
        self.initAlerts(compName="JobCreator")

        try:
            self.jobCacheDir = getattr(
                config.JobCreator, 'jobCacheDir',
                os.path.join(config.JobCreator.componentDir, 'jobCacheDir'))
            self.check()
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled exception while setting up jobCacheDir!\n"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            raise JobCreatorException(msg)

        self.changeState = ChangeState(self.config)

        return

    def check(self):
        """
        Initial sanity checks on necessary environment factors

        """

        if not os.path.isdir(self.jobCacheDir):
            if not os.path.exists(self.jobCacheDir):
                os.makedirs(self.jobCacheDir)
            else:
                msg = "Assigned a pre-existant cache object %s.  Failing!" \
                      % (self.jobCacheDir)
                raise JobCreatorException(msg)

    def algorithm(self, parameters=None):
        """
        Actually runs the code
        """
        logging.debug("Running JSM.JobCreator")
        try:
            self.pollSubscriptions()
        except WMException:
            #self.close()
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', False) \
                   and getattr(myThread.transaction, 'transaction', False):
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            #self.close()
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', False) \
                   and getattr(myThread.transaction, 'transaction', False):
                myThread.transaction.rollback()
            # Handle temporary connection problems (Temporary)
            if "(InterfaceError) not connected" in str(ex):
                logging.error(
                    'There was a connection problem during the JobCreator algorithm, I will try again next cycle'
                )
            else:
                msg = "Failed to execute JobCreator \n%s\n\n%s" % (
                    ex, traceback.format_exc())
                logging.error(msg)
                raise JobCreatorException(msg)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def pollSubscriptions(self):
        """
        Poller for looking in all active subscriptions for jobs that need to be made.

        """
        logging.info("Beginning JobCreator.pollSubscriptions() cycle.")
        myThread = threading.currentThread()

        #First, get list of Subscriptions
        subscriptions = self.subscriptionList.execute()

        # Okay, now we have a list of subscriptions
        for subscriptionID in subscriptions:
            wmbsSubscription = Subscription(id=subscriptionID)
            try:
                wmbsSubscription.load()
            except IndexError:
                # This happens when the subscription no longer exists
                # i.e., someone executed a kill() function on the database
                # while the JobCreator was in cycle
                # Ignore this subscription
                msg = "JobCreator cannot load subscription %i" % subscriptionID
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                continue

            workflow = Workflow(id=wmbsSubscription["workflow"].id)
            workflow.load()
            wmbsSubscription['workflow'] = workflow
            wmWorkload = retrieveWMSpec(workflow=workflow)

            if not workflow.task or not wmWorkload:
                # Then we have a problem
                # We NEED a sandbox
                # Abort this subscription!
                # But do NOT fail
                # We have no way of marking a subscription as bad per se
                # We'll have to just keep skipping it
                msg = "Have no task for workflow %i\n" % (workflow.id)
                msg += "Aborting Subscription %i" % (subscriptionID)
                logging.error(msg)
                self.sendAlert(1, msg=msg)
                continue

            logging.debug("Have loaded subscription %i with workflow %i\n" %
                          (subscriptionID, workflow.id))

            # retrieve information from the workload to propagate down to the job configuration
            allowOpport = wmWorkload.getAllowOpportunistic()

            # Set task object
            wmTask = wmWorkload.getTaskByPath(workflow.task)

            # Get generators
            # If you fail to load the generators, pass on the job
            try:
                if hasattr(wmTask.data, 'generators'):
                    manager = GeneratorManager(wmTask)
                    seederList = manager.getGeneratorList()
                else:
                    seederList = []
            except Exception as ex:
                msg = "Had failure loading generators for subscription %i\n" % (
                    subscriptionID)
                msg += "Exception: %s\n" % str(ex)
                msg += "Passing over this error.  It will reoccur next interation!\n"
                msg += "Please check or remove this subscription!\n"
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                continue

            logging.debug(
                "Going to call wmbsJobFactory for sub %i with limit %i" %
                (subscriptionID, self.limit))

            splitParams = retrieveJobSplitParams(wmWorkload, workflow.task)
            logging.debug("Split Params: %s" % splitParams)

            # My hope is that the job factory is smart enough only to split un-split jobs
            splitterFactory = SplitterFactory(
                splitParams.get('algo_package', "WMCore.JobSplitting"))
            wmbsJobFactory = splitterFactory(package="WMCore.WMBS",
                                             subscription=wmbsSubscription,
                                             generators=seederList,
                                             limit=self.limit)

            # Turn on the jobFactory
            wmbsJobFactory.open()

            # Create a function to hold it
            jobSplittingFunction = runSplitter(jobFactory=wmbsJobFactory,
                                               splitParams=splitParams)

            # Now we get to find out how many jobs there are.
            jobNumber = self.countJobs.execute(workflow=workflow.id,
                                               conn=myThread.transaction.conn,
                                               transaction=True)
            jobNumber += splitParams.get('initial_lfn_counter', 0)
            logging.debug("Have %i jobs for workflow %s already in database." %
                          (jobNumber, workflow.name))

            continueSubscription = True
            while continueSubscription:
                # This loop runs over the jobFactory,
                # using yield statements and a pre-existing proxy to
                # generate and process new jobs

                # First we need the jobs.
                myThread.transaction.begin()
                try:
                    wmbsJobGroups = next(jobSplittingFunction)
                    logging.info("Retrieved %i jobGroups from jobSplitter" %
                                 (len(wmbsJobGroups)))
                except StopIteration:
                    # If you receive a stopIteration, we're done
                    logging.info("Completed iteration over subscription %i" %
                                 (subscriptionID))
                    continueSubscription = False
                    myThread.transaction.commit()
                    break

                # If we have no jobGroups, we're done
                if len(wmbsJobGroups) == 0:
                    logging.info(
                        "Found end in iteration over subscription %i" %
                        (subscriptionID))
                    continueSubscription = False
                    myThread.transaction.commit()
                    break

                # Assemble a dict of all the info
                processDict = {
                    'workflow': workflow,
                    'wmWorkload': wmWorkload,
                    'wmTaskName': wmTask.getPathName(),
                    'jobNumber': jobNumber,
                    'sandbox': wmTask.data.input.sandbox,
                    'owner': wmWorkload.getOwner().get('name', None),
                    'ownerDN': wmWorkload.getOwner().get('dn', None),
                    'ownerGroup': wmWorkload.getOwner().get('vogroup', ''),
                    'ownerRole': wmWorkload.getOwner().get('vorole', ''),
                    'numberOfCores': 1,
                    'inputDataset': wmTask.getInputDatasetPath()
                }
                try:
                    maxCores = 1
                    stepNames = wmTask.listAllStepNames()
                    for stepName in stepNames:
                        sh = wmTask.getStep(stepName)
                        maxCores = max(maxCores, sh.getNumberOfCores())
                    processDict.update({'numberOfCores': maxCores})
                except AttributeError:
                    logging.info(
                        "Failed to read multicore settings from task %s" %
                        wmTask.getPathName())

                tempSubscription = Subscription(id=wmbsSubscription['id'])

                # if we have glideinWMS constraints, then adapt all jobs
                if self.glideinLimits:
                    capResourceEstimates(wmbsJobGroups,
                                         processDict['numberOfCores'],
                                         self.glideinLimits)

                nameDictList = []
                for wmbsJobGroup in wmbsJobGroups:
                    # For each jobGroup, put a dictionary
                    # together and run it with creatorProcess
                    jobsInGroup = len(wmbsJobGroup.jobs)
                    wmbsJobGroup.subscription = tempSubscription
                    tempDict = {}
                    tempDict.update(processDict)
                    tempDict['jobGroup'] = wmbsJobGroup
                    tempDict['swVersion'] = wmTask.getSwVersion()
                    tempDict['scramArch'] = wmTask.getScramArch()
                    tempDict['jobNumber'] = jobNumber
                    tempDict['agentNumber'] = self.agentNumber
                    tempDict['inputDatasetLocations'] = wmbsJobGroup.getLocationsForJobs()
                    tempDict['allowOpportunistic'] = allowOpport

                    jobGroup = creatorProcess(work=tempDict,
                                              jobCacheDir=self.jobCacheDir)
                    jobNumber += jobsInGroup

                    # Set jobCache for group
                    for job in jobGroup.jobs:
                        nameDictList.append({
                            'jobid': job['id'],
                            'cacheDir': job['cache_dir']
                        })
                        job["user"] = wmWorkload.getOwner()["name"]
                        job["group"] = wmWorkload.getOwner()["group"]
                # Set the caches in the database
                try:
                    if len(nameDictList) > 0:
                        self.setBulkCache.execute(
                            jobDictList=nameDictList,
                            conn=myThread.transaction.conn,
                            transaction=True)
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unknown exception while setting the bulk cache:\n"
                    msg += str(ex)
                    logging.error(msg)
                    self.sendAlert(6, msg=msg)
                    logging.debug(
                        "Error while setting bulkCache with following values: %s\n"
                        % nameDictList)
                    raise JobCreatorException(msg)

                # Advance the jobGroup in changeState
                for wmbsJobGroup in wmbsJobGroups:
                    self.advanceJobGroup(wmbsJobGroup=wmbsJobGroup)

                # Now end the transaction so that everything is wrapped
                # in a single rollback
                myThread.transaction.commit()

            # END: While loop over jobFactory

            # Close the jobFactory
            wmbsJobFactory.close()

        return

# This is the code for the multiprocessing based queue retrieval system
# I'm keeping this here because I hope to go back and re-instate this once
# I figure out how to deal with the transaction problems.
# Keep until we can assure ourselves that what we're doing now presents no problems.

#        while True:
#            try:
#                # Get stuff out of the queue with a ridiculously
#                # short wait time
#                startTime = time.time()
#                entry = self.result.get(timeout = self.wait)
#
#                if not entry.get('success', False):
#                    msg =  "Encountered exception processing jobGroup\n"
#                    msg += entry.get('msg', '')
#                    logging.error(msg)
#                    continue
#                else:
#                    wmbsJobGroup = entry.get('jobGroup')
#            except Queue.Empty:
#                # This means the queue has no current results
#                stopTime  = time.time()
#                logging.error("Found empty queue after %f seconds" % (stopTime - startTime))
#                break
#
#            # Set the cache dir
#            nameDictList = []
#            for job in wmbsJobGroup.jobs:
#                nameDictList.append({'jobid':job['id'], 'cacheDir':job['cache_dir']})
#
#
#            try:
#                myThread.transaction.begin()
#                self.setBulkCache.execute(jobDictList = nameDictList,
#                                          conn = myThread.transaction.conn,
#                                          transaction = True)
#                myThread.transaction.commit()
#            except Exception, ex:
#                msg =  "Unhandled exception while setting jobCache for jobGroup %i\n" % wmbsJobGroup.id
#                msg += str(ex)
#                logging.error(msg)
#                raise JobCreatorException(msg)
#
#            self.advanceJobGroup(wmbsJobGroup = wmbsJobGroup)
#
#            logging.debug("Finished call for jobGroup %i" \
#                          % (wmbsJobGroup.id))
#
#        #END: While loop over wmbsJob
#
#        return

    def advanceJobGroup(self, wmbsJobGroup):
        """
        _advanceJobGroup_

        Mark jobGroup as ready in changeState
        """
        try:
            self.changeState.propagate(wmbsJobGroup.jobs, 'created', 'new')

            # Use a list, not filter(): in Python 3 filter() returns a
            # one-shot iterator and createFailedJobs is consumed twice below
            createFailedJobs = [x for x in wmbsJobGroup.jobs
                                if x.get('failedOnCreation', False)]
            self.generateCreateFailedReports(createFailedJobs)
            self.changeState.propagate(createFailedJobs, 'createfailed',
                                       'created')
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled exception while calling changeState.\n"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            logging.debug(
                "Error while using changeState on the following jobs: %s\n" %
                wmbsJobGroup.jobs)

        logging.info("JobCreator has finished creating jobGroup %i.\n" \
                     % (wmbsJobGroup.id))
        return

    def generateCreateFailedReports(self, createFailedJobs):
        """
        _generateCreateFailedReports_

        Create and store a FWJR for the jobs that failed on creation,
        recording meaningful information about what happened to them
        """
        if not createFailedJobs:
            return

        fjrsToSave = []
        for failedJob in createFailedJobs:
            report = Report()
            defaultMsg = "There is a condition which assures that this job will fail if it's submitted"
            report.addError("CreationFailure", 99305, "CreationFailure",
                            failedJob.get("failedReason", defaultMsg))
            jobCache = failedJob.getCache()
            try:
                fjrPath = os.path.join(jobCache, "Report.0.pkl")
                report.save(fjrPath)
                fjrsToSave.append({
                    "jobid": failedJob["id"],
                    "fwjrpath": fjrPath
                })
                failedJob["fwjr"] = report
            except Exception:
                logging.error(
                    "Something went wrong while saving the report for job %s"
                    % failedJob["id"])

        myThread = threading.currentThread()
        self.setFWJRPath.execute(binds=fjrsToSave,
                                 conn=myThread.transaction.conn,
                                 transaction=True)

        return
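
The essential bookkeeping in generateCreateFailedReports() is: one failure record per job, saved into the job's cache directory, with the (jobid, fwjrpath) pairs collected for a bulk database update. A stand-alone sketch using JSON in place of the pickled WMCore Report (writeFailureRecord and the .json file name are illustrative only):

import json
import os

def writeFailureRecord(jobId, cacheDir, reason=None,
                       defaultMsg="Job would fail if it were submitted"):
    """Write one failure record per job and return the (jobid, path) bind."""
    record = {"error": "CreationFailure", "code": 99305,
              "details": reason or defaultMsg}
    path = os.path.join(cacheDir, "Report.0.json")
    with open(path, "w") as handle:
        json.dump(record, handle)
    return {"jobid": jobId, "fwjrpath": path}
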
Example #40
    def testE_FullChain(self):
        """
        _FullChain_

        Full test going through the chain; using polling cycles and everything
        """

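        # NOTE: this test is disabled; the early return below skips the whole chain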
        return

        from WMComponent.JobSubmitter.JobSubmitter   import JobSubmitter
        from WMComponent.JobStatusLite.JobStatusLite import JobStatusLite
        from WMComponent.JobTracker.JobTracker       import JobTracker


        myThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))


        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'

        baAPI  = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site='se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitter(config=config)
        jobTracker = JobTracker(config=config)
        jobStatus = JobStatusLite(config=config)


        jobSubmitter.prepareToStart()
        jobTracker.prepareToStart()
        jobStatus.prepareToStart()

        # What should happen here:
        # 1) The JobSubmitter should submit the jobs
        # 2) Because of the ridiculously short time on pending jobs
        #     the JobStatus poller should mark the jobs as done
        #     and kill them.
        # 3) The JobTracker should realize there are finished jobs
        #
        # So at the end of several polling cycles, the jobs should all
        # be done, but be in the failed status (they timed out)

        time.sleep(20)


        myThread.workerThreadManager.terminateWorkers()


        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state='JobFailed', jobType="Processing")
        self.assertEqual(len(result), nJobs * nSubs)
        return
Example #41
    def testD_PrototypeChain(self):
        """
        _PrototypeChain_

        Prototype the BossAir workflow
        """
        dummymyThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))


        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        dummycacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site='se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config=config)
        jobTracker   = JobTrackerPoller(config=config)
        statusPoller = StatusPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Check WMBS
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        statusPoller.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nSubs * nJobs)


        # Tracker should do nothing
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)


        # Wait for jobs to timeout due to short Pending wait period
        time.sleep(12)


        statusPoller.algorithm()

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Timeout', complete='0')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Jobs should be gone
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)


        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nSubs * nJobs)


        # Because they timed out, they all should have failed
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state='JobFailed', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
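
The cycle above walks jobs through BossAir's status table: submission leaves them 'New', the StatusPoller promotes live jobs to 'Idle', and the short pending timeout flips them to 'Timeout' and marks them complete, after which the tracker fails them in WMBS. A toy state machine capturing that progression (statuses from the test, transitions simplified):

TRANSITIONS = {
    ('New', 'poll'): 'Idle',           # StatusPoller sees the job pending
    ('Idle', 'timeout'): 'Timeout',    # pending too long: killed and completed
}

def advance(status, event):
    """Advance a toy BossAir-style status; unknown events leave it unchanged."""
    return TRANSITIONS.get((status, event), status)

status = advance('New', 'poll')        # -> 'Idle'
status = advance(status, 'timeout')    # -> 'Timeout'
assert status == 'Timeout'
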
Example #42
    def testB_PluginTest(self):
        """
        _PluginTest_


        Now check that these functions work when called through plugins
        instead of directly.

        There are only three plugins.
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location='T3_US_Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # All of the jobs should show up as active run jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Check the bl_runjob table directly, since BossAir is specifically
        # built to keep from loading completed jobs
        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        jobsToRemove = baAPI._buildRunningJobs(wmbsJobs=jobDummies)
        baAPI._deleteJobs(jobs=jobsToRemove)

        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return
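
The 'TestPlugin' above is resolved by name at runtime; loadPlugin() in the next example delegates that to WMFactory, but the same dynamic loading can be sketched with importlib alone. The package path in the usage line and the class-named-after-its-module layout are assumptions:

import importlib

def loadPlugins(pluginNames, pluginPackage, config):
    """Instantiate one class per plugin module, keyed by name."""
    plugins = {}
    for name in pluginNames:
        module = importlib.import_module('%s.%s' % (pluginPackage, name))
        pluginClass = getattr(module, name)   # class is named after its module
        plugins[name] = pluginClass(config)
    return plugins

# e.g. loadPlugins(['TestPlugin'], 'WMCore.BossAir.Plugins', config)
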
Example #43
class BossAirAPI(WMConnectionBase):
    """
    _BossAirAPI_

    The API layer for the BossAir prototype
    """
    def __init__(self, config, insertStates=False):
        """
        __init__

        BossAir should work with the standard config
        structure of WMAgent
        """

        WMConnectionBase.__init__(self, daoPackage="WMCore.BossAir")

        myThread = threading.currentThread()

        self.config = config
        self.plugins = {}
        self.states = []

        self.jobs = []

        self.pluginDir = config.BossAir.pluginDir
        # This is the default state jobs are created in
        self.newState = getattr(config.BossAir, 'newState', 'New')

        # Get any proxy info
        self.checkProxy = getattr(config.BossAir, 'checkProxy', False)
        self.cert = getattr(config.BossAir, 'cert', None)

        self.stateMachine = ChangeState(self.config)

        # Create a factory to load plugins
        self.pluginFactory = WMFactory("plugins", self.pluginDir)

        self.daoFactory = DAOFactory(package="WMCore.BossAir",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.deleteDAO = self.daoFactory(classname="DeleteJobs")
        self.stateDAO = self.daoFactory(classname="NewState")
        self.loadByWMBSDAO = self.daoFactory(classname="LoadByWMBSID")
        self.updateDAO = self.daoFactory(classname="UpdateJobs")
        self.newJobDAO = self.daoFactory(classname="NewJobs")
        self.runningJobDAO = self.daoFactory(classname="LoadRunning")
        self.completeJobDAO = self.daoFactory(classname="LoadComplete")
        self.loadJobsDAO = self.daoFactory(classname="LoadByStatus")
        self.completeDAO = self.daoFactory(classname="CompleteJob")
        self.monitorDAO = self.daoFactory(classname="JobStatusForMonitoring")

        self.states = None
        self.loadPlugin(insertStates)

        return

    def loadPlugin(self, insertStates):
        """
        _loadPlugin_

        Actually load the plugin and init the database
        """

        states = set()

        for name in self.config.BossAir.pluginNames:
            self.plugins[name] = self.pluginFactory.loadObject(
                classname=name, args=self.config)
            for state in self.plugins[name].states:
                states.add(state)

        if self.newState not in states:
            states.add(self.newState)

        if insertStates:
            # Add states only if we're not
            # doing a secondary instantiation
            self.addStates(states=states)

        self.states = states

        return

    def addStates(self, states):
        """
        _addStates_

        Add States to bl_status table. Meant to be done only
        once in an agent lifetime.
        """
        existingTransaction = self.beginTransaction()

        self.stateDAO.execute(states=states,
                              conn=self.getDBConn(),
                              transaction=self.existingTransaction())

        self.commitTransaction(existingTransaction)

        return

    def createNewJobs(self, wmbsJobs):
        """
        _createNewJobs_

        Create new jobs in the BossAir database
        Accepts WMBS Jobs
        """

        existingTransaction = self.beginTransaction()

        jobsToCreate = []

        # First turn wmbsJobs into runJobs
        for wmbsJob in wmbsJobs:
            runJob = RunJob()
            runJob.buildFromJob(job=wmbsJob)
            if runJob.get('status') not in self.states:
                runJob['status'] = self.newState
            jobsToCreate.append(runJob)

        # Next insert them into the database
        self.newJobDAO.execute(jobs=jobsToCreate,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        self.commitTransaction(existingTransaction)

        return

    def _listRunJobs(self, active=True):
        """
        _listRunJobs_

        List runjobs, either active or complete
        """

        if active:
            runJobDicts = self.runningJobDAO.execute(
                conn=self.getDBConn(), transaction=self.existingTransaction())
        else:
            runJobDicts = self.completeJobDAO.execute(
                conn=self.getDBConn(), transaction=self.existingTransaction())

        runJobs = []
        for jDict in runJobDicts:
            rj = RunJob()
            rj.update(jDict)
            runJobs.append(rj)

        return runJobs

    def _loadByStatus(self, status, complete='1'):
        """
        _loadByStatus_

        Load jobs by status
        """

        if status not in self.states:
            msg = "Asked to load by status %s which is not loaded\n" % (status)
            msg += "This indicates that the wrong plugins are loaded\n"
            logging.error(msg)
            raise BossAirException(msg)

        loadJobs = self.loadJobsDAO.execute(
            status=status,
            complete=complete,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        statusJobs = []
        for jDict in loadJobs:
            rj = RunJob()
            rj.update(jDict)
            statusJobs.append(rj)

        return statusJobs

    def _loadByID(self, jobs):
        """
        _loadByID_

        Load by running Job ID
        """
        loadJobsDAO = self.daoFactory(classname="LoadByID")
        loadJobs = loadJobsDAO.execute(jobs=jobs,
                                       conn=self.getDBConn(),
                                       transaction=self.existingTransaction())

        loadedJobs = []
        for jDict in loadJobs:
            rj = RunJob()
            rj.update(jDict)
            loadedJobs.append(rj)

        return loadedJobs

    def _updateJobs(self, jobs):
        """
        _updateJobs_

        Update the job entries in the BossAir database
        """

        if len(jobs) < 1:
            # Nothing to do
            return

        existingTransaction = self.beginTransaction()

        self.updateDAO.execute(jobs=jobs,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        jobsWithLocation = [
            job for job in jobs if job.get('location') is not None
        ]
        if jobsWithLocation:
            self.stateMachine.recordLocationChange(jobsWithLocation)

        self.commitTransaction(existingTransaction)

        return

    def _deleteJobs(self, jobs):
        """
        _deleteJobs_

        Delete the job entries in the BossAir database
        NOTE: only used by unit tests
        """

        if len(jobs) < 1:
            # Nothing to do
            return

        idList = [x['id'] for x in jobs]

        existingTransaction = self.beginTransaction()

        self.deleteDAO.execute(jobs=idList,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        self.commitTransaction(existingTransaction)

        return

    def loadByWMBS(self, wmbsJobs):
        """
        _loadByWMBS_

        Load BossAir info based on wmbs Jobs.
        """

        if len(wmbsJobs) < 1:
            return []

        jobList = self.loadByWMBSDAO.execute(
            jobs=wmbsJobs,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        loadedJobs = []
        for job in jobList:
            rj = RunJob()
            rj.buildFromJob(job)
            loadedJobs.append(rj)

        if len(loadedJobs) != len(wmbsJobs):
            logging.error("Could not load all jobs in BossAir for WMBS input!")
            idList = [x['jobid'] for x in loadedJobs]
            for job in wmbsJobs:
                if job['id'] not in idList:
                    logging.error(
                        "Failed to retrieve wmbs_id %i and WMBS job info: %s",
                        job['id'], job)

        return loadedJobs

    def check(self):
        """
        _check_

        Perform checks of critical components, i.e. proxy validation, etc.
        """

        if self.checkProxy:
            command = 'voms-proxy-info'
            if self.cert is not None and self.cert != '':
                command += ' --file ' + self.cert

            pipe = subprocess.Popen(command.split(),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)
            output, dummyerr = pipe.communicate()

            try:
                output = output.split("timeleft  :")[1].strip()
            except IndexError:
                raise BossAirException("Missing Proxy", output.strip())

            if output == "0:00:00":
                raise BossAirException("Proxy Expired", output.strip())

        return

    def submit(self, jobs, info=None):
        """
        _submit_

        Submit jobs using the plugin

        Requires both plugin name and workflow user from submitter

        Deals internally in RunJob objects, but interfaces
        to the outside with WMBS Job analogs

        Returns (successes, failures)
        """
        self.check()

        successJobs = []
        failureJobs = []

        # TODO: Add plugin and user to input via JobSubmitter
        # IMPORTANT IMPORTANT IMPORTANT

        # Put job into RunJob format
        pluginDict = {}

        for job in jobs:
            rj = RunJob()
            rj.buildFromJob(job=job)
            if not job.get('location', False):
                rj['location'] = job.get('custom', {}).get('location', None)
            plugin = rj['plugin']
            pluginDict.setdefault(plugin, [])
            pluginDict[plugin].append(rj)
            # Can't add to the cache in submit()
            # It's NOT the same bossAir instance
            # self.jobs.append(rj)

        for plugin in pluginDict.keys():
            if plugin not in self.plugins.keys():
                # Then we have a non-existent plugin
                msg = "CRITICAL ERROR: Non-existent plugin!\n"
                msg += "Given a plugin %s that we don't have access to.\n" % (
                    plugin)
                msg += "Ignoring the jobs for this plugin for now"
                logging.error(msg)
                continue
            try:
                pluginInst = self.plugins[plugin]
                jobsToSubmit = pluginDict.get(plugin, [])
                logging.debug("About to submit %i jobs to plugin %s",
                              len(jobsToSubmit), plugin)
                localSuccess, localFailure = pluginInst.submit(
                    jobs=jobsToSubmit, info=info)
                for job in localSuccess:
                    successJobs.append(job.buildWMBSJob())
                for job in localFailure:
                    failureJobs.append(job.buildWMBSJob())
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while submitting jobs to plugin: %s\n" % plugin
                msg += str(ex)
                logging.error(msg)
                logging.debug("Jobs being submitted: %s\n", jobsToSubmit)
                logging.debug("Job info: %s\n", info)
                raise BossAirException(msg)
            finally:
                # make sure we release this memory
                pluginDict.clear()
                del jobsToSubmit[:]

        # Create successful jobs in BossAir
        try:
            logging.debug("About to create %i new jobs in BossAir",
                          len(successJobs))
            self.createNewJobs(wmbsJobs=successJobs)
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled error in creation of %i new jobs.\n" % len(
                successJobs)
            msg += str(ex)
            logging.error(msg)
            logging.debug("Job: %s", successJobs)
            raise BossAirException(msg)

        return successJobs, failureJobs

    def track(self, runJobIDs=None, wmbsIDs=None):
        """
        _track_

        Track all running jobs
        Load job info from the cache (it should be there since we submitted the job)

        OPTIONAL: You can submit a list of jobs to check, based either on wmbsIDs or
         on runjobIDs.  This takes a list of integer IDs.
        """

        jobsToChange = []
        jobsToComplete = []
        jobsToReturn = []
        returnList = []

        jobsToTrack = {}

        runningJobs = self._listRunJobs(active=True)

        if runJobIDs:
            runningJobs = [job for job in runningJobs if job['id'] in runJobIDs]
        if wmbsIDs:
            runningJobs = [job for job in runningJobs if job['jobid'] in wmbsIDs]

        if len(runningJobs) < 1:
            # Then we have no running jobs
            return returnList

        logging.info("About to start building running jobs")

        loadedJobs = self._buildRunningJobsFromRunJobs(runJobs=runningJobs)

        logging.info("About to look for %i loadedJobs.", len(loadedJobs))

        for runningJob in loadedJobs:
            plugin = runningJob['plugin']
            if plugin not in jobsToTrack.keys():
                jobsToTrack[plugin] = []
            jobsToTrack[plugin].append(runningJob)

        for plugin in jobsToTrack.keys():
            if plugin not in self.plugins.keys():
                msg = "Jobs tracking with non-existant plugin %s\n" % (plugin)
                msg += "They were submitted but can't be tracked?\n"
                msg += "That's too strange to continue\n"
                logging.error(msg)
                raise BossAirException(msg)
            try:
                # Then we send them to the plugins
                # Should give us a list of jobs to change and jobs to complete
                pluginInst = self.plugins[plugin]
                localRunning, localChanges, localCompletes = pluginInst.track(
                    jobs=jobsToTrack[plugin])
                jobsToReturn.extend(localRunning)
                jobsToChange.extend(localChanges)
                jobsToComplete.extend(localCompletes)
                logging.info(
                    "Executing/changing/completing %i/%i/%i jobs in plugin %s.",
                    len(localRunning), len(localChanges), len(localCompletes),
                    plugin)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while tracking jobs for plugin %s!\n" % plugin
                msg += str(ex)
                logging.error(msg)
                logging.debug("JobsToTrack: %s", jobsToTrack[plugin])
                raise BossAirException(msg)

        logging.info("About to change %i jobs", len(jobsToChange))
        logging.debug("JobsToChange: %s", jobsToChange)
        logging.info("About to complete %i jobs", len(jobsToComplete))
        logging.debug("JobsToComplete: %s", jobsToComplete)

        self._updateJobs(jobs=jobsToChange)
        self._complete(jobs=jobsToComplete)

        # We should have a globalState variable for changed jobs
        # from the plugin
        # Return that to the calling function
        for rj in jobsToReturn:
            job = rj.buildWMBSJob()
            job['globalState'] = rj['globalState']
            returnList.append(job)

        return returnList

    def _complete(self, jobs):
        """
        _complete_

        Complete jobs using plugin functions
        Requires jobs in RunJob format
        """
        if len(jobs) < 1:
            return

        # We should be insulated from bad plugins by track()
        jobsToComplete = {}

        for job in jobs:
            if job['plugin'] not in jobsToComplete.keys():
                jobsToComplete[job['plugin']] = []
            jobsToComplete[job['plugin']].append(job)

        try:
            for plugin in jobsToComplete.keys():
                self.plugins[plugin].complete(jobsToComplete[plugin])
        except WMException:
            raise
        except Exception as ex:
            msg = "Exception while completing jobs!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("JobsToComplete: %s", jobsToComplete)
            raise BossAirException(msg)
        finally:
            # If the complete code fails, label the jobs as finished anyway
            # We want to avoid cyclic repetition of failed jobs
            # If they don't have a FWJR, the Accountant will catch it.
            self._completeKill(jobs)

        return

    def _completeKill(self, jobs):
        """
        _completeKill_

        Mark jobs killed in BossAir as completed
        Requires jobs in RunJob format
        """
        if len(jobs) < 1:
            return

        idsToComplete = [job['id'] for job in jobs]

        existingTransaction = self.beginTransaction()
        self.completeDAO.execute(jobs=idsToComplete,
                                 conn=self.getDBConn(),
                                 transaction=self.existingTransaction())
        self.commitTransaction(existingTransaction)

        return

    def getComplete(self):
        """
        _getComplete_

        The tracker should call this: It's only
        interested in the jobs that are completed.
        """

        completeJobs = []

        completeRunJobs = self._listRunJobs(active=False)

        for rj in completeRunJobs:
            job = rj.buildWMBSJob()
            completeJobs.append(job)

        return completeJobs

    def kill(self, jobs, workflowName=None, killMsg=None, errorCode=71300):
        """
        _kill_

        Kill jobs using plugin functions:

        Only active jobs (status = 1) will be killed. If workflowName is given,
        then kill all its jobs in one shot.
        An optional killMsg can be sent; this will be written into the job FWJR.
        The errorCode will be the one specified and if no killMsg is provided then
        a standard message associated with the exit code will be used.
        If a previous FWJR exists, this error will be appended to it.
        """
        if not jobs:
            return
        jobsToKill = {}

        # Now get a list of which jobs are in the batch system
        # only kill jobs present there
        loadedJobs = self._buildRunningJobs(wmbsJobs=jobs)

        for runningJob in loadedJobs:
            plugin = runningJob['plugin']
            jobsToKill.setdefault(plugin, [])
            jobsToKill[plugin].append(runningJob)

        for plugin in jobsToKill.keys():
            if plugin not in self.plugins.keys():
                msg = "Jobs tracking with non-existant plugin %s\n" % (plugin)
                msg += "They were submitted but can't be tracked?\n"
                msg += "That's too strange to continue\n"
                logging.error(msg)
                raise BossAirException(msg)
            else:
                # Then we send them to the plugins
                try:
                    pluginInst = self.plugins[plugin]
                    if workflowName:
                        # jobs are completed regardless whether the kill succeeded or not
                        self._completeKill(jobs=jobsToKill[plugin])
                        pluginInst.killWorkflowJobs(workflow=workflowName)
                    else:
                        # raise an exception if it fails to kill jobs, such that the same
                        # jobs are retried again in the next cycle
                        pluginInst.kill(jobs=jobsToKill[plugin], raiseEx=True)
                        self._completeKill(jobs=jobsToKill[plugin])

                    # Register the killed jobs
                    for job in jobsToKill[plugin]:
                        if job.get('cache_dir') is None or job.get(
                                'retry_count') is None:
                            continue
                        # Try to save an error report as the jobFWJR
                        if not os.path.isdir(job['cache_dir']):
                            # Then we have a bad cache directory
                            logging.error(
                                "Could not write a kill FWJR due to non-existant cache_dir for job %i\n",
                                job['id'])
                            logging.debug("cache_dir: %s\n", job['cache_dir'])
                            continue
                        reportName = os.path.join(
                            job['cache_dir'],
                            'Report.%i.pkl' % job['retry_count'])
                        errorReport = Report()
                        if os.path.exists(reportName) and os.path.getsize(
                                reportName) > 0:
                            # Then there's already a report there.  Add messages
                            errorReport.load(reportName)
                        # Build a better job message
                        if killMsg:
                            reportedMsg = killMsg
                        else:
                            reportedMsg = WM_JOB_ERROR_CODES[errorCode]
                            reportedMsg += '\n Job last known status was: %s' % job.get(
                                'globalState', 'Unknown')
                        errorReport.addError("JobKilled", errorCode,
                                             "JobKilled", reportedMsg)
                        try:
                            errorReport.save(filename=reportName)
                        except IOError as ioe:
                            logging.warning(
                                'Cannot write report %s because of %s',
                                reportName, ioe)
                except RuntimeError:
                    logging.warning(
                        "Plugin failed to remove jobs. It will be retried in the next cycle."
                    )
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug(
                        "Interrupted while killing following jobs: %s\n",
                        jobsToKill[plugin])
                    raise BossAirException(msg)
        return

    def update(self, jobs):
        """
        _update_

        Overwrite the database with whatever you put into
        this function.
        """

        runJobs = self._buildRunningJobs(wmbsJobs=jobs)

        self._updateJobs(jobs=runJobs)

        return

    def monitor(self, commonState=True):
        """
        _monitor_

        Initiate the call to the monitoring DAO
        This should not be called by the standard Submitter/Status/Tracker
        system.  It is meant for outside calling.
        """
        results = self.monitorDAO.execute(
            commonState,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        return results

    def updateJobInformation(self, workflow, task, **kwargs):
        """
        _updateJobInformation_

        Update the information of jobs in a particular workflow and task,
        the data will be updated according the keyword arguments which
        will be interpreted by the individual plugins accordingly.
        """
        for plugin in self.plugins.keys():
            try:
                pluginInst = self.plugins[plugin]
                pluginInst.updateJobInformation(workflow, task, **kwargs)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while calling update method for plugin %s\n" % plugin
                msg += str(ex)
                logging.error(msg)
                raise BossAirException(msg)
        return

    def updateSiteInformation(self, jobs, siteName, excludeSite):
        """
        _updateSiteInformation_

        Modify condor classAd for all Idle jobs for a site if it has gone Down, Draining or Aborted.
        Kill all jobs if the site is the only site for the job.
        """
        jobkill = []
        for plugin in self.plugins.keys():
            try:
                pluginInst = self.plugins[plugin]
                tempjoblist = pluginInst.updateSiteInformation(
                    jobs, siteName, excludeSite)
                if tempjoblist is not None:
                    jobkill.extend(tempjoblist)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while calling update method for plugin %s\n" % plugin
                msg += str(ex)
                logging.error(msg)
                raise BossAirException(msg)
        return jobkill

    def _buildRunningJobsFromRunJobs(self, runJobs):
        """
        _buildRunningJobsFromRunJobs_

        Same as _buildRunningJobs_, but taking runJobs as input
        """
        finalJobs = []

        loadedJobs = self._loadByID(jobs=runJobs)

        for loadJob in loadedJobs:
            runJob = None
            for rj in runJobs:
                if rj['id'] == loadJob['id']:
                    runJob = rj
                    break
            # We should have two instances of the job
            for key in runJob.keys():
                # Fill one from the other
                # runJob, being most recent, should be on top
                if runJob[key] is None:
                    runJob[key] = loadJob.get(key, None)
            finalJobs.append(runJob)

        return finalJobs

    def _buildRunningJobs(self, wmbsJobs):
        """
        _buildRunningJobs_

        Build running jobs by loading information from the database and
        compiling it into a runJob object.  This overwrites any information
        from the database with the info from the WMBS Job
        """
        runJobsLoaded = self.loadByWMBS(wmbsJobs=wmbsJobs)

        return runJobsLoaded
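Putting the pieces together, a typical round trip through this API mirrors
what the tests above do. A hedged usage sketch, assuming a WMAgent-style
config and WMBS jobs that already carry 'plugin' and 'owner'; the variable
names are illustrative.

baAPI = BossAirAPI(config=config, insertStates=True)

# Dispatches per plugin; returns WMBS-style job lists
successes, failures = baAPI.submit(jobs=wmbsJobs)

# Poll the plugins; each returned job carries a 'globalState'
trackedJobs = baAPI.track()

# Harvest whatever the plugins marked complete
finishedJobs = baAPI.getComplete()

# Optionally kill stale work, writing a FWJR error for each job
baAPI.kill(jobs=staleJobs, killMsg="Killed by operator")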
Example #44
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes      = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime    = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR       = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes      = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs    = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad     = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        return

    def setup(self, parameters = None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [ job for job in jobList if job['type'] not in ['LogCollect','Cleanup'] ]

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed" % (len(jobList), state))
        retrydoneJobs = []
        cooloffJobs = []
        passJobs    = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                logging.debug("JobInfo: %s" % job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary = True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary = True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs" % len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs and sort them into those that
        should go through cooloff, those that can be retried immediately
        without cooloff, and those that must be exhausted.
        Returns a triplet of (cooloff, passed, exhausted) jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []
        for job in jobList:
            report     = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff." % job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff." % (job['id'], reportPath))
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i" % job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.exitCodes]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs" % len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()
        
        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures" % len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Pull failed and retry-done jobs from the database in manageable
        chunks and dispatch them to the retry and exhaustion handlers
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state = "%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed" % (len(idList), state))
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state = 'retrydone')
        logging.info("Found %d jobs done with all retries" % len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID = binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.
        Include the full metadata.
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID = binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def algorithm(self, parameters = None):
        """
        Performs the handleErrors method, looking for each type of failure
        and dealing with it as desired.
        """
        logging.debug("Running error handling algorithm")
        try:
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            msg = "Caught unexpected exception in ErrorHandler\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.exception(msg)
            raise ErrorHandlerException(msg)
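For reference, a minimal configuration satisfying the constructor above
might look like the sketch below. The values are illustrative; only
maxRetries (with a 'default' key) and the ACDC section are strictly
required, since everything else falls back to a getattr default.

# Hedged sketch of the attributes ErrorHandlerPoller reads from config
config.component_('ErrorHandler')
config.ErrorHandler.maxRetries = {'default': 3, 'Merge': 5}  # 'default' is mandatory
config.ErrorHandler.maxProcessSize = 250     # jobs handled per chunk
config.ErrorHandler.failureExitCodes = []    # exit codes that exhaust a job at once
config.ErrorHandler.maxFailTime = 32 * 3600  # seconds on the node before exhaustion
config.ErrorHandler.readFWJR = True          # inspect FWJRs for exit codes
config.ErrorHandler.passExitCodes = []       # exit codes retried without cooloff

config.section_('ACDC')
config.ACDC.couchurl = 'http://localhost:5984'  # illustrative
config.ACDC.database = 'acdcserver'             # illustrative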
Example #45
    def testMemoryProfile(self):
        """
        _testMemoryProfile_

        Creates 20k jobs and keeps refreshing the cache and submitting
        them between component cycles.

        Example using the memory_profiler library; unfortunately, the
        source code has to be updated with decorators.
        NOTE: never run this on Jenkins.
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        # myResourceControl = ResourceControl(config)

        nSubs = 20
        nJobs = 100

        sites = ['T2_US_Florida', 'T2_RU_INR', 'T3_CO_Uniandes', 'T1_US_FNAL']
        allSites = CRIC().PSNtoPNNMap('*')

        for site in allSites:
            self.setResourceThresholds(site, pendingSlots=20000, runningSlots=999999, tasks=['Processing', 'Merge'],
                                       Processing={'pendingSlots': 10000, 'runningSlots': 999999},
                                       Merge={'pendingSlots': 10000, 'runningSlots': 999999, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                             task=workload.getTask("ReReco"),
                             workloadSpec=self.workloadSpecPath,
                             site=[x for x in sites], changeState=changeState)

        # Actually run it
        jobSubmitter.algorithm()  # cycle 1

        self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                             task=workload.getTask("ReReco"),
                             workloadSpec=self.workloadSpecPath,
                             site=[x for x in sites], changeState=changeState)
        # myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
        jobSubmitter.algorithm()  # cycle 2

        self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                             task=workload.getTask("ReReco"),
                             workloadSpec=self.workloadSpecPath,
                             site=[x for x in sites], changeState=changeState)
        # myResourceControl.changeSiteState('T2_RU_INR', 'Draining')
        jobSubmitter.algorithm()  # cycle 3

        self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10,
                             task=workload.getTask("ReReco"),
                             workloadSpec=self.workloadSpecPath,
                             site=[x for x in sites], changeState=changeState)
        # myResourceControl.changeSiteState('T3_CO_Uniandes', 'Draining')
        jobSubmitter.algorithm()  # cycle 4

        # myResourceControl.changeSiteState('T2_RU_INR', 'Normal')
        jobSubmitter.algorithm()  # cycle 5

        # myResourceControl.changeSiteState('T2_US_Florida', 'Normal')
        jobSubmitter.algorithm()  # cycle 6

        # myResourceControl.changeSiteState('T2_RU_INR', 'Normal')
        jobSubmitter.algorithm()  # cycle 7

        # myResourceControl.changeSiteState('T3_CO_Uniandes', 'Normal')
        jobSubmitter.algorithm()  # cycle 8
        jobSubmitter.algorithm()  # cycle 9, nothing to submit

        return
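As the docstring says, memory_profiler only reports on code that has been
decorated, so the source under study has to be edited temporarily. A sketch
of the decoration meant here, assuming memory_profiler is installed;
oneCycle is just an illustrative wrapper, and in practice the decorator
would go directly onto JobSubmitterPoller.algorithm in the WMCore tree.

from memory_profiler import profile

@profile
def oneCycle(jobSubmitter):
    # Prints a line-by-line memory report each time this body runs
    jobSubmitter.algorithm()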
Example #46
    def testI_MultipleJobTypes(self):
        """
        _testI_MultipleJobTypes_

        Check that we can configure different retry algorithms for different
        job types, including a default for unspecified types.
        Also check that two job types can share the same retry algorithm
        but with different parameters.
        """

        # Let's create 4 job groups
        processingJobGroup = self.createTestJobGroup(nJobs=10, retryOnce=True)
        productionJobGroup = self.createTestJobGroup(nJobs=15,
                                                     subType="Production",
                                                     retryOnce=True)
        mergeJobGroup = self.createTestJobGroup(nJobs=20,
                                                subType="Merge",
                                                retryOnce=True)
        skimJobGroup = self.createTestJobGroup(nJobs=5,
                                               subType="Skim",
                                               retryOnce=True)

        # Set an adequate config
        # Processing jobs get the PauseAlgo with pauseCount 4
        # Production jobs get the ExponentialAlgo
        # Merge jobs get the PauseAlgo but with pauseCount 2 which is the default
        # Skim jobs are not configured, so they get the default SquaredAlgo
        config = self.getConfig()
        config.RetryManager.plugins = {
            'Processing': 'PauseAlgo',
            'Production': 'ExponentialAlgo',
            'Merge': 'PauseAlgo',
            'default': 'SquaredAlgo'
        }
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 4
        config.RetryManager.PauseAlgo.section_("default")
        config.RetryManager.PauseAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }
        config.RetryManager.PauseAlgo.default.pauseCount = 2
        config.RetryManager.section_("ExponentialAlgo")
        config.RetryManager.ExponentialAlgo.section_("Production")
        config.RetryManager.ExponentialAlgo.Production.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.ExponentialAlgo.section_("default")
        config.RetryManager.ExponentialAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }
        config.RetryManager.section_("SquaredAlgo")
        config.RetryManager.SquaredAlgo.section_("Skim")
        config.RetryManager.SquaredAlgo.Skim.coolOffTime = {
            'create': 30,
            'submit': 30,
            'job': 30
        }
        config.RetryManager.SquaredAlgo.section_("default")
        config.RetryManager.SquaredAlgo.default.coolOffTime = {
            'create': 60,
            'submit': 60,
            'job': 60
        }

        # Start the state changer and RetryManager
        changer = ChangeState(config)
        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        # Create the jobs for the first time
        changer.propagate(processingJobGroup.jobs, 'created', 'new')

        # Let's start with the processing jobs and the pauseAlgo
        for count in range(1, 5):
            # Fail the jobs
            changer.propagate(processingJobGroup.jobs, 'executing', 'created')
            changer.propagate(processingJobGroup.jobs, 'jobfailed',
                              'executing')
            changer.propagate(processingJobGroup.jobs, 'jobcooloff',
                              'jobfailed')

            # Check that the cooloff time is strictly enforced
            # First a job time just below the cooloff time
            for job in processingJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(processingJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")

            # Now above the cooloff time
            for job in processingJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            # Make sure the jobs get created again or go to paused
            if count < 4:
                idList = self.getJobs.execute(state='created')
            else:
                idList = self.getJobs.execute(state='jobpaused')
            self.assertEqual(len(idList), len(processingJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Unpause them so they don't interfere with subsequent tests
        changer.propagate(processingJobGroup.jobs, 'created', 'jobpaused')
        changer.propagate(processingJobGroup.jobs, 'executing', 'created')

        # Now the production jobs and the exponential algo
        changer.propagate(productionJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(productionJobGroup.jobs, 'executing', 'created')
            changer.propagate(productionJobGroup.jobs, 'jobfailed',
                              'executing')
            changer.propagate(productionJobGroup.jobs, 'jobcooloff',
                              'jobfailed')

            for job in productionJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        pow(30, count) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(productionJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")
            for job in productionJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        pow(30, count) - 5)
            testRetryManager.algorithm(None)

            idList = self.getJobs.execute(state='created')
            self.assertEqual(len(idList), len(productionJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Send them to executing
        changer.propagate(productionJobGroup.jobs, 'executing', 'created')

        # Now the merge jobs and the paused algo with different parameters
        changer.propagate(mergeJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(mergeJobGroup.jobs, 'executing', 'created')
            changer.propagate(mergeJobGroup.jobs, 'jobfailed', 'executing')
            changer.propagate(mergeJobGroup.jobs, 'jobcooloff', 'jobfailed')

            for job in mergeJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(mergeJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")

            for job in mergeJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        60 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            if count < 2:
                idList = self.getJobs.execute(state='created')
            else:
                idList = self.getJobs.execute(state='jobpaused')
            self.assertEqual(len(idList), len(mergeJobGroup.jobs),
                             "Jobs didn't change state correctly")

        # Send them to executing
        changer.propagate(mergeJobGroup.jobs, 'created', 'jobpaused')
        changer.propagate(mergeJobGroup.jobs, 'executing', 'created')

        # Now the skim jobs and the squared algo
        changer.propagate(skimJobGroup.jobs, 'created', 'new')

        for count in range(1, 3):
            changer.propagate(skimJobGroup.jobs, 'executing', 'created')
            changer.propagate(skimJobGroup.jobs, 'jobfailed', 'executing')
            changer.propagate(skimJobGroup.jobs, 'jobcooloff', 'jobfailed')

            for job in skimJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) + 5)
            testRetryManager.algorithm(None)
            idList = self.getJobs.execute(state='JobCoolOff')
            self.assertEqual(
                len(idList), len(skimJobGroup.jobs),
                "Jobs went into cooloff without the proper timing")
            for job in skimJobGroup.jobs:
                self.setJobTime.execute(jobID=job["id"],
                                        stateTime=int(time.time()) -
                                        30 * pow(count, 2) - 5)
            testRetryManager.algorithm(None)

            idList = self.getJobs.execute(state='created')
            self.assertEqual(len(idList), len(skimJobGroup.jobs),
                             "Jobs didn't change state correctly")
Example #47
    def testC_prioritization(self):
        """
        _testC_prioritization_

        Check that jobs are prioritized by job type and by oldest workflow
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=10, runningSlots=10000, tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 50, 'runningSlots': 10000},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10000, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            name='OldestWorkflow')
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site,
                                                 taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        # Merge goes first
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        # Create a newer processing workflow, then add some new jobs to the old workflow

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            name='OldestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            name='NewestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Move pending jobs to running

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for idx in range(2):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')

            # Run again on created workflows
            jobSubmitter.algorithm()

            result = getJobsAction.execute(state='Created', jobType="Merge")
            self.assertEqual(len(result), 0)
            result = getJobsAction.execute(state='Executing', jobType="Merge")
            self.assertEqual(len(result), 10)
            result = getJobsAction.execute(state='Created', jobType="Processing")
            self.assertEqual(len(result), 30 - (idx + 1) * 10)
            result = getJobsAction.execute(state='Executing', jobType="Processing")
            self.assertEqual(len(result), (idx + 1) * 10)

            # Check that older workflow goes first even with newer jobs
            getWorkflowAction = self.daoFactory(classname="Jobs.GetWorkflowTask")
            workflows = getWorkflowAction.execute(result)
            for workflow in workflows:
                self.assertEqual(workflow['name'], 'OldestWorkflow')

        return
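The assertion pattern above follows directly from the thresholds: the site
has 10 pending slots, Merge carries priority 5, so each cycle drains Merge
first and then fills the remainder with the oldest Processing work. A
compact restatement of the bookkeeping the assertions encode, using the
test's own numbers.

pendingSlots = 10   # site pending threshold from setResourceThresholds
created = 30        # Processing jobs waiting once the extra groups exist
executing = 0
for cycle in (1, 2):
    batch = min(pendingSlots, created)  # previous batch was moved to Running
    created -= batch
    executing += batch
    assert (executing, created) == (cycle * 10, 30 - cycle * 10)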
Example #48
    def testH_PauseAlgo(self):
        """
        _testH_PauseAlgo_

        Test the pause algorithm. Note that given pauseCount = n, the job
        first runs n + 1 times before being paused; after that it is
        paused again every n retries.
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        # adding a 2nd job group
        testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'PauseAlgo'}
        config.RetryManager.section_("PauseAlgo")
        config.RetryManager.PauseAlgo.section_("Processing")
        config.RetryManager.PauseAlgo.Processing.coolOffTime = {
            'create': 20,
            'submit': 20,
            'job': 20
        }
        config.RetryManager.PauseAlgo.Processing.pauseCount = 2
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'jobcooloff')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        report = Report()

        # Making sure that jobs are not created ahead of time
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 15)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 25)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        # Make sure that no change happens before timeout
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 75)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin pauses them
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # Emulating ops retrying the job
        changer.propagate(testJobGroup.jobs, 'created', 'jobpaused')

        # Making sure it did the right thing
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='JobCoolOff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be retried
        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 185)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        # Fail them out again
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 315)
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 325)

        # Make sure that the plugin allowed them to go back to created state
        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList), self.nJobs)

        # a configurable retry count per job type {jobExitCodeA: pauseCountB}
        config.RetryManager.PauseAlgo.Processing.retryErrorCodes = {
            8020: 1,
            12345: 1,
            5555: 2
        }

        testRetryManager2 = RetryManagerPoller(config)
        testRetryManager2.algorithm()

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        report.load(fwjrPath)
        for job in testJobGroup2.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))

        # fail the jobs
        changer.propagate(testJobGroup2.jobs, 'created', 'new')
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        # Giving time so they can be paused
        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin sent those jobs to the next state:
        testRetryManager2.algorithm()
        # job exit code is 8020, so it is supposed to be retried one time.
        # Meaning, that here we should have 10 jobs (from the first part of the test) in jobpaused
        # and 10 jobs in created state

        idList = self.getJobs.execute(state='created')
        self.assertEqual(len(idList), self.nJobs)

        idList2 = self.getJobs.execute(state='jobpaused')
        self.assertEqual(len(idList2), self.nJobs)

        # save a second job report - with a retry count = 1
        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            j['retry_count'] = 1
            self.assertEqual(j['retry_count'], 1)
            report.save(
                os.path.join(j['cache_dir'],
                             "Report.%i.pkl" % j['retry_count']))

        # Fail them out again
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 175)

        # not sure if this check is needed:
        idList = self.getJobs.execute(state='jobcooloff')
        self.assertEqual(len(idList), self.nJobs)

        # Giving time so they can be paused
        for job in testJobGroup2.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 85)

        # Make sure that the plugin sent those jobs to paused state:
        testRetryManager2.algorithm(None)
        idList = self.getJobs.execute(state='jobpaused')
        # And again, in total, there should be 10+10=20 jobs in jobpaused
        self.assertEqual(len(idList), self.nJobs * 2)

        return
Example #49
0
    def testC_prioTest(self):
        """
        _testC_prioTest_

        Test whether the correct job type, workflow and task id priorities
        are respected in the DAO
        """
        workload1 = self.createTestWorkload(name='testWorkload1')
        workload2 = self.createTestWorkload(name='testWorkload2')
        workload3 = self.createTestWorkload(name='testWorkload3')
        workload4 = self.createTestWorkload(name='testWorkload4')

        config = self.getConfig()
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")

        site = "T1_US_FNAL"
        self.setResourceThresholds(site, pendingSlots=1000, runningSlots=1000,
                                   tasks=['Processing', 'Merge', 'Production', 'Harvesting', 'LogCollect'],
                                   Processing={'pendingSlots': 1000, 'runningSlots': 1000},
                                   Merge={'pendingSlots': 1000, 'runningSlots': 10000},
                                   Production={'pendingSlots': 1000, 'runningSlots': 1000},
                                   Harvesting={'pendingSlots': 1000, 'runningSlots': 1000},
                                   LogCollect={'pendingSlots': 1000, 'runningSlots': 1000})

        nSubs = 1
        nJobs = 5
        jobGroupList = []
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload1.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site,
                                        name='OldestWorkflow')  # task_id = 1
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload1.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site,
                                        taskType='Merge')  # task_id = 2
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload1.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site,
                                        taskType='LogCollect')  # task_id = 3
        jobGroupList.extend(jobGroup)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # retrieve all 15 jobs created so far
        result = getJobsAction.execute(limitRows=100)
        self.assertItemsEqual([int(j['task_prio']) for j in result],
                              [4] * 5 + [2] * 5 + [0] * 5)
        self.assertItemsEqual([int(j['wf_priority']) for j in result],
                              [1] * 15)
        self.assertItemsEqual([int(j['task_id']) for j in result],
                              [2] * 5 + [3] * 5 + [1] * 5)

        # now retrieve only 6 jobs (5 Merge and 1 LogCollect), wf prio=1
        result = getJobsAction.execute(limitRows=6)
        self.assertItemsEqual([int(j['task_prio']) for j in result], [4] * 5 + [2] * 1)

        jobGroupList = []
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=2,
                                        task=workload2.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Merge')  # task_id = 4
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=3,
                                        task=workload3.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Processing')  # task_id = 5
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=3,
                                        task=workload3.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='LogCollect')  # task_id = 6
        jobGroupList.extend(jobGroup)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # retrieve all 30 jobs created so far
        result = getJobsAction.execute(limitRows=100)
        self.assertItemsEqual([int(j['task_prio']) for j in result],
                              [4] * 10 + [2] * 10 + [0] * 10)
        # merge prio 2, merge prio 1, logCol prio 3, logCol prio 1, proc prio 3, proc prio 1
        self.assertItemsEqual([int(j['wf_priority']) for j in result],
                              [2] * 5 + [1] * 5 + [3] * 5 + [1] * 5 + [3] * 5 + [1] * 5)
        # merge id 4, merge id 2, logCol id 6, logCol id 3, proc id 5, proc id 1
        self.assertItemsEqual([int(j['task_id']) for j in result],
                              [4] * 5 + [2] * 5 + [6] * 5 + [3] * 5 + [5] * 5 + [1] * 5)

        jobGroupList = []
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=2,
                                        task=workload4.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Merge')  # task_id = 7
        jobGroupList.extend(jobGroup)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # retrieve all 15 Merge jobs created so far
        result = getJobsAction.execute(limitRows=15)
        self.assertItemsEqual([int(j['task_prio']) for j in result], [4] * 15)
        # merge prio 2, merge prio 2, merge prio 1
        self.assertItemsEqual([int(j['wf_priority']) for j in result], [2] * 10 + [1] * 5)
        # merge id 7, merge id 4, merge id 2
        self.assertItemsEqual([int(j['task_id']) for j in result],
                              [7] * 5 + [4] * 5 + [2] * 5)
Example #50
0
    def testG_ProcessingAlgo(self):
        """
        _ProcessingAlgo_

        Test for the ProcessingAlgo Prototype
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'ProcessingAlgo'}
        config.RetryManager.section_("ProcessingAlgo")
        config.RetryManager.ProcessingAlgo.section_("default")
        config.RetryManager.ProcessingAlgo.default.coolOffTime = {
            'create': 10,
            'submit': 10,
            'job': 10
        }
        changer = ChangeState(config)
        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")
        report = Report()
        report.load(fwjrPath)
        for job in testJobGroup.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 1)
            report.save(
                os.path.join(j['cache_dir'],
                             "Report.%i.pkl" % j['retry_count']))

        config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = [8020]
        testRetryManager2 = RetryManagerPoller(config)
        testRetryManager2.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 5)

        # Now test timeout
        testJobGroup2 = self.createTestJobGroup(nJobs=self.nJobs)

        # Cycle jobs
        for job in testJobGroup2.jobs:
            job['fwjr'] = report
            job['retry_count'] = 0
            report.save(
                os.path.join(job['cache_dir'],
                             "Report.%i.pkl" % job['retry_count']))
        changer.propagate(testJobGroup2.jobs, 'created', 'new')
        changer.propagate(testJobGroup2.jobs, 'executing', 'created')
        changer.propagate(testJobGroup2.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup2.jobs, 'jobfailed', 'complete')
        changer.propagate(testJobGroup2.jobs, 'jobcooloff', 'jobfailed')

        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 0)

        config.RetryManager.ProcessingAlgo.default.OneMoreErrorCodes = []
        config.RetryManager.ProcessingAlgo.default.MaxRunTime = 1
        testRetryManager3 = RetryManagerPoller(config)
        testRetryManager3.algorithm()

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs * 2)

        for job in testJobGroup2.jobs:
            j = Job(id=job['id'])
            j.load()
            self.assertEqual(j['retry_count'], 5)

        return
Example #51
0
    def testA_BasicTest(self):
        """
        Use the MockPlugin to create a simple test.
        Check that all the jobs were "submitted";
        we don't care about thresholds here.
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 20
        site = "T2_US_UCSD"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=100, tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 50, 'runningSlots': 100},
                                   Merge={'pendingSlots': 50, 'runningSlots': 100})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Check that jobs are in the right state
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T2_US_UCSD']])

        # Run another cycle, it shouldn't submit anything. There isn't anything to submit
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        nSubs = 1
        nJobs = 10

        # Submit another 10 jobs
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Check that the jobs are available for submission and run another cycle
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)
        jobSubmitter.algorithm()

        # Check that the last 10 jobs were submitted as well.
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example #52
0
    def testF_LinearAlgo(self):
        """
        _testLinearAlgo_

        Test the linear algorithm to make sure it loads and works
        """

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs)

        config = self.getConfig()
        config.RetryManager.plugins = {'Processing': 'LinearAlgo'}
        config.RetryManager.section_("LinearAlgo")
        config.RetryManager.LinearAlgo.section_("Processing")
        config.RetryManager.LinearAlgo.Processing.coolOffTime = {
            'create': 10,
            'submit': 10,
            'job': 10
        }
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')
        changer.propagate(testJobGroup.jobs, 'created', 'submitcooloff')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')
        changer.propagate(testJobGroup.jobs, 'submitcooloff', 'submitfailed')

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        testRetryManager = RetryManagerPoller(config)
        testRetryManager.setup(None)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 5)

        testRetryManager.algorithm(None)
        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)

        for job in testJobGroup.jobs:
            self.setJobTime.execute(jobID=job["id"],
                                    stateTime=int(time.time()) - 12)

        testRetryManager.algorithm(None)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        return
Example #53
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        # DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)
        self.config = config

        # Libraries
        self.resourceControl = ResourceControl()

        self.changeState = ChangeState(self.config)
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))

        # BossAir
        self.bossAir = BossAirAPI(config=self.config)

        # Additions for caching-based JobSubmitter
        self.workflowTimestamps = {}
        self.workflowPrios = {}
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.siteKeys = {}
        self.locationDict = {}
        self.cmsNames = {}
        self.drainSites = set()
        self.abortSites = set()
        self.sortedSites = []
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobSubmitter")

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            try:
                logging.debug("PackageDir: %s" % self.packageDir)
                logging.debug("Config: %s" % config)
            except Exception:
                pass
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a sandbox directory, figure out which PackageCollection
        the next job package should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch directory for
        the job.  Packages are only written out to disk once they hold
        packageSize jobs (500 by default).  The flushJobPackages() method must
        be called after all jobs have been added to the cache and before they
        are actually submitted, to make sure all the job packages have been
        written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir
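
    # Note: addJobsToPackage() is called once per loaded job inside
    # refreshCache(); flushJobPackages() then runs at the end of the cycle so
    # partially filled packages also reach disk before submission.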

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the job data cache is a tuple whose leading items are:
          - WMBS Job ID
          - Retry count
          - Batch directory
          - Path to sandbox
          - Path to cache directory
        followed by further job metadata (owner, possible locations, resource
        estimates, etc.).
        """
        badJobs = dict([(x, []) for x in range(61101, 61105)])
        dbJobs = set()

        logging.info("Refreshing priority cache...")
        workflows = self.listWorkflows.execute()
        workflows = filter(lambda x: x['name'] in self.workflowPrios,
                           workflows)
        for workflow in workflows:
            self.workflowPrios[workflow['name']] = workflow['priority']

        logging.info("Querying WMBS for jobs to be submitted...")
        newJobs = self.listJobsAction.execute()
        logging.info("Found %s new jobs to be submitted." % len(newJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs." %
                             (jobCount, len(newJobs)))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s" %
                              pickledJobPath)
                badJobs[61103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = cPickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                self.sendAlert(6, msg=msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            siteWhitelist = loadedJob.get("siteWhitelist", [])
            siteBlacklist = loadedJob.get("siteBlacklist", [])
            trustSitelists = loadedJob.get("trustSitelists", False)

            # convert site lists into correct format
            if len(siteWhitelist) > 0:
                whitelist = []
                for cmsName in siteWhitelist:
                    whitelist.extend(self.cmsNames.get(cmsName, []))
                siteWhitelist = whitelist
            if len(siteBlacklist) > 0:
                blacklist = []
                for cmsName in siteBlacklist:
                    blacklist.extend(self.cmsNames.get(cmsName, []))
                siteBlacklist = blacklist

            # figure out possible locations for job
            if trustSitelists:
                possibleLocations = set(siteWhitelist) - set(siteBlacklist)
            else:
                possibleLocations = set()

                # all files in job have same location (in se names)
                rawLocations = loadedJob["input_files"][0]["locations"]

                # transform se names into site names
                for loc in rawLocations:
                    if loc not in self.siteKeys:
                        # Then we have a problem
                        logging.error(
                            'Encountered unknown location %s for job %i' %
                            (loc, jobID))
                        logging.error(
                            'Ignoring for now, but watch out for this')
                    else:
                        for siteName in self.siteKeys[loc]:
                            possibleLocations.add(siteName)

                # filter with site lists
                if len(siteWhitelist) > 0:
                    possibleLocations = possibleLocations & set(siteWhitelist)
                if len(siteBlacklist) > 0:
                    possibleLocations = possibleLocations - set(siteBlacklist)

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                badJobs[61101].append(newJob)
                continue
            else:
                non_abort_sites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if non_abort_sites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = non_abort_sites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[61102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in ('LogCollect', 'Merge', 'Cleanup',
                                      'Harvesting'):
                non_draining_sites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if non_draining_sites:  # if >1 viable non-draining site remove draining ones
                    possibleLocations = non_draining_sites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[61104].append(newJob)
                    continue

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            for possibleLocation in possibleLocations:
                if possibleLocation not in self.cachedJobs:
                    self.cachedJobs[possibleLocation] = {}
                if newJob["type"] not in self.cachedJobs[possibleLocation]:
                    self.cachedJobs[possibleLocation][newJob["type"]] = {}

                locTypeCache = self.cachedJobs[possibleLocation][
                    newJob["type"]]
                workflowName = newJob['workflow']
                timestamp = newJob['timestamp']
                prio = newJob['task_priority']
                if workflowName not in locTypeCache:
                    locTypeCache[workflowName] = set()
                if workflowName not in self.jobDataCache:
                    self.jobDataCache[workflowName] = {}
                if workflowName not in self.workflowTimestamps:
                    self.workflowTimestamps[workflowName] = timestamp
                if workflowName not in self.workflowPrios:
                    self.workflowPrios[workflowName] = prio

                locTypeCache[workflowName].add(jobID)

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Now that we're out of that loop, put the job data in the cache
            jobInfo = (jobID, newJob["retry_count"], batchDir,
                       loadedJob["sandbox"], loadedJob["cache_dir"],
                       loadedJob.get("ownerDN",
                                     None), loadedJob.get("ownerGroup", ''),
                       loadedJob.get("ownerRole",
                                     ''), frozenset(possibleLocations),
                       loadedJob.get("scramArch", None),
                       loadedJob.get("swVersion", None), loadedJob["name"],
                       loadedJob.get("proxyPath",
                                     None), newJob['request_name'],
                       loadedJob.get("estimatedJobTime", None),
                       loadedJob.get("estimatedDiskUsage", None),
                       loadedJob.get("estimatedMemoryUsage",
                                     None), newJob['task_name'],
                       frozenset(potentialLocations),
                       loadedJob.get("numberOfCores", 1), newJob['task_id'])

            self.jobDataCache[workflowName][jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code : %d"
                    % (badJobs[errorCode], errorCode))
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()
        logging.info("Done with refreshCache() loop, pruning killed jobs.")

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self.cachedJobIDs -= jobIDsToPurge

        if len(jobIDsToPurge) == 0:
            return

        for siteName in self.cachedJobs.keys():
            for taskType in self.cachedJobs[siteName].keys():
                for workflow in self.cachedJobs[siteName][taskType].keys():
                    for cachedJobID in list(
                            self.cachedJobs[siteName][taskType][workflow]):
                        if cachedJobID in jobIDsToPurge:
                            self.cachedJobs[siteName][taskType][
                                workflow].remove(cachedJobID)
                            try:
                                del self.jobDataCache[workflow][cachedJobID]
                            except KeyError:
                                # Already gone
                                pass

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Form a default job report for the exitCode and register it in the
        job.  Preserve the report on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in (61102, 61104):
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WMJobErrorCodes[exitCode] +
                    ', '.join(job['possibleLocations']))
            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WMJobErrorCodes[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s"
                    % (job['id'], str(ioer)))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Reformat and cache the submit thresholds.  This stores a dictionary
        keyed by site name; each entry carries the site state, the slot counts
        and the per-task thresholds.  It also refreshes the drain/abort site
        sets and the order in which sites are filled.
        """
        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        newDrainSites = set()
        newAbortSites = set()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            cmsName = rcThresholds[siteName]["cms_name"]
            state = rcThresholds[siteName]["state"]

            if cmsName not in self.cmsNames:
                self.cmsNames[cmsName] = []
            if siteName not in self.cmsNames[cmsName]:
                self.cmsNames[cmsName].append(siteName)
            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            for seName in rcThresholds[siteName]["se_names"]:
                if seName not in self.siteKeys:
                    self.siteKeys[seName] = []
                if siteName not in self.siteKeys[seName]:
                    self.siteKeys[seName].append(siteName)

        # When the list of drain/abort sites changes between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        # TODO: Make it more efficient, only reshuffle locations when this happens
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        # Sort the sites using the following criteria:
        # T1 sites go first, then T2, then T3
        # After that we fill the bigger sites first
        # Python sorting is stable so let's do 2 sort passes, it should be fast
        # Assume that all CMS names start with T[0-9]+, which the lexicon guarantees
        self.sortedSites = sorted(
            rcThresholds.keys(),
            key=lambda x: rcThresholds[x]["total_pending_slots"],
            reverse=True)
        self.sortedSites = sorted(
            self.sortedSites, key=lambda x: rcThresholds[x]["cms_name"][0:2])
        logging.debug('Sites will be filled in the following order: %s' %
                      str(self.sortedSites))
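        # e.g. with two T1s and one T2, the fill order would be
        # ['T1_big', 'T1_small', 'T2_any']: tier first, then descending
        # total_pending_slots within the tier (illustrative names)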

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull jobs out of the job cache
        as we discover open slots.  This returns a dictionary keyed by job
        package path; each value is a list of job dictionaries carrying, among
        other fields:
          - WMBS Job ID
          - Retry count
          - Path to the cache directory
          - The assigned site plus the possible and potential sites
          - Task type, priority and resource estimates
        """
        jobsToSubmit = {}
        jobsToPrune = {}
        jobsCount = 0
        exitLoop = False

        for siteName in self.sortedSites:
            if exitLoop:
                break

            totalPending = None
            if siteName not in self.cachedJobs:
                logging.debug("No jobs for site %s" % siteName)
                continue
            logging.debug("Have site %s" % siteName)
            try:
                totalPendingSlots = self.currentRcThresholds[siteName][
                    "total_pending_slots"]
                totalRunningSlots = self.currentRcThresholds[siteName][
                    "total_running_slots"]
                totalRunning = self.currentRcThresholds[siteName][
                    "total_running_jobs"]
                totalPending = self.currentRcThresholds[siteName][
                    "total_pending_jobs"]
                state = self.currentRcThresholds[siteName]["state"]
            except KeyError as ex:
                msg = "Had invalid site info %s\n" % siteName['thresholds']
                msg += str(ex)
                logging.error(msg)
                continue
            for threshold in self.currentRcThresholds[siteName].get(
                    'thresholds', []):
                if exitLoop:
                    break
                try:
                    # Pull basic info for the threshold
                    taskType = threshold["task_type"]
                    maxSlots = threshold["max_slots"]
                    taskPendingSlots = threshold["pending_slots"]
                    taskRunning = threshold["task_running_jobs"]
                    taskPending = threshold["task_pending_jobs"]
                    taskPriority = threshold["priority"]
                except KeyError as ex:
                    msg = "Had invalid threshold %s\n" % threshold
                    msg += str(ex)
                    logging.error(msg)
                    continue

                # If the site is down, do not submit anything to it
                if state == 'Down':
                    continue

                # If the number of running jobs exceeds the running slots at
                # the site then get out
                if totalRunningSlots >= 0 and totalRunning >= totalRunningSlots:
                    continue

                # If the task is running more than allowed then get out
                if maxSlots >= 0 and taskRunning >= maxSlots:
                    continue

                # Ignore this threshold if we've cleaned out the site
                if siteName not in self.cachedJobs:
                    continue

                # Ignore this threshold if we have no jobs
                # for it
                if taskType not in self.cachedJobs[siteName]:
                    continue

                taskCache = self.cachedJobs[siteName][taskType]

                # Calculate number of jobs we need
                nJobsRequired = min(totalPendingSlots - totalPending,
                                    taskPendingSlots - taskPending)
                breakLoop = False
                logging.debug("nJobsRequired for task %s: %i" %
                              (taskType, nJobsRequired))

                while nJobsRequired > 0:
                    # Do this until we have all the jobs for this threshold

                    # Pull a job out of the cache for the task/site.  Verify that we
                    # haven't already used this job in this polling cycle.
                    cachedJob = None
                    cachedJobWorkflow = None

                    workflows = taskCache.keys()

                    # Sort candidate workflows: higher priority first, then
                    # the older subscription timestamp first
                    def sortingCmp(x, y):
                        if (x not in self.workflowTimestamps) or (
                                y not in self.workflowTimestamps):
                            return -1
                        if (x not in self.workflowPrios) or (
                                y not in self.workflowPrios):
                            return -1
                        if self.workflowPrios[x] > self.workflowPrios[y]:
                            return -1
                        elif self.workflowPrios[x] == self.workflowPrios[y]:
                            return cmp(self.workflowTimestamps[x],
                                       self.workflowTimestamps[y])
                        else:
                            return 1

                    workflows.sort(cmp=sortingCmp)

                    for workflow in workflows:
                        # Run a while loop until you get a job
                        while len(taskCache[workflow]) > 0:
                            cachedJobID = taskCache[workflow].pop()
                            try:
                                cachedJob = self.jobDataCache[workflow].get(
                                    cachedJobID, None)
                                del self.jobDataCache[workflow][cachedJobID]
                            except KeyError:
                                # Job data already pruned from the cache
                                pass

                            if cachedJobID not in jobsToPrune.get(
                                    workflow, set()):
                                cachedJobWorkflow = workflow
                                break
                            else:
                                cachedJob = None

                        # Remove the entry in the cache for the workflow if it is empty.
                        if len(self.cachedJobs[siteName][taskType]
                               [workflow]) == 0:
                            del self.cachedJobs[siteName][taskType][workflow]
                        if workflow in self.jobDataCache and len(
                                self.jobDataCache[workflow].keys()) == 0:
                            del self.jobDataCache[workflow]

                        if cachedJob:
                            # We found a job, bail out and handle it.
                            break

                    # Check to see if we need to delete this site from the cache
                    if len(self.cachedJobs[siteName][taskType].keys()) == 0:
                        del self.cachedJobs[siteName][taskType]
                        breakLoop = True
                    if len(self.cachedJobs[siteName].keys()) == 0:
                        del self.cachedJobs[siteName]
                        breakLoop = True

                    if not cachedJob:
                        # We didn't find a job, bail out.
                        # This site and task type is done
                        break

                    self.cachedJobIDs.remove(cachedJob[0])

                    if cachedJobWorkflow not in jobsToPrune:
                        jobsToPrune[cachedJobWorkflow] = set()

                    jobsToPrune[cachedJobWorkflow].add(cachedJob[0])

                    # Sort jobs by jobPackage
                    package = cachedJob[2]
                    if package not in jobsToSubmit:
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob[3]

                    possibleSites = cachedJob[8]
                    possibleSiteList = list(possibleSites)
                    fakeAssignedSiteName = random.choice(possibleSiteList)
                    potentialSites = cachedJob[18]

                    # Create a job dictionary object
                    jobDict = {
                        'id': cachedJob[0],
                        'retry_count': cachedJob[1],
                        'custom': {
                            'location': fakeAssignedSiteName
                        },
                        'cache_dir': cachedJob[4],
                        'packageDir': package,
                        'userdn': cachedJob[5],
                        'usergroup': cachedJob[6],
                        'userrole': cachedJob[7],
                        'priority': taskPriority,
                        'taskType': taskType,
                        'possibleSites': possibleSites,
                        'scramArch': cachedJob[9],
                        'swVersion': cachedJob[10],
                        'name': cachedJob[11],
                        'proxyPath': cachedJob[12],
                        'requestName': cachedJob[13],
                        'estimatedJobTime': cachedJob[14],
                        'estimatedDiskUsage': cachedJob[15],
                        'estimatedMemoryUsage': cachedJob[16],
                        'taskPriority': self.workflowPrios[workflow],
                        'taskName': cachedJob[17],
                        'numberOfCores': cachedJob[19],
                        'taskID': cachedJob[20],
                        'potentialSites': potentialSites
                    }

                    # Add to jobsToSubmit
                    jobsToSubmit[package].append(jobDict)
                    jobsCount += 1
                    if jobsCount >= self.maxJobsPerPoll:
                        breakLoop, exitLoop = True, True

                    # Deal with accounting
                    if len(possibleSites) == 1:
                        nJobsRequired -= 1
                    totalPending += 1
                    taskPending += 1

                    if breakLoop:
                        break

        # Remove the jobs that we're going to submit from the cache.
        allWorkflows = set()
        for siteName in self.cachedJobs.keys():
            for taskType in self.cachedJobs[siteName].keys():
                for workflow in self.cachedJobs[siteName][taskType].keys():
                    allWorkflows.add(workflow)
                    if workflow in jobsToPrune.keys():
                        self.cachedJobs[siteName][taskType][
                            workflow] -= jobsToPrune[workflow]

        # Remove workflows from the timestamp dictionary which are not anymore in the cache
        workflowsWithTimestamp = self.workflowTimestamps.keys()
        for workflow in workflowsWithTimestamp:
            if workflow not in allWorkflows:
                del self.workflowTimestamps[workflow]

        workflowsWithPrios = self.workflowPrios.keys()
        for workflow in workflowsWithPrios:
            if workflow not in allWorkflows:
                del self.workflowPrios[workflow]

        logging.info("Have %s packages to submit." % len(jobsToSubmit))
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])

            for job in jobs:
                job['location'], job['plugin'], job[
                    'site_cms_name'] = self.getSiteInfo(
                        job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            # Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d." %
                     (len(successList), len(failList)))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if jobSite not in self.locationDict:
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
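        # The cache above means the Locations DAO is queried at most once per
        # site for the lifetime of the poller; later calls return the cached
        # (ce_name, plugin, cms_name) triple.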
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """

        try:
            myThread = threading.currentThread()
            self.getThresholds()
            self.refreshCache()
            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)

        except WMException:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            self.sendAlert(7, msg=msg)
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Example #55
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs),
                         workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we still need to kill the jobs in the master.
            # The batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are too many jobs, skip the couch and dashboard updates
            # TODO: couch and dashboard need to be updated separately or in parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
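
# A minimal usage sketch for killWorkflow (illustrative, not from the original
# source). It assumes an initialized agent environment: a per-thread database
# connection for the DAOs, a reachable JSM couch instance, and, if batch jobs
# should also be killed, a config with a filled-in BossAir section. The URL,
# database name and workflow name below are placeholders.
def demoKillWorkflow():
    from WMCore.Configuration import Configuration

    config = Configuration()
    config.section_("JobStateMachine")
    config.JobStateMachine.couchurl = "http://localhost:5984"  # placeholder URL
    config.JobStateMachine.couchDBName = "jsm_jobs"            # placeholder name

    # State-machine-only kill: incomplete jobs and their input files are failed.
    killWorkflow("DemoWorkflow", jobCouchConfig=config)

    # With a BossAir config, jobs still 'executing' are also removed from the
    # batch system before being marked killed.
    killWorkflow("DemoWorkflow", jobCouchConfig=config, bossAirConfig=config)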
Example #56
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.exitCodes = getattr(self.config.ErrorHandler, 'failureExitCodes',
                                 [])
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])
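
        # A hedged configuration sketch (values below are invented
        # placeholders): maxRetries may be a plain integer applied to every
        # job type, or a per-type dict that must contain a 'default' key, e.g.
        #   config.ErrorHandler.maxRetries = {'default': 3, 'Merge': 4}
        #   config.ErrorHandler.failureExitCodes = [8020]  # exhaust immediately
        #   config.ErrorHandler.passExitCodes = [70]       # skip cooloff, retry now
        #   config.ErrorHandler.maxFailTime = 32 * 3600    # wall-clock cap in seconds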

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will then be available
        self.initAlerts(compName="ErrorHandler")

        # Some exit codes imply an immediate failure, non-configurable
        self.exitCodes.extend(WMJobPermanentSystemErrors)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed" %
                     (len(jobList), state))
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Sort jobs into cooloff (retries remaining) and retrydone (exhausted)
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'],
                                                 self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                self.sendAlert(4, msg=msg)
                logging.debug("JobInfo: %s" % job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(
                cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs,
                                       'retrydone',
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs,
                                       '%scooloff' % state,
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs" % len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs and determine which can be
        retried after cooloff, which must be retried immediately without
        cooloff, and which are exhausted.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []
        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error(
                    "No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff."
                    % job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error(
                    "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff."
                    % (job['id'], reportPath))
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i" %
                                  job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (
                        job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if any(x in self.exitCodes for x in report.getExitCodes()):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    self.sendAlert(4, msg=msg)
                    exhaustJobs.append(job)
                    continue

                if any(x in self.passCodes for x in report.getExitCodes()):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning(
                    "Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs
Example #57
0
class AccountantWorker(WMConnectionBase):
    """
    Class that actually does the work of parsing FWJRs for the Accountant.
    Run through ProcessPool.
    """
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentAndGrandParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        self.pnn2Psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before the data is put into the Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        self.jobDBName = config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = None
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for a later commit
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        return

    def reset(self):
        """
        _reset_

        Reset all global vars between runs.
        """
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        gc.collect()
        return

    def loadJobReport(self, jobReportPath):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If there is any problem loading or parsing the
        framework job report, return a synthetic error report built by
        createMissingFWKJR instead.
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s", jobReportPath)
            return self.createMissingFWKJR(99999, "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://", "")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s", jobReportPath)
            return self.createMissingFWKJR(
                99999,
                'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s", jobReportPath)
            return self.createMissingFWKJR(
                99998, 'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            msg = "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            return self.createMissingFWKJR(99997, 'Cannot load jobReport')

        if not jobReport.listSteps():
            logging.error("FwkJobReport with no steps: %s", jobReportPath)
            return self.createMissingFWKJR(
                99997, 'jobReport with no steps: %s ' % jobReportPath)

        return jobReport

    def isTaskExistInFWJR(self, jobReport, jobStatus):
        """
        If taskName is not available in the FWJR, then tries to
        recover it getting data from the SQL database.
        """
        if not jobReport.getTaskName():
            logging.warning(
                "Trying to recover a corrupted FWJR for a %s job with job id %s",
                jobStatus, jobReport.getJobID())
            jobInfo = self.getJobTaskNameAction.execute(
                jobId=jobReport.getJobID(),
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

            jobReport.setTaskName(jobInfo['taskName'])
            jobReport.save(jobInfo['fwjr_path'])
            if not jobReport.getTaskName():
                msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % (
                    jobStatus, jobReport.getJobID())
                raise AccountantWorkerException(msg)
            else:
                logging.info(
                    "TaskName '%s' successfully recovered and added to fwjr id %s.",
                    jobReport.getTaskName(), jobReport.getJobID())

        return

    def __call__(self, parameters):
        """
        __call__

        Handle a completed job.  The parameters dictionary will contain the job
        ID and the path to the framework job report.
        """
        returnList = []
        self.reset()

        for job in parameters:
            logging.info("Handling %s", job["fwjr_path"])

            # Load the job and set the ID
            fwkJobReport = self.loadJobReport(job["fwjr_path"])
            fwkJobReport.setJobID(job['id'])

            jobSuccess = self.handleJob(jobID=job["id"],
                                        fwkJobReport=fwkJobReport)

            if self.returnJobReport:
                returnList.append({
                    'id': job["id"],
                    'jobSuccess': jobSuccess,
                    'jobReport': fwkJobReport
                })
            else:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess})

            self.count += 1

        existingTransaction = self.beginTransaction()

        # Now things done at the end of the job
        # Do what we can with WMBS files
        self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds)

        # handle merge files separately since parentage need to set
        # separately to support robust merge
        self.handleWMBSFiles(self.wmbsMergeFilesToBuild,
                             self.parentageBindsForMerge)

        # Create DBSBufferFiles
        self.createFilesInDBSBuffer()

        # Handle filesetAssoc
        if self.filesetAssoc:
            self.bulkAddToFilesetAction.execute(
                binds=self.filesetAssoc,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        # Move successful jobs to successful
        if self.listOfJobsToSave:
            idList = [x['id'] for x in self.listOfJobsToSave]
            outcomeBinds = [{
                'jobid': x['id'],
                'outcome': x['outcome']
            } for x in self.listOfJobsToSave]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.jobCompleteInput.execute(
                id=idList,
                lfnsToSkip=self.jobsWithSkippedFiles,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToSave, "success",
                                        "complete")

        # If we have failed jobs, fail them
        if self.listOfJobsToFail:
            outcomeBinds = [{
                'jobid': x['id'],
                'outcome': x['outcome']
            } for x in self.listOfJobsToFail]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed",
                                        "complete")

        # Arrange WMBS parentage
        if self.parentageBinds:
            self.setParentageByJob.execute(
                binds=self.parentageBinds,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
        if self.parentageBindsForMerge:
            self.setParentageByMergeJob.execute(
                binds=self.parentageBindsForMerge,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        # Straighten out DBS Parentage
        if self.mergedOutputFiles:
            self.handleDBSBufferParentage()

        if self.jobsWithSkippedFiles:
            self.handleSkippedFiles()

        self.commitTransaction(existingTransaction)

        return returnList

    def outputFilesetsForJob(self, outputMap, merged, moduleLabel, datatier):
        """
        _outputFilesetsForJob_

        Determine if the file should be placed in any other fileset.  Note that
        this will not return the JobGroup output fileset as all jobs will have
        their output placed there.
        """
        # output map identifier uses output module + datatier
        moduleLabel += datatier
        if moduleLabel not in outputMap:
            logging.info("Output module label missing from output map.")
            return []

        outputFilesets = []
        for outputFileset in outputMap[moduleLabel]:
            if merged is False and outputFileset["output_fileset"] is not None:
                outputFilesets.append(outputFileset["output_fileset"])
            else:
                if outputFileset["merged_output_fileset"] is not None:
                    outputFilesets.append(
                        outputFileset["merged_output_fileset"])

        return outputFilesets
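
    # Hedged illustration of the output map shape consumed above (the labels
    # and fileset IDs are invented): keys are output module label + datatier,
    # and each entry carries the unmerged and merged fileset IDs.
    #   outputMap = {
    #       'RECOoutputRECO': [{'output_fileset': 12, 'merged_output_fileset': 13}],
    #       'logArchive': [{'output_fileset': 14, 'merged_output_fileset': None}],
    #   }
    #   self.outputFilesetsForJob(outputMap, False, 'RECOoutput', 'RECO')  # -> [12]
    #   self.outputFilesetsForJob(outputMap, True, 'RECOoutput', 'RECO')   # -> [13]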

    def addFileToDBS(self, jobReportFile, task, errorDataset=False):
        """
        _addFileToDBS_

        Add a file that was output from a job to the DBS buffer.
        """
        datasetInfo = jobReportFile["dataset"]

        dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"],
                                size=jobReportFile["size"],
                                events=jobReportFile["events"],
                                checksums=jobReportFile["checksums"],
                                status="NOTUPLOADED",
                                inPhedex=0)
        dbsFile.setAlgorithm(appName=datasetInfo["applicationName"],
                             appVer=datasetInfo["applicationVersion"],
                             appFam=jobReportFile["module_label"],
                             psetHash="GIBBERISH",
                             configContent=jobReportFile.get('configURL'))

        if errorDataset:
            dbsFile.setDatasetPath(
                "/%s/%s/%s" %
                (datasetInfo["primaryDataset"] + "-Error",
                 datasetInfo["processedDataset"], datasetInfo["dataTier"]))
        else:
            dbsFile.setDatasetPath(
                "/%s/%s/%s" %
                (datasetInfo["primaryDataset"],
                 datasetInfo["processedDataset"], datasetInfo["dataTier"]))

        dbsFile.setValidStatus(
            validStatus=jobReportFile.get("validStatus", None))
        dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None))
        dbsFile.setAcquisitionEra(
            era=jobReportFile.get('acquisitionEra', None))
        dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None))
        # TODO need to find where to get the prep id
        dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None))
        dbsFile['task'] = task
        dbsFile['runs'] = jobReportFile['runs']

        dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0],
                            immediateSave=False)
        self.dbsFilesToCreate.append(dbsFile)
        return

    def findDBSParents(self, lfn):
        """
        _findDBSParents_

        Find the parents of the file in DBS.
        This is meant to be called recursively.
        """
        parentsInfo = self.getParentInfoAction.execute(
            [lfn],
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        newParents = set()
        for parentInfo in parentsInfo:
            # This will catch straight to merge files that do not have redneck
            # parents.  We will mark the straight to merge file from the job
            # as a child of the merged parent.
            if int(parentInfo["merged"]) == 1:
                newParents.add(parentInfo["lfn"])

            elif parentInfo['gpmerged'] is None:
                continue

            # Handle the files that result from merge jobs that aren't redneck
            # children.  We have to setup parentage and then check on whether or
            # not this file has any redneck children and update their parentage
            # information.
            elif int(parentInfo["gpmerged"]) == 1:
                newParents.add(parentInfo["gplfn"])

            # If that didn't work, we've reached the great-grandparents
            # And we have to work via recursion
            else:
                parentSet = self.findDBSParents(lfn=parentInfo['gplfn'])
                for parent in parentSet:
                    newParents.add(parent)

        return newParents
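
    # Worked illustration (LFNs invented): DBS parentage should always point
    # at merged files, so for each direct parent of the given LFN:
    #   parent merged == 1    -> add the parent itself, e.g. '/store/data/parentA.root'
    #   parent gpmerged == 1  -> add the merged grandparent 'gplfn' instead
    #   gpmerged is None      -> skip this parent
    #   otherwise             -> recurse on the grandparent LFN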

    def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None):
        """
        _addFileToWMBS_

        Add a file that was produced in a job to WMBS.
        """
        fwjrFile["first_event"] = jobMask["FirstEvent"]

        if fwjrFile["first_event"] is None:
            fwjrFile["first_event"] = 0

        if jobType == "Merge" and fwjrFile["module_label"] != "logArchive":
            setattr(fwjrFile["fileRef"], 'merged', True)
            fwjrFile["merged"] = True

        wmbsFile = self.createFileFromDataStructsFile(fname=fwjrFile,
                                                      jobID=jobID)

        if jobType == "Merge":
            self.wmbsMergeFilesToBuild.append(wmbsFile)
        else:
            self.wmbsFilesToBuild.append(wmbsFile)

        if fwjrFile["merged"]:
            self.addFileToDBS(
                fwjrFile, task, jobType == "Repack"
                and fwjrFile["size"] > self.maxAllowedRepackOutputSize)

        return wmbsFile

    def handleJob(self, jobID, fwkJobReport):
        """
        _handleJob_

        Figure out if a job was successful or not, handle it appropriately
        (parse FWJR, update WMBS) and return the success status as a boolean

        """
        jobSuccess = fwkJobReport.taskSuccessful()

        outputMap = self.getOutputMapAction.execute(
            jobID=jobID,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        jobType = self.getJobTypeAction.execute(
            jobID=jobID,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        if jobSuccess:
            fileList = fwkJobReport.getAllFiles()

            # consistency check comparing outputMap to fileList
            # they should match except for some limited special cases
            # as of #7998, workflow output identifier is made of output module name and datatier
            outputModules = set([])
            for fwjrFile in fileList:
                outputModules.add(fwjrFile['outputModule'] +
                                  fwjrFile['dataset'].get('dataTier', ''))
            if set(outputMap) == outputModules:
                pass
            elif jobType == "LogCollect" and not outputMap and outputModules == {
                    'LogCollect'
            }:
                pass
            elif jobType == "Merge" and set(outputMap) == {'MergedRAW', 'MergedErrorRAW', 'logArchive'} \
                    and outputModules == {'MergedRAW', 'logArchive'}:
                pass
            elif jobType == "Merge" and set(outputMap) == {'MergedRAW', 'MergedErrorRAW', 'logArchive'} \
                    and outputModules == {'MergedErrorRAW', 'logArchive'}:
                pass
            elif jobType == "Express" and set(outputMap).difference(
                    outputModules) == {'write_RAWRAW'}:
                pass
            else:
                failJob = True
                if jobType in ["Processing", "Production"]:
                    cmsRunSteps = 0
                    for step in fwkJobReport.listSteps():
                        if step.startswith("cmsRun"):
                            cmsRunSteps += 1
                    if cmsRunSteps > 1:
                        failJob = False

                if failJob:
                    jobSuccess = False
                    logging.error(
                        "Job %d , list of expected outputModules does not match job report, failing job",
                        jobID)
                    logging.debug("Job %d , expected outputModules %s", jobID,
                                  sorted(outputMap.keys()))
                    logging.debug("Job %d , fwjr outputModules %s", jobID,
                                  sorted(outputModules))
                    fileList = fwkJobReport.getAllFilesFromStep(
                        step='logArch1')
                else:
                    logging.warning(
                        "Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job",
                        jobID)
            # Make sure every file has a valid location
            # see https://github.com/dmwm/WMCore/issues/9353
            for fwjrFile in fileList:
                # T0 has analysis file without any location, see:
                # https://github.com/dmwm/WMCore/issues/9497
                if not fwjrFile.get("locations") and fwjrFile.get(
                        "lfn", "").endswith(".root"):
                    logging.warning(
                        "The following file doesn't have any location: %s",
                        fwjrFile)
                    jobSuccess = False
                    break
        else:
            fileList = fwkJobReport.getAllFilesFromStep(step='logArch1')

        if jobSuccess:
            logging.info("Job %d , handle successful job", jobID)
        else:
            logging.warning("Job %d , bad jobReport, failing job", jobID)

        # make sure the task name is present in FWJR (recover from WMBS if needed)
        if fileList:
            if jobSuccess:
                self.isTaskExistInFWJR(fwkJobReport, "success")
            else:
                self.isTaskExistInFWJR(fwkJobReport, "failed")

        # special check for LogCollect jobs
        skipLogCollect = False
        if jobSuccess and jobType == "LogCollect":
            for fwjrFile in fileList:
                try:
                    # this assumes there is only one file for LogCollect jobs, not sure what happens if that changes
                    self.associateLogCollectToParentJobsInWMStats(
                        fwkJobReport, fwjrFile["lfn"],
                        fwkJobReport.getTaskName())
                except Exception as ex:
                    skipLogCollect = True
                    logging.error(
                        "Error occurred: associating log collect location, will try again\n %s",
                        str(ex))
                    break

        # now handle the job (unless the special LogCollect check failed)
        if not skipLogCollect:

            wmbsJob = Job(id=jobID)
            wmbsJob.load()
            outputID = wmbsJob.loadOutputID()
            wmbsJob.getMask()

            wmbsJob["fwjr"] = fwkJobReport

            if jobSuccess:
                wmbsJob["outcome"] = "success"
            else:
                wmbsJob["outcome"] = "failure"

            for fwjrFile in fileList:

                logging.debug("Job %d , register output %s", jobID,
                              fwjrFile["lfn"])

                wmbsFile = self.addFileToWMBS(jobType,
                                              fwjrFile,
                                              wmbsJob["mask"],
                                              jobID=jobID,
                                              task=fwkJobReport.getTaskName())
                merged = fwjrFile['merged']
                moduleLabel = fwjrFile["module_label"]

                if merged:
                    self.mergedOutputFiles.append(wmbsFile)

                self.filesetAssoc.append({
                    "lfn": wmbsFile["lfn"],
                    "fileset": outputID
                })

                # LogCollect jobs have no output fileset
                if jobType == "LogCollect":
                    pass
                # Repack jobs that wrote too large merged output skip output filesets
                elif jobType == "Repack" and merged and wmbsFile[
                        "size"] > self.maxAllowedRepackOutputSize:
                    pass
                else:
                    dataTier = fwjrFile['dataset'].get('dataTier', '')
                    outputFilesets = self.outputFilesetsForJob(
                        outputMap, merged, moduleLabel, dataTier)
                    for outputFileset in outputFilesets:
                        self.filesetAssoc.append({
                            "lfn": wmbsFile["lfn"],
                            "fileset": outputFileset
                        })

            # Check if the job had any skipped files, put them in ACDC containers
            # We assume full file processing (no job masks)
            if jobSuccess:
                skippedFiles = fwkJobReport.getAllSkippedFiles()
                if skippedFiles and jobType not in ['LogCollect', 'Cleanup']:
                    self.jobsWithSkippedFiles[jobID] = skippedFiles

            if jobSuccess:
                self.listOfJobsToSave.append(wmbsJob)
            else:
                self.listOfJobsToFail.append(wmbsJob)

        return jobSuccess

    def associateLogCollectToParentJobsInWMStats(self, fwkJobReport,
                                                 logArchiveLFN, task):
        """
        _associateLogCollectToParentJobsInWMStats_

        Associate a logArchive output to its parent job
        """
        if self.fwjrCouchDB is None:
            self.fwjrCouchDB = self.jobCouchdb.connectDatabase("%s/fwjrs" %
                                                               self.jobDBName)

        inputFileList = fwkJobReport.getAllInputFiles()
        requestName = task.split('/')[1]
        keys = []
        for inputFile in inputFileList:
            keys.append([requestName, inputFile["lfn"]])
        resultRows = self.fwjrCouchDB.loadView(
            "FWJRDump",
            'jobsByOutputLFN',
            options={"stale": "update_after"},
            keys=keys)['rows']
        if resultRows:
            # get data from wmbs
            parentWMBSJobIDs = []
            for row in resultRows:
                parentWMBSJobIDs.append({"jobid": row["value"]})
            # update Job doc in wmstats
            results = self.getJobInfoByID.execute(parentWMBSJobIDs)
            parentJobNames = []

            if isinstance(results, list):
                for jobInfo in results:
                    parentJobNames.append(jobInfo['name'])
            else:
                parentJobNames.append(results['name'])

            self.localWMStats.updateLogArchiveLFN(parentJobNames, logArchiveLFN)
        else:
            # TODO: this branch should be removed once couch is consistent with
            # the SQL DB (i.e. resultRows is never empty); the job needs to be
            # failed and retried.
            logging.error(
                "job report is missing for updating log archive mapping\n Input file list\n %s",
                inputFileList)

        return

    def createMissingFWKJR(self,
                           errorCode=999,
                           errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a stand-in FWJR when the real report can't be found
        at the expected path location.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report

    def createFilesInDBSBuffer(self):
        """
        _createFilesInDBSBuffer_

        It does the actual job of creating things in DBSBuffer.
        WARNING: This assumes all files in a job have the same final location.
        """
        if not self.dbsFilesToCreate:
            # Whoops, nothing to do!
            return

        dbsFileTuples = []
        dbsFileLoc = []
        dbsCksumBinds = []
        runLumiBinds = []
        jobLocations = set()

        for dbsFile in self.dbsFilesToCreate:
            # Append a tuple in the format specified by DBSBufferFiles.Add
            # Also run insertDatasetAlgo

            assocID = None
            datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (
                dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"],
                dbsFile["appFam"], dbsFile["psetHash"],
                dbsFile['processingVer'], dbsFile['acquisitionEra'],
                dbsFile['globalTag'])
            # First, check if this is in the cache
            if datasetAlgoPath in self.datasetAlgoPaths:
                for da in self.datasetAlgoID:
                    if da['datasetAlgoPath'] == datasetAlgoPath:
                        assocID = da['assocID']
                        break

            if not assocID:
                # Then we have to get it ourselves
                try:
                    assocID = dbsFile.insertDatasetAlgo()
                    self.datasetAlgoPaths.append(datasetAlgoPath)
                    self.datasetAlgoID.append({
                        'datasetAlgoPath': datasetAlgoPath,
                        'assocID': assocID
                    })
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath
                    msg += str(ex)
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            # Associate the workflow to the file using the taskPath and the requestName
            # TODO: debug why it happens and then drop/recover these cases automatically
            taskPath = dbsFile.get('task')
            if not taskPath:
                msg = "Can't do workflow association, report this error to a developer.\n"
                msg += "DbsFile : %s" % str(dbsFile)
                raise AccountantWorkerException(msg)
            workflowName = taskPath.split('/')[1]
            workflowPath = '%s:%s' % (workflowName, taskPath)
            if workflowPath in self.workflowPaths:
                for wf in self.workflowIDs:
                    if wf['workflowPath'] == workflowPath:
                        workflowID = wf['workflowID']
                        break
            else:
                result = self.dbsGetWorkflow.execute(
                    workflowName,
                    taskPath,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
                workflowID = result['id']

            self.workflowPaths.append(workflowPath)
            self.workflowIDs.append({
                'workflowPath': workflowPath,
                'workflowID': workflowID
            })

            lfn = dbsFile['lfn']
            selfChecksums = dbsFile['checksums']
            jobLocation = dbsFile.getLocations()[0]
            jobLocations.add(jobLocation)
            dbsFileTuples.append(
                (lfn, dbsFile['size'], dbsFile['events'], assocID,
                 dbsFile['status'], workflowID, dbsFile['in_phedex']))

            dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation})
            if dbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    dbsCksumBinds.append({
                        'lfn': lfn,
                        'cksum': selfChecksums[entry],
                        'cktype': entry
                    })

        try:

            diffLocation = jobLocations.difference(self.dbsLocations)
            self.dbsInsertLocation.execute(
                siteName=diffLocation,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
            self.dbsLocations.update(
                diffLocation)  # update the component location cache in place

            self.dbsCreateFiles.execute(files=dbsFileTuples,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetLocation.execute(binds=dbsFileLoc,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetChecksum.execute(bulkList=dbsCksumBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            if runLumiBinds:
                self.dbsSetRunLumi.execute(
                    file=runLumiBinds,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Got exception while inserting files into DBSBuffer!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Listing binds:")
            logging.debug("jobLocation: %s", jobLocation)
            logging.debug("dbsFiles: %s", dbsFileTuples)
            logging.debug("dbsFileLoc: %s", dbsFileLoc)
            logging.debug("Checksum binds: %s", dbsCksumBinds)
            logging.debug("RunLumi binds: %s", runLumiBinds)
            raise AccountantWorkerException(msg)

        # Now that we've created those files, clear the list
        self.dbsFilesToCreate = []
        return

    def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds):
        """
        _handleWMBSFiles_

        Do what can be done in bulk
        """
        if not wmbsFilesToBuild:
            # Nothing to do
            return

        runLumiBinds = []
        fileCksumBinds = []
        fileLocations = []
        fileCreate = []

        for wmbsFile in wmbsFilesToBuild:
            lfn = wmbsFile['lfn']
            if lfn is None:
                continue

            selfChecksums = wmbsFile['checksums']
            # By jobType, add to a different parentage relation:
            # for merge jobs, don't include the parentage on failed input files;
            # otherwise parentage is set for all input files.
            parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']})

            if wmbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']})

            if wmbsFile.getLocations():
                outpnn = wmbsFile.getLocations()[0]
                if self.pnn2Psn.get(outpnn, None):
                    fileLocations.append({'lfn': lfn, 'location': outpnn})
                else:
                    msg = "PNN doesn't exist in wmbs_pnns table: %s (investigate)" % outpnn
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    fileCksumBinds.append({
                        'lfn': lfn,
                        'cksum': selfChecksums[entry],
                        'cktype': entry
                    })

            fileCreate.append([
                lfn, wmbsFile['size'], wmbsFile['events'], None,
                wmbsFile["first_event"], wmbsFile['merged']
            ])

        if not fileCreate:
            return

        try:

            self.addFileAction.execute(files=fileCreate,
                                       conn=self.getDBConn(),
                                       transaction=self.existingTransaction())

            if runLumiBinds:
                self.setFileRunLumi.execute(
                    file=runLumiBinds,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())

            self.setFileAddChecksum.execute(
                bulkList=fileCksumBinds,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

            self.setFileLocation.execute(
                lfn=fileLocations,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        except WMException:
            raise
        except Exception as ex:
            msg = "Error while adding files to WMBS!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Printing binds:")
            logging.debug("FileCreate binds: %s", fileCreate)
            logging.debug("Runlumi binds: %s", runLumiBinds)
            logging.debug("Checksum binds: %s", fileCksumBinds)
            logging.debug("FileLocation binds: %s", fileLocations)
            raise AccountantWorkerException(msg)

        # Clear out finished files, in place so the caller's list is emptied too
        del wmbsFilesToBuild[:]
        return

    def createFileFromDataStructsFile(self, fname, jobID):
        """
        _createFileFromDataStructsFile_

        This function will create a WMBS File given a DataStructs file
        """
        wmbsFile = File()
        wmbsFile.update(fname)

        if fname["locations"] and isinstance(fname["locations"], set):
            pnn = list(fname["locations"])[0]
        elif fname["locations"] and isinstance(fname["locations"], list):
            if len(fname['locations']) > 1:
                logging.error(
                    "Have more then one location for a file in job %i", jobID)
                logging.error("Choosing location %s", fname['locations'][0])
            pnn = fname["locations"][0]
        else:
            pnn = fname["locations"]

        wmbsFile["locations"] = set()

        if pnn is not None:
            wmbsFile.setLocation(pnn=pnn, immediateSave=False)
        wmbsFile['jid'] = jobID

        return wmbsFile

    def handleDBSBufferParentage(self):
        """
        _handleDBSBufferParentage_

        Handle all the DBSBuffer Parentage in bulk if you can
        """
        outputLFNs = [f['lfn'] for f in self.mergedOutputFiles]
        bindList = []
        for lfn in outputLFNs:
            newParents = self.findDBSParents(lfn=lfn)
            for parentLFN in newParents:
                bindList.append({'child': lfn, 'parent': parentLFN})

        # Now all the parents should exist
        # Commit them to DBSBuffer
        logging.info("About to commit all DBSBuffer Heritage information")
        logging.info("Have %d heritage binds to commit", len(bindList))

        if bindList:
            try:
                self.dbsLFNHeritage.execute(
                    binds=bindList,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
            except WMException:
                raise
            except Exception as ex:
                msg = "Error while trying to handle the DBS LFN heritage\n"
                msg += str(ex)
                msg += "BindList: %s" % bindList
                logging.error(msg)
                raise AccountantWorkerException(msg)
        return

    def handleSkippedFiles(self):
        """
        _handleSkippedFiles_

        Handle all the skipped files in bulk.
        The way skipped files are handled imposes an important restriction:
        each skipped file must have been processed by a single job in the
        task, with no job mask applied.
        This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased
        splitting algorithms.
        Here ACDC records are created and the files are moved
        from completed to wmbs_sub_files_failed.
        """
        jobList = self.getFullJobInfo.execute(
            [{
                'jobid': x
            } for x in self.jobsWithSkippedFiles],
            fileSelection=self.jobsWithSkippedFiles,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        self.dataCollection.failedJobs(jobList, useMask=False)
        return
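
A minimal sketch (hypothetical job IDs and LFNs) of the bind list that
handleSkippedFiles builds from self.jobsWithSkippedFiles, the dict that maps
job IDs to their skipped files:

jobsWithSkippedFiles = {101: ['/store/unmerged/a.root'],
                        102: ['/store/unmerged/b.root']}
binds = [{'jobid': x} for x in jobsWithSkippedFiles]
assert sorted(b['jobid'] for b in binds) == [101, 102]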
Example #58
class JobArchiverPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.changeState = ChangeState(self.config)

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.loadAction = self.daoFactory(
            classname="Jobs.LoadFromIDWithWorkflow")

        # Variables
        self.numberOfJobsToCluster = getattr(self.config.JobArchiver,
                                             "numberOfJobsToCluster", 1000)
        self.numberOfJobsToArchive = getattr(self.config.JobArchiver,
                                             "numberOfJobsToArchive", 10000)

        try:
            self.logDir = getattr(
                config.JobArchiver, 'logDir',
                os.path.join(config.JobArchiver.componentDir, 'logDir'))
            if not os.path.isdir(self.logDir):
                os.makedirs(self.logDir)
        except Exception as ex:
            msg = "Unhandled exception while setting up logDir!\n"
            msg += str(ex)
            logging.exception(msg)
            raise JobArchiverPollerException(msg)

        self.tier0Mode = hasattr(config, "Tier0Feeder")

        try:
            if not self.tier0Mode:
                self.workQueue = queueFromConfig(self.config)
        except Exception as ex:
            msg = "Could not load workQueue"
            msg += str(ex)
            logging.error(msg)
            # raise JobArchiverPollerException(msg)

        return
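
    # Note: configuration above is read defensively; if config.JobArchiver
    # does not define numberOfJobsToCluster or numberOfJobsToArchive, the
    # getattr calls fall back to 1000 and 10000 jobs respectively.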

    def setup(self, parameters):
        """
        Load DB objects required for queries
        """
        return

    def terminate(self, params):
        """
        _terminate_

        This function terminates the job after a final pass
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Performs the archiveJobs method, looking for each type of failure
        and dealing with it as desired.
        """
        try:
            self.archiveJobs()
            self.pollForClosable()
            self.markInjected()
        except WMException:
            myThread = threading.currentThread()
            if getattr(myThread, 'transaction', None) is not None \
                    and getattr(myThread.transaction, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            myThread = threading.currentThread()
            msg = "Caught exception in JobArchiver\n"
            msg += str(ex)
            msg += "\n\n"
            if getattr(myThread, 'transaction', None) is not None \
                    and getattr(myThread.transaction, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise JobArchiverPollerException(msg)

        return

    def archiveJobs(self):
        """
        _archiveJobs_

        archiveJobs will handle the master task of looking for finished jobs,
        and running the code that cleans them out.
        """
        doneList = self.findFinishedJobs()
        logging.info("Found %i finished jobs to archive", len(doneList))

        jobCounter = 0
        for slicedList in grouper(doneList, 10000):
            self.cleanWorkArea(slicedList)

            successList = []
            failList = []
            killList = []
            for job in slicedList:
                if job["outcome"] == "success":
                    successList.append(job)
                elif job["outcome"] == "killed":
                    killList.append(job)
                else:
                    failList.append(job)

            myThread = threading.currentThread()
            myThread.transaction.begin()
            self.changeState.propagate(successList, "cleanout", "success")
            self.changeState.propagate(failList, "cleanout", "exhausted")
            self.changeState.propagate(killList, "cleanout", "killed")
            myThread.transaction.commit()

            jobCounter += len(slicedList)
            logging.info("Successfully archived %d jobs out of %d.",
                         jobCounter, len(doneList))
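
    # grouper() chunks the done list so that state changes commit in bounded
    # transactions; a sketch of the slicing, assuming a helper that behaves
    # consistently with its use above:
    #
    #     list(grouper(range(25000), 10000))
    #     # -> three slices of 10000, 10000 and 5000 job IDs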

    def findFinishedJobs(self):
        """
        _findFinishedJobs_

        Will actually, surprisingly, find finished jobs (i.e., jobs that are
        successful, exhausted, or killed)
        """
        jobList = []

        jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
        jobList1 = jobListAction.execute(state="success",
                                         limitRows=self.numberOfJobsToArchive)
        jobList2 = jobListAction.execute(state="exhausted",
                                         limitRows=self.numberOfJobsToArchive)
        jobList3 = jobListAction.execute(state="killed",
                                         limitRows=self.numberOfJobsToArchive)

        jobList.extend(jobList1)
        jobList.extend(jobList2)
        jobList.extend(jobList3)

        if len(jobList) == 0:
            # Then nothing is ready
            return []

        # Put together a list of job IDs
        binds = []
        for jobID in jobList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID=binds)

        if not isinstance(results, list):
            results = [results]

        doneList = []

        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            doneList.append(tmpJob)

        return doneList

    def cleanWorkArea(self, doneList):
        """
        _cleanWorkArea_

        Once the workQueue realizes that a subscription is done, everything
        regarding those jobs is cleaned up.
        """

        for job in doneList:
            # print "About to clean cache for job %i" % (job['id'])
            self.cleanJobCache(job)

        return

    def cleanJobCache(self, job):
        """
        _cleanJobCache_

        Clears out any files still sticking around in the jobCache,
        tars up the contents and sends them off
        """

        cacheDir = job['cache_dir']

        if not cacheDir or not os.path.isdir(cacheDir):
            msg = "Could not find jobCacheDir %s" % (cacheDir)
            logging.error(msg)
            return

        cacheDirList = os.listdir(cacheDir)

        if not cacheDirList:
            os.rmdir(cacheDir)
            return

        # Now we need to set up a final destination
        try:
            # Label all directories by workflow
            # Workflow better have a first character
            workflow = job['workflow']
            firstCharacter = workflow[0]
            jobFolder = 'JobCluster_%i' \
                        % (int(job['id'] / self.numberOfJobsToCluster))
            logDir = os.path.join(self.logDir, firstCharacter, workflow,
                                  jobFolder)
            if not os.path.exists(logDir):
                os.makedirs(logDir)
        except Exception as ex:
            msg = "Exception while trying to make output logDir\n"
            msg += str("logDir: %s\n" % (logDir))
            msg += str(ex)
            logging.error(msg)
            raise JobArchiverPollerException(msg)

        # Otherwise we have something in there
        try:
            tarName = 'Job_%i.tar.bz2' % (job['id'])
            with tarfile.open(name=os.path.join(logDir, tarName),
                              mode='w:bz2') as tarball:
                for fileName in cacheDirList:
                    fullFile = os.path.join(cacheDir, fileName)
                    try:
                        tarball.add(name=fullFile,
                                    arcname='Job_%i/%s' %
                                    (job['id'], fileName))
                    except IOError:
                        logging.error('Cannot read %s, skipping', fullFile)
        except Exception as ex:
            msg = "Exception while opening and adding to a tarfile\n"
            msg += "Tarfile: %s\n" % os.path.join(logDir, tarName)
            msg += str(ex)
            logging.error(msg)
            logging.debug("cacheDirList: %s", cacheDirList)
            raise JobArchiverPollerException(msg)

        try:
            shutil.rmtree(cacheDir, ignore_errors=True)
        except Exception as ex:
            msg = "Error while removing the old cache dir.\n"
            msg += "CacheDir: %s\n" % cacheDir
            msg += str(ex)
            logging.error(msg)
            raise JobArchiverPollerException(msg)

        return
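
    # The resulting archive layout, for a job with id 12345 in workflow
    # "wf001" with the default cluster size of 1000 (hypothetical example):
    #
    #     <logDir>/w/wf001/JobCluster_12/Job_12345.tar.bz2
    #
    # since int(12345 / 1000) == 12 and 'w' is the workflow's first character.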

    def markInjected(self):
        """
        _markInjected_

        Mark any workflows that have been fully injected as injected
        """

        if self.tier0Mode:
            logging.debug(
                "Component will not check workflows for injection status")
            return

        myThread = threading.currentThread()
        getAction = self.daoFactory(classname="Workflow.GetInjectedWorkflows")
        markAction = self.daoFactory(
            classname="Workflow.MarkInjectedWorkflows")
        result = getAction.execute()

        # Check each result to see if it is injected:
        injected = []
        for name in result:
            try:
                if self.workQueue.getWMBSInjectionStatus(
                        name, isDrainMode(self.config)):
                    injected.append(name)
            except WorkQueueNoMatchingElements:
                # workflow not known - free to cleanup
                injected.append(name)
            except Exception as ex:
                logging.exception(
                    "Injection status checking failed, investigate: %s",
                    str(ex))

        logging.info("Found %d workflows to mark as injected", len(injected))
        # Now, mark as injected those that returned True
        if len(injected) > 0:
            myThread.transaction.begin()
            markAction.execute(names=injected, injected=True)
            myThread.transaction.commit()
        return

    def pollForClosable(self):
        """
        _pollForClosable_

        Search WMBS for filesets that can be closed and mark them as closed.
        """
        myThread = threading.currentThread()
        myThread.transaction.begin()

        closableFilesetDAO = self.daoFactory(classname="Fileset.ListClosable")
        closableFilesets = closableFilesetDAO.execute()
        logging.info("Found %d filesets to be closed", len(closableFilesets))

        for closableFileset in closableFilesets:
            openFileset = Fileset(id=closableFileset)
            openFileset.load()

            logging.debug("Closing fileset %s", openFileset.name)
            openFileset.markOpen(False)

        myThread.transaction.commit()
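
A minimal sketch of driving the poller for one pass, assuming a populated
config object such as the ones the tests in this document construct
(getConfig below is a hypothetical stand-in for that setup):

poller = JobArchiverPoller(config=getConfig())
poller.algorithm()    # one archive / close-fileset / mark-injected pass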
Example #59
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentAndGrandParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        self.pnn2Psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        self.jobDBName = config.JobStateMachine.couchDBName
        self.jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = None
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for a later commit
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        return
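
The collections.deque(maxlen=1000) members above act as small bounded caches:
appending to a full deque silently evicts its oldest entry. A self-contained
sketch of that behaviour:

import collections

cache = collections.deque(maxlen=3)
for key in ['a', 'b', 'c', 'd']:
    cache.append(key)
assert list(cache) == ['b', 'c', 'd']    # 'a' was evicted at the maxlen bound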
Example #60
    def testRecordInCouch(self):
        """
        _testRecordInCouch_

        Verify that jobs, state transitions and fwjrs are recorded correctly.
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute("site1", seName="somese.cern.ch")

        testWorkflow = Workflow(spec="spec.xml",
                                owner="Steve",
                                name="wf001",
                                task="Test")
        testWorkflow.create()
        testFileset = Fileset(name="TestFileset")
        testFileset.create()
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased")
        testSubscription.create()

        testFileA = File(lfn="SomeLFNA",
                         events=1024,
                         size=2048,
                         locations=set(["somese.cern.ch"]))
        testFileB = File(lfn="SomeLFNB",
                         events=1025,
                         size=2049,
                         locations=set(["somese.cern.ch"]))
        testFileA.create()
        testFileB.create()

        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.commit()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)
        jobGroup = jobFactory(files_per_job=1)[0]

        assert len(jobGroup.jobs) == 2, \
               "Error: Splitting should have created two jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Merge"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"

        change.propagate([testJobA, testJobB], "new", "none")
        change.propagate([testJobA, testJobB], "created", "new")
        change.propagate([testJobA, testJobB], "executing", "created")

        testJobADoc = change.jobsdatabase.document(testJobA["couch_record"])

        for transition in testJobADoc["states"].itervalues():
            self.assertTrue(
                type(transition["timestamp"]) in (types.IntType,
                                                  types.LongType))

        assert testJobADoc["jobid"] == testJobA["id"], \
               "Error: ID parameter is incorrect."
        assert testJobADoc["name"] == testJobA["name"], \
               "Error: Name parameter is incorrect."
        assert testJobADoc["jobgroup"] == testJobA["jobgroup"], \
               "Error: Jobgroup parameter is incorrect."
        assert testJobADoc["workflow"] == testJobA["workflow"], \
               "Error: Workflow parameter is incorrect."
        assert testJobADoc["task"] == testJobA["task"], \
               "Error: Task parameter is incorrect."
        assert testJobADoc["owner"] == testJobA["owner"], \
               "Error: Owner parameter is incorrect."

        assert testJobADoc["mask"]["FirstEvent"] == testJobA["mask"]["FirstEvent"], \
               "Error: First event in mask is incorrect."
        assert testJobADoc["mask"]["LastEvent"] == testJobA["mask"]["LastEvent"], \
               "Error: Last event in mask is incorrect."
        assert testJobADoc["mask"]["FirstLumi"] == testJobA["mask"]["FirstLumi"], \
               "Error: First lumi in mask is incorrect."
        assert testJobADoc["mask"]["LastLumi"] == testJobA["mask"]["LastLumi"], \
               "Error: First lumi in mask is incorrect."
        assert testJobADoc["mask"]["FirstRun"] == testJobA["mask"]["FirstRun"], \
               "Error: First run in mask is incorrect."
        assert testJobADoc["mask"]["LastEvent"] == testJobA["mask"]["LastRun"], \
               "Error: First event in mask is incorrect."

        assert len(testJobADoc["inputfiles"]) == 1, \
               "Error: Input files parameter is incorrect."

        testJobBDoc = change.jobsdatabase.document(testJobB["couch_record"])

        assert testJobBDoc["jobid"] == testJobB["id"], \
               "Error: ID parameter is incorrect."
        assert testJobBDoc["name"] == testJobB["name"], \
               "Error: Name parameter is incorrect."
        assert testJobBDoc["jobgroup"] == testJobB["jobgroup"], \
               "Error: Jobgroup parameter is incorrect."

        assert testJobBDoc["mask"]["FirstEvent"] == testJobB["mask"]["FirstEvent"], \
               "Error: First event in mask is incorrect."
        assert testJobBDoc["mask"]["LastEvent"] == testJobB["mask"]["LastEvent"], \
               "Error: Last event in mask is incorrect."
        assert testJobBDoc["mask"]["FirstLumi"] == testJobB["mask"]["FirstLumi"], \
               "Error: First lumi in mask is incorrect."
        assert testJobBDoc["mask"]["LastLumi"] == testJobB["mask"]["LastLumi"], \
               "Error: First lumi in mask is incorrect."
        assert testJobBDoc["mask"]["FirstRun"] == testJobB["mask"]["FirstRun"], \
               "Error: First run in mask is incorrect."
        assert testJobBDoc["mask"]["LastEvent"] == testJobB["mask"]["LastRun"], \
               "Error: First event in mask is incorrect."

        assert len(testJobBDoc["inputfiles"]) == 1, \
               "Error: Input files parameter is incorrect."

        changeStateDB = self.couchServer.connectDatabase(
            dbname="changestate_t/jobs")
        allDocs = changeStateDB.document("_all_docs")

        self.assertEqual(len(allDocs["rows"]), 3,
                         "Error: Wrong number of documents.")

        couchJobDoc = changeStateDB.document("1")

        assert couchJobDoc["name"] == testJobA["name"], \
               "Error: Name is wrong"
        assert len(couchJobDoc["inputfiles"]) == 1, \
               "Error: Wrong number of input files."

        result = changeStateDB.loadView("JobDump", "jobsByWorkflowName")

        self.assertEqual(len(result["rows"]), 2,
                         "Error: Wrong number of rows.")
        for row in result["rows"]:
            couchJobDoc = changeStateDB.document(row["value"]["id"])
            self.assertEqual(couchJobDoc["_rev"], row["value"]["rev"],
                             "Error: Rev is wrong.")

        return
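
    # The three propagate() calls above advance both jobs through the state
    # machine one edge at a time; the same chain, condensed (states exactly
    # as used in this test):
    #
    #     for newState, oldState in [("new", "none"),
    #                                ("created", "new"),
    #                                ("executing", "created")]:
    #         change.propagate([testJobA, testJobB], newState, oldState)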