Example 1
    def testB_PluginTest(self):
        """
        _PluginTest_

        Check that submission, tracking and completion work when driven
        through a BossAir plugin ('TestPlugin') rather than by calling the
        internal BossAir methods directly.
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some dummy jobs and walk them to the 'executing' state
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # Every submitted job should have a run-job record at this point
        # (NOTE(review): the previous comment here claimed "no more running
        # jobs", which contradicted the assertion below)
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Query bl_runjob directly because BossAir is specifically built
        # to keep completed jobs out of its normal listings
        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        # removeComplete should have purged the run-job rows
        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return
Example 2
    def testB_PluginTest(self):
        """
        _PluginTest_

        Exercise the BossAir submit/track/complete cycle through the
        TestPlugin instead of calling the internal methods directly.
        """
        thread = threading.currentThread()
        config = self.getConfig()
        api = BossAirAPI(config=config)

        # Build ten dummy jobs and walk them into the 'executing' state
        jobCount = 10
        dummies = self.createDummyJobs(nJobs=jobCount, location='Xanadu')
        stateChanger = ChangeState(config)
        stateChanger.propagate(dummies, 'created', 'new')
        stateChanger.propagate(dummies, 'executing', 'created')

        # Each job needs a plugin and an owner before it can be submitted
        for job in dummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        api.submit(jobs=dummies)

        # Submission creates one 'New' entry and one run-job per job
        self.assertEqual(len(api._loadByStatus(status='New')), jobCount)
        self.assertEqual(len(api._listRunJobs()), jobCount)

        # The TestPlugin marks every job complete during tracking
        api.track()

        # No run jobs should remain; all should now be complete
        self.assertEqual(len(api._listRunJobs()), 0)
        self.assertEqual(len(api.getComplete()), jobCount)

        # BossAir hides completed jobs from its listings, so check the
        # bl_runjob table directly
        rows = thread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(rows), jobCount)

        api.removeComplete(jobs=dummies)

        # removeComplete should purge the run-job rows entirely
        rows = thread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(rows), 0)

        return
Example 3
    def testD_PrototypeChain(self):
        """
        _PrototypeChain_

        Prototype the full BossAir workflow: submit jobs through the
        JobSubmitterPoller, advance them with the StatusPoller, let them
        time out, and verify the JobTrackerPoller fails them all.

        Requires a working condor installation; aborts if the user already
        has running condor jobs.
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0,
                         "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        # NOTE: the original kept unused locals (myThread, cacheDir);
        # they have been removed.
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site='se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobTracker = JobTrackerPoller(config=config)
        statusPoller = StatusPoller(config=config)

        jobSubmitter.algorithm()

        # Every job should now be submitted to condor and 'New' in BossAir
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Check WMBS: all jobs should be executing
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        statusPoller.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Status polling should have moved every job from 'New' to 'Idle'
        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Tracker should do nothing while jobs are still idle
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Wait for jobs to time out due to the short Pending wait period
        time.sleep(12)

        statusPoller.algorithm()

        # All jobs should now be in 'Timeout' and no longer 'Idle'
        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Timeout', complete='0')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Jobs should be gone from condor
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Check if they're complete in BossAir
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nSubs * nJobs)

        # Because they timed out, the tracker should fail them all
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state='JobFailed', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example 4
    def testD_PrototypeChain(self):
        """
        _PrototypeChain_

        Prototype the full BossAir workflow: submit jobs, poll their
        statuses, let them time out, and verify the JobTrackerPoller
        fails them all.  Requires a working condor installation.
        """
        # 'dummy' prefix marks a deliberately-unused local (kept for parity
        # with earlier versions of this test)
        dummymyThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'SimpleCondorPlugin'

        baAPI = BossAirAPI(config=config, insertStates=True)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        # deliberately unused (see note above)
        dummycacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site='se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobTracker = JobTrackerPoller(config=config)
        statusPoller = StatusPoller(config=config)

        jobSubmitter.algorithm()

        # Everything should now be submitted to condor and 'New' in BossAir
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Check WMBS
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        statusPoller.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Status polling should have moved jobs from 'New' to 'Idle'
        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Tracker should do nothing
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Wait for jobs to timeout due to short Pending wait period
        time.sleep(12)

        statusPoller.algorithm()

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Timeout', complete='0')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Jobs should be gone
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nSubs * nJobs)

        # Because they timed out, they all should have failed
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state='JobFailed', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Example 5
class JobTrackerPoller(BaseWorkerThread):
    """
    _JobTrackerPoller_

    Polls the BossAir database for complete jobs
    Handles completed jobs
    """
    def __init__(self, config):
        """
        Initialise class members

        :param config: WMCore component configuration object
        """

        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=config)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        # Pre-load the DAOs used on every polling cycle
        self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
        self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # Nothing to do: DAOs are already loaded in __init__

        return

    def terminate(self, params=None):
        """
        _terminate_

        Terminate the function after one more run.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Performs the archiveJobs method, looking for each type of failure
        And deal with it as desired.

        Rolls back any open transaction on error; known WMExceptions
        propagate unchanged, anything else is wrapped in a
        JobTrackerException.
        """
        logging.info("Running Tracker algorithm")
        myThread = threading.currentThread()
        try:
            self.trackJobs()
        except WMException:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = "Unknown exception in JobTracker!\n"
            msg += str(ex)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollback()
            logging.error(msg)
            raise JobTrackerException(msg)

        return

    def trackJobs(self):
        """
        _trackJobs_

        Finds a list of running jobs and the sites that they're running at,
        and passes that off to tracking.
        """
        passedJobs = []
        failedJobs = []

        jobList = self.jobListAction.execute(state="executing")
        logging.info("Have list of %i executing jobs in WMBS", len(jobList))

        if not jobList:
            return

        # retrieve completed jobs from BossAir that are 'executing' in WMBS
        completeJobs = self.bossAir.getComplete()
        logging.info(
            "Have list of %i jobs complete in BossAir but executing in WMBS",
            len(completeJobs))
        logging.debug(completeJobs)

        # Split completed jobs: 'timeout' status fails, everything else passes
        for job in completeJobs:
            if job['id'] not in jobList:
                logging.error(
                    "Found a complete job in BossAir without a correspondent in WMBS!"
                )
                continue
            if job['status'].lower() == 'timeout':
                failedJobs.append(job)
            else:
                passedJobs.append(job)

        # Assume all these jobs "passed" if they aren't in timeout
        self.passJobs(passedJobs)
        self.failJobs(failedJobs)

        return

    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout

        :param failedJobs: list of job objects whose BossAir status timed out
        """
        if len(failedJobs) == 0:
            return

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path to be persisted in couch
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle; synthesize a
                # minimal report so the failure can still be recorded
                logging.error(
                    "The pickle in %s could not be loaded, generating a new one",
                    jrPath)
                fwjr = Report()
                fwjr.addError("NoJobReport", 99303, "NoJobReport",
                              WM_JOB_ERROR_CODES[99303])
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # Record all FWJR paths and propagate the state change in a single
        # transaction so a failure rolls everything back together
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds,
                                   conn=myThread.transaction.conn,
                                   transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return

    def passJobs(self, passedJobs):
        """
        _passJobs_

        Pass jobs and move their stuff?

        :param passedJobs: list of completed job objects to mark 'complete'
        """
        if len(passedJobs) == 0:
            return

        jrBinds = []
        for job in passedJobs:
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

        # Record FWJR paths and move jobs to 'complete' inside one transaction
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds,
                                   conn=myThread.transaction.conn,
                                   transaction=True)
        self.changeState.propagate(passedJobs, 'complete', 'executing')
        myThread.transaction.commit()

        logging.info("Passed %i jobs", len(passedJobs))

        return
Example 6
class JobTrackerPoller(BaseWorkerThread):
    """
    _JobTrackerPoller_

    Polls the BossAir database for complete jobs and propagates them to
    their terminal WMBS states ('complete' or 'jobfailed').
    """
    def __init__(self, config):
        """
        Initialise class members

        :param config: WMCore component configuration object
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=config)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobTracker")

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # Nothing to do: DAOs are loaded in __init__
        return

    def terminate(self, params=None):
        """
        _terminate_

        Terminate the function after one more run.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    def algorithm(self, parameters=None):
        """
        Performs the archiveJobs method, looking for each type of failure
        And deal with it as desired.

        Rolls back any open transaction and sends an alert on error; known
        WMExceptions propagate unchanged, anything else is wrapped in a
        JobTrackerException.
        """
        logging.info("Running Tracker algorithm")
        myThread = threading.currentThread()
        try:
            self.trackJobs()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollback()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unknown exception in JobTracker!\n"
            msg += str(ex)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollback()
            self.sendAlert(6, msg=msg)
            logging.error(msg)
            raise JobTrackerException(msg)

        return

    def trackJobs(self):
        """
        _trackJobs_

        Finds a list of running jobs and the sites that they're running at,
        and passes that off to tracking.
        """
        passedJobs = []
        failedJobs = []

        # Get all jobs WMBS thinks are running
        jobList = self.jobListAction.execute(state="Executing")

        if not jobList:
            # No jobs: do nothing
            return

        logging.info("Have list of %i executing jobs", len(jobList))

        # Now get all jobs that BossAir thinks are complete
        completeJobs = self.bossAir.getComplete()

        logging.info("%i jobs are complete in BossAir", len(completeJobs))
        logging.debug(completeJobs)

        # Split completed jobs: 'timeout' status fails, everything else passes
        for job in completeJobs:
            if job['id'] not in jobList:
                continue
            if job['status'].lower() == 'timeout':
                failedJobs.append(job)
            else:
                passedJobs.append(job)

        # Assume all these jobs "passed" if they aren't in timeout
        self.passJobs(passedJobs)
        self.failJobs(failedJobs)

        return

    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout

        :param failedJobs: list of job objects whose BossAir status timed out
        """
        if not failedJobs:
            return

        myThread = threading.currentThread()

        # DAO for recording the framework-job-report path
        setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path so it
            # can be persisted in couch
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle; synthesize a
                # minimal report so the failure can still be archived
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # Set all paths at once, inside the transaction so a failure in the
        # state propagation rolls the path updates back too
        myThread.transaction.begin()
        setFWJRAction.execute(binds=jrBinds,
                              conn=myThread.transaction.conn,
                              transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return

    def passJobs(self, passedJobs):
        """
        _passJobs_

        Pass jobs and move their stuff?

        :param passedJobs: list of completed job objects to mark 'complete'
        """
        if not passedJobs:
            return

        setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")

        jrBinds = []
        for job in passedJobs:
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

        # Record all FWJR paths in one transaction
        myThread = threading.currentThread()
        myThread.transaction.begin()
        setFWJRAction.execute(binds=jrBinds,
                              conn=myThread.transaction.conn,
                              transaction=True)
        myThread.transaction.commit()

        # Mark 'em as complete
        self.changeState.propagate(passedJobs, 'complete', 'executing')
        logging.debug("Propagating jobs in jobTracker")
        logging.info("Passed %i jobs", len(passedJobs))
        return
Example 7
class JobTrackerPoller(BaseWorkerThread):
    """
    _JobTrackerPoller_

    Polls the BossAir database for complete jobs and propagates them to
    their terminal WMBS states ("complete" or "jobfailed").
    """

    def __init__(self, config):
        """
        Initialise class members

        :param config: WMCore component configuration object
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=config)
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)

        self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobTracker")

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # Nothing to do: DAOs are loaded in __init__
        return

    def terminate(self, params=None):
        """
        _terminate_

        Terminate the function after one more run.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    def algorithm(self, parameters=None):
        """
        Performs the archiveJobs method, looking for each type of failure
        And deal with it as desired.

        Rolls back any open transaction and sends an alert on error; known
        WMExceptions propagate unchanged, anything else is wrapped in a
        JobTrackerException.
        """
        logging.info("Running Tracker algorithm")
        myThread = threading.currentThread()
        try:
            self.trackJobs()
        except WMException as ex:
            if getattr(myThread, "transaction", None):
                myThread.transaction.rollback()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unknown exception in JobTracker!\n"
            msg += str(ex)
            if getattr(myThread, "transaction", None):
                myThread.transaction.rollback()
            self.sendAlert(6, msg=msg)
            logging.error(msg)
            raise JobTrackerException(msg)

        return

    def trackJobs(self):
        """
        _trackJobs_

        Finds a list of running jobs and the sites that they're running at,
        and passes that off to tracking.
        """
        passedJobs = []
        failedJobs = []

        # Get all jobs WMBS thinks are running
        jobList = self.jobListAction.execute(state="Executing")

        if not jobList:
            # No jobs: do nothing
            return

        logging.info("Have list of %i executing jobs", len(jobList))

        # Now get all jobs that BossAir thinks are complete
        completeJobs = self.bossAir.getComplete()

        logging.info("%i jobs are complete in BossAir", len(completeJobs))
        logging.debug(completeJobs)

        # Split completed jobs: "timeout" status fails, everything else passes
        for job in completeJobs:
            if job["id"] not in jobList:
                continue
            if job["status"].lower() == "timeout":
                failedJobs.append(job)
            else:
                passedJobs.append(job)

        # Assume all these jobs "passed" if they aren't in timeout
        self.passJobs(passedJobs)
        self.failJobs(failedJobs)

        return

    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout

        :param failedJobs: list of job objects whose BossAir status timed out
        """
        if not failedJobs:
            return

        myThread = threading.currentThread()

        # DAO for recording the framework-job-report path
        setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path so it
            # can be persisted in couch
            jrPath = os.path.join(job.getCache(), "Report.%i.pkl" % (job["retry_count"]))
            jrBinds.append({"jobid": job["id"], "fwjrpath": jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle; synthesize a
                # minimal report so the failure can still be archived
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # Set all paths at once, inside the transaction so a failure in the
        # state propagation rolls the path updates back too
        myThread.transaction.begin()
        setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
        self.changeState.propagate(failedJobs, "jobfailed", "executing")
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return

    def passJobs(self, passedJobs):
        """
        _passJobs_

        Pass jobs and move their stuff?

        :param passedJobs: list of completed job objects to mark "complete"
        """
        if not passedJobs:
            return

        setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")

        jrBinds = []
        for job in passedJobs:
            jrPath = os.path.join(job.getCache(), "Report.%i.pkl" % (job["retry_count"]))
            jrBinds.append({"jobid": job["id"], "fwjrpath": jrPath})

        # Record all FWJR paths in one transaction
        myThread = threading.currentThread()
        myThread.transaction.begin()
        setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
        myThread.transaction.commit()

        # Mark 'em as complete
        self.changeState.propagate(passedJobs, "complete", "executing")
        logging.debug("Propagating jobs in jobTracker")
        logging.info("Passed %i jobs", len(passedJobs))
        return
Example 8
class JobTrackerPoller(BaseWorkerThread):
    """
    _JobTrackerPoller_

    Polls the BossAir database for complete jobs
    Handles completed jobs
    """

    def __init__(self, config):
        """
        Initialise class members

        :param config: WMCore component configuration object
        """

        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=config)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        # Pre-load the DAOs used on every polling cycle
        self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
        self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # Nothing to do: DAOs are already loaded in __init__

        return

    def terminate(self, params=None):
        """
        _terminate_

        Terminate the function after one more run.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
        return

    def algorithm(self, parameters=None):
        """
        Performs the archiveJobs method, looking for each type of failure
        And deal with it as desired.

        Rolls back any open transaction on error; known WMExceptions
        propagate unchanged, anything else is wrapped in a
        JobTrackerException.
        """
        logging.info("Running Tracker algorithm")
        myThread = threading.currentThread()
        try:
            self.trackJobs()
        except WMException:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = "Unknown exception in JobTracker!\n"
            msg += str(ex)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollback()
            logging.error(msg)
            raise JobTrackerException(msg)

        return

    def trackJobs(self):
        """
        _trackJobs_

        Finds a list of running jobs and the sites that they're running at,
        and passes that off to tracking.
        """
        passedJobs = []
        failedJobs = []

        jobList = self.jobListAction.execute(state="executing")
        logging.info("Have list of %i executing jobs in WMBS", len(jobList))

        if not jobList:
            return

        # retrieve completed jobs from BossAir that are 'executing' in WMBS
        completeJobs = self.bossAir.getComplete()
        logging.info("Have list of %i jobs complete in BossAir but executing in WMBS", len(completeJobs))
        logging.debug(completeJobs)

        # Split completed jobs: 'timeout' status fails, everything else passes
        for job in completeJobs:
            if job['id'] not in jobList:
                logging.error("Found a complete job in BossAir without a correspondent in WMBS!")
                continue
            if job['status'].lower() == 'timeout':
                failedJobs.append(job)
            else:
                passedJobs.append(job)

        # Assume all these jobs "passed" if they aren't in timeout
        self.passJobs(passedJobs)
        self.failJobs(failedJobs)

        return

    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout

        :param failedJobs: list of job objects whose BossAir status timed out
        """
        if len(failedJobs) == 0:
            return

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path to be persisted in couch
            jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle; synthesize a
                # minimal report so the failure can still be recorded
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # Record all FWJR paths and propagate the state change in a single
        # transaction so a failure rolls everything back together
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return

    def passJobs(self, passedJobs):
        """
        _passJobs_

        Pass jobs and move their stuff?

        :param passedJobs: list of completed job objects to mark 'complete'
        """
        if len(passedJobs) == 0:
            return

        jrBinds = []
        for job in passedJobs:
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

        # Record FWJR paths and move jobs to 'complete' inside one transaction
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
        self.changeState.propagate(passedJobs, 'complete', 'executing')
        myThread.transaction.commit()

        logging.info("Passed %i jobs", len(passedJobs))

        return