Ejemplo n.º 1
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Submit a batch of jobs through the JobSubmitterPoller with the
        CondorPlugin in WMS mode, then push a site-status update for the
        jobs that BossAir reports as idle.

        NOTE(review): nothing in this test removes the submitted jobs (the
        manual condor_rm at the end is commented out) - confirm that the
        queue is cleaned elsewhere, since the test starts by requiring an
        empty queue.
        """

        # Refuse to run when the user already has jobs in the queue
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config = config)

        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        # No site is assigned to the jobs (site = None)
        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # One pass of the submitter should put every created job in the queue
        jobSubmitter = JobSubmitterPoller(config = config)
        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')
        siteName = "T2_US_UCSD"

        # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist
        # in BossAir_t.py
        baAPI.updateSiteInformation(idleJobs, siteName, True)

        # Now kill 'em manually
        #        command = ['condor_rm', self.user]
        #        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        #        pipe.communicate()

        del jobSubmitter

        return
Ejemplo n.º 2
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Submit a batch of jobs through the JobSubmitterPoller with the
        PyCondorPlugin in WMS mode, check that they all reach the queue and
        kill the ones that BossAir reports as idle.
        """

        # Refuse to run when the user already has jobs in the queue
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        # No site is assigned to the jobs (site=None)
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # One pass of the submitter should put every created job in the queue
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Refresh the job states, then remove whatever is idle
        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        baAPI.kill(jobs=idleJobs)

        del jobSubmitter

        return
Ejemplo n.º 3
0
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from PyCondorPlugin.py

        Jobs bound for se.T2_US_UCSD are submitted, the site is then
        reported through updateSiteInformation() and any jobs it hands
        back are killed with a site error code.
        """

        # Refuse to run when the user already has jobs in the queue
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
        # updateSiteInformation() method should edit the classAd for all the jobs
        # that are bound for the site
        # Check the Q manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        # Identity check: only a literal None means "nothing to kill"
        if jtok is not None:
            # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)
            baAPI.kill(jtok, errorCode=61301)

        return
Ejemplo n.º 4
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Submit a batch of jobs through the JobSubmitterPoller with the
        PyCondorPlugin in WMS mode, check that they all reach the queue and
        kill the ones that BossAir reports as idle.
        """

        # Refuse to run when the user already has jobs in the queue
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config = config)

        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        # No site is assigned to the jobs (site = None)
        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # One pass of the submitter should put every created job in the queue
        jobSubmitter = JobSubmitterPoller(config = config)
        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Refresh the job states, then remove whatever is idle
        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')

        baAPI.kill(jobs = idleJobs)

        del jobSubmitter

        return
Ejemplo n.º 5
0
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from CondorPlugin.py

        Jobs bound for se.T2_US_UCSD are submitted, the site is then
        reported through updateSiteInformation() and any jobs it hands
        back are killed with a site error code.
        """

        # Refuse to run when the user already has jobs in the queue
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
        # updateSiteInformation() method should edit the classAd for all the jobs
        # that are bound for the site
        # Check the Q manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        # Identity check: only a literal None means "nothing to kill"
        if jtok is not None:
            # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)
            baAPI.kill(jtok, errorCode=71301)

        return
Ejemplo n.º 6
0
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from CondorPlugin.py

        Loads the jobs BossAir reports as 'Idle', prints them together
        with their ids and hands them to updateSiteInformation().  The
        test makes no assertions; the output is meant to be inspected.
        """

        # NOTE(review): the value is never checked - confirm whether an
        # assertion on the number of running jobs was intended here.
        nRunning = getCondorRunningJobs(self.user)

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        baAPI = BossAirAPI(config = config)
        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')

        # Single-argument print() behaves the same on Python 2 and 3
        print(idleJobs)
        for job in idleJobs:
            print(job['id'])

        # NOTE(review): other callers pass (jobs, siteName, flag) to this
        # method - verify that the 'info' keyword is still accepted.
        baAPI.updateSiteInformation(idleJobs, info = None)

        return
Ejemplo n.º 7
0
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the CondorPlugin, checking all of
        its functions with a single set of jobs: submit, track, kill,
        resubmit and finally an external removal with condor_rm.
        """
        # Refuse to run when the user already has jobs in the queue
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs = nJobs)

        baAPI = BossAirAPI(config = config)

        # Single-argument print() behaves the same on Python 2 and 3
        print(self.testDir)

        # Create placeholder job package and sandbox files; the context
        # managers close the handles even if the write fails
        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        with open(jobPackage, 'w') as handle:
            handle.write(' ')

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        with open(sandbox, 'w') as handle:
            handle.write(' ')

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom']      = {'location': 'malpaquet'}
            tmpJob['name']        = j['name']
            tmpJob['cache_dir']   = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin']      = 'CondorPlugin'
            tmpJob['owner']       = 'tapas'
            tmpJob['packageDir']  = self.testDir
            tmpJob['sandbox']     = sandbox
            tmpJob['priority']    = None
            tmpJob['usergroup']   = "wheel"
            tmpJob['userrole']    = 'cmsuser'
            jobList.append(tmpJob)

        info = {}
        #info['packageDir'] = self.testDir
        info['index']      = 0
        info['sandbox']    = sandbox

        baAPI.submit(jobs = jobList, info = info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        # Straight after submission every job is recorded as 'New'
        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        # After tracking they must all have moved from 'New' to 'Idle'
        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.kill(jobs = jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs = jobList, info = info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), 0)

        #newJobs = baAPI._loadByStatus(status = 'Removed')
        #self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should remove immediately
        baAPI.track()

        # Assert that jobs were listed as completed
        newJobs = baAPI._loadByStatus(status = 'Removed', complete = '0')
        self.assertEqual(len(newJobs), nJobs)

        return
Ejemplo n.º 8
0
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Worker thread that asks BossAir to track the jobs and kills those
    that have exceeded the timeout configured for their state.
    """
    def __init__(self, config):
        """
        __init__

        Store the configuration, build the BossAir API, read the
        per-state timeouts and initialise the alert system.
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # Dictionary of state name -> timeout.  A false value (e.g. an
        # empty dictionary) switches the timeout handling off.
        self.timeouts = config.JobStatusLite.stateTimeouts

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Run checkStatus() and take care of failures: roll back any open
        transaction, send an alert, and re-raise.  Non-WMException errors
        are logged and re-raised as StatusPollerException.
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            transaction = getattr(myThread, 'transaction', None)
            if transaction:
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller" + str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            transaction = getattr(myThread, 'transaction', None)
            if transaction:
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Let BossAir track the jobs, then mark as 'Timeout' and kill every
        job that has been in its current global state for longer than the
        timeout configured for that state.
        """

        runningJobs = self.bossAir.track()

        # Nothing tracked: nothing to check
        if len(runningJobs) == 0:
            return

        # Timeouts disabled: never kill anything
        if not self.timeouts:
            return

        timedOutJobs = []

        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)

            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero",
                              job['id'])
                continue

            # Skip jobs without a status time or without a configured timeout
            if not (timeout and statusTime):
                continue

            elapsed = time.time() - float(statusTime)
            if elapsed > float(timeout):
                # Timeout status is used by JobTracker to fail jobs in WMBS database
                logging.info(
                    "Killing job %i because it has exceeded timeout for status '%s'",
                    job['id'], globalState)
                job['status'] = 'Timeout'
                timedOutJobs.append(job)

        # Record the 'Timeout' status first, then kill the jobs, all
        # inside a single transaction.
        timeoutCode = 71304
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=timedOutJobs)
        self.bossAir.kill(jobs=timedOutJobs,
                          killMsg=WM_JOB_ERROR_CODES[timeoutCode],
                          errorCode=timeoutCode)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Called by the master thread on shutdown: run the algorithm one
        last time before exiting.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Ejemplo n.º 9
0
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs: submit, track and kill.
        """

        config = self.getConfig()
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI = BossAirAPI(config=config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs=nJobs,
                                          location='grid-ce-01.ba.infn.it')

        # Create placeholder job package and sandbox files; the context
        # managers close the handles even if the write fails
        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        with open(jobPackage, 'w') as handle:
            handle.write(' ')

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        with open(sandbox, 'w') as handle:
            handle.write(' ')

        # Register the proxy owner's DN as a user before submitting
        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' %
                                config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn=userdn)
        for j in jobDummies:
            # The dummy job itself is decorated (not a copy of it)
            job = j  # {'id': j['id']}
            job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
            job['location'] = 'grid-ce-01.ba.infn.it'
            job['plugin'] = 'gLitePlugin'
            job['name'] = j['name']
            job['cache_dir'] = self.testDir
            job['retry_count'] = 0
            job['owner'] = userdn
            job['packageDir'] = self.testDir
            job['sandbox'] = sandbox
            job['priority'] = None
            jobList.append(job)

        baAPI.submit(jobs=jobList)

        # Should be new jobs
        # NOTE(review): the assertion checks that the number of 'New' jobs
        # is NOT nJobs, which contradicts the comment above - confirm intent.
        newJobs = baAPI._loadByStatus(status='New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # Should be not anymore marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)

        # Killing all the jobs
        baAPI.kill(jobList)
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
Ejemplo n.º 10
0
    def testB_PluginTest(self):
        """
        _PluginTest_

        Exercise the BossAir API through a plugin (the TestPlugin)
        instead of calling the underlying functions directly: submit,
        track, collect the completed jobs and finally remove them.
        """
        myThread = threading.currentThread()

        config = self.getConfig()
        baAPI = BossAirAPI(config = config)

        # Build the dummy jobs and move them new -> created -> executing
        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'Xanadu')
        stateChanger = ChangeState(config)
        for newState, oldState in (('created', 'new'), ('executing', 'created')):
            stateChanger.propagate(jobDummies, newState, oldState)

        # Every job needs a plugin and an owner before it can be submitted
        for dummy in jobDummies:
            dummy['plugin'] = 'TestPlugin'
            dummy['owner'] = 'tapas'

        baAPI.submit(jobs = jobDummies)

        # Right after submission all jobs are 'New' ...
        self.assertEqual(len(baAPI._loadByStatus(status = 'New')), nJobs)

        # ... and all of them are listed as run jobs
        self.assertEqual(len(baAPI._listRunJobs()), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        self.assertEqual(len(baAPI._listRunJobs()), 0)

        # Check if they're complete
        self.assertEqual(len(baAPI.getComplete()), nJobs)

        # Query the table directly, because BossAir is specifically built
        # to keep it from finding completed jobs
        rows = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(rows), nJobs)

        baAPI.removeComplete(jobs = jobDummies)

        # After removal the table must be empty
        rows = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(rows), 0)

        return
Ejemplo n.º 11
0
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Worker thread that asks BossAir to track the jobs and kills those
    that have exceeded the timeout configured for their state.
    """

    def __init__(self, config):
        """
        __init__

        Store the configuration, build the BossAir API and read the
        per-state timeouts.
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # Dictionary of state name -> timeout.  A false value (e.g. an
        # empty dictionary) switches the timeout handling off.
        self.timeouts = config.JobStatusLite.stateTimeouts

        return

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Run checkStatus() and take care of failures: roll back any open
        transaction and re-raise.  Non-WMException errors are logged and
        re-raised as StatusPollerException.
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            transaction = getattr(myThread, 'transaction', None)
            if transaction:
                myThread.transaction.rollbackForError()
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller" + str(ex)
            logging.exception(msg)
            transaction = getattr(myThread, 'transaction', None)
            if transaction:
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Let BossAir track the jobs, then mark as 'Timeout' and kill every
        job that has been in its current global state for longer than the
        timeout configured for that state.  The error code used for the
        kill depends on the state the job was in.
        """

        runningJobs = self.bossAir.track()

        # Nothing tracked: nothing to check
        if len(runningJobs) == 0:
            return

        # Timeouts disabled: never kill anything
        if not self.timeouts:
            return

        # Timed-out jobs, grouped by the global state they were stuck in
        jobsToKill = defaultdict(list)

        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)

            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue

            # Skip jobs without a status time or without a configured timeout
            if not (timeout and statusTime):
                continue

            elapsed = time.time() - float(statusTime)
            if elapsed > float(timeout):
                # Timeout status is used by JobTracker to fail jobs in WMBS database
                logging.info("Killing job %i because it has exceeded timeout for status '%s'", job['id'], globalState)
                job['status'] = 'Timeout'
                jobsToKill[globalState].append(job)

        timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306}

        # Record the 'Timeout' status for all jobs first, then kill them
        # state by state, all inside a single transaction.
        allTimedOut = flattenList(jobsToKill.values())
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=allTimedOut)
        for previousState in jobsToKill:
            # 71307 is only a fallback: states should be among Running, Pending, Error
            errorCode = timeOutCodeMap.get(previousState, 71307)
            self.bossAir.kill(jobs=jobsToKill[previousState],
                              killMsg=WM_JOB_ERROR_CODES[errorCode],
                              errorCode=errorCode)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Called by the master thread on shutdown: run the algorithm one
        last time before exiting.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Ejemplo n.º 12
0
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs: submit, track and kill.
        """

        config = self.getConfig()
        config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'grid-ce-01.ba.infn.it')

        # Create placeholder job package and sandbox files; the context
        # managers close the handles even if the write fails
        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        with open(jobPackage, 'w') as handle:
            handle.write(' ')

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        with open(sandbox, 'w') as handle:
            handle.write(' ')

        # Register the proxy owner's DN as a user before submitting
        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname = "Users.New")
        newuser.execute(dn = userdn)
        for j in jobDummies:
            # The dummy job itself is decorated (not a copy of it)
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'grid-ce-01.ba.infn.it'}
            job['location']    = 'grid-ce-01.ba.infn.it'
            job['plugin']      = 'gLitePlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = userdn
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        # Should be new jobs
        # NOTE(review): the assertion checks that the number of 'New' jobs
        # is NOT nJobs, which contradicts the comment above - confirm intent.
        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # Should be not anymore marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)

        # Killing all the jobs
        baAPI.kill(jobList)
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
Ejemplo n.º 13
0
    def testH_ARCTest(self):
        """
        _ARCTest_

        This test works on the ARCPlugin, checking all of
        its functions with a single set of jobs: submit, track, kill,
        resubmit and finally an external kill with the ng* command line
        tools.

        NOTE(review): the two polling loops near the end have no upper
        bound on the number of retries - confirm that this is acceptable.
        """

        # Refuse to run when the user already has ARC jobs
        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginNames.append("ARCPlugin")
        #config.BossAir.pluginNames = ["ARCPlugin"]
        baAPI = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi')
        #baAPI.createNewJobs(wmbsJobs = jobDummies)
        #changeState = ChangeState(config)
        #changeState.propagate(jobDummies, 'created', 'new')
        #changeState.propagate(jobDummies, 'executing', 'created')

        # Create placeholder job package and sandbox files; the context
        # managers close the handles even if the write fails
        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        with open(jobPackage, 'w') as handle:
            handle.write(' ')

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        with open(sandbox, 'w') as handle:
            handle.write(' ')

        jobList = []
        for j in jobDummies:
            # The dummy job itself is decorated (not a copy of it)
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'jade-cms.hip.fi'}
            job['location']    = 'jade-cms.hip.fi'
            job['plugin']      = 'ARCPlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = 'edelmann'
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        # Every run job must have left the 'New' status
        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        for j in rJobs:
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

        baAPI.kill(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        succ, fail = baAPI.submit(jobs = jobList)

        time.sleep(30)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        for j in rJobs:
            if j['status'] != "New":
                nOldJobs += 1
        # Space-prefixed grid ids, used as arguments to ngstat below
        idStr = "".join([" " + j['gridid'] for j in rJobs])
        self.assertEqual(nOldJobs, nJobs)

        # Now kill 'em manually
        while True:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if "Job information not found" in output:
                # It seems the jobs hasn't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue

            # Just to be sure, if the jobs were already finished, do a
            # 'ngclean' too.
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            break

        # Make sure the killing of the jobs reaches the info.sys.
        while True:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if "Job information not found" in output:
                break
            # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
            # Sleep a while and try again
            time.sleep(20)

        # See what happened
        baAPI.track()

        # No job may still be active and all of them must be in one of
        # the removed states
        removedStates = [ "KILLING", "KILLED", "LOST" ]
        idJobs = baAPI._loadByID(rJobs)
        nActiveJobs = 0
        nRemovedJobs = 0
        for j in idJobs:
            if j['status'] not in [ "New" ] + removedStates:
                nActiveJobs += 1
            if j['status'] in removedStates:
                nRemovedJobs += 1
        self.assertEqual(nActiveJobs, 0)
        self.assertEqual(nRemovedJobs, nJobs)

        return
Ejemplo n.º 14
0
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Worker thread that asks BossAir to track the running jobs and then
    kills every job that has stayed in its current global state for
    longer than the timeout configured for that state.
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects

        :param config: component configuration. It is handed to BossAirAPI,
            and ``config.JobStatusLite.stateTimeouts`` is read if present.
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts', {})

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Run checkStatus() and handle any exception it raises: roll back
        the thread's transaction (if it has one), send an alert, and
        re-raise.

        :param parameters: not used by this method.
        :raises WMException: re-raised unchanged when checkStatus raises one.
        :raises StatusPollerException: wraps any other exception.
        """
        myThread = threading.currentThread()
        try:
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            # Separate the fixed prefix from the exception text so the
            # resulting message stays readable.
            msg = "Unhandled error in statusPoller: %s" % str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.

        A job is timed out when a timeout is configured for its
        'globalState' and more than that many seconds have passed since
        its 'status_time'. Timed-out jobs get status 'Timeout', are
        updated in BossAir and then killed with error code 61304, all
        inside one transaction on the current thread.
        """
        runningJobs = self.bossAir.track()

        if not runningJobs:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout is None or statusTime is None:
                # No timeout configured for this state, or no timestamp
                # to compare against.
                continue
            if time.time() - float(statusTime) > float(timeout):
                # Then the job needs to be killed.
                logging.info("Killing job %i because it has exceeded timeout for status %s",
                             job['id'], globalState)
                job['status'] = 'Timeout'
                jobsToKill.append(job)

        if not jobsToKill:
            # Nothing timed out: do not open a transaction just to
            # update and kill an empty list of jobs.
            return

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[61304], errorCode=61304)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.

        :param params: forwarded to algorithm().
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Ejemplo n.º 15
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Submit job groups through the CondorPlugin with submitWMSMode
        switched on, check that every job shows up in condor, and then
        update the site information of the idle jobs.
        """

        # The test needs a clean condor queue for this user.
        alreadyRunning = getCondorRunningJobs(self.user)
        self.assertEqual(alreadyRunning, 0,
                         "User currently has %i running jobs.  Test will not continue" % (alreadyRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        # Not used below.
        dummycacheDir = os.path.join(self.testDir, 'CacheDir')

        rerecoTask = workload.getTask("ReReco")
        specPath = os.path.join(self.testDir, 'workloadTest', workloadName)
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=rerecoTask,
                                            workloadSpec=specPath,
                                            site=None)

        # Move every job from 'new' to 'created' before the submitter runs.
        for jobGroup in jobGroupList:
            changeState.propagate(jobGroup.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Every job of every subscription should now be in condor.
        submittedCount = getCondorRunningJobs(self.user)
        self.assertEqual(submittedCount, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')
        siteName = "T2_US_UCSD"

        # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist
        # in BossAir_t.py
        baAPI.updateSiteInformation(idleJobs, siteName, True)

        # The manual kill step is disabled in this test:
        #        command = ['condor_rm', self.user]
        #        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        #        pipe.communicate()

        del jobSubmitter

        return
Ejemplo n.º 16
0
    def testB_PluginTest(self):
        """
        _PluginTest_

        Drive submit, track, getComplete and removeComplete through the
        TestPlugin instead of calling them directly, checking the job
        counts after each step.
        """

        config = self.getConfig()
        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')

        changeState = ChangeState(config)
        for newState, oldState in (('created', 'new'), ('executing', 'created')):
            changeState.propagate(jobDummies, newState, oldState)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for dummy in jobDummies:
            dummy['plugin'] = 'TestPlugin'
            dummy['owner'] = 'tapas'

        myThread = threading.currentThread()

        def countRunJobRows():
            """Return the number of rows currently in bl_runjob."""
            rows = myThread.dbi.processData(
                "SELECT id FROM bl_runjob")[0].fetchall()
            return len(rows)

        baAPI.submit(jobs=jobDummies)

        # Right after submission every job is in status 'New' ...
        self.assertEqual(len(baAPI._loadByStatus(status='New')), nJobs)

        # ... and every job is listed as a run job.
        self.assertEqual(len(baAPI._listRunJobs()), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        self.assertEqual(len(baAPI._listRunJobs()), 0)

        # Check if they're complete
        self.assertEqual(len(baAPI.getComplete()), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        self.assertEqual(countRunJobRows(), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        # After removal the table must be empty.
        self.assertEqual(countRunJobRows(), 0)

        return
Ejemplo n.º 17
0
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the SimpleCondorPlugin, checking all of
        its functions with a single set of jobs: submit, track, kill,
        resubmit, and finally a manual condor_rm followed by tracking.
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs=nJobs)
        baAPI = BossAirAPI(config=config, insertStates=True)

        # Create the placeholder files; 'with' closes each handle even
        # if the write fails.
        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        with open(jobPackage, 'w') as handle:
            handle.write(' ')

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        with open(sandbox, 'w') as handle:
            handle.write(' ')

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'SimpleCondorPlugin'
            tmpJob['owner'] = 'tapas'
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            tmpJob['usergroup'] = "wheel"
            tmpJob['userrole'] = 'cmsuser'
            jobList.append(tmpJob)

        info = {}
        # info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        # After tracking, no job is 'New' any more and all are 'Idle'.
        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        idleJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(idleJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        idleJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(idleJobs), nJobs)

        baAPI.kill(jobs=jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        idleJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(idleJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        idleJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(idleJobs), 0)

        # newJobs = baAPI._loadByStatus(status = 'Removed')
        # self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should remove immediately
        baAPI.track()

        # All jobs should now load with status 'Removed' and complete='0'
        removedJobs = baAPI._loadByStatus(status='Removed', complete='0')
        self.assertEqual(len(removedJobs), nJobs)
        return