Ejemplo n.º 1
0
    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config, insertStates=True)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'
            job['location'] = 'T2_US_UCSD'
            job.save()

        baAPI.submit(jobs=jobDummies)

        results = baAPI.monitor()
        self.assertEqual(results[0]['Pending'], nJobs)

        return
Ejemplo n.º 2
0
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states and perform
        proper actions with the jobs, according to the state
        """
        timeNow = int(time.time())
        state2ExitCode = {"Aborted": 71301, "Draining": 71302, "Down": 71303}
        executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
        jobInfo = executingJobs.execute(state='executing')

        if jobInfo:
            bossAir = BossAirAPI(self.config)
            jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state
                                                      in state2ExitCode)

            ercode = state2ExitCode.get(state, 71300)
            bossAir.kill(jobtokill, errorCode=ercode)

        # only now that jobs were updated by the plugin, we flip the site state
        setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
        setStateAction.execute(siteName=siteName,
                               state=state,
                               stateTime=timeNow,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        return
Ejemplo n.º 3
0
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states,
        if the state is Aborted we must do extra actions.
        """
        setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
        setStateAction.execute(siteName=siteName,
                               state=state,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
        jobInfo = executingJobs.execute(state='executing')
        if not jobInfo:
            # then no jobs to look at
            return
        bossAir = BossAirAPI(self.config, noSetup=True)
        jobtokill = bossAir.updateSiteInformation(
            jobInfo, siteName, state in ("Aborted", "Draining", "Down"))

        if state == "Aborted":
            ercode = 71301
        elif state == "Draining":
            ercode = 71302
        elif state == "Down":
            ercode = 71303
        else:
            ercode = 71300
        bossAir.kill(jobtokill, errorCode=ercode)

        return
Ejemplo n.º 4
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package = "WMCore.WMBS", \
                                     logger = logging,
                                     dbinterface = myThread.dbi)

        self.config = config

        #Libraries
        self.resourceControl = ResourceControl()

        self.changeState = ChangeState(self.config)
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)

        # BossAir
        self.bossAir = BossAirAPI(config=self.config)

        # Additions for caching-based JobSubmitter
        self.workflowTimestamps = {}
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.siteKeys = {}
        self.locationDict = {}
        self.cmsNames = {}
        self.drainSites = []
        self.sortedSites = []
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobSubmitter")

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception, ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            self.sendAlert(6, msg=msg)
            try:
                logging.debug("PackageDir: %s" % self.packageDir)
                logging.debug("Config: %s" % config)
            except:
                pass
            raise JobSubmitterPollerException(msg)
Ejemplo n.º 5
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn,
                            transaction=True)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs=killableJobs)
        except BossAirException, ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
Ejemplo n.º 6
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        baAPI.kill(jobs=idleJobs)

        del jobSubmitter

        return
Ejemplo n.º 7
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        del jobSubmitter

        return
Ejemplo n.º 8
0
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from PyCondorPlugin.py
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        cacheDir = os.path.join(self.testDir, 'CacheDir')
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN
        # updateSiteInformation() method should edit the classAd for all the jobs
        # that are bound for the site
        # Check the Q manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        if jtok != None:
            baAPI.kill(
                jtok, errorCode=61301
            )  # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)

        return
Ejemplo n.º 9
0
    def __init__(self, config):
        """
        Initialise class members
        """

        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=config)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
        self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")
Ejemplo n.º 10
0
    def testKillWorkflow(self):
        """
        _testKillWorkflow_

        Verify that workflow killing works correctly.
        """
        baAPI = BossAirAPI(config=self.config, insertStates=True)

        # Create nine jobs
        self.setupForKillTest(baAPI=baAPI)
        self.assertEqual(len(baAPI._listRunJobs()), 9)
        killWorkflow("Main", self.config, self.config)

        self.verifyFileKillStatus()
        self.verifyJobKillStatus()
        self.assertEqual(len(baAPI._listRunJobs()), 8)

        return
Ejemplo n.º 11
0
    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')

        return
Ejemplo n.º 12
0
    def __init__(self, config):
        """
        Initialise class members
        """

        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=config)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")

        # initialize the alert framework (if available)
        self.initAlerts(compName="JobTracker")
Ejemplo n.º 13
0
    def __init__(self, config):
        """
        __init__
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        self.bossAir = BossAirAPI(config = self.config)
        self.reqmgr = RequestManager({'endpoint' : self.config.JobUpdater.reqMgrUrl})
        self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                                   self.config.WorkQueueManager.dbname)

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)

        self.listWorkflowsDAO = self.daoFactory(classname = "Workflow.ListForJobUpdater")
        self.updateWorkflowPrioDAO = self.daoFactory(classname = "Workflow.UpdatePriority")
        self.executingJobsDAO = self.daoFactory(classname = "Jobs.GetNumberOfJobsForWorkflowTaskStatus")
Ejemplo n.º 14
0
    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """

        return

        myThread = threading.currentThread()

        config = self.getConfig()

        changeState = ChangeState(config)

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'mnorman'
            job['location'] = 'T2_US_UCSD'
            job.save()

        baAPI.submit(jobs=jobDummies)

        results = baAPI.monitor()

        self.assertEqual(len(results), nJobs)
        for job in results:
            self.assertEqual(job['plugin'], 'CondorPlugin')

        return
Ejemplo n.º 15
0
    def testKillWorkflow(self):
        """
        _testKillWorkflow_

        Verify that workflow killing works correctly.
        """
        configFile = EmulatorSetup.setupWMAgentConfig()

        config = loadConfigurationFile(configFile)

        baAPI = BossAirAPI(config=config)

        # Create nine jobs
        self.setupForKillTest(baAPI=baAPI)
        self.assertEqual(len(baAPI._listRunJobs()), 9)
        killWorkflow("Main", config, config)

        self.verifyFileKillStatus()
        self.verifyJobKillStatus()
        self.assertEqual(len(baAPI._listRunJobs()), 8)

        EmulatorSetup.deleteConfig(configFile)
        return
Ejemplo n.º 16
0
    def testB_PluginTest(self):
        """
        _PluginTest_


        Now check that these functions worked if called through plugins
        Instead of directly.

        There are only three plugin
        """
        #return

        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        result = myThread.dbi.processData(
            "SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return
Ejemplo n.º 17
0
    def testA_APITest(self):
        """
        _APITest_

        This is a commissioning test that has very little to do
        with anything except loading the code.
        """
        #return

        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # We should have loaded a plugin
        self.assertTrue('TestPlugin' in baAPI.plugins.keys())

        result = myThread.dbi.processData(
            "SELECT name FROM bl_status")[0].fetchall()
        statusList = []
        for i in result:
            statusList.append(i.values()[0])

        # We should have the plugin states in the database
        self.assertEqual(statusList.sort(), ['New', 'Dead', 'Gone'].sort())

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)
        print(jobDummies)

        baAPI.createNewJobs(wmbsJobs=jobDummies)

        runningJobs = baAPI._listRunJobs()

        self.assertEqual(len(runningJobs), nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)
        deadJobs = baAPI._loadByStatus(status='Dead')
        self.assertEqual(len(deadJobs), 0)
        raisesException = False

        self.assertRaises(BossAirException,
                          baAPI._loadByStatus,
                          status='FalseStatus')

        # Change the job status and update it
        for job in newJobs:
            job['status'] = 'Dead'

        baAPI._updateJobs(jobs=newJobs)

        # Test whether we see the job status as updated
        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)
        deadJobs = baAPI._loadByStatus(status='Dead')
        self.assertEqual(len(deadJobs), nJobs)

        # Can we load by BossAir ID?
        loadedJobs = baAPI._loadByID(jobs=deadJobs)
        self.assertEqual(len(loadedJobs), nJobs)

        # Can we load via WMBS?
        loadedJobs = baAPI.loadByWMBS(wmbsJobs=jobDummies)
        self.assertEqual(len(loadedJobs), nJobs)

        # See if we can delete jobs
        baAPI._deleteJobs(jobs=deadJobs)

        # Confirm that they're gone
        deadJobs = baAPI._loadByStatus(status='Dead')
        self.assertEqual(len(deadJobs), 0)

        self.assertEqual(len(baAPI.jobs), 0)

        return
Ejemplo n.º 18
0
    def testD_MyProxyDelegation(self):
        """
        _MyProxyDelegation_

        Test whether we can delegate a proxy via myproxy to this job

        IMPORTANT:
        If you are going to run this test you will have to set the serverCert/Key
        config options to point to your local server cert.  You will also have to
        run this job with your DN.  I don't recommend figuring out how to do this
        without knowing what you're doing in regards to proxy stuff.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        proxyDir = os.path.join(self.testDir, 'proxyDir')
        os.mkdir(proxyDir)
        config = self.getConfig()
        config.BossAir.removeTime = -10.0
        config.BossAir.pluginNames.append('VanillaCondorPlugin')
        config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
        config.BossAir.delegatedServerKey = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
        config.BossAir.myproxyServer = 'myproxy.cern.ch'
        config.BossAir.proxyDir = proxyDir
        config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

        userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        baAPI = BossAirAPI(config=config)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'VanillaCondorPlugin'
            tmpJob['owner'] = userDN
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            jobList.append(tmpJob)

        info = {}
        #info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        proxyFile = os.listdir(proxyDir)[0]
        stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \
                                                    % os.path.join(proxyDir, proxyFile))
        self.assertEqual(
            stdout.split('\n')[0],
            'subject   : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        SubprocessAlgos.runCommand(cmd=command, shell=False)

        return
Ejemplo n.º 19
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(
            )
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return
Ejemplo n.º 20
0
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs
        """

        config = self.getConfig()
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI = BossAirAPI(config=config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs=nJobs,
                                          location='grid-ce-01.ba.infn.it')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' %
                                config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn=userdn)
        for j in jobDummies:
            job = j  # {'id': j['id']}
            job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
            job['location'] = 'grid-ce-01.ba.infn.it'
            job['plugin'] = 'gLitePlugin'
            job['name'] = j['name']
            job['cache_dir'] = self.testDir
            job['retry_count'] = 0
            job['owner'] = userdn
            job['packageDir'] = self.testDir
            job['sandbox'] = sandbox
            job['priority'] = None
            jobList.append(job)

        baAPI.submit(jobs=jobList)

        # Should be new jobs
        newJobs = baAPI._loadByStatus(status='New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # Should be not anymore marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)

        # Killing all the jobs
        baAPI.kill(jobList)
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
Ejemplo n.º 21
0
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        dummycacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')
        sn = "T2_US_UCSD"

        # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist
        # in BossAir_t.py
        baAPI.updateSiteInformation(idleJobs, sn, True)

        # Now kill 'em manually
        #        command = ['condor_rm', self.user]
        #        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        #        pipe.communicate()

        del jobSubmitter

        return
Ejemplo n.º 22
0
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the SimpleCondorPlugin, checking all of
        its functions with a single set of jobs
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs=nJobs)
        baAPI = BossAirAPI(config=config, insertStates=True)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'SimpleCondorPlugin'
            tmpJob['owner'] = 'tapas'
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            tmpJob['usergroup'] = "wheel"
            tmpJob['userrole'] = 'cmsuser'
            jobList.append(tmpJob)

        info = {}
        # info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.kill(jobs=jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        # newJobs = baAPI._loadByStatus(status = 'Removed')
        # self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should remove immediately
        baAPI.track()

        # Assert that jobs were listed as completed
        myThread = threading.currentThread()
        newJobs = baAPI._loadByStatus(status='Removed', complete='0')
        self.assertEqual(len(newJobs), nJobs)
        return
Ejemplo n.º 23
0
    def testE_FullChain(self):
        """
        _FullChain_

        Full test going through the chain; using polling cycles and everything
        """

        from WMComponent.JobSubmitter.JobSubmitter import JobSubmitter
        from WMComponent.JobStatusLite.JobStatusLite import JobStatusLite
        from WMComponent.JobTracker.JobTracker import JobTracker

        myThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'SimpleCondorPlugin'

        baAPI = BossAirAPI(config=config, insertStates=True)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site='se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitter(config=config)
        jobTracker = JobTracker(config=config)
        jobStatus = JobStatusLite(config=config)

        jobSubmitter.prepareToStart()
        jobTracker.prepareToStart()
        jobStatus.prepareToStart()

        # What should happen here:
        # 1) The JobSubmitter should submit the jobs
        # 2) Because of the ridiculously short time on pending jobs
        #     the JobStatus poller should mark the jobs as done
        #     and kill them.
        # 3) The JobTracker should realize there are finished jobs
        #
        # So at the end of several polling cycles, the jobs should all
        # be done, but be in the failed status (they timed out)

        time.sleep(20)

        myThread.workerThreadManager.terminateWorkers()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state='JobFailed', jobType="Processing")
        self.assertEqual(len(result), nJobs * nSubs)
        return
Ejemplo n.º 24
0
    def testD_PrototypeChain(self):
        """
        _PrototypeChain_

        Prototype the BossAir workflow
        """
        dummymyThread = threading.currentThread()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'SimpleCondorPlugin'

        baAPI = BossAirAPI(config=config, insertStates=True)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        dummycacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site='se.T2_US_UCSD')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobTracker = JobTrackerPoller(config=config)
        statusPoller = StatusPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Check WMBS
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        statusPoller.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Tracker should do nothing
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Wait for jobs to timeout due to short Pending wait period
        time.sleep(12)

        statusPoller.algorithm()

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Timeout', complete='0')
        self.assertEqual(len(newJobs), nSubs * nJobs)

        # Jobs should be gone
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nSubs * nJobs)

        # Because they timed out, they all should have failed
        jobTracker.algorithm()

        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        result = getJobsAction.execute(state='JobFailed', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return
Ejemplo n.º 25
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs),
                         workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are to many jobs skip the couch and dashboard update
            # TODO: couch and dashboard need to be updated or parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
Ejemplo n.º 26
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except Exception as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            try:
                logging.debug("PackageDir: %s", self.packageDir)
                logging.debug("Config: %s", config)
            except:
                pass
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        return
Ejemplo n.º 27
0
    def testH_ARCTest(self):
        """
        _ARCTest_

        This test works on the ARCPlugin, checking all of
        its functions with a single set of jobs
        """

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginNames.append("ARCPlugin")
        #config.BossAir.pluginNames = ["ARCPlugin"]
        baAPI  = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi')
        #baAPI.createNewJobs(wmbsJobs = jobDummies)
        #changeState = ChangeState(config)
        #changeState.propagate(jobDummies, 'created', 'new')
        #changeState.propagate(jobDummies, 'executing', 'created')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'jade-cms.hip.fi'}
            job['location'] = 'jade-cms.hip.fi'
            job['plugin'] = 'ARCPlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = 'edelmann'
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        for j in rJobs:
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

            #if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in [ "Pending", "Running" ]:

        baAPI.kill(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        succ, fail = baAPI.submit(jobs = jobList)

        time.sleep(30)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        idStr = ""
        for j in rJobs:
            idStr += " " + j['gridid']
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

        # Now kill 'em manually
        no_jobs = True
        while no_jobs:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if output.find("Job information not found") >= 0:
                # It seems the jobs hasn't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue
            else:
                no_jobs = False

            # Just to be sure, if the jobs were already finished, do a
            # 'ngclean' too.
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]

        # Make sure the killing of the jobs reaches the info.sys.
        still_jobs = True
        while still_jobs:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if output.find("Job information not found") < 0:
                # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue
            else:
                still_jobs = False

        # See what happened
        baAPI.track()

        idJobs = baAPI._loadByID(rJobs)
        nActiveJobs = 0
        nRemovedJobs = 0
        for j in idJobs:
            if j['status'] not in [ "New", "KILLING", "KILLED", "LOST" ]:
                nActiveJobs += 1
            if j['status'] in [ "KILLING", "KILLED", "LOST" ]:
                nRemovedJobs += 1
        self.assertEqual(nActiveJobs, 0)
        self.assertEqual(nRemovedJobs, nJobs)

        return