Example #1
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states and perform
        proper actions with the jobs, according to the state
        """
        timeNow = int(time.time())
        state2ExitCode = {"Aborted": 71301, "Draining": 71302, "Down": 71303}
        executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
        jobInfo = executingJobs.execute(state='executing')

        if jobInfo:
            bossAir = BossAirAPI(self.config)
            jobtokill = bossAir.updateSiteInformation(jobInfo, siteName,
                                                      state in state2ExitCode)

            ercode = state2ExitCode.get(state, 71300)
            bossAir.kill(jobtokill, errorCode=ercode)

        # only now that jobs were updated by the plugin, we flip the site state
        setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
        setStateAction.execute(siteName=siteName,
                               state=state,
                               stateTime=timeNow,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        return
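These changeSiteState variants come from WMCore's resource-control layer. A hedged usage sketch, assuming the method lives on WMCore's ResourceControl class (where changeSiteState is defined in WMCore) and that a WMBS database connection is already set up:

from WMCore.ResourceControl.ResourceControl import ResourceControl

# Drain a site: per the dict above, executing jobs bound for it are killed
# with exit code 71302 and the site state is flipped in WMBS.
resourceControl = ResourceControl()
resourceControl.changeSiteState("T2_US_UCSD", "Draining")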
Example #2
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states and perform
        proper actions with the jobs, according to the state
        """
        state2ExitCode = {"Aborted": 71301,
                          "Draining": 71302,
                          "Down": 71303}
        executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
        jobInfo = executingJobs.execute(state='executing')

        if jobInfo:
            bossAir = BossAirAPI(self.config, noSetup=True)
            jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state in state2ExitCode)

            ercode = state2ExitCode.get(state, 71300)
            bossAir.kill(jobtokill, errorCode=ercode)

        # only now that jobs were updated by the plugin, we flip the site state
        setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
        setStateAction.execute(siteName=siteName, state=state,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        return
Example #3
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states,
        if the state is Aborted we must do extra actions.
        """
        setStateAction = self.wmbsDAOFactory(classname = "Locations.SetState")
        setStateAction.execute(siteName = siteName, state = state,
                               conn = self.getDBConn(),
                               transaction = self.existingTransaction())

        executingJobs = self.wmbsDAOFactory(classname = "Jobs.ListByState")
        jobInfo = executingJobs.execute(state = 'executing')
        if not jobInfo:
            # then no jobs to look at
            return
        bossAir = BossAirAPI(self.config, noSetup = True)
        jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state in ("Aborted","Draining","Down"))

        if state == "Aborted":
            ercode=71301
        elif state == "Draining":
            ercode=71302
        elif state == "Down":
            ercode=71303
        else:
            ercode=71300
        bossAir.kill(jobtokill, errorCode=ercode)
        
        return
Example #4
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states,
        if the state is Aborted we must do extra actions.
        """
        setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
        setStateAction.execute(siteName=siteName,
                               state=state,
                               conn=self.getDBConn(),
                               transaction=self.existingTransaction())

        executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
        jobInfo = executingJobs.execute(state='executing')
        if not jobInfo:
            # then no jobs to look at
            return
        bossAir = BossAirAPI(self.config, noSetup=True)
        jobtokill = bossAir.updateSiteInformation(
            jobInfo, siteName, state in ("Aborted", "Draining", "Down"))

        if state == "Aborted":
            ercode = 71301
        elif state == "Draining":
            ercode = 71302
        elif state == "Down":
            ercode = 71303
        else:
            ercode = 71300
        bossAir.kill(jobtokill, errorCode=ercode)

        return
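Examples 3 and 4 spell out with an if/elif chain what Examples 1 and 2 express as a dict lookup with a default; the two dispatches are equivalent. A quick standalone check (the 'Normal' state is purely illustrative):

state2ExitCode = {"Aborted": 71301, "Draining": 71302, "Down": 71303}

def ercodeChain(state):
    # if/elif version from Examples 3 and 4
    if state == "Aborted":
        return 71301
    elif state == "Draining":
        return 71302
    elif state == "Down":
        return 71303
    return 71300

# 'Normal' stands in for any state outside the map
assert all(ercodeChain(s) == state2ExitCode.get(s, 71300)
           for s in ("Aborted", "Draining", "Down", "Normal"))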
Example #5
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn,
                            transaction=True)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs=killableJobs)
        except BossAirException as ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
Example #6
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig = None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package = "WMCore.WMBS",
                            logger = myThread.logger,
                            dbinterface = myThread.dbi)
    killFilesAction = daoFactory(classname = "Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname = "Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName = workflowName,
                            conn = myThread.transaction.conn,
                            transaction = True)

    liveJobs = killJobsAction.execute(workflowName = workflowName,
                                      conn = myThread.transaction.conn,
                                      transaction = True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config = bossAirConfig, noSetup = True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id = liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)
        # Now kill them
        try:
            bossAir.kill(jobs = killableJobs)
        except BossAirException as ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
Example #7
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from PyCondorPlugin.py
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        cacheDir = os.path.join(self.testDir, 'CacheDir')
        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Flag the site in the job's site list as ABORTED/DRAINING/DOWN.
        # The updateSiteInformation() method should edit the classAds of all
        # the jobs that are bound for the site.
        # Check the queue manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        if jtok is not None:
            # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down)
            baAPI.kill(jtok, errorCode=61301)

        return
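The comment above suggests verifying the edited classAds by hand with condor_q -l. With the htcondor Python bindings the same spot check can be scripted; a sketch, assuming a local schedd and that the plugin records the site list in a DESIRED_Sites attribute (treat the attribute name as an assumption and inspect one ad with condor_q -l first):

import htcondor  # HTCondor Python bindings

schedd = htcondor.Schedd()
# Print the (assumed) site-list attribute for every job in the queue
for ad in schedd.query(projection=["ClusterId", "ProcId", "DESIRED_Sites"]):
    print(ad.get("ClusterId"), ad.get("ProcId"), ad.get("DESIRED_Sites"))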
Example #8
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI = BossAirAPI(config=config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs=nSubs,
                                            nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(
                                                self.testDir, 'workloadTest',
                                                workloadName),
                                            site=None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        baAPI.kill(jobs=idleJobs)

        del jobSubmitter

        return
Example #9
    def testF_WMSMode(self):
        """
        _WMSMode_

        Try running things in WMS Mode.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'PyCondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config = config)

        workload = self.createTestWorkload()

        workloadName = "basicWorkload"

        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        cacheDir = os.path.join(self.testDir, 'CacheDir')

        jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs,
                                            task = workload.getTask("ReReco"),
                                            workloadSpec = os.path.join(self.testDir,
                                                                        'workloadTest',
                                                                        workloadName),
                                            site = None)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')


        jobSubmitter = JobSubmitterPoller(config = config)

        jobSubmitter.algorithm()

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status = 'Idle')

        baAPI.kill(jobs = idleJobs)

        del jobSubmitter

        return
Example #10
    def testT_updateJobInfo(self):
        """
        _updateJobInfo_

        Test the updateSiteInformation method from CondorPlugin.py
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginName = 'CondorPlugin'
        config.BossAir.submitWMSMode = True

        baAPI  = BossAirAPI(config=config)
        workload = self.createTestWorkload()
        workloadName = "basicWorkload"
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 2
        dummycacheDir = os.path.join(self.testDir, 'CacheDir')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir,
                                                                      'workloadTest',
                                                                      workloadName),
                                            site="se.T2_US_UCSD")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nSubs * nJobs)

        baAPI.track()
        idleJobs = baAPI._loadByStatus(status='Idle')

        ##
        # Flag the site in the job's site list as ABORTED/DRAINING/DOWN.
        # The updateSiteInformation() method should edit the classAds of all
        # the jobs that are bound for the site.
        # Check the queue manually using condor_q -l <job id>
        #
        jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True)
        if jtok is not None:
            baAPI.kill(jtok, errorCode=71301)  # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down)

        return
Example #11
    def changeSiteState(self, siteName, state):
        """
        _changeSiteState_
        Set a site to some of the possible states,
        if the state is Aborted we must do extra actions.
        """
        setStateAction = self.wmbsDAOFactory(classname = "Locations.SetState")
        setStateAction.execute(siteName = siteName, state = state,
                               conn = self.getDBConn(),
                               transaction = self.existingTransaction())
        if state == "Aborted" and self.config:
            # Kill all jobs in the batch system assigned to this site
            executingJobs = self.wmbsDAOFactory(classname = "Jobs.ListByStateAndLocation")
            jobIds = executingJobs.execute(state = 'executing', location = siteName)
            bossAir = BossAirAPI(self.config, noSetup = True)
            bossAir.kill(jobIds, errorCode = 61301)
        return
Example #12
    def testH_ARCTest(self):
        """
        _ARCTest_

        This test works on the ARCPlugin, checking all of
        its functions with a single set of jobs
        """

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        config = self.getConfig()
        config.BossAir.pluginNames.append("ARCPlugin")
        #config.BossAir.pluginNames = ["ARCPlugin"]
        baAPI  = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi')
        #baAPI.createNewJobs(wmbsJobs = jobDummies)
        #changeState = ChangeState(config)
        #changeState.propagate(jobDummies, 'created', 'new')
        #changeState.propagate(jobDummies, 'executing', 'created')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'jade-cms.hip.fi'}
            job['location'] = 'jade-cms.hip.fi'
            job['plugin'] = 'ARCPlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = 'edelmann'
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        for j in rJobs:
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

            #if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in [ "Pending", "Running" ]:

        baAPI.kill(jobs = jobList)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        succ, fail = baAPI.submit(jobs = jobList)

        time.sleep(30)

        nRunning = getNArcJobs()
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        rJobs = baAPI._listRunJobs()
        nOldJobs = 0
        idStr = ""
        for j in rJobs:
            idStr += " " + j['gridid']
            if j['status'] != "New":
                nOldJobs += 1
        self.assertEqual(nOldJobs, nJobs)

        # Now kill 'em manually
        no_jobs = True
        while no_jobs:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if output.find("Job information not found") >= 0:
                # It seems the jobs haven't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue
            else:
                no_jobs = False

            # Just to be sure, if the jobs were already finished, do a
            # 'ngclean' too.
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a'
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]

        # Make sure the killing of the jobs reaches the info.sys.
        still_jobs = True
        while still_jobs:
            command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr
            pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True)
            output = pipe.communicate()[0]
            if output.find("Job information not found") < 0:
                # It seems the killing of the jobs hasn't reached the ARC info.sys yet.
                # Sleep a while and try again
                time.sleep(20)
                continue
            else:
                still_jobs = False

        # See what happened
        baAPI.track()

        idJobs = baAPI._loadByID(rJobs)
        nActiveJobs = 0
        nRemovedJobs = 0
        for j in idJobs:
            if j['status'] not in [ "New", "KILLING", "KILLED", "LOST" ]:
                nActiveJobs += 1
            if j['status'] in [ "KILLING", "KILLED", "LOST" ]:
                nRemovedJobs += 1
        self.assertEqual(nActiveJobs, 0)
        self.assertEqual(nRemovedJobs, nJobs)

        return
Example #13
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs
        """

        config = self.getConfig()
        config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI  = BossAirAPI(config = config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'grid-ce-01.ba.infn.it')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname = "Users.New")
        newuser.execute(dn = userdn)
        for j in jobDummies:
            job = j # {'id': j['id']}
            job['custom']      = {'location': 'grid-ce-01.ba.infn.it'}
            job['location']    = 'grid-ce-01.ba.infn.it'
            job['plugin']      = 'gLitePlugin'
            job['name']        = j['name']
            job['cache_dir']   = self.testDir
            job['retry_count'] = 0
            job['owner']       = userdn
            job['packageDir']  = self.testDir
            job['sandbox']     = sandbox
            job['priority']    = None
            jobList.append(job)

        baAPI.submit(jobs = jobList)

        # Should be new jobs
        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # Should be not anymore marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)


        # Killing all the jobs
        baAPI.kill( jobList )
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
Example #14
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the CondorPlugin, checking all of
        its functions with a single set of jobs
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs = nJobs)

        baAPI  = BossAirAPI(config = config)

        print(self.testDir)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom']      = {'location': 'malpaquet'}
            tmpJob['name']        = j['name']
            tmpJob['cache_dir']   = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin']      = 'CondorPlugin'
            tmpJob['owner']       = 'tapas'
            tmpJob['packageDir']  = self.testDir
            tmpJob['sandbox']     = sandbox
            tmpJob['priority']    = None
            tmpJob['usergroup']   = "wheel"
            tmpJob['userrole']    = 'cmsuser'
            jobList.append(tmpJob)


        info = {}
        #info['packageDir'] = self.testDir
        info['index']      = 0
        info['sandbox']    = sandbox

        baAPI.submit(jobs = jobList, info = info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)


        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.kill(jobs = jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs = jobList, info = info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)


        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        newJobs = baAPI._loadByStatus(status = 'Idle')
        self.assertEqual(len(newJobs), 0)

        #newJobs = baAPI._loadByStatus(status = 'Removed')
        #self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should remove immediately
        baAPI.track()

        # Assert that jobs were listed as completed
        myThread = threading.currentThread()
        newJobs = baAPI._loadByStatus(status = 'Removed', complete = '0')
        self.assertEqual(len(newJobs), nJobs)

        return
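Several of these tests gate on getCondorRunningJobs(self.user) before doing anything. A minimal stand-in for that helper, assuming it simply counts the user's jobs in the local condor queue (the real helper lives in WMCore's test utilities and may count differently):

import htcondor

def getCondorRunningJobs(user):
    # Count this user's jobs in the local schedd queue
    schedd = htcondor.Schedd()
    ads = schedd.query(constraint='Owner == "%s"' % user,
                       projection=["ClusterId"])
    return len(ads)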
Example #15
    def testG_gLiteTest(self):
        """
        _gLiteTest_

        This test works on the gLitePlugin, checking all of
        its functions with a single set of jobs
        """

        config = self.getConfig()
        config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf'
        config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/'
        config.BossAir.gLiteProcesses = 2
        config.BossAir.gLitePrefixEnv = "/lib64/"
        config.BossAir.pluginNames.append("gLitePlugin")
        config.BossAir.manualProxyPath = environ['X509_USER_PROXY']

        config.Agent.serverDN = "/we/bypass/myproxy/logon"

        #config.BossAir.pluginNames = ["gLitePlugin"]
        baAPI = BossAirAPI(config=config)

        nJobs = 2
        jobDummies = self.createDummyJobs(nJobs=nJobs,
                                          location='grid-ce-01.ba.infn.it')

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        userdn = executeCommand('grid-cert-info -subject -file %s' %
                                config.BossAir.manualProxyPath)
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn=userdn)
        for j in jobDummies:
            job = j  # {'id': j['id']}
            job['custom'] = {'location': 'grid-ce-01.ba.infn.it'}
            job['location'] = 'grid-ce-01.ba.infn.it'
            job['plugin'] = 'gLitePlugin'
            job['name'] = j['name']
            job['cache_dir'] = self.testDir
            job['retry_count'] = 0
            job['owner'] = userdn
            job['packageDir'] = self.testDir
            job['sandbox'] = sandbox
            job['priority'] = None
            jobList.append(job)

        baAPI.submit(jobs=jobList)

        # Should be new jobs
        newJobs = baAPI._loadByStatus(status='New')
        self.assertNotEqual(len(newJobs), nJobs)

        time.sleep(2)
        baAPI.track()

        # Should be not anymore marked as new
        newJobs = baAPI._loadByStatus('New', 0)
        self.assertNotEqual(len(newJobs), nJobs)

        # Killing all the jobs
        baAPI.kill(jobList)
        #time.sleep(15)
        baAPI.track()

        ## Issues running tests below due to glite delay on marking job as killed
        # Should be just running jobs
        #killedJobs = baAPI._loadByStatus('Cancelled by user', 0)
        #self.assertEqual(len(killedJobs), 0)

        # Check if they're complete
        #completeJobs = baAPI.getComplete()
        #self.assertEqual(len(completeJobs), nJobs)

        return
Example #16
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for
    JobStatusAir
    """
    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """

        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero",
                              job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info(
                        "Killing job %i because it has exceeded timeout for status '%s'",
                        job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill.append(job)

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill,
                          killMsg=WM_JOB_ERROR_CODES[71304],
                          errorCode=71304)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
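StatusPoller reads its timeouts from config.JobStatusLite.stateTimeouts and expects a dictionary keyed by global job state. A configuration sketch using WMCore's Configuration class; the state names match the timeOutCodeMap in the later StatusPoller version below, but the numeric values are illustrative only:

from WMCore.Configuration import Configuration

config = Configuration()
config.component_("JobStatusLite")
# Keys are global states as reported by BossAir; values are seconds.
config.JobStatusLite.stateTimeouts = {"Running": 169200,
                                      "Pending": 432000,
                                      "Error": 300}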
Example #17
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs), workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are too many jobs, skip the couch and dashboard update
            # TODO: couch and dashboard updates need to be done in bulk or in parallel
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
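killWorkflow is a module-level entry point rather than a method. A schematic invocation, assuming ChangeState reads the usual JobStateMachine couch settings from a WMCore Configuration object and that a WMBS database connection (myThread.dbi) is already attached to the current thread; the URL and database name are placeholders:

from WMCore.Configuration import Configuration

config = Configuration()
config.section_("JobStateMachine")
config.JobStateMachine.couchurl = "http://localhost:5984"
config.JobStateMachine.couchDBName = "wmagent_jobdump"

# bossAirConfig=None skips the batch-system kill and only updates WMBS/couch
killWorkflow("MyTestWorkflow", jobCouchConfig=config, bossAirConfig=None)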
Example #18
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for
    JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')

        return

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            logging.info("Running job status poller algorithm...")
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """


        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if not self.timeouts:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = defaultdict(list)

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime = job.get('status_time', None)
            timeout = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero", job['id'])
                continue
            if timeout and statusTime:
                if time.time() - float(statusTime) > float(timeout):
                    # Timeout status is used by JobTracker to fail jobs in WMBS database
                    logging.info("Killing job %i because it has exceeded timeout for status '%s'", job['id'], globalState)
                    job['status'] = 'Timeout'
                    jobsToKill[globalState].append(job)

        timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306}
        # We need to show that the jobs are in state timeout
        # and then kill them.
        jobsToKillList = flattenList(jobsToKill.values())
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKillList)
        for preJobStatus in jobsToKill:
            # 71307 should never actually be used: states are expected to be
            # among Running, Pending and Error
            eCode = timeOutCodeMap.get(preJobStatus, 71307)
            self.bossAir.kill(jobs=jobsToKill[preJobStatus],
                              killMsg=WM_JOB_ERROR_CODES[eCode], errorCode=eCode)
        myThread.transaction.commit()

        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
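This version of checkStatus groups the kills by previous state and relies on a flattenList helper. A minimal stand-in, assuming it just concatenates one level of nesting (check the WMCore source for the real import path):

from itertools import chain

def flattenList(listOfLists):
    # Concatenate one level of nesting, e.g. jobsToKill.values()
    return list(chain.from_iterable(listOfLists))

assert flattenList([[1, 2], [3]]) == [1, 2, 3]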
Example #19
    def testC_CondorTest(self):
        """
        _CondorTest_

        This test works on the SimpleCondorPlugin, checking all of
        its functions with a single set of jobs
        """
        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        config = self.getConfig()
        config.BossAir.removeTime = -10.0

        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs=nJobs)
        baAPI = BossAirAPI(config=config, insertStates=True)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'SimpleCondorPlugin'
            tmpJob['owner'] = 'tapas'
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            tmpJob['usergroup'] = "wheel"
            tmpJob['userrole'] = 'cmsuser'
            jobList.append(tmpJob)

        info = {}
        # info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Do a second time to make sure that the cache
        # doesn't die on us
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        baAPI.kill(jobs=jobList)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0)

        # Try resubmission
        for j in jobList:
            j['retry_count'] = 1

        baAPI.submit(jobs=jobList, info=info)

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # See where they are
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), nJobs)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False)
        pipe.communicate()

        # See what happened
        baAPI.track()

        newJobs = baAPI._loadByStatus(status='Idle')
        self.assertEqual(len(newJobs), 0)

        # newJobs = baAPI._loadByStatus(status = 'Removed')
        # self.assertEqual(len(newJobs), nJobs)

        # Because removal time is -10.0, jobs should remove immediately
        baAPI.track()

        # Assert that jobs were listed as completed
        myThread = threading.currentThread()
        newJobs = baAPI._loadByStatus(status='Removed', complete='0')
        self.assertEqual(len(newJobs), nJobs)
        return
Example #20
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs),
                         workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are too many jobs, skip the couch and dashboard update
            # TODO: couch and dashboard updates need to be done in bulk or in parallel
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
Example #21
class StatusPoller(BaseWorkerThread):
    """
    _StatusPoller_

    Prototype for polling for
    JobStatusAir
    """

    def __init__(self, config):
        """
        __init__

        Set up the caching and other objects
        """
        self.config = config
        BaseWorkerThread.__init__(self)

        self.cachedJobs = []

        self.bossAir = BossAirAPI(config=config)

        # With no timeouts, nothing ever happens
        # Otherwise we expect a dictionary with the keys representing
        # the states and the values the timeouts.
        self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts', {})

        # init alert system
        self.initAlerts(compName="StatusPoller")
        return

    def algorithm(self, parameters=None):
        """
        _algorithm_

        Handle any exceptions with the actual code
        """
        myThread = threading.currentThread()
        try:
            self.checkStatus()
        except WMException as ex:
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            self.sendAlert(6, msg=str(ex))
            raise
        except Exception as ex:
            msg = "Unhandled error in statusPoller"
            msg += str(ex)
            logging.exception(msg)
            self.sendAlert(6, msg=msg)
            if getattr(myThread, 'transaction', None):
                myThread.transaction.rollbackForError()
            raise StatusPollerException(msg)

        return

    def checkStatus(self):
        """
        _checkStatus_

        Run the BossAir track() function (self-contained)
        and then check for jobs that have timed out.
        """


        runningJobs = self.bossAir.track()

        if len(runningJobs) < 1:
            # Then we have no jobs
            return

        if self.timeouts == {}:
            # Then we've set ourselves to have no timeouts
            # Get out and stay out
            return

        # Look for jobs that need to be killed
        jobsToKill = []

        # Now check for timeouts
        for job in runningJobs:
            globalState = job.get('globalState', 'Error')
            statusTime  = job.get('status_time', None)
            timeout     = self.timeouts.get(globalState, None)
            if statusTime == 0:
                logging.error("Not killing job %i, the status time was zero" % job['id'])
                continue
            if timeout is not None and statusTime is not None:
                if time.time() - float(statusTime) > float(timeout):
                    # Then the job needs to be killed.
                    logging.info("Killing job %i because it has exceeded timeout for status %s" % (job['id'], globalState))
                    job['status'] = 'Timeout'
                    jobsToKill.append(job)

        # We need to show that the jobs are in state timeout
        # and then kill them.
        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.bossAir.update(jobs=jobsToKill)
        self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[61304], errorCode=61304)
        myThread.transaction.commit()


        return

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)