Esempio n. 1
0
    def testJobKilling(self):
        """
        _testJobKilling_

        Test that we can successfully set jobs to the killed state
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname = "Locations.New")
        locationAction.execute("site1", seName = "somese.cern.ch")

        testWorkflow = Workflow(spec = "spec.xml", owner = "Steve",
                                name = "wf001", task = self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name = "TestFileset")
        testFileset.create()

        for i in range(4):
            newFile = File(lfn = "File%s" % i, locations = set(["somese.cern.ch"]))
            newFile.create()
            testFileset.addFile(newFile)

        testFileset.commit()
        testSubscription = Subscription(fileset = testFileset,
                                        workflow = testWorkflow,
                                        split_algo = "FileBased")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package = "WMCore.WMBS",
                              subscription = testSubscription)
        jobGroup = jobFactory(files_per_job = 1)[0]

        assert len(jobGroup.jobs) == 4, \
               "Error: Splitting should have created four jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Processing"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"
        testJobC = jobGroup.jobs[2]
        testJobC["user"] = "******"
        testJobC["group"] = "DMWM"
        testJobC["taskType"] = "Processing"
        testJobD = jobGroup.jobs[3]
        testJobD["user"] = "******"
        testJobD["group"] = "DMWM"
        testJobD["taskType"] = "Processing"

        change.persist([testJobA], "created", "new")
        change.persist([testJobB], "jobfailed", "executing")
        change.persist([testJobC, testJobD], "executing", "created")

        change.persist([testJobA], "killed", "created")
        change.persist([testJobB], "killed", "jobfailed")
        change.persist([testJobC, testJobD], "killed", "executing")

        for job in [testJobA, testJobB, testJobC, testJobD]:
            job.load()
            self.assertEqual(job['retry_count'], 99999)
            self.assertEqual(job['state'], 'killed')

        return
Esempio n. 2
0
    def testPersist(self):
        """
        _testPersist_

        This is the test class for function Propagate from module ChangeState
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname = "Locations.New")
        locationAction.execute("site1", seName = "somese.cern.ch")

        testWorkflow = Workflow(spec = "spec.xml", owner = "Steve",
                                name = "wf001", task = self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name = "TestFileset")
        testFileset.create()

        for i in range(4):
            newFile = File(lfn = "File%s" % i, locations = set(["somese.cern.ch"]))
            newFile.create()
            testFileset.addFile(newFile)

        testFileset.commit()
        testSubscription = Subscription(fileset = testFileset,
                                        workflow = testWorkflow,
                                        split_algo = "FileBased")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package = "WMCore.WMBS",
                              subscription = testSubscription)
        jobGroup = jobFactory(files_per_job = 1)[0]

        assert len(jobGroup.jobs) == 4, \
               "Error: Splitting should have created four jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Processing"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"
        testJobC = jobGroup.jobs[2]
        testJobC["user"] = "******"
        testJobC["group"] = "DMWM"
        testJobC["taskType"] = "Processing"
        testJobD = jobGroup.jobs[3]
        testJobD["user"] = "******"
        testJobD["group"] = "DMWM"
        testJobD["taskType"] = "Processing"

        change.persist([testJobA, testJobB], "created", "new")
        change.persist([testJobC, testJobD], "new", "none")

        stateDAO = self.daoFactory(classname = "Jobs.GetState")

        jobAState = stateDAO.execute(id = testJobA["id"])
        jobBState = stateDAO.execute(id = testJobB["id"])
        jobCState = stateDAO.execute(id = testJobC["id"])
        jobDState = stateDAO.execute(id = testJobD["id"])

        assert jobAState == "created" and jobBState =="created" and \
               jobCState == "new" and jobDState == "new", \
               "Error: Jobs didn't change state correctly."

        return
Esempio n. 3
0
    def testRetryCount(self):
        """
        _testRetryCount_

        Verify that the retry count is incremented when we move out of the
        submitcooloff or jobcooloff state.
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname = "Locations.New")
        locationAction.execute("site1", seName = "somese.cern.ch")

        testWorkflow = Workflow(spec = "spec.xml", owner = "Steve",
                                name = "wf001", task = self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name = "TestFileset")
        testFileset.create()

        for i in range(4):
            newFile = File(lfn = "File%s" % i, locations = set(["somese.cern.ch"]))
            newFile.create()
            testFileset.addFile(newFile)

        testFileset.commit()
        testSubscription = Subscription(fileset = testFileset,
                                        workflow = testWorkflow,
                                        split_algo = "FileBased")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package = "WMCore.WMBS",
                              subscription = testSubscription)
        jobGroup = jobFactory(files_per_job = 1)[0]

        assert len(jobGroup.jobs) == 4, \
               "Error: Splitting should have created four jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Processing"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"
        testJobC = jobGroup.jobs[2]
        testJobC["user"] = "******"
        testJobC["group"] = "DMWM"
        testJobC["taskType"] = "Processing"
        testJobD = jobGroup.jobs[3]
        testJobD["user"] = "******"
        testJobD["group"] = "DMWM"
        testJobD["taskType"] = "Processing"

        change.persist([testJobA], "created", "submitcooloff")
        change.persist([testJobB], "created", "jobcooloff")
        change.persist([testJobC, testJobD], "new", "none")

        testJobA.load()
        testJobB.load()
        testJobC.load()
        testJobD.load()

        assert testJobA["retry_count"] == 1, \
               "Error: Retry count is wrong."
        assert testJobB["retry_count"] == 1, \
               "Error: Retry count is wrong."
        assert testJobC["retry_count"] == 0, \
               "Error: Retry count is wrong."
        assert testJobD["retry_count"] == 0, \
               "Error: Retry count is wrong."

        return
Esempio n. 4
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs),
                         workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error(
                "Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are to many jobs skip the couch and dashboard update
            # TODO: couch and dashboard need to be updated or parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
Esempio n. 5
0
    def testJobKilling(self):
        """
        _testJobKilling_

        Test that we can successfully set jobs to the killed state
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute("site1", pnn="T2_CH_CERN")

        testWorkflow = Workflow(spec=self.specUrl, owner="Steve",
                                name="wf001", task=self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name="TestFileset")
        testFileset.create()

        for i in range(4):
            newFile = File(lfn="File%s" % i, locations=set(["T2_CH_CERN"]))
            newFile.create()
            testFileset.addFile(newFile)

        testFileset.commit()
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)
        jobGroup = jobFactory(files_per_job=1)[0]

        assert len(jobGroup.jobs) == 4, \
               "Error: Splitting should have created four jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Processing"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"
        testJobC = jobGroup.jobs[2]
        testJobC["user"] = "******"
        testJobC["group"] = "DMWM"
        testJobC["taskType"] = "Processing"
        testJobD = jobGroup.jobs[3]
        testJobD["user"] = "******"
        testJobD["group"] = "DMWM"
        testJobD["taskType"] = "Processing"

        change.persist([testJobA], "created", "new")
        change.persist([testJobB], "jobfailed", "executing")
        change.persist([testJobC, testJobD], "executing", "created")

        change.persist([testJobA], "killed", "created")
        change.persist([testJobB], "killed", "jobfailed")
        change.persist([testJobC, testJobD], "killed", "executing")

        for job in [testJobA, testJobB, testJobC, testJobD]:
            job.load()
            self.assertEqual(job['retry_count'], 99999)
            self.assertEqual(job['state'], 'killed')

        return
Esempio n. 6
0
    def testRetryCount(self):
        """
        _testRetryCount_

        Verify that the retry count is incremented when we move out of the
        submitcooloff or jobcooloff state.
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname = "Locations.New")
        locationAction.execute("site1", pnn = "T2_CH_CERN")

        testWorkflow = Workflow(spec=self.specUrl, owner="Steve",
                                name="wf001", task=self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name="TestFileset")
        testFileset.create()

        for i in range(4):
            newFile = File(lfn="File%s" % i, locations=set(["T2_CH_CERN"]))
            newFile.create()
            testFileset.addFile(newFile)

        testFileset.commit()
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)
        jobGroup = jobFactory(files_per_job=1)[0]

        assert len(jobGroup.jobs) == 4, \
               "Error: Splitting should have created four jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Processing"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"
        testJobC = jobGroup.jobs[2]
        testJobC["user"] = "******"
        testJobC["group"] = "DMWM"
        testJobC["taskType"] = "Processing"
        testJobD = jobGroup.jobs[3]
        testJobD["user"] = "******"
        testJobD["group"] = "DMWM"
        testJobD["taskType"] = "Processing"

        change.persist([testJobA], "created", "submitcooloff")
        change.persist([testJobB], "created", "jobcooloff")
        change.persist([testJobC, testJobD], "new", "none")

        testJobA.load()
        testJobB.load()
        testJobC.load()
        testJobD.load()

        assert testJobA["retry_count"] == 1, \
               "Error: Retry count is wrong."
        assert testJobB["retry_count"] == 1, \
               "Error: Retry count is wrong."
        assert testJobC["retry_count"] == 0, \
               "Error: Retry count is wrong."
        assert testJobD["retry_count"] == 0, \
               "Error: Retry count is wrong."

        return
Esempio n. 7
0
    def testPersist(self):
        """
        _testPersist_

        This is the test class for function Propagate from module ChangeState
        """
        change = ChangeState(self.config, "changestate_t")

        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute("site1", pnn="T2_CH_CERN")

        testWorkflow = Workflow(spec=self.specUrl, owner="Steve",
                                name="wf001", task=self.taskName)
        testWorkflow.create()
        testFileset = Fileset(name="TestFileset")
        testFileset.create()

        for i in range(4):
            newFile = File(lfn="File%s" % i, locations=set(["T2_CH_CERN"]))
            newFile.create()
            testFileset.addFile(newFile)

        testFileset.commit()
        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)
        jobGroup = jobFactory(files_per_job=1)[0]

        assert len(jobGroup.jobs) == 4, \
               "Error: Splitting should have created four jobs."

        testJobA = jobGroup.jobs[0]
        testJobA["user"] = "******"
        testJobA["group"] = "DMWM"
        testJobA["taskType"] = "Processing"
        testJobB = jobGroup.jobs[1]
        testJobB["user"] = "******"
        testJobB["group"] = "DMWM"
        testJobB["taskType"] = "Processing"
        testJobC = jobGroup.jobs[2]
        testJobC["user"] = "******"
        testJobC["group"] = "DMWM"
        testJobC["taskType"] = "Processing"
        testJobD = jobGroup.jobs[3]
        testJobD["user"] = "******"
        testJobD["group"] = "DMWM"
        testJobD["taskType"] = "Processing"

        change.persist([testJobA, testJobB], "created", "new")
        change.persist([testJobC, testJobD], "new", "none")

        stateDAO = self.daoFactory(classname="Jobs.GetState")

        jobAState = stateDAO.execute(id=testJobA["id"])
        jobBState = stateDAO.execute(id=testJobB["id"])
        jobCState = stateDAO.execute(id=testJobC["id"])
        jobDState = stateDAO.execute(id=testJobD["id"])

        assert jobAState == "created" and jobBState =="created" and \
               jobCState == "new" and jobDState == "new", \
               "Error: Jobs didn't change state correctly."

        return
Esempio n. 8
0
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                killableJobs.append(liveJob)
        # Now kill them
        try:
            logging.info("Killing %d jobs for workflow: %s", len(killableJobs), workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong. Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # if there are to many jobs skip the couch and dashboard update
            # TODO: couch and dashboard need to be updated or parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return