Code example #1
 def testBatchResourceLimits(self):
     jobNode1 = JobNode(command="sleep 1000",
                        requirements=dict(memory=1 << 30, cores=1,
                                          disk=1000, preemptable=preemptable),
                        jobName='testResourceLimits', unitName=None,
                        jobStoreID='1')
     job1 = self.batchSystem.issueBatchJob(jobNode1)
     self.assertIsNotNone(job1)
     jobNode2 = JobNode(command="sleep 1000",
                        requirements=dict(memory=2 << 30, cores=1,
                                          disk=1000, preemptable=preemptable),
                        jobName='testResourceLimits', unitName=None,
                        jobStoreID='2')
     job2 = self.batchSystem.issueBatchJob(jobNode2)
     self.assertIsNotNone(job2)
     batches = self._getBatchList()
     self.assertEqual(len(batches), 2)
     # It would be better to directly check that the batches have the correct memory and cpu
     # values, but Parasol seems to slightly change the values sometimes.
     self.assertNotEqual(batches[0]['ram'], batches[1]['ram'])
     # Need to kill one of the jobs because there are only two cores available
     self.batchSystem.killBatchJobs([job2])
     job3 = self.batchSystem.issueBatchJob(jobNode1)
     self.assertIsNotNone(job3)
     batches = self._getBatchList()
     self.assertEqual(len(batches), 1)
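The two JobNodes above differ only in their memory requirement: 1 << 30 bytes is 1 GiB and 2 << 30 bytes is 2 GiB, which is why the test expects Parasol to place them in two separate batches with different 'ram' values. As a minimal sketch (reusing the JobNode signature shown above; the helper name makeSleepJob is hypothetical and not part of Toil), the repeated construction could be factored like this:

from toil.job import JobNode  # import path assumed from the Toil version these tests target

def makeSleepJob(jobStoreID, memory, preemptable=False):
    # Hypothetical convenience wrapper mirroring the JobNode calls in the test above.
    # memory and disk are in bytes; cores is a CPU count.
    return JobNode(command="sleep 1000",
                   requirements=dict(memory=memory, cores=1,
                                     disk=1000, preemptable=preemptable),
                   jobName='testResourceLimits', unitName=None,
                   jobStoreID=jobStoreID)

# e.g. makeSleepJob('1', 1 << 30) requests 1 GiB and makeSleepJob('2', 2 << 30) requests 2 GiB.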
Code example #2
        def testSetEnv(self):
            # Parasol disobeys shell rules and stupidly splits the command at the space character
            # before exec'ing it, whether the space is quoted, escaped or not. This means that we
            # can't have escaped or quoted spaces in the command line. So we can't use bash -c
            # '...' or python -c '...'. The safest thing to do here is to script the test and
            # invoke that script rather than inline the test via -c.
            def assertEnv():
                import os, sys
                sys.exit(0 if os.getenv('FOO') == 'bar' else 42)

            script_body = dedent('\n'.join(getsource(assertEnv).split('\n')[1:]))
            with tempFileContaining(script_body, suffix='.py') as script_path:
                # First, ensure that the test fails if the variable is *not* set
                command = sys.executable + ' ' + script_path
                jobNode4 = JobNode(command=command, jobName='test4', unitName=None,
                                   jobStoreID='4', requirements=defaultRequirements)
                job4 = self.batchSystem.issueBatchJob(jobNode4)
                jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
                self.assertEqual(exitStatus, 42)
                self.assertEqual(jobID, job4)
                # Now set the variable and ensure that it is present
                self.batchSystem.setEnv('FOO', 'bar')
                jobNode5 = JobNode(command=command, jobName='test5', unitName=None,
                                   jobStoreID='5', requirements=defaultRequirements)
                job5 = self.batchSystem.issueBatchJob(jobNode5)
                jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
                self.assertEqual(exitStatus, 0)
                self.assertEqual(jobID, job5)
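Here the body of assertEnv is extracted with inspect.getsource, dedented, written to a temporary .py script, and invoked via sys.executable, so the command handed to Parasol needs no quoting at all. A minimal sketch of what a helper like tempFileContaining might look like (an assumption about its behaviour, not Toil's actual implementation): write the given text to a named temporary file, yield the path, and delete the file afterwards.

import os
import tempfile
from contextlib import contextmanager

@contextmanager
def tempFileContaining(content, suffix=''):
    # Sketch: create a temp file holding `content`, yield its path, then clean up.
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(content)
        yield path
    finally:
        os.unlink(path)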
Code example #3
File: batchSystemTest.py  Project: diekhans/toil
        def testSetEnv(self):
            # Parasol disobeys shell rules and stupidly splits the command at
            # the space character into arguments before exec'ing it, whether
            # the space is quoted, escaped or not.

            script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi'

            # Escape the semicolons
            script_protected = script_shell.replace(';', '\;')

            # Turn into a string which convinces bash to take all args and paste them back together and run them
            command = "bash -c \"\\${@}\" bash eval " + script_protected
            log.critical(command)
            jobNode4 = JobNode(command=command,
                               jobName='test4',
                               unitName=None,
                               jobStoreID='4',
                               requirements=defaultRequirements)
            job4 = self.batchSystem.issueBatchJob(jobNode4)
            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)
            self.assertEqual(exitStatus, 42)
            self.assertEqual(jobID, job4)
            # Now set the variable and ensure that it is present
            self.batchSystem.setEnv('FOO', 'bar')
            jobNode5 = JobNode(command=command,
                               jobName='test5',
                               unitName=None,
                               jobStoreID='5',
                               requirements=defaultRequirements)
            job5 = self.batchSystem.issueBatchJob(jobNode5)
            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)
            self.assertEqual(exitStatus, 23)
            self.assertEqual(jobID, job5)
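For reference, the command-string construction above can be reproduced standalone to inspect the exact value handed to Parasol: the escaped semicolons survive Parasol's naive space splitting, and bash's "${@}" expansion pastes the split words back together so that eval can run the reassembled script.

script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi'
script_protected = script_shell.replace(';', '\\;')  # escape the semicolons
command = "bash -c \"\\${@}\" bash eval " + script_protected
print(command)
# bash -c "\${@}" bash eval if [ "x${FOO}" == "xbar" ] \; then exit 23 \; else exit 42 \; fi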
Code example #4
        def testRunJobs(self):
            jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                               jobStoreID='1', requirements=defaultRequirements)
            jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                               jobStoreID='2', requirements=defaultRequirements)
            job1 = self.batchSystem.issueBatchJob(jobNode1)
            job2 = self.batchSystem.issueBatchJob(jobNode2)

            issuedIDs = self._waitForJobsToIssue(2)
            self.assertEqual(set(issuedIDs), {job1, job2})

            # Now at some point we want these jobs to become running
            # But since we may be testing against a live cluster (Kubernetes)
            # we want to handle weird cases and high cluster load as much as we can.

            # Wait a bit for any Dockers to download and for the
            # jobs to have a chance to start.
            # TODO: We insist on neither of these ever finishing when we test
            # getUpdatedBatchJob, and the sleep time is longer than the time we
            # should spend waiting for both to start, so if our cluster can
            # only run one job at a time, we will fail the test.
            runningJobIDs = self._waitForJobsToStart(2, tries=120)
            self.assertEqual(set(runningJobIDs), {job1, job2})

            # Killing the jobs instead of allowing them to complete means this test can run very
            # quickly if the batch system issues and starts the jobs quickly.
            self.batchSystem.killBatchJobs([job1, job2])
            self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

            # Issue a job and then allow it to finish by itself, causing it to be added to the
            # updated jobs queue.
            # We would like to have this touch something on the filesystem and
            # then check for it having happened, but we can't guarantee that
            # the batch system will run against the same filesystem we are
            # looking at.
            jobNode3 = JobNode(command="mktemp -d", jobName='test3', unitName=None,
                               jobStoreID='3', requirements=defaultRequirements)
            job3 = self.batchSystem.issueBatchJob(jobNode3)

            jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
            jobID, exitStatus, wallTime = jobUpdateInfo.jobID, jobUpdateInfo.exitStatus, jobUpdateInfo.wallTime
            log.info('Third job completed: {} {} {}'.format(jobID, exitStatus, wallTime))

            # Since the first two jobs were killed, the only job in the updated jobs queue should
            # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
            # fail with jobID being equal to job1 or job2.
            self.assertEqual(jobID, job3)
            self.assertEqual(exitStatus, 0)
            if self.supportsWallTime():
                self.assertTrue(wallTime > 0)
            else:
                self.assertIsNone(wallTime)
            # TODO: Work out a way to check if the job we asked to run actually ran.
            # Don't just believe the batch system, but don't assume it ran on this machine either.
            self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

            # Make sure killBatchJobs can handle jobs that don't exist
            self.batchSystem.killBatchJobs([10])
Code example #5
File: batchSystemTest.py  Project: diekhans/toil
        def testRunJobs(self):
            jobNode1 = JobNode(command='sleep 1000',
                               jobName='test1',
                               unitName=None,
                               jobStoreID='1',
                               requirements=defaultRequirements)
            jobNode2 = JobNode(command='sleep 1000',
                               jobName='test2',
                               unitName=None,
                               jobStoreID='2',
                               requirements=defaultRequirements)
            job1 = self.batchSystem.issueBatchJob(jobNode1)
            job2 = self.batchSystem.issueBatchJob(jobNode2)

            issuedIDs = self._waitForJobsToIssue(2)
            self.assertEqual(set(issuedIDs), {job1, job2})

            runningJobIDs = self._waitForJobsToStart(2)
            self.assertEqual(set(runningJobIDs), {job1, job2})

            # Killing the jobs instead of allowing them to complete means this test can run very
            # quickly if the batch system issues and starts the jobs quickly.
            self.batchSystem.killBatchJobs([job1, job2])
            self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

            # Issue a job and then allow it to finish by itself, causing it to be added to the
            # updated jobs queue.
            # We would like to have this touch something on the filesystem and
            # then check for it having happened, but we can't guarantee that
            # the batch system will run against the same filesystem we are
            # looking at.
            jobNode3 = JobNode(command="mktemp -d",
                               jobName='test3',
                               unitName=None,
                               jobStoreID='3',
                               requirements=defaultRequirements)
            job3 = self.batchSystem.issueBatchJob(jobNode3)

            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)

            # Since the first two jobs were killed, the only job in the updated jobs queue should
            # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
            # fail with jobID being equal to job1 or job2.
            self.assertEqual(jobID, job3)
            self.assertEqual(exitStatus, 0)
            if self.supportsWallTime():
                self.assertTrue(wallTime > 0)
            else:
                self.assertIsNone(wallTime)
            # TODO: Work out a way to check if the job we asked to run actually ran.
            # Don't just believe the batch system, but don't assume it ran on this machine either.
            self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

            # Make sure killBatchJobs can handle jobs that don't exist
            self.batchSystem.killBatchJobs([10])
Code example #6
        def testRunJobs(self):
            testPath = os.path.join(self.tempDir, "test.txt")
            jobNode1 = JobNode(command='sleep 1000',
                               jobName='test1',
                               unitName=None,
                               jobStoreID='1',
                               requirements=defaultRequirements)
            jobNode2 = JobNode(command='sleep 1000',
                               jobName='test2',
                               unitName=None,
                               jobStoreID='2',
                               requirements=defaultRequirements)
            job1 = self.batchSystem.issueBatchJob(jobNode1)
            job2 = self.batchSystem.issueBatchJob(jobNode2)

            issuedIDs = self._waitForJobsToIssue(2)
            self.assertEqual(set(issuedIDs), {job1, job2})

            runningJobIDs = self._waitForJobsToStart(2)
            self.assertEqual(set(runningJobIDs), {job1, job2})

            # Killing the jobs instead of allowing them to complete means this test can run very
            # quickly if the batch system issues and starts the jobs quickly.
            self.batchSystem.killBatchJobs([job1, job2])
            self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

            # Issue a job and then allow it to finish by itself, causing it to be added to the
            # updated jobs queue.
            self.assertFalse(os.path.exists(testPath))
            jobNode3 = JobNode(command="touch %s" % testPath,
                               jobName='test3',
                               unitName=None,
                               jobStoreID='3',
                               requirements=defaultRequirements)
            job3 = self.batchSystem.issueBatchJob(jobNode3)

            jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(
                maxWait=1000)

            # Since the first two jobs were killed, the only job in the updated jobs queue should
            # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
            # fail with jobID being equal to job1 or job2.
            self.assertEqual(exitStatus, 0)
            self.assertEqual(jobID, job3)
            if self.supportsWallTime():
                self.assertTrue(wallTime > 0)
            else:
                self.assertIsNone(wallTime)
            if not os.path.exists(testPath):
                time.sleep(20)
            self.assertTrue(os.path.exists(testPath))
            self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

            # Make sure killBatchJobs can handle jobs that don't exist
            self.batchSystem.killBatchJobs([10])
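This variant checks for the side effect on the filesystem, but falls back on a single fixed time.sleep(20) if the file has not appeared yet. A small sketch of a polling alternative (the helper name waitForPath is hypothetical, not part of Toil) that retries until a timeout instead of sleeping once:

import os
import time

def waitForPath(path, timeout=20.0, interval=0.5):
    # Hypothetical helper: poll for `path` to appear, for up to `timeout` seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(path):
            return True
        time.sleep(interval)
    return os.path.exists(path)

# e.g. self.assertTrue(waitForPath(testPath)) in place of the sleep-and-recheck above.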
Code example #7
    def _processFailedSuccessors(self, jobGraph):
        """Some of the jobs successors failed then either fail the job
        or restart it if it has retries left and is a checkpoint job"""

        if jobGraph.jobStoreID in self.toilState.servicesIssued:
            # The job has services running; signal for them to be killed.
            # Once they are killed, the jobGraph will be re-added to
            # the updatedJobs set and then scheduled to be removed.
            logger.debug("Telling job: %s to terminate its services due to successor failure",
                         jobGraph.jobStoreID)
            self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID],
                                             error=True)
        elif jobGraph.jobStoreID in self.toilState.successorCounts:
            # The job has non-service jobs running; wait for them to finish.
            # The job will be re-added to the updated jobs when these jobs
            # are done.
            logger.debug("Job %s with ID: %s with failed successors still has successor jobs running",
                         jobGraph, jobGraph.jobStoreID)
        elif jobGraph.checkpoint is not None and jobGraph.remainingRetryCount > 1:
            # If the job is a checkpoint and has remaining retries then reissue it.
            # The logic behind using > 1 rather than > 0 here: Since this job has
            # been tried once (without decreasing its retry count as the job
            # itself was successful), and its subtree failed, it shouldn't be retried
            # unless it has more than 1 try.
            logger.warn('Job: %s is being restarted as a checkpoint after the total '
                        'failure of jobs in its subtree.', jobGraph.jobStoreID)
            self.issueJob(JobNode.fromJobGraph(jobGraph))
        else:
            # Mark it totally failed
            logger.debug("Job %s is being processed as completely failed", jobGraph.jobStoreID)
            self.processTotallyFailedJob(jobGraph)
Code example #8
File: clusterScalerTest.py  Project: ratschlab/toil
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs, jobShape):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain that
        autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check the
        # jobs are completed okay, then print the amount of worker time expended and the total
        # number of worker nodes used.

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        mock.start()
        clusterScaler = ScalerThread(mock, mock, config)
        clusterScaler.start()
        try:
            # Add 100 jobs to complete
            list(map(lambda x: mock.addJob(jobShape=jobShape),
                     list(range(numJobs))))
            list(map(lambda x: mock.addJob(jobShape=jobShape, preemptable=True),
                     list(range(numPreemptableJobs))))

            # Add some completed jobs
            for preemptable in (True, False):
                if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                    # Add 1000 random jobs
                    for _ in range(1000):
                        x = mock.getNodeShape(nodeType=jobShape)
                        iJ = JobNode(jobStoreID=1,
                                     requirements=dict(
                                         memory=random.choice(list(range(1, x.memory))),
                                         cores=random.choice(list(range(1, x.cores))),
                                         disk=random.choice(list(range(1, x.disk))),
                                         preemptable=preemptable),
                                     command=None,
                                     jobName='testClusterScaling', unitName='')
                        clusterScaler.addCompletedJob(iJ, random.choice(list(range(1, x.wallTime))))

            startTime = time.time()
            # Wait while the cluster processes the jobs
            while (mock.getNumberOfJobsIssued(preemptable=False) > 0
                   or mock.getNumberOfJobsIssued(preemptable=True) > 0
                   or mock.getNumberOfNodes() > 0 or mock.getNumberOfNodes(preemptable=True) > 0):
                logger.debug("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                            "preemptable queue size: %s, preemptable workers: %s" %
                            (mock.getNumberOfJobsIssued(preemptable=False),
                             mock.getNumberOfNodes(preemptable=False),
                             mock.getNumberOfJobsIssued(preemptable=True),
                             mock.getNumberOfNodes(preemptable=True)))
                clusterScaler.check()
                time.sleep(0.5)
            logger.debug("We waited %s for cluster to finish" % (time.time() - startTime))
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Print some info about the autoscaling
        logger.debug("Total-jobs: %s: Max-workers: %s, "
                     "Total-worker-time: %s, Worker-time-per-job: %s" %
                    (mock.totalJobs, sum(mock.maxWorkers.values()),
                     mock.totalWorkerTime,
                     old_div(mock.totalWorkerTime, mock.totalJobs) if mock.totalJobs > 0 else 0.0))
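The old_div call in the final log line comes from the Python 2/3 compatibility layer (assumed here to be past.utils.old_div from the future package): it reproduces Python 2's / operator, using floor division when both operands are integers and true division otherwise, so the worker-time-per-job figure stays an integer when both inputs are.

from past.utils import old_div  # assumed import; provided by the `future` compatibility package

print(old_div(7, 2))    # 3   -> both ints, floor division as in Python 2
print(old_div(7.0, 2))  # 3.5 -> a float operand gives true division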
Code example #9
File: jobStoreTest.py  Project: brainstorm/toil
    def testOverlargeJob(self):
        master = self.master
        masterRequirements = dict(memory=12, cores=34, disk=35, preemptable=True)
        overlargeJobNodeOnMaster = JobNode(command='master-overlarge',
                                    requirements=masterRequirements,
                                    jobName='test-overlarge', unitName='onMaster',
                                    jobStoreID=None, predecessorNumber=0)

        # Make the pickled size of the job larger than 256K
        with open("/dev/urandom", "r") as random:
            overlargeJobNodeOnMaster.jobName = random.read(512 * 1024)
        overlargeJobOnMaster = master.create(overlargeJobNodeOnMaster)
        self.assertTrue(master.exists(overlargeJobOnMaster.jobStoreID))
        overlargeJobOnMasterDownloaded = master.load(overlargeJobOnMaster.jobStoreID)
        jobsOnMaster = [job for job in master.jobs()]
        self.assertEqual(jobsOnMaster, [overlargeJobOnMaster])
        master.delete(overlargeJobOnMaster.jobStoreID)
Code example #10
 def setUp(self):
     super(AbstractJobStoreTest.Test, self).setUp()
     self.namePrefix = 'jobstore-test-' + str(uuid.uuid4())
     self.master = self._createJobStore()
     self.config = self._createConfig()
     self.master.initialize(self.config)
     self.arbitraryRequirements = {'memory': 1, 'disk': 2, 'cores': 1, 'preemptable': False}
     self.arbitraryJob = JobNode(command='command',
                                 jobStoreID=None,
                                 jobName='arbitrary', unitName=None,
                                 requirements=self.arbitraryRequirements)
Code example #11
    def testIgnoreNode(self):
        self.batchSystem.ignoreNode('localhost')
        jobNode = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                           jobStoreID='1', requirements=defaultRequirements)
        job = self.batchSystem.issueBatchJob(jobNode)

        issuedID = self._waitForJobsToIssue(1)
        self.assertEqual(set(issuedID), {job})

        runningJobIDs = self._waitForJobsToStart(1)
        # Make sure the job is NOT running
        self.assertEqual(set(runningJobIDs), set({}))
Code example #12
 def test(self):
     # We'll use fractions to avoid rounding errors. Remember that not every fraction can be
     # represented as a floating point number.
     F = Fraction
     # This test isn't general enough to cover every possible value of minCores in
     # SingleMachineBatchSystem. Instead we hard-code a value and assert it.
     minCores = F(1, 10)
     self.assertEquals(float(minCores), SingleMachineBatchSystem.minCores)
     for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
         for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
             for load in (F(1, 10), F(1), F(10)):
                 jobs = int(maxCores / coresPerJob * load)
                 if jobs >= 1 and minCores <= coresPerJob < maxCores:
                     self.assertEquals(maxCores, float(maxCores))
                     bs = SingleMachineBatchSystem(
                         config=hidden.AbstractBatchSystemTest.createConfig(),
                         maxCores=float(maxCores),
                         # Ensure that memory or disk requirements don't get in the way.
                         maxMemory=jobs * 10,
                         maxDisk=jobs * 10)
                     try:
                         jobIds = set()
                         for i in range(0, int(jobs)):
                             jobIds.add(bs.issueBatchJob(JobNode(command=self.scriptCommand(),
                                                                 requirements=dict(
                                                                     cores=float( coresPerJob),
                                                                     memory=1, disk=1,
                                                                     preemptable=preemptable),
                                                                 jobName=str(i), unitName='', jobStoreID=str(i))))
                         self.assertEquals(len(jobIds), jobs)
                         while jobIds:
                             job = bs.getUpdatedBatchJob(maxWait=10)
                             self.assertIsNotNone(job)
                             jobId, status, wallTime = job
                             self.assertEquals(status, 0)
                             # would raise KeyError on absence
                             jobIds.remove(jobId)
                     finally:
                         bs.shutdown()
                     concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                     self.assertEquals(concurrentTasks, 0)
                     log.info('maxCores: {maxCores}, '
                              'coresPerJob: {coresPerJob}, '
                              'load: {load}'.format(**locals()))
                     # This is the key assertion:
                     expectedMaxConcurrentTasks = min(old_div(maxCores, coresPerJob), jobs)
                     self.assertEquals(maxConcurrentTasks, expectedMaxConcurrentTasks)
                     resetCounters(self.counterPath)
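The Fraction values in this test exist because core counts such as minCores = 1/10 have no exact binary floating-point representation, so accumulating them as floats would drift; fractions.Fraction keeps the arithmetic exact until the final conversion with float(...). A tiny illustration:

from fractions import Fraction

# 0.1 has no exact binary float representation, so repeated addition drifts:
print(sum(0.1 for _ in range(10)) == 1.0)            # False
# Fractions store an exact numerator/denominator, so the same sum is exact:
print(sum(Fraction(1, 10) for _ in range(10)) == 1)  # True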
Code example #13
    def processTotallyFailedJob(self, jobGraph):
        """
        Processes a totally failed job.
        """
        # Mark job as a totally failed job
        self.toilState.totalFailedJobs.add(JobNode.fromJobGraph(jobGraph))
        if self.toilMetrics:
            self.toilMetrics.logFailedJob(jobGraph)

        if jobGraph.jobStoreID in self.toilState.serviceJobStoreIDToPredecessorJob: # Is
            # a service job
            logger.debug("Service job is being processed as a totally failed job: %s", jobGraph)

            predecesssorJobGraph = self.toilState.serviceJobStoreIDToPredecessorJob[jobGraph.jobStoreID]

            # This removes the service job as a service of the predecessor
            # and potentially makes the predecessor active
            self._updatePredecessorStatus(jobGraph.jobStoreID)

            # Remove the start flag, if it still exists. This indicates
            # to the service manager that the job has "started"; this prevents
            # the service manager from deadlocking while waiting.
            self.jobStore.deleteFile(jobGraph.startJobStoreID)

            # Signal to any other services in the group that they should
            # terminate. We do this to prevent other services in the set
            # of services from deadlocking waiting for this service to start properly
            if predecesssorJobGraph.jobStoreID in self.toilState.servicesIssued:
                self.serviceManager.killServices(self.toilState.servicesIssued[predecesssorJobGraph.jobStoreID], error=True)
                logger.debug("Job: %s is instructing all the services of its parent job to quit", jobGraph)

            self.toilState.hasFailedSuccessors.add(predecesssorJobGraph.jobStoreID) # This ensures that the
            # job will not attempt to run any of its successors on the stack
        else:
            # Is a non-service job
            assert jobGraph.jobStoreID not in self.toilState.servicesIssued

            # Traverse failed job's successor graph and get the jobStoreID of new successors.
            # Any successor already in toilState.failedSuccessors will not be traversed
            # All successors traversed will be added to toilState.failedSuccessors and returned
            # as a set (unseenSuccessors).
            unseenSuccessors = self.getSuccessors(jobGraph, self.toilState.failedSuccessors,
                                                  self.jobStore)
            logger.debug("Found new failed successors: %s of job: %s", " ".join(
                         unseenSuccessors), jobGraph)

            # For each newly found successor
            for successorJobStoreID in unseenSuccessors:

                # If the successor is a successor of other jobs that have already tried to schedule it
                if successorJobStoreID in self.toilState.successorJobStoreIDToPredecessorJobs:

                    # For each such predecessor job
                    # (we remove the successor from toilState.successorJobStoreIDToPredecessorJobs to avoid doing
                    # this multiple times for each failed predecessor)
                    for predecessorJob in self.toilState.successorJobStoreIDToPredecessorJobs.pop(successorJobStoreID):

                        # Reduce the predecessor job's successor count.
                        self.toilState.successorCounts[predecessorJob.jobStoreID] -= 1

                        # Indicate that it has failed jobs.
                        self.toilState.hasFailedSuccessors.add(predecessorJob.jobStoreID)
                        logger.debug("Marking job: %s as having failed successors (found by "
                                     "reading successors failed job)", predecessorJob)

                        # If the predecessor has no remaining successors, add to list of active jobs
                        assert self.toilState.successorCounts[predecessorJob.jobStoreID] >= 0
                        if self.toilState.successorCounts[predecessorJob.jobStoreID] == 0:
                            self.toilState.updatedJobs.add((predecessorJob, 0))

                            # Remove the predecessor job from the set of jobs with successors.
                            self.toilState.successorCounts.pop(predecessorJob.jobStoreID)

            # If the job has predecessor(s)
            if jobGraph.jobStoreID in self.toilState.successorJobStoreIDToPredecessorJobs:

                # For each predecessor of the job
                for predecessorJobGraph in self.toilState.successorJobStoreIDToPredecessorJobs[jobGraph.jobStoreID]:

                    # Mark the predecessor as failed
                    self.toilState.hasFailedSuccessors.add(predecessorJobGraph.jobStoreID)
                    logger.debug("Totally failed job: %s is marking direct predecessor: %s "
                                 "as having failed jobs", jobGraph, predecessorJobGraph)

                self._updatePredecessorStatus(jobGraph.jobStoreID)
Code example #14
    def _processReadyJob(self, jobGraph, resultStatus):
        logger.debug('Updating status of job %s with ID %s: with result status: %s',
                     jobGraph, jobGraph.jobStoreID, resultStatus)

        if jobGraph in self.serviceManager.jobGraphsWithServicesBeingStarted:
            # This stops a job with services being issued by the serviceManager from
            # being considered further in this loop. This catch is necessary because
            # the job's service's can fail while being issued, causing the job to be
            # added to updated jobs.
            logger.debug("Got a job to update which is still owned by the service "
                         "manager: %s", jobGraph.jobStoreID)
        elif jobGraph.jobStoreID in self.toilState.hasFailedSuccessors:
            self._processFailedSuccessors(jobGraph)
        elif jobGraph.command is not None or resultStatus != 0:
            # The jobGraph has a command; it must be run before any successors.
            # Similarly, if the job previously failed we rerun it, even if it doesn't have a
            # command to run, to eliminate any parts of the stack now completed.
            isServiceJob = jobGraph.jobStoreID in self.toilState.serviceJobStoreIDToPredecessorJob

            # If the job has run out of retries or is a service job whose error flag has
            # been indicated, fail the job.
            if (jobGraph.remainingRetryCount == 0
                or isServiceJob and not self.jobStore.fileExists(jobGraph.errorJobStoreID)):
                self.processTotallyFailedJob(jobGraph)
                logger.warn("Job %s with ID %s is completely failed",
                            jobGraph, jobGraph.jobStoreID)
            else:
                # Otherwise try the job again
                self.issueJob(JobNode.fromJobGraph(jobGraph))
        elif len(jobGraph.services) > 0:
            # the job has services to run, which have not been started, start them
            # Build a map from the service jobs to the job and a map
            # of the services created for the job
            assert jobGraph.jobStoreID not in self.toilState.servicesIssued
            self.toilState.servicesIssued[jobGraph.jobStoreID] = {}
            for serviceJobList in jobGraph.services:
                for serviceTuple in serviceJobList:
                    serviceID = serviceTuple.jobStoreID
                    assert serviceID not in self.toilState.serviceJobStoreIDToPredecessorJob
                    self.toilState.serviceJobStoreIDToPredecessorJob[serviceID] = jobGraph
                    self.toilState.servicesIssued[jobGraph.jobStoreID][serviceID] = serviceTuple

            # Use the service manager to start the services
            self.serviceManager.scheduleServices(jobGraph)

            logger.debug("Giving job: %s to service manager to schedule its jobs", jobGraph.jobStoreID)
        elif len(jobGraph.stack) > 0:
            # There exist successors to run
            self._runJobSuccessors(jobGraph)
        elif jobGraph.jobStoreID in self.toilState.servicesIssued:
            logger.debug("Telling job: %s to terminate its services due to the "
                         "successful completion of its successor jobs",
                         jobGraph)
            self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID], error=False)
        else:
            # There are no remaining tasks to schedule within the jobGraph, but
            # we schedule it anyway to allow it to be deleted. Remove the job.

            # TODO: An alternative would be to simply delete it here and add it to the
            # list of jobs to process, or (better) to create an asynchronous
            # process that deletes jobs and then feeds them back into the set
            # of jobs to be processed.
            if jobGraph.remainingRetryCount > 0:
                self.issueJob(JobNode.fromJobGraph(jobGraph))
                logger.debug("Job: %s is empty, we are scheduling to clean it up", jobGraph.jobStoreID)
            else:
                self.processTotallyFailedJob(jobGraph)
                logger.warn("Job: %s is empty but completely failed - something is very wrong", jobGraph.jobStoreID)
Code example #15
File: clusterScalerTest.py  Project: vallurumk/toil
    def testClusterScalingMultipleNodeTypes(self):

        smallNode = Shape(20, 5, 10, 10, False)
        mediumNode = Shape(20, 10, 10, 10, False)
        largeNode = Shape(20, 20, 10, 10, False)

        numJobs = 100

        config = Config()

        # Make defaults dummy values
        config.defaultMemory = 1
        config.defaultCores = 1
        config.defaultDisk = 1

        # No preemptable nodes/jobs
        config.preemptableNodeTypes = []
        config.minPreemptableNodes = []
        config.maxPreemptableNodes = []  # No preemptable nodes

        # Make sure the node types don't have to be ordered
        config.nodeTypes = [largeNode, smallNode, mediumNode]
        config.minNodes = [0, 0, 0]
        config.maxNodes = [10, 10]  # test expansion of this list

        # Algorithm parameters
        config.targetTime = defaultTargetTime
        config.betaInertia = 0.1
        config.scaleInterval = 3

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ScalerThread(mock, mock, config)
        clusterScaler.start()
        mock.start()

        try:
            # Add small jobs
            list(
                map(lambda x: mock.addJob(jobShape=smallNode),
                    list(range(numJobs))))
            list(
                map(lambda x: mock.addJob(jobShape=mediumNode),
                    list(range(numJobs))))

            # Add medium completed jobs
            for i in range(1000):
                iJ = JobNode(jobStoreID=1,
                             requirements=dict(memory=random.choice(
                                 range(smallNode.memory, mediumNode.memory)),
                                               cores=mediumNode.cores,
                                               disk=largeNode.cores,
                                               preemptable=False),
                             command=None,
                             jobName='testClusterScaling',
                             unitName='')
                clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

            while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes(
            ) > 0:
                logger.debug("%i nodes currently provisioned" %
                             mock.getNumberOfNodes())
                # Make sure there are no large nodes
                self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
                clusterScaler.check()
                time.sleep(0.5)
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Make sure jobs ran on both the small and medium node types
        self.assertTrue(mock.totalJobs > 0)
        self.assertTrue(mock.maxWorkers[smallNode] > 0)
        self.assertTrue(mock.maxWorkers[mediumNode] > 0)

        self.assertEqual(mock.maxWorkers[largeNode], 0)
Code example #16
File: clusterScalerTest.py  Project: trojanspike/toil
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain
        that autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check the
        # jobs are completed okay, then print the amount of worker time expended and the total
        # number of worker nodes used.

        logger.info("Creating dummy batch system and scalar")

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ClusterScaler(mock, mock, config)
        clusterScaler.start()
        try:
            # Add 100 jobs to complete
            logger.info("Creating test jobs")
            map(lambda x: mock.addJob(), range(numJobs))
            map(lambda x: mock.addJob(preemptable=True),
                range(numPreemptableJobs))

            # Add some completed jobs
            for preemptable in (True, False):
                if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                    # Add 1000 random jobs
                    for i in xrange(1000):
                        x = mock.getNodeShape(preemptable)
                        iJ = JobNode(jobStoreID=1,
                                     requirements=dict(
                                         memory=random.choice(
                                             range(1, x.memory)),
                                         cores=random.choice(range(1,
                                                                   x.cores)),
                                         disk=random.choice(range(1, x.disk)),
                                         preemptable=preemptable),
                                     command=None,
                                     jobName='testClusterScaling',
                                     unitName='')
                        clusterScaler.addCompletedJob(
                            iJ, random.choice(range(1, x.wallTime)))

            logger.info("Waiting for jobs to be processed")
            startTime = time.time()
            # Wait while the cluster processes the jobs
            while (mock.getNumberOfJobsIssued(preemptable=False) > 0
                   or mock.getNumberOfJobsIssued(preemptable=True) > 0
                   or mock.getNumberOfNodes() > 0
                   or mock.getNumberOfNodes(preemptable=True) > 0):
                logger.info(
                    "Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                    "preemptable queue size: %s, preemptable workers: %s" %
                    (mock.getNumberOfJobsIssued(preemptable=False),
                     mock.getNumberOfNodes(preemptable=False),
                     mock.getNumberOfJobsIssued(preemptable=True),
                     mock.getNumberOfNodes(preemptable=True)))
                clusterScaler.check()
                time.sleep(0.5)
            logger.info("We waited %s for cluster to finish" %
                        (time.time() - startTime))
        finally:
            clusterScaler.shutdown()

        # Print some info about the autoscaling
        for i, bs in enumerate(mock.delegates):
            preemptable = bool(i)
            logger.info("Preemptable: %s, Total-jobs: %s: Max-workers: %s,"
                        " Total-worker-time: %s, Worker-time-per-job: %s" %
                        (preemptable, bs.totalJobs, bs.maxWorkers,
                         bs.totalWorkerTime, bs.totalWorkerTime /
                         bs.totalJobs if bs.totalJobs > 0 else 0.0))
Code example #17
        def test(self):
            """
            This is a front-to-back test of the "happy" path in a job store, i.e. covering things
            that occur in the day-to-day life of a job store. The purist might insist that this be
            split up into several cases and I agree wholeheartedly.
            """
            master = self.master

            # Test initial state
            #
            self.assertFalse(master.exists('foo'))
            self.assertRaises(NoSuchJobException, master.load, 'foo')

            # Create parent job and verify its existence/properties
            #
            masterRequirements = dict(memory=12,
                                      cores=34,
                                      disk=35,
                                      preemptable=True)
            jobNodeOnMaster = JobNode(command='master1',
                                      requirements=masterRequirements,
                                      jobName='test1',
                                      unitName='onMaster',
                                      jobStoreID=None,
                                      predecessorNumber=0)
            jobOnMaster = master.create(jobNodeOnMaster)
            self.assertTrue(master.exists(jobOnMaster.jobStoreID))
            self.assertEquals(jobOnMaster.command, 'master1')
            self.assertEquals(jobOnMaster.memory, masterRequirements['memory'])
            self.assertEquals(jobOnMaster.cores, masterRequirements['cores'])
            self.assertEquals(jobOnMaster.disk, masterRequirements['disk'])
            self.assertEquals(jobOnMaster.preemptable,
                              masterRequirements['preemptable'])
            self.assertEquals(jobOnMaster.jobName, 'test1')
            self.assertEquals(jobOnMaster.unitName, 'onMaster')
            self.assertEquals(jobOnMaster.stack, [])
            self.assertEquals(jobOnMaster.predecessorNumber, 0)
            self.assertEquals(jobOnMaster.predecessorsFinished, set())
            self.assertEquals(jobOnMaster.logJobStoreFileID, None)

            # Create a second instance of the job store, simulating a worker ...
            #
            worker = self._createJobStore()
            worker.resume()
            self.assertEquals(worker.config, self.config)
            self.assertIsNot(worker.config, self.config)
            # ... and load the parent job there.
            jobOnWorker = worker.load(jobOnMaster.jobStoreID)
            self.assertEquals(jobOnMaster, jobOnWorker)

            # Update state on job
            #
            # The following demonstrates the job update pattern, where files to be deleted are
            # referenced in "filesToDelete" array, which is persisted to disk first. If things go
            # wrong during the update, this list of files to delete is used to remove the
            # unneeded files
            jobOnWorker.filesToDelete = ['1', '2']
            worker.update(jobOnWorker)
            # Check jobs to delete persisted
            self.assertEquals(
                master.load(jobOnWorker.jobStoreID).filesToDelete, ['1', '2'])
            # Create children
            childRequirements1 = dict(memory=23,
                                      cores=45,
                                      disk=46,
                                      preemptable=True)
            jobNodeOnChild1 = JobNode(command='child1',
                                      requirements=childRequirements1,
                                      jobName='test2',
                                      unitName='onChild1',
                                      jobStoreID=None)
            childRequirements2 = dict(memory=34,
                                      cores=56,
                                      disk=57,
                                      preemptable=False)
            jobNodeOnChild2 = JobNode(command='master1',
                                      requirements=childRequirements2,
                                      jobName='test3',
                                      unitName='onChild2',
                                      jobStoreID=None)
            child1 = worker.create(jobNodeOnChild1)
            child2 = worker.create(jobNodeOnChild2)
            # Update parent
            jobOnWorker.stack.append((child1, child2))
            jobOnWorker.filesToDelete = []
            worker.update(jobOnWorker)

            # Check equivalence between master and worker
            #
            self.assertNotEquals(jobOnWorker, jobOnMaster)
            # Reload parent job on master
            jobOnMaster = master.load(jobOnMaster.jobStoreID)
            self.assertEquals(jobOnWorker, jobOnMaster)
            # Load children on master and check equivalence
            self.assertEquals(master.load(child1.jobStoreID), child1)
            self.assertEquals(master.load(child2.jobStoreID), child2)

            # Test changing and persisting job state across multiple jobs
            #
            childJobs = [
                worker.load(childNode.jobStoreID)
                for childNode in jobOnMaster.stack[-1]
            ]
            for childJob in childJobs:
                childJob.logJobStoreFileID = str(uuid.uuid4())
                childJob.remainingRetryCount = 66
                self.assertNotEquals(childJob,
                                     master.load(childJob.jobStoreID))
            for childJob in childJobs:
                worker.update(childJob)
            for childJob in childJobs:
                self.assertEquals(master.load(childJob.jobStoreID), childJob)
                self.assertEquals(worker.load(childJob.jobStoreID), childJob)

            # Test job iterator - the results of the iterator are affected by eventual
            # consistency. We cannot guarantee all jobs will appear but we can assert that all
            # jobs that show up are a subset of all existing jobs. If we had deleted jobs before
            # this we would have to worry about ghost jobs appearing and this assertion would not
            # be valid
            self.assertTrue(
                set(childJobs + [jobOnMaster]) >= set(worker.jobs()))
            self.assertTrue(
                set(childJobs + [jobOnMaster]) >= set(master.jobs()))

            # Test job deletions
            #
            # First delete parent, this should have no effect on the children
            self.assertTrue(master.exists(jobOnMaster.jobStoreID))
            self.assertTrue(worker.exists(jobOnMaster.jobStoreID))
            master.delete(jobOnMaster.jobStoreID)
            self.assertFalse(master.exists(jobOnMaster.jobStoreID))
            self.assertFalse(worker.exists(jobOnMaster.jobStoreID))

            for childJob in childJobs:
                self.assertTrue(master.exists(childJob.jobStoreID))
                self.assertTrue(worker.exists(childJob.jobStoreID))
                master.delete(childJob.jobStoreID)
                self.assertFalse(master.exists(childJob.jobStoreID))
                self.assertFalse(worker.exists(childJob.jobStoreID))
                self.assertRaises(NoSuchJobException, worker.load,
                                  childJob.jobStoreID)
                self.assertRaises(NoSuchJobException, master.load,
                                  childJob.jobStoreID)

            try:
                with master.readSharedFileStream('missing') as _:
                    pass
                self.fail('Expecting NoSuchFileException')
            except NoSuchFileException:
                pass

            # Test shared files: Write shared file on master, ...
            #
            with master.writeSharedFileStream('foo') as f:
                f.write('bar')
            # ... read that file on worker, ...
            with worker.readSharedFileStream('foo') as f:
                self.assertEquals('bar', f.read())
            # ... and read it again on master.
            with master.readSharedFileStream('foo') as f:
                self.assertEquals('bar', f.read())

            with master.writeSharedFileStream('nonEncrypted',
                                              isProtected=False) as f:
                f.write('bar')
            self.assertUrl(master.getSharedPublicUrl('nonEncrypted'))
            self.assertRaises(NoSuchFileException, master.getSharedPublicUrl,
                              'missing')

            # Test per-job files: Create empty file on master, ...
            #
            # First recreate job
            jobOnMaster = master.create(jobNodeOnMaster)
            fileOne = worker.getEmptyFileStoreID(jobOnMaster.jobStoreID)
            # Check file exists
            self.assertTrue(worker.fileExists(fileOne))
            self.assertTrue(master.fileExists(fileOne))
            # ... write to the file on worker, ...
            with worker.updateFileStream(fileOne) as f:
                f.write('one')
            # ... read the file as a stream on the master, ....
            with master.readFileStream(fileOne) as f:
                self.assertEquals(f.read(), 'one')

            # ... and copy it to a temporary physical file on the master.
            fh, path = tempfile.mkstemp()
            try:
                os.close(fh)
                tmpPath = path + '.read-only'
                master.readFile(fileOne, tmpPath)
                try:
                    shutil.copyfile(tmpPath, path)
                finally:
                    os.unlink(tmpPath)
                with open(path, 'r+') as f:
                    self.assertEquals(f.read(), 'one')
                    # Write a different string to the local file ...
                    f.seek(0)
                    f.truncate(0)
                    f.write('two')
                # ... and create a second file from the local file.
                fileTwo = master.writeFile(path, jobOnMaster.jobStoreID)
                with worker.readFileStream(fileTwo) as f:
                    self.assertEquals(f.read(), 'two')
                # Now update the first file from the local file ...
                master.updateFile(fileOne, path)
                with worker.readFileStream(fileOne) as f:
                    self.assertEquals(f.read(), 'two')
            finally:
                os.unlink(path)
            # Create a third file to test the last remaining method.
            with worker.writeFileStream(jobOnMaster.jobStoreID) as (f,
                                                                    fileThree):
                f.write('three')
            with master.readFileStream(fileThree) as f:
                self.assertEquals(f.read(), 'three')
            # Delete a file explicitly but leave files for the implicit deletion through the parent
            worker.deleteFile(fileOne)

            # Check the file is gone
            #
            for store in worker, master:
                self.assertFalse(store.fileExists(fileOne))
                self.assertRaises(NoSuchFileException, store.readFile, fileOne,
                                  '')
                try:
                    with store.readFileStream(fileOne) as _:
                        pass
                    self.fail('Expecting NoSuchFileException')
                except NoSuchFileException:
                    pass

            # Test stats and logging
            #
            stats = None

            def callback(f2):
                stats.add(f2.read())

            stats = set()
            self.assertEquals(0, master.readStatsAndLogging(callback))
            self.assertEquals(set(), stats)
            worker.writeStatsAndLogging('1')
            self.assertEquals(1, master.readStatsAndLogging(callback))
            self.assertEquals({'1'}, stats)
            self.assertEquals(0, master.readStatsAndLogging(callback))
            worker.writeStatsAndLogging('1')
            worker.writeStatsAndLogging('2')
            stats = set()
            self.assertEquals(2, master.readStatsAndLogging(callback))
            self.assertEquals({'1', '2'}, stats)
            largeLogEntry = os.urandom(self._largeLogEntrySize())
            stats = set()
            worker.writeStatsAndLogging(largeLogEntry)
            self.assertEquals(1, master.readStatsAndLogging(callback))
            self.assertEquals({largeLogEntry}, stats)

            # test the readAll parameter
            self.assertEqual(
                4, master.readStatsAndLogging(callback, readAll=True))

            # Delete parent
            #
            master.delete(jobOnMaster.jobStoreID)
            self.assertFalse(master.exists(jobOnMaster.jobStoreID))
Code example #18
    def innerLoop(self):
        """
        The main loop for processing jobs by the leader.
        """
        # Sets up the timing of the jobGraph rescuing method
        timeSinceJobsLastRescued = time.time()

        logger.info("Starting the main loop")
        while True:
            # Process jobs that are ready to be scheduled/have successors to schedule
            if len(self.toilState.updatedJobs) > 0:
                logger.debug('Built the jobs list, currently have %i jobs to update and %i jobs issued',
                             len(self.toilState.updatedJobs), self.getNumberOfJobsIssued())

                updatedJobs = self.toilState.updatedJobs # The updated jobs to consider below
                self.toilState.updatedJobs = set() # Resetting the list for the next set

                for jobGraph, resultStatus in updatedJobs:

                    logger.debug('Updating status of job %s with ID %s: with result status: %s',
                                 jobGraph, jobGraph.jobStoreID, resultStatus)

                    # This stops a job with services being issued by the serviceManager from
                    # being considered further in this loop. This catch is necessary because
                    # the job's services can fail while being issued, causing the job to be
                    # added to updated jobs.
                    if jobGraph in self.serviceManager.jobGraphsWithServicesBeingStarted:
                        logger.debug("Got a job to update which is still owned by the service "
                                     "manager: %s", jobGraph.jobStoreID)
                        continue

                    # If some of the job's successors failed, either fail the job
                    # or restart it if it has retries left and is a checkpoint job
                    if jobGraph.jobStoreID in self.toilState.hasFailedSuccessors:

                        # If the job has services running, signal for them to be killed.
                        # Once they are killed, the jobGraph will be re-added to the
                        # updatedJobs set and then scheduled to be removed.
                        if jobGraph.jobStoreID in self.toilState.servicesIssued:
                            logger.debug("Telling job: %s to terminate its services due to successor failure",
                                         jobGraph.jobStoreID)
                            self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID],
                                                        error=True)

                        # If the job has non-service jobs running, wait for them to finish;
                        # the job will be re-added to the updated jobs when these jobs are done.
                        elif jobGraph.jobStoreID in self.toilState.successorCounts:
                            logger.debug("Job %s with ID: %s with failed successors still has successor jobs running",
                                         jobGraph, jobGraph.jobStoreID)
                            continue

                        # If the job is a checkpoint and has remaining retries then reissue it.
                        # The logic behind using > 1 rather than > 0 here: Since this job has
                        # been tried once (without decreasing its retry count as the job
                        # itself was successful), and its subtree failed, it shouldn't be retried
                        # unless it has more than 1 try.
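                        # For example, a checkpoint whose remainingRetryCount is 1 has already
                        # used its only try (the job itself succeeded), so a failed subtree does
                        # not warrant reissuing it; with a count of 2 or more it is reissued here.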
                        elif jobGraph.checkpoint is not None and jobGraph.remainingRetryCount > 1:
                            logger.warn('Job: %s is being restarted as a checkpoint after the total '
                                        'failure of jobs in its subtree.', jobGraph.jobStoreID)
                            self.issueJob(JobNode.fromJobGraph(jobGraph))
                        else: # Mark it totally failed
                            logger.debug("Job %s is being processed as completely failed", jobGraph.jobStoreID)
                            self.processTotallyFailedJob(jobGraph)

                    # If the jobGraph has a command it must be run before any successors.
                    # Similarly, if the job previously failed we rerun it, even if it doesn't have a
                    # command to run, to eliminate any parts of the stack now completed.
                    elif jobGraph.command is not None or resultStatus != 0:
                        isServiceJob = jobGraph.jobStoreID in self.toilState.serviceJobStoreIDToPredecessorJob

                        # If the job has run out of retries, or is a service job whose error flag
                        # has been raised (signalled by the removal of its error file), fail the job.
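                        # (Note the grouping: Python's `and` binds tighter than `or`, so the
                        # condition below means "out of retries, OR (a service job AND its
                        # error file is gone)".)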
                        if (jobGraph.remainingRetryCount == 0
                            or isServiceJob and not self.jobStore.fileExists(jobGraph.errorJobStoreID)):
                            self.processTotallyFailedJob(jobGraph)
                            logger.warn("Job %s with ID %s is completely failed",
                                        jobGraph, jobGraph.jobStoreID)
                        else:
                            # Otherwise try the job again
                            self.issueJob(JobNode.fromJobGraph(jobGraph))

                    # If the job has services to run, which have not been started, start them
                    elif len(jobGraph.services) > 0:
                        # Build a map from the service jobs to the job and a map
                        # of the services created for the job
                        assert jobGraph.jobStoreID not in self.toilState.servicesIssued
                        self.toilState.servicesIssued[jobGraph.jobStoreID] = {}
                        for serviceJobList in jobGraph.services:
                            for serviceTuple in serviceJobList:
                                serviceID = serviceTuple.jobStoreID
                                assert serviceID not in self.toilState.serviceJobStoreIDToPredecessorJob
                                self.toilState.serviceJobStoreIDToPredecessorJob[serviceID] = jobGraph
                                self.toilState.servicesIssued[jobGraph.jobStoreID][serviceID] = serviceTuple

                        # Use the service manager to start the services
                        self.serviceManager.scheduleServices(jobGraph)

                        logger.debug("Giving job: %s to service manager to schedule its jobs", jobGraph.jobStoreID)

                    # There exist successors to run
                    elif len(jobGraph.stack) > 0:
                        assert len(jobGraph.stack[-1]) > 0
                        logger.debug("Job: %s has %i successors to schedule",
                                     jobGraph.jobStoreID, len(jobGraph.stack[-1]))
                        # Record the number of successors that must be completed before
                        # the jobGraph can be considered again
                        assert jobGraph.jobStoreID not in self.toilState.successorCounts
                        self.toilState.successorCounts[jobGraph.jobStoreID] = len(jobGraph.stack[-1])
                        # List of successors to schedule
                        successors = []

                        # For each successor, schedule it if all of its predecessors have completed
                        for jobNode in jobGraph.stack[-1]:
                            successorJobStoreID = jobNode.jobStoreID
                            # Build map from successor to predecessors.
                            if successorJobStoreID not in self.toilState.successorJobStoreIDToPredecessorJobs:
                                self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID] = []
                            self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID].append(jobGraph)
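                            # (This reverse map lets the leader find a successor's waiting
                            # predecessors once that successor completes.)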
                            # Case where the successor has multiple predecessors
                            if jobNode.predecessorNumber > 1:
                                logger.debug("Successor job: %s of job: %s has multiple "
                                             "predecessors", jobNode, jobGraph)

                                # Get the successor job, using a cache
                                # (if the successor job has already been seen it will be in this cache,
                                # but otherwise put it in the cache)
                                if successorJobStoreID not in self.toilState.jobsToBeScheduledWithMultiplePredecessors:
                                    self.toilState.jobsToBeScheduledWithMultiplePredecessors[successorJobStoreID] = self.jobStore.load(successorJobStoreID)
                                successorJobGraph = self.toilState.jobsToBeScheduledWithMultiplePredecessors[successorJobStoreID]

                                # Add the jobGraph as a finished predecessor to the successor
                                successorJobGraph.predecessorsFinished.add(jobGraph.jobStoreID)

                                # If the successor is in the set of successors of failed jobs
                                if successorJobStoreID in self.toilState.failedSuccessors:
                                    logger.debug("Successor job: %s of job: %s has failed "
                                                 "predecessors", jobNode, jobGraph)

                                    # Add the job to the set having failed successors
                                    self.toilState.hasFailedSuccessors.add(jobGraph.jobStoreID)

                                    # Reduce active successor count and remove the successor as an active successor of the job
                                    self.toilState.successorCounts[jobGraph.jobStoreID] -= 1
                                    assert self.toilState.successorCounts[jobGraph.jobStoreID] >= 0
                                    self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID].remove(jobGraph)
                                    if len(self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID]) == 0:
                                        self.toilState.successorJobStoreIDToPredecessorJobs.pop(successorJobStoreID)

                                    # If the job now has no active successors add to active jobs
                                    # so it can be processed as a job with failed successors
                                    if self.toilState.successorCounts[jobGraph.jobStoreID] == 0:
                                        logger.debug("Job: %s has no successors to run "
                                                     "and some are failed, adding to list of jobs "
                                                     "with failed successors", jobGraph)
                                        self.toilState.successorCounts.pop(jobGraph.jobStoreID)
                                        self.toilState.updatedJobs.add((jobGraph, 0))
                                        continue

                                # If the successor job's predecessors have not all completed then
                                # skip it, as it is not yet ready to run
                                assert len(successorJobGraph.predecessorsFinished) <= successorJobGraph.predecessorNumber
                                if len(successorJobGraph.predecessorsFinished) < successorJobGraph.predecessorNumber:
                                    continue
                                else:
                                    # Remove the successor job from the cache
                                    self.toilState.jobsToBeScheduledWithMultiplePredecessors.pop(successorJobStoreID)

                            # Add successor to list of successors to schedule
                            successors.append(jobNode)
                        self.issueJobs(successors)

                    elif jobGraph.jobStoreID in self.toilState.servicesIssued:
                        logger.debug("Telling job: %s to terminate its services due to the "
                                     "successful completion of its successor jobs",
                                     jobGraph)
                        self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID], error=False)

                    # There are no remaining tasks to schedule within the jobGraph, but
                    # we schedule it anyway to allow it to be deleted.

                    # TODO: An alternative would be simply to delete it here and add it to the
                    # list of jobs to process, or (better) to create an asynchronous
                    # process that deletes jobs and then feeds them back into the set
                    # of jobs to be processed
                    else:
                        # Remove the job
                        if jobGraph.remainingRetryCount > 0:
                            self.issueJob(JobNode.fromJobGraph(jobGraph))
                            logger.debug("Job: %s is empty, we are scheduling to clean it up", jobGraph.jobStoreID)
                        else:
                            self.processTotallyFailedJob(jobGraph)
                            logger.warn("Job: %s is empty but completely failed - something is very wrong", jobGraph.jobStoreID)

            # Start any service jobs available from the service manager
            self.issueQueingServiceJobs()
            while True:
                serviceJob = self.serviceManager.getServiceJobsToStart(0)
                # Stop trying to get jobs when function returns None
                if serviceJob is None:
                    break
                logger.debug('Launching service job: %s', serviceJob)
                self.issueServiceJob(serviceJob)

            # Get jobs whose services have started
            while True:
                jobGraph = self.serviceManager.getJobGraphWhoseServicesAreRunning(0)
                if jobGraph is None: # Stop trying to get jobs when function returns None
                    break
                logger.debug('Job: %s has established its services.', jobGraph.jobStoreID)
                jobGraph.services = []
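                # With the services running and the list cleared, the next pass over this
                # job will fall through to scheduling its successors (or its cleanup)
                # instead of trying to start services again.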
                self.toilState.updatedJobs.add((jobGraph, 0))

            # Gather any new, updated jobGraph from the batch system
            updatedJobTuple = self.batchSystem.getUpdatedBatchJob(2)
            if updatedJobTuple is not None:
                jobID, result, wallTime = updatedJobTuple
                # Look up the issued job corresponding to this batch system ID
                try:
                    updatedJob = self.jobBatchSystemIDToIssuedJob[jobID]
                except KeyError:
                    logger.warn("A result seems to already have been processed "
                                "for job %s", jobID)
                else:
                    if result == 0:
                        cur_logger = (logger.debug if str(updatedJob.jobName).startswith(CWL_INTERNAL_JOBS)
                                      else logger.info)
                        cur_logger('Job ended successfully: %s', updatedJob)
                        if self.toilMetrics:
                            self.toilMetrics.logCompletedJob(updatedJob)
                    else:
                        logger.warn('Job failed with exit value %i: %s',
                                    result, updatedJob)
                    self.processFinishedJob(jobID, result, wallTime=wallTime)

            else:
                # Process jobs that have gone awry

                # In the case that nothing is happening (no updated jobs were
                # gathered from the batch system), check whether any jobs have run
                # too long (see self.reissueOverLongJobs) or have gone missing from
                # the batch system (see self.reissueMissingJobs). We only rescue
                # jobs every config.rescueJobsFrequency seconds, and only when we
                # have apparently exhausted the current jobGraph supply.
                if (time.time() - timeSinceJobsLastRescued >=
                    self.config.rescueJobsFrequency):
                    self.reissueOverLongJobs()
                    logger.info("Reissued any over long jobs")

                    hasNoMissingJobs = self.reissueMissingJobs()
                    if hasNoMissingJobs:
                        timeSinceJobsLastRescued = time.time()
                    else:
                        # This means we'll try again in a minute, providing things are quiet
                        timeSinceJobsLastRescued += 60
                    logger.info("Rescued any (long) missing jobs")

            # Check on the associated threads and exit if a failure is detected
            self.statsAndLogging.check()
            self.serviceManager.check()
            # the cluster scaler object will only be instantiated if autoscaling is enabled
            if self.clusterScaler is not None:
                self.clusterScaler.check()

            # The exit criterion
            if len(self.toilState.updatedJobs) == 0 and self.getNumberOfJobsIssued() == 0 and self.serviceManager.jobsIssuedToServiceManager == 0:
                logger.info("No jobs left to run so exiting.")
                break

            # Check for deadlocks
            self.checkForDeadlocks()

        logger.info("Finished the main loop")

        # Consistency check the toil state
        assert self.toilState.updatedJobs == set()
        assert self.toilState.successorCounts == {}
        assert self.toilState.successorJobStoreIDToPredecessorJobs == {}
        assert self.toilState.serviceJobStoreIDToPredecessorJob == {}
        assert self.toilState.servicesIssued == {}