def testBatchResourceLimits(self):
    jobNode1 = JobNode(command="sleep 1000",
                       requirements=dict(memory=1 << 30, cores=1,
                                         disk=1000, preemptable=preemptable),
                       jobName='testResourceLimits', unitName=None,
                       jobStoreID='1')
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    self.assertIsNotNone(job1)
    jobNode2 = JobNode(command="sleep 1000",
                       requirements=dict(memory=2 << 30, cores=1,
                                         disk=1000, preemptable=preemptable),
                       jobName='testResourceLimits', unitName=None,
                       jobStoreID='2')
    job2 = self.batchSystem.issueBatchJob(jobNode2)
    self.assertIsNotNone(job2)

    batches = self._getBatchList()
    self.assertEqual(len(batches), 2)
    # It would be better to directly check that the batches have the correct memory and cpu
    # values, but Parasol seems to slightly change the values sometimes.
    self.assertNotEqual(batches[0]['ram'], batches[1]['ram'])

    # Need to kill one of the jobs because there are only two cores available
    self.batchSystem.killBatchJobs([job2])
    job3 = self.batchSystem.issueBatchJob(jobNode1)
    self.assertIsNotNone(job3)
    batches = self._getBatchList()
    self.assertEqual(len(batches), 1)
def testSetEnv(self):
    # Parasol disobeys shell rules and naively splits the command at the space character
    # before exec'ing it, whether the space is quoted, escaped or not. This means that we
    # can't have escaped or quoted spaces in the command line, so we can't use bash -c
    # '...' or python -c '...'. The safest thing to do here is to script the test and
    # invoke that script rather than inline the test via -c.
    def assertEnv():
        import os, sys
        sys.exit(0 if os.getenv('FOO') == 'bar' else 42)

    script_body = dedent('\n'.join(getsource(assertEnv).split('\n')[1:]))
    with tempFileContaining(script_body, suffix='.py') as script_path:
        # First, ensure that the test fails if the variable is *not* set
        command = sys.executable + ' ' + script_path
        jobNode4 = JobNode(command=command, jobName='test4', unitName=None,
                           jobStoreID='4', requirements=defaultRequirements)
        job4 = self.batchSystem.issueBatchJob(jobNode4)
        jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
        self.assertEqual(exitStatus, 42)
        self.assertEqual(jobID, job4)
        # Now set the variable and ensure that it is present
        self.batchSystem.setEnv('FOO', 'bar')
        jobNode5 = JobNode(command=command, jobName='test5', unitName=None,
                           jobStoreID='5', requirements=defaultRequirements)
        job5 = self.batchSystem.issueBatchJob(jobNode5)
        jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
        self.assertEqual(exitStatus, 0)
        self.assertEqual(jobID, job5)
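# For context, a hedged sketch (an assumption, not necessarily the actual Toil
# implementation) of the tempFileContaining helper used above: it writes the
# given content to a named temporary file, yields that file's path, and deletes
# the file on exit.
import os
import tempfile
from contextlib import contextmanager

@contextmanager
def tempFileContaining(content, suffix=''):
    # Create the temporary file with the requested suffix (e.g. '.py')
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(content)
        yield path
    finally:
        # Always clean up the file, even if the body raises
        os.unlink(path)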
def testSetEnv(self):
    # Parasol disobeys shell rules and naively splits the command at
    # the space character into arguments before exec'ing it, whether
    # the space is quoted, escaped or not.
    script_shell = 'if [ "x${FOO}" == "xbar" ] ; then exit 23 ; else exit 42 ; fi'

    # Escape the semicolons so they survive the splitting
    script_protected = script_shell.replace(';', '\\;')

    # Turn the command into a string that convinces bash to take all of its
    # arguments, paste them back together and run them
    command = "bash -c \"\\${@}\" bash eval " + script_protected
    log.critical(command)

    jobNode4 = JobNode(command=command, jobName='test4', unitName=None,
                       jobStoreID='4', requirements=defaultRequirements)
    job4 = self.batchSystem.issueBatchJob(jobNode4)
    jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    self.assertEqual(exitStatus, 42)
    self.assertEqual(jobID, job4)
    # Now set the variable and ensure that it is present
    self.batchSystem.setEnv('FOO', 'bar')
    jobNode5 = JobNode(command=command, jobName='test5', unitName=None,
                       jobStoreID='5', requirements=defaultRequirements)
    job5 = self.batchSystem.issueBatchJob(jobNode5)
    jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    self.assertEqual(exitStatus, 23)
    self.assertEqual(jobID, job5)
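# A hedged, standalone illustration (not part of the test suite) of why the
# eval trick above works. Each word is passed as a separate argument, as
# Parasol would after naively splitting on spaces; bash -c '${@}' then makes
# "eval" the command, and eval re-joins the remaining words with spaces,
# reconstituting the original shell script. The semicolon escaping in the test
# presumably guards against Parasol's own handling and is omitted here.
import os
import subprocess

words = ['bash', '-c', '${@}', 'bash', 'eval',
         'if', '[', '"x${FOO}"', '==', '"xbar"', ']', ';',
         'then', 'exit', '23', ';', 'else', 'exit', '42', ';', 'fi']
# With FOO=bar the rejoined script takes the first branch ...
assert subprocess.call(words, env=dict(os.environ, FOO='bar')) == 23
# ... and with any other value it takes the second.
assert subprocess.call(words, env=dict(os.environ, FOO='baz')) == 42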
def testRunJobs(self):
    jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                       jobStoreID='1', requirements=defaultRequirements)
    jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                       jobStoreID='2', requirements=defaultRequirements)
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    job2 = self.batchSystem.issueBatchJob(jobNode2)

    issuedIDs = self._waitForJobsToIssue(2)
    self.assertEqual(set(issuedIDs), {job1, job2})

    # Now at some point we want these jobs to become running.
    # But since we may be testing against a live cluster (Kubernetes)
    # we want to handle weird cases and high cluster load as best we can.
    # Wait a bit for any Docker images to download and for the
    # jobs to have a chance to start.
    # TODO: We insist on neither of these ever finishing when we test
    # getUpdatedBatchJob, and the sleep time is longer than the time we
    # should spend waiting for both to start, so if our cluster can
    # only run one job at a time, we will fail the test.
    runningJobIDs = self._waitForJobsToStart(2, tries=120)
    self.assertEqual(set(runningJobIDs), {job1, job2})

    # Killing the jobs instead of allowing them to complete means this test can run very
    # quickly if the batch system issues and starts the jobs quickly.
    self.batchSystem.killBatchJobs([job1, job2])
    self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

    # Issue a job and then allow it to finish by itself, causing it to be added to the
    # updated jobs queue.
    # We would like to have this touch something on the filesystem and
    # then check for it having happened, but we can't guarantee that
    # the batch system will run against the same filesystem we are
    # looking at.
    jobNode3 = JobNode(command="mktemp -d", jobName='test3', unitName=None,
                       jobStoreID='3', requirements=defaultRequirements)
    job3 = self.batchSystem.issueBatchJob(jobNode3)

    jobUpdateInfo = self.batchSystem.getUpdatedBatchJob(maxWait=1000)
    jobID, exitStatus, wallTime = (jobUpdateInfo.jobID,
                                   jobUpdateInfo.exitStatus,
                                   jobUpdateInfo.wallTime)
    log.info('Third job completed: {} {} {}'.format(jobID, exitStatus, wallTime))

    # Since the first two jobs were killed, the only job in the updated jobs queue should
    # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
    # fail with jobID being equal to job1 or job2.
    self.assertEqual(jobID, job3)
    self.assertEqual(exitStatus, 0)
    if self.supportsWallTime():
        self.assertTrue(wallTime > 0)
    else:
        self.assertIsNone(wallTime)
    # TODO: Work out a way to check if the job we asked to run actually ran.
    # Don't just believe the batch system, but don't assume it ran on this machine either.

    self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

    # Make sure killBatchJobs can handle jobs that don't exist
    self.batchSystem.killBatchJobs([10])
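# A hedged sketch of the polling helper assumed by the tests above (the real
# implementation may differ): poll the batch system until the expected number
# of jobs report as running, or until the given number of one-second tries is
# exhausted; return the running IDs, or an empty list if the count was never
# reached.
def _waitForJobsToStart(self, numJobs, tries=20):
    runningIDs = []
    for _ in range(tries):
        running = self.batchSystem.getRunningBatchJobIDs()
        if len(running) == numJobs:
            runningIDs = list(running.keys())
            break
        time.sleep(1)
    return runningIDs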
def testRunJobs(self):
    jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                       jobStoreID='1', requirements=defaultRequirements)
    jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                       jobStoreID='2', requirements=defaultRequirements)
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    job2 = self.batchSystem.issueBatchJob(jobNode2)

    issuedIDs = self._waitForJobsToIssue(2)
    self.assertEqual(set(issuedIDs), {job1, job2})

    runningJobIDs = self._waitForJobsToStart(2)
    self.assertEqual(set(runningJobIDs), {job1, job2})

    # Killing the jobs instead of allowing them to complete means this test can run very
    # quickly if the batch system issues and starts the jobs quickly.
    self.batchSystem.killBatchJobs([job1, job2])
    self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

    # Issue a job and then allow it to finish by itself, causing it to be added to the
    # updated jobs queue.
    # We would like to have this touch something on the filesystem and
    # then check for it having happened, but we can't guarantee that
    # the batch system will run against the same filesystem we are
    # looking at.
    jobNode3 = JobNode(command="mktemp -d", jobName='test3', unitName=None,
                       jobStoreID='3', requirements=defaultRequirements)
    job3 = self.batchSystem.issueBatchJob(jobNode3)

    jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)

    # Since the first two jobs were killed, the only job in the updated jobs queue should
    # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
    # fail with jobID being equal to job1 or job2.
    self.assertEqual(jobID, job3)
    self.assertEqual(exitStatus, 0)
    if self.supportsWallTime():
        self.assertTrue(wallTime > 0)
    else:
        self.assertIsNone(wallTime)
    # TODO: Work out a way to check if the job we asked to run actually ran.
    # Don't just believe the batch system, but don't assume it ran on this machine either.

    self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

    # Make sure killBatchJobs can handle jobs that don't exist
    self.batchSystem.killBatchJobs([10])
def testRunJobs(self):
    testPath = os.path.join(self.tempDir, "test.txt")

    jobNode1 = JobNode(command='sleep 1000', jobName='test1', unitName=None,
                       jobStoreID='1', requirements=defaultRequirements)
    jobNode2 = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                       jobStoreID='2', requirements=defaultRequirements)
    job1 = self.batchSystem.issueBatchJob(jobNode1)
    job2 = self.batchSystem.issueBatchJob(jobNode2)

    issuedIDs = self._waitForJobsToIssue(2)
    self.assertEqual(set(issuedIDs), {job1, job2})

    runningJobIDs = self._waitForJobsToStart(2)
    self.assertEqual(set(runningJobIDs), {job1, job2})

    # Killing the jobs instead of allowing them to complete means this test can run very
    # quickly if the batch system issues and starts the jobs quickly.
    self.batchSystem.killBatchJobs([job1, job2])
    self.assertEqual({}, self.batchSystem.getRunningBatchJobIDs())

    # Issue a job and then allow it to finish by itself, causing it to be added to the
    # updated jobs queue.
    self.assertFalse(os.path.exists(testPath))
    jobNode3 = JobNode(command="touch %s" % testPath, jobName='test3', unitName=None,
                       jobStoreID='3', requirements=defaultRequirements)
    job3 = self.batchSystem.issueBatchJob(jobNode3)

    jobID, exitStatus, wallTime = self.batchSystem.getUpdatedBatchJob(maxWait=1000)

    # Since the first two jobs were killed, the only job in the updated jobs queue should
    # be job 3. If the first two jobs were (incorrectly) added to the queue, this will
    # fail with jobID being equal to job1 or job2.
    self.assertEqual(exitStatus, 0)
    self.assertEqual(jobID, job3)
    if self.supportsWallTime():
        self.assertTrue(wallTime > 0)
    else:
        self.assertIsNone(wallTime)

    # Give a shared or networked filesystem a moment to catch up before
    # concluding that the touched file never appeared
    if not os.path.exists(testPath):
        time.sleep(20)
    self.assertTrue(os.path.exists(testPath))

    self.assertFalse(self.batchSystem.getUpdatedBatchJob(0))

    # Make sure killBatchJobs can handle jobs that don't exist
    self.batchSystem.killBatchJobs([10])
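# A companion sketch, under the same assumption as _waitForJobsToStart above,
# of the issue-polling helper: poll getIssuedBatchJobIDs() until the expected
# number of jobs has been issued, with a bounded number of one-second retries.
def _waitForJobsToIssue(self, numJobs, tries=20):
    issuedIDs = []
    for _ in range(tries):
        issuedIDs = self.batchSystem.getIssuedBatchJobIDs()
        if len(issuedIDs) == numJobs:
            break
        time.sleep(1)
    return issuedIDs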
def _processFailedSuccessors(self, jobGraph):
    """Some of the job's successors failed. Either fail the job, or restart it
    if it has retries left and is a checkpoint job."""
    if jobGraph.jobStoreID in self.toilState.servicesIssued:
        # The job has services running; signal for them to be killed.
        # Once they are killed, the jobGraph will be re-added to the
        # updatedJobs set and then scheduled to be removed.
        logger.debug("Telling job: %s to terminate its services due to successor failure",
                     jobGraph.jobStoreID)
        self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID],
                                         error=True)
    elif jobGraph.jobStoreID in self.toilState.successorCounts:
        # The job has non-service jobs running; wait for them to finish.
        # The job will be re-added to the updated jobs when these jobs are done.
        logger.debug("Job %s with ID: %s with failed successors still has successor jobs running",
                     jobGraph, jobGraph.jobStoreID)
    elif jobGraph.checkpoint is not None and jobGraph.remainingRetryCount > 1:
        # If the job is a checkpoint and has remaining retries then reissue it.
        # The logic behind using > 1 rather than > 0 here: since this job has
        # been tried once (without decreasing its retry count, as the job
        # itself was successful) and its subtree failed, it shouldn't be
        # retried unless it has more than one try left.
        logger.warn('Job: %s is being restarted as a checkpoint after the total '
                    'failure of jobs in its subtree.', jobGraph.jobStoreID)
        self.issueJob(JobNode.fromJobGraph(jobGraph))
    else:
        # Mark it totally failed
        logger.debug("Job %s is being processed as completely failed", jobGraph.jobStoreID)
        self.processTotallyFailedJob(jobGraph)
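# JobNode.fromJobGraph is used here and below to turn a persisted jobGraph back
# into an issuable job description. A hedged sketch of what such a conversion
# might look like on the JobNode class (the attribute names are assumptions
# inferred from their usage in this file):
@classmethod
def fromJobGraph(cls, jobGraph):
    # Copy the scheduling-relevant fields of the jobGraph into a fresh JobNode
    return cls(command=jobGraph.command,
               jobStoreID=jobGraph.jobStoreID,
               jobName=jobGraph.jobName,
               unitName=jobGraph.unitName,
               requirements=dict(memory=jobGraph.memory,
                                 cores=jobGraph.cores,
                                 disk=jobGraph.disk,
                                 preemptable=jobGraph.preemptable),
               predecessorNumber=jobGraph.predecessorNumber)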
def _testClusterScaling(self, config, numJobs, numPreemptableJobs, jobShape):
    """
    Test the ClusterScaler class with different patterns of job creation.
    Tests ascertain that autoscaling occurs and that all the jobs are run.
    """
    # First do a simple test of creating 100 preemptable and non-preemptable
    # jobs and check that the jobs are completed okay, then print the amount of
    # worker time expended and the total number of worker nodes used.

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    mock.start()
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    try:
        # Add 100 jobs to complete
        list(map(lambda x: mock.addJob(jobShape=jobShape), list(range(numJobs))))
        list(map(lambda x: mock.addJob(jobShape=jobShape, preemptable=True),
                 list(range(numPreemptableJobs))))

        # Add some completed jobs
        for preemptable in (True, False):
            if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                # Add 1000 random jobs
                for _ in range(1000):
                    x = mock.getNodeShape(nodeType=jobShape)
                    iJ = JobNode(jobStoreID=1,
                                 requirements=dict(memory=random.choice(list(range(1, x.memory))),
                                                   cores=random.choice(list(range(1, x.cores))),
                                                   disk=random.choice(list(range(1, x.disk))),
                                                   preemptable=preemptable),
                                 command=None,
                                 jobName='testClusterScaling', unitName='')
                    clusterScaler.addCompletedJob(iJ, random.choice(list(range(1, x.wallTime))))

        startTime = time.time()
        # Wait while the cluster processes the jobs
        while (mock.getNumberOfJobsIssued(preemptable=False) > 0
               or mock.getNumberOfJobsIssued(preemptable=True) > 0
               or mock.getNumberOfNodes() > 0
               or mock.getNumberOfNodes(preemptable=True) > 0):
            logger.debug("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                         "preemptable queue size: %s, preemptable workers: %s" %
                         (mock.getNumberOfJobsIssued(preemptable=False),
                          mock.getNumberOfNodes(preemptable=False),
                          mock.getNumberOfJobsIssued(preemptable=True),
                          mock.getNumberOfNodes(preemptable=True)))
            clusterScaler.check()
            time.sleep(0.5)
        logger.debug("We waited %s for cluster to finish" % (time.time() - startTime))
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Print some info about the autoscaling
    logger.debug("Total-jobs: %s: Max-workers: %s, "
                 "Total-worker-time: %s, Worker-time-per-job: %s" %
                 (mock.totalJobs, sum(mock.maxWorkers.values()), mock.totalWorkerTime,
                  old_div(mock.totalWorkerTime, mock.totalJobs) if mock.totalJobs > 0 else 0.0))
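# Illustrative invocation with hypothetical parameters: exercise scaling with
# 100 jobs of each kind, all shaped like a single node type (Shape field order
# assumed to be wallTime, memory, cores, disk, preemptable):
#
#   self._testClusterScaling(config, numJobs=100, numPreemptableJobs=100,
#                            jobShape=Shape(20, 10, 10, 10, False))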
def testOverlargeJob(self):
    master = self.master
    masterRequirements = dict(memory=12, cores=34, disk=35, preemptable=True)
    overlargeJobNodeOnMaster = JobNode(command='master-overlarge',
                                       requirements=masterRequirements,
                                       jobName='test-overlarge', unitName='onMaster',
                                       jobStoreID=None, predecessorNumber=0)

    # Make the pickled size of the job larger than 256K
    with open("/dev/urandom", "r") as random:
        overlargeJobNodeOnMaster.jobName = random.read(512 * 1024)

    overlargeJobOnMaster = master.create(overlargeJobNodeOnMaster)
    self.assertTrue(master.exists(overlargeJobOnMaster.jobStoreID))
    # Make sure the overlarge job can be loaded back from the store
    overlargeJobOnMasterDownloaded = master.load(overlargeJobOnMaster.jobStoreID)
    jobsOnMaster = [job for job in master.jobs()]
    self.assertEqual(jobsOnMaster, [overlargeJobOnMaster])
    master.delete(overlargeJobOnMaster.jobStoreID)
def setUp(self):
    super(AbstractJobStoreTest.Test, self).setUp()
    self.namePrefix = 'jobstore-test-' + str(uuid.uuid4())
    self.master = self._createJobStore()
    self.config = self._createConfig()
    self.master.initialize(self.config)
    self.arbitraryRequirements = {'memory': 1, 'disk': 2, 'cores': 1, 'preemptable': False}
    self.arbitraryJob = JobNode(command='command',
                                jobStoreID=None,
                                jobName='arbitrary', unitName=None,
                                requirements=self.arbitraryRequirements)
def testIgnoreNode(self):
    self.batchSystem.ignoreNode('localhost')
    jobNode = JobNode(command='sleep 1000', jobName='test2', unitName=None,
                      jobStoreID='1', requirements=defaultRequirements)
    job = self.batchSystem.issueBatchJob(jobNode)

    issuedID = self._waitForJobsToIssue(1)
    self.assertEqual(set(issuedID), {job})

    runningJobIDs = self._waitForJobsToStart(1)
    # Make sure the job is NOT running
    self.assertEqual(set(runningJobIDs), set())
def test(self):
    # We'll use fractions to avoid rounding errors. Remember that not every fraction
    # can be represented as a floating point number.
    F = Fraction
    # This test isn't general enough to cover every possible value of minCores in
    # SingleMachineBatchSystem. Instead we hard-code a value and assert it.
    minCores = F(1, 10)
    self.assertEquals(float(minCores), SingleMachineBatchSystem.minCores)
    for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
        for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
            for load in (F(1, 10), F(1), F(10)):
                jobs = int(maxCores / coresPerJob * load)
                if jobs >= 1 and minCores <= coresPerJob < maxCores:
                    # Assert that maxCores is exactly representable as a float
                    self.assertEquals(maxCores, float(maxCores))
                    bs = SingleMachineBatchSystem(
                        config=hidden.AbstractBatchSystemTest.createConfig(),
                        maxCores=float(maxCores),
                        # Ensure that memory or disk requirements don't get in the way.
                        maxMemory=jobs * 10,
                        maxDisk=jobs * 10)
                    try:
                        jobIds = set()
                        for i in range(0, int(jobs)):
                            jobIds.add(bs.issueBatchJob(JobNode(
                                command=self.scriptCommand(),
                                requirements=dict(cores=float(coresPerJob),
                                                  memory=1, disk=1,
                                                  preemptable=preemptable),
                                jobName=str(i), unitName='', jobStoreID=str(i))))
                        self.assertEquals(len(jobIds), jobs)
                        while jobIds:
                            job = bs.getUpdatedBatchJob(maxWait=10)
                            self.assertIsNotNone(job)
                            jobId, status, wallTime = job
                            self.assertEquals(status, 0)
                            # would raise KeyError on absence
                            jobIds.remove(jobId)
                    finally:
                        bs.shutdown()
                    concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                    self.assertEquals(concurrentTasks, 0)
                    log.info('maxCores: {maxCores}, '
                             'coresPerJob: {coresPerJob}, '
                             'load: {load}'.format(**locals()))
                    # This is the key assertion:
                    expectedMaxConcurrentTasks = min(old_div(maxCores, coresPerJob), jobs)
                    self.assertEquals(maxConcurrentTasks, expectedMaxConcurrentTasks)
                    resetCounters(self.counterPath)
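# A hedged sketch of the counter helpers assumed above: the job script is
# presumed to record concurrency samples in self.counterPath as a
# "current,maximum" pair; these helpers read and reset that file.
def getCounters(path):
    with open(path, 'r+') as f:
        concurrentTasks, maxConcurrentTasks = (int(i) for i in f.read().split(','))
    return concurrentTasks, maxConcurrentTasks

def resetCounters(path):
    with open(path, 'w') as f:
        f.write('0,0')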
def processTotallyFailedJob(self, jobGraph):
    """
    Processes a totally failed job.
    """
    # Mark the job as a totally failed job
    self.toilState.totalFailedJobs.add(JobNode.fromJobGraph(jobGraph))
    if self.toilMetrics:
        self.toilMetrics.logFailedJob(jobGraph)

    if jobGraph.jobStoreID in self.toilState.serviceJobStoreIDToPredecessorJob:
        # Is a service job
        logger.debug("Service job is being processed as a totally failed job: %s", jobGraph)

        predecessorJobGraph = self.toilState.serviceJobStoreIDToPredecessorJob[jobGraph.jobStoreID]

        # This removes the service job as a service of the predecessor
        # and potentially makes the predecessor active
        self._updatePredecessorStatus(jobGraph.jobStoreID)

        # Remove the start flag, if it still exists. This indicates
        # to the service manager that the job has "started"; this prevents
        # the service manager from deadlocking while waiting
        self.jobStore.deleteFile(jobGraph.startJobStoreID)

        # Signal to any other services in the group that they should
        # terminate. We do this to prevent other services in the set
        # of services from deadlocking waiting for this service to start properly
        if predecessorJobGraph.jobStoreID in self.toilState.servicesIssued:
            self.serviceManager.killServices(self.toilState.servicesIssued[predecessorJobGraph.jobStoreID],
                                             error=True)
            logger.debug("Job: %s is instructing all the services of its parent job to quit", jobGraph)

        # This ensures that the job will not attempt to run any of its
        # successors on the stack
        self.toilState.hasFailedSuccessors.add(predecessorJobGraph.jobStoreID)
    else:
        # Is a non-service job
        assert jobGraph.jobStoreID not in self.toilState.servicesIssued

        # Traverse the failed job's successor graph and get the jobStoreIDs of new successors.
        # Any successor already in toilState.failedSuccessors will not be traversed.
        # All successors traversed will be added to toilState.failedSuccessors and returned
        # as a set (unseenSuccessors).
        unseenSuccessors = self.getSuccessors(jobGraph, self.toilState.failedSuccessors,
                                              self.jobStore)
        logger.debug("Found new failed successors: %s of job: %s",
                     " ".join(unseenSuccessors), jobGraph)

        # For each newly found successor
        for successorJobStoreID in unseenSuccessors:
            # If the successor is a successor of other jobs that have already tried to schedule it
            if successorJobStoreID in self.toilState.successorJobStoreIDToPredecessorJobs:
                # For each such predecessor job
                # (we remove the successor from toilState.successorJobStoreIDToPredecessorJobs
                # to avoid doing this multiple times for each failed predecessor)
                for predecessorJob in self.toilState.successorJobStoreIDToPredecessorJobs.pop(successorJobStoreID):
                    # Reduce the predecessor job's successor count.
                    self.toilState.successorCounts[predecessorJob.jobStoreID] -= 1

                    # Indicate that it has failed jobs.
                    self.toilState.hasFailedSuccessors.add(predecessorJob.jobStoreID)
                    logger.debug("Marking job: %s as having failed successors (found by "
                                 "reading successors of a failed job)", predecessorJob)

                    # If the predecessor has no remaining successors, add it to the
                    # set of active jobs
                    assert self.toilState.successorCounts[predecessorJob.jobStoreID] >= 0
                    if self.toilState.successorCounts[predecessorJob.jobStoreID] == 0:
                        self.toilState.updatedJobs.add((predecessorJob, 0))

                        # Remove the predecessor job from the set of jobs with successors.
                        self.toilState.successorCounts.pop(predecessorJob.jobStoreID)

        # If the job has predecessor(s)
        if jobGraph.jobStoreID in self.toilState.successorJobStoreIDToPredecessorJobs:
            # For each predecessor of the job
            for predecessorJobGraph in self.toilState.successorJobStoreIDToPredecessorJobs[jobGraph.jobStoreID]:
                # Mark the predecessor as having failed successors
                self.toilState.hasFailedSuccessors.add(predecessorJobGraph.jobStoreID)
                logger.debug("Totally failed job: %s is marking direct predecessor: %s "
                             "as having failed jobs", jobGraph, predecessorJobGraph)

            self._updatePredecessorStatus(jobGraph.jobStoreID)
def _processReadyJob(self, jobGraph, resultStatus):
    logger.debug('Updating status of job %s with ID %s: with result status: %s',
                 jobGraph, jobGraph.jobStoreID, resultStatus)

    if jobGraph in self.serviceManager.jobGraphsWithServicesBeingStarted:
        # This stops a job with services being issued by the serviceManager from
        # being considered further in this loop. This catch is necessary because
        # the job's services can fail while being issued, causing the job to be
        # added to updated jobs.
        logger.debug("Got a job to update which is still owned by the service "
                     "manager: %s", jobGraph.jobStoreID)
    elif jobGraph.jobStoreID in self.toilState.hasFailedSuccessors:
        self._processFailedSuccessors(jobGraph)
    elif jobGraph.command is not None or resultStatus != 0:
        # The jobGraph has a command that must be run before any successors.
        # Similarly, if the job previously failed we rerun it, even if it doesn't have a
        # command to run, to eliminate any parts of the stack now completed.
        isServiceJob = jobGraph.jobStoreID in self.toilState.serviceJobStoreIDToPredecessorJob

        # If the job has run out of retries or is a service job whose error flag has
        # been indicated, fail the job.
        if (jobGraph.remainingRetryCount == 0
                or isServiceJob and not self.jobStore.fileExists(jobGraph.errorJobStoreID)):
            self.processTotallyFailedJob(jobGraph)
            logger.warn("Job %s with ID %s is completely failed", jobGraph, jobGraph.jobStoreID)
        else:
            # Otherwise try the job again
            self.issueJob(JobNode.fromJobGraph(jobGraph))
    elif len(jobGraph.services) > 0:
        # The job has services to run which have not been started; start them.
        # Build a map from the service jobs to the job and a map
        # of the services created for the job
        assert jobGraph.jobStoreID not in self.toilState.servicesIssued
        self.toilState.servicesIssued[jobGraph.jobStoreID] = {}
        for serviceJobList in jobGraph.services:
            for serviceTuple in serviceJobList:
                serviceID = serviceTuple.jobStoreID
                assert serviceID not in self.toilState.serviceJobStoreIDToPredecessorJob
                self.toilState.serviceJobStoreIDToPredecessorJob[serviceID] = jobGraph
                self.toilState.servicesIssued[jobGraph.jobStoreID][serviceID] = serviceTuple

        # Use the service manager to start the services
        self.serviceManager.scheduleServices(jobGraph)

        logger.debug("Giving job: %s to service manager to schedule its jobs",
                     jobGraph.jobStoreID)
    elif len(jobGraph.stack) > 0:
        # There exist successors to run
        self._runJobSuccessors(jobGraph)
    elif jobGraph.jobStoreID in self.toilState.servicesIssued:
        logger.debug("Telling job: %s to terminate its services due to the "
                     "successful completion of its successor jobs", jobGraph)
        self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID],
                                         error=False)
    else:
        # There are no remaining tasks to schedule within the jobGraph, but
        # we schedule it anyway to allow it to be deleted. Remove the job.
        # TODO: An alternative would be to simply delete it here and add it to the
        # list of jobs to process, or (better) to create an asynchronous
        # process that deletes jobs and then feeds them back into the set
        # of jobs to be processed
        if jobGraph.remainingRetryCount > 0:
            self.issueJob(JobNode.fromJobGraph(jobGraph))
            logger.debug("Job: %s is empty, we are scheduling to clean it up", jobGraph.jobStoreID)
        else:
            self.processTotallyFailedJob(jobGraph)
            logger.warn("Job: %s is empty but completely failed - something is very wrong",
                        jobGraph.jobStoreID)
def testClusterScalingMultipleNodeTypes(self):
    smallNode = Shape(20, 5, 10, 10, False)
    mediumNode = Shape(20, 10, 10, 10, False)
    largeNode = Shape(20, 20, 10, 10, False)

    numJobs = 100

    config = Config()

    # Make defaults dummy values
    config.defaultMemory = 1
    config.defaultCores = 1
    config.defaultDisk = 1

    # No preemptable nodes/jobs
    config.preemptableNodeTypes = []
    config.minPreemptableNodes = []
    config.maxPreemptableNodes = []  # No preemptable nodes

    # Make sure the node types don't have to be ordered
    config.nodeTypes = [largeNode, smallNode, mediumNode]
    config.minNodes = [0, 0, 0]
    config.maxNodes = [10, 10]  # test expansion of this list

    # Algorithm parameters
    config.targetTime = defaultTargetTime
    config.betaInertia = 0.1
    config.scaleInterval = 3

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    mock.start()

    try:
        # Add small and medium jobs
        list(map(lambda x: mock.addJob(jobShape=smallNode), list(range(numJobs))))
        list(map(lambda x: mock.addJob(jobShape=mediumNode), list(range(numJobs))))

        # Add medium completed jobs
        for i in range(1000):
            iJ = JobNode(jobStoreID=1,
                         requirements=dict(memory=random.choice(range(smallNode.memory,
                                                                      mediumNode.memory)),
                                           cores=mediumNode.cores,
                                           disk=largeNode.cores,
                                           preemptable=False),
                         command=None,
                         jobName='testClusterScaling', unitName='')
            clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

        while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
            logger.debug("%i nodes currently provisioned" % mock.getNumberOfNodes())
            # Make sure there are no large nodes
            self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
            clusterScaler.check()
            time.sleep(0.5)
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Make sure jobs ran on both the small and medium node types
    self.assertTrue(mock.totalJobs > 0)
    self.assertTrue(mock.maxWorkers[smallNode] > 0)
    self.assertTrue(mock.maxWorkers[mediumNode] > 0)

    self.assertEqual(mock.maxWorkers[largeNode], 0)
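# For orientation, a hedged sketch of the Shape value type used above, with the
# field order assumed from the positional calls (the real class presumably also
# defines equality and hashing, so Shapes can key dicts like mock.maxWorkers;
# a namedtuple gets both for free):
from collections import namedtuple

Shape = namedtuple('Shape', ['wallTime', 'memory', 'cores', 'disk', 'preemptable'])

# e.g. smallNode = Shape(wallTime=20, memory=5, cores=10, disk=10, preemptable=False)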
def _testClusterScaling(self, config, numJobs, numPreemptableJobs):
    """
    Test the ClusterScaler class with different patterns of job creation.
    Tests ascertain that autoscaling occurs and that all the jobs are run.
    """
    # First do a simple test of creating 100 preemptable and non-preemptable
    # jobs and check that the jobs are completed okay, then print the amount of
    # worker time expended and the total number of worker nodes used.

    logger.info("Creating dummy batch system and scaler")
    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    clusterScaler = ClusterScaler(mock, mock, config)
    clusterScaler.start()
    try:
        # Add 100 jobs to complete
        logger.info("Creating test jobs")
        map(lambda x: mock.addJob(), range(numJobs))
        map(lambda x: mock.addJob(preemptable=True), range(numPreemptableJobs))

        # Add some completed jobs
        for preemptable in (True, False):
            if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                # Add 1000 random jobs
                for i in xrange(1000):
                    x = mock.getNodeShape(preemptable)
                    iJ = JobNode(jobStoreID=1,
                                 requirements=dict(memory=random.choice(range(1, x.memory)),
                                                   cores=random.choice(range(1, x.cores)),
                                                   disk=random.choice(range(1, x.disk)),
                                                   preemptable=preemptable),
                                 command=None,
                                 jobName='testClusterScaling', unitName='')
                    clusterScaler.addCompletedJob(iJ, random.choice(range(1, x.wallTime)))

        logger.info("Waiting for jobs to be processed")
        startTime = time.time()
        # Wait while the cluster processes the jobs
        while (mock.getNumberOfJobsIssued(preemptable=False) > 0
               or mock.getNumberOfJobsIssued(preemptable=True) > 0
               or mock.getNumberOfNodes() > 0
               or mock.getNumberOfNodes(preemptable=True) > 0):
            logger.info("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                        "preemptable queue size: %s, preemptable workers: %s" %
                        (mock.getNumberOfJobsIssued(preemptable=False),
                         mock.getNumberOfNodes(preemptable=False),
                         mock.getNumberOfJobsIssued(preemptable=True),
                         mock.getNumberOfNodes(preemptable=True)))
            clusterScaler.check()
            time.sleep(0.5)
        logger.info("We waited %s for cluster to finish" % (time.time() - startTime))
    finally:
        clusterScaler.shutdown()

    # Print some info about the autoscaling
    for i, bs in enumerate(mock.delegates):
        preemptable = bool(i)
        logger.info("Preemptable: %s, Total-jobs: %s: Max-workers: %s,"
                    " Total-worker-time: %s, Worker-time-per-job: %s" %
                    (preemptable, bs.totalJobs, bs.maxWorkers, bs.totalWorkerTime,
                     bs.totalWorkerTime / bs.totalJobs if bs.totalJobs > 0 else 0.0))
def test(self):
    """
    This is a front-to-back test of the "happy" path in a job store, i.e. covering things
    that occur in the day-to-day life of a job store. The purist might insist that this be
    split up into several cases, and I agree wholeheartedly.
    """
    master = self.master

    # Test initial state
    #
    self.assertFalse(master.exists('foo'))
    self.assertRaises(NoSuchJobException, master.load, 'foo')

    # Create parent job and verify its existence/properties
    #
    masterRequirements = dict(memory=12, cores=34, disk=35, preemptable=True)
    jobNodeOnMaster = JobNode(command='master1',
                              requirements=masterRequirements,
                              jobName='test1', unitName='onMaster',
                              jobStoreID=None, predecessorNumber=0)
    jobOnMaster = master.create(jobNodeOnMaster)
    self.assertTrue(master.exists(jobOnMaster.jobStoreID))
    self.assertEquals(jobOnMaster.command, 'master1')
    self.assertEquals(jobOnMaster.memory, masterRequirements['memory'])
    self.assertEquals(jobOnMaster.cores, masterRequirements['cores'])
    self.assertEquals(jobOnMaster.disk, masterRequirements['disk'])
    self.assertEquals(jobOnMaster.preemptable, masterRequirements['preemptable'])
    self.assertEquals(jobOnMaster.jobName, 'test1')
    self.assertEquals(jobOnMaster.unitName, 'onMaster')
    self.assertEquals(jobOnMaster.stack, [])
    self.assertEquals(jobOnMaster.predecessorNumber, 0)
    self.assertEquals(jobOnMaster.predecessorsFinished, set())
    self.assertEquals(jobOnMaster.logJobStoreFileID, None)

    # Create a second instance of the job store, simulating a worker ...
    #
    worker = self._createJobStore()
    worker.resume()
    self.assertEquals(worker.config, self.config)
    self.assertIsNot(worker.config, self.config)
    # ... and load the parent job there.
    jobOnWorker = worker.load(jobOnMaster.jobStoreID)
    self.assertEquals(jobOnMaster, jobOnWorker)

    # Update state on job
    #
    # The following demonstrates the job update pattern, where files to be deleted are
    # referenced in the "filesToDelete" array, which is persisted to disk first.
    # If things go wrong during the update, this list of files to delete is used to
    # remove the unneeded files.
    jobOnWorker.filesToDelete = ['1', '2']
    worker.update(jobOnWorker)
    # Check that the jobs to delete are persisted
    self.assertEquals(master.load(jobOnWorker.jobStoreID).filesToDelete, ['1', '2'])
    # Create children
    childRequirements1 = dict(memory=23, cores=45, disk=46, preemptable=True)
    jobNodeOnChild1 = JobNode(command='child1',
                              requirements=childRequirements1,
                              jobName='test2', unitName='onChild1',
                              jobStoreID=None)
    childRequirements2 = dict(memory=34, cores=56, disk=57, preemptable=False)
    jobNodeOnChild2 = JobNode(command='master1',
                              requirements=childRequirements2,
                              jobName='test3', unitName='onChild2',
                              jobStoreID=None)
    child1 = worker.create(jobNodeOnChild1)
    child2 = worker.create(jobNodeOnChild2)
    # Update parent
    jobOnWorker.stack.append((child1, child2))
    jobOnWorker.filesToDelete = []
    worker.update(jobOnWorker)

    # Check equivalence between master and worker
    #
    self.assertNotEquals(jobOnWorker, jobOnMaster)
    # Reload the parent job on the master
    jobOnMaster = master.load(jobOnMaster.jobStoreID)
    self.assertEquals(jobOnWorker, jobOnMaster)
    # Load the children on the master and check equivalence
    self.assertEquals(master.load(child1.jobStoreID), child1)
    self.assertEquals(master.load(child2.jobStoreID), child2)

    # Test changing and persisting job state across multiple jobs
    #
    childJobs = [worker.load(childNode.jobStoreID) for childNode in jobOnMaster.stack[-1]]
    for childJob in childJobs:
        childJob.logJobStoreFileID = str(uuid.uuid4())
        childJob.remainingRetryCount = 66
        self.assertNotEquals(childJob, master.load(childJob.jobStoreID))
    for childJob in childJobs:
        worker.update(childJob)
    for childJob in childJobs:
        self.assertEquals(master.load(childJob.jobStoreID), childJob)
        self.assertEquals(worker.load(childJob.jobStoreID), childJob)

    # Test the job iterator - the results of the iterator are affected by eventual
    # consistency. We cannot guarantee that all jobs will appear, but we can assert
    # that all jobs that show up are a subset of all existing jobs. If we had deleted
    # jobs before this we would have to worry about ghost jobs appearing, and this
    # assertion would not be valid.
    self.assertTrue(set(childJobs + [jobOnMaster]) >= set(worker.jobs()))
    self.assertTrue(set(childJobs + [jobOnMaster]) >= set(master.jobs()))

    # Test job deletions
    #
    # First delete the parent; this should have no effect on the children
    self.assertTrue(master.exists(jobOnMaster.jobStoreID))
    self.assertTrue(worker.exists(jobOnMaster.jobStoreID))
    master.delete(jobOnMaster.jobStoreID)
    self.assertFalse(master.exists(jobOnMaster.jobStoreID))
    self.assertFalse(worker.exists(jobOnMaster.jobStoreID))

    for childJob in childJobs:
        self.assertTrue(master.exists(childJob.jobStoreID))
        self.assertTrue(worker.exists(childJob.jobStoreID))
        master.delete(childJob.jobStoreID)
        self.assertFalse(master.exists(childJob.jobStoreID))
        self.assertFalse(worker.exists(childJob.jobStoreID))
        self.assertRaises(NoSuchJobException, worker.load, childJob.jobStoreID)
        self.assertRaises(NoSuchJobException, master.load, childJob.jobStoreID)

    try:
        with master.readSharedFileStream('missing') as _:
            pass
        self.fail('Expecting NoSuchFileException')
    except NoSuchFileException:
        pass

    # Test shared files: write a shared file on the master, ...
    #
    with master.writeSharedFileStream('foo') as f:
        f.write('bar')
    # ... read that file on the worker, ...
    with worker.readSharedFileStream('foo') as f:
        self.assertEquals('bar', f.read())
    # ... and read it again on the master.
    with master.readSharedFileStream('foo') as f:
        self.assertEquals('bar', f.read())

    with master.writeSharedFileStream('nonEncrypted', isProtected=False) as f:
        f.write('bar')
    self.assertUrl(master.getSharedPublicUrl('nonEncrypted'))
    self.assertRaises(NoSuchFileException, master.getSharedPublicUrl, 'missing')

    # Test per-job files: create an empty file on the master, ...
    #
    # First recreate the parent job
    jobOnMaster = master.create(jobNodeOnMaster)
    fileOne = worker.getEmptyFileStoreID(jobOnMaster.jobStoreID)
    # Check that the file exists
    self.assertTrue(worker.fileExists(fileOne))
    self.assertTrue(master.fileExists(fileOne))
    # ... write to the file on the worker, ...
    with worker.updateFileStream(fileOne) as f:
        f.write('one')
    # ... read the file as a stream on the master, ...
    with master.readFileStream(fileOne) as f:
        self.assertEquals(f.read(), 'one')

    # ... and copy it to a temporary physical file on the master.
    fh, path = tempfile.mkstemp()
    try:
        os.close(fh)
        tmpPath = path + '.read-only'
        master.readFile(fileOne, tmpPath)
        try:
            shutil.copyfile(tmpPath, path)
        finally:
            os.unlink(tmpPath)
        with open(path, 'r+') as f:
            self.assertEquals(f.read(), 'one')
            # Write a different string to the local file ...
            f.seek(0)
            f.truncate(0)
            f.write('two')
        # ... and create a second file from the local file.
        fileTwo = master.writeFile(path, jobOnMaster.jobStoreID)
        with worker.readFileStream(fileTwo) as f:
            self.assertEquals(f.read(), 'two')
        # Now update the first file from the local file ...
        master.updateFile(fileOne, path)
        with worker.readFileStream(fileOne) as f:
            self.assertEquals(f.read(), 'two')
    finally:
        os.unlink(path)

    # Create a third file to test the last remaining method.
    with worker.writeFileStream(jobOnMaster.jobStoreID) as (f, fileThree):
        f.write('three')
    with master.readFileStream(fileThree) as f:
        self.assertEquals(f.read(), 'three')
    # Delete a file explicitly but leave files for the implicit deletion through the parent
    worker.deleteFile(fileOne)

    # Check that the file is gone
    #
    for store in worker, master:
        self.assertFalse(store.fileExists(fileOne))
        self.assertRaises(NoSuchFileException, store.readFile, fileOne, '')
        try:
            with store.readFileStream(fileOne) as _:
                pass
            self.fail('Expecting NoSuchFileException')
        except NoSuchFileException:
            pass

    # Test stats and logging
    #
    stats = None

    def callback(f2):
        stats.add(f2.read())

    stats = set()
    self.assertEquals(0, master.readStatsAndLogging(callback))
    self.assertEquals(set(), stats)
    worker.writeStatsAndLogging('1')
    self.assertEquals(1, master.readStatsAndLogging(callback))
    self.assertEquals({'1'}, stats)
    self.assertEquals(0, master.readStatsAndLogging(callback))
    worker.writeStatsAndLogging('1')
    worker.writeStatsAndLogging('2')
    stats = set()
    self.assertEquals(2, master.readStatsAndLogging(callback))
    self.assertEquals({'1', '2'}, stats)
    largeLogEntry = os.urandom(self._largeLogEntrySize())
    stats = set()
    worker.writeStatsAndLogging(largeLogEntry)
    self.assertEquals(1, master.readStatsAndLogging(callback))
    self.assertEquals({largeLogEntry}, stats)
    # Test the readAll parameter
    self.assertEqual(4, master.readStatsAndLogging(callback, readAll=True))

    # Delete parent
    #
    master.delete(jobOnMaster.jobStoreID)
    self.assertFalse(master.exists(jobOnMaster.jobStoreID))
def innerLoop(self):
    """
    The main loop for processing jobs by the leader.
    """
    # Sets up the timing of the jobGraph rescuing method
    timeSinceJobsLastRescued = time.time()

    logger.info("Starting the main loop")
    while True:
        # Process jobs that are ready to be scheduled/have successors to schedule
        if len(self.toilState.updatedJobs) > 0:
            logger.debug('Built the jobs list, currently have %i jobs to update and %i jobs issued',
                         len(self.toilState.updatedJobs), self.getNumberOfJobsIssued())

            updatedJobs = self.toilState.updatedJobs  # The updated jobs to consider below
            self.toilState.updatedJobs = set()  # Reset the set for the next iteration

            for jobGraph, resultStatus in updatedJobs:
                logger.debug('Updating status of job %s with ID %s: with result status: %s',
                             jobGraph, jobGraph.jobStoreID, resultStatus)

                # This stops a job with services being issued by the serviceManager from
                # being considered further in this loop. This catch is necessary because
                # the job's services can fail while being issued, causing the job to be
                # added to updated jobs.
                if jobGraph in self.serviceManager.jobGraphsWithServicesBeingStarted:
                    logger.debug("Got a job to update which is still owned by the service "
                                 "manager: %s", jobGraph.jobStoreID)
                    continue

                # If some of the job's successors failed then either fail the job
                # or restart it if it has retries left and is a checkpoint job
                if jobGraph.jobStoreID in self.toilState.hasFailedSuccessors:

                    # If the job has services running, signal for them to be killed.
                    # Once they are killed, the jobGraph will be re-added to the
                    # updatedJobs set and then scheduled to be removed.
                    if jobGraph.jobStoreID in self.toilState.servicesIssued:
                        logger.debug("Telling job: %s to terminate its services due to successor failure",
                                     jobGraph.jobStoreID)
                        self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID],
                                                         error=True)

                    # If the job has non-service jobs running, wait for them to finish;
                    # the job will be re-added to the updated jobs when these jobs are done
                    elif jobGraph.jobStoreID in self.toilState.successorCounts:
                        logger.debug("Job %s with ID: %s with failed successors still has successor jobs running",
                                     jobGraph, jobGraph.jobStoreID)
                        continue

                    # If the job is a checkpoint and has remaining retries then reissue it.
                    # The logic behind using > 1 rather than > 0 here: since this job has
                    # been tried once (without decreasing its retry count, as the job
                    # itself was successful) and its subtree failed, it shouldn't be
                    # retried unless it has more than one try left.
                    elif jobGraph.checkpoint is not None and jobGraph.remainingRetryCount > 1:
                        logger.warn('Job: %s is being restarted as a checkpoint after the total '
                                    'failure of jobs in its subtree.', jobGraph.jobStoreID)
                        self.issueJob(JobNode.fromJobGraph(jobGraph))

                    else:
                        # Mark it totally failed
                        logger.debug("Job %s is being processed as completely failed",
                                     jobGraph.jobStoreID)
                        self.processTotallyFailedJob(jobGraph)

                # If the jobGraph has a command it must be run before any successors.
                # Similarly, if the job previously failed we rerun it, even if it doesn't have a
                # command to run, to eliminate any parts of the stack now completed.
                elif jobGraph.command is not None or resultStatus != 0:
                    isServiceJob = jobGraph.jobStoreID in self.toilState.serviceJobStoreIDToPredecessorJob

                    # If the job has run out of retries or is a service job whose error flag has
                    # been indicated, fail the job.
                    if (jobGraph.remainingRetryCount == 0
                            or isServiceJob and not self.jobStore.fileExists(jobGraph.errorJobStoreID)):
                        self.processTotallyFailedJob(jobGraph)
                        logger.warn("Job %s with ID %s is completely failed",
                                    jobGraph, jobGraph.jobStoreID)
                    else:
                        # Otherwise try the job again
                        self.issueJob(JobNode.fromJobGraph(jobGraph))

                # If the job has services to run which have not been started, start them
                elif len(jobGraph.services) > 0:
                    # Build a map from the service jobs to the job and a map
                    # of the services created for the job
                    assert jobGraph.jobStoreID not in self.toilState.servicesIssued
                    self.toilState.servicesIssued[jobGraph.jobStoreID] = {}
                    for serviceJobList in jobGraph.services:
                        for serviceTuple in serviceJobList:
                            serviceID = serviceTuple.jobStoreID
                            assert serviceID not in self.toilState.serviceJobStoreIDToPredecessorJob
                            self.toilState.serviceJobStoreIDToPredecessorJob[serviceID] = jobGraph
                            self.toilState.servicesIssued[jobGraph.jobStoreID][serviceID] = serviceTuple

                    # Use the service manager to start the services
                    self.serviceManager.scheduleServices(jobGraph)

                    logger.debug("Giving job: %s to service manager to schedule its jobs",
                                 jobGraph.jobStoreID)

                # There exist successors to run
                elif len(jobGraph.stack) > 0:
                    assert len(jobGraph.stack[-1]) > 0
                    logger.debug("Job: %s has %i successors to schedule",
                                 jobGraph.jobStoreID, len(jobGraph.stack[-1]))

                    # Record the number of successors that must be completed before
                    # the jobGraph can be considered again
                    assert jobGraph.jobStoreID not in self.toilState.successorCounts
                    self.toilState.successorCounts[jobGraph.jobStoreID] = len(jobGraph.stack[-1])

                    # List of successors to schedule
                    successors = []

                    # For each successor, schedule it if all of its predecessors have completed
                    for jobNode in jobGraph.stack[-1]:
                        successorJobStoreID = jobNode.jobStoreID

                        # Build a map from successor to predecessors.
                        if successorJobStoreID not in self.toilState.successorJobStoreIDToPredecessorJobs:
                            self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID] = []
                        self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID].append(jobGraph)

                        # Case that the successor has multiple predecessors
                        if jobNode.predecessorNumber > 1:
                            logger.debug("Successor job: %s of job: %s has multiple "
                                         "predecessors", jobNode, jobGraph)

                            # Get the successor job, using a cache
                            # (if the successor job has already been seen it will be in this cache,
                            # but otherwise put it in the cache)
                            if successorJobStoreID not in self.toilState.jobsToBeScheduledWithMultiplePredecessors:
                                self.toilState.jobsToBeScheduledWithMultiplePredecessors[successorJobStoreID] = \
                                    self.jobStore.load(successorJobStoreID)
                            successorJobGraph = self.toilState.jobsToBeScheduledWithMultiplePredecessors[successorJobStoreID]

                            # Add the jobGraph as a finished predecessor to the successor
                            successorJobGraph.predecessorsFinished.add(jobGraph.jobStoreID)

                            # If the successor is in the set of successors of failed jobs
                            if successorJobStoreID in self.toilState.failedSuccessors:
                                logger.debug("Successor job: %s of job: %s has failed "
                                             "predecessors", jobNode, jobGraph)

                                # Add the job to the set having failed successors
                                self.toilState.hasFailedSuccessors.add(jobGraph.jobStoreID)

                                # Reduce the active successor count and remove the successor
                                # as an active successor of the job
                                self.toilState.successorCounts[jobGraph.jobStoreID] -= 1
                                assert self.toilState.successorCounts[jobGraph.jobStoreID] >= 0
                                self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID].remove(jobGraph)
                                if len(self.toilState.successorJobStoreIDToPredecessorJobs[successorJobStoreID]) == 0:
                                    self.toilState.successorJobStoreIDToPredecessorJobs.pop(successorJobStoreID)

                                # If the job now has no active successors, add it to the active
                                # jobs so it can be processed as a job with failed successors
                                if self.toilState.successorCounts[jobGraph.jobStoreID] == 0:
                                    logger.debug("Job: %s has no successors to run "
                                                 "and some are failed, adding to list of jobs "
                                                 "with failed successors", jobGraph)
                                    self.toilState.successorCounts.pop(jobGraph.jobStoreID)
                                    self.toilState.updatedJobs.add((jobGraph, 0))
                                    continue

                            # If the successor job's predecessors have not all completed then
                            # ignore the successor, as it is not yet ready to run
                            assert len(successorJobGraph.predecessorsFinished) <= successorJobGraph.predecessorNumber
                            if len(successorJobGraph.predecessorsFinished) < successorJobGraph.predecessorNumber:
                                continue
                            else:
                                # Remove the successor job from the cache
                                self.toilState.jobsToBeScheduledWithMultiplePredecessors.pop(successorJobStoreID)

                        # Add the successor to the list of successors to schedule
                        successors.append(jobNode)
                    self.issueJobs(successors)

                elif jobGraph.jobStoreID in self.toilState.servicesIssued:
                    logger.debug("Telling job: %s to terminate its services due to the "
                                 "successful completion of its successor jobs", jobGraph)
                    self.serviceManager.killServices(self.toilState.servicesIssued[jobGraph.jobStoreID],
                                                     error=False)

                # There are no remaining tasks to schedule within the jobGraph, but
                # we schedule it anyway to allow it to be deleted.
                # TODO: An alternative would be to simply delete it here and add it to the
                # list of jobs to process, or (better) to create an asynchronous
                # process that deletes jobs and then feeds them back into the set
                # of jobs to be processed
                else:
                    # Remove the job
                    if jobGraph.remainingRetryCount > 0:
                        self.issueJob(JobNode.fromJobGraph(jobGraph))
                        logger.debug("Job: %s is empty, we are scheduling to clean it up",
                                     jobGraph.jobStoreID)
                    else:
                        self.processTotallyFailedJob(jobGraph)
                        logger.warn("Job: %s is empty but completely failed - something is very wrong",
                                    jobGraph.jobStoreID)

        # Start any service jobs available from the service manager
        self.issueQueingServiceJobs()
        while True:
            serviceJob = self.serviceManager.getServiceJobsToStart(0)
            # Stop trying to get jobs when the function returns None
            if serviceJob is None:
                break
            logger.debug('Launching service job: %s', serviceJob)
            self.issueServiceJob(serviceJob)

        # Get jobs whose services have started
        while True:
            jobGraph = self.serviceManager.getJobGraphWhoseServicesAreRunning(0)
            if jobGraph is None:
                # Stop trying to get jobs when the function returns None
                break
            logger.debug('Job: %s has established its services.', jobGraph.jobStoreID)
            jobGraph.services = []
            self.toilState.updatedJobs.add((jobGraph, 0))

        # Gather any new, updated jobGraph from the batch system
        updatedJobTuple = self.batchSystem.getUpdatedBatchJob(2)
        if updatedJobTuple is not None:
            jobID, result, wallTime = updatedJobTuple
            # easy, track different state
            try:
                updatedJob = self.jobBatchSystemIDToIssuedJob[jobID]
            except KeyError:
                logger.warn("A result seems to already have been processed "
                            "for job %s", jobID)
            else:
                if result == 0:
                    cur_logger = (logger.debug if str(updatedJob.jobName).startswith(CWL_INTERNAL_JOBS)
                                  else logger.info)
                    cur_logger('Job ended successfully: %s', updatedJob)
                    if self.toilMetrics:
                        self.toilMetrics.logCompletedJob(updatedJob)
                else:
                    logger.warn('Job failed with exit value %i: %s', result, updatedJob)
                self.processFinishedJob(jobID, result, wallTime=wallTime)
        else:
            # Process jobs that have gone awry.
            # In the case that there is nothing happening (no updated jobs to
            # gather for 10 seconds), check if there are any jobs that have run
            # too long (see self.reissueOverLongJobs) or which have gone missing
            # from the batch system (see self.reissueMissingJobs).
            if (time.time() - timeSinceJobsLastRescued >= self.config.rescueJobsFrequency):
                # We only rescue jobs every N seconds, and when we have
                # apparently exhausted the current jobGraph supply
                self.reissueOverLongJobs()
                logger.info("Reissued any over long jobs")

                hasNoMissingJobs = self.reissueMissingJobs()
                if hasNoMissingJobs:
                    timeSinceJobsLastRescued = time.time()
                else:
                    # This means we'll try again in a minute, providing things are quiet
                    timeSinceJobsLastRescued += 60
                logger.info("Rescued any (long) missing jobs")

        # Check on the associated threads and exit if a failure is detected
        self.statsAndLogging.check()
        self.serviceManager.check()
        # The cluster scaler object will only be instantiated if autoscaling is enabled
        if self.clusterScaler is not None:
            self.clusterScaler.check()

        # The exit criterion
        if (len(self.toilState.updatedJobs) == 0 and self.getNumberOfJobsIssued() == 0
                and self.serviceManager.jobsIssuedToServiceManager == 0):
            logger.info("No jobs left to run so exiting.")
            break

        # Check for deadlocks
        self.checkForDeadlocks()

    logger.info("Finished the main loop")

    # Consistency-check the Toil state
    assert self.toilState.updatedJobs == set()
    assert self.toilState.successorCounts == {}
    assert self.toilState.successorJobStoreIDToPredecessorJobs == {}
    assert self.toilState.serviceJobStoreIDToPredecessorJob == {}
    assert self.toilState.servicesIssued == {}