def testBatchResourceLimits(self):
    jobDesc1 = JobDescription(command="sleep 1000",
                              requirements=dict(memory=1 << 30, cores=1,
                                                disk=1000, preemptable=preemptable),
                              jobName='testResourceLimits')
    job1 = self.batchSystem.issueBatchJob(jobDesc1)
    self.assertIsNotNone(job1)
    jobDesc2 = JobDescription(command="sleep 1000",
                              requirements=dict(memory=2 << 30, cores=1,
                                                disk=1000, preemptable=preemptable),
                              jobName='testResourceLimits')
    job2 = self.batchSystem.issueBatchJob(jobDesc2)
    self.assertIsNotNone(job2)

    batches = self._getBatchList()
    self.assertEqual(len(batches), 2)
    # It would be better to directly check that the batches have the correct
    # memory and cpu values, but Parasol seems to slightly change the values
    # sometimes.
    self.assertNotEqual(batches[0]['ram'], batches[1]['ram'])

    # Need to kill one of the jobs because there are only two cores available
    self.batchSystem.killBatchJobs([job2])
    job3 = self.batchSystem.issueBatchJob(jobDesc1)
    self.assertIsNotNone(job3)
    batches = self._getBatchList()
    self.assertEqual(len(batches), 1)
def _mockJobDescription(self, jobStoreID=None, command=None, **kwargs):
    """
    Create a mock-up JobDescription with the given ID, command, and other parameters.
    """
    # TODO: Use a real unittest.Mock? For now we make a real instance and just hack it up.
    desc = JobDescription(**kwargs)
    # Normally we can't pass in a command or ID, and the job serialization
    # logic takes care of filling them in. We set them here.
    if command is not None:
        desc.command = command
    if jobStoreID is not None:
        desc.jobStoreID = jobStoreID
    return desc
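# A minimal usage sketch for _mockJobDescription above, as it might appear
# inside a test method; the ID, command, and requirement values here are
# illustrative, not taken from a real workflow:
#
#     desc = self._mockJobDescription(
#         jobStoreID='fake/jobStoreID',
#         command='sleep 1',
#         requirements=dict(memory=1, cores=1, disk=1, preemptable=False),
#         jobName='mockedJob')
#     self.assertEqual(desc.command, 'sleep 1')
#     self.assertEqual(desc.jobStoreID, 'fake/jobStoreID')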
def testJobDescription(self):
    """
    Tests the public interface of a JobDescription.
    """
    command = "by your command"
    memory = 2 ** 32  # 4 GiB; note that 2^32 would be XOR in Python, not exponentiation
    disk = 2 ** 32
    cores = "1"
    preemptable = 1
    j = JobDescription(command=command,
                       requirements={"memory": memory, "cores": cores,
                                     "disk": disk, "preemptable": preemptable},
                       jobName='testJobGraph', unitName='noName')

    # Check attributes
    self.assertEqual(j.command, command)
    self.assertEqual(j.memory, memory)
    self.assertEqual(j.disk, disk)
    self.assertEqual(j.cores, int(cores))
    self.assertEqual(j.preemptable, bool(preemptable))
    self.assertEqual(type(j.jobStoreID), TemporaryID)
    self.assertEqual(list(j.successorsAndServiceHosts()), [])
    self.assertEqual(list(j.allSuccessors()), [])
    self.assertEqual(list(j.serviceHostIDsInBatches()), [])
    self.assertEqual(list(j.services), [])
    self.assertEqual(list(j.nextSuccessors()), [])
    self.assertEqual(sum(len(level) for level in j.stack), 0)
    self.assertEqual(j.predecessorsFinished, set())
    self.assertEqual(j.logJobStoreFileID, None)

    # Check equals function (should be based on object identity and not contents)
    j2 = JobDescription(command=command,
                        requirements={"memory": memory, "cores": cores,
                                      "disk": disk, "preemptable": preemptable},
                        jobName='testJobGraph', unitName='noName')
    self.assertNotEqual(j, j2)
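# Because JobDescription equality is identity-based (as the final assertion
# above checks), two descriptions with identical contents remain distinct,
# e.g. as dictionary keys. A hedged sketch, assuming the default object
# hashing applies and reusing j and j2 from the test above:
#
#     issued = {j: 'queued', j2: 'queued'}
#     assert len(issued) == 2  # both kept, despite equal contents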
def addJob(self, jobShape, preemptable=False):
    """
    Add a job to the job queue
    """
    self.totalJobs += 1
    jobID = uuid.uuid4()
    self.jobBatchSystemIDToIssuedJob[jobID] = JobDescription(
        requirements={"memory": jobShape.memory,
                      "cores": jobShape.cores,
                      "disk": jobShape.disk,
                      "preemptable": preemptable},
        jobName='job{}'.format(self.totalJobs))
    self.jobQueue.put(jobID)
def test(self):
    # We'll use fractions to avoid rounding errors. Remember that not every
    # fraction can be represented as a floating point number.
    F = Fraction
    # This test isn't general enough to cover every possible value of minCores
    # in SingleMachineBatchSystem. Instead we hard-code a value and assert it.
    minCores = F(1, 10)
    self.assertEqual(float(minCores), SingleMachineBatchSystem.minCores)
    for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
        for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
            for load in (F(1, 10), F(1), F(10)):
                jobs = int(maxCores / coresPerJob * load)
                if jobs >= 1 and minCores <= coresPerJob < maxCores:
                    self.assertEqual(maxCores, float(maxCores))
                    bs = SingleMachineBatchSystem(
                        config=hidden.AbstractBatchSystemTest.createConfig(),
                        maxCores=float(maxCores),
                        # Ensure that memory or disk requirements don't get in the way.
                        maxMemory=jobs * 10,
                        maxDisk=jobs * 10)
                    try:
                        jobIds = set()
                        for i in range(0, int(jobs)):
                            jobIds.add(bs.issueBatchJob(JobDescription(
                                command=self.scriptCommand(),
                                requirements=dict(cores=float(coresPerJob),
                                                  memory=1, disk=1,
                                                  preemptable=preemptable),
                                jobName=str(i), unitName='')))
                        self.assertEqual(len(jobIds), jobs)
                        while jobIds:
                            job = bs.getUpdatedBatchJob(maxWait=10)
                            self.assertIsNotNone(job)
                            jobId, status, wallTime = job.jobID, job.exitStatus, job.wallTime
                            self.assertEqual(status, 0)
                            # would raise KeyError on absence
                            jobIds.remove(jobId)
                    finally:
                        bs.shutdown()
                    concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                    self.assertEqual(concurrentTasks, 0)
                    logger.info('maxCores: {maxCores}, '
                                'coresPerJob: {coresPerJob}, '
                                'load: {load}'.format(**locals()))
                    # This is the key assertion:
                    expectedMaxConcurrentTasks = min(maxCores // coresPerJob, jobs)
                    self.assertEqual(maxConcurrentTasks, expectedMaxConcurrentTasks)
                    resetCounters(self.counterPath)
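# Why the test above uses Fraction: 0.1 is not exactly representable in
# binary floating point, so accumulating float core counts can drift, while
# Fractions stay exact. A self-contained illustration:
from fractions import Fraction

assert Fraction(1, 10) * 10 == 1   # exact rational arithmetic
assert sum([0.1] * 10) != 1.0      # float accumulation drifts (0.9999...)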
def testJobDescriptionSequencing(self):
    j = JobDescription(command='command', requirements={}, jobName='unimportant')
    j.addChild('child')
    j.addFollowOn('followOn')

    # With a command, nothing should be ready to run
    self.assertEqual(list(j.nextSuccessors()), [])

    # With command cleared, child should be ready to run
    j.command = None
    self.assertEqual(list(j.nextSuccessors()), ['child'])

    # Without the child, the follow-on should be ready to run
    j.filterSuccessors(lambda jID: jID != 'child')
    self.assertEqual(list(j.nextSuccessors()), ['followOn'])

    # Without the follow-on, we should return None, to be distinct from an
    # empty list. Nothing left to do!
    j.filterSuccessors(lambda jID: jID != 'followOn')
    self.assertEqual(j.nextSuccessors(), None)
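# A minimal sketch of how a consumer distinguishes the three nextSuccessors()
# results exercised above; 'desc' stands in for any JobDescription:
#
#     ready = desc.nextSuccessors()
#     if ready is None:
#         pass  # no successors left at all: the job's subtree is complete
#     elif len(ready) == 0:
#         pass  # blocked: the job still has a command to run first
#     else:
#         pass  # these successor IDs are ready to be issued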
def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore,
                  config: Config) -> Optional[JobDescription]:
    """
    Returns the next chainable job's JobDescription after the given predecessor
    JobDescription, if one exists, or None if the chain must terminate.

    :param predecessor: The job to chain from
    :param jobStore: The JobStore to fetch JobDescriptions from.
    :param config: The configuration for the current run.
    """
    # If no more jobs to run or services not finished, quit
    if len(predecessor.stack) == 0 or len(predecessor.services) > 0 or (
            isinstance(predecessor, CheckpointJobDescription) and
            predecessor.checkpoint is not None):
        logger.debug(
            "Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
            len(predecessor.stack), len(predecessor.services),
            (isinstance(predecessor, CheckpointJobDescription) and
             predecessor.checkpoint is not None))
        return None

    if len(predecessor.stack) > 1 and len(predecessor.stack[-1]) > 0 and len(predecessor.stack[-2]) > 0:
        # TODO: Without a real stack list we can freely mutate, we can't chain
        # to a child, which may branch, and then go back and do the follow-ons
        # of the original job.
        # TODO: Go back to a free-form stack list and require some kind of
        # stack build phase?
        logger.debug("Stopping running chain of jobs because job has both children and follow-ons")
        return None

    # Get the next set of jobs to run
    jobs = predecessor.nextSuccessors()
    if jobs is None or len(jobs) == 0:
        # If there are no jobs, we might just not have any children.
        logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
        return None

    # If there are 2 or more jobs to run in parallel we quit
    if len(jobs) >= 2:
        logger.debug("No more jobs can run in series by this worker,"
                     " it's got %i children", len(jobs) - 1)
        return None

    # Grab the only job that should be there.
    successorID = next(iter(jobs))

    # Load the successor JobDescription
    successor = jobStore.load(successorID)

    # We check the requirements of the successor to see if we can run it
    # within the current worker
    if successor.memory > predecessor.memory:
        logger.debug("We need more memory for the next job, so finishing")
        return None
    if successor.cores > predecessor.cores:
        logger.debug("We need more cores for the next job, so finishing")
        return None
    if successor.disk > predecessor.disk:
        logger.debug("We need more disk for the next job, so finishing")
        return None
    if successor.preemptable != predecessor.preemptable:
        logger.debug("Preemptability is different for the next job, returning to the leader")
        return None
    if successor.predecessorNumber > 1:
        logger.debug("The next job has multiple predecessors; we must return to the leader.")
        return None
    if len(successor.services) > 0:
        logger.debug("The next job requires services that will not yet be started; we must return to the leader.")
        return None
    if isinstance(successor, CheckpointJobDescription):
        # Check if job is a checkpoint job and quit if so
        logger.debug("Next job is checkpoint, so finishing")
        return None

    # Made it through! This job is chainable.
    return successor
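# A minimal sketch (not the real worker loop) of how nextChainable() drives
# in-process chaining; run_job() and initial_job are hypothetical stand-ins
# for the worker's own job-execution machinery:
#
#     job: Optional[JobDescription] = initial_job
#     while job is not None:
#         run_job(job)                                # hypothetical execution step
#         job = nextChainable(job, jobStore, config)  # None ends the chain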
def testClusterScalingMultipleNodeTypes(self):
    smallNode = Shape(20, 5, 10, 10, False)
    mediumNode = Shape(20, 10, 10, 10, False)
    largeNode = Shape(20, 20, 10, 10, False)

    numJobs = 100

    config = Config()

    # Make defaults dummy values
    config.defaultMemory = 1
    config.defaultCores = 1
    config.defaultDisk = 1

    # No preemptable nodes/jobs
    config.preemptableNodeTypes = []
    config.minPreemptableNodes = []
    config.maxPreemptableNodes = []

    # Make sure the node types don't have to be ordered
    config.nodeTypes = [largeNode, smallNode, mediumNode]
    config.minNodes = [0, 0, 0]
    config.maxNodes = [10, 10]  # test expansion of this list

    # Algorithm parameters
    config.targetTime = defaultTargetTime
    config.betaInertia = 0.1
    config.scaleInterval = 3

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    mock.start()

    try:
        # Add small and medium jobs
        list(map(lambda x: mock.addJob(jobShape=smallNode), list(range(numJobs))))
        list(map(lambda x: mock.addJob(jobShape=mediumNode), list(range(numJobs))))

        # Add medium-sized completed jobs
        for i in range(1000):
            iJ = JobDescription(
                requirements=dict(
                    memory=random.choice(range(smallNode.memory, mediumNode.memory)),
                    cores=mediumNode.cores,
                    disk=largeNode.disk,
                    preemptable=False),
                jobName='testClusterScaling', unitName='')
            clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

        while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
            logger.debug("%i nodes currently provisioned", mock.getNumberOfNodes())
            # Make sure there are no large nodes
            self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
            clusterScaler.check()
            time.sleep(0.5)
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Make sure jobs ran on both the small and medium node types
    self.assertTrue(mock.totalJobs > 0)
    self.assertTrue(mock.maxWorkers[smallNode] > 0)
    self.assertTrue(mock.maxWorkers[mediumNode] > 0)

    self.assertEqual(mock.maxWorkers[largeNode], 0)
def _testClusterScaling(self, config, numJobs, numPreemptableJobs, jobShape):
    """
    Test the ClusterScaler class with different patterns of job creation.
    Tests ascertain that autoscaling occurs and that all the jobs are run.
    """
    # First do a simple test of creating 100 preemptable and non-preemptable
    # jobs and check the jobs are completed okay, then print the amount of
    # worker time expended and the total number of worker nodes used.

    mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
    mock.start()
    clusterScaler = ScalerThread(mock, mock, config)
    clusterScaler.start()
    try:
        # Add 100 jobs to complete
        list(map(lambda x: mock.addJob(jobShape=jobShape), list(range(numJobs))))
        list(map(lambda x: mock.addJob(jobShape=jobShape, preemptable=True),
                 list(range(numPreemptableJobs))))

        # Add some completed jobs
        for preemptable in (True, False):
            if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                # Add 1000 random jobs
                for _ in range(1000):
                    x = mock.getNodeShape(nodeType=jobShape)
                    iJ = JobDescription(
                        requirements=dict(
                            memory=random.choice(list(range(1, x.memory))),
                            cores=random.choice(list(range(1, x.cores))),
                            disk=random.choice(list(range(1, x.disk))),
                            preemptable=preemptable),
                        jobName='testClusterScaling', unitName='')
                    clusterScaler.addCompletedJob(iJ, random.choice(list(range(1, x.wallTime))))

        startTime = time.time()
        # Wait while the cluster processes the jobs
        while (mock.getNumberOfJobsIssued(preemptable=False) > 0
               or mock.getNumberOfJobsIssued(preemptable=True) > 0
               or mock.getNumberOfNodes() > 0
               or mock.getNumberOfNodes(preemptable=True) > 0):
            logger.debug("Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                         "preemptable queue size: %s, preemptable workers: %s",
                         mock.getNumberOfJobsIssued(preemptable=False),
                         mock.getNumberOfNodes(preemptable=False),
                         mock.getNumberOfJobsIssued(preemptable=True),
                         mock.getNumberOfNodes(preemptable=True))
            clusterScaler.check()
            time.sleep(0.5)
        logger.debug("We waited %s for cluster to finish", time.time() - startTime)
    finally:
        clusterScaler.shutdown()
        mock.shutDown()

    # Print some info about the autoscaling
    logger.debug("Total-jobs: %s: Max-workers: %s, Total-worker-time: %s, Worker-time-per-job: %s",
                 mock.totalJobs, sum(mock.maxWorkers.values()), mock.totalWorkerTime,
                 mock.totalWorkerTime / mock.totalJobs if mock.totalJobs > 0 else 0.0)
def _buildToilState(self, jobDesc: JobDescription) -> None:
    """
    Traverses the tree of jobs down from the subtree root JobDescription
    (jobDesc), building the ToilState class.

    :param jobDesc: The description for the root job of the workflow being run.
    """
    # If the job description has a command, is a checkpoint, has services
    # or is ready to be deleted it is ready to be processed (i.e. it is updated)
    if (jobDesc.command is not None
            or (isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None)
            or len(jobDesc.services) > 0
            or jobDesc.nextSuccessors() is None):
        logger.debug(
            "Found job to run: %s, with command: %s, with checkpoint: %s, with "
            "services: %s, with no next successors: %s",
            jobDesc.jobStoreID,
            jobDesc.command is not None,
            isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None,
            len(jobDesc.services) > 0,
            jobDesc.nextSuccessors() is None,
        )
        # Set the job updated because we should be able to make progress on it.
        self.bus.put(JobUpdatedMessage(str(jobDesc.jobStoreID), 0))

        if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None:
            jobDesc.command = jobDesc.checkpoint
    else:
        # There exist successors
        logger.debug("Adding job: %s to the state with %s successors",
                     jobDesc.jobStoreID, len(jobDesc.nextSuccessors()))

        # Record the number of successors
        self.successorCounts[str(jobDesc.jobStoreID)] = len(jobDesc.nextSuccessors())

        def processSuccessorWithMultiplePredecessors(successor: JobDescription) -> None:
            # If jobDesc is not reported as complete by the successor
            if jobDesc.jobStoreID not in successor.predecessorsFinished:
                # Update the successor's status to mark the predecessor complete
                successor.predecessorsFinished.add(jobDesc.jobStoreID)

            # If the successor has no predecessors left to finish
            assert len(successor.predecessorsFinished) <= successor.predecessorNumber
            if len(successor.predecessorsFinished) == successor.predecessorNumber:
                # It is ready to be run, so remove it from the set of waiting jobs
                self.jobsToBeScheduledWithMultiplePredecessors.remove(successorJobStoreID)

                # Recursively consider the successor
                self._buildToilState(successor)

        # For each successor
        for successorJobStoreID in jobDesc.nextSuccessors():
            # If the successor does not yet point back at a predecessor,
            # we have not yet considered it
            if successorJobStoreID not in self.successor_to_predecessors:
                # Add the job as a predecessor
                self.successor_to_predecessors[successorJobStoreID] = {str(jobDesc.jobStoreID)}

                # We load the successor job
                successor = self.get_job(successorJobStoreID)

                # If predecessor number > 1 then the successor has multiple predecessors
                if successor.predecessorNumber > 1:
                    # We put the successor job in the set of waiting successor
                    # jobs with multiple predecessors
                    assert successorJobStoreID not in self.jobsToBeScheduledWithMultiplePredecessors
                    self.jobsToBeScheduledWithMultiplePredecessors.add(successorJobStoreID)

                    # Process successor
                    processSuccessorWithMultiplePredecessors(successor)
                else:
                    # The successor has only this job as a predecessor so
                    # recursively consider the successor
                    self._buildToilState(successor)
            else:
                # We've already seen the successor

                # Add the job as a predecessor
                assert jobDesc.jobStoreID not in self.successor_to_predecessors[successorJobStoreID]
                self.successor_to_predecessors[successorJobStoreID].add(str(jobDesc.jobStoreID))

                # If the successor has multiple predecessors
                if successorJobStoreID in self.jobsToBeScheduledWithMultiplePredecessors:
                    # Get the successor from cache
                    successor = self.get_job(successorJobStoreID)

                    # Process successor
                    processSuccessorWithMultiplePredecessors(successor)
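# A minimal sketch of kicking off the traversal above from a workflow's root
# job; 'leader' and 'root_job_id' are hypothetical, and get_job() is assumed
# to load a JobDescription from the job store, as _buildToilState itself does:
#
#     root_desc = leader.get_job(root_job_id)
#     leader._buildToilState(root_desc)  # populates successor maps, emits JobUpdatedMessage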