Example #1
 def testBatchResourceLimits(self):
     jobDesc1 = JobDescription(command="sleep 1000",
                               requirements=dict(memory=1 << 30,
                                                 cores=1,
                                                 disk=1000,
                                                 preemptable=preemptable),
                               jobName='testResourceLimits')
     job1 = self.batchSystem.issueBatchJob(jobDesc1)
     self.assertIsNotNone(job1)
     jobDesc2 = JobDescription(command="sleep 1000",
                               requirements=dict(memory=2 << 30,
                                                 cores=1,
                                                 disk=1000,
                                                 preemptable=preemptable),
                               jobName='testResourceLimits')
     job2 = self.batchSystem.issueBatchJob(jobDesc2)
     self.assertIsNotNone(job2)
     batches = self._getBatchList()
     self.assertEqual(len(batches), 2)
     # It would be better to directly check that the batches have the correct memory and cpu
     # values, but Parasol seems to slightly change the values sometimes.
     self.assertNotEqual(batches[0]['ram'], batches[1]['ram'])
     # Need to kill one of the jobs because there are only two cores available
     self.batchSystem.killBatchJobs([job2])
     job3 = self.batchSystem.issueBatchJob(jobDesc1)
     self.assertIsNotNone(job3)
     batches = self._getBatchList()
     self.assertEqual(len(batches), 1)
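
Note that preemptable is a free variable in this snippet; the test module it comes from presumably defines it at module level. A minimal stand-in, so the example reads on its own:

 preemptable = False  # hypothetical module-level flag assumed by the snippet above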
Example #2
        def _mockJobDescription(self, jobStoreID=None, command=None, **kwargs):
            """
            Create a mock-up JobDescription with the given ID, command, and other parameters.
            """

            # TODO: Use a real unittest.Mock? For now we make a real instance and just hack it up.

            desc = JobDescription(**kwargs)
            # Normally we can't pass in a command or ID, and the job
            # serialization logic takes care of filling them in. We set them
            # here.
            if command is not None:
                desc.command = command
            if jobStoreID is not None:
                desc.jobStoreID = jobStoreID

            return desc
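
A hedged usage sketch for the helper above, from inside the same test class (the ID, command, and requirement values are made up for illustration):

            desc = self._mockJobDescription(jobStoreID='fake-job-1',
                                            command='echo hello',
                                            jobName='mock',
                                            requirements=dict(memory=1024, cores=1,
                                                              disk=512, preemptable=False))
            assert desc.jobStoreID == 'fake-job-1'
            assert desc.command == 'echo hello'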
Example #3
 def testJobDescription(self):
     """
     Tests the public interface of a JobDescription.
     """ 
 
     command = "by your command"
     memory = 2**32
     disk = 2**32
     cores = "1"
     preemptable = 1
     
     j = JobDescription(command=command, requirements={"memory": memory, "cores": cores, "disk": disk, "preemptable": preemptable},
                        jobName='testJobGraph', unitName='noName')
     
     #Check attributes
     self.assertEqual(j.command, command)
     self.assertEqual(j.memory, memory)
     self.assertEqual(j.disk, disk)
     self.assertEqual(j.cores, int(cores))
     self.assertEqual(j.preemptable, bool(preemptable))
     self.assertEqual(type(j.jobStoreID), TemporaryID)
     self.assertEqual(list(j.successorsAndServiceHosts()), [])
     self.assertEqual(list(j.allSuccessors()), [])
     self.assertEqual(list(j.serviceHostIDsInBatches()), [])
     self.assertEqual(list(j.services), [])
     self.assertEqual(list(j.nextSuccessors()), [])
     self.assertEqual(sum((len(level) for level in j.stack)), 0)
     self.assertEqual(j.predecessorsFinished, set())
     self.assertEqual(j.logJobStoreFileID, None)
     
     #Check equals function (should be based on object identity and not contents)
     j2 = JobDescription(command=command, requirements={"memory": memory, "cores": cores, "disk": disk, "preemptable": preemptable},
                         jobName='testJobGraph', unitName='noName')
     self.assertNotEqual(j, j2)
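
The assertions above rely on JobDescription normalizing requirement types (string cores to int, truthy preemptable to bool). A minimal sketch of that coercion, assuming it holds for other values too:

     demo = JobDescription(command="true",
                           requirements={"memory": 1024, "cores": "2", "disk": 512, "preemptable": 0},
                           jobName='coercionDemo')
     assert isinstance(demo.cores, int) and demo.cores == 2  # "2" coerced to int
     assert demo.preemptable is False                        # 0 coerced to bool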
Example #4
 def addJob(self, jobShape, preemptable=False):
     """
     Add a job to the job queue
     """
     self.totalJobs += 1
     jobID = uuid.uuid4()
     self.jobBatchSystemIDToIssuedJob[jobID] = JobDescription(
         requirements={
             "memory": jobShape.memory,
             "cores": jobShape.cores,
             "disk": jobShape.disk,
             "preemptable": preemptable
         },
         jobName='job{}'.format(self.totalJobs))
     self.jobQueue.put(jobID)
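
A short sketch of how this helper might be driven from another method of the same mock class (the Shape values are made up, and the keyword form assumes the field order wallTime, memory, cores, disk, preemptable):

     self.addJob(Shape(wallTime=20, memory=5, cores=1, disk=10, preemptable=False))
     jobID = self.jobQueue.get()
     assert self.jobBatchSystemIDToIssuedJob[jobID].memory == 5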
Example #5
 def test(self):
     # We'll use fractions to avoid rounding errors. Remember that not every fraction can be
     # represented as a floating point number.
     F = Fraction
     # This test isn't general enough to cover every possible value of minCores in
     # SingleMachineBatchSystem. Instead we hard-code a value and assert it.
     minCores = F(1, 10)
     self.assertEqual(float(minCores), SingleMachineBatchSystem.minCores)
     for maxCores in {F(minCores), minCores * 10, F(1), F(numCores, 2), F(numCores)}:
         for coresPerJob in {F(minCores), F(minCores * 10), F(1), F(maxCores, 2), F(maxCores)}:
             for load in (F(1, 10), F(1), F(10)):
                 jobs = int(maxCores / coresPerJob * load)
                 if jobs >= 1 and minCores <= coresPerJob < maxCores:
                     self.assertEqual(maxCores, float(maxCores))
                     bs = SingleMachineBatchSystem(
                         config=hidden.AbstractBatchSystemTest.createConfig(),
                         maxCores=float(maxCores),
                         # Ensure that memory or disk requirements don't get in the way.
                         maxMemory=jobs * 10,
                         maxDisk=jobs * 10)
                     try:
                         jobIds = set()
                         for i in range(0, int(jobs)):
                             jobIds.add(bs.issueBatchJob(JobDescription(command=self.scriptCommand(),
                                                                        requirements=dict(
                                                                            cores=float(coresPerJob),
                                                                            memory=1, disk=1,
                                                                            preemptable=preemptable),
                                                                        jobName=str(i), unitName='')))
                         self.assertEqual(len(jobIds), jobs)
                         while jobIds:
                             job = bs.getUpdatedBatchJob(maxWait=10)
                             self.assertIsNotNone(job)
                             jobId, status, wallTime = job.jobID, job.exitStatus, job.wallTime
                             self.assertEqual(status, 0)
                             # would raise KeyError on absence
                             jobIds.remove(jobId)
                     finally:
                         bs.shutdown()
                     concurrentTasks, maxConcurrentTasks = getCounters(self.counterPath)
                     self.assertEqual(concurrentTasks, 0)
                     logger.info('maxCores: {maxCores}, '
                              'coresPerJob: {coresPerJob}, '
                              'load: {load}'.format(**locals()))
                     # This is the key assertion:
                     expectedMaxConcurrentTasks = min(maxCores // coresPerJob, jobs)
                     self.assertEqual(maxConcurrentTasks, expectedMaxConcurrentTasks)
                     resetCounters(self.counterPath)
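
The Fraction arithmetic above sidesteps binary floating point rounding; a quick illustration of the problem it avoids:

    from fractions import Fraction as F
    assert 0.1 + 0.2 != 0.3                  # binary floats cannot represent these exactly
    assert F(1, 10) + F(2, 10) == F(3, 10)   # rational arithmetic is exact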
Example #6
 def testJobDescriptionSequencing(self):
     j = JobDescription(command='command', requirements={}, jobName='unimportant')
     
     j.addChild('child')
     j.addFollowOn('followOn')
     
     # With a command, nothing should be ready to run
     self.assertEqual(list(j.nextSuccessors()), [])
     
     # With command cleared, child should be ready to run
     j.command = None
     self.assertEqual(list(j.nextSuccessors()), ['child'])
     
     # Without the child, the follow-on should be ready to run
     j.filterSuccessors(lambda jID: jID != 'child')
     self.assertEqual(list(j.nextSuccessors()), ['followOn'])
     
     # Without the follow-on, we should return None, to be distinct from an
     # empty list. Nothing left to do!
     j.filterSuccessors(lambda jID: jID != 'followOn')
     self.assertEqual(j.nextSuccessors(), None)
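
The None-versus-empty distinction matters to callers; a hedged sketch of how one might branch on it (the comments restate the semantics the test above checks):

     remaining = j.nextSuccessors()
     if remaining is None:
         pass  # the whole subtree under j is exhausted
     elif len(remaining) == 0:
         pass  # nothing is ready yet (e.g. j still has a command to run)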
Example #7
def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore,
                  config: Config) -> Optional[JobDescription]:
    """
    Returns the next chainable job's JobDescription after the given predecessor
    JobDescription, if one exists, or None if the chain must terminate.

    :param predecessor: The job to chain from
    :param jobStore: The JobStore to fetch JobDescriptions from.
    :param config: The configuration for the current run.
    """
    # If there are no more jobs to run, or services are not finished, quit
    if len(predecessor.stack) == 0 or len(predecessor.services) > 0 or (
            isinstance(predecessor, CheckpointJobDescription)
            and predecessor.checkpoint is not None):
        logger.debug(
            "Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
            len(predecessor.stack), len(predecessor.services),
            (isinstance(predecessor, CheckpointJobDescription)
             and predecessor.checkpoint is not None))
        return None

    if len(predecessor.stack) > 1 and len(predecessor.stack[-1]) > 0 and len(
            predecessor.stack[-2]) > 0:
        # TODO: Without a real stack list we can freely mutate, we can't chain
        # to a child, which may branch, and then go back and do the follow-ons
        # of the original job.
        # TODO: Go back to a free-form stack list and require some kind of
        # stack build phase?
        logger.debug(
            "Stopping running chain of jobs because job has both children and follow-ons"
        )
        return None

    # Get the next set of jobs to run
    jobs = predecessor.nextSuccessors()
    if len(jobs) == 0:
        # If there are no jobs, we might just not have any children.
        logger.debug(
            "Stopping running chain of jobs because job has no ready children or follow-ons"
        )
        return None

    # If there are two or more jobs to run in parallel, we quit
    if len(jobs) >= 2:
        logger.debug(
            "No more jobs can run in series by this worker,"
            " it's got %i children",
            len(jobs) - 1)
        return None

    # Grab the only job that should be there.
    successorID = next(iter(jobs))

    # Load the successor JobDescription
    successor = jobStore.load(successorID)

    # We check the requirements of the successor to see if we can run it
    # within the current worker
    if successor.memory > predecessor.memory:
        logger.debug("We need more memory for the next job, so finishing")
        return None
    if successor.cores > predecessor.cores:
        logger.debug("We need more cores for the next job, so finishing")
        return None
    if successor.disk > predecessor.disk:
        logger.debug("We need more disk for the next job, so finishing")
        return None
    if successor.preemptable != predecessor.preemptable:
        logger.debug(
            "Preemptability is different for the next job, returning to the leader"
        )
        return None
    if successor.predecessorNumber > 1:
        logger.debug(
            "The next job has multiple predecessors; we must return to the leader."
        )
        return None

    if len(successor.services) > 0:
        logger.debug(
            "The next job requires services that will not yet be started; we must return to the leader."
        )
        return None

    if isinstance(successor, CheckpointJobDescription):
        # Check if job is a checkpoint job and quit if so
        logger.debug("Next job is checkpoint, so finishing")
        return None

    # Made it through! This job is chainable.
    return successor
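
A minimal sketch of the driver loop a worker might run around nextChainable (runJob is a hypothetical stand-in for the actual execution step):

def runChain(first: JobDescription, jobStore: AbstractJobStore, config: Config) -> None:
    job: Optional[JobDescription] = first
    while job is not None:
        runJob(job)  # hypothetical: execute the job's command
        job = nextChainable(job, jobStore, config)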
Example #8
    def testClusterScalingMultipleNodeTypes(self):

        smallNode = Shape(20, 5, 10, 10, False)
        mediumNode = Shape(20, 10, 10, 10, False)
        largeNode = Shape(20, 20, 10, 10, False)

        numJobs = 100

        config = Config()

        # Make defaults dummy values
        config.defaultMemory = 1
        config.defaultCores = 1
        config.defaultDisk = 1

        # No preemptable nodes/jobs
        config.preemptableNodeTypes = []
        config.minPreemptableNodes = []
        config.maxPreemptableNodes = []  # No preemptable nodes

        # Make sure the node types don't have to be ordered
        config.nodeTypes = [largeNode, smallNode, mediumNode]
        config.minNodes = [0, 0, 0]
        config.maxNodes = [10, 10]  # test expansion of this list

        # Algorithm parameters
        config.targetTime = defaultTargetTime
        config.betaInertia = 0.1
        config.scaleInterval = 3

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ScalerThread(mock, mock, config)
        clusterScaler.start()
        mock.start()

        try:
            # Add small and medium jobs
            for _ in range(numJobs):
                mock.addJob(jobShape=smallNode)
            for _ in range(numJobs):
                mock.addJob(jobShape=mediumNode)

            # Add 1000 completed jobs with memory between the small and medium node sizes
            for i in range(1000):
                iJ = JobDescription(requirements=dict(memory=random.choice(
                    range(smallNode.memory, mediumNode.memory)),
                                                      cores=mediumNode.cores,
                                                      disk=largeNode.disk,
                                                      preemptable=False),
                                    jobName='testClusterScaling',
                                    unitName='')
                clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

            while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
                logger.debug("%i nodes currently provisioned" %
                             mock.getNumberOfNodes())
                # Make sure there are no large nodes
                self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
                clusterScaler.check()
                time.sleep(0.5)
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Make sure jobs ran on both the small and medium node types
        self.assertTrue(mock.totalJobs > 0)
        self.assertTrue(mock.maxWorkers[smallNode] > 0)
        self.assertTrue(mock.maxWorkers[mediumNode] > 0)

        self.assertEqual(mock.maxWorkers[largeNode], 0)
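
For reading the positional Shape(...) constructors at the top of this example, the field order appears to be (wallTime, memory, cores, disk, preemptable), judging from how the test indexes the shapes; the first node in keyword form, assuming that order:

        smallNode = Shape(wallTime=20, memory=5, cores=10, disk=10, preemptable=False)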
Example #9
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs,
                            jobShape):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain that
        autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check
        # that the jobs are completed okay, then print the amount of worker time expended and the
        # total number of worker nodes used.

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        mock.start()
        clusterScaler = ScalerThread(mock, mock, config)
        clusterScaler.start()
        try:
            # Add the requested numbers of jobs to complete
            for _ in range(numJobs):
                mock.addJob(jobShape=jobShape)
            for _ in range(numPreemptableJobs):
                mock.addJob(jobShape=jobShape, preemptable=True)

            # Add some completed jobs
            for preemptable in (True, False):
                if (preemptable and numPreemptableJobs > 0) or (not preemptable and numJobs > 0):
                    # Add 1000 random jobs
                    for _ in range(1000):
                        x = mock.getNodeShape(nodeType=jobShape)
                        iJ = JobDescription(requirements=dict(
                            memory=random.choice(list(range(1, x.memory))),
                            cores=random.choice(list(range(1, x.cores))),
                            disk=random.choice(list(range(1, x.disk))),
                            preemptable=preemptable),
                                            jobName='testClusterScaling',
                                            unitName='')
                        clusterScaler.addCompletedJob(
                            iJ, random.choice(list(range(1, x.wallTime))))

            startTime = time.time()
            # Wait while the cluster processes the jobs
            while (mock.getNumberOfJobsIssued(preemptable=False) > 0
                   or mock.getNumberOfJobsIssued(preemptable=True) > 0
                   or mock.getNumberOfNodes() > 0
                   or mock.getNumberOfNodes(preemptable=True) > 0):
                logger.debug(
                    "Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                    "preemptable queue size: %s, preemptable workers: %s" %
                    (mock.getNumberOfJobsIssued(preemptable=False),
                     mock.getNumberOfNodes(preemptable=False),
                     mock.getNumberOfJobsIssued(preemptable=True),
                     mock.getNumberOfNodes(preemptable=True)))
                clusterScaler.check()
                time.sleep(0.5)
            logger.debug("We waited %s for cluster to finish" %
                         (time.time() - startTime))
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Print some info about the autoscaling
        logger.debug("Total-jobs: %s: Max-workers: %s, "
                     "Total-worker-time: %s, Worker-time-per-job: %s" %
                     (mock.totalJobs, sum(
                         mock.maxWorkers.values()), mock.totalWorkerTime,
                      old_div(mock.totalWorkerTime, mock.totalJobs)
                      if mock.totalJobs > 0 else 0.0))
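
old_div here comes from the python-future compatibility layer (past.utils); under Python 3 it reproduces Python 2 division semantics, flooring when both operands are ints:

        from past.utils import old_div
        assert old_div(7, 2) == 3      # int / int floors, as in Python 2
        assert old_div(7.0, 2) == 3.5  # true division otherwise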
Example #10
    def _buildToilState(self, jobDesc: JobDescription) -> None:
        """
        Traverses tree of jobs down from the subtree root JobDescription
        (jobDesc), building the ToilState class.

        :param jobDesc: The description for the root job of the workflow being run.
        """

        # If the job description has a command, is a checkpoint, has services,
        # or is ready to be deleted, it is ready to be processed (i.e. it is updated)
        if (jobDesc.command is not None
                or (isinstance(jobDesc, CheckpointJobDescription)
                    and jobDesc.checkpoint is not None)
                or len(jobDesc.services) > 0
                or jobDesc.nextSuccessors() is None):
            logger.debug(
                "Found job to run: %s, with command: %s, with checkpoint: %s, with "
                "services: %s, with no next successors: %s",
                jobDesc.jobStoreID,
                jobDesc.command is not None,
                isinstance(jobDesc, CheckpointJobDescription)
                and jobDesc.checkpoint is not None,
                len(jobDesc.services) > 0,
                jobDesc.nextSuccessors() is None,
            )
            # Set the job updated because we should be able to make progress on it.
            self.bus.put(JobUpdatedMessage(str(jobDesc.jobStoreID), 0))

            if isinstance(jobDesc, CheckpointJobDescription
                          ) and jobDesc.checkpoint is not None:
                jobDesc.command = jobDesc.checkpoint

        else:  # There exist successors
            logger.debug(
                "Adding job: %s to the state with %s successors",
                jobDesc.jobStoreID,
                len(jobDesc.nextSuccessors()),
            )

            # Record the number of successors
            self.successorCounts[str(jobDesc.jobStoreID)] = len(
                jobDesc.nextSuccessors())

            def processSuccessorWithMultiplePredecessors(
                    successor: JobDescription) -> None:
                # If jobDesc is not reported as complete by the successor
                if jobDesc.jobStoreID not in successor.predecessorsFinished:

                    # Update the successor's status to mark the predecessor complete
                    successor.predecessorsFinished.add(jobDesc.jobStoreID)

                # If the successor has no predecessors to finish
                assert len(successor.predecessorsFinished
                           ) <= successor.predecessorNumber
                if len(successor.predecessorsFinished
                       ) == successor.predecessorNumber:

                    # It is ready to be run, so remove it from the set of waiting jobs
                    self.jobsToBeScheduledWithMultiplePredecessors.remove(
                        successorJobStoreID)

                    # Recursively consider the successor
                    self._buildToilState(successor)

            # For each successor
            for successorJobStoreID in jobDesc.nextSuccessors():

                # If the successor does not yet point back at a
                # predecessor, we have not yet considered it
                if successorJobStoreID not in self.successor_to_predecessors:

                    # Add the job as a predecessor
                    self.successor_to_predecessors[successorJobStoreID] = {
                        str(jobDesc.jobStoreID)
                    }

                    # We load the successor job
                    successor = self.get_job(successorJobStoreID)

                    # If predecessor number > 1 then the successor has multiple predecessors
                    if successor.predecessorNumber > 1:

                        # We put the successor job in the set of waiting successor
                        # jobs with multiple predecessors
                        assert successorJobStoreID not in self.jobsToBeScheduledWithMultiplePredecessors
                        self.jobsToBeScheduledWithMultiplePredecessors.add(
                            successorJobStoreID)

                        # Process successor
                        processSuccessorWithMultiplePredecessors(successor)

                    else:
                        # The successor has only this job as a predecessor so
                        # recursively consider the successor
                        self._buildToilState(successor)

                else:
                    # We've already seen the successor

                    # Add the job as a predecessor
                    assert (jobDesc.jobStoreID not in self.
                            successor_to_predecessors[successorJobStoreID])
                    self.successor_to_predecessors[successorJobStoreID].add(
                        str(jobDesc.jobStoreID))

                    # If the successor has multiple predecessors
                    if successorJobStoreID in self.jobsToBeScheduledWithMultiplePredecessors:

                        # Get the successor from cache
                        successor = self.get_job(successorJobStoreID)

                        # Process successor
                        processSuccessorWithMultiplePredecessors(successor)
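
The gating rule implemented above reduces to a simple predicate: a successor with N declared predecessors only becomes schedulable once all N have reported in. A hedged restatement (readyToSchedule is a hypothetical name):

    def readyToSchedule(successor: JobDescription) -> bool:
        return len(successor.predecessorsFinished) == successor.predecessorNumber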