Code example #1
File: clusterScalerTest.py  Project: vallurumk/toil
    def testPreemptableDeficitIsSet(self):
        """
        Make sure that updateClusterSize sets the preemptable deficit if
        it can't launch preemptable nodes properly. That way, the
        deficit can be communicated to the next run of
        estimateNodeCount.
        """
        # Mock out addNodes. We want to pretend it had trouble
        # launching all 5 nodes, and could only launch 3.
        self.provisioner.addNodes = MagicMock(return_value=3)
        # Pretend there are no nodes in the cluster right now
        self.provisioner.getProvisionedWorkers = MagicMock(return_value=[])
        # In this case, we want to explicitly set up the config so
        # that we can have preemptable and non-preemptable nodes of
        # the same type. That is the only situation where
        # preemptableCompensation applies.
        self.config.nodeTypes = ['c4.8xlarge:0.6', 'c4.8xlarge']
        self.provisioner.nodeTypes = ['c4.8xlarge', 'c4.8xlarge']
        self.provisioner.nodeShapes = [c4_8xlarge_preemptable, c4_8xlarge]
        scaler = ClusterScaler(self.provisioner, self.leader, self.config)
        estimatedNodeCounts = {c4_8xlarge_preemptable: 5, c4_8xlarge: 0}
        scaler.updateClusterSize(estimatedNodeCounts)
        self.assertEqual(scaler.preemptableNodeDeficit['c4.8xlarge'], 2)
        self.provisioner.addNodes.assert_called_once()

        # OK, now pretend this is a while later, and the nodes were actually
        # launched properly. The deficit should disappear.
        self.provisioner.addNodes = MagicMock(return_value=5)
        scaler.updateClusterSize(estimatedNodeCounts)
        self.assertEqual(scaler.preemptableNodeDeficit['c4.8xlarge'], 0)
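
The two assertions above boil down to simple bookkeeping: the deficit recorded for a node type is the number of preemptable nodes requested minus the number actually launched, and it clears once a later launch succeeds. A minimal sketch of that arithmetic, using a hypothetical helper and a plain dict rather than Toil's actual internals:

# Hypothetical helper illustrating the deficit bookkeeping exercised above;
# not the real updateClusterSize implementation.
def record_preemptable_deficit(deficits, node_type, requested, launched):
    """Record how many preemptable nodes of node_type failed to launch."""
    deficits[node_type] = max(requested - launched, 0)

deficits = {}
record_preemptable_deficit(deficits, 'c4.8xlarge', requested=5, launched=3)
assert deficits['c4.8xlarge'] == 2  # matches the first assertion in the test
record_preemptable_deficit(deficits, 'c4.8xlarge', requested=5, launched=5)
assert deficits['c4.8xlarge'] == 0  # the deficit disappears once launches succeed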
Code example #2
File: clusterScalerTest.py  Project: vallurumk/toil
 def testNoLaunchingIfDeltaAlreadyMet(self):
     """
     Check that the scaler doesn't try to launch "0" more instances if
     the delta was able to be met by unignoring nodes.
     """
     # We have only one node type for simplicity
     self.provisioner.nodeTypes = ['c4.8xlarge']
     self.provisioner.nodeShapes = [c4_8xlarge]
     scaler = ClusterScaler(self.provisioner, self.leader, self.config)
     # Pretend there is one ignored worker in the cluster
     self.provisioner.getProvisionedWorkers = MagicMock(return_value=[
         Node('127.0.0.1',
              '127.0.0.1',
              'testNode',
              datetime.datetime.now().isoformat(),
              nodeType='c4.8xlarge',
              preemptable=True)
     ])
     scaler.ignoredNodes.add('127.0.0.1')
     # Exercise the updateClusterSize logic
     self.provisioner.addNodes = MagicMock()
     scaler.updateClusterSize({c4_8xlarge: 1})
     self.assertFalse(self.provisioner.addNodes.called,
                      "addNodes was called when no new nodes were needed")
     self.assertEqual(
         len(scaler.ignoredNodes), 0,
         "The scaler didn't unignore an ignored node when "
         "scaling up")
Code example #3
File: clusterScalerTest.py  Project: vallurumk/toil
    def testRounding(self):
        """
        Test to make sure the ClusterScaler's rounding rounds properly.
        """

        # Get a ClusterScaler
        self.config.targetTime = 1
        self.config.betaInertia = 0.0
        self.config.maxNodes = [2, 3]
        scaler = ClusterScaler(self.provisioner, self.leader, self.config)

        # Exact integers round to themselves
        self.assertEqual(scaler._round(0.0), 0)
        self.assertEqual(scaler._round(1.0), 1)
        self.assertEqual(scaler._round(-1.0), -1)
        self.assertEqual(scaler._round(123456789101112.13), 123456789101112)

        # Decimals other than X.5 round to the side they are closer to
        self.assertEqual(scaler._round(1E-10), 0)
        self.assertEqual(scaler._round(0.5 + 1E-15), 1)
        self.assertEqual(scaler._round(-0.9), -1)
        self.assertEqual(scaler._round(-0.4), 0)

        # Decimals at exactly X.5 round away from 0
        self.assertEqual(scaler._round(0.5), 1)
        self.assertEqual(scaler._round(-0.5), -1)
        self.assertEqual(scaler._round(2.5), 3)
        self.assertEqual(scaler._round(-2.5), -3)
        self.assertEqual(scaler._round(15.5), 16)
        self.assertEqual(scaler._round(-15.5), -16)
        self.assertEqual(scaler._round(123456789101112.5), 123456789101113)
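
These assertions pin down round-half-away-from-zero semantics, which differ from Python 3's built-in round() (it rounds halves to the nearest even number). A hedged sketch of one way such a helper could be written; this is not necessarily how ClusterScaler._round is implemented:

import math

def round_half_away_from_zero(value):
    """Round to the nearest integer, breaking .5 ties away from zero."""
    if value >= 0:
        return int(math.floor(value + 0.5))
    return int(math.ceil(value - 0.5))

assert round_half_away_from_zero(0.5) == 1
assert round_half_away_from_zero(-0.5) == -1
assert round_half_away_from_zero(2.5) == 3
assert round_half_away_from_zero(-0.4) == 0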
Code example #4
File: clusterScalerTest.py  Project: vallurumk/toil
 def testMaxNodes(self):
     """
     Set the scaler to be very aggressive, give it a ton of jobs, and
     make sure it doesn't go over maxNodes.
     """
     self.config.targetTime = 1
     self.config.betaInertia = 0.0
     self.config.maxNodes = [2, 3]
     scaler = ClusterScaler(self.provisioner, self.leader, self.config)
     jobShapes = [
         Shape(wallTime=3600,
               cores=2,
               memory=h2b('1G'),
               disk=h2b('2G'),
               preemptable=True)
     ] * 1000
     jobShapes.extend([
         Shape(wallTime=3600,
               cores=2,
               memory=h2b('1G'),
               disk=h2b('2G'),
               preemptable=False)
     ] * 1000)
     estimatedNodeCounts = scaler.getEstimatedNodeCounts(
         jobShapes, defaultdict(int))
     self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2)
     self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptable], 3)
Code example #5
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain
        that autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check the
        # jobs are completed okay, then print the amount of worker time expended and the total
        # number of worker nodes used.

        logger.info("Creating dummy batch system and scalar")

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ClusterScaler(mock, mock, config)

        # Add 100 jobs to complete
        logger.info("Creating test jobs")
        map(lambda x: mock.addJob(), range(numJobs))
        map(lambda x: mock.addJob(preemptable=True), range(numPreemptableJobs))

        # Add some completed jobs
        for preemptable in (True, False):
            if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                # Add 1000 random jobs
                for i in xrange(1000):
                    x = mock.getNodeShape(preemptable)
                    iJ = IssuedJob(1,
                                   memory=random.choice(range(1, x.memory)),
                                   cores=random.choice(range(1, x.cores)),
                                   disk=random.choice(range(1, x.disk)),
                                   preemptable=preemptable)
                    clusterScaler.addCompletedJob(
                        iJ, random.choice(range(1, x.wallTime)))

        logger.info("Waiting for jobs to be processed")
        startTime = time.time()
        # Wait while the cluster processes the queued jobs
        while (mock.getNumberOfJobsIssued(preemptable=False) > 0
               or mock.getNumberOfJobsIssued(preemptable=True) > 0
               or mock.getNumberOfNodes() > 0
               or mock.getNumberOfNodes(preemptable=True) > 0):
            logger.info(
                "Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                "preemptable queue size: %s, preemptable workers: %s",
                mock.getNumberOfJobsIssued(preemptable=False),
                mock.getNumberOfNodes(preemptable=False),
                mock.getNumberOfJobsIssued(preemptable=True),
                mock.getNumberOfNodes(preemptable=True))
            time.sleep(0.5)
        logger.info("We waited %s for cluster to finish" %
                    (time.time() - startTime))
        clusterScaler.shutdown()

        # Print some info about the autoscaling
        for i, bs in enumerate(mock.delegates):
            preemptable = bool(i)
            logger.info("Preemptable: %s, Total-jobs: %s: Max-workers: %s,"
                        " Total-worker-time: %s, Worker-time-per-job: %s" %
                        (preemptable, bs.totalJobs, bs.maxWorkers,
                         bs.totalWorkerTime, bs.totalWorkerTime /
                         bs.totalJobs if bs.totalJobs > 0 else 0.0))
Code example #6
File: clusterScalerTest.py  Project: ratschlab/toil
 def testMinNodes(self):
     """
     Without any jobs queued, the scaler should still estimate "minNodes" nodes.
     """
     self.config.betaInertia = 0.0
     self.config.minNodes = [2, 3]
     scaler = ClusterScaler(self.provisioner, self.leader, self.config)
     jobShapes = []
     estimatedNodeCounts = scaler.getEstimatedNodeCounts(jobShapes, defaultdict(int))
     self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2)
     self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptable], 3)
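
Together with testMaxNodes above, this shows each node type's estimate being clamped into the configured [minNodes, maxNodes] range before any nodes are provisioned. A small illustrative sketch of that clamping, assuming the scaler applies it per node type (the helper name is hypothetical):

def clamp_node_count(estimate, min_nodes, max_nodes):
    """Clamp a bin-packing estimate to the configured per-type bounds."""
    return max(min_nodes, min(estimate, max_nodes))

assert clamp_node_count(0, 2, 10) == 2    # an empty queue still yields minNodes
assert clamp_node_count(500, 0, 3) == 3   # a huge backlog is capped at maxNodes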
Code example #7
File: clusterScalerTest.py  Project: ratschlab/toil
 def testBetaInertia(self):
     # This is really high, but makes things easy to calculate.
     self.config.betaInertia = 0.5
     scaler = ClusterScaler(self.provisioner, self.leader, self.config)
     # OK, smoothing things this much should get us 50% of the way to 100.
     self.assertEqual(scaler.smoothEstimate(c4_8xlarge_preemptable, 100), 50)
     # Now we should be at 75%.
     self.assertEqual(scaler.smoothEstimate(c4_8xlarge_preemptable, 100), 75)
     # We should eventually converge on our estimate as long as betaInertia is below 1.
     for _ in range(1000):
         scaler.smoothEstimate(c4_8xlarge_preemptable, 100)
     self.assertEqual(scaler.smoothEstimate(c4_8xlarge_preemptable, 100), 100)
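
The comments describe a standard exponential moving average: each call blends the previous smoothed value with the new raw estimate, weighting the old value by betaInertia. A self-contained sketch of that arithmetic; the real smoothEstimate keeps its state per node shape and may differ in detail:

def smooth_estimate(previous, estimate, beta_inertia):
    """Blend the previous smoothed value with the new raw estimate."""
    return beta_inertia * previous + (1.0 - beta_inertia) * estimate

smoothed = 0.0
smoothed = smooth_estimate(smoothed, 100, 0.5)
assert smoothed == 50  # 50% of the way to 100
smoothed = smooth_estimate(smoothed, 100, 0.5)
assert smoothed == 75  # now 75% of the way
for _ in range(1000):
    smoothed = smooth_estimate(smoothed, 100, 0.5)
assert round(smoothed) == 100  # converges whenever betaInertia < 1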
Code example #8
File: clusterScalerTest.py  Project: mr-c/toil
    def testPreemptableDeficitResponse(self):
        """
        When a preemptable deficit was detected by a previous run of the
        loop, the scaler should add non-preemptable nodes to
        compensate in proportion to preemptableCompensation.
        """
        self.config.targetTime = 1
        self.config.betaInertia = 0.0
        self.config.maxNodes = [10, 10]
        # This should mean that one non-preemptable node is launched
        # for every two preemptable nodes "missing".
        self.config.preemptableCompensation = 0.5
        # In this case, we want to explicitly set up the config so
        # that we can have preemptable and non-preemptable nodes of
        # the same type. That is the only situation where
        # preemptableCompensation applies.
        self.config.nodeTypes = [c4_8xlarge_preemptable, c4_8xlarge]
        self.provisioner.setAutoscaledNodeTypes([
            ({t}, None) for t in self.config.nodeTypes
        ])

        scaler = ClusterScaler(self.provisioner, self.leader, self.config)
        # Simulate a situation where a previous run caused a
        # "deficit" of 5 preemptable nodes (e.g. a spot bid was lost)
        scaler.preemptableNodeDeficit[c4_8xlarge] = 5
        # Add a bunch of preemptable jobs (so the bin-packing
        # estimate for the non-preemptable node should still be 0)
        jobShapes = [
            Shape(wallTime=3600,
                  cores=2,
                  memory=h2b('1G'),
                  disk=h2b('2G'),
                  preemptable=True)
        ] * 1000
        estimatedNodeCounts = scaler.getEstimatedNodeCounts(
            jobShapes, defaultdict(int))
        # We don't care about the estimated size of the preemptable
        # nodes. All we want to know is whether we responded to the deficit
        # properly: preemptableCompensation * deficit = 0.5 * 5 = 2.5, rounded up to 3.
        self.assertEqual(
            estimatedNodeCounts[self.provisioner.node_shapes_for_testing[1]],
            3)
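
The expected value follows from the compensation rule spelled out in the comments: multiply the preemptable deficit by preemptableCompensation and round up to get the number of non-preemptable nodes to add. A tiny illustrative calculation (not the actual ClusterScaler code):

import math

def compensation_nodes(deficit, preemptable_compensation):
    """Non-preemptable nodes needed to cover a preemptable deficit, rounded up."""
    return int(math.ceil(deficit * preemptable_compensation))

assert compensation_nodes(5, 0.5) == 3  # 0.5 * 5 = 2.5, rounded up to 3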
Code example #9
def mainLoop(config,
             batchSystem,
             provisioner,
             jobStore,
             rootJobWrapper,
             jobCache=None):
    """
    This is the main loop from which jobs are issued and processed.
    
    If jobCache is passed, it must be a dict from job ID to pre-existing
    JobWrapper objects. Jobs will be loaded from the cache (which can be
    downloaded from the jobStore in a batch).

    :raises: toil.leader.FailedJobsException if failed jobs remain at the end
        of the function.
    
    :return: The return value of the root job's run function.
    :rtype: Any
    """

    # Get a snapshot of the current state of the jobs in the jobStore
    toilState = ToilState(jobStore, rootJobWrapper, jobCache=jobCache)

    # Create a service manager to start and terminate services
    try:
        serviceManager = ServiceManager(jobStore)

        # The batch system must start with no active jobs!
        assert len(batchSystem.getIssuedBatchJobIDs()) == 0
        logger.info(
            "Checked batch system has no running jobs and no updated jobs")

        # Load the jobBatcher class - used to track jobs submitted to the batch-system
        jobBatcher = JobBatcher(config, batchSystem, jobStore, toilState,
                                serviceManager)
        logger.info(
            "Found %s jobs to start and %i jobs with successors to run",
            len(toilState.updatedJobs), len(toilState.successorCounts))

        try:
            # Start the stats/logging aggregation process
            statsAndLogging = StatsAndLogging(jobStore)

            try:
                # Create cluster scaling processes if the provisioner is not None
                if provisioner is None:
                    clusterScaler = None
                else:
                    clusterScaler = ClusterScaler(provisioner, jobBatcher,
                                                  config)
                    jobBatcher.clusterScaler = clusterScaler
                innerLoop(jobStore, config, batchSystem, toilState, jobBatcher,
                          serviceManager, statsAndLogging)
            finally:
                if provisioner is not None:
                    logger.info('Waiting for workers to shutdown')
                    startTime = time.time()
                    clusterScaler.shutdown()
                    logger.info('Worker shutdown complete in %s seconds',
                                time.time() - startTime)
        finally:
            # Shutdown the stats and logging process
            statsAndLogging.shutdown()
    finally:
        serviceManager.shutdown()

    # Filter the failed jobs
    toilState.totalFailedJobs = set(
        filter(jobStore.exists, toilState.totalFailedJobs))

    logger.info("Finished toil run %s" %
                ("successfully" if len(toilState.totalFailedJobs) == 0 else
                 ("with %s failed jobs" % len(toilState.totalFailedJobs))))
    if len(toilState.totalFailedJobs):
        logger.info("Failed jobs at end of the run: %s",
                    toilState.totalFailedJobs)

    # Cleanup
    if len(toilState.totalFailedJobs) > 0:
        raise FailedJobsException(config.jobStore,
                                  len(toilState.totalFailedJobs))

    # Parse out the return value from the root job
    with jobStore.readSharedFileStream("rootJobReturnValue") as jobStoreFileID:
        with jobStore.readFileStream(jobStoreFileID.read()) as fH:
            try:
                return cPickle.load(fH)  # rootJobReturnValue
            except EOFError:
                logger.exception("Failed to unpickle root job return value")
                raise FailedJobsException(jobStoreFileID,
                                          toilState.totalFailedJobs)
Code example #10
    def _testClusterScaling(self, config, numJobs, numPreemptableJobs,
                            jobShape):
        """
        Test the ClusterScaler class with different patterns of job creation. Tests ascertain
        that autoscaling occurs and that all the jobs are run.
        """
        # First do a simple test of creating 100 preemptable and non-preemptable jobs and check the
        # jobs are completed okay, then print the amount of worker time expended and the total
        # number of worker nodes used.

        logger.info("Creating dummy batch system and scalar")

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        mock.start()
        clusterScaler = ClusterScaler(mock, mock, config)
        clusterScaler.start()
        try:
            # Add 100 jobs to complete
            logger.info("Creating test jobs")
            list(
                map(lambda x: mock.addJob(jobShape=jobShape),
                    list(range(numJobs))))
            list(
                map(lambda x: mock.addJob(jobShape=jobShape, preemptable=True),
                    list(range(numPreemptableJobs))))

            # Add some completed jobs
            for preemptable in (True, False):
                if preemptable and numPreemptableJobs > 0 or not preemptable and numJobs > 0:
                    # Add 1000 random jobs
                    for i in range(1000):
                        x = mock.getNodeShape(nodeType=jobShape)
                        iJ = JobNode(
                            jobStoreID=1,
                            requirements=dict(
                                memory=random.choice(list(range(1, x.memory))),
                                cores=random.choice(list(range(1, x.cores))),
                                disk=random.choice(list(range(1, x.disk))),
                                preemptable=preemptable),
                            command=None,
                            jobName='testClusterScaling',
                            unitName='')
                        clusterScaler.addCompletedJob(
                            iJ, random.choice(list(range(1, x.wallTime))))

            logger.info("Waiting for jobs to be processed")
            startTime = time.time()
            # Wait while the cluster processes the queued jobs
            while (mock.getNumberOfJobsIssued(preemptable=False) > 0
                   or mock.getNumberOfJobsIssued(preemptable=True) > 0
                   or mock.getNumberOfNodes() > 0
                   or mock.getNumberOfNodes(preemptable=True) > 0):
                logger.info(
                    "Running, non-preemptable queue size: %s, non-preemptable workers: %s, "
                    "preemptable queue size: %s, preemptable workers: %s" %
                    (mock.getNumberOfJobsIssued(preemptable=False),
                     mock.getNumberOfNodes(preemptable=False),
                     mock.getNumberOfJobsIssued(preemptable=True),
                     mock.getNumberOfNodes(preemptable=True)))
                clusterScaler.check()
                time.sleep(0.5)
            logger.info("We waited %s for cluster to finish" %
                        (time.time() - startTime))
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        # Print some info about the autoscaling
        logger.info("Total-jobs: %s: Max-workers: %s,"
                    " Total-worker-time: %s, Worker-time-per-job: %s" %
                    (mock.totalJobs, sum(
                        mock.maxWorkers.values()), mock.totalWorkerTime,
                     old_div(mock.totalWorkerTime, mock.totalJobs)
                     if mock.totalJobs > 0 else 0.0))
Code example #11
    def testClusterScalingMultipleNodeTypes(self):

        smallNode = Shape(20, 5, 10, 10, False)
        mediumNode = Shape(20, 10, 10, 10, False)
        largeNode = Shape(20, 20, 10, 10, False)

        numJobs = 100

        config = Config()

        # Make defaults dummy values
        config.defaultMemory = 1
        config.defaultCores = 1
        config.defaultDisk = 1

        # No preemptable nodes/jobs
        config.preemptableNodeTypes = []
        config.minPreemptableNodes = []
        config.maxPreemptableNodes = []  # No preemptable nodes

        #Make sure the node types don't have to be ordered
        config.nodeTypes = [largeNode, smallNode, mediumNode]
        config.minNodes = [0, 0, 0]
        config.maxNodes = [10, 10]  # test expansion of this list

        # Algorithm parameters
        config.alphaPacking = 0.8
        config.betaInertia = 1.2
        config.scaleInterval = 3

        mock = MockBatchSystemAndProvisioner(config, secondsPerJob=2.0)
        clusterScaler = ClusterScaler(mock, mock, config)
        clusterScaler.start()
        mock.start()

        try:
            #Add small jobs
            list(
                map(lambda x: mock.addJob(jobShape=smallNode),
                    list(range(numJobs))))
            list(
                map(lambda x: mock.addJob(jobShape=mediumNode),
                    list(range(numJobs))))

            #Add medium completed jobs
            for i in range(1000):
                iJ = JobNode(jobStoreID=1,
                             requirements=dict(memory=random.choice(
                                 range(smallNode.memory, mediumNode.memory)),
                                               cores=mediumNode.cores,
                                               disk=largeNode.cores,
                                               preemptable=False),
                             command=None,
                             jobName='testClusterScaling',
                             unitName='')
                clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))

            while (mock.getNumberOfJobsIssued() > 0
                   or mock.getNumberOfNodes() > 0):
                logger.info("%i nodes currently provisioned" %
                            mock.getNumberOfNodes())
                #Make sure there are no large nodes
                self.assertEqual(mock.getNumberOfNodes(nodeType=largeNode), 0)
                clusterScaler.check()
                time.sleep(0.5)
        finally:
            clusterScaler.shutdown()
            mock.shutDown()

        #Make sure jobs ran on both the small and medium node types
        self.assertTrue(mock.totalJobs > 0)
        self.assertTrue(mock.maxWorkers[smallNode] > 0)
        self.assertTrue(mock.maxWorkers[mediumNode] > 0)

        self.assertEqual(mock.maxWorkers[largeNode], 0)
Code example #12
    def __init__(self, config, batchSystem, provisioner, jobStore, rootJob, jobCache=None):
        """
        :param toil.common.Config config:
        :param toil.batchSystems.abstractBatchSystem.AbstractBatchSystem batchSystem:
        :param toil.provisioners.abstractProvisioner.AbstractProvisioner provisioner:
        :param toil.jobStores.abstractJobStore.AbstractJobStore jobStore:
        :param toil.jobGraph.JobGraph rootJob:

        If jobCache is passed, it must be a dict from job ID to pre-existing
        JobGraph objects. Jobs will be loaded from the cache (which can be
        downloaded from the jobStore in a batch) during the construction of the ToilState object.
        """
        # Object containing parameters for the run
        self.config = config

        # The job store
        self.jobStore = jobStore
        self.jobStoreLocator = config.jobStore

        # Get a snapshot of the current state of the jobs in the jobStore
        self.toilState = ToilState(jobStore, rootJob, jobCache=jobCache)
        logger.info("Found %s jobs to start and %i jobs with successors to run",
                        len(self.toilState.updatedJobs), len(self.toilState.successorCounts))

        # Batch system
        self.batchSystem = batchSystem
        assert len(self.batchSystem.getIssuedBatchJobIDs()) == 0 #Batch system must start with no active jobs!
        logger.info("Checked batch system has no running jobs and no updated jobs")

        # Map of batch system IDs to IssuedJob tuples
        self.jobBatchSystemIDToIssuedJob = {}

        # Number of preemptable jobs currently being run by the batch system
        self.preemptableJobsIssued = 0

        # Tracking the number of service jobs issued;
        # this is used to limit the number of services issued to the batch system
        self.serviceJobsIssued = 0
        self.serviceJobsToBeIssued = [] # A queue of service jobs that await scheduling
        #Equivalents for service jobs to be run on preemptable nodes
        self.preemptableServiceJobsIssued = 0
        self.preemptableServiceJobsToBeIssued = []

        # Hash to store the number of times a job is lost by the batch system,
        # used to decide whether to reissue an apparently missing job
        self.reissueMissingJobs_missingHash = {}

        # Class used to create/destroy nodes in the cluster, may be None if
        # using a statically defined cluster
        self.provisioner = provisioner

        # Create cluster scaling thread if the provisioner is not None
        self.clusterScaler = None if self.provisioner is None else ClusterScaler(self.provisioner, self, self.config)

        # A service manager thread to start and terminate services
        self.serviceManager = ServiceManager(jobStore, self.toilState)

        # A thread to manage the aggregation of statistics and logging from the run
        self.statsAndLogging = StatsAndLogging(self.jobStore, self.config)

        # Set used to monitor deadlocked jobs
        self.potentialDeadlockedJobs = set()
        self.potentialDeadlockTime = 0