Example #1
 def tryRun(self):
     while not self.stop:
         try:
             with throttle(self.scaler.config.scaleInterval):
                 queuedJobs = self.scaler.leader.getJobs()
                 queuedJobShapes = [
                     Shape(wallTime=self.scaler.getAverageRuntime(
                         jobName=job.jobName,
                         service=isinstance(job, ServiceJobNode)),
                           memory=job.memory,
                           cores=job.cores,
                           disk=job.disk,
                           preemptable=job.preemptable)
                     for job in queuedJobs
                 ]
                 currentNodeCounts = {}
                 for nodeShape in self.scaler.nodeShapes:
                     nodeType = self.scaler.nodeShapeToType[nodeShape]
                     currentNodeCounts[nodeShape] = len(
                         self.scaler.leader.provisioner.
                         getProvisionedWorkers(
                             nodeType=nodeType,
                             preemptable=nodeShape.preemptable))
                 estimatedNodeCounts = self.scaler.getEstimatedNodeCounts(
                     queuedJobShapes, currentNodeCounts)
                 self.scaler.updateClusterSize(estimatedNodeCounts)
                 if self.stats:
                     self.stats.checkStats()
         except:
             logger.exception(
                 "Exception encountered in scaler thread. Making a best-effort "
                 "attempt to keep going, but things may go wrong from now on."
             )
     self.scaler.shutDown()
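
Example #1 condenses the job queue into per-job resource shapes before any bin packing happens. The following is a minimal, self-contained sketch of that first step, with a plain namedtuple standing in for Toil's Shape and a hypothetical avg_runtime lookup standing in for scaler.getAverageRuntime; the names and numbers are illustrative only.

from collections import namedtuple

# Hypothetical stand-ins for Toil's Shape and for a queued job record.
Shape = namedtuple('Shape', ['wallTime', 'memory', 'cores', 'disk', 'preemptable'])
Job = namedtuple('Job', ['jobName', 'memory', 'cores', 'disk', 'preemptable'])

def shapes_for_queue(queued_jobs, avg_runtime):
    # Turn queued jobs into resource shapes; the wall time comes from an
    # average-runtime estimate rather than from the job record itself.
    return [Shape(wallTime=avg_runtime(job.jobName),
                  memory=job.memory,
                  cores=job.cores,
                  disk=job.disk,
                  preemptable=job.preemptable)
            for job in queued_jobs]

# Usage with made-up numbers: one queued job, assumed to average 600 s.
jobs = [Job('align', memory=4 * 2**30, cores=2, disk=10 * 2**30, preemptable=True)]
print(shapes_for_queue(jobs, avg_runtime=lambda name: 600))
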
Example #2
    def tryRun(self):
        global _preemptableNodeDeficit

        while not self.scaler.stop:
            with throttle(self.scaler.config.scaleInterval):
                self.totalNodes = len(
                    self.scaler.leader.provisioner.getProvisionedWorkers(
                        self.preemptable))
                # Estimate the number of nodes to run the issued jobs.
                # Number of jobs issued
                queueSize = self.scaler.leader.getNumberOfJobsIssued(
                    preemptable=self.preemptable)

                # Job shapes of completed jobs
                recentJobShapes = self.jobShapes.get()
                assert len(recentJobShapes) > 0

                # Estimate of number of nodes needed to run recent jobs
                nodesToRunRecentJobs = binPacking(recentJobShapes,
                                                  self.nodeShape)

                # Actual calculation of the estimated number of nodes required
                estimatedNodes = 0 if queueSize == 0 else max(
                    1,
                    int(
                        round(self.scaler.config.alphaPacking *
                              nodesToRunRecentJobs * float(queueSize) /
                              len(recentJobShapes))))

                # Account for case where the average historical runtime of completed jobs is less
                # than the runtime of currently running jobs. This is important
                # to avoid a deadlock where the estimated number of nodes to run the jobs
                # is too small to schedule a set of service jobs and their dependent jobs, leading
                # to service jobs running indefinitely.

                # How many jobs are currently running and their average runtime.
                numberOfRunningJobs, currentAvgRuntime = self.scaler.leader.getNumberAndAvgRuntimeOfCurrentlyRunningJobs(
                )

                # Average runtime of recently completed jobs
                historicalAvgRuntime = sum(
                    map(lambda jS: jS.wallTime,
                        recentJobShapes)) / len(recentJobShapes)

                # Ratio of avg. runtime of currently running and completed jobs
                runtimeCorrection = float(
                    currentAvgRuntime
                ) / historicalAvgRuntime if currentAvgRuntime > historicalAvgRuntime and numberOfRunningJobs >= estimatedNodes else 1.0

                # Make correction, if necessary (only do so if cluster is busy and average runtime is higher than historical
                # average)
                if runtimeCorrection != 1.0:
                    estimatedNodes = int(
                        round(estimatedNodes * runtimeCorrection))
                    if self.totalNodes < self.maxNodes:
                        logger.warn(
                            "Historical avg. runtime (%s) is less than current avg. runtime (%s) and cluster"
                            " is being well utilised (%s running jobs), increasing cluster requirement by: %s"
                            % (historicalAvgRuntime, currentAvgRuntime,
                               numberOfRunningJobs, runtimeCorrection))

                # If we're the non-preemptable scaler, we need to see if we have a deficit of
                # preemptable nodes that we should compensate for.
                if not self.preemptable:
                    compensation = self.scaler.config.preemptableCompensation
                    assert 0.0 <= compensation <= 1.0
                    # The number of nodes we provision as compensation for missing preemptable
                    # nodes is the product of the deficit (the number of preemptable nodes we did
                    # _not_ allocate) and configuration preference.
                    compensationNodes = int(
                        round(_preemptableNodeDeficit * compensation))
                    if compensationNodes > 0:
                        logger.info(
                            'Adding %d non-preemptable nodes to compensate for a deficit of %d '
                            'preemptable ones.', compensationNodes,
                            _preemptableNodeDeficit)
                    estimatedNodes += compensationNodes

                jobsPerNode = (0 if nodesToRunRecentJobs <= 0 else
                               len(recentJobShapes) /
                               float(nodesToRunRecentJobs))
                if estimatedNodes > 0 and self.totalNodes < self.maxNodes:
                    logger.info(
                        'Estimating that cluster needs %s %s of shape %s, from current '
                        'size of %s, given a queue size of %s, the number of jobs per node '
                        'estimated to be %s, an alpha parameter of %s and a run-time length correction of %s.',
                        estimatedNodes, self.nodeTypeString, self.nodeShape,
                        self.totalNodes, queueSize, jobsPerNode,
                        self.scaler.config.alphaPacking, runtimeCorrection)

                # Use inertia parameter to stop small fluctuations
                delta = self.totalNodes * max(
                    0.0, self.scaler.config.betaInertia - 1.0)
                if self.totalNodes - delta <= estimatedNodes <= self.totalNodes + delta:
                    logger.debug(
                        'Difference in new (%s) and previous estimates in number of '
                        '%s (%s) required is within beta (%s), making no change.',
                        estimatedNodes, self.nodeTypeString, self.totalNodes,
                        self.scaler.config.betaInertia)
                    estimatedNodes = self.totalNodes

                # Bound number using the max and min node parameters
                if estimatedNodes > self.maxNodes:
                    logger.debug(
                        'Limiting the estimated number of necessary %s (%s) to the '
                        'configured maximum (%s).', self.nodeTypeString,
                        estimatedNodes, self.maxNodes)
                    estimatedNodes = self.maxNodes
                elif estimatedNodes < self.minNodes:
                    logger.info(
                        'Raising the estimated number of necessary %s (%s) to the '
                        'configured minimum (%s).', self.nodeTypeString,
                        estimatedNodes, self.minNodes)
                    estimatedNodes = self.minNodes

                if estimatedNodes != self.totalNodes:
                    logger.info('Changing the number of %s from %s to %s.',
                                self.nodeTypeString, self.totalNodes,
                                estimatedNodes)
                    self.totalNodes = self.setNodeCount(
                        numNodes=estimatedNodes, preemptable=self.preemptable)

                    # If we were scaling up the number of preemptable nodes and failed to meet
                    # our target, we need to update the slack so that non-preemptable nodes will
                    # be allocated instead and we won't block. If we _did_ meet our target,
                    # we need to reset the slack to 0.
                    if self.preemptable:
                        if self.totalNodes < estimatedNodes:
                            deficit = estimatedNodes - self.totalNodes
                            logger.info(
                                'Preemptable scaler detected deficit of %d nodes.',
                                deficit)
                            _preemptableNodeDeficit = deficit
                        else:
                            _preemptableNodeDeficit = 0

                if self.stats:
                    self.stats.checkStats()

        self.shutDown(preemptable=self.preemptable)
        logger.info('Scaler exited normally.')
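
Stripped of the leader and provisioner plumbing, each iteration of Example #2 boils down to a short chain of arithmetic: extrapolate the bin-packing result for the recent sample to the whole queue, optionally apply a runtime correction, hold the estimate inside an inertia band, and clamp it to the configured bounds. The sketch below restates that chain as a pure function; the parameter names mirror the config fields used above, but the default values and the usage numbers are made up.

def estimate_cluster_size(queue_size, nodes_for_recent, n_recent_shapes,
                          total_nodes, alpha_packing=0.8, beta_inertia=1.2,
                          min_nodes=0, max_nodes=10,
                          current_avg_runtime=0.0, historical_avg_runtime=1.0,
                          running_jobs=0):
    # Extrapolate: nodes needed for the recent sample, scaled up to the queue size.
    if queue_size == 0:
        estimated = 0
    else:
        estimated = max(1, int(round(alpha_packing * nodes_for_recent
                                     * float(queue_size) / n_recent_shapes)))

    # Runtime correction: only when running jobs are slower than the historical
    # average and the cluster is already busy enough to matter.
    if current_avg_runtime > historical_avg_runtime and running_jobs >= estimated:
        estimated = int(round(estimated * current_avg_runtime / historical_avg_runtime))

    # Inertia band: ignore estimates within +/- delta of the current size.
    delta = total_nodes * max(0.0, beta_inertia - 1.0)
    if total_nodes - delta <= estimated <= total_nodes + delta:
        estimated = total_nodes

    # Clamp to the configured minimum and maximum.
    return min(max_nodes, max(min_nodes, estimated))

# 100 queued jobs, the last 20 completed jobs fit on 3 nodes, 5 nodes running:
print(estimate_cluster_size(queue_size=100, nodes_for_recent=3,
                            n_recent_shapes=20, total_nodes=5))  # -> 10 (capped at maxNodes)
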
Example #3
    def tryRun(self):
        global _preemptableNodeDeficit

        if isinstance(self.scaler.jobBatcher.batchSystem, AbstractScalableBatchSystem):
            totalNodes = len(self.scaler.jobBatcher.batchSystem.getNodes(self.preemptable))
        else:
            totalNodes = 0
        logger.info('Starting with %s node(s) in the cluster.', totalNodes)
        while not self.scaler.stop:
            with throttle(self.scaler.config.scaleInterval):
                # Calculate the approx. number of nodes needed
                # TODO: Correct for jobs already running which can be considered fractions of a job
                queueSize = self.scaler.jobBatcher.getNumberOfJobsIssued(preemptable=self.preemptable)

                recentJobShapes = self.jobShapes.get()
                assert len(recentJobShapes) > 0
                nodesToRunRecentJobs = binPacking(recentJobShapes, self.nodeShape)
                estimatedNodes = 0 if queueSize == 0 else max(1, int(round(
                    self.scaler.config.alphaPacking
                    * nodesToRunRecentJobs
                    * float(queueSize)
                    / len(recentJobShapes))))

                # If we're the non-preemptable scaler, we need to see if we have a deficit of
                # preemptable nodes that we should compensate for.
                if not self.preemptable:
                    compensation = self.scaler.config.preemptableCompensation
                    assert 0.0 <= compensation <= 1.0
                    # The number of nodes we provision as compensation for missing preemptable
                    # nodes is the product of the deficit (the number of preemptable nodes we did
                    # _not_ allocate) and configuration preference.
                    compensationNodes = int(round(_preemptableNodeDeficit * compensation))
                    logger.info('Adding %d non-preemptable nodes to compensate for a deficit of %d '
                                'preemptable ones.', compensationNodes, _preemptableNodeDeficit)
                    estimatedNodes += compensationNodes

                jobsPerNode = (0 if nodesToRunRecentJobs <= 0
                               else len(recentJobShapes) / float(nodesToRunRecentJobs))
                logger.debug('Estimating that cluster needs %s nodes of shape %s, from current '
                             'size of %s, given a queue size of %s, the number of jobs per node '
                             'estimated to be %s and an alpha parameter of %s.',
                             estimatedNodes, self.nodeShape, totalNodes, queueSize, jobsPerNode,
                             self.scaler.config.alphaPacking)

                # Use inertia parameter to stop small fluctuations
                if estimatedNodes <= totalNodes * self.scaler.config.betaInertia <= estimatedNodes:
                    logger.debug('Difference in new (%s) and previous estimates in number of '
                                 'nodes (%s) required is within beta (%s), making no change.',
                                 estimatedNodes, totalNodes, self.scaler.config.betaInertia)
                    estimatedNodes = totalNodes

                # Bound number using the max and min node parameters
                if estimatedNodes > self.maxNodes:
                    logger.info('Limiting the estimated number of necessary nodes (%s) to the '
                                'configured maximum (%s).', estimatedNodes, self.maxNodes)
                    estimatedNodes = self.maxNodes
                elif estimatedNodes < self.minNodes:
                    logger.info('Raising the estimated number of necessary nodes (%s) to the '
                                'configured minimum (%s).', estimatedNodes, self.minNodes)
                    estimatedNodes = self.minNodes

                if estimatedNodes != totalNodes:
                    logger.info('Changing the number of worker nodes from %s to %s.', totalNodes,
                                estimatedNodes)
                    totalNodes = self.scaler.provisioner.setNodeCount(numNodes=estimatedNodes,
                                                                      preemptable=self.preemptable)
                    
                    # If we were scaling up the number of preemptable nodes and failed to meet
                    # our target, we need to update the slack so that non-preemptable nodes will
                    # be allocated instead and we won't block. If we _did_ meet our target,
                    # we need to reset the slack to 0.
                    if self.preemptable:
                        if totalNodes < estimatedNodes:
                            deficit = estimatedNodes - totalNodes
                            logger.info('Preemptable scaler detected deficit of %d nodes.', deficit)
                            _preemptableNodeDeficit = deficit
                        else:
                            _preemptableNodeDeficit = 0
                    
        logger.info('Forcing provisioner to reduce cluster size to zero.')
        totalNodes = self.scaler.provisioner.setNodeCount(numNodes=0,
                                                          preemptable=self.preemptable,
                                                          force=True)
        if totalNodes != 0:
            raise RuntimeError('Provisioner was not able to reduce cluster size to zero.')
        else:
            logger.info('Scaler exited normally.')
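
The hand-off between the preemptable and non-preemptable scaler threads in Examples #2 and #3 is just a shared deficit counter scaled by the preemptableCompensation preference. A toy restatement of that compensation step (hypothetical numbers, no threading or globals):

def preemptable_compensation(deficit, compensation):
    # Non-preemptable nodes to provision for a shortfall of preemptable ones;
    # `compensation` is the configured preference in [0.0, 1.0].
    assert 0.0 <= compensation <= 1.0
    return int(round(deficit * compensation))

# The preemptable scaler wanted 8 nodes but setNodeCount only delivered 5,
# so the deficit is 3; with a 0.5 preference the other scaler adds 2 extra nodes.
print(preemptable_compensation(deficit=3, compensation=0.5))  # -> 2
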
Example #4
    def _addNodes(self, instances, numNodes, preemptable=False):
        deadline = time.time() + provisioning_timeout
        spec = dict(key_name=self._keyName,
                    user_data=self._userData(),
                    instance_type=self.instanceType[preemptable].name,
                    instance_profile_arn=self._instanceProfileArn,
                    security_group_ids=self._securityGroupIds,
                    ebs_optimized=self.ebsOptimized,
                    dry_run=False)
        # Offset the ordinals of the preemptable nodes to be disjoint from the non-preemptable
        # ones. Without this, the two scaler threads would inevitably allocate colliding ordinals.
        offset = 1000 if preemptable else 0
        used_ordinals = {
            int(i.tags['cluster_ordinal']) - offset
            for i in instances
        }
        # Since leader is absent from the instances iterable, we need to explicitly reserve its
        # ordinal unless we're allocating offset ordinals reserved for preemptable instances:
        assert len(used_ordinals) == len(instances)  # check for collisions
        if not preemptable:
            used_ordinals.add(0)
        ordinals = (ordinal + offset for ordinal in allocate_cluster_ordinals(
            num=numNodes, used=used_ordinals))

        def createInstances():
            """
            :rtype: Iterable[list[Instance]]
            """
            if preemptable:
                for batch in create_spot_instances(
                        self._ec2,
                        self.spotBid,
                        self.imageId,
                        spec,
                        # Don't insist on spot requests and don't raise
                        # if no requests were fulfilled:
                        tentative=True,
                        num_instances=numNodes,
                        timeout=deadline - time.time()):
                    yield batch
            else:
                yield create_ondemand_instances(self._ec2,
                                                self.imageId,
                                                spec,
                                                num_instances=numNodes)

        instancesByAddress = {}

        def handleInstance(instance):
            log.debug('Tagging instance %s.', instance.id)
            leader_tags = self._instance.tags
            name = leader_tags['Name'].replace('toil-leader', 'toil-worker')
            tag_object_persistently(
                instance,
                dict(leader_tags, Name=name, cluster_ordinal=next(ordinals)))
            assert instance.private_ip_address
            instancesByAddress[instance.private_ip_address] = instance

        # Each instance gets a different ordinal so we can't tag an entire batch at once but have
        # to tag each instance individually. It needs to be done quickly because the tags are
        # crucial for the boot code running inside the instance to join the cluster. Hence we do
        # it in a thread pool. If the pool is too large, we'll hit the EC2 limit on the number
        # of concurrent requests. If it is too small, we won't be able to tag all instances in
        # time.
        with thread_pool(min(numNodes, 32)) as pool:
            for batch in createInstances():
                log.debug('Got a batch of %i instance(s).', len(batch))
                for instance in batch:
                    log.debug(
                        'Submitting instance %s to thread pool for tagging.',
                        instance.id)
                    pool.apply_async(handleInstance, (instance, ))
        numInstancesAdded = len(instancesByAddress)
        log.info('Created and tagged %i instance(s).', numInstancesAdded)

        if preemptable:
            # Reset deadline such that slow spot creation does not take away from instance boot-up
            deadline = time.time() + provisioning_timeout
        if isinstance(self.batchSystem, AbstractScalableBatchSystem):
            while instancesByAddress and time.time() < deadline:
                with throttle(10):
                    log.debug(
                        'Waiting for batch system to report back %i node(s).',
                        len(instancesByAddress))
                    # Get all nodes to be safe, not just the ones whose preemptability matches,
                    # in case there's a problem with a node determining its own preemptability.
                    nodes = self.batchSystem.getNodes()
                    for nodeAddress in nodes.iterkeys():
                        instancesByAddress.pop(nodeAddress, None)
            if instancesByAddress:
                log.warn(
                    '%i instance(s) out of %i did not join the cluster as worker nodes. They '
                    'will be terminated.', len(instancesByAddress),
                    numInstancesAdded)
                instanceIds = [i.id for i in instancesByAddress.itervalues()]
                self._logAndTerminate(instanceIds)
                numInstancesAdded -= len(instanceIds)
            else:
                log.info('All %i node(s) joined the cluster.',
                         numInstancesAdded)
        else:
            log.warn(
                'Batch system is not scalable. Assuming all instances joined the cluster.'
            )
        return numInstancesAdded
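
The ordinal bookkeeping at the top of _addNodes is what keeps the two scaler threads from handing out colliding cluster ordinals: preemptable workers live in an offset range, ordinal 0 is reserved for the leader, and new ordinals fill the gaps. A small sketch of that scheme, with a simple gap-filling generator standing in for allocate_cluster_ordinals (which is not shown above):

def allocate_ordinals(num, used):
    # Yield `num` ordinals not present in `used`, filling gaps from 0 upward.
    candidate = 0
    for _ in range(num):
        while candidate in used:
            candidate += 1
        used.add(candidate)
        yield candidate

def worker_ordinals(num_nodes, existing_ordinals, preemptable):
    # Preemptable workers get their own 1000+ range so both scaler threads
    # can allocate independently without colliding.
    offset = 1000 if preemptable else 0
    used = {o - offset for o in existing_ordinals}
    if not preemptable:
        used.add(0)  # ordinal 0 belongs to the leader, which is absent from the worker list
    return [o + offset for o in allocate_ordinals(num_nodes, used)]

# Two existing non-preemptable workers with ordinals 1 and 2; add three more:
print(worker_ordinals(3, existing_ordinals={1, 2}, preemptable=False))  # -> [3, 4, 5]
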
Example #5
    def tryRun(self):
        while not self.scaler.stop:
            with throttle(self.scaler.config.scaleInterval):
                # Estimate of number of nodes needed to run recent jobs
                recentJobShapes = self.jobShapes.get()
                queuedJobs = self.scaler.leader.getJobs()
                logger.info("Detected %i queued jobs." % len(queuedJobs))
                queuedJobShapes = [
                    Shape(wallTime=self.scaler.getAverageRuntime(
                        jobName=job.jobName),
                          memory=job.memory,
                          cores=job.cores,
                          disk=job.disk,
                          preemptable=job.preemptable) for job in queuedJobs
                ]
                nodesToRunRecentJobs = binPacking(jobShapes=recentJobShapes,
                                                  nodeShapes=self.nodeShapes)
                nodesToRunQueuedJobs = binPacking(jobShapes=queuedJobShapes,
                                                  nodeShapes=self.nodeShapes)
                for nodeShape in self.nodeShapes:
                    nodeType = self.nodeShapeToType[nodeShape]
                    self.totalNodes[nodeShape] = len(
                        self.scaler.leader.provisioner.getProvisionedWorkers(
                            nodeType=nodeType,
                            preemptable=nodeShape.preemptable))

                    logger.info("Nodes of type %s to run recent jobs: %s" %
                                (nodeType, nodesToRunRecentJobs[nodeShape]))
                    logger.info("Nodes of type %s to run queued jobs = %s" %
                                (nodeType, nodesToRunQueuedJobs[nodeShape]))
                    # Actual calculation of the estimated number of nodes required
                    estimatedNodes = 0 if nodesToRunQueuedJobs[
                        nodeShape] == 0 else max(
                            1,
                            int(
                                round(self.scaler.config.alphaPacking *
                                      nodesToRunRecentJobs[nodeShape] +
                                      (1 - self.scaler.config.alphaPacking) *
                                      nodesToRunQueuedJobs[nodeShape])))
                    logger.info("Estimating %i nodes of shape %s" %
                                (estimatedNodes, nodeShape))

                    # If we're scaling a non-preemptable node type, we need to see if we have a
                    # deficit of preemptable nodes of this type that we should compensate for.
                    if not nodeShape.preemptable:
                        compensation = self.scaler.config.preemptableCompensation
                        assert 0.0 <= compensation <= 1.0
                        # The number of nodes we provision as compensation for missing preemptable
                        # nodes is the product of the deficit (the number of preemptable nodes we did
                        # _not_ allocate) and configuration preference.
                        compensationNodes = int(
                            round(self.preemptableNodeDeficit[nodeType] *
                                  compensation))
                        if compensationNodes > 0:
                            logger.info(
                                'Adding %d non-preemptable nodes of type %s to compensate for a deficit of %d '
                                'preemptable ones.', compensationNodes,
                                nodeType,
                                self.preemptableNodeDeficit[nodeType])
                        estimatedNodes += compensationNodes
                    jobsPerNode = (0 if nodesToRunRecentJobs[nodeShape] <= 0
                                   else old_div(
                                       len(recentJobShapes),
                                       float(nodesToRunRecentJobs[nodeShape])))
                    if estimatedNodes > 0 and self.totalNodes[
                            nodeShape] < self.maxNodes[nodeShape]:
                        logger.info(
                            'Estimating that cluster needs %s of shape %s, from current '
                            'size of %s, given a queue size of %s, the number of jobs per node '
                            'estimated to be %s and an alpha parameter of %s.',
                            estimatedNodes,
                            nodeShape, self.totalNodes[nodeShape],
                            len(queuedJobs), jobsPerNode,
                            self.scaler.config.alphaPacking)

                    # Use inertia parameter to stop small fluctuations
                    logger.info("Currently %i nodes of type %s in cluster" %
                                (self.totalNodes[nodeShape], nodeType))
                    if self.scaler.leader.toilMetrics:
                        self.scaler.leader.toilMetrics.logClusterSize(
                            nodeType=nodeType,
                            currentSize=self.totalNodes[nodeShape],
                            desiredSize=estimatedNodes)
                    delta = self.totalNodes[nodeShape] * max(
                        0.0, self.scaler.config.betaInertia - 1.0)
                    if self.totalNodes[
                            nodeShape] - delta <= estimatedNodes <= self.totalNodes[
                                nodeShape] + delta:
                        logger.debug(
                            'Difference in new (%s) and previous estimates in number of '
                            '%s (%s) required is within beta (%s), making no change.',
                            estimatedNodes, nodeType,
                            self.totalNodes[nodeShape],
                            self.scaler.config.betaInertia)
                        estimatedNodes = self.totalNodes[nodeShape]

                    # Bound number using the max and min node parameters
                    if estimatedNodes > self.maxNodes[nodeShape]:
                        logger.debug(
                            'Limiting the estimated number of necessary %s (%s) to the '
                            'configured maximum (%s).', nodeType,
                            estimatedNodes, self.maxNodes[nodeShape])
                        estimatedNodes = self.maxNodes[nodeShape]
                    elif estimatedNodes < self.minNodes[nodeShape]:
                        logger.info(
                            'Raising the estimated number of necessary %s (%s) to the '
                            'configured minimum (%s).', nodeType,
                            estimatedNodes, self.minNodes[nodeShape])
                        estimatedNodes = self.minNodes[nodeShape]

                    if estimatedNodes != self.totalNodes[nodeShape]:
                        logger.info('Changing the number of %s from %s to %s.',
                                    nodeType, self.totalNodes[nodeShape],
                                    estimatedNodes)
                        self.totalNodes[nodeShape] = self.setNodeCount(
                            nodeType=nodeType,
                            numNodes=estimatedNodes,
                            preemptable=nodeShape.preemptable)

                    # If we were scaling up a preemptable node type and failed to meet
                    # our target, we will attempt to compensate for the deficit while scaling
                    # non-preemptable nodes of this type.
                    if nodeShape.preemptable:
                        if self.totalNodes[nodeShape] < estimatedNodes:
                            deficit = estimatedNodes - self.totalNodes[nodeShape]
                            logger.info(
                                'Preemptable scaler detected deficit of %d nodes of type %s.'
                                % (deficit, nodeType))
                            self.preemptableNodeDeficit[nodeType] = deficit
                        else:
                            self.preemptableNodeDeficit[nodeType] = 0

                # Attempt to terminate any nodes that we previously designated for
                # termination, but which still had workers running.
                self._terminateIgnoredNodes()

                if self.stats:
                    self.stats.checkStats()

        self.shutDown()
        logger.info('Scaler exited normally.')
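
Unlike the single-shape variants above, this version blends two bin-packing results per node shape: one for recently completed jobs and one for the currently queued jobs, weighted by alphaPacking. The weighting is a plain convex combination; a hedged restatement with made-up numbers:

def blended_estimate(nodes_for_recent, nodes_for_queued, alpha_packing):
    # Weight the recent-jobs packing against the queued-jobs packing.
    if nodes_for_queued == 0:
        return 0
    return max(1, int(round(alpha_packing * nodes_for_recent
                            + (1 - alpha_packing) * nodes_for_queued)))

# Recent jobs fit on 2 nodes, the current queue needs 6, alpha = 0.8:
print(blended_estimate(2, 6, 0.8))  # 0.8*2 + 0.2*6 = 2.8 -> 3
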
Example #6
    def tryRun(self):
        global _preemptableNodeDeficit

        while not self.scaler.stop:
            with throttle(self.scaler.config.scaleInterval):
                self.totalNodes = len(self.scaler.leader.provisioner.getProvisionedWorkers(self.preemptable))
                # Estimate the number of nodes to run the issued jobs.
                # Number of jobs issued
                queueSize = self.scaler.leader.getNumberOfJobsIssued(preemptable=self.preemptable)
                
                # Job shapes of completed jobs
                recentJobShapes = self.jobShapes.get()
                assert len(recentJobShapes) > 0
                
                # Estimate of number of nodes needed to run recent jobs
                nodesToRunRecentJobs = binPacking(recentJobShapes, self.nodeShape)
                
                # Actual calculation of the estimated number of nodes required
                estimatedNodes = 0 if queueSize == 0 else max(1, int(round(
                    self.scaler.config.alphaPacking
                    * nodesToRunRecentJobs
                    * float(queueSize) / len(recentJobShapes))))
                
                # Account for case where the average historical runtime of completed jobs is less
                # than the runtime of currently running jobs. This is important
                # to avoid a deadlock where the estimated number of nodes to run the jobs
                # is too small to schedule a set of service jobs and their dependent jobs, leading
                # to service jobs running indefinitely.
                
                # How many jobs are currently running and their average runtime.
                numberOfRunningJobs, currentAvgRuntime  = self.scaler.leader.getNumberAndAvgRuntimeOfCurrentlyRunningJobs()
                
                # Average runtime of recently completed jobs
                historicalAvgRuntime = old_div(sum([jS.wallTime for jS in recentJobShapes]),len(recentJobShapes))

                # Ratio of avg. runtime of currently running and completed jobs
                runtimeCorrection = old_div(float(currentAvgRuntime),historicalAvgRuntime) if currentAvgRuntime > historicalAvgRuntime and numberOfRunningJobs >= estimatedNodes else 1.0
                
                # Make correction, if necessary (only do so if cluster is busy and average runtime is higher than historical
                # average)
                if runtimeCorrection != 1.0:
                    estimatedNodes = int(round(estimatedNodes * runtimeCorrection))
                    if self.totalNodes < self.maxNodes:
                        logger.warn("Historical avg. runtime (%s) is less than current avg. runtime (%s) and cluster"
                                    " is being well utilised (%s running jobs), increasing cluster requirement by: %s" % 
                                    (historicalAvgRuntime, currentAvgRuntime, numberOfRunningJobs, runtimeCorrection))

                # If we're the non-preemptable scaler, we need to see if we have a deficit of
                # preemptable nodes that we should compensate for.
                if not self.preemptable:
                    compensation = self.scaler.config.preemptableCompensation
                    assert 0.0 <= compensation <= 1.0
                    # The number of nodes we provision as compensation for missing preemptable
                    # nodes is the product of the deficit (the number of preemptable nodes we did
                    # _not_ allocate) and configuration preference.
                    compensationNodes = int(round(_preemptableNodeDeficit * compensation))
                    if compensationNodes > 0:
                        logger.info('Adding %d non-preemptable nodes to compensate for a deficit of %d '
                                    'preemptable ones.', compensationNodes, _preemptableNodeDeficit)
                    estimatedNodes += compensationNodes

                jobsPerNode = (0 if nodesToRunRecentJobs <= 0
                               else old_div(len(recentJobShapes), float(nodesToRunRecentJobs)))
                if estimatedNodes > 0 and self.totalNodes < self.maxNodes:
                    logger.info('Estimating that cluster needs %s %s of shape %s, from current '
                                'size of %s, given a queue size of %s, the number of jobs per node '
                                'estimated to be %s, an alpha parameter of %s and a run-time length correction of %s.',
                                estimatedNodes, self.nodeTypeString, self.nodeShape,
                                self.totalNodes, queueSize, jobsPerNode,
                                self.scaler.config.alphaPacking, runtimeCorrection)

                # Use inertia parameter to stop small fluctuations
                delta = self.totalNodes * max(0.0, self.scaler.config.betaInertia - 1.0)
                if self.totalNodes - delta <= estimatedNodes <= self.totalNodes + delta:
                    logger.debug('Difference in new (%s) and previous estimates in number of '
                                 '%s (%s) required is within beta (%s), making no change.',
                                 estimatedNodes, self.nodeTypeString, self.totalNodes, self.scaler.config.betaInertia)
                    estimatedNodes = self.totalNodes

                # Bound number using the max and min node parameters
                if estimatedNodes > self.maxNodes:
                    logger.debug('Limiting the estimated number of necessary %s (%s) to the '
                                 'configured maximum (%s).', self.nodeTypeString, estimatedNodes, self.maxNodes)
                    estimatedNodes = self.maxNodes
                elif estimatedNodes < self.minNodes:
                    logger.info('Raising the estimated number of necessary %s (%s) to the '
                                'configured minimum (%s).', self.nodeTypeString, estimatedNodes, self.minNodes)
                    estimatedNodes = self.minNodes

                if estimatedNodes != self.totalNodes:
                    logger.info('Changing the number of %s from %s to %s.', self.nodeTypeString, self.totalNodes,
                                estimatedNodes)
                    self.totalNodes = self.setNodeCount(numNodes=estimatedNodes, preemptable=self.preemptable)
                    
                    # If we were scaling up the number of preemptable nodes and failed to meet
                    # our target, we need to update the slack so that non-preemptable nodes will
                    # be allocated instead and we won't block. If we _did_ meet our target,
                    # we need to reset the slack to 0.
                    if self.preemptable:
                        if self.totalNodes < estimatedNodes:
                            deficit = estimatedNodes - self.totalNodes
                            logger.info('Preemptable scaler detected deficit of %d nodes.', deficit)
                            _preemptableNodeDeficit = deficit
                        else:
                            _preemptableNodeDeficit = 0

                if self.stats:
                    self.stats.checkStats()
                    
        self.shutDown(preemptable=self.preemptable)
        logger.info('Scaler exited normally.')
Example #7
    def tryRun(self):
        if isinstance(self.scaler.jobBatcher.batchSystem,
                      AbstractScalableBatchSystem):
            totalNodes = len(
                self.scaler.jobBatcher.batchSystem.getNodes(self.preemptable))
        else:
            totalNodes = 0
        logger.info('Starting with %s node(s) in the cluster.', totalNodes)
        while not self.scaler.stop:
            with throttle(self.scaler.config.scaleInterval):
                # Calculate the approx. number of nodes needed
                # TODO: Correct for jobs already running which can be considered fractions of a job
                queueSize = self.scaler.jobBatcher.getNumberOfJobsIssued()
                recentJobShapes = self.jobShapes.get()
                assert len(recentJobShapes) > 0
                nodesToRunRecentJobs = binPacking(recentJobShapes,
                                                  self.nodeShape)
                estimatedNodes = 0 if queueSize == 0 else max(
                    1,
                    int(
                        round(self.scaler.config.alphaPacking *
                              nodesToRunRecentJobs * float(queueSize) /
                              len(recentJobShapes))))

                jobsPerNode = (0 if nodesToRunRecentJobs <= 0 else
                               len(recentJobShapes) /
                               float(nodesToRunRecentJobs))
                logger.debug(
                    'Estimating that cluster needs %s nodes of shape %s, from current '
                    'size of %s, given a queue size of %s, the number of jobs per node '
                    'estimated to be %s and an alpha parameter of %s.',
                    estimatedNodes, self.nodeShape, totalNodes, queueSize,
                    jobsPerNode, self.scaler.config.alphaPacking)

                # Use inertia parameter to stop small fluctuations
                if estimatedNodes <= totalNodes * self.scaler.config.betaInertia <= estimatedNodes:
                    logger.debug(
                        'Difference in new (%s) and previous estimates in number of '
                        'nodes (%s) required is within beta (%s), making no change.',
                        estimatedNodes, totalNodes,
                        self.scaler.config.betaInertia)
                    estimatedNodes = totalNodes

                # Bound number using the max and min node parameters
                if estimatedNodes > self.maxNodes:
                    logger.info(
                        'Limiting the estimated number of necessary nodes (%s) to the '
                        'configured maximum (%s).', estimatedNodes,
                        self.maxNodes)
                    estimatedNodes = self.maxNodes
                elif estimatedNodes < self.minNodes:
                    logger.info(
                        'Raising the estimated number of necessary nodes (%s) to the '
                        'configured minimum (%s).', estimatedNodes,
                        self.minNodes)
                    estimatedNodes = self.minNodes

                if estimatedNodes != totalNodes:
                    logger.info(
                        'Changing the number of worker nodes from %s to %s.',
                        totalNodes, estimatedNodes)
                    totalNodes = self.scaler.provisioner.setNodeCount(
                        numNodes=estimatedNodes, preemptable=self.preemptable)
        logger.info('Forcing provisioner to reduce cluster size to zero.')
        totalNodes = self.scaler.provisioner.setNodeCount(
            numNodes=0, preemptable=self.preemptable, force=True)
        if totalNodes != 0:
            raise RuntimeError(
                'Provisioner was not able to reduce cluster size to zero.')
        else:
            logger.info('Scaler exited normally.')