Example #1
    def tryRun(self):
        """Main loop of the scaler thread: periodically resize the cluster to fit the queued work."""
        while not self.stop:
            with throttle(self.scaler.config.scaleInterval):
                try:
                    # Compute the resource requirements (shape) of every queued job.
                    queuedJobs = self.scaler.leader.getJobs()
                    queuedJobShapes = [
                        Shape(wallTime=self.scaler.getAverageRuntime(
                                  jobName=job.jobName,
                                  service=isinstance(job, ServiceJobNode)),
                              memory=job.memory,
                              cores=job.cores,
                              disk=job.disk,
                              preemptable=job.preemptable)
                        for job in queuedJobs
                    ]
                    # Count the workers currently provisioned for each node shape.
                    currentNodeCounts = {}
                    for nodeShape in self.scaler.nodeShapes:
                        nodeType = self.scaler.nodeShapeToType[nodeShape]
                        currentNodeCounts[nodeShape] = len(
                            self.scaler.leader.provisioner.getProvisionedWorkers(
                                nodeType=nodeType,
                                preemptable=nodeShape.preemptable))
                    # Estimate how many nodes of each shape are needed and resize the cluster.
                    estimatedNodeCounts = self.scaler.getEstimatedNodeCounts(
                        queuedJobShapes, currentNodeCounts)
                    self.scaler.updateClusterSize(estimatedNodeCounts)
                    if self.stats:
                        self.stats.checkStats()
                except Exception:
                    logger.exception(
                        "Exception encountered in scaler thread. Making a best-effort "
                        "attempt to keep going, but things may go wrong from now on."
                    )
        self.scaler.shutDown()
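
Both the scaler loop above and the service-manager threads in the examples below pace each iteration with a `throttle` context manager. The sketch below shows a minimal context manager with the semantics these loops rely on (the body of the `with` block is padded with a sleep so each iteration takes at least the given number of seconds); it is an illustrative stand-in written for this page, not Toil's actual `throttle` implementation.

import time
from contextlib import contextmanager

@contextmanager
def throttle(min_interval):
    # Pad the `with` body so it takes at least min_interval seconds in total.
    start = time.time()
    try:
        yield
    finally:
        remaining = min_interval - (time.time() - start)
        if remaining > 0:
            time.sleep(remaining)

if __name__ == '__main__':
    # Three iterations, spaced roughly one second apart.
    for i in range(3):
        with throttle(1.0):
            print('tick', i, time.time())
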
Example #2
    def _startServices(jobGraphsWithServicesToStart,
                       jobGraphsWithServicesThatHaveStarted,
                       serviceJobsToStart,
                       terminate, jobStore):
        """
        Thread used to schedule services.
        """
        servicesThatAreStarting = set()
        servicesRemainingToStartForJob = {}
        serviceToJobGraph = {}
        while True:
            with throttle(1.0):
                if terminate.is_set():
                    logger.debug('Received signal to quit starting services.')
                    break
                try:
                    jobGraph = jobGraphsWithServicesToStart.get_nowait()
                    if len(jobGraph.services) > 1:
                        # Have to fall back to the old blocking behavior to
                        # ensure entire service "groups" are issued as a whole.
                        blockUntilServiceGroupIsStarted(jobGraph,
                                                        jobGraphsWithServicesThatHaveStarted,
                                                        serviceJobsToStart, terminate, jobStore)
                        continue
                    # Found a new job that needs to schedule its services.
                    for serviceJob in jobGraph.services[0]:
                        serviceToJobGraph[serviceJob] = jobGraph
                    servicesRemainingToStartForJob[jobGraph] = len(jobGraph.services[0])
                    # Issue the service jobs all at once.
                    for serviceJob in jobGraph.services[0]:
                        logger.debug("Service manager is starting service job: %s, start ID: %s", serviceJob, serviceJob.startJobStoreID)
                        serviceJobsToStart.put(serviceJob)
                    # We should now start to monitor these services to see if
                    # they've started yet.
                    servicesThatAreStarting.update(jobGraph.services[0])
                except Empty:
                    # No new jobs that need services scheduled.
                    pass

                for serviceJob in list(servicesThatAreStarting):
                    if not jobStore.fileExists(serviceJob.startJobStoreID):
                        # Service has started!
                        servicesThatAreStarting.remove(serviceJob)
                        parentJob = serviceToJobGraph[serviceJob]
                        servicesRemainingToStartForJob[parentJob] -= 1
                        assert servicesRemainingToStartForJob[parentJob] >= 0
                        del serviceToJobGraph[serviceJob]

                # Find if any jobGraphs have had *all* their services started.
                jobGraphsToRemove = set()
                for jobGraph, remainingServices in servicesRemainingToStartForJob.items():
                    if remainingServices == 0:
                        jobGraphsWithServicesThatHaveStarted.put(jobGraph)
                        jobGraphsToRemove.add(jobGraph)
                for jobGraph in jobGraphsToRemove:
                    del servicesRemainingToStartForJob[jobGraph]
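
The thread above communicates only through queues, a terminate Event, and the job store: a service job is handed off on serviceJobsToStart, and it counts as started once the file named by its startJobStoreID has been deleted from the job store. The following self-contained sketch illustrates that sentinel-file handshake using hypothetical stand-in classes (FakeJobStore, FakeService) written for this page; they are not Toil classes.

from queue import Queue

class FakeJobStore:
    def __init__(self):
        self.files = set()

    def fileExists(self, fileID):
        return fileID in self.files

    def deleteFile(self, fileID):
        self.files.discard(fileID)

class FakeService:
    def __init__(self, startJobStoreID):
        self.startJobStoreID = startJobStoreID

jobStore = FakeJobStore()
jobStore.files.add('start-flag-1')   # the flag file exists while the service is still starting
service = FakeService('start-flag-1')

serviceJobsToStart = Queue()
serviceJobsToStart.put(service)      # the manager hands the service off to be run

started = serviceJobsToStart.get_nowait()
jobStore.deleteFile(started.startJobStoreID)             # the running service announces it is up
assert not jobStore.fileExists(started.startJobStoreID)  # the manager's poll now sees it as started
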
Example #3
    def _startServices(jobDescriptionsWithServicesToStart,
                       jobDescriptionsWithServicesThatHaveStarted,
                       jobDescriptionsWithServicesThatHaveFailedToStart,
                       serviceJobsToStart, terminate, jobStore):
        """
        Thread used to schedule services.
        """

        # Keep the user informed, but not too informed, as services start up
        logLimiter = LocalThrottle(60)

        # These are all keyed by service JobDescription object, not ID
        # TODO: refactor!
        servicesThatAreStarting = set()
        servicesRemainingToStartForJob = {}
        serviceToParentJobDescription = {}
        jobDescriptionsWithFailedServices = set()
        while True:
            with throttle(1.0):
                if terminate.is_set():
                    logger.debug('Received signal to quit starting services.')
                    break
                try:
                    jobDesc = jobDescriptionsWithServicesToStart.get_nowait()
                    if len(list(jobDesc.serviceHostIDsInBatches())) > 1:
                        # Have to fall back to the old blocking behavior to
                        # ensure entire service "groups" are issued as a whole.
                        blockUntilServiceGroupIsStarted(
                            jobDesc,
                            jobDescriptionsWithServicesThatHaveStarted,
                            jobDescriptionsWithServicesThatHaveFailedToStart,
                            serviceJobsToStart, terminate, jobStore)
                        continue
                    # Found a new job that needs to schedule its services.
                    for onlyBatch in jobDesc.serviceHostIDsInBatches():
                        # There should be just one batch so we can do it here.
                        servicesRemainingToStartForJob[jobDesc] = len(
                            onlyBatch)
                        for serviceJobID in onlyBatch:
                            # Load up the service object.
                            # TODO: cache?
                            serviceJobDesc = jobStore.load(serviceJobID)
                            # Remember the parent job
                            serviceToParentJobDescription[
                                serviceJobDesc] = jobDesc
                            # We should now start to monitor this service to see if
                            # it has started yet.
                            servicesThatAreStarting.add(serviceJobDesc)
                            # Send the service JobDescription off to be started
                            logger.debug(
                                'Service manager is starting service job: %s, start ID: %s',
                                serviceJobDesc, serviceJobDesc.startJobStoreID)
                            serviceJobsToStart.put(serviceJobDesc)
                except Empty:
                    # No new jobs that need services scheduled.
                    pass

                pendingServiceCount = len(servicesThatAreStarting)
                if pendingServiceCount > 0 and logLimiter.throttle(False):
                    logger.debug('%d services are starting...',
                                 pendingServiceCount)

                for serviceJobDesc in list(servicesThatAreStarting):
                    if not jobStore.fileExists(serviceJobDesc.startJobStoreID):
                        # Service has started (or failed)
                        logger.debug(
                            'Service %s has removed %s and is therefore started',
                            serviceJobDesc, serviceJobDesc.startJobStoreID)
                        servicesThatAreStarting.remove(serviceJobDesc)
                        parentJob = serviceToParentJobDescription[
                            serviceJobDesc]
                        servicesRemainingToStartForJob[parentJob] -= 1
                        assert servicesRemainingToStartForJob[parentJob] >= 0
                        del serviceToParentJobDescription[serviceJobDesc]
                        if not jobStore.fileExists(
                                serviceJobDesc.errorJobStoreID):
                            logger.error(
                                'Service %s has immediately failed before it could be used',
                                serviceJobDesc)
                            # It probably hasn't filled in the promise that the job that uses the service needs.
                            jobDescriptionsWithFailedServices.add(parentJob)

                # Find if any JobDescriptions have had *all* their services started.
                jobDescriptionsToRemove = set()
                for jobDesc, remainingServices in servicesRemainingToStartForJob.items():
                    if remainingServices == 0:
                        if jobDesc in jobDescriptionsWithFailedServices:
                            logger.error(
                                'Job %s has had all its services try to start, but at least one failed',
                                jobDesc)
                            jobDescriptionsWithServicesThatHaveFailedToStart.put(
                                jobDesc)
                        else:
                            logger.debug('Job %s has all its services started',
                                         jobDesc)
                            jobDescriptionsWithServicesThatHaveStarted.put(
                                jobDesc)
                        jobDescriptionsToRemove.add(jobDesc)
                for jobDesc in jobDescriptionsToRemove:
                    del servicesRemainingToStartForJob[jobDesc]
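
Example #3 adds rate-limited logging with LocalThrottle(60), where logLimiter.throttle(False) is truthy at most once per 60 seconds and does not block. A minimal rate limiter with that interface might look like the sketch below; it is an assumption-based stand-in written for this page, not Toil's LocalThrottle.

import time

class LocalThrottle:
    # Illustrative stand-in: throttle(False) returns True at most once per
    # min_interval seconds and never sleeps; throttle(True) sleeps until the
    # interval has elapsed and then returns True.
    def __init__(self, min_interval):
        self.min_interval = min_interval
        self.last = 0.0

    def throttle(self, wait=True):
        now = time.time()
        if now - self.last >= self.min_interval:
            self.last = now
            return True
        if wait:
            time.sleep(self.min_interval - (now - self.last))
            self.last = time.time()
            return True
        return False

log_limiter = LocalThrottle(60)
for i in range(5):
    if log_limiter.throttle(False):
        print('periodic status message', i)   # emitted at most once per minute
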
Example #4
    def __start_services(self) -> None:
        """
        Thread used to schedule services.
        """

        # Keep the user informed, but not too informed, as services start up
        log_limiter = LocalThrottle(60)

        # These are all keyed by ID
        starting_services = set()
        remaining_services_by_client = {}
        service_to_client = {}
        clients_with_failed_services = set()
        while True:
            with throttle(1.0):
                if self.__terminate.is_set():
                    logger.debug('Received signal to quit starting services.')
                    break
                try:
                    client_id = self.__clients_in.get_nowait()
                    client = self.__toil_state.get_job(client_id)
                    host_id_batches = list(client.serviceHostIDsInBatches())
                    logger.debug(
                        "Service manager processing client %s with %d batches of services",
                        client, len(host_id_batches))
                    if len(host_id_batches) > 1:
                        # Have to fall back to the old blocking behavior to
                        # ensure entire service "groups" are issued as a whole.
                        self.__start_batches_blocking(client_id)
                        continue
                    # Found a new job that needs to schedule its services.
                    for batch in host_id_batches:
                        # There should be just one batch so we can do it here.
                        remaining_services_by_client[client_id] = len(batch)
                        for service_id in batch:
                            # Load up the service object.
                            service_job_desc = self.__toil_state.get_job(
                                service_id)
                            # Remember the parent job
                            service_to_client[service_id] = client_id
                            # We should now start to monitor this service to see if
                            # it has started yet.
                            starting_services.add(service_id)
                            # Send the service JobDescription off to be started
                            logger.debug(
                                'Service manager is starting service job: %s, start ID: %s',
                                service_job_desc,
                                service_job_desc.startJobStoreID)
                            self.__services_out.put(service_id)
                except Empty:
                    # No new jobs that need services scheduled.
                    pass

                pending_service_count = len(starting_services)
                if pending_service_count > 0 and log_limiter.throttle(False):
                    logger.debug('%d services are starting...',
                                 pending_service_count)

                for service_id in list(starting_services):
                    service_job_desc = self.__toil_state.get_job(service_id)
                    if not self.__job_store.fileExists(
                            service_job_desc.startJobStoreID):
                        # Service has started (or failed)
                        logger.debug(
                            'Service %s has removed %s and is therefore started',
                            service_job_desc, service_job_desc.startJobStoreID)
                        starting_services.remove(service_id)
                        client_id = service_to_client[service_id]
                        remaining_services_by_client[client_id] -= 1
                        assert remaining_services_by_client[client_id] >= 0
                        del service_to_client[service_id]
                        if not self.__job_store.fileExists(
                                service_job_desc.errorJobStoreID):
                            logger.error(
                                'Service %s has immediately failed before it could be used',
                                service_job_desc)
                            # It probably hasn't filled in the promise that the job that uses the service needs.
                            clients_with_failed_services.add(client_id)

                # Find if any clients have had *all* their services started.
                ready_clients = set()
                for client_id, remaining_services in remaining_services_by_client.items():
                    if remaining_services == 0:
                        if client_id in clients_with_failed_services:
                            logger.error(
                                'Job %s has had all its services try to start, but at least one failed',
                                self.__toil_state.get_job(client_id))
                            self.__failed_clients_out.put(client_id)
                        else:
                            logger.debug('Job %s has all its services started',
                                         self.__toil_state.get_job(client_id))
                            self.__clients_out.put(client_id)
                        ready_clients.add(client_id)
                for client_id in ready_clients:
                    del remaining_services_by_client[client_id]
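
Across all three service-manager variants the core bookkeeping is the same: count the services still starting for each client job, and once the count reaches zero route the client to either the "started" or the "failed" queue. A condensed, self-contained sketch of that pattern is shown below; the names and the merged decrement-and-check are simplifications for illustration, not the real ServiceManager state.

from queue import Queue

remaining_services_by_client = {'clientA': 2, 'clientB': 1}
clients_with_failed_services = set()
clients_out = Queue()          # clients whose services all started
failed_clients_out = Queue()   # clients with at least one service that failed to start

def service_finished_starting(client_id, failed=False):
    # Record that one of client_id's services has finished (or failed) starting.
    if failed:
        clients_with_failed_services.add(client_id)
    remaining_services_by_client[client_id] -= 1
    assert remaining_services_by_client[client_id] >= 0
    if remaining_services_by_client[client_id] == 0:
        if client_id in clients_with_failed_services:
            failed_clients_out.put(client_id)
        else:
            clients_out.put(client_id)
        del remaining_services_by_client[client_id]

service_finished_starting('clientA')
service_finished_starting('clientA')
service_finished_starting('clientB', failed=True)
print(clients_out.get_nowait())         # clientA
print(failed_clients_out.get_nowait())  # clientB
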