コード例 #1
0
ファイル: sitter.py プロジェクト: AlexeyMK/Cerebro
    def api_add_job(self, args):
        """
        Build a ProductionJob from API arguments and register it with
        the cluster state.

        `args` is a dict-like object.  The keys listed in `required_keys`
        are validated through `self._api_check`; 'linked_job' is optional.

        Returns a status string: whatever `_api_check` returned when that
        value is truthy, otherwise one of "Couldn't find linked job!",
        "Job Added" or "Error adding job, see logs".
        """
        # The order here is also the positional order that ProductionJob
        # receives these values in (after `self`).
        required_keys = ['dns_basename',
                         'task_configuration',
                         'deployment_layout',
                         'deployment_recipe',
                         'recipe_options',
                         'persistent']

        problem = self._api_check(args, required_keys)
        if problem:
            return problem

        linked_name = args.get('linked_job')
        job_args = [args[key] for key in required_keys]
        job_args.append(linked_name)
        job = ProductionJob(self, *job_args)

        # A job that names a linked job is only accepted when that
        # linked job can actually be resolved.
        if linked_name:
            job.find_linked_job()
            if not job.linked_job_object:
                return "Couldn't find linked job!"

        if not self.state.add_job(job):
            return "Error adding job, see logs"

        ClusterEventManager.handle(
            "Added a job: %s" % job.get_name())
        return "Job Added"
コード例 #2
0
ファイル: sitter.py プロジェクト: AlexeyMK/Cerebro
    def api_update_logging_level(self, args):
        """
        Apply one logging level to every logger in `self.state.loggers`.

        The level is read from args['level']; when absent it falls back
        to 20 (INFO).  Emits a cluster event and returns a confirmation
        string that includes the level.
        """
        # 20 == INFO, used when the caller does not supply a level
        new_level = args.get('level', 20)

        for target in self.state.loggers:
            target.setLevel(new_level)

        event_text = "Updated logging level to %s" % new_level
        ClusterEventManager.handle(event_text)

        reply = "Level set to %s" % new_level
        return reply
コード例 #3
0
ファイル: productionjob.py プロジェクト: AlexeyMK/Cerebro
    def ensure_on_linked_job(self, state, sitter):
        """
        1. Ensure the linked job exists, if not bail out
        2. Ensure that this job is running on each machine
        that the linked job is on.  If not, create a job filler for
        those machines and this job.
        Note: As a linked job we should never create a job filler
        that spawns new machines.  We should always just be populating
        existing machines.

        `state` must provide get_job_machines(), which is used as a
        mapping of zone -> machines.  `sitter` is accepted for signature
        compatibility and is not used here.

        Always returns True, including when the linked job is missing.
        """
        linked_job = self.find_linked_job()

        if not linked_job:
            # Logger.warn is a deprecated alias; warning() is the
            # supported spelling.
            logger.warning(
                "Couldn't find linked job (%s) for %s!" % (
                    self.linked_job, str(self)))
            # Returning False stops all other jobs this cycle, which
            # we don't want to do.
            return True

        job_fill_machines = state.get_job_machines()
        for zone in linked_job.get_shared_fate_zones():
            # Machines in this zone that are not yet running a task
            # named after this job.
            machines_to_fill = []
            for machine in job_fill_machines.get(zone, []):
                task_names = [
                    task['name'] for task in machine.get_running_tasks()]

                if self.name not in task_names:
                    machines_to_fill.append(machine)

            currently_spawning = sum(
                filler.num_remaining() for filler in self.fillers[zone])

            # Also check the linked job for active job fillers
            # we don't want to start a filler here if the linked job
            # is also actively filling, it should be sequential.
            currently_spawning += sum(
                filler.num_remaining() for filler in linked_job.fillers[zone])

            if not currently_spawning and machines_to_fill:
                ClusterEventManager.handle(
                    "New JobFiller for Linked Job: %s, %s, %s, %s" % (
                        machines_to_fill, zone, str(self), self.linked_job))

                filler = JobFiller(len(machines_to_fill), self,
                                   zone, machines_to_fill)
                filler.start()
                self.fillers[zone].append(filler)

        return True
コード例 #4
0
ファイル: jobfiller.py プロジェクト: AlexeyMK/Cerebro
    def run(self):
        """
        Drive the filler through its numbered states until
        `self.state.get_state()` reports 8.

        Each pass reads the current state and runs the matching step
        (0: create resources, 1: DNS, 2: monitoring code, 3: job code,
        4: launch tasks, 5: add to monitoring, 6: DNS with
        do_basename=True, 7: reboot dependent jobs).  The steps are
        expected to advance the state themselves; this loop never sets it.

        A step that raises is logged and retried on the next pass.  The
        run is abandoned (returns False) as soon as `self.fail_on_error`
        is set or the attempt counter exceeds 10.  On success the filler
        records `end_time`, removes itself from its job's filler list
        for its zone, and returns True.  `post_callback`, when set, is
        invoked on both outcomes.
        """
        import traceback

        logger.info("Starting JobFiller")
        release_attempts = 1
        while self.state.get_state() != 8:
            state = self.state.get_state()
            logger.info(
                "Running State: %s, attempt #%s" % (
                    str(self.state), release_attempts))

            try:
                if state == 0:
                    self.run_create_resources()
                elif state == 1:
                    self.ensure_dns()
                elif state == 2:
                    self.deploy_monitoring_code()
                elif state == 3:
                    self.deploy_job_code()
                elif state == 4:
                    self.launch_tasks()
                elif state == 5:
                    self.add_to_monitoring()
                elif state == 6:
                    self.ensure_dns(do_basename=True)
                elif state == 7:
                    self.reboot_dependent_jobs()
            except Exception:
                # Only ordinary errors are retried.  KeyboardInterrupt and
                # SystemExit propagate instead of being counted as a
                # failed attempt.
                release_attempts += 1
                traceback.print_exc()
                logger.error(traceback.format_exc())

                if release_attempts > 10 or self.fail_on_error:
                    logger.info("Job Filler: Failed")
                    ClusterEventManager.handle(
                        "Failed Filling: %s" % str(self))

                    # NOTE(review): this failure call passes `self`
                    # positionally while the success call below does not.
                    # One of the two call shapes is probably wrong;
                    # confirm against the post_callback signature.
                    if self.post_callback:
                        self.post_callback(self, success=False)

                    # NOTE(review): this path neither sets end_time nor
                    # removes the filler from self.job.fillers.
                    return False

        ClusterEventManager.handle(
            "Completed Filling: %s" % str(self))
        logger.info("Job Filler: Done!")
        self.end_time = datetime.now()

        if self in self.job.fillers.get(self.zone, []):
            self.job.fillers[self.zone].remove(self)

        if self.post_callback:
            self.post_callback(success=True)

        return True
コード例 #5
0
ファイル: sitter.py プロジェクト: AlexeyMK/Cerebro
    def api_remove_job(self, args):
        """
        Remove the job(s) matching args['name'] from the cluster state.

        Returns whatever `_api_check` returned when that value is truthy.
        Otherwise returns "Removed: <names>" when `state.remove_job`
        reported removed job names, or "Job Not Found" when it reported
        none.  A cluster event is emitted only when something was
        actually removed.
        """
        check = self._api_check(args, ['name'])

        if check:
            return check

        jobs = self.state.remove_job(args['name'])
        if not jobs:
            # Nothing was removed: do not emit an empty "Removed jobs:"
            # event, and do not try to join an empty or None result.
            return "Job Not Found"

        removed = ', '.join(jobs)
        ClusterEventManager.handle(
            "Removed jobs: %s" % removed)
        return "Removed: %s" % removed
コード例 #6
0
ファイル: sitter.py プロジェクト: AlexeyMK/Cerebro
    def api_update_job(self, args):
        """
        Ask the cluster state to start an update of the job named by
        args['job_name'].

        Returns whatever `_api_check` returned when that value is truthy,
        an error string when `state.update_job` reports failure, and
        "Job update initiated" (after emitting a cluster event) otherwise.
        """
        problem = self._api_check(args, ['job_name'])
        if problem:
            return problem

        target = args['job_name']
        if self.state.update_job(target):
            ClusterEventManager.handle('Update %s started' % target)
            return "Job update initiated"

        return "Error updating job: %s doesn't exist" % target
コード例 #7
0
ファイル: sitter.py プロジェクト: AlexeyMK/Cerebro
    def api_enforce_idle(self, args):
        """
        Set the global maximum number of idle machines allowed per zone.

        args['idle_count_per_zone'] must be convertible with int().
        Returns whatever `_api_check` returned when that value is truthy,
        "Invalid limit" when the value cannot be converted, and
        "Limit set" after storing the limit on
        `self.state.max_idle_per_zone` and emitting a cluster event.
        """
        # Really naive right now, a global # of
        # max idle per zone.  Could do a lot more here.
        check = self._api_check(args, ['idle_count_per_zone'])

        if check:
            return check

        # Only the conversion is guarded, so a failure elsewhere (for
        # example while storing the value) is not misreported as a bad
        # limit, and interrupts are not swallowed.
        try:
            limit = int(args['idle_count_per_zone'])
        except (TypeError, ValueError, OverflowError):
            return "Invalid limit"

        self.state.max_idle_per_zone = limit

        ClusterEventManager.handle(
            "Enforce Idle Limit at %s" % limit)
        return "Limit set"
コード例 #8
0
ファイル: sitter.py プロジェクト: AlexeyMK/Cerebro
    def decomission_machine(self, machine):
        """
        Remove a machine from cluster state, ask its zone provider to
        decommission it, and clean up DNS records that point at it.

        The machine is always removed from `self.state` first.  The
        method then returns early, without touching DNS, when no provider
        is registered for the machine's shared fate zone or when the
        provider refuses to decommission the machine.  Returns None in
        every case.
        """
        self.state.remove_machine(machine)

        zone = machine.config.shared_fate_zone
        provider = self.state.get_zone_provider(zone)
        if not provider:
            # Logger.warn is a deprecated alias; warning() is the
            # supported spelling.
            logger.warning(
                "No provider found for %s?" % zone)
            return

        ClusterEventManager.handle(
            "Decomissioning %s" % str(machine))

        if not provider.decomission(machine):
            # If we can't decomission it then perhaps its locked
            # and we should leave well enough alone at this point,
            # just remove it from monitoring etc.
            ClusterEventManager.handle(
                "Provider doesn't allow decomissioning of %s" % str(machine))
            return

        ip = machine.config.ip
        dns_name = machine.config.dns_name
        if dns_name:
            self.dns_provider.remove_record(data=ip,
                                            hostName=dns_name)

            # Strip off the leading number, e.g.
            # 12.bar.mydomain.com -> bar.mydomain.com
            root_name = '.'.join(dns_name.split('.')[1:])

            self.dns_provider.remove_record(data=ip,
                                            hostName=root_name)

        # Now look for other dangling records pointing to this machine
        # and delete those too.
        for record in self.dns_provider.get_records():
            if record['value'] == ip:
                logger.info(
                    "Removing %s from %s" % (
                        ip, record['record']))
                self.dns_provider.remove_record(data=ip,
                                                hostName=record['record'])

        ClusterEventManager.handle(
            "Decomissioning of %s complete!" % str(machine))
コード例 #9
0
ファイル: productionjob.py プロジェクト: AlexeyMK/Cerebro
    def refill(self, state, sitter):
        """
        Start whatever JobFillers are needed to bring this job up to its
        required machine count in each shared fate zone.

        Steps, in order:
        1. Block while `sitter.machines_in_queue()` is true.  If any
           waiting happened, return False so the caller retries after the
           job fill has been recalculated.
        2. Block until this job's name appears in `state.job_fill`.
        3. Drop fillers that are done and finished more than 5 minutes ago.
        4. If this job has a linked job, delegate to
           `ensure_on_linked_job` and return its result.
        5. Otherwise, per zone, work out how many more machines are
           needed beyond those active or already being spawned, and start
           a JobFiller for the shortfall, handing it the idle machines
           that can be reused.

        Returns False only in step 1; True otherwise (or the linked-job
        result in step 4).
        """
        self.sitter = sitter

        new_machines = False
        while self.sitter.machines_in_queue():
            new_machines = True
            # We want to ensure any machines recently added to monitoring
            # have had a chance to load their data, incase they are
            # running this job
            logger.info("Waiting for machine monitors to load machine data "
                        "before filling jobs")
            time.sleep(0.5)

        if new_machines:
            # If we had to wait for new machines that means that
            # there are new machines, and we need to recalculate
            # job fill before it is safe to do refill.  The next
            # pass should be OK.
            logger.info("Waiting for next jobfill to be calculated before "
                        "doing a refill")

            return False

        while self.name not in state.job_fill:
            # 1) Assume this job has already been added to state.jobs
            # 2) Want to ensure calculator has run at least once to find out
            #    if this job already exists throughout the cluster
            logger.info(
                "Waiting for calculator thread to kick in before "
                "filling jobs")
            time.sleep(0.5)

        # Clear out finished fillers after 5 minutes
        for zone, fillers in self.fillers.items():
            # Iterate over a snapshot: removing from the list that is
            # being iterated skips the element following each removal,
            # which left expired fillers behind.
            for filler in list(fillers):
                now = datetime.now()
                if (filler.is_done() and
                        now - filler.end_time > timedelta(minutes=5)):
                    logger.info(
                        "Removing a filler from %s for %s" % (
                            zone, self.name))
                    fillers.remove(filler)

        # If we have a linked job then bypass all the normal logic
        # and just piggyback on those machines
        if self.linked_job:
            return self.ensure_on_linked_job(state, sitter)

        #!MACHINEASSUMPTION!
        # Step 1: Ensure we have enough machines in each SFZ
        # Step 1a: Check for idle machines and reserve as we find them
        for zone in self.get_shared_fate_zones():
            idle_available = state.get_idle_machines(zone)
            total_required = self.get_num_required_machines_in_zone(zone)
            currently_active = state.job_fill[self.name][zone]

            currently_spawning = sum(
                filler.num_remaining() for filler in self.fillers[zone])
            self.currently_spawning[zone] = currently_spawning

            # Machines still needed once active and in-flight ones are
            # accounted for; may be zero or negative.
            idle_required = (
                total_required - currently_active - currently_spawning)

            # !MACHINEASSUMPTION! Ideally we're counting resources here
            # not machines
            required_new_machine_count = max(
                (idle_required - len(idle_available)), 0)

            # Only log at info level when there is actual work to do.
            do_log = logger.info if idle_required > 0 else logger.debug

            do_log(
                ("Calculated job requirements for %s in %s: " % (self.name,
                                                                 zone)) +
                "Currently Active: %s " % (currently_active) +
                "Idle Required: %s, Total New: %s " % (
                    idle_required,
                    required_new_machine_count) +
                "Currently Spawning: %s " % (currently_spawning) +
                "idle-available: %s " % (len(idle_available)) +
                "total_required: %s " % (total_required)
            )

            if required_new_machine_count <= 0:
                # idle_available > idle_required, so use just as many
                # as we need
                usable_machines = idle_available[:idle_required]
            else:
                # Otherwise take all the available idle ones, and
                # we'll make more
                usable_machines = list(idle_available)

            if idle_required > 0:
                ClusterEventManager.handle(
                    "New JobFiller: %s, %s, %s, %s" % (
                        idle_required, zone, str(self), usable_machines))

                filler = JobFiller(idle_required, self,
                                   zone, usable_machines)
                self.fillers[zone].append(filler)
                filler.start()

        return True