Exemple #1
0
    def get_new_broks(self):
        """Collect the broks raised by all the satellites we are linked to

        The scheduler, poller, reactionner and receiver links are polled in turn.
        The broks got from a link are tagged with the link ``instance_id`` and
        appended to ``self.external_broks``. When a link raises a ``LinkError``,
        a warning is logged and the next link is polled.

        :return: None
        """
        link_groups = (self.schedulers, self.pollers, self.reactionners, self.receivers)
        for group in link_groups:
            # Iterate over a snapshot of the links of this group
            for link in list(group.values()):
                logger.debug("Getting broks from %s", link)

                started = time.time()
                try:
                    received = link.get_broks(self.name)
                except LinkError:
                    logger.warning(
                        "Daemon %s connection failed, I could not get the broks!", link)
                    continue

                # Nothing to account for when the link had no broks for us
                if not received:
                    continue

                logger.debug("Got %d Broks from %s in %s",
                             len(received), link.name, time.time() - started)
                statsmgr.gauge('get-new-broks-count.%s' % (link.name), len(received))
                statsmgr.timer('get-new-broks-time.%s' % (link.name), time.time() - started)

                # Remember which daemon instance raised each brok
                for brok in received:
                    brok.instance_id = link.instance_id

                # Queue the broks in our global list
                self.external_broks.extend(received)
Exemple #2
0
    def get_internal_broks(self):
        """Move the internally raised broks to the list of broks to manage

        The count of ``self.internal_broks`` is reported to the statistics manager,
        the broks are appended to ``self.external_broks`` and the internal list is
        then replaced with a new empty list.

        :return: None
        """
        raised = self.internal_broks
        statsmgr.gauge('get-new-broks-count.broker', len(raised))

        # Queue them in our global list and start again from an empty list
        self.external_broks.extend(raised)
        self.internal_broks = []
Exemple #3
0
    def get_arbiter_broks(self):
        """Move the broks received from the arbiters to the list of broks to manage

        The count report, the transfer to ``self.external_broks`` and the reset of
        ``self.arbiter_broks`` are all done while holding ``self.arbiter_broks_lock``.
        According to the original author, the arbiter may push into the
        ``arbiter_broks`` list without the global lock, hence this dedicated lock.

        TODO: really? check this arbiter behavior!

        :return: None
        """
        with self.arbiter_broks_lock:
            pushed = self.arbiter_broks
            statsmgr.gauge('get-new-broks-count.arbiter', len(pushed))

            # Queue them in our global list and start again from an empty list
            self.external_broks.extend(pushed)
            self.arbiter_broks = []
Exemple #4
0
    def do_get_new_actions(self):
        """Get new actions from schedulers
        Create a Message and put into the module queue
        REF: doc/fusionsupervision-action-queues.png (1)

        Every active scheduler link is asked for the actions matching the class
        level ``do_checks`` / ``do_actions`` flags, our poller and reactionner tags,
        our name and the module types known in ``self.q_by_mod``. The actions got
        are handed over to ``self.add_actions`` with the scheduler ``instance_id``.

        The 'actions.added.count.<scheduler>' gauge is updated for each active
        scheduler; it is set to 0 when the scheduler returned nothing (empty list
        or None).

        :return: None
        """
        # Here are the differences between a poller and a reactionner:
        # Poller will only do checks,
        # Reactionner will do actions (notifications and event handlers)
        do_checks = self.__class__.do_checks
        do_actions = self.__class__.do_actions

        # We check and get the new actions to execute in each of our schedulers
        for scheduler_link in self.schedulers.values():
            if not scheduler_link.active:
                logger.warning("My scheduler '%s' is not active currently", scheduler_link.name)
                continue

            logger.debug("get new actions, scheduler: %s", scheduler_link.name)

            # OK, go for it :)
            _t0 = time.time()
            actions = scheduler_link.get_actions({
                'do_checks': do_checks,
                'do_actions': do_actions,
                'poller_tags': self.poller_tags,
                'reactionner_tags': self.reactionner_tags,
                'worker_name': self.name,
                'module_types': list(self.q_by_mod.keys()),
            })
            if actions:
                logger.debug("Got %d actions from %s", len(actions), scheduler_link.name)
                # We 'tag' them with my_scheduler and put into queue for workers
                self.add_actions(actions, scheduler_link.instance_id)
                logger.debug("Got %d actions from %s in %s",
                             len(actions), scheduler_link.name, time.time() - _t0)

            # The link may have returned None rather than an empty list: do not
            # call len() on it, simply report that no action was added
            statsmgr.gauge('actions.added.count.%s' % (scheduler_link.name),
                           len(actions) if actions else 0)
Exemple #5
0
    def do_loop_turn(self):
        """Run one turn of the receiver daemon main loop

        In this order: clean the zombie modules, set up a new configuration if one
        is detected, get the objects from the queues, get the external commands
        from the arbiters, push the external commands to the schedulers and call
        the modules 'tick' hook. Each step, except the first two, is timed and
        its duration is reported to the statistics manager.

        :return: None
        """
        def timed(metric, step):
            """Call ``step`` and report its duration as the ``metric`` timer"""
            started = time.time()
            step()
            statsmgr.timer(metric, time.time() - started)

        # Begin to clean modules
        self.check_and_del_zombie_modules()

        # The arbiter may have pushed a new configuration: manage it
        if self.watch_for_new_conf(timeout=0.05):
            logger.info("I got a new configuration...")
            self.setup_new_conf()

        # Objects that may have been raised by the external modules
        timed('core.get-objects-from-queues', self.get_objects_from_from_queues)

        # External commands coming from the arbiters...
        timed('external-commands.got.time', self.get_external_commands_from_arbiters)
        statsmgr.gauge('external-commands.got.count',
                       len(self.unprocessed_external_commands))

        # ... that we forward to the schedulers
        timed('external-commands.pushed.time', self.push_external_commands_to_schedulers)

        # Tell the modules that a new tick started
        timed('hook.tick', lambda: self.hook_point('tick'))
Exemple #6
0
    def do_loop_turn(self):  # pylint: disable=too-many-branches
        """Satellite main loop::

        * Check and delete zombie modules / workers
        * Call the modules 'tick' hook
        * Report the workers queues sizes
        * Adjust worker number
        * Manage the messages available in the returns queue
        * If not passive, push the results and get new actions
        * Get the objects from the queues and report some counters

        :return: None
        """
        # Restart the dead modules and get rid of the zombie workers
        self.check_and_del_zombie_modules()
        self.check_and_del_zombie_workers()

        # Call modules that manage a starting tick pass
        self.hook_point('tick')

        # Debug log and statistics about the workers queues
        for scheduler in self.schedulers.values():
            for module_name, workers_queues in self.q_by_mod.items():
                # One actions queue per worker, and a results queue shared by all
                for worker_id, actions_queue in list(workers_queues.items()):
                    try:
                        queued_actions = actions_queue.qsize()
                        queued_results = self.returns_queue.qsize()
                        logger.debug("[%s][%s][%s] actions queued: %d, results queued: %d",
                                     scheduler.name, module_name, worker_id,
                                     queued_actions, queued_results)
                        statsmgr.gauge('worker.%s.actions-queue-size' % worker_id,
                                       queued_actions)
                        statsmgr.gauge('worker.%s.results-queue-size' % worker_id,
                                       queued_results)
                    except (IOError, EOFError):
                        # A queue that is not readable is ignored for the statistics
                        pass

        # TODO: the wait ratio adaptation below is temporarily deactivated!
        # Before return or get new actions, see how we managed
        # the former ones: are they still in queue(s)? If so, we
        # must wait more or at least have more workers
        # wait_ratio = self.wait_ratio.get_load()
        # total_q = 0
        # try:
        #     for mod in self.q_by_mod:
        #         for queue in list(self.q_by_mod[mod].values()):
        #             total_q += queue.qsize()
        # except (IOError, EOFError):
        #     pass
        # if total_q != 0 and wait_ratio < 2 * self.worker_polling_interval:
        #     logger.debug("I decide to increase the wait ratio")
        #     self.wait_ratio.update_load(wait_ratio * 2)
        #     # self.wait_ratio.update_load(self.worker_polling_interval)
        # else:
        #     # Go to self.worker_polling_interval on normal run, if wait_ratio
        #     # was >2*self.worker_polling_interval,
        #     # it make it come near 2 because if < 2, go up :)
        #     self.wait_ratio.update_load(self.worker_polling_interval)
        # wait_ratio = self.wait_ratio.get_load()
        # statsmgr.timer('core.wait-ratio', wait_ratio)
        # if self.log_loop:
        #     logger.debug("[%s] wait ratio: %f", self.name, wait_ratio)

        # Launch new workers if we do not have enough of them
        self.adjust_worker_number_by_load()

        # Manage all the messages currently available in the returns queue
        try:
            logger.debug("[%s] manage action results: %d results",
                         self.name, self.returns_queue.qsize())
            while self.returns_queue.qsize():
                message = self.returns_queue.get_nowait()
                if message is None:
                    continue
                if not isinstance(message, Message):
                    logger.warning("Should have received a Message, got a %s!", type(message))
                    continue

                logger.debug("Got a message: %s", message)
                if message.get_type() == 'Done':
                    # An action result
                    logger.debug("Got (from %s) an action result: %s",
                                 message.get_source(), message.get_data())
                    self.manage_action_return(message.get_data())
                    continue

                if message.get_type() != 'Stats':
                    logger.warning("Ignoring message of type: %s", message.get_type())
                    continue

                # Statistics, only stored when the source is one of our workers
                logger.debug("Got (from %s) stats: %s",
                             message.get_source(), message.get_data())
                if message.get_source() in self.workers:
                    self.workers[message.get_source()].stats = message.get_data()
        except Full:
            logger.warning("Returns queue is full")
        except Empty:
            logger.debug("Returns queue is empty")
        except (IOError, EOFError) as exp:
            logger.warning("My returns queue is no more available: %s", str(exp))
        except Exception as exp:  # pylint: disable=broad-except
            logger.error("Failed getting messages in returns queue: %s", str(exp))
            logger.error(traceback.format_exc())

        for scheduler in self.schedulers.values():
            if scheduler.wait_homerun:
                logger.debug("scheduler home run: %d results", len(scheduler.wait_homerun))

        if not self.passive:
            # We are not a passive satellite: we initiate the exchanges with
            # our schedulers. A connection failure is only logged.

            # We send to our schedulers the results of all finished checks
            logger.debug("pushing results...")
            try:
                self.push_results()
            except LinkError:
                logger.warning("Scheduler connection failed, I could not send my results!")

            # And we get the new actions from our schedulers
            logger.debug("getting new actions...")
            try:
                self.get_new_actions()
            except LinkError:
                logger.warning("Scheduler connection failed, I could not get new actions!")

        # Get objects from our modules that are not Worker based
        if self.log_loop:
            logger.debug("[%s] get objects from queues", self.name)
        self.get_objects_from_from_queues()

        # Report what is currently stored in our lists
        statsmgr.gauge('external-commands.count', len(self.external_commands))
        statsmgr.gauge('broks.count', len(self.broks))
        statsmgr.gauge('events.count', len(self.events))
Exemple #7
0
    def push_external_commands_to_schedulers(self):
        """Push the received external commands to the schedulers

        The unprocessed external commands are resolved with the external commands
        manager; each command resolved as 'global' is appended to the
        ``pushed_commands`` list of all our scheduler links.

        Then, for each active scheduler link that has some pushed commands, the
        commands lines are sent with ``push_external_commands`` and the link list
        is emptied whether the push succeeded or not. When nothing was sent, the
        commands lines are appended to ``self.external_commands``.

        Pushed and failed counters are reported to the statistics manager, per
        scheduler and globally.

        :return: None
        """
        if not self.unprocessed_external_commands:
            return

        # Take the pending global external commands and reset our list
        pending, self.unprocessed_external_commands = self.unprocessed_external_commands, []
        logger.debug("Commands: %s", pending)

        # Resolve each command and dispatch the global ones
        logger.debug("Commands to process: %d commands", len(pending))
        for ext_cmd in pending:
            resolved = self.external_commands_manager.resolve_command(ext_cmd)
            logger.debug("Resolved command: %s, result: %s", ext_cmd.cmd_line, resolved)
            if not resolved or not resolved['global']:
                continue

            # A global command is for all our schedulers
            for link in self.schedulers.values():
                link.pushed_commands.append(ext_cmd)

        # Now send its commands to each active scheduler
        pushed_total = 0
        failed_total = 0
        for link in self.schedulers.values():
            if not link.active:
                logger.debug(
                    "The scheduler '%s' is not active, it is not possible to push "
                    "external commands to its connection!", link.name)
                continue

            # Only the commands lines are sent to the scheduler
            cmd_lines = [pushed.cmd_line for pushed in link.pushed_commands]
            if not cmd_lines:
                logger.debug("The scheduler '%s' has no commands.", link.name)
                continue

            logger.debug("Sending %d commands to scheduler %s", len(cmd_lines), link.name)
            try:
                sent = link.push_external_commands(cmd_lines)
            except LinkError:
                logger.warning(
                    "Scheduler connection failed, I could not push external commands!"
                )
                sent = []

            # Whether we sent the commands or not, clean the scheduler list
            link.pushed_commands = []

            if sent:
                statsmgr.gauge('external-commands.pushed.%s' % link.name, len(cmd_lines))
                pushed_total += len(cmd_lines)
            else:
                failed_total += len(cmd_lines)
                statsmgr.gauge('external-commands.failed.%s' % link.name, len(cmd_lines))
                # Keep the commands that were not sent... for a next try
                self.external_commands.extend(cmd_lines)

        statsmgr.gauge('external-commands.pushed.all', pushed_total)
        statsmgr.gauge('external-commands.failed.all', failed_total)
Exemple #8
0
    def do_loop_turn(self):
        # pylint: disable=too-many-branches
        """Loop used to:
         * get the initial status broks, until some are got
         * check if modules are alive, if not restart them
         * get broks from ourself, the arbiters and our satellites
         * add broks to the queue of each external module
         * manage broks with each internal module

         If the internal broks management is longer than 0.8 seconds, postpone to the next
         loop turn to avoid overloading the broker daemon.

         :return: None
        """
        if not self.got_initial_broks:
            # Ask each of my schedulers for its initial broks
            scheduler_links = self.get_links_of_type(s_type='scheduler')
            for link in list(scheduler_links.values()):
                logger.info("Asking my initial broks from '%s'", link.name)
                asked_at = time.time()
                try:
                    initial_broks = link.get_initial_broks(self.name)
                except LinkError:
                    logger.warning(
                        "Scheduler connection failed, I could not get initial broks!"
                    )
                    continue

                statsmgr.timer('broks.initial.%s.time' % link.name, time.time() - asked_at)
                if not initial_broks:
                    # Give up for this loop turn
                    logger.info("No initial broks were raised, my scheduler is not yet ready...")
                    return

                self.got_initial_broks = True
                logger.debug("Got %d initial broks from '%s'", initial_broks, link.name)
                statsmgr.gauge('broks.initial.%s.count' % link.name, initial_broks)

        logger.debug("Begin Loop: still some old broks to manage (%d)",
                     len(self.external_broks))
        if self.external_broks:
            statsmgr.gauge('unmanaged.broks', len(self.external_broks))

        # Try to see if one of my module is dead, and restart previously dead modules
        self.check_and_del_zombie_modules()

        # Call modules that manage a starting tick pass
        tick_started = time.time()
        self.hook_point('tick')
        statsmgr.timer('hook.tick', time.time() - tick_started)

        # Gather the broks: raised internally, sent by the arbiters and
        # raised by our distant daemons
        self.get_internal_broks()
        self.get_arbiter_broks()
        self.get_new_broks()

        # The broks not yet sent to our external modules; a brok that has no
        # 'to_be_sent' attribute is considered as to be sent
        send_started = time.time()
        unsent_broks = [
            candidate for candidate in self.external_broks
            if getattr(candidate, 'to_be_sent', True)
        ]
        statsmgr.gauge('get-new-broks-count.to_send', len(unsent_broks))

        # Put the whole packet in the to_q queue of each external module.
        # The sub-process/queue may be dead/closed; if so, the module is tagged
        # to be restarted instead of killing ourselves :)
        for module in self.modules_manager.get_external_instances():
            try:
                put_started = time.time()
                waiting = module.to_q.qsize()
                statsmgr.gauge('queues.external.%s.to.size' % module.get_name(), waiting)
                module.to_q.put(unsent_broks)
                statsmgr.timer('queues.external.%s.to.put' % module.get_name(),
                               time.time() - put_started)
            except Exception as exp:  # pylint: disable=broad-except
                logger.warning(
                    "Module %s queue exception: %s, I'm tagging it to restart later",
                    module.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(module)

        # No more need to send them
        for sent_brok in unsent_broks:
            sent_brok.to_be_sent = False
        logger.debug("Time to send %s broks (%d secs)", len(unsent_broks),
                     time.time() - send_started)

        # Make the internal modules manage the broks
        manage_started = time.time()
        while self.external_broks:
            # Do not 'manage' more than 0.8s, we must get new broks almost every second
            if time.time() - manage_started > 0.8:
                logger.info(
                    "I did not yet managed all my broks, still %d broks",
                    len(self.external_broks))
                break

            # Take the oldest brok of the list
            brok = self.external_broks.pop(0)
            if self.modules_manager.get_internal_instances():
                self.manage_brok(brok)
                # Make a very short pause to avoid overloading
                self.make_a_pause(0.01, check_time_change=False)
            elif getattr(brok, 'to_be_sent', False):
                # No internal module: a brok still flagged as to be sent is queued again
                self.external_broks.append(brok)

        # Maybe our external modules raised 'objects', so get them
        if self.get_objects_from_from_queues():
            statsmgr.gauge('external-commands.got.count',
                           len(self.external_commands))
            statsmgr.gauge('broks.got.count', len(self.external_broks))