Example #1
    def do_loop_turn(self):
        """Receiver daemon main loop

        :return: None
        """

        # Begin to clean modules
        self.check_and_del_zombie_modules()

        # Now we check if we received a new configuration - no sleep time, we will sleep later...
        self.watch_for_new_conf()
        if self.new_conf:
            self.setup_new_conf()

        # Maybe external modules raised 'objects'
        # we should get them
        _t0 = time.time()
        self.get_objects_from_from_queues()
        statsmgr.timer('core.get-objects-from-queues', time.time() - _t0)

        _t0 = time.time()
        self.push_external_commands_to_schedulers()
        statsmgr.timer('core.push-external-commands', time.time() - _t0)

        # Maybe we do not have something to do, so we wait a little
        # todo: check broks in the receiver ???
        if not self.broks:
            self.watch_for_new_conf(1.0)
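For context, do_loop_turn is invoked repeatedly by the daemon's main loop, which is not shown in these examples. A minimal driver illustrating that contract might look like the sketch below (the interrupted flag and the pause length are assumptions, not Alignak's actual main loop):

    import time

    def main_loop(daemon, loop_pause=1.0):
        # Simplified driver: run the daemon's loop turn until asked to stop.
        # The real daemon also handles signals, timings and statistics here.
        while not getattr(daemon, 'interrupted', False):
            daemon.do_loop_turn()
            time.sleep(loop_pause)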
Example #2
    def get_new_broks(self):
        """Get new broks from our satellites

        :return: None
        """
        for satellites in [
                self.schedulers, self.pollers, self.reactionners,
                self.receivers
        ]:
            for satellite_link in list(satellites.values()):
                logger.debug("Getting broks from %s", satellite_link)

                _t0 = time.time()
                try:
                    tmp_broks = satellite_link.get_broks(self.name)
                except LinkError:
                    logger.warning(
                        "Daemon %s connection failed, I could not get the broks!",
                        satellite_link)
                else:
                    if tmp_broks:
                        logger.debug("Got %d Broks from %s in %s",
                                     len(tmp_broks), satellite_link.name,
                                     time.time() - _t0)
                        statsmgr.gauge(
                            'get-new-broks-count.%s' % (satellite_link.name),
                            len(tmp_broks))
                        statsmgr.timer(
                            'get-new-broks-time.%s' % (satellite_link.name),
                            time.time() - _t0)
                        for brok in tmp_broks:
                            brok.instance_id = satellite_link.instance_id

                        # Add the broks to our global list
                        self.external_broks.extend(tmp_broks)
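statsmgr, used throughout these examples, is Alignak's global statistics manager. To experiment with snippets like this outside Alignak, a minimal in-memory stand-in that mirrors the timer and gauge call signatures seen above could be sketched as follows (the storage layout is an assumption):

    from collections import defaultdict

    class FakeStatsManager(object):
        """In-memory stand-in for Alignak's statsmgr (timer/gauge only)."""

        def __init__(self):
            self.timers = defaultdict(list)  # metric name -> recorded durations (s)
            self.gauges = {}                 # metric name -> last recorded value

        def timer(self, key, value):
            # Record an elapsed time for this metric
            self.timers[key].append(value)

        def gauge(self, key, value):
            # Record the current value of this metric
            self.gauges[key] = value

    statsmgr = FakeStatsManager()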
Example #3
    def get_new_broks(self):
        """Get new broks from our satellites

        :return: None
        """
        for satellites in [self.schedulers, self.pollers, self.reactionners, self.receivers]:
            for satellite_link in list(satellites.values()):
                logger.debug("Getting broks from %s", satellite_link)

                _t0 = time.time()
                try:
                    tmp_broks = satellite_link.get_broks(self.name)
                except LinkError:
                    logger.warning("Daemon %s connection failed, I could not get the broks!",
                                   satellite_link)
                else:
                    if tmp_broks:
                        logger.debug("Got %d Broks from %s in %s",
                                     len(tmp_broks), satellite_link.name, time.time() - _t0)
                        statsmgr.gauge('get-new-broks-count.%s'
                                       % (satellite_link.name), len(tmp_broks))
                        statsmgr.timer('get-new-broks-time.%s'
                                       % (satellite_link.name), time.time() - _t0)
                        for brok in tmp_broks:
                            brok.instance_id = satellite_link.instance_id

                        # Add the broks to our global list
                        self.external_broks.extend(tmp_broks)
Example #4
    def get_new_actions(self):
        """ Wrapper function for do_get_new_actions
        For stats purposes

        :return: None
        TODO: Use a decorator for timing this function
        """
        try:
            _t0 = time.time()
            self.do_get_new_actions()
            statsmgr.timer('actions.got.time', time.time() - _t0)
        except RuntimeError:
            logger.error("Exception like issue #1007")
Example #5
    def do_loop_turn(self):
        """Scheduler loop turn

        Simply run the Alignak scheduler loop

        This is called when a configuration has been received by the scheduler daemon. As such,
        check whether the first scheduling has been done... and manage this.

        :return: None
        """
        if not self.first_scheduling:
            # Ok, now all is initialized, we can make the initial broks
            logger.info("First scheduling launched")
            _t0 = time.time()
            # Program start brok
            self.sched.initial_program_status()
            # First scheduling
            self.sched.schedule()
            statsmgr.timer('first_scheduling', time.time() - _t0)
            logger.info("First scheduling done")

            # Connect to our passive satellites if needed
            for satellite in [
                    s for s in list(self.pollers.values()) if s.passive
            ]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s",
                                 satellite)

            for satellite in [
                    s for s in list(self.reactionners.values()) if s.passive
            ]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s",
                                 satellite)

            # Ticks are for recurrent function calls like consume, delete zombies, etc.
            self.sched.ticks = 0
            self.first_scheduling = True

        # Each loop turn, execute the daemon specific treatment...
        # only if the daemon has a configuration to manage
        if self.sched.pushed_conf:
            # If scheduling is not yet enabled, enable scheduling
            if not self.sched.must_schedule:
                self.sched.start_scheduling()
                self.sched.before_run()
            self.sched.run()
        else:
            logger.warning("#%d - No monitoring configuration to scheduler...",
                           self.loop_count)
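The two passive-satellite loops above differ only in the collection they iterate; a refactoring sketch that folds them together (this helper is hypothetical, not upstream code):

    import itertools

    def connect_passive_satellites(self):
        """Connect to all passive pollers and reactionners (refactoring sketch)."""
        for satellite in itertools.chain(self.pollers.values(),
                                         self.reactionners.values()):
            if satellite.passive and not self.daemon_connection_init(satellite):
                logger.error("Passive satellite connection failed: %s", satellite)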
Example #6
    def do_loop_turn(self):
        """Receiver daemon main loop

        :return: None
        """

        # Begin to clean modules
        self.check_and_del_zombie_modules()

        # Maybe the arbiter pushed a new configuration...
        if self.watch_for_new_conf(timeout=0.05):
            logger.info("I got a new configuration...")
            # Manage the new configuration
            self.setup_new_conf()

        # Maybe external modules raised 'objects'
        # we should get them
        _t0 = time.time()
        self.get_objects_from_from_queues()
        statsmgr.timer('core.get-objects-from-queues', time.time() - _t0)

        # Get external commands from the arbiters...
        _t0 = time.time()
        self.get_external_commands_from_arbiters()
        statsmgr.timer('external-commands.got.time', time.time() - _t0)
        statsmgr.gauge('external-commands.got.count', len(self.unprocessed_external_commands))

        _t0 = time.time()
        self.push_external_commands_to_schedulers()
        statsmgr.timer('external-commands.pushed.time', time.time() - _t0)

        # Say to modules it's a new tick :)
        _t0 = time.time()
        self.hook_point('tick')
        statsmgr.timer('hook.tick', time.time() - _t0)
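hook_point('tick') broadcasts the loop tick to every module that implements the matching hook. A simplified dispatcher showing the idea (the real implementation also tags failing modules for restart; this sketch assumes the hook_<name> naming convention used by Alignak modules):

    def hook_point(self, hook_name):
        """Call hook_<hook_name>(self) on every module instance that defines it."""
        full_hook_name = 'hook_' + hook_name
        for module in self.modules_manager.instances:
            hook = getattr(module, full_hook_name, None)
            if hook is None:
                continue
            try:
                hook(self)
            except Exception as exp:  # pylint: disable=broad-except
                logger.exception("The hook %s of module %s failed: %s",
                                 hook_name, module.get_name(), exp)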
Example #7
    def do_loop_turn(self):
        """Scheduler loop turn

        Simply run the Alignak scheduler loop

        This is called when a configuration has been received by the scheduler daemon. As such,
        check whether the first scheduling has been done... and manage this.

        :return: None
        """
        if not self.first_scheduling:
            # Ok, now all is initialized, we can make the initial broks
            logger.info("First scheduling launched")
            _t0 = time.time()
            # Program start brok
            self.sched.initial_program_status()
            # First scheduling
            self.sched.schedule()
            statsmgr.timer('first_scheduling', time.time() - _t0)
            logger.info("First scheduling done")

            # Connect to our passive satellites if needed
            for satellite in [s for s in list(self.pollers.values()) if s.passive]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s", satellite)

            for satellite in [s for s in list(self.reactionners.values()) if s.passive]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s", satellite)

            # Ticks are for recurrent function calls like consume, delete zombies, etc.
            self.sched.ticks = 0
            self.first_scheduling = True

        # Each loop turn, execute the daemon specific treatment...
        # only if the daemon has a configuration to manage
        if self.sched.pushed_conf:
            # If scheduling is not yet enabled, enable scheduling
            if not self.sched.must_schedule:
                self.sched.start_scheduling()
                self.sched.before_run()
            self.sched.run()
        else:
            logger.warning("#%d - No monitoring configuration to scheduler...",
                           self.loop_count)
Example #8
    def manage_brok(self, brok):
        """Get a brok.
        We put brok data to the modules

        :param brok: object with data
        :type brok: object
        :return: None
        """
        # Call all modules if they catch the call
        for mod in self.modules_manager.get_internal_instances():
            try:
                _t0 = time.time()
                mod.manage_brok(brok)
                statsmgr.timer('core.manage-broks.%s' % mod.get_name(), time.time() - _t0)
            except Exception as exp:  # pylint: disable=broad-except
                logger.warning("The mod %s raise an exception: %s, I'm tagging it to restart later",
                               mod.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(mod)
Example #9
    def manage_brok(self, brok):
        """Get a brok.
        We put brok data to the modules

        :param brok: object with data
        :type brok: object
        :return: None
        """
        # Unserialize the brok before consuming it
        brok.prepare()

        for module in self.modules_manager.get_internal_instances():
            try:
                _t0 = time.time()
                module.manage_brok(brok)
                statsmgr.timer('manage-broks.internal.%s' % module.get_name(), time.time() - _t0)
            except Exception as exp:  # pylint: disable=broad-except
                logger.warning("The module %s raised an exception: %s, "
                               "I'm tagging it to restart later", module.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(module)
Example #10
    def manage_brok(self, brok):
        """Get a brok.
        We put brok data to the modules

        :param brok: object with data
        :type brok: object
        :return: None
        """
        # Unserialize the brok before consuming it
        brok.prepare()

        for module in self.modules_manager.get_internal_instances():
            try:
                _t0 = time.time()
                module.manage_brok(brok)
                statsmgr.timer('manage-broks.internal.%s' % module.get_name(),
                               time.time() - _t0)
            except Exception as exp:  # pylint: disable=broad-except
                logger.warning(
                    "The module %s raised an exception: %s, "
                    "I'm tagging it to restart later", module.get_name(),
                    str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(module)
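For the loop above, an internal module only has to expose get_name() and manage_brok(); a minimal hypothetical module illustrating that contract:

    class DummyLogModule(object):
        """Hypothetical internal module: logs the type of each brok it receives."""

        def __init__(self, name='dummy-log'):
            self.name = name

        def get_name(self):
            return self.name

        def manage_brok(self, brok):
            # brok.type and brok.data are available once brok.prepare() has run
            logger.debug("Module %s received a '%s' brok",
                         self.name, getattr(brok, 'type', 'unknown'))

Real Alignak modules inherit from the module base class, but this loop only relies on these two calls.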
Example #11
    def do_loop_turn(self):
        # pylint: disable=too-many-branches
        """Loop used to:
         * get initial status broks
         * check if modules are alive, if not restart them
         * get broks from ourself, the arbiters and our satellites
         * add broks to the queue of each external module
         * manage broks with each internal module

         If the internal broks management is longer than 0.8 seconds, postpone to the next
         loop turn to avoid overloading the broker daemon.

         :return: None
        """
        if not self.got_initial_broks:
            # Asking initial broks from my schedulers
            my_satellites = self.get_links_of_type(s_type='scheduler')
            for satellite in list(my_satellites.values()):
                logger.info("Asking my initial broks from '%s'",
                            satellite.name)
                _t0 = time.time()
                try:
                    my_initial_broks = satellite.get_initial_broks(self.name)
                    statsmgr.timer('broks.initial.%s.time' % satellite.name,
                                   time.time() - _t0)
                    if not my_initial_broks:
                        logger.info("No initial broks were raised, "
                                    "my scheduler is not yet ready...")
                        return

                    self.got_initial_broks = True
                    logger.debug("Got %d initial broks from '%s'",
                                 my_initial_broks, satellite.name)
                    statsmgr.gauge('broks.initial.%s.count' % satellite.name,
                                   my_initial_broks)
                except LinkError:
                    logger.warning(
                        "Scheduler connection failed, I could not get initial broks!"
                    )

        logger.debug("Begin Loop: still some old broks to manage (%d)",
                     len(self.external_broks))
        if self.external_broks:
            statsmgr.gauge('unmanaged.broks', len(self.external_broks))

        # Try to see if one of my module is dead, and restart previously dead modules
        self.check_and_del_zombie_modules()

        # Call modules that manage a starting tick pass
        _t0 = time.time()
        self.hook_point('tick')
        statsmgr.timer('hook.tick', time.time() - _t0)

        # Maybe the last loop we did raised some broks internally
        self.get_internal_broks()

        # Also reap broks sent from the arbiters
        self.get_arbiter_broks()

        # Now get broks from our distant daemons
        self.get_new_broks()

        # Get the list of broks not yet sent to our external modules
        _t0 = time.time()
        broks_to_send = [
            brok for brok in self.external_broks
            if getattr(brok, 'to_be_sent', True)
        ]
        statsmgr.gauge('get-new-broks-count.to_send', len(broks_to_send))

        # Send the broks to all external modules to_q queue so they can get the whole packet
        # beware, the sub-process/queue can die/close, so we tag the whole module for restart
        # instead of killing ourselves :)
        for module in self.modules_manager.get_external_instances():
            try:
                _t00 = time.time()
                queue_size = module.to_q.qsize()
                statsmgr.gauge(
                    'queues.external.%s.to.size' % module.get_name(),
                    queue_size)
                module.to_q.put(broks_to_send)
                statsmgr.timer('queues.external.%s.to.put' % module.get_name(),
                               time.time() - _t00)
            except Exception as exp:  # pylint: disable=broad-except
                # first we must find the modules
                logger.warning(
                    "Module %s queue exception: %s, I'm tagging it to restart later",
                    module.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(module)

        # No more need to send them
        for brok in broks_to_send:
            brok.to_be_sent = False
        logger.debug("Time to send %s broks (%d secs)", len(broks_to_send),
                     time.time() - _t0)

        # Make the internal modules manage the broks
        start = time.time()
        while self.external_broks:
            now = time.time()
            # Do not 'manage' more than 0.8s, we must get new broks almost every second
            if now - start > 0.8:
                logger.info(
                    "I have not yet managed all my broks, still %d broks",
                    len(self.external_broks))
                break

            # Get the first brok in the list
            brok = self.external_broks.pop(0)
            if self.modules_manager.get_internal_instances():
                self.manage_brok(brok)
                # Make a very short pause to avoid overloading
                self.make_a_pause(0.01, check_time_change=False)
            else:
                if getattr(brok, 'to_be_sent', False):
                    self.external_broks.append(brok)

        # Maybe our external modules raised 'objects', so get them
        if self.get_objects_from_from_queues():
            statsmgr.gauge('external-commands.got.count',
                           len(self.external_commands))
            statsmgr.gauge('broks.got.count', len(self.external_broks))
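The 0.8 second limit above is an instance of a generic pattern: drain a work queue until a time budget expires, then yield back to the main loop so fresh broks can still be collected every second. Extracted as a standalone helper (names are illustrative):

    import time

    def drain_with_budget(items, handle, budget=0.8):
        """Handle items from the front of the list until the budget (seconds) expires.

        Unhandled items stay in the list for the next loop turn.
        Returns the number of items handled.
        """
        start = time.time()
        handled = 0
        while items and (time.time() - start) <= budget:
            handle(items.pop(0))
            handled += 1
        return handled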
Example #12
class Broker(BaseSatellite):
    """
    Class to manage a Broker daemon
    A Broker is used to get data from the schedulers and send it to modules. In most cases
    these modules export to other software, databases...
    """
    properties = BaseSatellite.properties.copy()
    properties.update({
        'daemon_type':
            StringProp(default='broker'),
        'pidfile':
            PathProp(default='brokerd.pid'),
        'port':
            IntegerProp(default=7772),
        'local_log':
            PathProp(default='brokerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file,
                 port=None, local_log=None, daemon_name=None):
        self.daemon_name = 'broker'
        if daemon_name:
            self.daemon_name = daemon_name

        super(Broker, self).__init__(self.daemon_name, config_file, is_daemon, do_replace, debug,
                                     debug_file, port, local_log)

        # Our arbiters
        self.arbiters = {}

        # Our pollers, reactionners and receivers
        self.pollers = {}
        self.reactionners = {}
        self.receivers = {}

        # Modules are loaded only once
        self.have_modules = False

        # We can have a queue of external_commands given by modules;
        # they will be processed by the arbiter
        self.external_commands = []

        # All broks to manage
        self.broks = []  # broks to manage
        # broks raised this turn and that needs to be put in self.broks
        self.broks_internal_raised = []
        # broks raised by the arbiters, we need a lock so the push can be in parallel
        # to our current activities and won't lock the arbiter
        self.arbiter_broks = []
        self.arbiter_broks_lock = threading.RLock()

        self.timeout = 1.0

        self.http_interface = BrokerInterface(self)

    def add(self, elt):  # pragma: no cover, seems not to be used
        """Add elt to this broker

        Original comment: schedulers have some queues. We can simplify the call by adding
          elements into the proper queue just by looking at their type:
          Brok -> self.broks
          External commands -> self.external_commands
          TODO: better tag ID?

        TODO: is it useful?

        :param elt: object to add
        :type elt: object
        :return: None
        """
        cls_type = elt.__class__.my_type
        if cls_type == 'brok':
            # We tag the broks with our instance_id
            elt.instance_id = self.instance_id
            self.broks_internal_raised.append(elt)
            return
        elif cls_type == 'externalcommand':
            self.external_commands.append(elt)
        # Maybe we got a Message from the modules; it is a way to ask for something,
        # for example all the data of a scheduler.
        elif cls_type == 'message':
            # We got a message, great!
            logger.debug(str(elt.__dict__))
            if elt.get_type() == 'NeedData':
                data = elt.get_data()
                # Full instance id means: I got no data for this scheduler,
                # so give me everything!
                if 'full_instance_id' in data:
                    c_id = data['full_instance_id']
                    source = elt.source
                    logger.info('The module %s is asking me to get all initial data '
                                'from the scheduler %d',
                                source, c_id)
                    # so we just reset the connection and the running_id,
                    # it will just get all new things
                    try:
                        self.schedulers[c_id]['con'] = None
                        self.schedulers[c_id]['running_id'] = 0
                    except KeyError:  # maybe this instance was not known, forget it
                        logger.warning("the module %s ask me a full_instance_id "
                                       "for an unknown ID (%d)!", source, c_id)
            # Maybe a module tells me that it's dead; I must log its last words...
            if elt.get_type() == 'ICrash':
                data = elt.get_data()
                logger.error('The module %s just crashed! Please look at the traceback:',
                             data['name'])
                logger.error(data['trace'])

                # The module death will be looked for elsewhere and restarted.

    def manage_brok(self, brok):
        """Get a brok.
        We put brok data to the modules

        :param brok: object with data
        :type brok: object
        :return: None
        """
        # Call all modules if they catch the call
        for mod in self.modules_manager.get_internal_instances():
            try:
                _t0 = time.time()
                mod.manage_brok(brok)
                statsmgr.timer('core.manage-broks.%s' % mod.get_name(), time.time() - _t0)
            except Exception as exp:  # pylint: disable=broad-except
                logger.warning("The mod %s raise an exception: %s, I'm tagging it to restart later",
                               mod.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(mod)

    def add_broks_to_queue(self, broks):
        """ Add broks to global queue

        :param broks: some items
        :type broks: object
        :return: None
        """
        # Ok now put in queue broks to be managed by
        # internal modules
        self.broks.extend(broks)

    def interger_internal_broks(self):
        """Get all broks from self.broks_internal_raised and we put them in self.broks

        :return: None
        """
        self.add_broks_to_queue(self.broks_internal_raised)
        self.broks_internal_raised = []

    def interger_arbiter_broks(self):
        """We will get in the broks list the broks from the arbiters,
        but as the arbiter_broks list can be push by arbiter without Global lock,
        we must protect this with he list lock

        :return: None
        """
        with self.arbiter_broks_lock:
            self.add_broks_to_queue(self.arbiter_broks)
            self.arbiter_broks = []

    def get_new_broks(self, s_type='scheduler'):
        """Get new broks from daemon defined in type parameter

        :param s_type: type of object
        :type s_type: str
        :return: None
        """
        # Get the right links table to loop on...
        links = self.get_links_from_type(s_type)
        if links is None:
            logger.debug('Type unknown for connection! %s', s_type)
            return

        # We check for new broks in each daemon of the given type
        # and put them in our broks queue
        for s_id in links:
            logger.debug("Getting broks from %s", links[s_id]['name'])
            link = links[s_id]
            logger.debug("Link: %s", link)
            if not link['active']:
                logger.debug("The %s '%s' is not active, "
                             "do not get broks from its connection!", s_type, link['name'])
                continue

            if link['con'] is None:
                if not self.daemon_connection_init(s_id, s_type=s_type):
                    if link['connection_attempt'] <= link['max_failed_connections']:
                        logger.warning("The connection for the %s '%s' cannot be established, "
                                       "it is not possible to get broks from this daemon.",
                                       s_type, link['name'])
                    else:
                        logger.error("The connection for the %s '%s' cannot be established, "
                                     "it is not possible to get broks from this daemon.",
                                     s_type, link['name'])
                    continue

            try:
                _t0 = time.time()
                tmp_broks = link['con'].get('get_broks', {'bname': self.name}, wait='long')
                try:
                    tmp_broks = unserialize(tmp_broks, True)
                except AlignakClassLookupException as exp:  # pragma: no cover,
                    # simple protection
                    logger.error('Cannot un-serialize data received from "get_broks" call: %s',
                                 exp)
                    continue
                if tmp_broks:
                    logger.debug("Got %d Broks from %s in %s",
                                 len(tmp_broks), link['name'], time.time() - _t0)
                statsmgr.timer('con-broks-get.%s' % (link['name']), time.time() - _t0)
                statsmgr.gauge('con-broks-count.%s' % (link['name']), len(tmp_broks.values()))
                for brok in tmp_broks.values():
                    brok.instance_id = link['instance_id']
                # Ok, we can add these broks to our queues
                _t0 = time.time()
                self.add_broks_to_queue(tmp_broks.values())
                statsmgr.timer('con-broks-add.%s' % s_type, time.time() - _t0)
            except HTTPClientConnectionException as exp:  # pragma: no cover, simple protection
                logger.warning("[%s] %s", link['name'], str(exp))
                link['con'] = None
                return
            except HTTPClientTimeoutException as exp:  # pragma: no cover, simple protection
                logger.warning("Connection timeout with the %s '%s' when getting broks: %s",
                               s_type, link['name'], str(exp))
                link['con'] = None
                return
            except HTTPClientException as exp:  # pragma: no cover, simple protection
                logger.error("Error with the %s '%s' when getting broks: %s",
                             s_type, link['name'], str(exp))
                link['con'] = None
                return
            # We do not know what happened,
            # so... bye bye :)
            except Exception as exp:  # pylint: disable=broad-except
                logger.exception(exp)
                sys.exit(1)

    def get_retention_data(self):  # pragma: no cover, useful?
        """Get all broks

        TODO: using retention in the broker is dangerous and
        does not seem of any utility with Alignak

        :return: broks container
        :rtype: object
        """
        return self.broks

    def restore_retention_data(self, data):  # pragma: no cover, useful?
        """Add data to broks container

        TODO: using retention in the broker is dangerous and
        does not seem of any utility with Alignak

        :param data: broks to add
        :type data: list
        :return: None
        """
        self.broks.extend(data)

    def do_stop(self):
        """Stop all children of this process

        :return: None
        """
        act = active_children()
        for child in act:
            child.terminate()
            child.join(1)
        super(Broker, self).do_stop()

    def setup_new_conf(self):  # pylint: disable=R0915,R0912
        """Parse new configuration and initialize all required

        :return: None
        """

        with self.conf_lock:
            self.clean_previous_run()
            conf = unserialize(self.new_conf, True)
            self.new_conf = None
            self.cur_conf = conf
            # Got our name from the globals
            g_conf = conf['global']
            if 'broker_name' in g_conf:
                name = g_conf['broker_name']
            else:
                name = 'Unnamed broker'
            self.name = name
            # Set my own process title
            self.set_proctitle(self.name)

            logger.info("[%s] Received a new configuration, containing:", self.name)
            for key in conf:
                logger.info("[%s] - %s", self.name, key)
            logger.debug("[%s] global configuration part: %s", self.name, conf['global'])

            # local statsd
            self.statsd_host = g_conf['statsd_host']
            self.statsd_port = g_conf['statsd_port']
            self.statsd_prefix = g_conf['statsd_prefix']
            self.statsd_enabled = g_conf['statsd_enabled']

            # We got a name so we can update the logger and the stats global objects
            statsmgr.register(name, 'broker',
                              statsd_host=self.statsd_host, statsd_port=self.statsd_port,
                              statsd_prefix=self.statsd_prefix, statsd_enabled=self.statsd_enabled)

            # Get our Schedulers
            for sched_id in conf['schedulers']:
                # Check if we already have it, so that we do not lose our broks

                old_sched_id = self.get_previous_sched_id(conf['schedulers'][sched_id], sched_id)

                if old_sched_id:
                    logger.info("[%s] We already got the conf %s (%s)",
                                self.name, old_sched_id, name)
                    broks = self.schedulers[old_sched_id]['broks']
                    running_id = self.schedulers[old_sched_id]['running_id']
                    del self.schedulers[old_sched_id]
                else:
                    broks = {}
                    running_id = 0
                sched = conf['schedulers'][sched_id]
                self.schedulers[sched_id] = sched

                # replacing scheduler address and port by those defined in satellitemap
                if sched['name'] in g_conf['satellitemap']:
                    sched = dict(sched)  # make a copy
                    sched.update(g_conf['satellitemap'][sched['name']])

                # todo: why not use a SatelliteLink object?
                proto = 'http'
                if sched['use_ssl']:
                    proto = 'https'
                uri = '%s://%s:%s/' % (proto, sched['address'], sched['port'])
                self.schedulers[sched_id]['uri'] = uri

                self.schedulers[sched_id]['broks'] = broks
                self.schedulers[sched_id]['instance_id'] = sched['instance_id']
                self.schedulers[sched_id]['running_id'] = running_id
                self.schedulers[sched_id]['active'] = sched['active']
                self.schedulers[sched_id]['last_connection'] = 0
                self.schedulers[sched_id]['timeout'] = sched['timeout']
                self.schedulers[sched_id]['data_timeout'] = sched['data_timeout']
                self.schedulers[sched_id]['con'] = None
                self.schedulers[sched_id]['last_connection'] = 0
                self.schedulers[sched_id]['connection_attempt'] = 0
                self.schedulers[sched_id]['max_failed_connections'] = 3

            logger.debug("We have our schedulers: %s", self.schedulers)
            logger.info("We have our schedulers:")
            for daemon in self.schedulers.values():
                logger.info(" - %s ", daemon['name'])

            # Now get arbiters
            for arb_id in conf['arbiters']:
                # Must look if we already have it
                already_got = arb_id in self.arbiters
                if already_got:
                    broks = self.arbiters[arb_id]['broks']
                else:
                    broks = {}
                arb = conf['arbiters'][arb_id]
                self.arbiters[arb_id] = arb

                # replacing arbiter address and port by those defined in satellitemap
                if arb['name'] in g_conf['satellitemap']:
                    arb = dict(arb)  # make a copy
                    arb.update(g_conf['satellitemap'][arb['name']])

                # todo: why not use a SatelliteLink object?
                proto = 'http'
                if arb['use_ssl']:
                    proto = 'https'
                uri = '%s://%s:%s/' % (proto, arb['address'], arb['port'])
                self.arbiters[arb_id]['uri'] = uri

                self.arbiters[arb_id]['broks'] = broks
                self.arbiters[arb_id]['instance_id'] = 0  # No use so all to 0
                self.arbiters[arb_id]['running_id'] = 0
                self.arbiters[arb_id]['con'] = None
                self.arbiters[arb_id]['last_connection'] = 0
                self.arbiters[arb_id]['connection_attempt'] = 0
                self.arbiters[arb_id]['max_failed_connections'] = 3

                # We do not connect to the arbiter. Connection hangs

            logger.debug("We have our arbiters: %s ", self.arbiters)
            logger.info("We have our arbiters:")
            for daemon in self.arbiters.values():
                logger.info(" - %s ", daemon['name'])

            # Now for pollers
            # 658: temporary fix
            if 'pollers' in conf:
                for pol_id in conf['pollers']:
                    # Must look if we already have it
                    already_got = pol_id in self.pollers
                    if already_got:
                        broks = self.pollers[pol_id]['broks']
                        running_id = self.pollers[pol_id]['running_id']
                    else:
                        broks = {}
                        running_id = 0
                    poll = conf['pollers'][pol_id]
                    self.pollers[pol_id] = poll

                    # replacing poller address and port by those defined in satellitemap
                    if poll['name'] in g_conf['satellitemap']:
                        poll = dict(poll)  # make a copy
                        poll.update(g_conf['satellitemap'][poll['name']])

                    # todo: why not use a SatelliteLink object?
                    proto = 'http'
                    if poll['use_ssl']:
                        proto = 'https'

                    uri = '%s://%s:%s/' % (proto, poll['address'], poll['port'])
                    self.pollers[pol_id]['uri'] = uri

                    self.pollers[pol_id]['broks'] = broks
                    self.pollers[pol_id]['instance_id'] = 0  # No use so all to 0
                    self.pollers[pol_id]['running_id'] = running_id
                    self.pollers[pol_id]['con'] = None
                    self.pollers[pol_id]['last_connection'] = 0
                    self.pollers[pol_id]['connection_attempt'] = 0
                    self.pollers[pol_id]['max_failed_connections'] = 3
            else:
                logger.warning("[%s] no pollers in the received configuration", self.name)

            logger.debug("We have our pollers: %s", self.pollers)
            logger.info("We have our pollers:")
            for daemon in self.pollers.values():
                logger.info(" - %s ", daemon['name'])

            # Now reactionners
            # 658: temporary fix
            if 'reactionners' in conf:
                for rea_id in conf['reactionners']:
                    # Must look if we already have it
                    already_got = rea_id in self.reactionners
                    if already_got:
                        broks = self.reactionners[rea_id]['broks']
                        running_id = self.reactionners[rea_id]['running_id']
                    else:
                        broks = {}
                        running_id = 0

                    reac = conf['reactionners'][rea_id]
                    self.reactionners[rea_id] = reac

                    # replacing reactionner address and port by those defined in satellitemap
                    if reac['name'] in g_conf['satellitemap']:
                        reac = dict(reac)  # make a copy
                        reac.update(g_conf['satellitemap'][reac['name']])

                    # todo: why not use a SatelliteLink object?
                    proto = 'http'
                    if reac['use_ssl']:
                        proto = 'https'
                    uri = '%s://%s:%s/' % (proto, reac['address'], reac['port'])
                    self.reactionners[rea_id]['uri'] = uri

                    self.reactionners[rea_id]['broks'] = broks
                    self.reactionners[rea_id]['instance_id'] = 0  # No use so all to 0
                    self.reactionners[rea_id]['running_id'] = running_id
                    self.reactionners[rea_id]['con'] = None
                    self.reactionners[rea_id]['last_connection'] = 0
                    self.reactionners[rea_id]['connection_attempt'] = 0
                    self.reactionners[rea_id]['max_failed_connections'] = 3
            else:
                logger.warning("[%s] no reactionners in the received configuration", self.name)

            logger.debug("We have our reactionners: %s", self.reactionners)
            logger.info("We have our reactionners:")
            for daemon in self.reactionners.values():
                logger.info(" - %s ", daemon['name'])

            # Now receivers
            # 658: temporary fix
            if 'receivers' in conf:
                for rec_id in conf['receivers']:
                    # Must look if we already have it
                    already_got = rec_id in self.receivers
                    if already_got:
                        broks = self.receivers[rec_id]['broks']
                        running_id = self.receivers[rec_id]['running_id']
                    else:
                        broks = {}
                        running_id = 0

                    rec = conf['receivers'][rec_id]
                    self.receivers[rec_id] = rec

                    # replacing receiver address and port by those defined in satellitemap
                    if rec['name'] in g_conf['satellitemap']:
                        rec = dict(rec)  # make a copy
                        rec.update(g_conf['satellitemap'][rec['name']])

                    # todo: why not use a SatelliteLink object?
                    proto = 'http'
                    if rec['use_ssl']:
                        proto = 'https'
                    uri = '%s://%s:%s/' % (proto, rec['address'], rec['port'])
                    self.receivers[rec_id]['uri'] = uri

                    self.receivers[rec_id]['broks'] = broks
                    self.receivers[rec_id]['instance_id'] = rec['instance_id']
                    self.receivers[rec_id]['running_id'] = running_id
                    self.receivers[rec_id]['con'] = None
                    self.receivers[rec_id]['last_connection'] = 0
                    self.receivers[rec_id]['connection_attempt'] = 0
                    self.receivers[rec_id]['max_failed_connections'] = 3
            else:
                logger.warning("[%s] no receivers in the received configuration", self.name)

            logger.debug("We have our receivers: %s", self.receivers)
            logger.info("We have our receivers:")
            for daemon in self.receivers.values():
                logger.info(" - %s ", daemon['name'])

            if not self.have_modules:
                self.modules = conf['global']['modules']
                self.have_modules = True

                # Ok now start, or restart them!
                # Set modules, init them and start external ones
                self.do_load_modules(self.modules)
                self.modules_manager.start_external_instances()

            # Set the timezone given by the arbiter
            use_timezone = conf['global']['use_timezone']
            if use_timezone != 'NOTSET':
                logger.info("Setting our timezone to %s", use_timezone)
                os.environ['TZ'] = use_timezone
                time.tzset()

            # Initialize connection with Schedulers, Pollers and Reactionners
            for sched_id in self.schedulers:
                self.daemon_connection_init(sched_id, s_type='scheduler')

            for pol_id in self.pollers:
                self.daemon_connection_init(pol_id, s_type='poller')

            for rea_id in self.reactionners:
                self.daemon_connection_init(rea_id, s_type='reactionner')

    def clean_previous_run(self):
        """Clean all (when we received new conf)

        :return: None
        """
        # Clean all lists
        self.schedulers.clear()
        self.pollers.clear()
        self.reactionners.clear()
        self.receivers.clear()
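        # Note: the broks and command lists below are re-bound to shallow copies ([:])
        # rather than cleared, so pending items survive the reconfiguration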
        self.broks = self.broks[:]
        self.arbiters.clear()
        self.broks_internal_raised = self.broks_internal_raised[:]
        with self.arbiter_broks_lock:
            self.arbiter_broks = self.arbiter_broks[:]
        self.external_commands = self.external_commands[:]

        # And now modules
        self.have_modules = False
        self.modules_manager.clear_instances()

    def get_stats_struct(self):
        """Get information of modules (internal and external) and add metrics of them

        :return: dictionary with state of all modules (internal and external)
        :rtype: dict
        """
        now = int(time.time())
        # call the daemon one
        res = super(Broker, self).get_stats_struct()
        res.update({'name': self.name, 'type': 'broker'})
        metrics = res['metrics']
        # metrics specific
        metrics.append('broker.%s.external-commands.queue %d %d' % (
            self.name, len(self.external_commands), now))
        metrics.append('broker.%s.broks.queue %d %d' % (self.name, len(self.broks), now))
        return res

    def do_loop_turn(self):
        """Loop use to:
         * check if modules are alive, if not restart them
         * add broks to queue of each modules

         :return: None
        """
        logger.debug("Begin Loop: managing old broks (%d)", len(self.broks))

        # Dump modules Queues size
        insts = [inst for inst in self.modules_manager.instances if inst.is_external]
        for inst in insts:
            try:
                logger.debug("External Queue len (%s): %s", inst.get_name(), inst.to_q.qsize())
            except Exception as exp:  # pylint: disable=W0703
                logger.debug("External Queue len (%s): Exception! %s", inst.get_name(), exp)

        # Begin to clean modules
        self.check_and_del_zombie_modules()

        # Now we check if we received a new configuration - no sleep time, we will sleep later...
        self.watch_for_new_conf()
        if self.new_conf:
            self.setup_new_conf()

        # Maybe the last loop we did raised some broks internally
        _t0 = time.time()
        # we should integrate them in broks
        self.interger_internal_broks()
        statsmgr.timer('get-new-broks.broker', time.time() - _t0)

        _t0 = time.time()
        # Also reap broks sent from the arbiters
        self.interger_arbiter_broks()
        statsmgr.timer('get-new-broks.arbiter', time.time() - _t0)

        # Main job, go get broks in our distant daemons
        types = ['scheduler', 'poller', 'reactionner', 'receiver']
        for _type in types:
            _t0 = time.time()
            # And from schedulers
            self.get_new_broks(s_type=_type)
            statsmgr.timer('get-new-broks.%s' % _type, time.time() - _t0)

        # Sort the brok list by id
        self.broks.sort(sort_by_ids)

        # and for external queues
        # REF: doc/broker-modules.png (3)
        # We put into the external queues the broks that were not already sent
        t00 = time.time()
        # We are sending broks as a big list, more efficient than one by one
        ext_modules = self.modules_manager.get_external_instances()
        to_send = [brok for brok in self.broks if getattr(brok, 'need_send_to_ext', True)]

        # Send our pack to all external modules to_q queue so they can get the whole packet
        # beware, the sub-process/queue can die/close, so we tag the whole module for restart
        # instead of killing ourselves :)
        for mod in ext_modules:
            try:
                t000 = time.time()
                mod.to_q.put(to_send)
                statsmgr.timer('core.put-to-external-queue.%s' % mod.get_name(), time.time() - t000)
            except Exception as exp:  # pylint: disable=broad-except
                # first we must find the modules
                logger.warning("The mod %s queue raise an exception: %s, "
                               "I'm tagging it to restart later",
                               mod.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(mod)

        # No more need to send them
        for brok in to_send:
            brok.need_send_to_ext = False
        statsmgr.timer('core.put-to-external-queue', time.time() - t00)
        logger.debug("Time to send %s broks (%d secs)", len(to_send), time.time() - t00)

        # We must add new broks at the end of the list, so we reverse the list
        self.broks.reverse()

        start = time.time()
        while self.broks:
            now = time.time()
            # Do not 'manage' more than 1s, we must get new broks
            # every 1s
            if now - start > 1:
                break

            brok = self.broks.pop()
            # Ok, we can get the brok, and doing something with it
            # REF: doc/broker-modules.png (4-5)
            # We unserialize the brok before consuming it
            brok.prepare()
            _t0 = time.time()
            self.manage_brok(brok)
            statsmgr.timer('core.manage-broks', time.time() - _t0)

            nb_broks = len(self.broks)

            # Ok, we managed the brok, but we still want to listen to the arbiter, even briefly
            self.make_a_pause(0.01, check_time_change=False)

            # if we got new broks here from arbiter, we should break the loop
            # because such broks will not be managed by the
            # external modules before this loop (we pop them!)
            if len(self.broks) != nb_broks:
                break

        # Maybe external modules raised 'objects'
        # we should get them
        self.get_objects_from_from_queues()

        # Maybe we do not have something to do, so we wait a little
        # TODO: redo the diff management...
        if not self.broks:
            while self.timeout > 0:
                begin = time.time()
                self.watch_for_new_conf(1.0)
                end = time.time()
                self.timeout = self.timeout - (end - begin)
            self.timeout = 1.0

        # Say to modules it's a new tick :)
        self.hook_point('tick')
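External modules run in sub-processes and receive whole brok lists through their to_q queue, as mod.to_q.put(to_send) above suggests. A stripped-down, runnable illustration of that producer/consumer hand-off (using multiprocessing.Queue directly; the worker body is hypothetical):

    from multiprocessing import Process, Queue

    def external_module_worker(to_q):
        # Consumer side: block on the queue and receive whole lists of broks.
        while True:
            broks = to_q.get()
            if broks is None:  # sentinel used by this sketch to stop the worker
                break
            for brok in broks:
                pass  # a real module would export each brok somewhere

    if __name__ == '__main__':
        to_q = Queue()
        worker = Process(target=external_module_worker, args=(to_q,))
        worker.start()
        to_q.put([])    # send an (empty) brok list, as the broker does
        to_q.put(None)  # stop the worker
        worker.join()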
Example #13
    def get_new_broks(self, s_type='scheduler'):
        """Get new broks from daemon defined in type parameter

        :param s_type: type of object
        :type s_type: str
        :return: None
        """
        # Get the right links table to loop on...
        links = self.get_links_from_type(s_type)
        if links is None:
            logger.debug('Type unknown for connection! %s', s_type)
            return

        # We check for new broks in each daemon of the given type
        # and put them in our broks queue
        for s_id in links:
            logger.debug("Getting broks from %s", links[s_id]['name'])
            link = links[s_id]
            logger.debug("Link: %s", link)
            if not link['active']:
                logger.debug("The %s '%s' is not active, "
                             "do not get broks from its connection!", s_type, link['name'])
                continue

            if link['con'] is None:
                if not self.daemon_connection_init(s_id, s_type=s_type):
                    if link['connection_attempt'] <= link['max_failed_connections']:
                        logger.warning("The connection for the %s '%s' cannot be established, "
                                       "it is not possible to get broks from this daemon.",
                                       s_type, link['name'])
                    else:
                        logger.error("The connection for the %s '%s' cannot be established, "
                                     "it is not possible to get broks from this daemon.",
                                     s_type, link['name'])
                    continue

            try:
                _t0 = time.time()
                tmp_broks = link['con'].get('get_broks', {'bname': self.name}, wait='long')
                try:
                    tmp_broks = unserialize(tmp_broks, True)
                except AlignakClassLookupException as exp:  # pragma: no cover,
                    # simple protection
                    logger.error('Cannot un-serialize data received from "get_broks" call: %s',
                                 exp)
                    continue
                if tmp_broks:
                    logger.debug("Got %d Broks from %s in %s",
                                 len(tmp_broks), link['name'], time.time() - _t0)
                statsmgr.timer('con-broks-get.%s' % (link['name']), time.time() - _t0)
                statsmgr.gauge('con-broks-count.%s' % (link['name']), len(tmp_broks.values()))
                for brok in tmp_broks.values():
                    brok.instance_id = link['instance_id']
                # Ok, we can add these broks to our queues
                _t0 = time.time()
                self.add_broks_to_queue(tmp_broks.values())
                statsmgr.timer('con-broks-add.%s' % s_type, time.time() - _t0)
            except HTTPClientConnectionException as exp:  # pragma: no cover, simple protection
                logger.warning("[%s] %s", link['name'], str(exp))
                link['con'] = None
                return
            except HTTPClientTimeoutException as exp:  # pragma: no cover, simple protection
                logger.warning("Connection timeout with the %s '%s' when getting broks: %s",
                               s_type, link['name'], str(exp))
                link['con'] = None
                return
            except HTTPClientException as exp:  # pragma: no cover, simple protection
                logger.error("Error with the %s '%s' when getting broks: %s",
                             s_type, link['name'], str(exp))
                link['con'] = None
                return
            # We do not know what happened,
            # so... bye bye :)
            except Exception as exp:  # pylint: disable=broad-except
                logger.exception(exp)
                sys.exit(1)
Example #14
    def do_loop_turn(self):
        # pylint: disable=too-many-branches
        """Loop used to:
         * get initial status broks
         * check if modules are alive, if not restart them
         * get broks from ourself, the arbiters and our satellites
         * add broks to the queue of each external module
         * manage broks with each internal module

         If the internal broks management is longer than 0.8 seconds, postpone to the next
         loop turn to avoid overloading the broker daemon.

         :return: None
        """
        if not self.got_initial_broks:
            # Asking initial broks from my schedulers
            my_satellites = self.get_links_of_type(s_type='scheduler')
            for satellite in list(my_satellites.values()):
                logger.info("Asking my initial broks from '%s'", satellite.name)
                _t0 = time.time()
                try:
                    my_initial_broks = satellite.get_initial_broks(self.name)
                    statsmgr.timer('broks.initial.%s.time' % satellite.name, time.time() - _t0)
                    if not my_initial_broks:
                        logger.info("No initial broks were raised, "
                                    "my scheduler is not yet ready...")
                        return

                    self.got_initial_broks = True
                    logger.debug("Got %d initial broks from '%s'",
                                 my_initial_broks, satellite.name)
                    statsmgr.gauge('broks.initial.%s.count' % satellite.name, my_initial_broks)
                except LinkError:
                    logger.warning("Scheduler connection failed, I could not get initial broks!")

        logger.debug("Begin Loop: still some old broks to manage (%d)", len(self.external_broks))
        if self.external_broks:
            statsmgr.gauge('unmanaged.broks', len(self.external_broks))

        # Try to see if one of my module is dead, and restart previously dead modules
        self.check_and_del_zombie_modules()

        # Call modules that manage a starting tick pass
        _t0 = time.time()
        self.hook_point('tick')
        statsmgr.timer('hook.tick', time.time() - _t0)

        # Maybe the last loop we did raised some broks internally
        self.get_internal_broks()

        # Also reap broks sent from the arbiters
        self.get_arbiter_broks()

        # Now get broks from our distant daemons
        self.get_new_broks()

        # Get the list of broks not yet sent to our external modules
        _t0 = time.time()
        broks_to_send = [brok for brok in self.external_broks if getattr(brok, 'to_be_sent', True)]
        statsmgr.gauge('get-new-broks-count.to_send', len(broks_to_send))

        # Send the broks to all external modules to_q queue so they can get the whole packet
        # beware, the sub-process/queue can die/close, so we tag the whole module for restart
        # instead of killing ourselves :)
        for module in self.modules_manager.get_external_instances():
            try:
                _t00 = time.time()
                queue_size = module.to_q.qsize()
                statsmgr.gauge('queues.external.%s.to.size' % module.get_name(), queue_size)
                module.to_q.put(broks_to_send)
                statsmgr.timer('queues.external.%s.to.put' % module.get_name(), time.time() - _t00)
            except Exception as exp:  # pylint: disable=broad-except
                # first we must find the modules
                logger.warning("Module %s queue exception: %s, I'm tagging it to restart later",
                               module.get_name(), str(exp))
                logger.exception(exp)
                self.modules_manager.set_to_restart(module)

        # No more need to send them
        for brok in broks_to_send:
            brok.to_be_sent = False
        logger.debug("Time to send %s broks (%d secs)", len(broks_to_send), time.time() - _t0)

        # Make the internal modules manage the broks
        start = time.time()
        while self.external_broks:
            now = time.time()
            # Do not 'manage' more than 0.8s, we must get new broks almost every second
            if now - start > 0.8:
                logger.info("I did not yet managed all my broks, still %d broks",
                            len(self.external_broks))
                break

            # Get the first brok in the list
            brok = self.external_broks.pop(0)
            if self.modules_manager.get_internal_instances():
                self.manage_brok(brok)
                # Make a very short pause to avoid overloading
                self.make_a_pause(0.01, check_time_change=False)
            else:
                if getattr(brok, 'to_be_sent', False):
                    self.external_broks.append(brok)

        # Maybe our external modules raised 'objects', so get them
        if self.get_objects_from_from_queues():
            statsmgr.gauge('external-commands.got.count', len(self.external_commands))
            statsmgr.gauge('broks.got.count', len(self.external_broks))