def _manage_brok(self, brok):
     if brok.type == 'log' and 'NOTIFICATION' in brok.data['log']:
         try:
             self.queue.put_nowait(brok)
         except Queue.Full:
              logger.warn('[Mongodb-Notification-Broker] Queue is full. '
                          'Dropping brok.')
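
# Note (descriptive only): _manage_brok keeps just the 'log' broks whose message
# contains 'NOTIFICATION' and pushes them onto self.queue with put_nowait(); a
# worker elsewhere in this module is assumed to drain the queue and perform the
# MongoDB writes, so the Broker daemon itself is never blocked on the database.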
 def _save(self, ref, ref_identity, notification):
      self._process_db_operation(self.notifications.insert, notification)
      # Guard against an unexpected 'ref' value: without this, '_id' and 'cursor'
      # would be undefined below.
      _id, cursor = None, None
     if ref == 'service':
         _id = ','.join((ref_identity.get('host'),
                         ref_identity.get('service_description')))
         cursor = self._process_db_operation(self.services.find,
                                             {'_id': _id})
     elif ref == 'host':
         _id = ref_identity.get('host')
         cursor = self._process_db_operation(self.hosts.find, {'_id': _id})
     
      # If the host or service lookup failed, 'cursor' will be None and we
      # cannot tell whether that host or service already exists. To avoid
      # corrupting data, we stop here.
     if cursor:
         if not cursor.count():
              # If the notification insert failed, '_id' will be missing from it
              # and the notification should be ignored.
             ref_identity.setdefault('notification_ids',
                                     [notification.get('_id')] if '_id' in notification else [])
             ref_identity.setdefault('_id', _id)
             
             if ref == 'service':
                 self._process_db_operation(self.services.insert, ref_identity)
             elif ref == 'host':
                 self._process_db_operation(self.hosts.insert, ref_identity)
         else:
             document = cursor[0]
             notification_ids = document.get('notification_ids')
              # If the notification insert failed, '_id' will be missing from it
              # and the notification should be ignored.
             if '_id' in notification:
                 notification_ids.append(notification.get('_id'))
                 if ref == 'service':
                     self._process_db_operation(self.services.update,
                                                {'_id': _id},
                                                {'$set': {'notification_ids': notification_ids}})
                 elif ref == 'host':
                     self._process_db_operation(self.hosts.update,
                                                {'_id': _id},
                                                {'$set': {'notification_ids': notification_ids}})    
     else:
          logger.warn('[Mongodb-Notification-Broker] Notification inserted, '
                      'but linking it to its host or service failed.')
     logger.debug('[Mongodb-Notification-Broker] Update notification ends.')    
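
 # Rough sketch of the documents _save produces (field names inferred from the code
 # above; the exact schema depends on the notification broks received):
 #   notifications collection: one document per NOTIFICATION log brok
 #   hosts collection:         {'_id': '<host>', ..., 'notification_ids': [...]}
 #   services collection:      {'_id': '<host>,<service_description>', ...,
 #                              'notification_ids': [...]}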
    def hook_save_retention(self, daemon):
        """
        Called by Scheduler to store data

        We must not do anything that will last for a long time. It will delay other operations in
        the Scheduler daemon's main event loop.

        So detach a process that will make the job ...

        If a previous process exists, kill it
        """
        retention = daemon.get_retention_data()
        if self.task and self.task.is_alive():
            logger.warn('[Mongodb-Scheduler-Retention] Previous storing job '
                        'is not yet finished! Make retention interval longer '
                        'in your shinken.cfg configuration.')
            logger.warn('[Mongodb-Scheduler-Retention] Current retention job '
                        'is postponed!')
            if self.task_count > 2:
                logger.warn('[Mongodb-Scheduler-Retention] retention job has been '
                            'postponed too many times. Killing the current task to '
                            'start a new one!')
                os.kill(self.task.pid, signal.SIGKILL)
            else:
                self.task_count += 1
                return
        self.task = None
        self.task_count = 0

        # Detach a retention job ...
        self.task = Process(target=self._hook_save_retention, args=(retention,))
        self.task.daemon = True
        self.task.start()
        logger.debug('[Mongodb-Scheduler-Retention] New update begins.')
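
    # Note: the job is detached with multiprocessing.Process and daemon=True, so the
    # retention dump runs outside the Scheduler's event loop and the child is killed
    # with SIGKILL only after the job has been postponed several times (task_count).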
 def init(self):
     logger.info('[Mongodb-Notification-Broker] Initialization of '
                 'mongodb_notification_broker module')
     self._set_mongodb_url()
     logger.debug('[Mongodb-Notification-Broker] Mongodb connect url: %s' 
                  % self.mongodb_url)
     
      # In case the notification broker process went down unexpectedly, make sure
      # any previous self.conn object is dropped cleanly in the Broker daemon.
     self.do_stop()
     try:
         if not self.high_availability:
             self.conn = MongoClient(self.mongodb_url)
         else:
             self.conn = MongoReplicaSetClient(self.mongodb_url)
     except ConnectionFailure:
          logger.warn('[Mongodb-Notification-Broker] Can not make connection '
                      'with MongoDB')
         raise
         
     except (InvalidURI, ConfigurationError):
         logger.warn('[Mongodb-Notification-Broker] Mongodb connect url '
                     'error')
         logger.warn('[Mongodb-Notification-Broker] Mongodb connect url: %s' 
                     % self.mongodb_url)
         raise 
     self._get_collections()
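
 # Note (assumption about the driver version): MongoReplicaSetClient is the pymongo 2.x
 # client for replica sets; with pymongo 3.x a plain MongoClient accepts replica-set
 # URLs, so the high_availability switch above targets the older driver.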
 def _process_db_operation(self, operation, *param):
     reconnect_start = time.time()
     result = None        
     while True:
         try:
             result = operation(*param)
         except AutoReconnect:
              logger.warn('[Mongodb-Notification-Broker] Operation failed '
                          '(AutoReconnect). Retrying for %d seconds.'
                          % (time.time() - reconnect_start))
              # Avoid issuing too many write attempts in a tight retry loop.
              time.sleep(self.retry_per_log)
         except Exception:
             logger.warn('[Mongodb-Notification-Broker] Update error. '
                         'operation %s, param %s' % (operation, param))
             logger.warn('[Mongodb-Notification-Broker] %s' % traceback.format_exc())
             break
         else:
             logger.debug('[Mongodb-Notification-Broker] Update success. '
                          'Operation %s, param %s' % (operation, param))
             break
     return result    
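
 # Usage sketch (hypothetical call, illustrating the retry contract above): only
 # AutoReconnect is retried; any other exception is logged and the call returns None.
 #     cursor = self._process_db_operation(self.hosts.find, {'_id': 'myhost'})
 #     if cursor is None:
 #         pass  # the find failed; leave the related documents untouched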
    def dispatch(self):
        # Ok, we pass at least one time in dispatch, so now errors are True errors
        self.first_dispatch_done = True

        # If there is no need to dispatch, do not dispatch :)
        if not self.dispatch_ok:
            for r in self.realms:
                conf_to_dispatch = [cfg for cfg in r.confs.values() if not cfg.is_assigned]
                nb_conf = len(conf_to_dispatch)
                if nb_conf > 0:
                    logger.info("Dispatching Realm %s", r.get_name())
                    logger.info('[%s] Dispatching %d/%d configurations',
                                r.get_name(), nb_conf, len(r.confs))

                # Now we gather in scheds all the schedulers of this realm and of the
                # upper realms, so we will send them the conf (in this order)
                scheds = self.get_scheduler_ordered_list(r)

                if nb_conf > 0:
                    print_string = '[%s] Schedulers order: %s' % (
                        r.get_name(), ','.join([s.get_name() for s in scheds]))
                    logger.info(print_string)

                # Try to send only for alive members
                scheds = [s for s in scheds if s.alive]

                # Now we do the real job
                # every_one_need_conf = False
                for conf in conf_to_dispatch:
                    logger.info('[%s] Dispatching configuration %s', r.get_name(), conf.id)

                    # If there are no alive schedulers, not good...
                    if len(scheds) == 0:
                        logger.warn('[%s] but there are no alive schedulers in this realm!',
                                    r.get_name())

                    # We need to loop until the conf is assigned
                    # or until there are no more schedulers available
                    while True:
                        try:
                            sched = scheds.pop()
                        except IndexError:  # No more schedulers.. not good, no loop
                            # need_loop = False
                            # The conf cannot be dispatched for now
                            cfg_id = conf.id
                            # We need to dispatch the conf, but there are no
                            # available alive schedulers to use; this conf
                            # will be lost and something is wrong. We must
                            # log it!
                            logger.warn('[%s] Conf %d has no more alive '
                                        'schedulers to be assigned to!',
                                        r.get_name(), conf.id)
                            for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                                r.to_satellites[kind][cfg_id] = None
                                r.to_satellites_need_dispatch[kind][cfg_id] = False
                                r.to_satellites_managed_by[kind][cfg_id] = []
                            break

                        logger.info('[%s] Trying to send conf %d to scheduler %s',
                                    r.get_name(), conf.id, sched.get_name())
                        if not sched.need_conf:
                            logger.info('[%s] The scheduler %s does not need a conf, sorry',
                                        r.get_name(), sched.get_name())
                            continue

                        # We tag conf with the instance_name = scheduler_name
                        instance_name = sched.scheduler_name
                        # We give this configuration a new 'flavor'
                        conf.push_flavor = random.randint(1, 1000000)
                        # REF: doc/shinken-conf-dispatching.png (3)
                        # REF: doc/shinken-scheduler-lost.png (2)
                        override_conf = sched.get_override_configuration()
                        satellites_for_sched = r.get_satellites_links_for_scheduler()
                        s_conf = r.serialized_confs[conf.id]
                        # Prepare the conf before sending it
                        conf_package = {
                            'conf': s_conf, 'override_conf': override_conf,
                            'modules': sched.modules, 'satellites': satellites_for_sched,
                            'instance_name': sched.scheduler_name, 'push_flavor': conf.push_flavor,
                            'skip_initial_broks': sched.skip_initial_broks,
                            'accept_passive_unknown_check_results':
                                sched.accept_passive_unknown_check_results,
                            # shinken.io part
                            'api_key': self.conf.api_key,
                            'secret': self.conf.secret,
                            'http_proxy': self.conf.http_proxy,
                            # statsd ones too, because OlivierHA loves statsd
                            # and after some years of effort he managed to make me
                            # understand the power of metrics :)
                            'statsd_host': self.conf.statsd_host,
                            'statsd_port': self.conf.statsd_port,
                            'statsd_prefix': self.conf.statsd_prefix,
                            'statsd_enabled': self.conf.statsd_enabled,
                        }

                        t1 = time.time()
                        is_sent = sched.put_conf(conf_package)
                        logger.debug("Conf is sent in %d", time.time() - t1)
                        if not is_sent:
                            logger.warning('[%s] configuration dispatching error for scheduler %s',
                                           r.get_name(), sched.get_name())
                            continue

                        logger.info('[%s] Dispatch OK of conf in scheduler %s',
                                    r.get_name(), sched.get_name())

                        sched.conf = conf
                        sched.push_flavor = conf.push_flavor
                        sched.need_conf = False
                        conf.is_assigned = True
                        conf.assigned_to = sched

                        # We update all data for this scheduler
                        sched.managed_confs = {conf.id: conf.push_flavor}

                        # Now we generate the conf for satellites:
                        cfg_id = conf.id
                        for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                            r.to_satellites[kind][cfg_id] = sched.give_satellite_cfg()
                            r.to_satellites_need_dispatch[kind][cfg_id] = True
                            r.to_satellites_managed_by[kind][cfg_id] = []

                        # Ok, the conf is dispatched, no more loop for this
                        # configuration
                        break

            # We popped the confs to dispatch, so there should be no unassigned conf left...
            conf_to_dispatch = [cfg for cfg in self.conf.confs.values() if not cfg.is_assigned]
            nb_missed = len(conf_to_dispatch)
            if nb_missed > 0:
                logger.warning("All schedulers configurations are not dispatched, %d are missing",
                               nb_missed)
            else:
                logger.info("OK, all schedulers configurations are dispatched :)")
                self.dispatch_ok = True

            # Schedulers without a conf after a successful dispatch get need_conf = False,
            # so they do not trigger useless re-dispatches
            if self.dispatch_ok:
                for sched in self.schedulers.items.values():
                    if sched.conf is None:
                        # print "Tagging sched", sched.get_name(),
                        # "so it do not ask anymore for conf"
                        sched.need_conf = False

            arbiters_cfg = {}
            for arb in self.arbiters:
                arbiters_cfg[arb.id] = arb.give_satellite_cfg()

            # We push the satellites' conf the "new" way, so they only see what we want them to
            for r in self.realms:
                for cfg in r.confs.values():
                    cfg_id = cfg.id
                    # flavor is the push number of this configuration sent to a scheduler
                    flavor = cfg.push_flavor
                    for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                        if r.to_satellites_need_dispatch[kind][cfg_id]:
                            cfg_for_satellite_part = r.to_satellites[kind][cfg_id]

                            # Make a copy of the potential satellites list so we can sort it
                            satellites = []
                            for satellite in r.get_potential_satellites_by_type(kind):
                                satellites.append(satellite)
                            satellites.sort(alive_then_spare_then_deads)

                            # Only keep alive Satellites and reachable ones
                            satellites = [s for s in satellites if s.alive and s.reachable]

                            # If we got brokers, we rotate the list so each scheduler
                            # pops a different item first, which smooths the load.
                            # But the spares must stay at the end ;)
                            # WARNING: skip this if we are in a complete-broker-links realm
                            if kind == "broker" and not r.broker_complete_links:
                                nospare = [s for s in satellites if not s.spare]
                                # Guard against an empty list (would make the modulo fail)
                                if len(nospare) != 0:
                                    idx = cfg_id % len(nospare)
                                    spares = [s for s in satellites if s.spare]
                                    # Rotate the list: start at idx and wrap around
                                    new_satellites = nospare[idx:]
                                    for _b in nospare[:idx]:
                                        if _b not in new_satellites:
                                            new_satellites.append(_b)
                                    satellites = new_satellites
                                    satellites.extend(spares)

                            # Dump the order in which we will send the conf
                            satellite_string = "[%s] Dispatching %s satellite with order: " % (
                                r.get_name(), kind)
                            for satellite in satellites:
                                satellite_string += '%s (spare:%s), ' % (
                                    satellite.get_name(), str(satellite.spare))
                            logger.info(satellite_string)

                            # Now we dispatch the cfg to everyone who asks for it
                            nb_cfg_sent = 0
                            for satellite in satellites:
                                # Send only if we need, and if we can
                                if (nb_cfg_sent < r.get_nb_of_must_have_satellites(kind) and
                                        satellite.alive):
                                    satellite.cfg['schedulers'][cfg_id] = cfg_for_satellite_part
                                    if satellite.manage_arbiters:
                                        satellite.cfg['arbiters'] = arbiters_cfg

                                    # Brokers should have poller/reactionners links too
                                    if kind == "broker":
                                        r.fill_broker_with_poller_reactionner_links(satellite)

                                    is_sent = False
                                    # Maybe this satellite already got this configuration,
                                    # so skip it
                                    if satellite.do_i_manage(cfg_id, flavor):
                                        logger.info('[%s] Skipping configuration %d send '
                                                    'to the %s %s: it already got it',
                                                    r.get_name(), cfg_id, kind,
                                                    satellite.get_name())
                                        is_sent = True
                                    else:  # ok, it really needs it :)
                                        logger.info('[%s] Trying to send configuration to %s %s',
                                                    r.get_name(), kind, satellite.get_name())
                                        is_sent = satellite.put_conf(satellite.cfg)

                                    if is_sent:
                                        satellite.active = True
                                        logger.info('[%s] Dispatch OK of configuration %s to %s %s',
                                                    r.get_name(), cfg_id, kind,
                                                    satellite.get_name())
                                        # We change the satellite configuration, update our data
                                        satellite.known_conf_managed_push(cfg_id, flavor)

                                        nb_cfg_sent += 1
                                        r.to_satellites_managed_by[kind][cfg_id].append(satellite)

                                        # If we got a broker, the conf_id must be sent to only ONE
                                        # broker in a classic realm.
                                        if kind == "broker" and not r.broker_complete_links:
                                            break

                                        # If receiver, we must send the hostnames
                                        # of this configuration
                                        if kind == 'receiver':
                                            hnames = [h.get_name() for h in cfg.hosts]
                                            logger.debug("[%s] Sending %s hostnames to the "
                                                         "receiver %s",
                                                         r.get_name(), len(hnames),
                                                         satellite.get_name())
                                            satellite.push_host_names(cfg_id, hnames)
                            # else:
                            #     # I've got enough satellites, the next ones are considered spares
                            if nb_cfg_sent == r.get_nb_of_must_have_satellites(kind):
                                logger.info("[%s] OK, no more %s sent need", r.get_name(), kind)
                                r.to_satellites_need_dispatch[kind][cfg_id] = False
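
    # Summary (descriptive only): dispatch() works in two passes per realm: first each
    # unassigned conf is pushed to one alive scheduler (tagged with a random push_flavor),
    # then the matching satellite parts are sent to reactionners, pollers, brokers and
    # receivers until get_nb_of_must_have_satellites(kind) of them have accepted it.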
    def _hook_save_retention(self, retention):
        """
        Detached retention task ...
        """
        self.set_proctitle(self.name)
        try:
            self._open()
        except Exception:
            logger.warn("[Mongodb-Scheduler-Retention] retention save error")
            return

        # Hosts / services retention
        now = time.time()
        logger.info('[Mongodb-Scheduler-Retention] update hosts/services retention starting ...')
        hosts = retention['hosts']
        services = retention['services']
        comments = []
        downtimes = []
        try:
            for host in hosts:
                _id = '%s,hostcheck' % host
                logger.info('[Mongodb-Scheduler-Retention] update host retention: %s.' % host)
                host_retention = hosts[host]
                dumped_value = pickle.dumps(host_retention, protocol=pickle.HIGHEST_PROTOCOL)
                value = base64.b64encode(dumped_value)
                self.hosts_collection.remove({'_id': _id})
                retention_data = {'_id': _id,
                                  'value': value,
                                  'timestamp': int(time.time())
                                  }
                self.hosts_collection.insert(retention_data)
                if host_retention['downtimes']:
                    for downtime in host_retention['downtimes']:
                        downtimes.append(('%s,%s' % (_id, downtime.entry_time), self._get_element(downtime, host, 'hostcheck')))
                        logger.info('[Mongodb-Scheduler-Retention]  - host downtime: %s,%s: %s' % (_id, downtime.entry_time, downtime))
                if host_retention['comments']:
                    for comment in host_retention['comments']:
                        comments.append(('%s,%s' % (_id, comment.entry_time), self._get_element(comment, host, 'hostcheck')))
                        logger.info('[Mongodb-Scheduler-Retention]  - host comment: %s,%s: %s' % (_id, comment.entry_time, comment))
            logger.info('[Mongodb-Scheduler-Retention] updated hosts retention.')

            for (host, service) in services:
                _id = '%s,%s' % (host, service)
                logger.info('[Mongodb-Scheduler-Retention] update service retention: %s.' % _id)
                service_retention = services[(host, service)]
                dumped_value = pickle.dumps(service_retention, protocol=pickle.HIGHEST_PROTOCOL)
                value = base64.b64encode(dumped_value)
                self.services_collection.remove({'_id': _id})
                retention_data = {'_id': _id,
                                  'value': value,
                                  'timestamp': int(time.time())
                                  }
                if service_retention['downtimes']:
                    for downtime in service_retention['downtimes']:
                        downtimes.append(('%s,%s' % (_id, downtime.entry_time), self._get_element(downtime, host, service)))
                        logger.info('[Mongodb-Scheduler-Retention]  - service downtime: %s,%s: %s' % (_id, downtime.entry_time, downtime))
                if service_retention['comments']:
                    for comment in service_retention['comments']:
                        comments.append(('%s,%s' % (_id, comment.entry_time), self._get_element(comment, host, service)))
                        logger.info('[Mongodb-Scheduler-Retention]  - service comment: %s,%s: %s' % (_id, comment.entry_time, comment))
                self.services_collection.insert(retention_data)
            logger.info('[Mongodb-Scheduler-Retention] updated services retention.')
        except Exception:
            logger.warn('[Mongodb-Scheduler-Retention] update hosts/services retention error: %s'
                        % traceback.format_exc())

        logger.info("[Mongodb-Scheduler-Retention] update hosts/services retention (%3.4fs)", time.time() - now)

        # Comments / downtimes retention
        now = time.time()
        logger.info('[Mongodb-Scheduler-Retention] update comments/downtimes retention starting ...')
        try:
            for _id, comment in comments:
                filter = { "host": comment['host'], "service": comment['service'], "entry_time": comment['entry_time'] }
                try:
                    result = self.comments_collection.replace_one(filter, comment, upsert=True)
                    if result.upserted_id:
                        logger.info('[Mongodb-Scheduler-Retention] comment inserted: %s / %s.' % (_id, comment))
                except Exception:
                    logger.warn('[Mongodb-Scheduler-Retention] comment update/insert error: %s'
                                % traceback.format_exc())

        except Exception:
            logger.warn('[Mongodb-Scheduler-Retention] comments update error: %s' % traceback.format_exc())

        try:
            for _id, downtime in downtimes:
                filter = { "host": downtime['host'], "service": downtime['service'], "entry_time": downtime['entry_time'] }
                try:
                    result = self.downtimes_collection.replace_one(filter, downtime, upsert=True)
                    if result.upserted_id:
                        logger.info('[Mongodb-Scheduler-Retention] downtime inserted: %s / %s.' % (_id, downtime))
                except Exception:
                    logger.warn('[Mongodb-Scheduler-Retention] downtime update/insert error: %s'
                                % traceback.format_exc())

        except Exception:
            logger.warn('[Mongodb-Scheduler-Retention] downtimes update error: %s' % traceback.format_exc())

        logger.info("[Mongodb-Scheduler-Retention] update comments/downtimes retention (%3.4fs)", time.time() - now)

        self._close()
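
    # Rough sketch of a stored retention document (as built above; the pickled value is
    # opaque and depends on the Scheduler's retention data):
    #   {'_id': '<host>,hostcheck' or '<host>,<service_description>',
    #    'value': base64(pickle.dumps(retention_entry)),
    #    'timestamp': <unix time of the dump>}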
    def hook_load_retention(self, daemon):
        """
        Called by Scheduler to restore stored retention data
        """
        logger.info('[Mongodb-Scheduler-Retention] retention load starting ...')

        # Now the old flat file way :(
        if self.path:
            logger.info("[Mongodb-Scheduler-Retention] Reading from retention_file %s" % self.path)
            try:
                with open(self.path, 'rb') as f:
                    all_data = pickle.load(f)
            except (EOFError, ValueError, IOError) as exp:
                logger.warning("[Mongodb-Scheduler-Retention] error reading retention file: %s" % str(exp))
                return False
            except (IndexError, TypeError) as exp:
                logger.warning("[Mongodb-Scheduler-Retention] Sorry, the retention file is not compatible!")
                return False

            # call the scheduler helper function for restoring values
            daemon.restore_retention_data(all_data)

            logger.info("[Mongodb-Scheduler-Retention] Retention objects loaded successfully.")
            return

        try:
            self._open()
        except Exception:
            logger.warn("[Mongodb-Scheduler-Retention] retention load error")
            return

        hosts = {}
        services = {}
        restored_hosts = {}
        restored_services = {}
        try:
            host_cursor = self.hosts_collection.find()
            service_cursor = self.services_collection.find()
            for host in host_cursor:
                value = host.get('value')
                restored_hosts[host.get('_id')] = value
            for service in service_cursor:
                value = service.get('value')
                restored_services[service.get('_id')] = value
            for host in daemon.hosts:
                key = '%s,hostcheck' % (host.host_name)
                if key in restored_hosts:
                    restored_value = restored_hosts[key]
                    value = pickle.loads(base64.b64decode(restored_value))
                    hosts[host.host_name] = value
                    logger.info('[Mongodb-Scheduler-Retention] restored host retention: %s' % (key))
                    if hosts[host.host_name]['downtimes']:
                        for downtime in hosts[host.host_name]['downtimes']:
                            logger.info('[Mongodb-Scheduler-Retention]  - host downtime: %s: %s' % (key, downtime.id))
                    if hosts[host.host_name]['comments']:
                        for comment in hosts[host.host_name]['comments']:
                            logger.info('[Mongodb-Scheduler-Retention]  - host comment: %s: %s' % (key, comment.id))
            for service in daemon.services:
                key = '%s,%s' % (service.host.host_name, service.service_description)
                if key in restored_services:
                    restored_value = restored_services[key]
                    value = pickle.loads(base64.b64decode(restored_value))
                    services[(service.host.host_name,service.service_description)] = value
                    logger.info('[Mongodb-Scheduler-Retention] restored service retention: %s.' % (key))
                    if services[(service.host.host_name,service.service_description)]['downtimes']:
                        for downtime in services[(service.host.host_name,service.service_description)]['downtimes']:
                            logger.info('[Mongodb-Scheduler-Retention]  - service downtime: %s: %s' % (key, downtime.id))
                    if services[(service.host.host_name,service.service_description)]['comments']:
                        for comment in services[(service.host.host_name,service.service_description)]['comments']:
                            logger.info('[Mongodb-Scheduler-Retention]  - service comment: %s: %s' % (key, comment.id))

            retention_data = {'hosts': hosts, 'services': services}
            daemon.restore_retention_data(retention_data)

            logger.info('[Mongodb-Scheduler-Retention] retention load ends')
        except Exception:
            logger.error('[Mongodb-Scheduler-Retention] Retention load error.')
            logger.error('[Mongodb-Scheduler-Retention] %s'
                         % traceback.format_exc())
        finally:
            self._close()
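
    # Note: loading mirrors _hook_save_retention: documents are looked up by the same
    # '<host>,hostcheck' / '<host>,<service_description>' keys and the 'value' field is
    # base64-decoded and unpickled before being handed back to the Scheduler through
    # daemon.restore_retention_data().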