def _manage_brok(self, brok):
    if brok.type == 'log' and 'NOTIFICATION' in brok.data['log']:
        try:
            self.queue.put_nowait(brok)
        except Queue.Full:
            logger.warn('[Mongodb-Notification-Broker] Queue full. '
                        'Ignore broks.')
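# For orientation only: brok.data['log'] carries the raw monitoring log line, and this
# method keeps only the NOTIFICATION ones. A typical line looks roughly like the
# classic Nagios format below (the concrete values are illustrative assumptions, not
# taken from this module):
#
#   SERVICE NOTIFICATION: admin;host1;check_http;CRITICAL;notify-service-by-email;Connection refused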
def _save(self, ref, ref_identity, notification):
    self._process_db_operation(self.notifications.insert, notification)
    if ref == 'service':
        _id = ','.join((ref_identity.get('host'),
                        ref_identity.get('service_description')))
        cursor = self._process_db_operation(self.services.find, {'_id': _id})
    elif ref == 'host':
        _id = ref_identity.get('host')
        cursor = self._process_db_operation(self.hosts.find, {'_id': _id})

    # If the host or service lookup failed, 'cursor' is None and we cannot tell
    # whether the host or service document exists. To avoid corrupting data, stop here.
    if cursor:
        if not cursor.count():
            # If the notification insert failed, '_id' is missing from it and the
            # notification should be ignored.
            ref_identity.setdefault('notification_ids',
                                    [notification.get('_id')] if '_id' in notification else [])
            ref_identity.setdefault('_id', _id)
            if ref == 'service':
                self._process_db_operation(self.services.insert, ref_identity)
            elif ref == 'host':
                self._process_db_operation(self.hosts.insert, ref_identity)
        else:
            document = cursor[0]
            notification_ids = document.get('notification_ids')
            # If the notification insert failed, '_id' is missing from it and the
            # notification should be ignored.
            if '_id' in notification:
                notification_ids.append(notification.get('_id'))
                if ref == 'service':
                    self._process_db_operation(self.services.update, {'_id': _id},
                                               {'$set': {'notification_ids': notification_ids}})
                elif ref == 'host':
                    self._process_db_operation(self.hosts.update, {'_id': _id},
                                               {'$set': {'notification_ids': notification_ids}})
    else:
        logger.warn('[Mongodb-Notification-Broker] Update notification '
                    'success, link with host or service error.')
    logger.debug('[Mongodb-Notification-Broker] Update notification ends.')
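# Shape sketch of what _save() ends up writing (field values are illustrative, not
# taken from a real database; the notification body is whatever was parsed from the
# NOTIFICATION log line):
#
#   notifications: {'_id': ObjectId(...), ...parsed notification fields...}
#   services:      {'_id': 'host1,check_http',
#                   'host': 'host1', 'service_description': 'check_http',
#                   'notification_ids': [ObjectId(...), ObjectId(...)]}
#   hosts:         {'_id': 'host1', 'host': 'host1',
#                   'notification_ids': [ObjectId(...)]}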
def hook_save_retention(self, daemon):
    """
    Called by the Scheduler to store retention data.

    We must not do anything here that lasts a long time, because it would delay
    the other operations in the Scheduler daemon's main event loop. So detach a
    process that will do the job ...
    If a previous process is still running after too many postponements, kill it.
    """
    retention = daemon.get_retention_data()

    if self.task and self.task.is_alive():
        logger.warn('[Mongodb-Scheduler-Retention] Previous storing job '
                    'is not yet finished! Make the retention interval longer '
                    'in your shinken.cfg configuration.')
        logger.warn('[Mongodb-Scheduler-Retention] Current retention job '
                    'is postponed!')
        if self.task_count > 2:
            logger.warn('[Mongodb-Scheduler-Retention] Retention job has been '
                        'postponed several times. Killing the current task to start a new job!')
            os.kill(self.task.pid, signal.SIGKILL)
        else:
            self.task_count += 1
            return

    self.task = None
    self.task_count = 0

    # Detach a retention job ...
    self.task = Process(target=self._hook_save_retention, args=(retention,))
    self.task.daemon = True
    self.task.start()
    logger.debug('[Mongodb-Scheduler-Retention] New update begins.')
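# Lifecycle sketch (comment only, mirroring the counters above): the first call starts
# a detached Process. On each later call where that child is still alive, the new job
# is postponed and task_count is incremented (1, 2, 3); once task_count exceeds 2, the
# stuck child is SIGKILLed and, in the same call, a fresh Process is started with the
# latest retention data.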
def init(self):
    logger.info('[Mongodb-Notification-Broker] Initialization of '
                'mongodb_notification_broker module')
    self._set_mongodb_url()
    logger.debug('[Mongodb-Notification-Broker] Mongodb connect url: %s'
                 % self.mongodb_url)

    # In case the notification broker process goes down unexpectedly, the self.conn
    # object must be dropped cleanly in the Broker daemon.
    self.do_stop()
    try:
        if not self.high_availability:
            self.conn = MongoClient(self.mongodb_url)
        else:
            self.conn = MongoReplicaSetClient(self.mongodb_url)
    except ConnectionFailure:
        logger.warn('[Mongodb-Notification-Broker] Can not make connection '
                    'with MongoDB')
        raise
    except (InvalidURI, ConfigurationError):
        logger.warn('[Mongodb-Notification-Broker] Mongodb connect url '
                    'error')
        logger.warn('[Mongodb-Notification-Broker] Mongodb connect url: %s'
                    % self.mongodb_url)
        raise
    self._get_collections()
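# Typical connection URLs (illustrative only; the real string is built by
# _set_mongodb_url() from the module configuration):
#
#   standalone:   mongodb://localhost:27017/
#   replica set:  mongodb://db1:27017,db2:27017/?replicaSet=shinken
#
# With high_availability enabled the replica-set form is expected, since the
# connection is made through MongoReplicaSetClient.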
def _process_db_operation(self, operation, *param):
    reconnect_start = time.time()
    result = None
    while True:
        try:
            result = operation(*param)
        except AutoReconnect:
            logger.warn('[Mongodb-Notification-Broker] Update error. '
                        'Reconnecting for the last %d seconds'
                        % (time.time() - reconnect_start))
            # Avoid issuing too many write operations while the connection is down.
            time.sleep(self.retry_per_log)
        except Exception:
            logger.warn('[Mongodb-Notification-Broker] Update error. '
                        'operation %s, param %s' % (operation, param))
            logger.warn('[Mongodb-Notification-Broker] %s'
                        % traceback.format_exc())
            break
        else:
            logger.debug('[Mongodb-Notification-Broker] Update success. '
                         'Operation %s, param %s' % (operation, param))
            break
    return result
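# Usage sketch (assuming self.notifications is a pymongo 2.x collection and
# 'notification' is a dict, as in _save() above):
#
#   doc_id = self._process_db_operation(self.notifications.insert, notification)
#
# On AutoReconnect the operation is retried, sleeping self.retry_per_log between
# attempts, until the connection comes back; any other exception is logged with its
# traceback and the helper gives up, returning None.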
def dispatch(self):
    # Ok, we pass at least one time in dispatch, so now errors are True errors
    self.first_dispatch_done = True

    # If there is no need to dispatch, do not dispatch :)
    if not self.dispatch_ok:
        for r in self.realms:
            conf_to_dispatch = [cfg for cfg in r.confs.values() if not cfg.is_assigned]
            nb_conf = len(conf_to_dispatch)
            if nb_conf > 0:
                logger.info("Dispatching Realm %s", r.get_name())
                logger.info('[%s] Dispatching %d/%d configurations',
                            r.get_name(), nb_conf, len(r.confs))

            # Now we get in scheds all schedulers of this realm and upper so
            # we will send them conf (in this order)
            scheds = self.get_scheduler_ordered_list(r)

            if nb_conf > 0:
                print_string = '[%s] Schedulers order: %s' % (
                    r.get_name(), ','.join([s.get_name() for s in scheds]))
                logger.info(print_string)

            # Try to send only to alive members
            scheds = [s for s in scheds if s.alive]

            # Now we do the real job
            # every_one_need_conf = False
            for conf in conf_to_dispatch:
                logger.info('[%s] Dispatching configuration %s', r.get_name(), conf.id)

                # If there are no alive schedulers, not good...
                if len(scheds) == 0:
                    logger.warn('[%s] but there are no alive schedulers in this realm!',
                                r.get_name())

                # we need to loop until the conf is assigned
                # or until there are no more schedulers available
                while True:
                    try:
                        sched = scheds.pop()
                    except IndexError:  # No more schedulers.. not good, no loop
                        # need_loop = False
                        # The conf cannot be dispatched
                        cfg_id = conf.id
                        # we need to dispatch the conf, but there are no
                        # available alive schedulers to use, this conf
                        # will be lost and something is wrong. We must
                        # log it!
                        logger.warn('[%s] Conf %d does not have additional '
                                    'alive schedulers to assign to!',
                                    r.get_name(), conf.id)
                        for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                            r.to_satellites[kind][cfg_id] = None
                            r.to_satellites_need_dispatch[kind][cfg_id] = False
                            r.to_satellites_managed_by[kind][cfg_id] = []
                        break

                    logger.info('[%s] Trying to send conf %d to scheduler %s',
                                r.get_name(), conf.id, sched.get_name())
                    if not sched.need_conf:
                        logger.info('[%s] The scheduler %s does not need a conf, sorry',
                                    r.get_name(), sched.get_name())
                        continue

                    # We tag the conf with the instance_name = scheduler_name
                    instance_name = sched.scheduler_name
                    # We give this configuration a new 'flavor'
                    conf.push_flavor = random.randint(1, 1000000)
                    # REF: doc/shinken-conf-dispatching.png (3)
                    # REF: doc/shinken-scheduler-lost.png (2)
                    override_conf = sched.get_override_configuration()
                    satellites_for_sched = r.get_satellites_links_for_scheduler()
                    s_conf = r.serialized_confs[conf.id]
                    # Prepare the conf before sending it
                    conf_package = {
                        'conf': s_conf,
                        'override_conf': override_conf,
                        'modules': sched.modules,
                        'satellites': satellites_for_sched,
                        'instance_name': sched.scheduler_name,
                        'push_flavor': conf.push_flavor,
                        'skip_initial_broks': sched.skip_initial_broks,
                        'accept_passive_unknown_check_results':
                            sched.accept_passive_unknown_check_results,
                        # shinken.io part
                        'api_key': self.conf.api_key,
                        'secret': self.conf.secret,
                        'http_proxy': self.conf.http_proxy,
                        # statsd one too because OlivierHA loves statsd
                        # and after some years of effort he managed to make me
                        # understand the powerfulness of metrics :)
                        'statsd_host': self.conf.statsd_host,
                        'statsd_port': self.conf.statsd_port,
                        'statsd_prefix': self.conf.statsd_prefix,
                        'statsd_enabled': self.conf.statsd_enabled,
                    }

                    t1 = time.time()
                    is_sent = sched.put_conf(conf_package)
                    logger.debug("Conf is sent in %d", time.time() - t1)
                    if not is_sent:
                        logger.warning('[%s] configuration dispatching error for scheduler %s',
                                       r.get_name(), sched.get_name())
                        continue

                    logger.info('[%s] Dispatch OK of conf in scheduler %s',
                                r.get_name(), sched.get_name())

                    sched.conf = conf
                    sched.push_flavor = conf.push_flavor
                    sched.need_conf = False
                    conf.is_assigned = True
                    conf.assigned_to = sched

                    # We update all data for this scheduler
                    sched.managed_confs = {conf.id: conf.push_flavor}

                    # Now we generate the conf for satellites:
                    cfg_id = conf.id
                    for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                        r.to_satellites[kind][cfg_id] = sched.give_satellite_cfg()
                        r.to_satellites_need_dispatch[kind][cfg_id] = True
                        r.to_satellites_managed_by[kind][cfg_id] = []

                    # Ok, the conf is dispatched, no more loop for this configuration
                    break

        # We popped the confs to dispatch, so there should be none left...
        conf_to_dispatch = [cfg for cfg in self.conf.confs.values() if not cfg.is_assigned]
        nb_missed = len(conf_to_dispatch)
        if nb_missed > 0:
            logger.warning("Not all scheduler configurations were dispatched, %d are missing",
                           nb_missed)
        else:
            logger.info("OK, all schedulers configurations are dispatched :)")
            self.dispatch_ok = True

        # Schedulers without a conf after a successful dispatch are set to need_conf = False
        # so they do not trigger a new dispatch for no reason
        if self.dispatch_ok:
            for sched in self.schedulers.items.values():
                if sched.conf is None:
                    # print "Tagging sched", sched.get_name(),
                    # "so it do not ask anymore for conf"
                    sched.need_conf = False

        arbiters_cfg = {}
        for arb in self.arbiters:
            arbiters_cfg[arb.id] = arb.give_satellite_cfg()

        # We put the satellites conf with the "new" way so they see only what we want
        for r in self.realms:
            for cfg in r.confs.values():
                cfg_id = cfg.id
                # flavor is the push number of this configuration sent to a scheduler
                flavor = cfg.push_flavor
                for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                    if r.to_satellites_need_dispatch[kind][cfg_id]:
                        cfg_for_satellite_part = r.to_satellites[kind][cfg_id]

                        # make a copy of the potential satellites list for sorting
                        satellites = []
                        for satellite in r.get_potential_satellites_by_type(kind):
                            satellites.append(satellite)
                        satellites.sort(alive_then_spare_then_deads)

                        # Only keep alive and reachable satellites
                        satellites = [s for s in satellites if s.alive and s.reachable]

                        # If we got a broker, we make the list pop a new item first
                        # for each scheduler, so it will smooth the load.
                        # But the spares must stay at the end ;)
                        # WARNING: skip this if we are in a complete broker link realm
                        if kind == "broker" and not r.broker_complete_links:
                            nospare = [s for s in satellites if not s.spare]
                            # Should look over the list, not over
                            if len(nospare) != 0:
                                idx = cfg_id % len(nospare)
                                spares = [s for s in satellites if s.spare]
                                new_satellites = nospare[idx:]
                                for _b in nospare[: -idx + 1]:
                                    if _b not in new_satellites:
                                        new_satellites.append(_b)
                                satellites = new_satellites
                                satellites.extend(spares)

                        # Dump the order in which we will send the conf
                        satellite_string = "[%s] Dispatching %s satellite with order: " % (
                            r.get_name(), kind)
                        for satellite in satellites:
                            satellite_string += '%s (spare:%s), ' % (
                                satellite.get_name(), str(satellite.spare))
                        logger.info(satellite_string)

                        # Now we dispatch cfg to everyone asking for it
                        nb_cfg_sent = 0
                        for satellite in satellites:
                            # Send only if we need to, and if we can
                            if (nb_cfg_sent < r.get_nb_of_must_have_satellites(kind)
                                    and satellite.alive):
                                satellite.cfg['schedulers'][cfg_id] = cfg_for_satellite_part
                                if satellite.manage_arbiters:
                                    satellite.cfg['arbiters'] = arbiters_cfg

                                # Brokers should have poller/reactionner links too
                                if kind == "broker":
                                    r.fill_broker_with_poller_reactionner_links(satellite)

                                is_sent = False
                                # Maybe this satellite already got this configuration,
                                # so skip it
                                if satellite.do_i_manage(cfg_id, flavor):
                                    logger.info('[%s] Skipping configuration %d send '
                                                'to the %s %s: it already got it',
                                                r.get_name(), cfg_id, kind,
                                                satellite.get_name())
                                    is_sent = True
                                else:  # ok, it really needs it :)
                                    logger.info('[%s] Trying to send configuration to %s %s',
                                                r.get_name(), kind, satellite.get_name())
                                    is_sent = satellite.put_conf(satellite.cfg)

                                if is_sent:
                                    satellite.active = True
                                    logger.info('[%s] Dispatch OK of configuration %s to %s %s',
                                                r.get_name(), cfg_id, kind,
                                                satellite.get_name())
                                    # We change the satellite configuration, update our data
                                    satellite.known_conf_managed_push(cfg_id, flavor)

                                    nb_cfg_sent += 1
                                    r.to_satellites_managed_by[kind][cfg_id].append(satellite)

                                    # If we got a broker, the conf_id must be sent to only
                                    # ONE broker in a classic realm.
                                    if kind == "broker" and not r.broker_complete_links:
                                        break

                                    # If receiver, we must send the hostnames
                                    # of this configuration
                                    if kind == 'receiver':
                                        hnames = [h.get_name() for h in cfg.hosts]
                                        logger.debug("[%s] Sending %s hostnames to the "
                                                     "receiver %s",
                                                     r.get_name(), len(hnames),
                                                     satellite.get_name())
                                        satellite.push_host_names(cfg_id, hnames)
                            # else:
                            #     # I've got enough satellites, the next ones are considered spares
                        if nb_cfg_sent == r.get_nb_of_must_have_satellites(kind):
                            logger.info("[%s] OK, no more %s configurations to send",
                                        r.get_name(), kind)
                            r.to_satellites_need_dispatch[kind][cfg_id] = False
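# Flow recap for dispatch() above (comment only, no behaviour): phase one walks every
# realm and pushes each unassigned configuration to the first alive scheduler that
# still needs one, tagging it with a random push_flavor; phase two pushes the matching
# satellite configuration (reactionner/poller/broker/receiver) to the required number
# of alive, reachable satellites, skipping any that already manage this
# (cfg_id, flavor) pair.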
def _hook_save_retention(self, retention):
    """
    Detached retention task ...
    """
    self.set_proctitle(self.name)

    try:
        self._open()
    except Exception:
        logger.warn("[Mongodb-Scheduler-Retention] retention save error")
        return

    # Hosts / services retention
    now = time.time()
    logger.info('[Mongodb-Scheduler-Retention] update hosts/services retention starting ...')
    hosts = retention['hosts']
    services = retention['services']
    comments = []
    downtimes = []
    try:
        for host in hosts:
            _id = '%s,hostcheck' % host
            logger.info('[Mongodb-Scheduler-Retention] update host retention: %s.' % host)
            host_retention = hosts[host]
            dumped_value = pickle.dumps(host_retention, protocol=pickle.HIGHEST_PROTOCOL)
            value = base64.b64encode(dumped_value)
            self.hosts_collection.remove({'_id': _id})
            retention_data = {'_id': _id,
                              'value': value,
                              'timestamp': int(time.time())}
            self.hosts_collection.insert(retention_data)

            if host_retention['downtimes']:
                for downtime in host_retention['downtimes']:
                    downtimes.append(('%s,%s' % (_id, downtime.entry_time),
                                      self._get_element(downtime, host, 'hostcheck')))
                    logger.info('[Mongodb-Scheduler-Retention] - host downtime: %s,%s: %s'
                                % (_id, downtime.entry_time, downtime))
            if host_retention['comments']:
                for comment in host_retention['comments']:
                    comments.append(('%s,%s' % (_id, comment.entry_time),
                                     self._get_element(comment, host, 'hostcheck')))
                    logger.info('[Mongodb-Scheduler-Retention] - host comment: %s,%s: %s'
                                % (_id, comment.entry_time, comment))
        logger.info('[Mongodb-Scheduler-Retention] updated hosts retention.')

        for (host, service) in services:
            _id = '%s,%s' % (host, service)
            logger.info('[Mongodb-Scheduler-Retention] update service retention: %s.' % _id)
            service_retention = services[(host, service)]
            dumped_value = pickle.dumps(service_retention, protocol=pickle.HIGHEST_PROTOCOL)
            value = base64.b64encode(dumped_value)
            self.services_collection.remove({'_id': _id})
            retention_data = {'_id': _id,
                              'value': value,
                              'timestamp': int(time.time())}

            if service_retention['downtimes']:
                for downtime in service_retention['downtimes']:
                    downtimes.append(('%s,%s' % (_id, downtime.entry_time),
                                      self._get_element(downtime, host, service)))
                    logger.info('[Mongodb-Scheduler-Retention] - service downtime: %s,%s: %s'
                                % (_id, downtime.entry_time, downtime))
            if service_retention['comments']:
                for comment in service_retention['comments']:
                    comments.append(('%s,%s' % (_id, comment.entry_time),
                                     self._get_element(comment, host, service)))
                    logger.info('[Mongodb-Scheduler-Retention] - service comment: %s,%s: %s'
                                % (_id, comment.entry_time, comment))
            self.services_collection.insert(retention_data)
        logger.info('[Mongodb-Scheduler-Retention] updated services retention.')
    except Exception:
        logger.warn('[Mongodb-Scheduler-Retention] update hosts/services retention error: %s'
                    % traceback.format_exc())
    logger.info("[Mongodb-Scheduler-Retention] update hosts/services retention (%3.4fs)",
                time.time() - now)

    # Comments / downtimes retention
    now = time.time()
    logger.info('[Mongodb-Scheduler-Retention] update comments/downtimes retention starting ...')
    try:
        for _id, comment in comments:
            filter = {
                "host": comment['host'],
                "service": comment['service'],
                "entry_time": comment['entry_time']
            }
            try:
                result = self.comments_collection.replace_one(filter, comment, upsert=True)
                if result.upserted_id:
                    logger.info('[Mongodb-Scheduler-Retention] comment inserted: %s / %s.'
                                % (_id, comment))
            except Exception:
                logger.warn('[Mongodb-Scheduler-Retention] comment update/insert error: %s'
                            % traceback.format_exc())
    except Exception:
        logger.warn('[Mongodb-Scheduler-Retention] comments update error: %s'
                    % traceback.format_exc())

    try:
        for _id, downtime in downtimes:
            filter = {
                "host": downtime['host'],
                "service": downtime['service'],
                "entry_time": downtime['entry_time']
            }
            try:
                result = self.downtimes_collection.replace_one(filter, downtime, upsert=True)
                if result.upserted_id:
                    logger.info('[Mongodb-Scheduler-Retention] downtime inserted: %s / %s.'
                                % (_id, downtime))
            except Exception:
                logger.warn('[Mongodb-Scheduler-Retention] downtime update/insert error: %s'
                            % traceback.format_exc())
    except Exception:
        logger.warn('[Mongodb-Scheduler-Retention] downtimes update error: %s'
                    % traceback.format_exc())
    logger.info("[Mongodb-Scheduler-Retention] update comments/downtimes retention (%3.4fs)",
                time.time() - now)

    self._close()
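# Round-trip sketch for one stored entry (the host name 'host1' is illustrative; the
# encoding mirrors the code above, the decoding mirrors hook_load_retention below):
#
#   doc = self.hosts_collection.find_one({'_id': 'host1,hostcheck'})
#   host_retention = pickle.loads(base64.b64decode(doc['value']))
#   # host_retention is the dict produced by daemon.get_retention_data(), including
#   # its 'downtimes' and 'comments' lists.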
def hook_load_retention(self, daemon):
    """
    Called by the Scheduler to restore stored retention data.
    """
    logger.info('[Mongodb-Scheduler-Retention] retention load starting ...')

    # Now the old flat file way :(
    if self.path:
        logger.info("[Mongodb-Scheduler-Retention] Reading from retention_file %s" % self.path)
        try:
            f = open(self.path, 'rb')
            all_data = cPickle.load(f)
            f.close()
        except (EOFError, ValueError, IOError) as exp:
            logger.warning("[Mongodb-Scheduler-Retention] error reading retention file: %s"
                           % str(exp))
            return False
        except (IndexError, TypeError) as exp:
            logger.warning("[Mongodb-Scheduler-Retention] Sorry, the retention file is "
                           "not compatible!")
            return False
        # call the scheduler helper function for restoring values
        daemon.restore_retention_data(all_data)
        logger.info("[Mongodb-Scheduler-Retention] Retention objects loaded successfully.")
        return

    try:
        self._open()
    except Exception:
        logger.warn("[Mongodb-Scheduler-Retention] retention load error")
        return

    hosts = {}
    services = {}
    restored_hosts = {}
    restored_services = {}
    try:
        host_cursor = self.hosts_collection.find()
        service_cursor = self.services_collection.find()
        for host in host_cursor:
            value = host.get('value')
            restored_hosts[host.get('_id')] = value
        for service in service_cursor:
            value = service.get('value')
            restored_services[service.get('_id')] = value

        for host in daemon.hosts:
            key = '%s,hostcheck' % host.host_name
            if key in restored_hosts:
                restored_value = restored_hosts[key]
                value = pickle.loads(base64.b64decode(restored_value))
                hosts[host.host_name] = value
                logger.info('[Mongodb-Scheduler-Retention] restored host retention: %s' % key)
                if hosts[host.host_name]['downtimes']:
                    for downtime in hosts[host.host_name]['downtimes']:
                        logger.info('[Mongodb-Scheduler-Retention] - host downtime: %s: %s'
                                    % (key, downtime.id))
                if hosts[host.host_name]['comments']:
                    for comment in hosts[host.host_name]['comments']:
                        logger.info('[Mongodb-Scheduler-Retention] - host comment: %s: %s'
                                    % (key, comment.id))

        for service in daemon.services:
            key = '%s,%s' % (service.host.host_name, service.service_description)
            if key in restored_services:
                restored_value = restored_services[key]
                value = pickle.loads(base64.b64decode(restored_value))
                services[(service.host.host_name, service.service_description)] = value
                logger.info('[Mongodb-Scheduler-Retention] restored service retention: %s.' % key)
                if services[(service.host.host_name, service.service_description)]['downtimes']:
                    for downtime in services[(service.host.host_name,
                                              service.service_description)]['downtimes']:
                        logger.info('[Mongodb-Scheduler-Retention] - service downtime: %s: %s'
                                    % (key, downtime.id))
                if services[(service.host.host_name, service.service_description)]['comments']:
                    for comment in services[(service.host.host_name,
                                             service.service_description)]['comments']:
                        logger.info('[Mongodb-Scheduler-Retention] - service comment: %s: %s'
                                    % (key, comment.id))

        retention_data = {'hosts': hosts, 'services': services}
        daemon.restore_retention_data(retention_data)
        logger.info('[Mongodb-Scheduler-Retention] retention load ends')
    except Exception:
        logger.error('[Mongodb-Scheduler-Retention] Retention load error.')
        logger.error('[Mongodb-Scheduler-Retention] %s' % traceback.format_exc())
    finally:
        self._close()
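# Shape sketch of what is handed back to the Scheduler (the key names follow the code
# above; 'host1' and 'check_http' are illustrative, and the nested payload is whatever
# daemon.get_retention_data() pickled earlier):
#
#   retention_data = {
#       'hosts':    {'host1': {..., 'downtimes': [...], 'comments': [...]}},
#       'services': {('host1', 'check_http'): {..., 'downtimes': [...], 'comments': [...]}},
#   }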