def init_livestatus(self, modconf=None):
    # Test helper: build and wire up a complete LiveStatus broker module
    # (LiveStatus module config + sqlite LogStore sub-module), mirroring
    # what livestatus_broker.main/do_main do at runtime.
    # NOTE(review): order of the statements below matters — modules must be
    # loaded before init(), and the db is opened last — do not reorder.
    self.livelogs = 'tmp/livelogs.db' + self.testid
    if modconf is None:
        # Default module config; port is derived from the pid so parallel
        # test runs do not collide.
        modconf = Module({'module_name': 'LiveStatus',
            'module_type': 'livestatus',
            'port': str(50000 + os.getpid()),
            'pnp_path': 'tmp/pnp4nagios_test' + self.testid,
            'host': '127.0.0.1',
            'socket': 'live',
            'name': 'test', #?
        })

    # Sqlite log store, archived next to the live log database.
    dbmodconf = Module({'module_name': 'LogStore',
        'module_type': 'logstore_sqlite',
        'use_aggressive_sql': "0",
        'database_file': self.livelogs,
        'archive_path': os.path.join(os.path.dirname(self.livelogs), 'archives'),
    })
    modconf.modules = [dbmodconf]
    self.livestatus_broker = LiveStatus_broker(modconf)
    self.livestatus_broker.create_queues()

    #--- livestatus_broker.main
    self.livestatus_broker.log = logger
    # this seems to damage the logger so that the scheduler can't use it
    #self.livestatus_broker.log.load_obj(self.livestatus_broker)
    self.livestatus_broker.debug_output = []
    self.livestatus_broker.modules_manager = ModulesManager('livestatus', self.livestatus_broker.find_modules_path(), [])
    self.livestatus_broker.modules_manager.set_modules(self.livestatus_broker.modules)
    # We can now output some previouly silented debug ouput
    self.livestatus_broker.do_load_modules()
    # Hand the broker to the first logstore-type module instance so it can
    # attach itself; only the first matching instance is loaded.
    for inst in self.livestatus_broker.modules_manager.instances:
        if inst.properties["type"].startswith('logstore'):
            f = getattr(inst, 'load', None)
            if f and callable(f):
                f(self.livestatus_broker)  # !!! NOT self here !!!!
            break
    for s in self.livestatus_broker.debug_output:
        print "errors during load", s
    del self.livestatus_broker.debug_output
    self.livestatus_broker.rg = LiveStatusRegenerator()
    self.livestatus_broker.datamgr = datamgr
    datamgr.load(self.livestatus_broker.rg)
    # Query cache is present but disabled for the tests.
    self.livestatus_broker.query_cache = LiveStatusQueryCache()
    self.livestatus_broker.query_cache.disable()
    self.livestatus_broker.rg.register_cache(self.livestatus_broker.query_cache)
    #--- livestatus_broker.main

    self.livestatus_broker.init()
    # assumes the logstore module is instances[0] — TODO confirm
    self.livestatus_broker.db = self.livestatus_broker.modules_manager.instances[0]
    self.livestatus_broker.livestatus = LiveStatus(self.livestatus_broker.datamgr, self.livestatus_broker.query_cache, self.livestatus_broker.db, self.livestatus_broker.pnp_path, self.livestatus_broker.from_q)

    #--- livestatus_broker.do_main
    self.livestatus_broker.db.open()
def init(self): print "Initialisation of the thrift broker" # to_queue is where we get broks from Broker #self.to_q = self.properties['to_queue'] # from_quue is where we push back objects like # external commands to the broker #self.from_q = self.properties['from_queue'] # db has to be opened in the manage_brok thread self.prepare_log_db() self.prepare_pnp_path() self.thrift = Thrift_status(self.configs, self.hosts, self.services, self.contacts, self.hostgroups, self.servicegroups, self.contactgroups, self.timeperiods, self.commands, self.schedulers, self.pollers, self.reactionners, self.brokers, self.dbconn, self.pnp_path, self.from_q) m = MacroResolver() m.output_macros = ['HOSTOUTPUT', 'HOSTPERFDATA', 'HOSTACKAUTHOR', 'HOSTACKCOMMENT', 'SERVICEOUTPUT', 'SERVICEPERFDATA', 'SERVICEACKAUTHOR', 'SERVICEACKCOMMENT']
class Thrift_broker(BaseModule): def __init__(self, mod_conf, host, port, socket, allowed_hosts, database_file, max_logs_age, pnp_path, debug=None, debug_queries=False): BaseModule.__init__(self, mod_conf) self.host = host self.port = port self.socket = socket self.allowed_hosts = allowed_hosts self.database_file = database_file self.max_logs_age = max_logs_age self.pnp_path = pnp_path self.debug = debug self.debug_queries = debug_queries #Our datas self.configs = {} self.hosts = SortedDict() self.services = SortedDict() self.contacts = SortedDict() self.hostgroups = SortedDict() self.servicegroups = SortedDict() self.contactgroups = SortedDict() self.timeperiods = SortedDict() self.commands = SortedDict() #Now satellites self.schedulers = SortedDict() self.pollers = SortedDict() self.reactionners = SortedDict() self.brokers = SortedDict() self.service_id_cache = {} self.instance_ids = [] self.number_of_objects = 0 self.last_need_data_send = time.time() #Called by Broker so we can do init stuff def init(self): print "Initialisation of the thrift broker" #to_queue is where we get broks from Broker #self.to_q = self.properties['to_queue'] #from_quue is where we push back objects like #external commands to the broker #self.from_q = self.properties['from_queue'] # db has to be opened in the manage_brok thread self.prepare_log_db() self.prepare_pnp_path() self.thrift = Thrift_status(self.configs, self.hosts, self.services, self.contacts, self.hostgroups, self.servicegroups, self.contactgroups, self.timeperiods, self.commands, self.schedulers, self.pollers, self.reactionners, self.brokers, self.dbconn, self.pnp_path, self.from_q) m = MacroResolver() m.output_macros = ['HOSTOUTPUT', 'HOSTPERFDATA', 'HOSTACKAUTHOR', 'HOSTACKCOMMENT', 'SERVICEOUTPUT', 'SERVICEPERFDATA', 'SERVICEACKAUTHOR', 'SERVICEACKCOMMENT'] def manage_program_status_brok(self, b): data = b.data c_id = data['instance_id'] print "Creating config:", c_id, data c = Config() for prop in data: setattr(c, prop, 
data[prop]) #print "CFG:", c self.configs[0] = c # And we save that we got data from this instance_id self.instance_ids.append(c_id) # We should clean all previously added hosts and services inst_id = data['instance_id'] to_del = [] to_del_srv = [] for h in self.hosts.values(): # If the host was in this instance, del it if h.instance_id == inst_id: to_del.append(h.host_name) for s in self.services.values(): if s.instance_id == inst_id: to_del_srv.append(s.host_name + s.service_description) # Now clean hostgroups too for hg in self.hostgroups.values(): print "Len before exclude", len(hg.members) # Exclude from members the hosts with this inst_id hg.members = [h for h in hg.members if h.instance_id != inst_id] print "Len after", len(hg.members) # Now clean service groups for sg in self.servicegroups.values(): sg.members = [s for s in sg.members if s.instance_id != inst_id] # Ok, really clean the hosts for i in to_del: try: del self.hosts[i] except KeyError: # maybe it was not inserted in a good way, pass it pass # And services for i in to_del_srv: try: del self.services[i] except KeyError: # maybe it was not inserted in a good way, pass it pass def manage_update_program_status_brok(self, b): data = b.data c_id = data['instance_id'] if c_id not in self.instance_ids: # Do not ask data too quickly, very dangerous # one a minute if time.time() - self.last_need_data_send > 60: print "I ask the broker for instance id data :", c_id msg = Message(id=0, type='NeedData', data={'full_instance_id' : c_id}) self.from_q.put(msg) self.last_need_data_send = time.time() return # We have only one config here, with id 0 c = self.configs[0] self.update_element(c, data) def set_schedulingitem_values(self, i): i.check_period = self.get_timeperiod(i.check_period) i.notification_period = self.get_timeperiod(i.notification_period) i.contacts = self.get_contacts(i.contacts) i.rebuild_ref() #Escalations is not use for status_dat del i.escalations def manage_initial_host_status_brok(self, b): 
data = b.data host_name = data['host_name'] inst_id = data['instance_id'] #print "Creating host:", h_id, b h = Host({}) self.update_element(h, data) self.set_schedulingitem_values(h) h.service_ids = [] h.services = [] h.instance_id = inst_id # We need to rebuild Downtime and Comment relationship for dtc in h.downtimes + h.comments: dtc.ref = h self.hosts[host_name] = h self.number_of_objects += 1 #In fact, an update of a host is like a check return def manage_update_host_status_brok(self, b): self.manage_host_check_result_brok(b) data = b.data host_name = data['host_name'] #In the status, we've got duplicated item, we must relink thems try: h = self.hosts[host_name] except KeyError: print "Warning : the host %s is unknown!" % host_name return self.update_element(h, data) self.set_schedulingitem_values(h) for dtc in h.downtimes + h.comments: dtc.ref = h self.thrift.count_event('host_checks') def manage_initial_hostgroup_status_brok(self, b): data = b.data hostgroup_name = data['hostgroup_name'] members = data['members'] del data['members'] # Maybe we already got this hostgroup. 
If so, use the existing object # because in different instance, we will ahve the same group with different # elements try: hg = self.hostgroups[hostgroup_name] except KeyError: # If we got none, create a new one #print "Creating hostgroup:", hg_id, data hg = Hostgroup() # Set by default members to a void list setattr(hg, 'members', []) self.update_element(hg, data) for (h_id, h_name) in members: if h_name in self.hosts: hg.members.append(self.hosts[h_name]) # Should got uniq value, do uniq this list hg.members = list(set(hg.members)) #print "HG:", hg self.hostgroups[hostgroup_name] = hg self.number_of_objects += 1 def manage_initial_service_status_brok(self, b): data = b.data s_id = data['id'] host_name = data['host_name'] service_description = data['service_description'] inst_id = data['instance_id'] #print "Creating Service:", s_id, data s = Service({}) s.instance_id = inst_id self.update_element(s, data) self.set_schedulingitem_values(s) try: h = self.hosts[host_name] # Reconstruct the connection between hosts and services h.services.append(s) # There is already a s.host_name, but a reference to the h object can be useful too s.host = h except Exception: return for dtc in s.downtimes + s.comments: dtc.ref = s self.services[host_name+service_description] = s self.number_of_objects += 1 # We need this for manage_initial_servicegroup_status_brok where it # will speed things up dramatically self.service_id_cache[s.id] = s #In fact, an update of a service is like a check return def manage_update_service_status_brok(self, b): self.manage_service_check_result_brok(b) data = b.data host_name = data['host_name'] service_description = data['service_description'] #In the status, we've got duplicated item, we must relink thems try: s = self.services[host_name+service_description] except KeyError: print "Warning : the service %s/%s is unknown!" 
% (host_name, service_description) return self.update_element(s, data) self.set_schedulingitem_values(s) for dtc in s.downtimes + s.comments: dtc.ref = s self.thrift.count_event('service_checks') def manage_initial_servicegroup_status_brok(self, b): data = b.data sg_id = data['id'] servicegroup_name = data['servicegroup_name'] members = data['members'] del data['members'] # Like for hostgroups, maybe we already got this # service group from another instance, need to # factorize all try: sg = self.servicegroups[servicegroup_name] except KeyError: #print "Creating servicegroup:", sg_id, data sg = Servicegroup() # By default set members as a void list setattr(sg, 'members', []) self.update_element(sg, data) for (s_id, s_name) in members: # A direct lookup by s_host_name+s_name is not possible # because we don't have the host_name in members, only ids. try: sg.members.append(self.service_id_cache[s_id]) except Exception: pass sg.members = list(set(sg.members)) self.servicegroups[servicegroup_name] = sg self.number_of_objects += 1 def manage_initial_contact_status_brok(self, b): data = b.data contact_name = data['contact_name'] #print "Creating Contact:", c_id, data c = Contact({}) self.update_element(c, data) #print "C:", c self.contacts[contact_name] = c self.number_of_objects += 1 def manage_initial_contactgroup_status_brok(self, b): data = b.data contactgroup_name = data['contactgroup_name'] members = data['members'] del data['members'] #print "Creating contactgroup:", cg_id, data cg = Contactgroup() self.update_element(cg, data) setattr(cg, 'members', []) for (c_id, c_name) in members: if c_name in self.contacts: cg.members.append(self.contacts[c_name]) #print "CG:", cg self.contactgroups[contactgroup_name] = cg self.number_of_objects += 1 def manage_initial_timeperiod_status_brok(self, b): data = b.data timeperiod_name = data['timeperiod_name'] #print "Creating Timeperiod:", tp_id, data tp = Timeperiod({}) self.update_element(tp, data) #print "TP:", tp 
self.timeperiods[timeperiod_name] = tp self.number_of_objects += 1 def manage_initial_command_status_brok(self, b): data = b.data command_name = data['command_name'] #print "Creating Command:", c_id, data c = Command({}) self.update_element(c, data) #print "CMD:", c self.commands[command_name] = c self.number_of_objects += 1 def manage_initial_scheduler_status_brok(self, b): data = b.data scheduler_name = data['scheduler_name'] print "Creating Scheduler:", scheduler_name, data sched = SchedulerLink({}) print "Created a new scheduler", sched self.update_element(sched, data) print "Updated scheduler" #print "CMD:", c self.schedulers[scheduler_name] = sched print "scheduler added" #print "MONCUL: Add a new scheduler ", sched self.number_of_objects += 1 def manage_update_scheduler_status_brok(self, b): data = b.data scheduler_name = data['scheduler_name'] try: s = self.schedulers[scheduler_name] self.update_element(s, data) #print "S:", s except Exception: pass def manage_initial_poller_status_brok(self, b): data = b.data poller_name = data['poller_name'] print "Creating Poller:", poller_name, data poller = PollerLink({}) print "Created a new poller", poller self.update_element(poller, data) print "Updated poller" #print "CMD:", c self.pollers[poller_name] = poller print "poller added" #print "MONCUL: Add a new scheduler ", sched self.number_of_objects += 1 def manage_update_poller_status_brok(self, b): data = b.data poller_name = data['poller_name'] try: s = self.pollers[poller_name] self.update_element(s, data) except Exception: pass def manage_initial_reactionner_status_brok(self, b): data = b.data reactionner_name = data['reactionner_name'] print "Creating Reactionner:", reactionner_name, data reac = ReactionnerLink({}) print "Created a new reactionner", reac self.update_element(reac, data) print "Updated reactionner" #print "CMD:", c self.reactionners[reactionner_name] = reac print "reactionner added" #print "MONCUL: Add a new scheduler ", sched 
self.number_of_objects += 1 def manage_update_reactionner_status_brok(self, b): data = b.data reactionner_name = data['reactionner_name'] try: s = self.reactionners[reactionner_name] self.update_element(s, data) except Exception: pass def manage_initial_broker_status_brok(self, b): data = b.data broker_name = data['broker_name'] print "Creating Broker:", broker_name, data broker = BrokerLink({}) print "Created a new broker", broker self.update_element(broker, data) print "Updated broker" #print "CMD:", c self.brokers[broker_name] = broker print "broker added" #print "MONCUL: Add a new scheduler ", sched self.number_of_objects += 1 def manage_update_broker_status_brok(self, b): data = b.data broker_name = data['broker_name'] try: s = self.brokers[broker_name] self.update_element(s, data) except Exception: pass #A service check have just arrived, we UPDATE data info with this def manage_service_check_result_brok(self, b): data = b.data host_name = data['host_name'] service_description = data['service_description'] try: s = self.services[host_name+service_description] self.update_element(s, data) except Exception: pass #A service check update have just arrived, we UPDATE data info with this def manage_service_next_schedule_brok(self, b): self.manage_service_check_result_brok(b) def manage_host_check_result_brok(self, b): data = b.data host_name = data['host_name'] try: h = self.hosts[host_name] self.update_element(h, data) except Exception: pass # this brok should arrive within a second after the host_check_result_brok def manage_host_next_schedule_brok(self, b): self.manage_host_check_result_brok(b) #A log brok will be written into a database def manage_log_brok(self, b): data = b.data line = data['log'].encode('UTF-8').rstrip() # split line and make sql insert #print "LOG--->", line # [1278280765] SERVICE ALERT: test_host_0 # split leerzeichen if line[0] != '[' and line[11] != ']': pass print "INVALID" # invalid else: service_states = { 'OK' : 0, 'WARNING' : 1, 
'CRITICAL' : 2, 'UNKNOWN' : 3, 'RECOVERY' : 0 } host_states = { 'UP' : 0, 'DOWN' : 1, 'UNREACHABLE' : 2, 'UNKNOWN': 3, 'RECOVERY' : 0 } # 'attempt', 'class', 'command_name', 'comment', 'contact_name', 'host_name', 'lineno', 'message', # 'options', 'plugin_output', 'service_description', 'state', 'state_type', 'time', 'type', # 0:info, 1:state, 2:program, 3:notification, 4:passive, 5:command # lineno, message?, plugin_output? logobject = LOGOBJECT_INFO logclass = LOGCLASS_INVALID attempt, state = [0] * 2 command_name, comment, contact_name, host_name, message, options, plugin_output, service_description, state_type = [''] * 9 time= line[1:11] #print "i start with a timestamp", time first_type_pos = line.find(' ') + 1 last_type_pos = line.find(':') first_detail_pos = last_type_pos + 2 type = line[first_type_pos:last_type_pos] options = line[first_detail_pos:] message = line if type == 'CURRENT SERVICE STATE': logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_STATE host_name, service_description, state, state_type, attempt, plugin_output = options.split(';', 5) elif type == 'INITIAL SERVICE STATE': logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_STATE host_name, service_description, state, state_type, attempt, plugin_output = options.split(';', 5) elif type == 'SERVICE ALERT': # SERVICE ALERT: srv-40;Service-9;CRITICAL;HARD;1;[Errno 2] No such file or directory logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_ALERT host_name, service_description, state, state_type, attempt, plugin_output = options.split(';', 5) state = service_states[state] elif type == 'SERVICE DOWNTIME ALERT': logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_ALERT host_name, service_description, state_type, comment = options.split(';', 3) elif type == 'SERVICE FLAPPING ALERT': logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_ALERT host_name, service_description, state_type, comment = options.split(';', 3) elif type == 'CURRENT HOST STATE': logobject = LOGOBJECT_HOST logclass = LOGCLASS_STATE 
host_name, state, state_type, attempt, plugin_output = options.split(';', 4) elif type == 'INITIAL HOST STATE': logobject = LOGOBJECT_HOST logclass = LOGCLASS_STATE host_name, state, state_type, attempt, plugin_output = options.split(';', 4) elif type == 'HOST ALERT': logobject = LOGOBJECT_HOST logclass = LOGCLASS_ALERT host_name, state, state_type, attempt, plugin_output = options.split(';', 4) state = host_states[state] elif type == 'HOST DOWNTIME ALERT': logobject = LOGOBJECT_HOST logclass = LOGCLASS_ALERT host_name, state_type, comment = options.split(';', 2) elif type == 'HOST FLAPPING ALERT': logobject = LOGOBJECT_HOST logclass = LOGCLASS_ALERT host_name, state_type, comment = options.split(';', 2) elif type == 'SERVICE NOTIFICATION': # tust_cuntuct;test_host_0;test_ok_0;CRITICAL;notify-service;i am CRITICAL <-- normal # SERVICE NOTIFICATION: test_contact;test_host_0;test_ok_0;DOWNTIMESTART (OK);notify-service;OK logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_NOTIFICATION contact_name, host_name, service_description, state_type, command_name, check_plugin_output = options.split(';', 5) if '(' in state_type: # downtime/flapping/etc-notifications take the type UNKNOWN state_type = 'UNKNOWN' state = service_states[state_type] elif type == 'HOST NOTIFICATION': # tust_cuntuct;test_host_0;DOWN;notify-host;i am DOWN logobject = LOGOBJECT_HOST logclass = LOGCLASS_NOTIFICATION contact_name, host_name, state_type, command_name, check_plugin_output = options.split(';', 4) if '(' in state_type: state_type = 'UNKNOWN' state = host_states[state_type] elif type == 'PASSIVE SERVICE CHECK': logobject = LOGOBJECT_SERVICE logclass = LOGCLASS_PASSIVECHECK host_name, service_description, state, check_plugin_output = options.split(';', 3) elif type == 'PASSIVE HOST CHECK': logobject = LOGOBJECT_HOST logclass = LOGCLASS_PASSIVECHECK host_name, state, check_plugin_output = options.split(';', 2) elif type == 'SERVICE EVENT HANDLER': # SERVICE EVENT HANDLER: 
test_host_0;test_ok_0;CRITICAL;SOFT;1;eventhandler logobject = LOGOBJECT_SERVICE host_name, service_description, state, state_type, attempt, command_name = options.split(';', 5) state = service_states[state] elif type == 'HOST EVENT HANDLER': logobject = LOGOBJECT_HOST host_name, state, state_type, attempt, command_name = options.split(';', 4) state = host_states[state] elif type == 'EXTERNAL COMMAND': logobject = LOGOBJECT_INFO logclass = LOGCLASS_COMMAND elif type.startswith('starting...') or \ type.startswith('shutting down...') or \ type.startswith('Bailing out') or \ type.startswith('active mode...') or \ type.startswith('standby mode...'): logobject = LOGOBJECT_INFO logclass = LOGCLASS_PROGRAM else: pass #print "does not match" lineno = 0 try: values = (logobject, attempt, logclass, command_name, comment, contact_name, host_name, lineno, message, options, plugin_output, service_description, state, state_type, time, type) except: print "Unexpected error:", sys.exc_info()[0] #print "LOG:", logobject, logclass, type, host_name, service_description, state, state_type, attempt, plugin_output, contact_name, comment, command_name #print "LOG:", values try: if logclass != LOGCLASS_INVALID: if sqlite3.paramstyle == 'pyformat': values = dict(zip([str(x) for x in xrange(len(values))], values)) self.dbcursor.execute('INSERT INTO LOGS VALUES(%(0)s, %(1)s, %(2)s, %(3)s, %(4)s, %(5)s, %(6)s, %(7)s, %(8)s, %(9)s, %(10)s, %(11)s, %(12)s, %(13)s, %(14)s, %(15)s)', values) else: self.dbcursor.execute('INSERT INTO LOGS VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', values) self.dbconn.commit() except sqlite3.Error, e: print "An error occurred:", e.args[0] print "DATABASE ERROR!!!!!!!!!!!!!!!!!" self.thrift.count_event('log_message')