def add(self, elt): cls_type = elt.__class__.my_type if cls_type == 'brok': # For brok, we TAG brok with our instance_id elt.data['instance_id'] = 0 self.broks_internal_raised.append(elt) return elif cls_type == 'externalcommand': print "Adding in queue an external command", ExternalCommand.__dict__ self.external_commands.append(elt) # Maybe we got a Message from the modules, it's way to ask something #like from now a full data from a scheduler for example. elif cls_type == 'message': # We got a message, great! print elt.__dict__ if elt.get_type() == 'NeedData': data = elt.get_data() # Full instance id mean : I got no data for this scheduler # so give me all dumbass! if 'full_instance_id' in data: c_id = data['full_instance_id'] logger.log('A module is asking me to get all initial data from the scheduler %d' % c_id) # so we just reset the connexion adn the running_id, it will just get all new things self.schedulers[c_id]['con'] = None self.schedulers[c_id]['running_id'] = 0
def load(self): now = int(time.time()) """ Try to import the requested modules ; put the imported modules in self.imported_modules. The previous imported modules, if any, are cleaned before. """ # We get all modules file with .py modules_files = [ fname[:-3] for fname in os.listdir(self.modules_path) if fname.endswith(".py") ] # And directories modules_files.extend([ fname for fname in os.listdir(self.modules_path) if os.path.isdir(os.path.join(self.modules_path, fname)) ]) # Now we try to load thems # So first we add their dir into the sys.path if not self.modules_path in sys.path: sys.path.append(self.modules_path) # We try to import them, but we keep only the one of # our type del self.imported_modules[:] for fname in modules_files: #print "Try to load", fname try: m = __import__(fname) if not hasattr(m, 'properties'): continue # We want to keep only the modules of our type if self.modules_type in m.properties['daemons']: self.imported_modules.append(m) except Exception , exp: logger.log("Warning in importing module : %s" % exp)
def no_loop_in_parents(self):
    """Check host parent/child relations for cycles.

    Builds a graph with every host as a node and one parent->child edge
    per relation, then runs the graph loop check. Every host caught in a
    cycle is logged as an error. Returns True when no loop exists.
    """
    # Build the parent graph: all hosts as nodes
    parents = Graph()
    for host in self:
        if host is not None:
            parents.add_node(host)
    # One edge per parent -> child relation
    for host in self:
        for parent in host.parents:
            if parent is not None:
                parents.add_edge(parent, host)
    # Hosts involved in at least one cycle
    looping_hosts = parents.loop_check()
    for host in looping_hosts:
        logger.log("Error: The host '%s' is part of a circular parent/child chain!" % host.get_name())
    return len(looping_hosts) == 0
def is_correct(self): state = True #guilty or not? :) cls = self.__class__ #All of the above are checks in the notificationways part for prop, entry in cls.properties.items(): if prop not in _special_properties: if not hasattr(self, prop) and entry.required: print self.get_name(), " : I do not have", prop state = False #Bad boy... #There is a case where there is no nw : when there is not special_prop defined #at all!! if self.notificationways == []: for p in _special_properties: print self.get_name(), " : I'm missing the property %s" % p state = False if hasattr(self, 'contact_name'): for c in cls.illegal_object_name_chars: if c in self.contact_name: logger.log( "%s : My contact_name got the caracter %s that is not allowed." % (self.get_name(), c)) state = False else: if hasattr(self, 'alias'): #take the alias if we miss the contact_name self.contact_name = self.alias return state
def is_correct(self): state = True #guilty or not? :) cls = self.__class__ #All of the above are checks in the notificationways part for prop, entry in cls.properties.items(): if prop not in _special_properties: if not hasattr(self, prop) and entry.required: print self.get_name(), " : I do not have", prop state = False #Bad boy... #There is a case where there is no nw : when there is not special_prop defined #at all!! if self.notificationways == []: for p in _special_properties: print self.get_name()," : I'm missing the property %s" % p state = False if hasattr(self, 'contact_name'): for c in cls.illegal_object_name_chars: if c in self.contact_name: logger.log("%s : My contact_name got the caracter %s that is not allowed." % (self.get_name(), c)) state = False else: if hasattr(self, 'alias'): #take the alias if we miss the contact_name self.contact_name = self.alias return state
def search(look_at):
    """Query shinken.io for packages matching the given keywords.

    look_at: list of keyword strings, joined with ',' for the query.
    Returns the server's result list, or [] on a non-200 application
    status. Exits the process (code 2) on an HTTP-level error.
    """
    proxy = CONFIG['shinken.io']['proxy']
    # (removed an unused "api_key" local: this GET endpoint does not use it)
    # Query with a 10s timeout
    c = pycurl.Curl()
    c.setopt(c.POST, 0)
    c.setopt(c.CONNECTTIMEOUT, 10)
    c.setopt(c.TIMEOUT, 10)
    if proxy:
        c.setopt(c.PROXY, proxy)
    args = {'keywords': ','.join(look_at)}
    c.setopt(c.URL, str('shinken.io/searchcli?' + urllib.urlencode(args)))
    response = StringIO()
    c.setopt(pycurl.WRITEFUNCTION, response.write)
    c.perform()
    r = c.getinfo(pycurl.HTTP_CODE)
    c.close()
    if r != 200:
        logger.error("There was a critical error : %s" % response.getvalue())
        sys.exit(2)
    # The server escapes slashes; undo that before parsing
    ret = json.loads(response.getvalue().replace('\\/', '/'))
    status = ret.get('status')
    result = ret.get('result')
    if status != 200:
        logger.log(result)
        return []
    return result
def publish_archive(archive):
    """Upload a package archive to shinken.io (10s-timeout POST).

    Exits the process (code 2) on an HTTP-level failure; otherwise logs
    the server's text answer as info (status 200) or as an error.
    """
    proxy = CONFIG['shinken.io']['proxy']
    api_key = CONFIG['shinken.io']['api_key']
    # Push the file with a 10s timeout
    curl = pycurl.Curl()
    curl.setopt(curl.POST, 1)
    curl.setopt(curl.CONNECTTIMEOUT, 10)
    curl.setopt(curl.TIMEOUT, 10)
    if proxy:
        curl.setopt(curl.PROXY, proxy)
    curl.setopt(curl.URL, "http://shinken.io/push")
    payload = [("api_key", api_key),
               ("data", (curl.FORM_FILE, str(archive), curl.FORM_CONTENTTYPE, "application/x-gzip"))]
    curl.setopt(curl.HTTPPOST, payload)
    response = StringIO()
    curl.setopt(pycurl.WRITEFUNCTION, response.write)
    curl.setopt(curl.VERBOSE, 1)
    curl.perform()
    http_code = curl.getinfo(pycurl.HTTP_CODE)
    curl.close()
    if http_code != 200:
        logger.error("There was a critical error : %s" % response.getvalue())
        sys.exit(2)
    # The server escapes slashes; undo that before parsing
    answer = json.loads(response.getvalue().replace('\\/', '/'))
    if answer.get('status') == 200:
        logger.log(answer.get('text'))
    else:
        logger.error(answer.get('text'))
def get_scheduler_ordered_list(self, r):
    """Return the schedulers of realm r in pop-order for dispatching.

    The realm's own schedulers come first, then the spare schedulers of
    the higher realms. The list is sorted alive/master first and then
    REVERSED, because the caller pops from the end of the list.
    """
    # get scheds, alive and no spare first
    scheds = []
    for s in r.schedulers:
        scheds.append(s)
    # now the spare scheds of higher realms
    # they are after the sched of realm, so
    # they will be used after the spare of
    # the realm
    for higher_r in r.higher_realms:
        for s in higher_r.schedulers:
            if s.spare:
                scheds.append(s)
    # Now we sort the scheds so we take master, then spare
    # then the dead, but we do not care about them
    scheds.sort(alive_then_spare_then_deads)
    scheds.reverse()  # pop is last, I need first
    # DBG: dump the computed order (in the original, readable order)
    print_sched = [s.get_name() for s in scheds]
    print_sched.reverse()
    print_string = '[%s] Schedulers order : ' % r.get_name()
    for s in print_sched:
        print_string += '%s ' % s
    logger.log(print_string)
    # END DBG
    return scheds
def add(self, elt):
    """Queue an incoming object (brok, external command or module message).

    Dispatch is done on the object's class ``my_type``:
    - "brok": tagged with our instance_id (0) and queued internally
    - "externalcommand": queued in self.external_commands
    - "message": a module request; a 'NeedData' message carrying a
      'full_instance_id' resets that scheduler connection so all its
      initial data gets re-fetched
    """
    cls_type = elt.__class__.my_type
    if cls_type == "brok":
        # For brok, we TAG brok with our instance_id
        elt.data["instance_id"] = 0
        self.broks_internal_raised.append(elt)
        return
    elif cls_type == "externalcommand":
        print "Adding in queue an external command", ExternalCommand.__dict__
        self.external_commands.append(elt)
    # Maybe we got a Message from the modules, it's way to ask something
    # like from now a full data from a scheduler for example.
    elif cls_type == "message":
        # We got a message, great!
        print elt.__dict__
        if elt.get_type() == "NeedData":
            data = elt.get_data()
            # Full instance id mean : I got no data for this scheduler
            # so give me all dumbass!
            if "full_instance_id" in data:
                c_id = data["full_instance_id"]
                logger.log("A module is asking me to get all initial data from the scheduler %d" % c_id)
                # so we just reset the connection and the running_id, it will just get all new things
                try:
                    self.schedulers[c_id]["con"] = None
                    self.schedulers[c_id]["running_id"] = 0
                except KeyError:
                    # maybe this instance was not known, forget it
                    print "WARNING: a module ask me a full_instance_id for an unknown ID!", c_id
def init(self):
    """Open the NDO MySQL connection, detect the centreon flavour and
    initialize the module caches."""
    logger.log("I connect to NDO database")
    self.db = DBMysql(self.host, self.user, self.password, self.database, self.character_set, table_prefix='nagios_', port=self.port)
    self.connect_database()
    # Cache for hosts and services
    # The structure is as follow:
    # First the instance id then the host / (host,service desc) to access the wanted data
    self.services_cache_sync = {}
    self.hosts_cache_sync = {}
    # We need to search for centreon_specific fields, like long_output
    query = u"select TABLE_NAME from information_schema.columns where TABLE_SCHEMA='ndo' and TABLE_NAME='nagios_servicestatus' and COLUMN_NAME='long_output';"
    self.db.execute_query(query)
    row = self.db.fetchone()
    # The long_output column only exists on the centreon flavour of NDO
    if row is None or len(row) < 1:
        self.centreon_version = False
    else:
        self.centreon_version = True
        logger.log("[MySQL/NDO] Using the centreon version")
    # Cache for database id
    # In order not to query the database every time
    self.database_id_cache = {}
    # Mapping service_id in Shinken and in database
    # Because can't acces host_name from a service everytime :(
    self.mapping_service_id = {}
    # Todo list to manage brok
    self.todo = []
def get_new_broks(self, type='scheduler'): # Get the good links tab for looping.. links = self.get_links_from_type(type) if links is None: logger.log('DBG: Type unknown for connection! %s' % type) return # We check for new check in each schedulers and put # the result in new_checks for sched_id in links: try: con = links[sched_id]['con'] if con is not None: # None = not initilized tmp_broks = con.get_broks() for b in tmp_broks.values(): b.instance_id = links[sched_id]['instance_id'] # Ok, we can add theses broks to our queues self.add_broks_to_queue(tmp_broks.values()) else: # no con? make the connection self.pynag_con_init(sched_id, type=type) # Ok, con is not known, so we create it except KeyError , exp: print exp self.pynag_con_init(sched_id, type=type) except Pyro.errors.ProtocolError , exp: logger.log("[%s] Connection problem to the %s %s : %s" % (self.name, type, links[sched_id]['name'], str(exp))) links[sched_id]['con'] = None
def search(look_at):
    """Search shinken.io packages by keywords; return the result list.

    Exits the process (code 2) on an HTTP-level error; returns [] when
    the application status is not 200.
    """
    proxy = CONFIG['shinken.io']['proxy']
    api_key = CONFIG['shinken.io']['api_key']
    # GET request with a 10s timeout
    curl = pycurl.Curl()
    curl.setopt(curl.POST, 0)
    curl.setopt(curl.CONNECTTIMEOUT, 10)
    curl.setopt(curl.TIMEOUT, 10)
    if proxy:
        curl.setopt(curl.PROXY, proxy)
    query = urllib.urlencode({'keywords': ','.join(look_at)})
    curl.setopt(curl.URL, str('shinken.io/searchcli?' + query))
    response = StringIO()
    curl.setopt(pycurl.WRITEFUNCTION, response.write)
    curl.perform()
    http_code = curl.getinfo(pycurl.HTTP_CODE)
    curl.close()
    if http_code != 200:
        logger.error("There was a critical error : %s" % response.getvalue())
        sys.exit(2)
    # The server escapes slashes; undo that before parsing
    payload = json.loads(response.getvalue().replace('\\/', '/'))
    if payload.get('status') != 200:
        logger.log(payload.get('result'))
        return []
    return payload.get('result')
def hook_save_retention(self, daemon):
    """Dump the daemon's retention data to self.path.

    Writes a pickle to "<path>.tmp" first, then moves it onto the real
    path, so a failure never destroys the previous retention file.
    """
    log_mgr = logger
    logger.log("[PickleRetentionGeneric] asking me to update the retention objects")
    # Now the flat file method
    try:
        # Open a file near the path, with .tmp extension
        # so in case of problem, we do not lose the old one
        f = open(self.path + '.tmp', 'wb')
        # We get interesting retention data from the daemon itself
        all_data = daemon.get_retention_data()
        # And we save it on file :)
        #s = cPickle.dumps(all_data)
        #s_compress = zlib.compress(s)
        cPickle.dump(all_data, f, protocol=cPickle.HIGHEST_PROTOCOL)
        #f.write(s_compress)
        f.close()
        # Now move the .tmp file to the real path
        shutil.move(self.path + '.tmp', self.path)
    except IOError, exp:
        log_mgr.log("Error: retention file creation failed, %s" % str(exp))
        return
def try_instance_init(self, inst): """ Try to "init" the given module instance. Returns: True on successfull init. False if instance init method raised any Exception. """ try: print "Trying to init module", inst.get_name() inst.init_try += 1 # Maybe it's a retry if inst.init_try > 1: # Do not try until 5 sec, or it's too loopy if inst.last_init_try > time.time() - 5: return False inst.last_init_try = time.time() # If it's an external, create/update Queues() if inst.is_external: inst.create_queues() inst.init() except Exception, e: logger.log( "Error : the instance %s raised an exception %s, I remove it!" % (inst.get_name(), str(e))) output = cStringIO.StringIO() traceback.print_exc(file=output) logger.log("Back trace of this remove : %s" % (output.getvalue())) output.close() return False
def hook_save_retention(self, daemon): log_mgr = logger logger.log("[PickleRetentionGeneric] asking me to update the retention objects") #Now the flat file method try: # Open a file near the path, with .tmp extension # so in cae or problem, we do not lost the old one f = open(self.path+'.tmp', 'wb') # We get interesting retention data from the daemon it self all_data = daemon.get_retention_data() # And we save it on file :) #s = cPickle.dumps(all_data) #s_compress = zlib.compress(s) cPickle.dump(all_data, f, protocol=cPickle.HIGHEST_PROTOCOL) #f.write(s_compress) f.close() # Now move the .tmp fiel to the real path shutil.move(self.path+'.tmp', self.path) except IOError , exp: log_mgr.log("Error: retention file creation failed, %s" % str(exp)) return
def try_instance_init(self, inst): """ Try to "init" the given module instance. Returns: True on successfull init. False if instance init method raised any Exception. """ try: print "Trying to init module", inst.get_name() inst.init_try += 1 # Maybe it's a retry if inst.init_try > 1: # Do not try until 5 sec, or it's too loopy if inst.last_init_try > time.time() - 5: return False inst.last_init_try = time.time() # If it's an external, create/update Queues() if inst.is_external: inst.create_queues() inst.init() except Exception, e: logger.log("Error : the instance %s raised an exception %s, I remove it!" % (inst.get_name(), str(e))) output = cStringIO.StringIO() traceback.print_exc(file=output) logger.log("Back trace of this remove : %s" % (output.getvalue())) output.close() return False
def add(self, elt): cls_type = elt.__class__.my_type if cls_type == 'brok': # For brok, we TAG brok with our instance_id elt.data['instance_id'] = 0 self.broks_internal_raised.append(elt) return elif cls_type == 'externalcommand': print "Adding in queue an external command", ExternalCommand.__dict__ self.external_commands.append(elt) # Maybe we got a Message from the modules, it's way to ask something #like from now a full data from a scheduler for example. elif cls_type == 'message': # We got a message, great! print elt.__dict__ if elt.get_type() == 'NeedData': data = elt.get_data() # Full instance id mean : I got no data for this scheduler # so give me all dumbass! if 'full_instance_id' in data: c_id = data['full_instance_id'] logger.log( 'A module is asking me to get all initial data from the scheduler %d' % c_id) # so we just reset the connexion adn the running_id, it will just get all new things self.schedulers[c_id]['con'] = None self.schedulers[c_id]['running_id'] = 0
def compensate_system_time_change(self, difference):
    """ Compensate a system time change of difference for all hosts/services/checks/notifs """
    logger.log('Warning: A system time change of %d has been detected. Compensating...' % difference)
    # We only need to change some value
    self.program_start = max(0, self.program_start + difference)
    # Then we compasate all host/services
    for h in self.sched.hosts:
        h.compensate_system_time_change(difference)
    for s in self.sched.services:
        s.compensate_system_time_change(difference)
    # Now all checks and actions
    for c in self.sched.checks.values():
        # Already launched checks should not be touched
        if c.status == 'scheduled':
            t_to_go = c.t_to_go
            ref = c.ref
            new_t = max(0, t_to_go + difference)
            # But it's no so simple, we must match the timeperiod
            new_t = ref.check_period.get_next_valid_time_from_t(new_t)
            # But maybe there is no more new value! Not good :(
            # Say as error, with error output
            if new_t is None:
                c.state = 'waitconsume'
                c.exit_status = 2
                c.output = '(Error: there is no available check time after time change!)'
                c.check_time = time.time()
                c.execution_time = 0
            else:
                c.t_to_go = new_t
                ref.next_chk = new_t
    # Now all actions (notifications and event handlers)
    for c in self.sched.actions.values():
        # Already launched actions should not be touched
        if c.status == 'scheduled':
            t_to_go = c.t_to_go
            # Event handler do not have ref
            ref = getattr(c, 'ref', None)
            new_t = max(0, t_to_go + difference)
            # Notification should be check with notification_period
            if c.is_a == 'notification':
                # But it's no so simple, we must match the timeperiod
                new_t = ref.notification_period.get_next_valid_time_from_t(new_t)
                # And got a creation_time variable too
                c.creation_time = c.creation_time + difference
            # But maybe there is no more new value! Not good :(
            # Say as error, with error output
            if new_t is None:
                c.state = 'waitconsume'
                c.exit_status = 2
                c.output = '(Error: there is no available check time after time change!)'
                c.check_time = time.time()
                c.execution_time = 0
            else:
                c.t_to_go = new_t
def setup_new_conf(self): conf = self.new_conf self.new_conf = None self.cur_conf = conf # Got our name from the globals if 'receiver_name' in conf['global']: name = conf['global']['receiver_name'] else: name = 'Unnamed receiver' self.name = name self.log.load_obj(self, name) print "[%s] Sending us configuration %s" % (self.name, conf) if not self.have_modules: self.modules = mods = conf['global']['modules'] self.have_modules = True logger.log("[%s] We received modules %s " % (self.name, mods)) # Set our giving timezone from arbiter use_timezone = conf['global']['use_timezone'] if use_timezone != 'NOTSET': logger.log("[%s] Setting our timezone to" % (self.name, use_timezone)) os.environ['TZ'] = use_timezone time.tzset()
def get_new_broks(self, type='scheduler'):
    """Fetch pending broks from every satellite of the given type.

    For each link with a live connection: get the broks, tag them with
    the link's instance_id and queue them. Missing or broken connections
    are (re)created for a later turn.
    """
    # Get the good links tab for looping..
    links = self.get_links_from_type(type)
    if links is None:
        logger.log('DBG: Type unknown for connexion! %s' % type)
        return
    # We check for new check in each schedulers and put
    # the result in new_checks
    for sched_id in links:
        try:
            con = links[sched_id]['con']
            if con is not None:  # None = not initialized
                tmp_broks = con.get_broks()
                for b in tmp_broks.values():
                    b.instance_id = links[sched_id]['instance_id']
                # Ok, we can add these broks to our queues
                self.add_broks_to_queue(tmp_broks.values())
            else:  # no con? make the connexion
                self.pynag_con_init(sched_id, type=type)
        # Ok, con is not known, so we create it
        except KeyError, exp:
            print exp
            self.pynag_con_init(sched_id, type=type)
        except Pyro.errors.ProtocolError, exp:
            # Connection died mid-call: log and drop it for a retry later
            logger.log("[%s] Connexion problem to the %s %s : %s" % (self.name, type, links[sched_id]['name'], str(exp)))
            links[sched_id]['con'] = None
def is_me(self):
    """Return True when this arbiter definition matches the local host
    (by FQDN or by short hostname)."""
    fqdn = socket.getfqdn()
    logger.log("And arbiter is launched with the hostname:%s from an arbiter point of view of addr :%s" % (self.host_name, fqdn), print_it=False)
    return self.host_name in (fqdn, socket.gethostname())
def publish_archive(archive):
    """Upload a package archive to shinken.io.

    POSTs the file (10s timeout) with the configured API key. Exits the
    process (code 2) on an HTTP-level failure; otherwise logs the
    server's text answer as info (status 200) or as an error.
    """
    # Now really publish it
    proxy = CONFIG['shinken.io']['proxy']
    api_key = CONFIG['shinken.io']['api_key']
    # Ok we will push the file with a 10s timeout
    c = pycurl.Curl()
    c.setopt(c.POST, 1)
    c.setopt(c.CONNECTTIMEOUT, 10)
    c.setopt(c.TIMEOUT, 10)
    if proxy:
        c.setopt(c.PROXY, proxy)
    c.setopt(c.URL, "http://shinken.io/push")
    c.setopt(c.HTTPPOST, [("api_key", api_key),
                          ("data", (c.FORM_FILE, str(archive), c.FORM_CONTENTTYPE, "application/x-gzip"))])
    response = StringIO()
    c.setopt(pycurl.WRITEFUNCTION, response.write)
    c.setopt(c.VERBOSE, 1)
    c.perform()
    r = c.getinfo(pycurl.HTTP_CODE)
    c.close()
    if r != 200:
        logger.error("There was a critical error : %s" % response.getvalue())
        sys.exit(2)
    else:
        # The server escapes slashes; undo that before parsing
        ret = json.loads(response.getvalue().replace('\\/', '/'))
        status = ret.get('status')
        text = ret.get('text')
        if status == 200:
            logger.log(text)
        else:
            logger.error(text)
def main(self):
    """External-module main loop: run do_loop_turn() until interrupted,
    then stop cleanly."""
    self.set_signal_handler()
    logger.log("[%s[%d]]: Now running.." % (self.name, os.getpid()))
    # Spin until a signal handler flips self.interrupted
    while True:
        if self.interrupted:
            break
        self.do_loop_turn()
    self.do_stop()
    logger.log("[%s]: exiting now.." % self.name)
def main(self):
    """Broker daemon entry point.

    Loads the config file, daemonizes, registers the Pyro interface,
    waits for the initial configuration from the arbiter, starts the
    modules (including retention loading) and enters the main loop.
    """
    self.load_config_file()
    for line in self.get_header():
        self.log.log(line)
    logger.log("[Broker] Using working directory : %s" % os.path.abspath(self.workdir))
    self.do_daemon_init_and_start()
    # Interface the arbiter will talk to
    self.uri2 = self.pyro_daemon.register(self.interface, "ForArbiter")
    print "The Arbtier uri it at", self.uri2
    # We wait for initial conf
    self.wait_for_initial_conf()
    if not self.new_conf:
        return
    self.setup_new_conf()
    # Set modules, init them and start external ones
    self.modules_manager.set_modules(self.modules)
    self.do_load_modules()
    self.modules_manager.start_external_instances()
    # Do the modules part, we have our modules in self.modules
    # REF: doc/broker-modules.png (1)
    self.hook_point('load_retention')
    # Now the main loop
    self.do_mainloop()
def main(self):
    """Receiver daemon entry point.

    Loads the config file, daemonizes, registers the Pyro interface,
    waits for the initial configuration from the arbiter, starts the
    modules and enters the main loop.
    """
    self.load_config_file()
    for line in self.get_header():
        self.log.log(line)
    logger.log("[Receiver] Using working directory : %s" % os.path.abspath(self.workdir))
    self.do_daemon_init_and_start()
    # Interface the arbiter will talk to
    self.uri2 = self.pyro_daemon.register(self.interface, "ForArbiter")
    print "The Arbtier uri it at", self.uri2
    # We wait for initial conf
    self.wait_for_initial_conf()
    if not self.new_conf:
        return
    self.setup_new_conf()
    self.modules_manager.set_modules(self.modules)
    self.do_load_modules()
    # and start external modules too
    self.modules_manager.start_external_instances()
    # Do the modules part, we have our modules in self.modules
    # REF: doc/receiver-modules.png (1)
    # Now the main loop
    self.do_mainloop()
def main(self):
    """
    module "main" method. Only used by external modules.
    Installs the signal handlers, loops on do_loop_turn() until
    interrupted, then stops cleanly.
    """
    self.set_signal_handler()
    logger.log("[%s[%d]]: Now running.." % (self.name, os.getpid()))
    # Loop until a signal handler sets self.interrupted
    while not self.interrupted:
        self.do_loop_turn()
    self.do_stop()
    logger.log("[%s]: exiting now.." % (self.name))
def create_connection(self):
    """Create the Pyro proxy toward this satellite's "ForArbiter"
    interface; on failure self.con stays None and the error is logged."""
    try:
        self.uri = pyro.create_uri(self.address, self.port, "ForArbiter", self.__class__.use_ssl)
        self.con = pyro.getProxy(self.uri)
        pyro.set_timeout(self.con, self.timeout)
    except Pyro_exp_pack, exp:
        # No connection: the caller will retry later
        self.con = None
        logger.log('Error : in creation connexion for %s : %s' % (self.get_name(), str(exp)))
def hook_early_configuration(self, arb): logger.log("[IpTag] in hook late config") for h in arb.conf.hosts: if not hasattr(h, 'address') and not hasattr(h, 'host_name'): continue # The address to resolve addr = None #By default take the address, if not, take host_name if not hasattr(h, 'address'): addr = h.host_name else: addr = h.address print "Looking for h", h.get_name() print addr h_ip = None try: IP(addr) # If we reach here, it's it was a real IP :) h_ip = addr except: pass # Ok, try again with name resolution if not h_ip: try: h_ip = socket.gethostbyname(addr) except: pass # Ok, maybe we succeed :) print "Host ip is:", h_ip # If we got an ip that match and the object do not already got # the property, tag it! if h_ip and h_ip in self.ip_range: print "Is in the range" # 2 cases : append or replace. # append will join with the value if exist # replace will replace it if NOT existing if self.method == 'append': orig_v = getattr(h, self.property, '') print "Orig_v", orig_v new_v = ','.join([orig_v, self.value]) print "Newv", new_v setattr(h, self.property, new_v) # If it's a poller_tag, remember to also tag commands! if(self.property == 'poller_tag'): h.check_command.poller_tag = self.value if self.method == 'replace': if not hasattr(h, self.property): # Ok, set the value! setattr(h, self.property, self.value) # If it's a poller_tag, remember to also tag commands! if(self.property == 'poller_tag'): h.check_command.poller_tag = self.value
def pynag_con_init(self, id, type="scheduler"):
    """(Re)create the Pyro connection to satellite `id` of the given type.

    Skips inactive schedulers and throttles connection retries. On
    success, a changed running_id means the satellite restarted: obsolete
    broks are cleared and, for a scheduler, a full initial brok
    generation is requested.
    """
    # Get the good links tab for looping..
    links = self.get_links_from_type(type)
    if links is None:
        logger.log("DBG: Type unknown for connection! %s" % type)
        return
    if type == "scheduler":
        # If sched is not active, I do not try to init
        # it is just useless
        is_active = links[id]["active"]
        if not is_active:
            return
    # If we try to connect too much, we slow down our tests
    if self.is_connection_try_too_close(links[id]):
        return
    # Ok, we can now update it
    links[id]["last_connection"] = time.time()
    # DBG: print "Init connection with", links[id]['uri']
    running_id = links[id]["running_id"]
    # DBG: print "Running id before connection", running_id
    uri = links[id]["uri"]
    links[id]["con"] = Pyro.core.getProxyForURI(uri)
    try:
        # initial ping must be quick
        pyro.set_timeout(links[id]["con"], 5)
        links[id]["con"].ping()
        new_run_id = links[id]["con"].get_running_id()
        # data transfer can be longer
        pyro.set_timeout(links[id]["con"], 120)
        # The schedulers have been restarted: it has a new run_id.
        # So we clear all verifs, they are obsolete now.
        if new_run_id != running_id:
            print "[%s] New running id for the %s %s : %s (was %s)" % (self.name, type, links[id]["name"], new_run_id, running_id)
            links[id]["broks"].clear()
            # we must ask for a new full broks if
            # it's a scheduler
            if type == "scheduler":
                print "[%s] I ask for a broks generation to the scheduler %s" % (self.name, links[id]["name"])
                links[id]["con"].fill_initial_broks()
            # else:
            #     print "I do not ask for brok generation"
        links[id]["running_id"] = new_run_id
    except (Pyro.errors.ProtocolError, Pyro.errors.CommunicationError), exp:
        # Handshake failed: log and drop the connection for a later retry
        logger.log("[%s] Connexion problem to the %s %s : %s" % (self.name, type, links[id]["name"], str(exp)))
        links[id]["con"] = None
    return
def register_local_log(self): # The arbiter don't have such an attribute if hasattr(self, 'use_local_log') and self.use_local_log: try: self.local_log_fd = self.log.register_local_log(self.local_log) except IOError, exp: print "Error : opening the log file '%s' failed with '%s'" % (self.local_log, exp) sys.exit(2) logger.log("Using the local log file '%s'" % self.local_log)
def stop_process(self):
    """Terminate this module's external process (if any) and forget it."""
    proc = self.process
    if not proc:
        return
    logger.log("I'm stopping module '%s' process pid:%s " % (self.get_name(), proc.pid))
    proc.terminate()
    proc.join(timeout=1)
    if proc.is_alive():
        # terminate() was not enough: force the kill
        logger.log("The process is still alive, I help it to die")
        self.__kill()
    self.process = None
def pynag_con_init(self, id, type='scheduler'):
    """(Re)create the Pyro connection to satellite `id` of the given type.

    Skips inactive schedulers and throttles connection retries. On
    success, a changed running_id means the satellite restarted: obsolete
    broks are cleared and, for a scheduler, a full initial brok
    generation is requested.
    """
    # Get the good links tab for looping..
    links = self.get_links_from_type(type)
    if links is None:
        logger.log('DBG: Type unknown for connexion! %s' % type)
        return
    if type == 'scheduler':
        # If sched is not active, I do not try to init
        # it is just useless
        is_active = links[id]['active']
        if not is_active:
            return
    # If we try to connect too much, we slow down our tests
    if self.is_connexion_try_too_close(links[id]):
        return
    # Ok, we can now update it
    links[id]['last_connexion'] = time.time()
    # DBG: print "Init connexion with", links[id]['uri']
    running_id = links[id]['running_id']
    # DBG: print "Running id before connexion", running_id
    uri = links[id]['uri']
    links[id]['con'] = Pyro.core.getProxyForURI(uri)
    try:
        # initial ping must be quick
        pyro.set_timeout(links[id]['con'], 5)
        links[id]['con'].ping()
        new_run_id = links[id]['con'].get_running_id()
        # data transfer can be longer
        pyro.set_timeout(links[id]['con'], 120)
        # The schedulers have been restarted: it has a new run_id.
        # So we clear all verifs, they are obsolete now.
        if new_run_id != running_id:
            print "[%s] New running id for the %s %s : %s (was %s)" % (self.name, type, links[id]['name'], new_run_id, running_id)
            links[id]['broks'].clear()
            # we must ask for a new full broks if
            # it's a scheduler
            if type == 'scheduler':
                print "[%s] I ask for a broks generation to the scheduler %s" % (self.name, links[id]['name'])
                links[id]['con'].fill_initial_broks()
            # else:
            #     print "I do not ask for brok generation"
        links[id]['running_id'] = new_run_id
    except (Pyro.errors.ProtocolError, Pyro.errors.CommunicationError), exp:
        # Handshake failed: log and drop the connection for a later retry
        logger.log("[%s] Connexion problem to the %s %s : %s" % (self.name, type, links[id]['name'], str(exp)))
        links[id]['con'] = None
    return
def start(self):
    """ Start this module process if it's external. if not -> do nothing """
    if not self.is_external:
        return
    # Make sure any previous process is gone first
    self.stop_process()
    logger.log("Starting external process for instance %s" % (self.name))
    proc = Process(target=self.main, args=())
    self.process = proc
    self.properties['process'] = proc  ## TODO: temporary
    proc.start()
    logger.log("%s is now started ; pid=%d" % (self.name, proc.pid))
def register_local_log(self):
    """Open the configured local log file, if any; exits the process
    (code 2) when the file cannot be opened."""
    # The arbiter don't have such an attribute
    if hasattr(self, 'use_local_log') and self.use_local_log:
        try:
            self.local_log_fd = self.log.register_local_log(self.local_log)
        except IOError, exp:
            # A daemon without its log file is not worth running
            print "Error : opening the log file '%s' failed with '%s'" % (self.local_log, exp)
            sys.exit(2)
        logger.log("Using the local log file '%s'" % self.local_log)
def hook_point(self, hook_name): for inst in self.modules_manager.instances: full_hook_name = 'hook_' + hook_name if hasattr(inst, full_hook_name): f = getattr(inst, full_hook_name) try : f(self) except Exception, exp: logger.log('The instance %s raise an exception %s. I disable, and set it to restart later' % (inst.get_name(), str(exp))) self.modules_manager.set_to_restart(inst)
def raise_notification_log_entry(self, n):
    """Log a HOST NOTIFICATION line for notification n, when the class
    has notification logging enabled."""
    contact = n.contact
    command = n.command_call
    # Out-of-band notification types show the type plus the current state
    if n.type in ('DOWNTIMESTART', 'DOWNTIMEEND', 'CUSTOM', 'ACKNOWLEDGEMENT', 'FLAPPINGSTART', 'FLAPPINGSTOP', 'FLAPPINGDISABLED'):
        state = '%s (%s)' % (n.type, self.state)
    else:
        state = self.state
    if self.__class__.log_notifications:
        logger.log("HOST NOTIFICATION: %s;%s;%s;%s;%s" % (contact.get_name(), self.get_name(), state, \
            command.get_name(), self.output))
def set_dead(self):
    """Mark this satellite as dead; raise an update-status brok only on
    the alive -> dead transition."""
    was_alive = self.alive
    self.alive = False
    self.con = None
    # Nothing to announce if it was already dead
    if not was_alive:
        return
    logger.log("Warning : Setting the satellite %s to a dead state." % self.get_name())
    self.broks.append(self.get_update_status_brok())
def add_failed_check_attempt(self, reason=''):
    """Record one more failed check attempt; switch the satellite to a
    dead state once max_check_attempts is reached."""
    self.reachable = False
    # Bump the counter, capped at max_check_attempts
    self.attempt = min(self.attempt + 1, self.max_check_attempts)
    # Don't warn again and again if the satellite is already dead
    if self.alive:
        logger.log("Info : Add failed attempt to %s (%d/%d) %s" % (self.get_name(), self.attempt, self.max_check_attempts, reason))
    # check when we just go HARD (dead)
    if self.attempt == self.max_check_attempts:
        self.set_dead()
def stop_process(self):
    """
    Request the module process to stop and release it
    """
    if self.process:
        logger.log("I'm stopping process pid:%s " % self.process.pid)
        self.process.terminate()
        self.process.join(timeout=1)
        # (removed a leftover debug statement: print dir(self.process))
        if self.process.is_alive():
            # terminate() was not enough: force the kill
            logger.log("The process is still alive, I help it to die")
            self.__kill()
        self.process = None
def start_external_instances(self):
    """Init then start every external module instance; instances whose
    init failed are flagged for a later restart."""
    externals = [inst for inst in self.instances if inst.is_external]
    for inst in externals:
        # Maybe the init failed a bit, so bypass this one for now
        if not self.try_instance_init(inst):
            logger.log("Warning : the module '%s' failed to init, I will try to restart it later" % inst.get_name())
            self.to_restart.append(inst)
            continue
        # ok, init succeeded
        logger.log("Starting external module %s" % inst.get_name())
        inst.start()
def hook_point(self, hook_name):
    """Call hook_<hook_name>(self) on every module instance providing it.

    A module raising any exception is disabled and flagged for restart.
    """
    for inst in self.modules_manager.instances:
        full_hook_name = 'hook_' + hook_name
        if hasattr(inst, full_hook_name):
            f = getattr(inst, full_hook_name)
            try:
                f(self)
            except Exception, exp:
                # A broken module must not take the daemon down with it
                logger.log('The instance %s raise an exception %s. I disable, and set it to restart later' % (inst.get_name(), str(exp)))
                self.modules_manager.set_to_restart(inst)
def start_external_instances(self):
    """Init then start every external module instance; instances whose
    init failed are flagged for a later restart."""
    for inst in [inst for inst in self.instances if inst.is_external]:
        # But maybe the init failed a bit, so bypass this ones from now
        if not self.try_instance_init(inst):
            logger.log("Warning : the module '%s' failed to init, I will try to restart it later" % inst.get_name())
            self.to_restart.append(inst)
            continue
        # ok, init succeed
        print "Starting external module %s" % inst.get_name(), inst.from_q
        inst.start()
def do_stop(self):
    """Stop the daemon: save retention (except for the scheduler), stop
    all modules, shut the Pyro daemon down and close the logger."""
    if self.modules_manager:
        # We save what we can but NOT for the scheduler
        # because the current sched object is a dummy one
        # and the old one aleady do it!
        if not hasattr(self, 'sched'):
            self.hook_point('save_retention')
        # And we quit
        logger.log('Stopping all modules')
        self.modules_manager.stop_all()
    if self.pyro_daemon:
        pyro.shutdown(self.pyro_daemon)  #.shutdown(True)
    logger.quit()
def wait_for_initial_conf(self, timeout=1.0):
    """Block until the arbiter pushes an initial configuration.

    Keeps servicing Pyro requests while waiting, and writes a progress
    dot roughly every `timeout` seconds. Returns once self.new_conf is
    set or the daemon is interrupted.
    """
    logger.log("Waiting for initial configuration")
    remaining = timeout
    # The arbiter has not set our have_conf param yet
    while not (self.new_conf or self.interrupted):
        elapsed, _, _ = self.handleRequests(remaining)
        if elapsed:
            remaining -= elapsed
            if remaining > 0:
                continue
        # Timeout slice consumed (or nothing elapsed): reset and show progress
        remaining = timeout
        sys.stdout.write(".")
        sys.stdout.flush()
def check_alive_instances(self):
    """Detect external module instances whose process died and tag them
    for restart; their queues are cleared since they are now useless.
    """
    # Only external instances have a process; skip those already
    # awaiting a restart.
    for inst in self.instances:
        if inst in self.to_restart:
            continue
        if inst.is_external and not inst.process.is_alive():
            logger.log(
                "Error : the external module %s goes down unexpectedly!"
                % inst.get_name())
            logger.log("Setting the module %s to restart" % inst.get_name())
            # We clean its queues, they are no more useful
            inst.clear_queues()
            self.to_restart.append(inst)
def is_correct(self):
    """Check this object's configuration.

    Reports (via the logger) previously saved configuration errors and
    any missing required property.

    :return: True when no error was found, False otherwise.
    """
    state = True
    cls_properties = self.__class__.properties
    # Raise all previously saw errors like unknown contacts and co
    if self.configuration_errors:
        state = False
        for err in self.configuration_errors:
            logger.log(err)
    # Every required property must be set on the object
    for prop, entry in cls_properties.items():
        if entry.required and not hasattr(self, prop):
            # Use the logger instead of a raw print so the report ends
            # up in the daemon log like every other config error
            logger.log("%s : missing property %s" % (self.get_name(), prop))
            state = False
    return state
def check_and_do_archive(self, first_pass=False):
    """Archive the log file if it was last modified on a previous day.

    The current file is moved into self.archive_path under a name like
    'nagios-05-09-2010-00.log' and a fresh file is opened at self.path.

    :param first_pass: True on the very first call, when self.file is
                       not open yet (so there is nothing to close).
    :return: True if an archive move was done, False otherwise.
    """
    now = int(time.time())
    # First check if the file last modification (or creation) was not today
    try:
        t_last_mod = int(float(str(os.path.getmtime(self.path))))
    except OSError:
        # No file at self.path for now, so no move :)
        return False
    t_last_mod_day = get_day(t_last_mod)
    today = get_day(now)
    if t_last_mod_day == today:
        return False
    logger.log("We are archiving the old log file")
    # For the first pass, the file is not already open
    if not first_pass:
        self.file.close()
    # Build the archive name: f_name is like nagios.log ->
    # (nagios, .log), then append the day suffix like -05-09-2010-00
    f_name = os.path.basename(self.path)
    (f_base_name, ext) = os.path.splitext(f_name)
    d = datetime.datetime.fromtimestamp(today)
    s_day = d.strftime("-%m-%d-%Y-00")
    archive_name = f_base_name + s_day + ext
    file_archive_path = os.path.join(self.archive_path, archive_name)
    logger.log("Moving the old log file from %s to %s" % (self.path, file_archive_path))
    shutil.move(self.path, file_archive_path)
    # Reopen a fresh log file (the old code used a raw debug print here)
    self.file = open(self.path, 'a')
    return True
def test_utf8log(self): sutf = 'h\351h\351' # Latin Small Letter E with acute in Latin-1 logger.log(sutf) sutf8 = u'I love myself $£¤' # dollar, pound, currency logger.log(sutf8) s = unichr(40960) + u'abcd' + unichr(1972) logger.log(s)
def dump_memory(self):
    """Log a heap dump via guppy; just log a hint if guppy is missing."""
    logger.log("I dump my memory, it can ask some seconds to do")
    try:
        from guppy import hpy
    except ImportError:
        # guppy is optional: tell the admin how to get the feature
        logger.log(
            'I do not have the module guppy for memory dump, please install it'
        )
    else:
        heap_inspector = hpy()
        logger.log(heap_inspector.heap())
def check_bad_dispatch(self):
    """Look for dispatch incoherences and fix them.

    Two checks are done:
    - an element that should have no configuration but reports one is
      asked to go idle and wait for a new conf;
    - a satellite managing a conf id that its realm does not credit it
      with is asked to drop that conf id (or to go fully idle if
      nothing would be left).
    """
    for elt in self.elements:
        if hasattr(elt, 'conf'):
            # If element have a conf, I do not care, it's a good dispatch
            # If it died, I do not ask it something: it won't respond
            if elt.conf is None and elt.reachable:
                if elt.have_conf():
                    logger.log('Warning : The element %s have a conf and should not have one! I ask it to idle now' % elt.get_name())
                    elt.active = False
                    elt.wait_new_conf()
                    # I do not care about orders not sent. If any,
                    # the next loop will resend them

    # I ask satellites which sched_id they manage. If I do not agree,
    # I ask them to remove it
    for satellite in self.satellites:
        kind = satellite.get_my_type()
        if not satellite.reachable:
            continue
        cfg_ids = satellite.what_i_managed()
        # I do not care about satellites that do nothing, they already
        # do what I want :)
        if len(cfg_ids) == 0:
            continue
        id_to_delete = []
        for cfg_id in cfg_ids:
            # Ok, we search for the realm that has the conf
            for r in self.realms:
                if cfg_id in r.confs:
                    # Ok we've got the realm, we check its
                    # to_satellites_managed_by to see if the satellite
                    # is in. If not, we remove the sched_id from it
                    if satellite not in r.to_satellites_managed_by[kind][cfg_id]:
                        id_to_delete.append(cfg_id)
        # Maybe we removed all cfg_ids of this satellite:
        # we can make it idle, not active, and wait for a new conf
        if len(id_to_delete) == len(cfg_ids):
            satellite.active = False
            logger.log("I ask %s to wait a new conf" % satellite.get_name())
            satellite.wait_new_conf()
        else:
            # It is not fully idle, just has fewer confs.
            # Bug fix: iterate over the ids to delete — the old code
            # reused the stale 'cfg_id' loop variable here, so it
            # logged/removed the same (last-seen) id repeatedly.
            for bad_id in id_to_delete:
                logger.log("I ask to remove configuration N%d from %s" % (bad_id, satellite.get_name()))
                satellite.remove_from_conf(bad_id)
def manage_brok(self, b): # Call all modules if they catch the call for mod in self.modules_manager.get_internal_instances(): try: mod.manage_brok(b) except Exception, exp: print exp.__dict__ logger.log( "[%s] Warning : The mod %s raise an exception: %s, I'm tagging it to restart later" % (self.name, mod.get_name(), str(exp))) logger.log("[%s] Exception type : %s" % (self.name, type(exp))) logger.log("Back trace of this kill: %s" % (traceback.format_exc())) self.modules_manager.set_to_restart(inst)
def manage_brok(self, b): to_del = [] # Call all modules if they catch the call for mod in self.modules_manager.get_internal_instances(): try: mod.manage_brok(b) except Exception, exp: print exp.__dict__ logger.log( "[%s] Warning : The mod %s raise an exception: %s, I kill it" % (self.name, mod.get_name(), str(exp))) logger.log("[%s] Exception type : %s" % (self.name, type(exp))) logger.log("Back trace of this kill: %s" % (traceback.format_exc())) to_del.append(mod)
def get_instances(self): """ Create, init and then returns the list of module instances that the caller needs. If an instance can't be created or init'ed then only log is done. That instance is skipped. The previous modules instance(s), if any, are all cleaned. """ self.clear_instances() for (mod_conf, module) in self.modules_assoc: try: mod_conf.properties = module.properties.copy() inst = module.get_instance(mod_conf) if inst is None: #None = Bad thing happened :) logger.log("get_instance for module %s returned None !" % (mod_conf.get_name())) continue assert (isinstance(inst, BaseModule)) self.instances.append(inst) except Exception, exp: logger.log( "Error : the module %s raised an exception %s, I remove it!" % (mod_conf.get_name(), str(exp))) output = cStringIO.StringIO() traceback.print_exc(file=output) logger.log("Back trace of this remove : %s" % (output.getvalue())) output.close()
def hook_save_retention(self, daemon):
    """Deliberate no-op: this module reads retention data but never
    writes it, so the save hook only logs that it does nothing."""
    message = ("[NagiosRetention] asking me to update the retention "
               "objects, but I won't do it.")
    logger.log(message)