class Poller(Satellite):
    """Poller satellite daemon.

    It only launches checks requested by the schedulers (do_checks) and
    never runs actions such as notifications or event handlers
    (do_actions).
    """
    do_checks = True    # I do checks
    do_actions = False  # but no actions

    properties = Satellite.properties.copy()
    properties.update({
        'pidfile': PathProp(default='pollerd.pid'),
        # BUGFIX: the port is an IntegerProp, so give it an int default
        # (was the string '7771', inconsistent with the other satellites).
        'port': IntegerProp(default=7771),
        'local_log': PathProp(default='pollerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file):
        super(Poller, self).__init__(
            'poller', config_file, is_daemon, do_replace, debug, debug_file)
class Reactionner(Satellite):
    """Reactionner satellite daemon.

    Runs only actions (notifications, event handlers) on behalf of the
    schedulers; it never launches checks.
    """
    do_checks = False   # I do not do checks
    do_actions = True
    my_type = 'reactionner'

    properties = Satellite.properties.copy()
    properties.update({
        'pidfile': PathProp(default='reactionnerd.pid'),
        'port': IntegerProp(default=7769),
        'local_log': PathProp(default='reactionnerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file,
                 profile=''):
        # NOTE(review): 'profile' is accepted for interface compatibility
        # but is not forwarded to the parent class.
        super(Reactionner, self).__init__(
            'reactionner', config_file, is_daemon, do_replace,
            debug, debug_file)
class Reactionner(Satellite):
    """Reactionner satellite daemon (legacy variant with absolute paths).

    Runs only actions such as notifications; it never launches checks.
    """
    do_checks = False   # I do not do checks
    do_actions = True   # just actions like notifications

    properties = Satellite.properties.copy()
    properties.update({
        'pidfile': PathProp(default='/usr/local/shinken/var/reactionnerd.pid'),
        # BUGFIX: the port is an IntegerProp, so give it an int default
        # (was the string '7769').
        'port': IntegerProp(default=7769),
        'local_log': PathProp(default='/usr/local/shinken/var/reactionnerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file):
        super(Reactionner, self).__init__(
            'reactionner', config_file, is_daemon, do_replace, debug, debug_file)
class Receiver(BaseSatellite): properties = BaseSatellite.properties.copy() properties.update({ 'pidfile': PathProp(default='/usr/local/shinken/var/receiverd.pid'), 'port': IntegerProp(default='7773'), 'local_log': PathProp(default='/usr/local/shinken/var/receiverd.log'), }) def __init__(self, config_file, is_daemon, do_replace, debug, debug_file): super(Receiver, self).__init__('receiver', config_file, is_daemon, do_replace, debug, debug_file) # Our arbiters self.arbiters = {} # Our pollers and reactionners self.pollers = {} self.reactionners = {} # Modules are load one time self.have_modules = False # Can have a queue of external_commands give by modules # will be taken by arbiter to process self.external_commands = [] # All broks to manage self.broks = [] # broks to manage # broks raised this turn and that need to be put in self.broks self.broks_internal_raised = [] # Schedulers have some queues. We can simplify call by adding # elements into the proper queue just by looking at their type # Brok -> self.broks # TODO : better tag ID? # External commands -> self.external_commands def add(self, elt): cls_type = elt.__class__.my_type if cls_type == 'brok': # For brok, we TAG brok with our instance_id elt.data['instance_id'] = 0 self.broks_internal_raised.append(elt) return elif cls_type == 'externalcommand': print "Adding in queue an external command", ExternalCommand.__dict__ self.external_commands.append(elt) # # Get teh good tabs for links by the kind. If unknown, return None # def get_links_from_type(self, type): # t = {'scheduler' : self.schedulers, 'arbiter' : self.arbiters, \ # 'poller' : self.pollers, 'reactionner' : self.reactionners} # if type in t : # return t[type] # return None # Call by arbiter to get our external commands def get_external_commands(self): res = self.external_commands self.external_commands = [] return res # Get a brok. Our role is to put it in the modules # THEY MUST DO NOT CHANGE data of b !!! 
# REF: doc/receiver-modules.png (4-5) def manage_brok(self, b): to_del = [] # Call all modules if they catch the call for mod in self.modules_manager.get_internal_instances(): try: mod.manage_brok(b) except Exception, exp: print exp.__dict__ logger.log( "[%s] Warning : The mod %s raise an exception: %s, I kill it" % (self.name, mod.get_name(), str(exp))) logger.log("[%s] Exception type : %s" % (self.name, type(exp))) logger.log("Back trace of this kill: %s" % (traceback.format_exc())) to_del.append(mod) # Now remove mod that raise an exception self.modules_manager.clear_instances(to_del)
class Receiver(Satellite): my_type = 'receiver' properties = Satellite.properties.copy() properties.update({ 'pidfile': PathProp(default='receiverd.pid'), 'port': IntegerProp(default=7773), 'local_log': PathProp(default='receiverd.log'), }) def __init__(self, config_file, is_daemon, do_replace, debug, debug_file): super(Receiver, self).__init__('receiver', config_file, is_daemon, do_replace, debug, debug_file) # Our arbiters self.arbiters = {} # Our pollers and reactionners self.pollers = {} self.reactionners = {} # Modules are load one time self.have_modules = False # Can have a queue of external_commands give by modules # will be taken by arbiter to process self.external_commands = [] # and the unprocessed one, a buffer self.unprocessed_external_commands = [] self.host_assoc = {} self.direct_routing = False self.accept_passive_unknown_check_results = False self.istats = IStats(self) self.ibroks = IBroks(self) # Now create the external commander. It's just here to dispatch # the commands to schedulers e = ExternalCommandManager(None, 'receiver') e.load_receiver(self) self.external_command = e # Schedulers have some queues. We can simplify call by adding # elements into the proper queue just by looking at their type # Brok -> self.broks # TODO: better tag ID? # External commands -> self.external_commands def add(self, elt): cls_type = elt.__class__.my_type if cls_type == 'brok': # For brok, we TAG brok with our instance_id elt.instance_id = 0 self.broks[elt.id] = elt return elif cls_type == 'externalcommand': logger.debug("Enqueuing an external command: %s", str(ExternalCommand.__dict__)) self.unprocessed_external_commands.append(elt) def push_host_names(self, sched_id, hnames): for h in hnames: self.host_assoc[h] = sched_id def get_sched_from_hname(self, hname): i = self.host_assoc.get(hname, None) e = self.schedulers.get(i, None) return e # Get a brok. Our role is to put it in the modules # THEY MUST DO NOT CHANGE data of b!!! 
# REF: doc/receiver-modules.png (4-5) def manage_brok(self, b): to_del = [] # Call all modules if they catch the call for mod in self.modules_manager.get_internal_instances(): try: mod.manage_brok(b) except Exception, exp: logger.warning("The mod %s raise an exception: %s, I kill it", mod.get_name(), str(exp)) logger.warning("Exception type: %s", type(exp)) logger.warning("Back trace of this kill: %s", traceback.format_exc()) to_del.append(mod) # Now remove mod that raise an exception self.modules_manager.clear_instances(to_del)
class Broker(BaseSatellite): properties = BaseSatellite.properties.copy() properties.update({ 'pidfile': PathProp(default='brokerd.pid'), 'port': IntegerProp(default=7772), 'local_log': PathProp(default='brokerd.log'), }) def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile=''): super(Broker, self).__init__('broker', config_file, is_daemon, do_replace, debug, debug_file) # Our arbiters self.arbiters = {} # Our pollers, reactionners and receivers self.pollers = {} self.reactionners = {} self.receivers = {} # Modules are load one time self.have_modules = False # Can have a queue of external_commands given by modules # will be processed by arbiter self.external_commands = [] # All broks to manage self.broks = deque() # broks to manage self.external_module_broks = deque() # broks during this loop to send to external modules self.broks_lock = threading.RLock() # to manage lock when managing broks # broks raised this turn and that needs to be put in self.broks self.broks_internal_raised = [] # broks raised by the arbiters, we need a lock so the push can be in parallel # to our current activities and won't lock the arbiter self.arbiter_broks = [] self.arbiter_broks_lock = threading.RLock() self.timeout = 1.0 self.istats = IStats(self) # Schedulers have some queues. We can simplify the call by adding # elements into the proper queue just by looking at their type # Brok -> self.broks # TODO: better tag ID? # External commands -> self.external_commands def add(self, elt): cls_type = elt.__class__.my_type if cls_type == 'brok': # For brok, we TAG brok with our instance_id elt.instance_id = 0 self.broks_internal_raised.append(elt) return elif cls_type == 'externalcommand': logger.debug("Enqueuing an external command '%s'", str(ExternalCommand.__dict__)) self.external_commands.append(elt) # Maybe we got a Message from the modules, it's way to ask something # like from now a full data from a scheduler for example. 
elif cls_type == 'message': # We got a message, great! logger.debug(str(elt.__dict__)) if elt.get_type() == 'NeedData': data = elt.get_data() # Full instance id means: I got no data for this scheduler # so give me all dumbass! if 'full_instance_id' in data: c_id = data['full_instance_id'] source = elt.source logger.info('The module %s is asking me to get all initial data ' 'from the scheduler %d', source, c_id) # so we just reset the connection and the running_id, # it will just get all new things try: self.schedulers[c_id]['con'] = None self.schedulers[c_id]['running_id'] = 0 except KeyError: # maybe this instance was not known, forget it logger.warning("the module %s ask me a full_instance_id " "for an unknown ID (%d)!", source, c_id) # Maybe a module tells me that it's dead, I must log it's last words... if elt.get_type() == 'ICrash': data = elt.get_data() logger.error('the module %s just crash! Please look at the traceback:', data['name']) logger.error(data['trace']) # The module death will be looked for elsewhere and restarted. # Get the good tabs for links by the kind. If unknown, return None def get_links_from_type(self, d_type): t = {'scheduler': self.schedulers, 'arbiter': self.arbiters, 'poller': self.pollers, 'reactionner': self.reactionners, 'receiver': self.receivers } if d_type in t: return t[d_type] return None # Check if we do not connect to often to this def is_connection_try_too_close(self, elt): now = time.time() last_connection = elt['last_connection'] if now - last_connection < 5: return True return False # wrapper function for the real function do_ # just for timing the connection def pynag_con_init(self, id, type='scheduler'): _t = time.time() r = self.do_pynag_con_init(id, type) statsmgr.timing('con-init.%s' % type, time.time() - _t, 'perf') return r # initialize or re-initialize connection with scheduler or # arbiter if type == arbiter def do_pynag_con_init(self, id, type='scheduler'): # Get the good links tab for looping.. 
links = self.get_links_from_type(type) if links is None: logger.debug('Type unknown for connection! %s', type) return # default timeout for daemons like pollers/reactionners/... timeout = 3 data_timeout = 120 if type == 'scheduler': # If sched is not active, I do not try to init # it is just useless is_active = links[id]['active'] if not is_active: return # schedulers also got real timeout to respect timeout = links[id]['timeout'] data_timeout = links[id]['data_timeout'] # If we try to connect too much, we slow down our tests if self.is_connection_try_too_close(links[id]): return # Ok, we can now update it links[id]['last_connection'] = time.time() # DBG: print "Init connection with", links[id]['uri'] running_id = links[id]['running_id'] # DBG: print "Running id before connection", running_id uri = links[id]['uri'] try: con = links[id]['con'] = HTTPClient(uri=uri, strong_ssl=links[id]['hard_ssl_name_check'], timeout=timeout, data_timeout=data_timeout) except HTTPExceptions, exp: # But the multiprocessing module is not compatible with it! # so we must disable it immediately after logger.info("Connection problem to the %s %s: %s", type, links[id]['name'], str(exp)) links[id]['con'] = None return try: # initial ping must be quick con.get('ping') new_run_id = con.get('get_running_id') new_run_id = float(new_run_id) # data transfer can be longer # The schedulers have been restarted: it has a new run_id. # So we clear all verifs, they are obsolete now. 
if new_run_id != running_id: logger.debug("[%s] New running id for the %s %s: %s (was %s)", self.name, type, links[id]['name'], new_run_id, running_id) del links[id]['broks'][:] # we must ask for a new full broks if # it's a scheduler if type == 'scheduler': logger.debug("[%s] I ask for a broks generation to the scheduler %s", self.name, links[id]['name']) con.get('fill_initial_broks', {'bname': self.name}, wait='long') # Ok all is done, we can save this new running id links[id]['running_id'] = new_run_id except HTTPExceptions, exp: logger.info("Connection problem to the %s %s: %s", type, links[id]['name'], str(exp)) links[id]['con'] = None return
class Shinken(BaseSatellite): properties = BaseSatellite.properties.copy() properties.update({ 'pidfile': PathProp(default='schedulerd.pid'), 'port': IntegerProp(default='7768'), 'local_log': PathProp(default='schedulerd.log'), }) # Create the shinken class: # Create a Pyro server (port = arvg 1) # then create the interface for arbiter # Then, it wait for a first configuration def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile=''): BaseSatellite.__init__(self, 'scheduler', config_file, is_daemon, do_replace, debug, debug_file) self.interface = IForArbiter(self) self.istats = IStats(self) self.sched = Scheduler(self) self.ichecks = None self.ibroks = None self.must_run = True # Now the interface self.uri = None self.uri2 = None # And possible links for satellites # from now only pollers self.pollers = {} self.reactionners = {} self.brokers = {} def do_stop(self): if self.http_daemon: if self.ibroks: self.http_daemon.unregister(self.ibroks) if self.ichecks: self.http_daemon.unregister(self.ichecks) super(Shinken, self).do_stop() def compensate_system_time_change(self, difference): """ Compensate a system time change of difference for all hosts/services/checks/notifs """ logger.warning("A system time change of %d has been detected. Compensating..." 
% difference) # We only need to change some value self.program_start = max(0, self.program_start + difference) if not hasattr(self.sched, "conf"): # Race condition where time change before getting conf return # Then we compensate all host/services for h in self.sched.hosts: h.compensate_system_time_change(difference) for s in self.sched.services: s.compensate_system_time_change(difference) # Now all checks and actions for c in self.sched.checks.values(): # Already launch checks should not be touch if c.status == 'scheduled' and c.t_to_go is not None: t_to_go = c.t_to_go ref = c.ref new_t = max(0, t_to_go + difference) if ref.check_period is not None: # But it's no so simple, we must match the timeperiod new_t = ref.check_period.get_next_valid_time_from_t(new_t) # But maybe no there is no more new value! Not good :( # Say as error, with error output if new_t is None: c.state = 'waitconsume' c.exit_status = 2 c.output = '(Error: there is no available check time after time change!)' c.check_time = time.time() c.execution_time = 0 else: c.t_to_go = new_t ref.next_chk = new_t # Now all checks and actions for c in self.sched.actions.values(): # Already launch checks should not be touch if c.status == 'scheduled': t_to_go = c.t_to_go # Event handler do not have ref ref = getattr(c, 'ref', None) new_t = max(0, t_to_go + difference) # Notification should be check with notification_period if c.is_a == 'notification': if ref.notification_period: # But it's no so simple, we must match the timeperiod new_t = ref.notification_period.get_next_valid_time_from_t(new_t) # And got a creation_time variable too c.creation_time = c.creation_time + difference # But maybe no there is no more new value! 
Not good :( # Say as error, with error output if new_t is None: c.state = 'waitconsume' c.exit_status = 2 c.output = '(Error: there is no available check time after time change!)' c.check_time = time.time() c.execution_time = 0 else: c.t_to_go = new_t def manage_signal(self, sig, frame): logger.warning("Received a SIGNAL %s" % sig) # If we got USR1, just dump memory if sig == signal.SIGUSR1: self.sched.need_dump_memory = True elif sig == signal.SIGUSR2: #usr2, dump objects self.sched.need_objects_dump = True else: # if not, die :) self.sched.die() self.must_run = False Daemon.manage_signal(self, sig, frame) def do_loop_turn(self): # Ok, now the conf self.wait_for_initial_conf() if not self.new_conf: return logger.info("New configuration received") self.setup_new_conf() logger.info("New configuration loaded") self.sched.run() def setup_new_conf(self): pk = self.new_conf conf_raw = pk['conf'] override_conf = pk['override_conf'] modules = pk['modules'] satellites = pk['satellites'] instance_name = pk['instance_name'] push_flavor = pk['push_flavor'] skip_initial_broks = pk['skip_initial_broks'] accept_passive_unknown_check_results = pk['accept_passive_unknown_check_results'] # horay, we got a name, we can set it in our stats objects statsmgr.register(instance_name, 'scheduler') t0 = time.time() conf = cPickle.loads(conf_raw) logger.debug("Conf received at %d. 
Unserialized in %d secs" % (t0, time.time() - t0)) self.new_conf = None # Tag the conf with our data self.conf = conf self.conf.push_flavor = push_flavor self.conf.instance_name = instance_name self.conf.skip_initial_broks = skip_initial_broks self.conf.accept_passive_unknown_check_results = accept_passive_unknown_check_results self.cur_conf = conf self.override_conf = override_conf self.modules = modules self.satellites = satellites #self.pollers = self.app.pollers if self.conf.human_timestamp_log: logger.set_human_format() # Now We create our pollers for pol_id in satellites['pollers']: # Must look if we already have it already_got = pol_id in self.pollers p = satellites['pollers'][pol_id] self.pollers[pol_id] = p if p['name'] in override_conf['satellitemap']: p = dict(p) # make a copy p.update(override_conf['satellitemap'][p['name']]) proto = 'http' if p['use_ssl']: proto = 'https' uri = '%s://%s:%s/' % (proto, p['address'], p['port']) self.pollers[pol_id]['uri'] = uri self.pollers[pol_id]['last_connection'] = 0 # First mix conf and override_conf to have our definitive conf for prop in self.override_conf: #print "Overriding the property %s with value %s" % (prop, self.override_conf[prop]) val = self.override_conf[prop] setattr(self.conf, prop, val) if self.conf.use_timezone != '': logger.debug("Setting our timezone to %s" % str(self.conf.use_timezone)) os.environ['TZ'] = self.conf.use_timezone time.tzset() if len(self.modules) != 0: logger.debug("I've got %s modules" % str(self.modules)) # TODO: if scheduler had previous modules instanciated it must clean them! 
self.modules_manager.set_modules(self.modules) self.do_load_modules() # give it an interface # But first remove previous interface if exists if self.ichecks is not None: logger.debug("Deconnecting previous Check Interface") self.http_daemon.unregister(self.ichecks) # Now create and connect it self.ichecks = IChecks(self.sched) self.http_daemon.register(self.ichecks) logger.debug("The Scheduler Interface uri is: %s" % self.uri) # Same for Broks if self.ibroks is not None: logger.debug("Deconnecting previous Broks Interface") self.http_daemon.unregister(self.ibroks) # Create and connect it self.ibroks = IBroks(self.sched) self.http_daemon.register(self.ibroks) logger.info("Loading configuration.") self.conf.explode_global_conf() # we give sched it's conf self.sched.reset() self.sched.load_conf(self.conf) self.sched.load_satellites(self.pollers, self.reactionners) # We must update our Config dict macro with good value # from the config parameters self.sched.conf.fill_resource_macros_names_macros() #print "DBG: got macros", self.sched.conf.macros # Creating the Macroresolver Class & unique instance m = MacroResolver() m.init(self.conf) #self.conf.dump() #self.conf.quick_debug() # Now create the external commander # it's a applyer: it role is not to dispatch commands, # but to apply them e = ExternalCommandManager(self.conf, 'applyer') # Scheduler need to know about external command to # activate it if necessary self.sched.load_external_command(e) # External command need the sched because he can raise checks e.load_scheduler(self.sched) # We clear our schedulers managed (it's us :) ) # and set ourself in it self.schedulers = {self.conf.instance_id: self.sched} # Give the arbiter the data about what I manage # for me it's just my instance_id and my push flavor def what_i_managed(self): if hasattr(self, 'conf'): return {self.conf.instance_id: self.conf.push_flavor} else: return {} # our main function, launch after the init def main(self): try: self.load_config_file() 
self.look_for_early_exit() self.do_daemon_init_and_start() self.load_modules_manager() self.http_daemon.register(self.interface) self.http_daemon.register(self.istats) #self.inject = Injector(self.sched) #self.http_daemon.register(self.inject) self.http_daemon.unregister(self.interface) self.uri = self.http_daemon.uri logger.info("[scheduler] General interface is at: %s" % self.uri) self.do_mainloop() except Exception, exp: logger.critical("I got an unrecoverable error. I have to exit") logger.critical("You can log a bug ticket at https://github.com/naparuba/shinken/issues/new to get help") logger.critical("Back trace of it: %s" % (traceback.format_exc())) raise
class Broker(BaseSatellite): properties = BaseSatellite.properties.copy() properties.update({ 'pidfile': PathProp(default='brokerd.pid'), 'port': IntegerProp(default='7772'), 'local_log': PathProp(default='brokerd.log'), }) def __init__(self, config_file, is_daemon, do_replace, debug, debug_file): super(Broker, self).__init__('broker', config_file, is_daemon, do_replace, debug, debug_file) # Our arbiters self.arbiters = {} # Our pollers and reactionners self.pollers = {} self.reactionners = {} # Modules are load one time self.have_modules = False # Can have a queue of external_commands given by modules # will be processed by arbiter self.external_commands = [] # All broks to manage self.broks = [] # broks to manage # broks raised this turn and that needs to be put in self.broks self.broks_internal_raised = [] self.timeout = 1.0 # Schedulers have some queues. We can simplify the call by adding # elements into the proper queue just by looking at their type # Brok -> self.broks # TODO: better tag ID? # External commands -> self.external_commands def add(self, elt): cls_type = elt.__class__.my_type if cls_type == 'brok': # For brok, we TAG brok with our instance_id elt.instance_id = 0 self.broks_internal_raised.append(elt) return elif cls_type == 'externalcommand': logger.debug("Enqueuing an external command '%s'" % str(ExternalCommand.__dict__)) self.external_commands.append(elt) # Maybe we got a Message from the modules, it's way to ask something # like from now a full data from a scheduler for example. elif cls_type == 'message': # We got a message, great! logger.debug(str(elt.__dict__)) if elt.get_type() == 'NeedData': data = elt.get_data() # Full instance id means: I got no data for this scheduler # so give me all dumbass! 
if 'full_instance_id' in data: c_id = data['full_instance_id'] source = elt.source logger.info( 'The module %s is asking me to get all initial data from the scheduler %d' % (source, c_id)) # so we just reset the connection and the running_id, it will just get all new things try: self.schedulers[c_id]['con'] = None self.schedulers[c_id]['running_id'] = 0 except KeyError: # maybe this instance was not known, forget it logger.warning( "the module %s ask me a full_instance_id for an unknown ID (%d)!" % (source, c_id)) # Maybe a module tells me that it's dead, I must log it's last words... if elt.get_type() == 'ICrash': data = elt.get_data() logger.error( 'the module %s just crash! Please look at the traceback:' % data['name']) logger.error(data['trace']) # The module death will be looked for elsewhere and restarted. # Get the good tabs for links by the kind. If unknown, return None def get_links_from_type(self, type): t = {'scheduler': self.schedulers, 'arbiter': self.arbiters, \ 'poller': self.pollers, 'reactionner': self.reactionners} if type in t: return t[type] return None # Call by arbiter to get our external commands def get_external_commands(self): res = self.external_commands self.external_commands = [] return res # Check if we do not connect to often to this def is_connection_try_too_close(self, elt): now = time.time() last_connection = elt['last_connection'] if now - last_connection < 5: return True return False # initialize or re-initialize connection with scheduler or # arbiter if type == arbiter def pynag_con_init(self, id, type='scheduler'): # Get the good links tab for looping.. links = self.get_links_from_type(type) if links is None: logger.debug('Type unknown for connection! 
%s' % type) return if type == 'scheduler': # If sched is not active, I do not try to init # it is just useless is_active = links[id]['active'] if not is_active: return # If we try to connect too much, we slow down our tests if self.is_connection_try_too_close(links[id]): return # Ok, we can now update it links[id]['last_connection'] = time.time() # DBG: print "Init connection with", links[id]['uri'] running_id = links[id]['running_id'] # DBG: print "Running id before connection", running_id uri = links[id]['uri'] try: socket.setdefaulttimeout(3) links[id]['con'] = Pyro.core.getProxyForURI(uri) socket.setdefaulttimeout(None) except Pyro_exp_pack, exp: # But the multiprocessing module is not compatible with it! # so we must disable it immediately after socket.setdefaulttimeout(None) logger.info("Connection problem to the %s %s: %s" % (type, links[id]['name'], str(exp))) links[id]['con'] = None return try: # initial ping must be quick pyro.set_timeout(links[id]['con'], 5) links[id]['con'].ping() new_run_id = links[id]['con'].get_running_id() # data transfer can be longer pyro.set_timeout(links[id]['con'], 120) # The schedulers have been restarted: it has a new run_id. # So we clear all verifs, they are obsolete now. if new_run_id != running_id: logger.debug("[%s] New running id for the %s %s: %s (was %s)" % (self.name, type, links[id]['name'], new_run_id, running_id)) links[id]['broks'].clear() # we must ask for a new full broks if # it's a scheduler if type == 'scheduler': logger.debug( "[%s] I ask for a broks generation to the scheduler %s" % (self.name, links[id]['name'])) links[id]['con'].fill_initial_broks(self.name) # Ok all is done, we can save this new running id links[id]['running_id'] = new_run_id except Pyro_exp_pack, exp: logger.info("Connection problem to the %s %s: %s" % (type, links[id]['name'], str(exp))) links[id]['con'] = None return
class Shinken(BaseSatellite): properties = BaseSatellite.properties.copy() properties.update({ 'pidfile': PathProp(default='/usr/local/shinken/var/schedulerd.pid'), 'port': IntegerProp(default='7768'), 'local_log': PathProp(default='/usr/local/shinken/var/schedulerd.log'), }) #Create the shinken class: #Create a Pyro server (port = arvg 1) #then create the interface for arbiter #Then, it wait for a first configuration def __init__(self, config_file, is_daemon, do_replace, debug, debug_file): BaseSatellite.__init__(self, 'scheduler', config_file, is_daemon, do_replace, debug, debug_file) self.interface = IForArbiter(self) self.sched = Scheduler(self) self.ichecks = None self.ibroks = None self.must_run = True # Now the interface self.uri = None self.uri2 = None # And possible links for satellites # from now only pollers self.pollers = {} self.reactionners = {} def do_stop(self): self.pyro_daemon.unregister(self.ibroks) self.pyro_daemon.unregister(self.ichecks) super(Shinken, self).do_stop() def compensate_system_time_change(self, difference): """ Compensate a system time change of difference for all hosts/services/checks/notifs """ logger.log('Warning: A system time change of %d has been detected. Compensating...' % difference) # We only need to change some value self.program_start = max(0, self.program_start + difference) # Then we compasate all host/services for h in self.sched.hosts: h.compensate_system_time_change(difference) for s in self.sched.services: s.compensate_system_time_change(difference) # Now all checks and actions for c in self.sched.checks.values(): # Already launch checks should not be touch if c.status == 'scheduled': t_to_go = c.t_to_go ref = c.ref new_t = max(0, t_to_go + difference) # But it's no so simple, we must match the timeperiod new_t = ref.check_period.get_next_valid_time_from_t(new_t) # But maybe no there is no more new value! 
Not good :( # Say as error, with error output if new_t is None: c.state = 'waitconsume' c.exit_status = 2 c.output = '(Error: there is no available check time after time change!)' c.check_time = time.time() c.execution_time = 0 else: c.t_to_go = new_t ref.next_chk = new_t # Now all checks and actions for c in self.sched.actions.values(): # Already launch checks should not be touch if c.status == 'scheduled': t_to_go = c.t_to_go # Event handler do not have ref ref = getattr(c, 'ref', None) new_t = max(0, t_to_go + difference) # Notification should be check with notification_period if c.is_a == 'notification': # But it's no so simple, we must match the timeperiod new_t = ref.notification_period.get_next_valid_time_from_t(new_t) # And got a creation_time variable too c.creation_time = c.creation_time + difference # But maybe no there is no more new value! Not good :( # Say as error, with error output if new_t is None: c.state = 'waitconsume' c.exit_status = 2 c.output = '(Error: there is no available check time after time change!)' c.check_time = time.time() c.execution_time = 0 else: c.t_to_go = new_t def manage_signal(self, sig, frame): # If we got USR1, just dump memory if sig == 10: self.sched.need_dump_memory = True else: # if not, die :) self.sched.die() self.must_run = False Daemon.manage_signal(self, sig, frame) def do_loop_turn(self): # Ok, now the conf self.wait_for_initial_conf() if not self.new_conf: return print "Ok we've got conf" self.setup_new_conf() print "Configuration Loaded" self.sched.run() def setup_new_conf(self): #self.use_ssl = self.app.use_ssl (conf, override_conf, modules, satellites) = self.new_conf self.new_conf = None # In fact it make the scheduler just DIE as a bad guy. # Must manage it better or not manage it at all! 
#if self.cur_conf and self.cur_conf.magic_hash == conf.magic_hash: # print("I received a conf with same hash than me, I skip it.") # return self.conf = conf self.cur_conf = conf self.override_conf = override_conf self.modules = modules self.satellites = satellites #self.pollers = self.app.pollers # Now We create our pollers for pol_id in satellites['pollers']: # Must look if we already have it already_got = pol_id in self.pollers p = satellites['pollers'][pol_id] self.pollers[pol_id] = p uri = pyro.create_uri(p['address'], p['port'], 'Schedulers', self.use_ssl) self.pollers[pol_id]['uri'] = uri self.pollers[pol_id]['last_connexion'] = 0 print "Got a poller", p #First mix conf and override_conf to have our definitive conf for prop in self.override_conf: print "Overriding the property %s with value %s" % (prop, self.override_conf[prop]) val = self.override_conf[prop] setattr(self.conf, prop, val) if self.conf.use_timezone != 'NOTSET': print "Setting our timezone to", self.conf.use_timezone os.environ['TZ'] = self.conf.use_timezone time.tzset() print "I've got modules", self.modules # TODO: if scheduler had previous modules instanciated it must clean them ! 
self.modules_manager.set_modules(self.modules) self.do_load_modules() # And start external ones too self.modules_manager.start_external_instances() # give it an interface # But first remove previous interface if exists if self.ichecks is not None: print "Deconnecting previous Check Interface from pyro_daemon" self.pyro_daemon.unregister(self.ichecks) #Now create and connect it self.ichecks = IChecks(self.sched) self.uri = self.pyro_daemon.register(self.ichecks, "Checks") print "The Checks Interface uri is:", self.uri #Same for Broks if self.ibroks is not None: print "Deconnecting previous Broks Interface from pyro_daemon" self.pyro_daemon.unregister(self.ibroks) #Create and connect it self.ibroks = IBroks(self.sched) self.uri2 = self.pyro_daemon.register(self.ibroks, "Broks") print "The Broks Interface uri is:", self.uri2 print("Loading configuration..") self.conf.explode_global_conf() #we give sched it's conf self.sched.reset() self.sched.load_conf(self.conf) self.sched.load_satellites(self.pollers, self.reactionners) #We must update our Config dict macro with good value #from the config parameters self.sched.conf.fill_resource_macros_names_macros() #print "DBG: got macors", self.sched.conf.macros #Creating the Macroresolver Class & unique instance m = MacroResolver() m.init(self.conf) #self.conf.dump() #self.conf.quick_debug() #Now create the external commander #it's a applyer : it role is not to dispatch commands, #but to apply them e = ExternalCommandManager(self.conf, 'applyer') #Scheduler need to know about external command to #activate it if necessery self.sched.load_external_command(e) #External command need the sched because he can raise checks e.load_scheduler(self.sched) # our main function, launch after the init def main(self): self.load_config_file() self.do_daemon_init_and_start() self.uri2 = self.pyro_daemon.register(self.interface, "ForArbiter") print "The Arbiter Interface is at:", self.uri2 self.do_mainloop()
class Broker(BaseSatellite):
    """Broker daemon: collects broks (monitoring events) from schedulers,
    pollers, reactionners and arbiters, and feeds them to its modules.

    Visible here: construction, the element-routing `add()` method, and the
    (re)connection logic to the other daemons over Pyro.
    """

    properties = BaseSatellite.properties.copy()
    properties.update({
        'pidfile': PathProp(default='/usr/local/shinken/var/brokerd.pid'),
        # NOTE(review): default given as a string; IntegerProp presumably
        # pythonizes it — other daemons in this file mix '7769' and 7769.
        'port': IntegerProp(default='7772'),
        'local_log': PathProp(default='/usr/local/shinken/var/brokerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file):
        super(Broker, self).__init__('broker', config_file, is_daemon, do_replace, debug, debug_file)
        # Our arbiters, keyed by id
        self.arbiters = {}
        # Our pollers and reactionners, keyed by id
        self.pollers = {}
        self.reactionners = {}
        # Modules are loaded only once
        self.have_modules = False
        # Queue of external commands given by modules;
        # the arbiter will take them for processing
        self.external_commands = []
        # All broks to manage
        self.broks = []  # broks to manage
        # broks raised this turn and that need to be put in self.broks
        self.broks_internal_raised = []
        # loop timeout in seconds
        self.timeout = 1.0

    # The other daemons' queues are simplified by routing each element to
    # the proper queue just by looking at its type:
    #   Brok             -> self.broks_internal_raised (tagged, then merged)
    #   ExternalCommand  -> self.external_commands
    #   Message          -> handled inline (module requests)
    def add(self, elt):
        cls_type = elt.__class__.my_type
        if cls_type == 'brok':
            # For a brok, we TAG it with our instance_id
            # TODO: better tag ID? 0 is hard-coded here.
            elt.data['instance_id'] = 0
            self.broks_internal_raised.append(elt)
            return
        elif cls_type == 'externalcommand':
            print "Adding in queue an external command", ExternalCommand.__dict__
            self.external_commands.append(elt)
        # Maybe we got a Message from the modules: their way to ask for
        # something, like a full data dump from a scheduler for example.
        elif cls_type == 'message':
            # We got a message, great!
            print elt.__dict__
            if elt.get_type() == 'NeedData':
                data = elt.get_data()
                # 'full_instance_id' means: "I got no data for this scheduler,
                # so give me everything".
                # so give me all dumbass!
                if 'full_instance_id' in data:
                    c_id = data['full_instance_id']
                    logger.log('A module is asking me to get all initial data from the scheduler %d' % c_id)
                    # So we just reset the connection and the running_id;
                    # pynag_con_init will then re-request all initial broks.
                    self.schedulers[c_id]['con'] = None
                    self.schedulers[c_id]['running_id'] = 0

    # Get the right links table for the given kind of daemon.
    # Returns None if the kind is unknown.
    def get_links_from_type(self, type):
        t = {'scheduler' : self.schedulers, 'arbiter' : self.arbiters, \
             'poller' : self.pollers, 'reactionner' : self.reactionners}
        if type in t:
            return t[type]
        return None

    # Called by the arbiter to fetch (and drain) our external command queue
    def get_external_commands(self):
        res = self.external_commands
        self.external_commands = []
        return res

    # Rate-limit connection attempts: True if the last try on this link
    # was less than 5 seconds ago.
    def is_connexion_try_too_close(self, elt):
        now = time.time()
        last_connexion = elt['last_connexion']
        if now - last_connexion < 5:
            return True
        return False

    # Initialise or re-initialise the Pyro connection to a scheduler,
    # or to another daemon kind selected by `type`.
    # On any connection error the link's 'con' is reset to None so a
    # later call can retry.
    def pynag_con_init(self, id, type='scheduler'):
        # Get the right links table to work on
        links = self.get_links_from_type(type)
        if links is None:
            logger.log('DBG: Type unknown for connexion! %s' % type)
            return

        if type == 'scheduler':
            # If the sched is not active, do not try to init:
            # it would just be useless
            is_active = links[id]['active']
            if not is_active:
                return

        # If we tried to connect too recently, back off for now
        if self.is_connexion_try_too_close(links[id]):
            return

        # Ok, we can now record this attempt
        links[id]['last_connexion'] = time.time()

        # DBG: print "Init connexion with", links[id]['uri']
        running_id = links[id]['running_id']
        # DBG: print "Running id before connexion", running_id
        uri = links[id]['uri']
        # Building the proxy does not connect yet; the ping below does
        links[id]['con'] = Pyro.core.getProxyForURI(uri)

        try:
            # The initial ping must be quick
            pyro.set_timeout(links[id]['con'], 5)
            links[id]['con'].ping()
            new_run_id = links[id]['con'].get_running_id()
            # Data transfers can take longer, so relax the timeout now
            pyro.set_timeout(links[id]['con'], 120)
            # The remote daemon has been restarted: it has a new run_id.
            # So we clear all pending broks for it; they are obsolete now.
            if new_run_id != running_id:
                print "[%s] New running id for the %s %s : %s (was %s)" % (self.name, type, links[id]['name'], new_run_id, running_id)
                links[id]['broks'].clear()
                # We must ask for a new full brok set if it's a scheduler
                if type == 'scheduler':
                    print "[%s] I ask for a broks generation to the scheduler %s" % (self.name, links[id]['name'])
                    links[id]['con'].fill_initial_broks()
                # else:
                #     print "I do nto ask for brok generation"
                links[id]['running_id'] = new_run_id
        except (Pyro.errors.ProtocolError, Pyro.errors.CommunicationError), exp:
            # Network-level failure: drop the proxy, a later turn will retry
            logger.log("[%s] Connexion problem to the %s %s : %s" % (self.name, type, links[id]['name'], str(exp)))
            links[id]['con'] = None
            return
        except Pyro.errors.NamingError, exp:
            # The remote object is not registered (daemon not ready yet)
            logger.log("[%s] the %s '%s' is not initilised : %s" % (self.name, type, links[id]['name'], str(exp)))
            links[id]['con'] = None
            return
class Daemon(object):
    """Base class for all Shinken daemons (scheduler, poller, broker, ...).

    Holds the common lifecycle: configuration loading, daemonization,
    the main loop, module management and shutdown. Several helpers used
    here (check_shm, find_modules_path, set_exit_handler, unlink,
    parse_config_file, relative_paths_to_full, register_local_log,
    do_daemon_init_and_start) are defined outside this excerpt.
    """

    # Common configuration properties; note that boolean/integer defaults
    # are given as strings and are pythonized by the Prop classes.
    properties = {
        'workdir': PathProp(default='/usr/local/shinken/var'),
        'host': StringProp(default='0.0.0.0'),
        'user': StringProp(default='shinken'),
        'group': StringProp(default='shinken'),
        'use_ssl': BoolProp(default='0'),
        'certs_dir': StringProp(default='etc/certs'),
        'ca_cert': StringProp(default='etc/certs/ca.pem'),
        'server_cert': StringProp(default='etc/certs/server.pem'),
        'use_local_log': BoolProp(default='0'),
        'hard_ssl_name_check': BoolProp(default='0'),
        'idontcareaboutsecurity': BoolProp(default='0'),
        'spare': BoolProp(default='0')
    }

    def __init__(self, name, config_file, is_daemon, do_replace, debug, debug_file):
        # Fail fast if shared memory is not usable (defined outside this excerpt)
        self.check_shm()

        self.name = name
        self.config_file = config_file
        self.is_daemon = is_daemon
        self.do_replace = do_replace
        self.debug = debug
        self.debug_file = debug_file
        self.interrupted = False

        # Track time
        now = time.time()
        self.program_start = now
        self.t_each_loop = now  # used to track system time change
        self.sleep_time = 0.0  # used to track the time we wait

        self.pyro_daemon = None

        # Log init
        self.log = logger
        self.log.load_obj(self)

        self.new_conf = None  # used by controller to push conf
        self.cur_conf = None

        # Flag raised (e.g. by a signal) to request a memory dump on the
        # next loop turn
        self.need_dump_memory = False

        # Keep a trace of the local_log file descriptor if needed
        self.local_log_fd = None

        self.modules_manager = ModulesManager(name, self.find_modules_path(), [])

        os.umask(UMASK)
        # Install signal handlers last, once state is in place
        self.set_exit_handler()

    # Orderly shutdown: save retention via modules (except for the
    # scheduler, whose real object already did it), stop all modules,
    # shut down Pyro, and at last close the local log file if needed.
    def do_stop(self):
        if self.modules_manager:
            # We save what we can, but NOT for the scheduler:
            # the current sched object here is a dummy one
            # and the old one already did it!
            if not hasattr(self, 'sched'):
                self.hook_point('save_retention')
            # And we quit
            logger.log('Stopping all modules')
            self.modules_manager.stop_all()
        if self.pyro_daemon:
            pyro.shutdown(self.pyro_daemon)  #.shutdown(True)
        logger.quit()

    # Full stop: remove the pid file first, then tear everything down
    # and exit the process.
    def request_stop(self):
        self.unlink()  ## unlink first
        self.do_stop()
        print("Exiting")
        sys.exit(0)

    # One iteration of the daemon's work; subclasses must override.
    def do_loop_turn(self):
        raise NotImplementedError()

    # Main loop for nearly all daemons
    # (the scheduler is not managed by it :'( )
    def do_mainloop(self):
        while True:
            self.do_loop_turn()
            # If we were asked to dump memory, do it now
            if self.need_dump_memory:
                self.dump_memory()
                self.need_dump_memory = False
            # Maybe we were asked to die; if so, do it :)
            if self.interrupted:
                break
        self.request_stop()

    # Load and initialise all configured modules, then log their names.
    def do_load_modules(self):
        self.modules_manager.load_and_init()
        self.log.log("I correctly loaded the modules : [%s]" % (','.join([inst.get_name() for inst in self.modules_manager.instances])))

    def add(self, elt):
        """ Dummy method for adding broker to this daemon """
        pass

    # Best-effort heap dump via guppy; a missing guppy is only logged,
    # never fatal.
    def dump_memory(self):
        logger.log("I dump my memory, it can ask some seconds to do")
        try:
            from guppy import hpy
            hp = hpy()
            logger.log(hp.heap())
        except ImportError:
            logger.log('I do not have the module guppy for memory dump, please install it')

    # Parse the config file, resolve relative paths against its directory,
    # then start local-file logging if configured.
    def load_config_file(self):
        self.parse_config_file()
        if self.config_file is not None:
            # Some paths can be relative. We must build full paths, taking
            # the config file location as the reference
            self.relative_paths_to_full(os.path.dirname(self.config_file))
        # Then start to log everything into the local file, if asked to
        self.register_local_log()

    # chdir into the configured working directory; any failure is wrapped
    # in InvalidWorkDir so callers get a daemon-specific error.
    def change_to_workdir(self):
        try:
            os.chdir(self.workdir)
        except Exception, e:
            raise InvalidWorkDir(e)
        print("Successfully changed to workdir: %s" % (self.workdir))