def get_instances(self):
    """Create, init and then returns the list of module instances that the caller needs.

    If an instance can't be created or init'ed then only log is done and
    that instance is skipped. The previous modules instance(s), if any,
    are all cleaned.

    Arbiter call this method with start_external=False

    :return: module instances list
    :rtype: list
    """
    self.clear_instances()

    for (mod_conf, module) in self.modules_assoc:
        mod_conf.properties = module.properties.copy()
        try:
            instance = module.get_instance(mod_conf)
            if not isinstance(instance, BaseModule):
                raise TypeError('Returned instance is not of type BaseModule (%s) !'
                                % type(instance))
        except Exception as err:
            logger.error("The module %s raised an exception %s, I remove it! traceback=%s",
                         mod_conf.get_name(), err, traceback.format_exc())
        else:
            # Tell the instance which daemon type loaded it
            instance.set_loaded_into(self.modules_type)
            self.instances.append(instance)

    # External modules are initialized later, when they are started
    for instance in (inst for inst in self.instances if not inst.is_external):
        if not self.try_instance_init(instance):
            # Init failed: keep it aside and retry it later
            logger.warning("The module '%s' failed to init, I will try to restart it later",
                           instance.get_name())
            self.to_restart.append(instance)

    return self.instances
def try_best_load(cls, name, package=None):
    """Try to load module in the bast way possible (using importlib)

    :param name: module name to load
    :type name: str
    :param package: package name to load module from
    :type package:
    :return: None | module
    :rtype:
    """
    full_name = '%s.%s' % (package, name) if package else name
    try:
        mod = importlib.import_module(name, package)
    except Exception as err:
        logger.warning("Cannot import %s : %s", full_name, err)
        return None
    # We are only happy with the module if it exposes both a
    # 'properties' attribute and a 'get_instance' callable:
    if not (hasattr(mod, 'properties') and hasattr(mod, 'get_instance')):
        return None
    return mod
def do_pynag_con_init(self, s_id):
    """Initialize a connection with scheduler having '_id'.

    Return the new connection to the scheduler if it succeeded,
    else: any error OR sched is inactive: return None.
    NB: if sched is inactive then None is directly returned.

    :param s_id: scheduler s_id to connect to
    :type s_id: int
    :return: scheduler connection object or None
    :rtype: alignak.http.client.HTTPClient
    """
    sched = self.schedulers[s_id]
    if not sched['active']:
        # Inactive scheduler: nothing to connect to
        return None

    sname = sched['name']
    uri = sched['uri']
    timeout = sched['timeout']
    data_timeout = sched['data_timeout']

    logger.info("[%s] Init connection with %s at %s (%ss,%ss)",
                self.name, sname, uri, timeout, data_timeout)

    try:
        sch_con = sched['con'] = HTTPClient(
            uri=uri, strong_ssl=sched['hard_ssl_name_check'],
            timeout=timeout, data_timeout=data_timeout)
    except HTTPEXCEPTIONS as exp:  # BUGFIX: 'except X, exp' is invalid in Python 3
        logger.warning("[%s] Scheduler %s is not initialized or has network problem: %s",
                       self.name, sname, str(exp))
        sched['con'] = None
        return None

    # BUGFIX: the connection was created but never returned, although the
    # docstring promises the connection object on success
    return sch_con
def create_pack(self, buf, name):
    """Create a pack from the JSON content of a configuration file
    and register it in this collection.

    :param buf: buffer (raw JSON content of the pack file)
    :type buf: str
    :param name: name of file
    :type name: str
    :return: None
    """
    if not json:
        # The json lib could not be imported at module load time
        logger.warning("[Pack] cannot load the pack file '%s': missing json lib", name)
        return
    # Ok, go compile the code
    try:
        json_dump = json.loads(buf)
        if 'name' not in json_dump:
            logger.error("[Pack] no name in the pack '%s'", name)
            return
        pack = Pack({})
        pack.pack_name = json_dump['name']
        pack.description = json_dump.get('description', '')
        pack.macros = json_dump.get('macros', {})
        pack.templates = json_dump.get('templates', [pack.pack_name])
        pack.path = json_dump.get('path', 'various/')
        pack.doc_link = json_dump.get('doc_link', '')
        pack.services = json_dump.get('services', {})
        pack.commands = json_dump.get('commands', [])
        # Normalize the pack path so it always ends with a separator
        if not pack.path.endswith('/'):
            pack.path += '/'
        # Ok, add it
        self[pack._id] = pack
    except ValueError as exp:  # BUGFIX: 'except X, exp' is invalid in Python 3
        logger.error("[Pack] error in loading pack file '%s': '%s'", name, exp)
def is_correct(self):
    """Check if the macromodulation is valid and has all required properties defined.

    :return: True if valid, otherwise False
    :rtype: bool
    """
    state = True
    cls = self.__class__

    # Raise all previously saved errors, like unknown commands or timeperiods
    if self.configuration_errors:
        state = False
        for err in self.configuration_errors:
            logger.error("[item::%s] %s", self.get_name(), err)

    for prop, entry in cls.properties.items():
        if prop in cls._special_properties:
            continue
        if entry.required and not hasattr(self, prop):
            logger.warning("[macromodulation::%s] %s property not set",
                           self.get_name(), prop)
            state = False  # Bad boy...

    # Ok just put None as modulation_period, means 24x7
    if not hasattr(self, 'modulation_period'):
        self.modulation_period = None

    return state
def try_very_bad_load(cls, mod_dir):
    """Try to load module in a bad way (Inserting mod_dir to sys.path)
    then try to import module (module.py file) in this directory with importlib

    :param mod_dir: module directory to load
    :type mod_dir: str
    :return: None
    """
    # Keep any previously imported 'module' so we can restore it afterwards
    prev_module = sys.modules.get('module')  # cache locally any previously imported 'module' ..
    logger.warning(
        "Trying to load %r as an (very-)old-style alignak \"module\" : "
        "by adding its path to sys.path. This can be (very) bad in case "
        "of name conflicts within the files part of %s and others "
        "top-level python modules; I'll try to limit that.",
        # by removing the mod_dir from sys.path after while.
        mod_dir, mod_dir
    )
    sys.path.insert(0, mod_dir)
    try:
        return importlib.import_module('module')
    except Exception as err:
        logger.exception("Could not import bare 'module.py' from %s : %s", mod_dir, err)
        return
    finally:
        # Always undo the sys.path hack, whether the import worked or not
        sys.path.remove(mod_dir)
        if prev_module is not None:  # and restore it after we have loaded our one (or not)
            sys.modules['module'] = prev_module
def manage_host_check_result_brok(self, b):
    """Manage a host check result brok (we UPDATE data info with this):
    build a Graphite metrics packet from the brok perfdata and send it.

    :param b: host check result brok
    :type b: object (Brok-like, with a 'data' dict)
    :return: None
    """
    host_name = b.data['host_name']
    logger.debug("[Graphite] host check result: %s", host_name)

    # If host initial status brok has not been received, ignore ...
    if host_name not in self.hosts_cache:
        # BUGFIX: the message wrongly said "service check result"
        logger.warning("[Graphite] received host check result for an unknown host: %s",
                       host_name)
        return

    # Decode received metrics
    couples = self.get_metric_and_value('host_check', b.data['perf_data'])

    # If no values, we can exit now
    if len(couples) == 0:
        logger.debug("[Graphite] no metrics to send ...")
        return

    # Custom hosts variables
    hname = self.illegal_char_hostname.sub('_', host_name)
    if '_GRAPHITE_GROUP' in self.hosts_cache[host_name]:
        hname = ".".join((self.hosts_cache[host_name]['_GRAPHITE_GROUP'], hname))
    if '_GRAPHITE_PRE' in self.hosts_cache[host_name]:
        hname = ".".join((self.hosts_cache[host_name]['_GRAPHITE_PRE'], hname))
    if self.hostcheck:
        hname = '.'.join((hname, self.hostcheck))

    # Checks latency
    if self.ignore_latency_limit >= b.data['latency'] > 0:
        check_time = int(b.data['last_chk']) - int(b.data['latency'])
        # BUGFIX: this is a host brok, it has no 'service_description' key;
        # referencing it raised a KeyError. Log the host name instead.
        logger.info("[Graphite] Ignoring latency for host %s. Latency : %s",
                    host_name, b.data['latency'])
    else:
        check_time = int(b.data['last_chk'])

    # Graphite data source
    if self.graphite_data_source:
        path = '.'.join((hname, self.graphite_data_source))
    else:
        path = hname

    # Send a bulk of all metrics at once
    lines = []
    for (metric, value) in couples:
        lines.append("%s.%s %s %d" % (path, metric, value, check_time))
    lines.append("\n")
    packet = '\n'.join(lines)
    self.send_packet(packet)
def get_start_and_end_time(self, ref=None):
    """Generic placeholder to get a (start time, end time) couple.
    Concrete daterange classes must override this method.

    :param ref: time in seconds
    :type ref: int
    :return: None
    """
    logger.warning("Calling function get_start_and_end_time which is not implemented")
    raise NotImplementedError()
def push_external_commands_to_schedulers(self):
    """Send a HTTP request to the schedulers (POST /run_external_commands)
    with external command list if the receiver is in direct routing.
    If not in direct_routing just clear the unprocessed_external_command list
    and return

    :return: None
    """
    # If we are not in a direct routing mode, just bailout after
    # faking resolving the commands
    if not self.direct_routing:
        self.external_commands.extend(self.unprocessed_external_commands)
        self.unprocessed_external_commands = []
        return

    commands_to_process = self.unprocessed_external_commands
    self.unprocessed_external_commands = []

    # Now get all external commands and put them into the
    # good schedulers
    for ext_cmd in commands_to_process:
        self.external_command.resolve_command(ext_cmd)

    # Now for all alive schedulers, send the commands
    for sched_id in self.schedulers:
        sched = self.schedulers[sched_id]
        extcmds = sched['external_commands']
        cmds = [extcmd.cmd_line for extcmd in extcmds]
        con = sched.get('con', None)
        sent = False
        if not con:
            logger.warning("The scheduler is not connected %s", sched)
            self.pynag_con_init(sched_id)
            con = sched.get('con', None)

        # If there are commands and the scheduler is alive
        if len(cmds) > 0 and con:
            logger.debug("Sending %d commands to scheduler %s", len(cmds), sched)
            try:
                con.post('run_external_commands', {'cmds': cmds})
                sent = True
            # Not connected or sched is gone
            except (HTTPEXCEPTIONS, KeyError) as exp:  # BUGFIX: py3-invalid 'except X, exp'
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
                self.pynag_con_init(sched_id)
                return
            except AttributeError as exp:  # the scheduler must not be initialized
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
            except Exception as exp:
                logger.error("A satellite raised an unknown exception: %s (%s)",
                             exp, type(exp))
                raise

        # BUGFIX: 'sent' was computed but never used, so successfully sent
        # commands stayed queued and were sent again on every loop turn
        if sent:
            sched['external_commands'] = []
def get_start_and_end_time(self, ref=None):
    """Generic function to get start time and end time.
    This base implementation always raises: subclasses provide the real one.

    :param ref: time in seconds
    :type ref: int
    :return: None
    """
    logger.warning(
        "Calling function get_start_and_end_time which is not implemented")
    raise NotImplementedError()
def check_bad_dispatch(self):
    """Check if we have a bad dispatch
    For example : a spare started but the master was still alive
    We need ask the spare to wait a new conf

    :return: None
    """
    for elt in self.elements:
        if hasattr(elt, 'conf'):
            # If element has a conf, I do not care, it's a good dispatch
            # If dead: I do not ask it something, it won't respond..
            if elt.conf is None and elt.reachable:
                if elt.have_conf():
                    logger.warning("The element %s have a conf and should "
                                   "not have one! I ask it to idle now",
                                   elt.get_name())
                    elt.active = False
                    elt.wait_new_conf()
                    # I do not care about order not send or not. If not,
                    # The next loop will resent it

    # I ask satellites which sched_id they manage. If I do not agree, I ask
    # them to remove it
    for satellite in self.satellites:
        kind = satellite.get_my_type()
        if satellite.reachable:
            cfg_ids = satellite.managed_confs  # what_i_managed()
            # I do not care about satellites that do nothing, they already
            # do what I want :)
            if len(cfg_ids) != 0:
                id_to_delete = []
                for cfg_id in cfg_ids:
                    # Ok, we search for realms that have the conf
                    for realm in self.realms:
                        if cfg_id in realm.confs:
                            # Ok we've got the realm, we check its to_satellites_managed_by
                            # to see if reactionner is in. If not, we remove he sched_id for it
                            if satellite not in realm.to_satellites_managed_by[kind][cfg_id]:
                                id_to_delete.append(cfg_id)
                # Maybe we removed all cfg_id of this reactionner
                # We can put it idle, no active and wait_new_conf
                if len(id_to_delete) == len(cfg_ids):
                    satellite.active = False
                    logger.info("I ask %s to wait a new conf", satellite.get_name())
                    satellite.wait_new_conf()
                else:
                    # It is not fully idle, just less cfg
                    for r_id in id_to_delete:
                        logger.info("I ask to remove configuration N%d from %s",
                                    r_id, satellite.get_name())
                        # BUGFIX: was remove_from_conf(id) — passing the *builtin*
                        # 'id' function instead of the configuration id to remove
                        satellite.remove_from_conf(r_id)
def get_services(self):
    """Get services from alignak_backend and append them,
    normalized, to self.config['services'].

    :return: None
    """
    backend = Backend()
    all_services = backend.method_get(
        self.endpoint('service?embedded={"use":1,"host_name":1,'
                      '"servicegroups":1,"check_command":1,'
                      '"check_period":1,"notification_period":1,'
                      '"contacts":1,"contact_groups":1,'
                      '"escalations":1,"maintenance_period":1,'
                      '"service_dependencies":1}'))
    logger.warning("[Alignak Backend Arbit] Got %d services", len(all_services))
    for service in all_services:
        service['imported_from'] = 'alignakbackend'
        # check_command: flatten the embedded command into its name
        if 'check_command' in service:
            if service['check_command'] is None:
                del service['check_command']
            elif 'command_name' in service['check_command']:
                service['check_command'] = service['check_command']['command_name']
            else:
                del service['check_command']
        # Append the '!'-separated command arguments, if any
        if 'check_command_args' in service:
            if 'check_command' not in service:
                service['check_command'] = ''
            else:
                service['check_command'] += '!'
            service['check_command'] += service['check_command_args']
            del service['check_command_args']
        # Single-valued relations
        for field, attr in (('use', 'name'),
                            ('host_name', 'host_name'),
                            ('check_period', 'timeperiod_name'),
                            ('notification_period', 'timeperiod_name'),
                            ('maintenance_period', 'timeperiod_name')):
            self.single_relation(service, field, attr)
        # Multi-valued relations
        for field, attr in (('servicegroups', 'servicegroup_name'),
                            ('contacts', 'contact_name'),
                            ('contact_groups', 'contactgroup_name'),
                            ('escalations', 'escalation_name'),
                            ('service_dependencies', 'service_name')):
            self.multiple_relation(service, field, attr)
        self.clean_unusable_keys(service)
        self.config['services'].append(service)
def get_hosts(self):
    """Get hosts from alignak_backend and append them,
    normalized, to self.config['hosts'].

    :return: None
    """
    backend = Backend()
    all_hosts = backend.method_get(
        self.endpoint('host?embedded={"use":1,"parents":1,'
                      '"hostgroups":1,"check_command":1,'
                      '"contacts":1,"contact_groups":1,'
                      '"escalations":1,"check_period":1,'
                      '"notification_period":1}'))
    logger.warning("[Alignak Backend Arbit] Got %d hosts", len(all_hosts))
    for host in all_hosts:
        host['imported_from'] = 'alignakbackend'
        # use
        self.single_relation(host, 'use', 'name')
        # check_command: flatten the embedded command into its name
        if 'check_command' in host:
            if host['check_command'] is None:
                host['check_command'] = ''
            elif 'command_name' in host['check_command']:
                host['check_command'] = host['check_command']['command_name']
            else:
                host['check_command'] = ''
        # Append the '!'-separated command arguments, if any
        if 'check_command_args' in host:
            if 'check_command' not in host:
                host['check_command'] = ''
            else:
                host['check_command'] += '!'
            host['check_command'] += host['check_command_args']
            del host['check_command_args']
        # Single-valued relations
        for field, attr in (('check_period', 'timeperiod_name'),
                            ('notification_period', 'timeperiod_name')):
            self.single_relation(host, field, attr)
        # Multi-valued relations
        for field, attr in (('parents', 'host_name'),
                            ('hostgroups', 'hostgroup_name'),
                            ('contacts', 'contact_name'),
                            ('contact_groups', 'contactgroup_name'),
                            ('escalations', 'escalation_name')):
            self.multiple_relation(host, field, attr)
        if host['realm'] is None:
            del host['realm']
        self.backend_ids['hosts'][host['_id']] = host['host_name']
        self.clean_unusable_keys(host)
        self.config['hosts'].append(host)
def push_external_commands_to_schedulers(self):
    """Send a HTTP request to the schedulers (POST /run_external_commands)
    with external command list if the receiver is in direct routing.
    If not in direct_routing just clear the unprocessed_external_command list
    and return

    :return: None
    """
    # Not in direct routing: fake resolving the commands and bail out
    if not self.direct_routing:
        self.external_commands.extend(self.unprocessed_external_commands)
        self.unprocessed_external_commands = []
        return

    commands_to_process = self.unprocessed_external_commands
    self.unprocessed_external_commands = []

    # Resolve every command so it lands in the right scheduler queue
    for ext_cmd in commands_to_process:
        self.external_command.resolve_command(ext_cmd)

    # Now for all alive schedulers, send the commands
    for sched_id in self.schedulers:
        sched = self.schedulers[sched_id]
        extcmds = sched['external_commands']
        cmds = [extcmd.cmd_line for extcmd in extcmds]
        con = sched.get('con', None)
        sent = False
        if not con:
            logger.warning("The scheduler is not connected %s", sched)
            self.pynag_con_init(sched_id)
            con = sched.get('con', None)

        # If there are commands and the scheduler is alive
        if len(cmds) > 0 and con:
            logger.debug("Sending %d commands to scheduler %s", len(cmds), sched)
            try:
                con.post('run_external_commands', {'cmds': cmds})
                sent = True
            # Not connected or sched is gone
            except (HTTPEXCEPTIONS, KeyError) as exp:  # BUGFIX: py3-invalid 'except X, exp'
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
                self.pynag_con_init(sched_id)
                return
            except AttributeError as exp:  # the scheduler must not be initialized
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
            except Exception as exp:
                logger.error("A satellite raised an unknown exception: %s (%s)",
                             exp, type(exp))
                raise

        # BUGFIX: 'sent' was never used, so commands successfully posted to
        # the scheduler were kept in its queue and re-sent forever
        if sent:
            sched['external_commands'] = []
def execute_query(self, query):
    """Execute a query against an Oracle database.
    Commit on success; integrity errors are logged, not raised.

    :param query: the query
    :type query: str
    :return: None
    """
    logger.debug("[DBOracle] Execute Oracle query %s\n", query)
    try:
        self.db_cursor.execute(query)
        self.db.commit()
    except IntegrityError_exp as exp:  # BUGFIX: 'except X, exp' is invalid in Python 3
        logger.warning("[DBOracle] Warning: a query raise an integrity error: %s, %s",
                       query, exp)
def add(self, elt):
    """Add elt to this broker

    Original comment : Schedulers have some queues. We can simplify the call
    by adding elements into the proper queue just by looking at their type
    Brok -> self.broks
    TODO: better tag ID?
    External commands -> self.external_commands

    :param elt: object to add (dispatched on its class 'my_type')
    :type elt: object
    :return: None
    """
    cls_type = elt.__class__.my_type
    if cls_type == 'brok':
        # For brok, we TAG brok with our instance_id
        elt.instance_id = 0
        self.broks_internal_raised.append(elt)
        return
    elif cls_type == 'externalcommand':
        # NOTE(review): this logs the ExternalCommand *class* __dict__, not the
        # received command -- looks unintentional, confirm before changing
        logger.debug("Enqueuing an external command '%s'", str(ExternalCommand.__dict__))
        self.external_commands.append(elt)
    # Maybe we got a Message from the modules, it's way to ask something
    # like from now a full data from a scheduler for example.
    elif cls_type == 'message':
        # We got a message, great!
        logger.debug(str(elt.__dict__))
        if elt.get_type() == 'NeedData':
            data = elt.get_data()
            # Full instance id means: I got no data for this scheduler
            # so give me all dumbass!
            if 'full_instance_id' in data:
                c_id = data['full_instance_id']
                source = elt.source
                logger.info('The module %s is asking me to get all initial data '
                            'from the scheduler %d', source, c_id)
                # so we just reset the connection and the running_id,
                # it will just get all new things
                try:
                    self.schedulers[c_id]['con'] = None
                    self.schedulers[c_id]['running_id'] = 0
                except KeyError:  # maybe this instance was not known, forget it
                    logger.warning("the module %s ask me a full_instance_id "
                                   "for an unknown ID (%d)!", source, c_id)
        # Maybe a module tells me that it's dead, I must log it's last words...
        if elt.get_type() == 'ICrash':
            data = elt.get_data()
            logger.error('the module %s just crash! Please look at the traceback:',
                         data['name'])
            logger.error(data['trace'])
def linkify_hg_by_realms(self, realms):
    """Link hostgroups to their realm object and propagate the realm
    to member hosts, without overriding a host realm value already set.

    More than an explode function, but we need to already have members so...
    Will be really linkify just after.

    :param realms: object Realms
    :type realms: object
    :return: None
    """
    # Now we explode the realm value if we've got one
    # The group realm must not override a host one (warning?)
    for hostgroup in self:
        if not hasattr(hostgroup, 'realm'):
            continue
        # Maybe the value is void?
        if not hostgroup.realm.strip():
            continue

        realm = realms.find_by_name(hostgroup.realm.strip())
        if realm is None:
            err = "the hostgroup %s got an unknown realm '%s'" % (hostgroup.get_name(),
                                                                  hostgroup.realm)
            hostgroup.configuration_errors.append(err)
            hostgroup.realm = None
            continue

        hostgroup.realm = realm
        logger.debug("[hostgroups] %s is in %s realm",
                     hostgroup.get_name(), realm.get_name())

        for host in hostgroup:
            if host is None:
                continue
            if host.realm is None or host.got_default_realm:
                # Host has no explicit realm: inherit the hostgroup one
                logger.debug(
                    "[hostgroups] apply a realm %s to host %s from a hostgroup "
                    "rule (%s)", hostgroup.realm.get_name(),
                    host.get_name(), hostgroup.get_name())
                host.realm = hostgroup.realm
            elif host.realm != hostgroup.realm:
                logger.warning(
                    "[hostgroups] host %s it not in the same realm than it's "
                    "hostgroup %s", host.get_name(), hostgroup.get_name())
def linkify_hg_by_realms(self, realms):
    """Resolve each hostgroup realm name into a realm object and spread
    it to member hosts that do not already carry an explicit realm.

    :param realms: object Realms
    :type realms: object
    :return: None
    """
    for grp in self:
        if not hasattr(grp, 'realm'):
            continue
        raw_name = grp.realm.strip()
        if not raw_name:
            # Void value, nothing to link
            continue

        tgt_realm = realms.find_by_name(raw_name)
        if tgt_realm is None:
            err = "the hostgroup %s got an unknown realm '%s'" % \
                  (grp.get_name(), grp.realm)
            grp.configuration_errors.append(err)
            grp.realm = None
            continue

        grp.realm = tgt_realm
        logger.debug("[hostgroups] %s is in %s realm",
                     grp.get_name(), tgt_realm.get_name())

        for member in grp:
            if member is None:
                continue
            if member.realm is None or member.got_default_realm:
                # Default realm on the host: the hostgroup rule wins
                logger.debug(
                    "[hostgroups] apply a realm %s to host %s from a hostgroup "
                    "rule (%s)", grp.realm.get_name(),
                    member.get_name(), grp.get_name())
                member.realm = grp.realm
            elif member.realm != grp.realm:
                logger.warning(
                    "[hostgroups] host %s it not in the same realm than it's "
                    "hostgroup %s", member.get_name(), grp.get_name())
def load(self):
    """Try to import the requested modules ; put the imported modules in
    self.imported_modules. The previous imported modules, if any, are cleaned
    before.

    :return: None
    """
    # Make both the configured modules path and the alignak built-in
    # modules directory importable
    if self.modules_path not in sys.path:
        sys.path.append(self.modules_path)

    alignak_modules_path = sys.modules['alignak'].__path__[0] + '/modules'
    if alignak_modules_path not in sys.path:
        sys.path.append(alignak_modules_path)

    modules_paths = [alignak_modules_path, self.modules_path]
    modules_files = []
    # Every sub-directory of a modules path is a module candidate
    for path in modules_paths:
        for fname in listdir(path):
            if isdir(join(path, fname)):
                modules_files.append({'path': path, 'name': fname})

    # Forget previously imported modules before re-importing
    del self.imported_modules[:]
    for module in modules_files:
        mod_file = abspath(join(module['path'], module['name'], 'module.py'))
        mod_dir = os.path.normpath(os.path.dirname(mod_file))
        mod = self.try_load(module['name'], mod_dir)
        if not mod:
            continue
        try:
            is_our_type = self.modules_type in mod.properties['daemons']
        except Exception as err:
            logger.warning("Bad module file for %s : cannot check its properties['daemons']"
                           "attribute : %s", mod_file, err)
        else:
            # We want to keep only the modules of our type
            if is_our_type:
                self.imported_modules.append(mod)

    # Now we want to find in theses modules the ones we are looking for
    del self.modules_assoc[:]
    for mod_conf in self.modules:
        module_type = uniform_module_type(mod_conf.module_type)
        # First imported module of the matching type wins
        for module in self.imported_modules:
            if uniform_module_type(module.properties['type']) == module_type:
                self.modules_assoc.append((mod_conf, module))
                break
        else:
            # No module is suitable, we emit a Warning
            logger.warning("The module type %s for %s was not found in modules!",
                           module_type, mod_conf.get_name())
def load(self, mod_confs):
    """Try to import the requested modules; associate each module
    configuration with its imported python module in self.modules_assoc.
    """
    # Forget any previous association before rebuilding the list
    del self.modules_assoc[:]
    for mod_conf in mod_confs:
        try:
            imported = importlib.import_module(mod_conf.python_name)
            self.find_module_properties_and_get_instance(imported, mod_conf.python_name)
            self.modules_assoc.append((mod_conf, imported))
        except ImportError:
            logger.warning("Module %s (%s) can't be loaded, not found",
                           mod_conf.python_name, mod_conf.module_alias)
        except AttributeError:
            logger.warning("Module %s (%s) can't be loaded because attributes errors",
                           mod_conf.python_name, mod_conf.module_alias)
def get_commands(self):
    """Get commands from alignak_backend and append them,
    normalized, to self.config['commands'].

    :return: None
    """
    backend = Backend()
    all_commands = backend.method_get(self.endpoint('command?embedded={"use":1}'))
    logger.warning("[Alignak Backend Arbit] Got %d commands", len(all_commands))
    for cmd in all_commands:
        cmd['imported_from'] = 'alignakbackend'
        # use
        self.single_relation(cmd, 'use', 'name')
        # Remember the backend id -> command name mapping
        self.backend_ids['commands'][cmd['_id']] = cmd['command_name']
        self.clean_unusable_keys(cmd)
        self.config['commands'].append(cmd)
def is_correct(self):
    """Check that every daterange of the timeperiod is valid.

    :return: False if at least one daterange is invalid
    :rtype: bool
    """
    valid = True
    for daterange in self.dateranges:
        if not daterange.is_correct():
            logger.error("[timeperiod::%s] invalid daterange ", self.get_name())
            valid = False

    # Warn about non correct entries
    for entry in self.invalid_entries:
        logger.warning("[timeperiod::%s] invalid entry '%s'", self.get_name(), entry)
    return valid
def start_external_instances(self, late_start=False):
    """Launch external instances that are load correctly

    :param late_start: If late_start, don't look for last_init_try
    :type late_start: bool
    :return: None
    """
    external_instances = [i for i in self.instances if i.is_external]
    for inst in external_instances:
        # But maybe the init failed a bit, so bypass this ones from now
        if not self.try_instance_init(inst, late_start=late_start):
            logger.warning("The module '%s' failed to init, I will try to restart it later",
                           inst.get_name())
            self.to_restart.append(inst)
            continue
        # ok, init succeed
        logger.info("Starting external module %s", inst.get_name())
        inst.start()
def add_failed_check_attempt(self, reason=''):
    """Set reachable=False and count one more failed attempt;
    when the maximum is reached, the satellite is set dead.

    :param reason: the reason of adding an attemps (stack trace sometimes)
    :type reason: str
    :return: None
    """
    self.reachable = False
    # Increment, clamped to the configured maximum
    self.attempt = min(self.attempt + 1, self.max_check_attempts)

    # Don't need to warn again and again if the satellite is already dead
    if self.alive:
        logger.warning("Add failed attempt to %s (%d/%d) %s",
                       self.get_name(), self.attempt, self.max_check_attempts, reason)

    # check when we just go HARD (dead)
    if self.attempt == self.max_check_attempts:
        self.set_dead()
def start_external_instances(self, late_start=False):
    """Launch the external module instances that loaded correctly.

    :param late_start: If late_start, don't look for last_init_try
    :type late_start: bool
    :return: None
    """
    for inst in self.instances:
        if not inst.is_external:
            continue
        # Maybe the init failed a bit, so bypass this one from now
        if not self.try_instance_init(inst, late_start=late_start):
            logger.warning(
                "The module '%s' failed to init, I will try to restart it later",
                inst.get_name())
            self.to_restart.append(inst)
        else:
            # Init succeeded: start the external process
            logger.info("Starting external module %s", inst.get_name())
            inst.start()
def load(self, mod_confs):
    """Import each requested module and record the
    (configuration, module) couple in self.modules_assoc.
    """
    # Start from an empty association list
    del self.modules_assoc[:]
    for mod_conf in mod_confs:
        try:
            module = importlib.import_module(mod_conf.python_name)
            self.find_module_properties_and_get_instance(
                module, mod_conf.python_name)
            self.modules_assoc.append((mod_conf, module))
        except ImportError:
            # The python module simply does not exist
            logger.warning("Module %s (%s) can't be loaded, not found",
                           mod_conf.python_name, mod_conf.module_alias)
        except AttributeError:
            # The module exists but misses an expected attribute
            logger.warning(
                "Module %s (%s) can't be loaded because attributes errors",
                mod_conf.python_name, mod_conf.module_alias)
def set_dead(self):
    """Set the satellite into dead state:

    * Alive -> False
    * con -> None

    Create an update Brok, but only on the transition alive -> dead.

    :return:None
    """
    was_alive = self.alive
    self.alive = False
    self.con = None

    # We are dead now: raise a brok to say it, once
    if was_alive:
        logger.warning("Setting the satellite %s to a dead state.", self.get_name())
        self.broks.append(self.get_update_status_brok())
def add_failed_check_attempt(self, reason=""):
    """Mark the satellite unreachable and record one more failed attempt;
    reaching the maximum switches it to the dead state.

    :param reason: the reason of adding an attemps (stack trace sometimes)
    :type reason: str
    :return: None
    """
    self.reachable = False
    limit = self.max_check_attempts
    self.attempt = min(self.attempt + 1, limit)

    # No need to warn again and again if the satellite is already dead
    if self.alive:
        logger.warning(
            "Add failed attempt to %s (%d/%d) %s",
            self.get_name(), self.attempt, limit, reason
        )

    # Did we just go HARD (dead)?
    if self.attempt == limit:
        self.set_dead()
def stop_process(self):
    """Request the module process to stop and release it

    :return: None
    """
    if not self.process:
        return
    logger.info("I'm stopping module %r (pid=%s)", self.get_name(), self.process.pid)
    self.process.terminate()
    self.process.join(timeout=1)
    if self.process.is_alive():
        # Plain terminate was not enough, escalate
        logger.warning("%r is still alive normal kill, I help it to die", self.get_name())
        self.kill()
        self.process.join(1)
        if self.process.is_alive():
            logger.error("%r still alive after brutal kill, I leave it.", self.get_name())
    self.process = None
def execute_query(self, query, do_debug=False):
    """Just run the query and commit, reporting success.

    :param query: the query
    :type query: str
    :param do_debug: execute in debug or not
    :type do_debug: bool
    :return: True if query execution is ok, otherwise False
    :rtype: bool

    TODO: finish catch
    """
    if do_debug:
        logger.debug("[MysqlDB]I run query %s", query)
    try:
        self.db_cursor.execute(query)
        self.db.commit()
        return True
    except IntegrityError as exp:  # BUGFIX: 'except X, exp' is invalid in Python 3
        logger.warning("[MysqlDB] A query raised an integrity error: %s, %s", query, exp)
        return False
def check_and_del_zombie_workers(self): """Check if worker are fine and kill them if not. Dispatch the actions in the worker to another one :return: None """ # Active children make a join with everyone, useful :) active_children() w_to_del = [] for worker in self.workers.values(): # If a worker goes down and we did not ask him, it's not # good: we can think that we have a worker and it's not True # So we del it if not worker.is_alive(): logger.warning("[%s] The worker %s goes down unexpectedly!", self.name, worker._id) # Terminate immediately worker.terminate() worker.join(timeout=1) w_to_del.append(worker._id) # OK, now really del workers from queues # And requeue the actions it was managed for w_id in w_to_del: worker = self.workers[w_id] # Del the queue of the module queue del self.q_by_mod[worker.module_name][worker._id] for sched_id in self.schedulers: sched = self.schedulers[sched_id] for act in sched['actions'].values(): if act.status == 'queue' and act.worker_id == w_id: # Got a check that will NEVER return if we do not # restart it self.assign_to_a_queue(act) # So now we can really forgot it del self.workers[w_id]
def execute_query(self, query, do_debug=False):
    """Run the query against the MySQL database and commit.

    :param query: the query
    :type query: str
    :param do_debug: execute in debug or not
    :type do_debug: bool
    :return: True if query execution is ok, otherwise False
    :rtype: bool

    TODO: finish catch
    """
    if do_debug:
        logger.debug("[MysqlDB]I run query %s", query)
    try:
        self.db_cursor.execute(query)
        self.db.commit()
        return True
    except IntegrityError as exp:  # BUGFIX: 'except X, exp' is invalid in Python 3
        logger.warning(
            "[MysqlDB] A query raised an integrity error: %s, %s", query, exp)
        return False
def stop_process(self):
    """Ask the module process to terminate, escalating to a kill,
    then forget its handle.

    :return: None
    """
    if self.process:
        proc = self.process
        logger.info("I'm stopping module %r (pid=%s)", self.get_name(), proc.pid)
        proc.terminate()
        proc.join(timeout=1)
        if proc.is_alive():
            logger.warning("%r is still alive normal kill, I help it to die",
                           self.get_name())
            self.kill()
            proc.join(1)
            if proc.is_alive():
                logger.error("%r still alive after brutal kill, I leave it.",
                             self.get_name())
        self.process = None
def get_instances(self):
    """Create, init and then returns the list of module instances that the caller needs.

    If an instance can't be created or init'ed then only log is done.
    That instance is skipped. The previous modules instance(s), if any,
    are all cleaned.

    Arbiter call this method with start_external=False

    :return: module instances list
    :rtype: list
    """
    self.clear_instances()

    # Build one instance per (configuration, module) couple.
    for mod_conf, module in self.modules_assoc:
        mod_conf.properties = module.properties.copy()
        try:
            inst = module.get_instance(mod_conf)
            if not isinstance(inst, BaseModule):
                raise TypeError(
                    'Returned instance is not of type BaseModule (%s) !' % type(inst))
        except Exception as err:
            # A broken module must not take the daemon down: log and skip it.
            logger.error(
                "The module %s raised an exception %s, I remove it! traceback=%s",
                mod_conf.get_name(), err, traceback.format_exc())
        else:
            # Remember which kind of daemon this module was loaded into.
            inst.set_loaded_into(self.modules_type)
            self.instances.append(inst)

    # Initialize internal instances now; external ones are only init'ed
    # when they are started.
    for inst in self.instances:
        if inst.is_external or self.try_instance_init(inst):
            continue
        # Init failed: queue the module for a later restart attempt.
        logger.warning(
            "The module '%s' failed to init, I will try to restart it later",
            inst.get_name())
        self.to_restart.append(inst)

    return self.instances
def manage_signal(self, sig, frame):
    """Handle a signal caught by the daemon.

    signal.SIGUSR1 : dump_memory
    signal.SIGUSR2 : dump_object (nothing)
    signal.SIGTERM, signal.SIGINT : terminate process

    :param sig: signal caught by daemon
    :type sig: str
    :param frame: current stack frame
    :type frame:
    :return: None
    TODO: Refactor with Daemon one
    """
    logger.warning("%s > Received a SIGNAL %s", process.current_process(), sig)
    if sig == signal.SIGUSR1:
        # Ask the scheduler for a memory dump on its next loop turn.
        self.sched.need_dump_memory = True
        return
    if sig == signal.SIGUSR2:
        # Ask the scheduler for an objects dump on its next loop turn.
        self.sched.need_objects_dump = True
        return
    # Any other signal: stop the scheduler and let the base Daemon
    # handler finish the termination.
    self.sched.die()
    self.must_run = False
    Daemon.manage_signal(self, sig, frame)
def send_packet(self, packet):
    """Send data to Carbon. In case of failure, try to reconnect and send again.

    :param packet: metric to send to graphite
    :type packet: string
    :return: True on success send, False otherwise
    :rtype: bool
    """
    # No connection? Try (once) to establish it before giving up.
    if not self.con:
        self.init()
    if not self.con:
        # Still broken: keep the metric in the module cache for later.
        logger.warning("[Graphite] Connection to the Graphite Carbon instance is broken!"
                       " Storing data in module cache ... ")
        self.cache.append(packet)
        logger.warning("[Graphite] cached metrics %d packets", len(self.cache))
        return False

    if self.cache:
        logger.info("[Graphite] %d cached metrics packet(s) to send to Graphite",
                    len(self.cache))
    # Flush cached packets first, at most cache_commit_volume per call.
    commit_count = 0
    now = time()
    while True:
        try:
            self.con.sendall(self.cache.popleft())
            commit_count = commit_count + 1
            if commit_count >= self.cache_commit_volume:
                break
        except IndexError:
            # Cache exhausted: nothing left to flush.
            logger.debug("[Graphite] sent all cached metrics")
            break
        # 'except X as e' replaces the Python-2-only 'except X, e' form.
        except Exception as exp:
            # NOTE(review): no break/return here — a persistent sendall()
            # error would loop forever on the same cached packet; confirm
            # intended behaviour against the upstream module.
            logger.error("[Graphite] exception: %s", str(exp))
    logger.info("[Graphite] time to flush %d cached metrics packet(s) (%2.4f)",
                commit_count, time() - now)
    # NOTE(review): 'packet' itself is only stored on the no-connection
    # path; the visible code never sends it on success — presumably the
    # sending of 'packet' (and a return value) follows in code not shown
    # here. Confirm against the full module source.
def create_pack(self, buf, name):
    """Create pack with data from configuration file

    :param buf: buffer (JSON file content)
    :type buf: str
    :param name: name of file (used in log messages)
    :type name: str
    :return: None
    """
    if not json:
        # json lib import failed at module load time: nothing we can do.
        logger.warning(
            "[Pack] cannot load the pack file '%s': missing json lib", name)
        return
    # Ok, go compile the code
    try:
        json_dump = json.loads(buf)
        if 'name' not in json_dump:
            logger.error("[Pack] no name in the pack '%s'", name)
            return
        pack = Pack({})
        pack.pack_name = json_dump['name']
        pack.description = json_dump.get('description', '')
        pack.macros = json_dump.get('macros', {})
        # Default template list is the pack's own name.
        pack.templates = json_dump.get('templates', [pack.pack_name])
        pack.path = json_dump.get('path', 'various/')
        pack.doc_link = json_dump.get('doc_link', '')
        pack.services = json_dump.get('services', {})
        pack.commands = json_dump.get('commands', [])
        # Normalize the path so it can be used as a directory prefix.
        if not pack.path.endswith('/'):
            pack.path += '/'
        # Ok, add it
        self[pack._id] = pack
    # 'except X as e' replaces the Python-2-only 'except X, e' form.
    except ValueError as exp:
        logger.error("[Pack] error in loading pack file '%s': '%s'", name, exp)
def do_pynag_con_init(self, s_id):
    """Initialize a connection with scheduler having '_id'
    Return the new connection to the scheduler if it succeeded,
    else: any error OR sched is inactive: return None.
    NB: if sched is inactive then None is directly returned.

    :param s_id: scheduler s_id to connect to
    :type s_id: int
    :return: scheduler connection object or None
    :rtype: alignak.http.client.HTTPClient
    """
    sched = self.schedulers[s_id]
    # Inactive scheduler: nothing to connect to.
    if not sched['active']:
        return

    sname = sched['name']
    uri = sched['uri']
    running_id = sched['running_id']
    timeout = sched['timeout']
    data_timeout = sched['data_timeout']
    logger.info("[%s] Init connection with %s at %s (%ss,%ss)",
                self.name, sname, uri, timeout, data_timeout)
    try:
        sch_con = sched['con'] = HTTPClient(
            uri=uri, strong_ssl=sched['hard_ssl_name_check'],
            timeout=timeout, data_timeout=data_timeout)
    # 'except X as e' replaces the Python-2-only 'except X, e' form.
    except HTTPEXCEPTIONS as exp:
        # Connection failed: forget the broken client so callers retry later.
        logger.warning(
            "[%s] Scheduler %s is not initialized or has network problem: %s",
            self.name, sname, str(exp))
        sched['con'] = None
        return
def is_correct(self):
    """Check if the CheckModulation definition is correct::

    * Check for required attribute
    * Raise previous configuration errors

    :return: True if the definition is correct, False otherwise
    :rtype: bool
    """
    state = True
    cls = self.__class__

    # Raised all previously saw errors like unknown commands or timeperiods
    if self.configuration_errors != []:
        state = False
        for err in self.configuration_errors:
            logger.error("[item::%s] %s", self.get_name(), err)

    # Every required non-special property must be set.
    for prop, entry in cls.properties.items():
        if prop not in cls._special_properties:
            if not hasattr(self, prop) and entry.required:
                logger.warning("[checkmodulation::%s] %s property not set",
                               self.get_name(), prop)
                state = False  # Bad boy...

    # Ok now we manage special cases...
    # Service part
    if not hasattr(self, 'check_command'):
        logger.warning("[checkmodulation::%s] do not have any check_command defined",
                       self.get_name())
        state = False
    elif self.check_command is None:
        logger.warning("[checkmodulation::%s] a check_command is missing",
                       self.get_name())
        state = False
    # BUGFIX: this branch used to be a separate 'if' that was reached even
    # when check_command was None, raising AttributeError on .is_valid()
    # instead of reporting the configuration error.
    elif not self.check_command.is_valid():
        logger.warning("[checkmodulation::%s] a check_command is invalid",
                       self.get_name())
        state = False

    # Ok just put None as check_period, means 24x7
    if not hasattr(self, 'check_period'):
        self.check_period = None

    return state
class DBOracle(DB):
    """Manage connection and query execution against Oracle databases."""

    def __init__(self, user, password, database, table_prefix=''):
        super(DBOracle, self).__init__(table_prefix)
        self.user = user
        self.password = password
        self.database = database

    def connect_database(self):
        """Create the database connection

        :return: None
        TODO: finish (begin :) ) error catch and conf parameters...
        """
        # Oracle easy-connect string: user/password@database
        connstr = '%s/%s@%s' % (self.user, self.password, self.database)
        self.db = connect_function(connstr)  # pylint: disable=C0103
        self.db_cursor = self.db.cursor()
        # Fetch rows in batches of 50 to limit round-trips.
        self.db_cursor.arraysize = 50

    def execute_query(self, query):
        """ Execute a query against an Oracle database.

        Errors are logged as warnings and swallowed; nothing is returned.

        :param query: the query
        :type query: str
        :return: None
        """
        logger.debug("[DBOracle] Execute Oracle query %s\n", query)
        try:
            self.db_cursor.execute(query)
            self.db.commit()
        # 'except X as e' replaces the Python-2-only 'except X, e' form.
        except IntegrityError_exp as exp:
            logger.warning("[DBOracle] Warning: a query raise an integrity error: %s, %s",
                           query, exp)
        except ProgrammingError_exp as exp:
            logger.warning("[DBOracle] Warning: a query raise a programming error: %s, %s",
                           query, exp)
def manage_brok(self, brok):
    """Get a brok. We put brok data to the modules

    :param brok: object with data
    :type brok: object
    :return: None
    """
    # Call all modules if they catch the call
    for mod in self.modules_manager.get_internal_instances():
        try:
            mod.manage_brok(brok)
        # 'except X as e' replaces the Python-2-only 'except X, e' form.
        except Exception as exp:
            # A module crashing on one brok is not fatal: log every detail
            # we have and flag the module so it gets restarted later.
            logger.debug(str(exp.__dict__))
            logger.warning("The mod %s raise an exception: %s, I'm tagging it to restart later",
                           mod.get_name(), str(exp))
            logger.warning("Exception type: %s", type(exp))
            logger.warning("Back trace of this kill: %s", traceback.format_exc())
            self.modules_manager.set_to_restart(mod)
def manage_brok(self, brok):
    """Send brok to modules. Modules have to implement their own manage_brok function.
    They usually do if they inherits from basemodule
    REF: doc/receiver-modules.png (4-5)

    :param brok: brok to manage
    :type brok: alignak.brok.Brok
    :return: None
    """
    to_del = []
    # Call all modules if they catch the call
    for mod in self.modules_manager.get_internal_instances():
        try:
            mod.manage_brok(brok)
        # 'except X as e' replaces the Python-2-only 'except X, e' form.
        except Exception as exp:
            # The module crashed on this brok: log everything we know and
            # mark it for removal.
            logger.warning("The mod %s raise an exception: %s, I kill it",
                           mod.get_name(), str(exp))
            logger.warning("Exception type: %s", type(exp))
            logger.warning("Back trace of this kill: %s", traceback.format_exc())
            to_del.append(mod)
    # NOTE(review): to_del is collected but never consumed in the visible
    # code — presumably a removal loop (e.g. modules_manager.remove_instance)
    # follows in the full source; confirm against upstream.
def compensate_system_time_change(self, difference):
    """Compensate a system time change of difference for all hosts/services/checks/notifs

    Shifts program_start, every host/service internal timestamp, and the
    t_to_go of all scheduled checks and actions by `difference` seconds,
    re-validating the new times against their timeperiods.

    :param difference: difference in seconds (may be negative)
    :type difference: int
    :return: None
    """
    logger.warning(
        "A system time change of %d has been detected. Compensating...", difference)
    # We only need to change some value
    self.program_start = max(0, self.program_start + difference)
    if not hasattr(self.sched, "conf"):
        # Race condition where time change before getting conf
        return

    # Then we compensate all host/services
    for host in self.sched.hosts:
        host.compensate_system_time_change(difference)
    for serv in self.sched.services:
        serv.compensate_system_time_change(difference)

    # Now all checks and actions
    for chk in self.sched.checks.values():
        # Already launch checks should not be touch
        if chk.status == 'scheduled' and chk.t_to_go is not None:
            t_to_go = chk.t_to_go
            ref = chk.ref
            new_t = max(0, t_to_go + difference)
            if ref.check_period is not None:
                # But it's no so simple, we must match the timeperiod
                new_t = ref.check_period.get_next_valid_time_from_t(new_t)
            # But maybe no there is no more new value! Not good :(
            # Say as error, with error output
            if new_t is None:
                chk.state = 'waitconsume'
                chk.exit_status = 2
                chk.output = '(Error: there is no available check time after time change!)'
                chk.check_time = time.time()
                chk.execution_time = 0
            else:
                chk.t_to_go = new_t
                ref.next_chk = new_t

    # Now all checks and actions
    for act in self.sched.actions.values():
        # Already launch checks should not be touch
        if act.status == 'scheduled':
            t_to_go = act.t_to_go
            # Event handler do not have ref
            ref = getattr(act, 'ref', None)
            new_t = max(0, t_to_go + difference)
            # Notification should be check with notification_period
            if act.is_a == 'notification':
                # NOTE(review): ref can be None here (see getattr default
                # above); this assumes notifications always carry a ref —
                # confirm, otherwise this line can raise AttributeError.
                if ref.notification_period:
                    # But it's no so simple, we must match the timeperiod
                    new_t = ref.notification_period.get_next_valid_time_from_t(new_t)
                # And got a creation_time variable too
                act.creation_time += difference
            # But maybe no there is no more new value! Not good :(
            # Say as error, with error output
            if new_t is None:
                act.state = 'waitconsume'
                act.exit_status = 2
                act.output = '(Error: there is no available check time after time change!)'
                act.check_time = time.time()
                act.execution_time = 0
            else:
                act.t_to_go = new_t
class Satellite(BaseSatellite):
    """Satellite class.
    Subclassed by Receiver, Reactionner and Poller
    """
    # Class-level defaults, overridden by subclasses (Poller sets
    # do_checks, Reactionner sets do_actions, etc.).
    do_checks = False
    do_actions = False
    my_type = ''

    def __init__(self, name, config_file, is_daemon, do_replace, debug, debug_file):
        super(Satellite, self).__init__(name, config_file, is_daemon, do_replace,
                                        debug, debug_file)
        # Keep broks so they can be eaten by a broker
        self.broks = {}
        self.workers = {}  # dict of active workers
        # Init stats like Load for workers
        self.wait_ratio = Load(initial_value=1)
        self.slave_q = None
        self.returns_queue = None
        # Per-module mapping of worker queues: module_name -> {worker_id: queue}
        self.q_by_mod = {}

    def pynag_con_init(self, _id):
        """Wrapped function for do_pynag_con_init

        Times the connection initialization and feeds the duration to the
        stats manager.

        :param _id: scheduler _id to connect to
        :type _id: int
        :return: scheduler connection object or None
        :rtype: alignak.http.client.HTTPClient
        """
        _t0 = time.time()
        res = self.do_pynag_con_init(_id)
        statsmgr.incr('con-init.scheduler', time.time() - _t0)
        return res

    def do_pynag_con_init(self, s_id):
        """Initialize a connection with scheduler having '_id'
        Return the new connection to the scheduler if it succeeded,
        else: any error OR sched is inactive: return None.
        NB: if sched is inactive then None is directly returned.

        :param s_id: scheduler s_id to connect to
        :type s_id: int
        :return: scheduler connection object or None
        :rtype: alignak.http.client.HTTPClient
        """
        sched = self.schedulers[s_id]
        # Inactive scheduler: nothing to connect to.
        if not sched['active']:
            return

        sname = sched['name']
        uri = sched['uri']
        running_id = sched['running_id']
        timeout = sched['timeout']
        data_timeout = sched['data_timeout']
        logger.info("[%s] Init connection with %s at %s (%ss,%ss)",
                    self.name, sname, uri, timeout, data_timeout)
        try:
            # The client is stored in sched['con'] so later calls reuse it.
            sch_con = sched['con'] = HTTPClient(
                uri=uri, strong_ssl=sched['hard_ssl_name_check'],
                timeout=timeout, data_timeout=data_timeout)
        except HTTPEXCEPTIONS, exp:
            logger.warning(
                "[%s] Scheduler %s is not initialized or has network problem: %s",
                self.name, sname, str(exp))
            sched['con'] = None
            return

        # timeout of 3s by default (short one)
        # and get the running s_id
        try:
            new_run_id = sch_con.get('get_running_id')
            new_run_id = float(new_run_id)
        except (HTTPEXCEPTIONS, cPickle.PicklingError, KeyError), exp:
            logger.warning(
                "[%s] Scheduler %s is not initialized or has network problem: %s",
                self.name, sname, str(exp))
            sched['con'] = None
            return
        # NOTE(review): the method presumably continues (comparing
        # new_run_id to running_id and returning the connection) in code
        # not visible in this chunk — confirm against the full source.
links[sched_id]['con'] = None continue logger.debug("%s Broks get in %s", len(tmp_broks), time.time() - t00) for brok in tmp_broks.values(): brok.instance_id = links[sched_id]['instance_id'] # Ok, we can add theses broks to our queues self.add_broks_to_queue(tmp_broks.values()) else: # no con? make the connection self.pynag_con_init(sched_id, i_type=i_type) # Ok, con is not known, so we create it except KeyError, exp: logger.debug("Key error for get_broks : %s", str(exp)) self.pynag_con_init(sched_id, i_type=i_type) except HTTPEXCEPTIONS, exp: logger.warning("Connection problem to the %s %s: %s", i_type, links[sched_id]['name'], str(exp)) links[sched_id]['con'] = None # scheduler must not #be initialized except AttributeError, exp: logger.warning("The %s %s should not be initialized: %s", i_type, links[sched_id]['name'], str(exp)) # scheduler must not have checks # What the F**k? We do not know what happened, # so.. bye bye :) except Exception, err: logger.error(str(err)) logger.error(traceback.format_exc()) sys.exit(1) def get_retention_data(self): """Get all broks
# If we are in the fork module, we do not specify a target target = None if module_name == 'fork': target = None else: for module in self.modules_manager.instances: if module.properties['type'] == module_name: # First, see if the module is a 'worker' one or not if not module.properties.get('worker_capable', False): raise NotWorkerMod target = module.work if target is None: if module_name not in __warned: logger.warning( "No target found for %s, NOT creating a worker for it..", module_name) __warned.add(module_name) return # We want to give to the Worker the name of the daemon (poller or reactionner) cls_name = self.__class__.__name__.lower() worker = Worker( 1, queue, self.returns_queue, self.processes_by_worker, mortal=mortal, max_plugins_output_length=self.max_plugins_output_length, target=target, loaded_into=cls_name, http_daemon=self.http_daemon)
class DBMysql(DB):
    """DBMysql is a MySQL access database class"""

    def __init__(self, host, user, password, database, character_set,
                 table_prefix='', port=3306):
        # BUGFIX: the prefix used to be hard-coded to '' here, silently
        # discarding the caller's table_prefix argument (DBOracle passes
        # it through); forward it as intended.
        super(DBMysql, self).__init__(table_prefix=table_prefix)
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.character_set = character_set
        self.port = port

    def connect_database(self):
        """Create the database connection

        :return: None
        TODO: finish (begin :) ) error catch and conf parameters...
        Import to catch exception
        """
        # self.db = MySQLdb.connect (host = "localhost", user = "******",
        # passwd = "root", db = "merlin")
        # pylint: disable=C0103
        self.db = MySQLdb.connect(host=self.host, user=self.user,
                                  passwd=self.password, db=self.database,
                                  port=self.port)
        self.db.set_character_set(self.character_set)
        self.db_cursor = self.db.cursor()
        # Force the connection charset on both client and server side.
        self.db_cursor.execute('SET NAMES %s;' % self.character_set)
        self.db_cursor.execute('SET CHARACTER SET %s;' % self.character_set)
        self.db_cursor.execute('SET character_set_connection=%s;' % self.character_set)
        # Thanks:
        # http://www.dasprids.de/blog/2007/12/17/python-mysqldb-and-utf-8
        # for utf8 code :)

    def execute_query(self, query, do_debug=False):
        """Just run the query

        :param query: the query
        :type query: str
        :param do_debug: execute in debug or not
        :type do_debug: bool
        :return: True if query execution is ok, otherwise False
        :rtype: bool
        TODO: finish catch
        """
        if do_debug:
            logger.debug("[MysqlDB]I run query %s", query)
        try:
            self.db_cursor.execute(query)
            self.db.commit()
            return True
        # 'except X as e' replaces the Python-2-only 'except X, e' form.
        except IntegrityError as exp:
            logger.warning(
                "[MysqlDB] A query raised an integrity error: %s, %s", query, exp)
            return False
        except ProgrammingError as exp:
            logger.warning(
                "[MysqlDB] A query raised a programming error: %s, %s", query, exp)
            return False
def do_work(self, slave_q, returns_queue, control_q): """ Main function of the worker. * Get checks * Launch new checks * Manage finished checks :param slave_q: Global Queue Master->Slave :type slave_q: Queue.Queue :param returns_queue: queue managed by manager :type returns_queue: Queue.Queue :param control_q: Control Queue for the worker :type control_q: Queue.Queue :return: None """ # restore default signal handler for the workers: signal.signal(signal.SIGTERM, signal.SIG_DFL) self.set_proctitle() timeout = 1.0 self.checks = [] self.returns_queue = returns_queue self.slave_q = slave_q self.t_each_loop = time.time() while True: begin = time.time() msg = None cmsg = None # If we are dying (big problem!) we do not # take new jobs, we just finished the current one if not self.i_am_dying: # REF: doc/alignak-action-queues.png (3) self.get_new_checks() # REF: doc/alignak-action-queues.png (4) self.launch_new_checks() # REF: doc/alignak-action-queues.png (5) self.manage_finished_checks() # Now get order from master try: cmsg = control_q.get(block=False) if cmsg.get_type() == 'Die': logger.debug("[%d] Dad say we are dying...", self._id) break except Exception: pass # Look if we are dying, and if we finish all current checks # if so, we really die, our master poller will launch a new # worker because we were too weak to manage our job :( if len(self.checks) == 0 and self.i_am_dying: logger.warning( "[%d] I DIE because I cannot do my job as I should" "(too many open files?)... forgot me please.", self._id) break # Manage a possible time change (our avant will be change with the diff) diff = self.check_for_system_time_change() begin += diff timeout -= time.time() - begin if timeout < 0: timeout = 1.0
def dispatch(self):
    """Dispatch configuration to other daemons
    REF: doc/alignak-conf-dispatching.png (3)

    Three phases (all skipped when the previous dispatch succeeded):
    1. push each unassigned realm configuration to an alive scheduler;
    2. push the matching satellite parts (reactionner/poller/broker/receiver);
    3. push their (single, lifetime) configuration to receivers needing one.

    :return: None
    """
    # Ok, we pass at least one time in dispatch, so now errors are True errors
    self.first_dispatch_done = True

    # If no needed to dispatch, do not dispatch :)
    if not self.dispatch_ok:
        # --- Phase 1: assign each unassigned conf to a scheduler ---
        for realm in self.realms:
            conf_to_dispatch = [cfg for cfg in realm.confs.values() if not cfg.is_assigned]
            nb_conf = len(conf_to_dispatch)
            if nb_conf > 0:
                logger.info("Dispatching Realm %s", realm.get_name())
                logger.info('[%s] Dispatching %d/%d configurations',
                            realm.get_name(), nb_conf, len(realm.confs))

            # Now we get in scheds all scheduler of this realm and upper so
            # we will send them conf (in this order)
            scheds = self.get_scheduler_ordered_list(realm)

            if nb_conf > 0:
                print_string = '[%s] Schedulers order: %s' % (
                    realm.get_name(), ','.join([s.get_name() for s in scheds]))
                logger.info(print_string)

            # Try to send only for alive members
            scheds = [s for s in scheds if s.alive]

            # Now we do the real job
            # every_one_need_conf = False
            for conf in conf_to_dispatch:
                logger.info('[%s] Dispatching configuration %s',
                            realm.get_name(), conf._id)

                # If there is no alive schedulers, not good...
                if len(scheds) == 0:
                    logger.info('[%s] but there a no alive schedulers in this realm!',
                                realm.get_name())

                # we need to loop until the conf is assigned
                # or when there are no more schedulers available
                while True:
                    try:
                        sched = scheds.pop()
                    except IndexError:  # No more schedulers.. not good, no loop
                        # need_loop = False
                        # The conf does not need to be dispatch
                        cfg_id = conf._id
                        for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                            realm.to_satellites[kind][cfg_id] = None
                            realm.to_satellites_need_dispatch[kind][cfg_id] = False
                            realm.to_satellites_managed_by[kind][cfg_id] = []
                        break

                    logger.info('[%s] Trying to send conf %d to scheduler %s',
                                realm.get_name(), conf._id, sched.get_name())
                    if not sched.need_conf:
                        logger.info('[%s] The scheduler %s do not need conf, sorry',
                                    realm.get_name(), sched.get_name())
                        continue

                    # We tag conf with the instance_name = scheduler_name
                    instance_name = sched.scheduler_name
                    # We give this configuration a new 'flavor'
                    # (a random token used to check who manages what later)
                    conf.push_flavor = random.randint(1, 1000000)
                    # REF: doc/alignak-conf-dispatching.png (3)
                    # REF: doc/alignak-scheduler-lost.png (2)
                    override_conf = sched.get_override_configuration()
                    satellites_for_sched = realm.get_satellites_links_for_scheduler()
                    s_conf = realm.serialized_confs[conf._id]
                    # Prepare the conf before sending it
                    conf_package = {
                        'conf': s_conf, 'override_conf': override_conf,
                        'modules': sched.modules,
                        'satellites': satellites_for_sched,
                        'instance_name': sched.scheduler_name,
                        'push_flavor': conf.push_flavor,
                        'skip_initial_broks': sched.skip_initial_broks,
                        'accept_passive_unknown_check_results':
                            sched.accept_passive_unknown_check_results,
                        # shinken.io part
                        'api_key': self.conf.api_key,
                        'secret': self.conf.secret,
                        'http_proxy': self.conf.http_proxy,
                        # statsd one too because OlivierHA love statsd
                        # and after some years of effort he manages to make me
                        # understand the powerfullness of metrics :)
                        'statsd_host': self.conf.statsd_host,
                        'statsd_port': self.conf.statsd_port,
                        'statsd_prefix': self.conf.statsd_prefix,
                        'statsd_enabled': self.conf.statsd_enabled,
                    }

                    t01 = time.time()
                    is_sent = sched.put_conf(conf_package)
                    logger.debug("Conf is sent in %d", time.time() - t01)
                    if not is_sent:
                        logger.warning('[%s] configuration dispatching error for scheduler %s',
                                       realm.get_name(), sched.get_name())
                        continue

                    logger.info('[%s] Dispatch OK of conf in scheduler %s',
                                realm.get_name(), sched.get_name())

                    sched.conf = conf
                    sched.push_flavor = conf.push_flavor
                    sched.need_conf = False
                    conf.is_assigned = True
                    conf.assigned_to = sched

                    # We update all data for this scheduler
                    sched.managed_confs = {conf._id: conf.push_flavor}

                    # Now we generate the conf for satellites:
                    cfg_id = conf._id
                    for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                        realm.to_satellites[kind][cfg_id] = sched.give_satellite_cfg()
                        realm.to_satellites_need_dispatch[kind][cfg_id] = True
                        realm.to_satellites_managed_by[kind][cfg_id] = []

                    # Ok, the conf is dispatched, no more loop for this
                    # configuration
                    break

        # We pop conf to dispatch, so it must be no more conf...
        conf_to_dispatch = [cfg for cfg in self.conf.confs.values() if not cfg.is_assigned]
        nb_missed = len(conf_to_dispatch)
        if nb_missed > 0:
            logger.warning("All schedulers configurations are not dispatched, %d are missing",
                           nb_missed)
        else:
            logger.info("OK, all schedulers configurations are dispatched :)")
            self.dispatch_ok = True

        # Sched without conf in a dispatch ok are set to no need_conf
        # so they do not raise dispatch where no use
        if self.dispatch_ok:
            for sched in self.schedulers.items.values():
                if sched.conf is None:
                    # print "Tagging sched", sched.get_name(),
                    # "so it do not ask anymore for conf"
                    sched.need_conf = False

        arbiters_cfg = {}
        for arb in self.arbiters:
            arbiters_cfg[arb._id] = arb.give_satellite_cfg()

        # --- Phase 2: push satellite parts of each dispatched conf ---
        # We put the satellites conf with the "new" way so they see only what we want
        for realm in self.realms:
            for cfg in realm.confs.values():
                cfg_id = cfg._id
                # flavor if the push number of this configuration send to a scheduler
                flavor = cfg.push_flavor
                for kind in ('reactionner', 'poller', 'broker', 'receiver'):
                    if realm.to_satellites_need_dispatch[kind][cfg_id]:
                        cfg_for_satellite_part = realm.to_satellites[kind][cfg_id]

                        # make copies of potential_react list for sort
                        satellites = []
                        for sat in realm.get_potential_satellites_by_type(kind):
                            satellites.append(sat)
                        satellites.sort(alive_then_spare_then_deads)

                        # Only keep alive Satellites and reachable ones
                        satellites = [s for s in satellites if s.alive and s.reachable]

                        # If we got a broker, we make the list to pop a new
                        # item first for each scheduler, so it will smooth the load
                        # But the spare must stay at the end ;)
                        # WARNING : skip this if we are in a complet broker link realm
                        if kind == "broker" and not realm.broker_complete_links:
                            nospare = [s for s in satellites if not s.spare]
                            # Should look over the list, not over
                            if len(nospare) != 0:
                                idx = cfg_id % len(nospare)
                                spares = [s for s in satellites if s.spare]
                                new_satellites = nospare[idx:]
                                # NOTE(review): 'nospare[: -idx + 1]' looks
                                # off-by-one for the wrap-around (idx == 0
                                # yields nospare[:1], idx == 1 yields the
                                # whole list) — confirm intended rotation.
                                for sat in nospare[: -idx + 1]:
                                    if sat not in new_satellites:
                                        new_satellites.append(sat)
                                satellites = new_satellites
                                satellites.extend(spares)

                        # Dump the order where we will send conf
                        satellite_string = "[%s] Dispatching %s satellite with order: " % (
                            realm.get_name(), kind)
                        for sat in satellites:
                            satellite_string += '%s (spare:%s), ' % (
                                sat.get_name(), str(sat.spare))
                        logger.info(satellite_string)

                        # Now we dispatch cfg to every one ask for it
                        nb_cfg_sent = 0
                        for sat in satellites:
                            # Send only if we need, and if we can
                            if (nb_cfg_sent < realm.get_nb_of_must_have_satellites(kind)
                                    and sat.alive):
                                sat.cfg['schedulers'][cfg_id] = cfg_for_satellite_part
                                if sat.manage_arbiters:
                                    sat.cfg['arbiters'] = arbiters_cfg

                                # Brokers should have poller/reactionners links too
                                if kind == "broker":
                                    realm.fill_broker_with_poller_reactionner_links(sat)

                                is_sent = False
                                # Maybe this satellite already got this configuration,
                                # so skip it
                                if sat.do_i_manage(cfg_id, flavor):
                                    logger.info('[%s] Skipping configuration %d send '
                                                'to the %s %s: it already got it',
                                                realm.get_name(), cfg_id, kind,
                                                sat.get_name())
                                    is_sent = True
                                else:  # ok, it really need it :)
                                    logger.info('[%s] Trying to send configuration to %s %s',
                                                realm.get_name(), kind, sat.get_name())
                                    is_sent = sat.put_conf(sat.cfg)

                                if is_sent:
                                    sat.active = True
                                    logger.info('[%s] Dispatch OK of configuration %s to %s %s',
                                                realm.get_name(), cfg_id, kind,
                                                sat.get_name())
                                    # We change the satellite configuration, update our data
                                    sat.known_conf_managed_push(cfg_id, flavor)

                                    nb_cfg_sent += 1
                                    realm.to_satellites_managed_by[kind][cfg_id].append(sat)

                                    # If we got a broker, the conf_id must be sent to only ONE
                                    # broker in a classic realm.
                                    if kind == "broker" and not realm.broker_complete_links:
                                        break

                                    # If receiver, we must send the hostnames
                                    # of this configuration
                                    if kind == 'receiver':
                                        hnames = [h.get_name() for h in cfg.hosts]
                                        logger.debug("[%s] Sending %s hostnames to the "
                                                     "receiver %s",
                                                     realm.get_name(), len(hnames),
                                                     sat.get_name())
                                        sat.push_host_names(cfg_id, hnames)
                        # else:
                        # #I've got enough satellite, the next ones are considered spares
                        if nb_cfg_sent == realm.get_nb_of_must_have_satellites(kind):
                            logger.info("[%s] OK, no more %s sent need",
                                        realm.get_name(), kind)
                            realm.to_satellites_need_dispatch[kind][cfg_id] = False

        # --- Phase 3: receivers only ever need one configuration ---
        # And now we dispatch receivers. It's easier, they need ONE conf
        # in all their life :)
        for realm in self.realms:
            for rec in realm.receivers:
                if rec.need_conf:
                    logger.info('[%s] Trying to send configuration to receiver %s',
                                realm.get_name(), rec.get_name())
                    is_sent = False
                    if rec.reachable:
                        is_sent = rec.put_conf(rec.cfg)
                    else:
                        logger.info('[%s] Skyping configuration sent to offline receiver %s',
                                    realm.get_name(), rec.get_name())
                    if is_sent:
                        rec.active = True
                        rec.need_conf = False
                        logger.info('[%s] Dispatch OK of configuration to receiver %s',
                                    realm.get_name(), rec.get_name())
                    else:
                        logger.error('[%s] Dispatching failed for receiver %s',
                                     realm.get_name(), rec.get_name())
:param query: the query :type query: str :return: None """ logger.debug("[DBOracle] Execute Oracle query %s\n", query) try: self.db_cursor.execute(query) self.db.commit() except IntegrityError_exp, exp: logger.warning("[DBOracle] Warning: a query raise an integrity error: %s, %s", query, exp) except ProgrammingError_exp, exp: logger.warning("[DBOracle] Warning: a query raise a programming error: %s, %s", query, exp) except DatabaseError_exp, exp: logger.warning("[DBOracle] Warning: a query raise a database error: %s, %s", query, exp) except InternalError_exp, exp: logger.warning("[DBOracle] Warning: a query raise an internal error: %s, %s", query, exp) except DataError_exp, exp: logger.warning("[DBOracle] Warning: a query raise a data error: %s, %s", query, exp) except OperationalError_exp, exp: logger.warning("[DBOracle] Warning: a query raise an operational error: %s, %s", query, exp) except Exception, exp: logger.warning("[DBOracle] Warning: a query raise an unknown error: %s, %s", query, exp) logger.warning(exp.__dict__)
def check_dispatch(self): """Check if all active items are still alive :return: None TODO: finish need conf """ # Check if the other arbiter has a conf, but only if I am a master for arb in self.arbiters: # If not me and I'm a master if arb != self.arbiter and self.arbiter and not self.arbiter.spare: if not arb.have_conf(self.conf.magic_hash): if not hasattr(self.conf, 'whole_conf_pack'): logger.error('CRITICAL: the arbiter try to send a configureion but ' 'it is not a MASTER one?? Look at your configuration.') continue arb.put_conf(self.conf.whole_conf_pack) # Remind it that WE are the master here! arb.do_not_run() else: # Ok, it already has the conf. I remember that # it does not have to run, I'm still alive! arb.do_not_run() # We check for confs to be dispatched on alive scheds. If not dispatched, need dispatch :) # and if dispatch on a failed node, remove the association, and need a new dispatch for realm in self.realms: for cfg_id in realm.confs: push_flavor = realm.confs[cfg_id].push_flavor sched = realm.confs[cfg_id].assigned_to if sched is None: if self.first_dispatch_done: logger.info("Scheduler configuration %d is unmanaged!!", cfg_id) self.dispatch_ok = False else: if not sched.alive: self.dispatch_ok = False # so we ask a new dispatching logger.warning("Scheduler %s had the configuration %d but is dead, " "I am not happy.", sched.get_name(), cfg_id) sched.conf.assigned_to = None sched.conf.is_assigned = False sched.conf.push_flavor = 0 sched.push_flavor = 0 sched.conf = None # Maybe the scheduler restarts, so is alive but without # the conf we think it was managing so ask it what it is # really managing, and if not, put the conf unassigned if not sched.do_i_manage(cfg_id, push_flavor): self.dispatch_ok = False # so we ask a new dispatching logger.warning("Scheduler %s did not managed its configuration %d, " "I am not happy.", sched.get_name(), cfg_id) if sched.conf: sched.conf.assigned_to = None sched.conf.is_assigned = False sched.conf.push_flavor = 0 
sched.push_flavor = 0 sched.need_conf = True sched.conf = None # Else: ok the conf is managed by a living scheduler # Maybe satellites are alive, but do not have a cfg yet. # I think so. It is not good. I ask a global redispatch for # the cfg_id I think is not correctly dispatched. for realm in self.realms: for cfg_id in realm.confs: push_flavor = realm.confs[cfg_id].push_flavor try: for kind in ('reactionner', 'poller', 'broker', 'receiver'): # We must have the good number of satellite or we are not happy # So we are sure to raise a dispatch every loop a satellite is missing if (len(realm.to_satellites_managed_by[kind][cfg_id]) < realm.get_nb_of_must_have_satellites(kind)): logger.warning("Missing satellite %s for configuration %d:", kind, cfg_id) # TODO: less violent! Must only resent to who need? # must be caught by satellite who sees that # it already has the conf and do nothing self.dispatch_ok = False # so we will redispatch all realm.to_satellites_need_dispatch[kind][cfg_id] = True realm.to_satellites_managed_by[kind][cfg_id] = [] for satellite in realm.to_satellites_managed_by[kind][cfg_id]: # Maybe the sat was marked as not alive, but still in # to_satellites_managed_by. That means that a new dispatch # is needed # Or maybe it is alive but I thought that this reactionner # managed the conf and it doesn't. 
# I ask a full redispatch of these cfg for both cases if push_flavor == 0 and satellite.alive: logger.warning('[%s] The %s %s manage a unmanaged configuration', realm.get_name(), kind, satellite.get_name()) continue if not satellite.alive or ( satellite.reachable and not satellite.do_i_manage(cfg_id, push_flavor)): logger.warning('[%s] The %s %s seems to be down, ' 'I must re-dispatch its role to someone else.', realm.get_name(), kind, satellite.get_name()) self.dispatch_ok = False # so we will redispatch all realm.to_satellites_need_dispatch[kind][cfg_id] = True realm.to_satellites_managed_by[kind][cfg_id] = [] # At the first pass, there is no cfg_id in to_satellites_managed_by except KeyError: pass # Look for receivers. If they got conf, it's ok, if not, need a simple # conf for realm in self.realms: for rec in realm.receivers: # If the receiver does not have a conf, must got one :) if rec.reachable and not rec.have_conf(): self.dispatch_ok = False # so we will redispatch all rec.need_conf = True