class SchedulerLink(SatelliteLink): """ Class to manage the scheduler information """ # Ok we lie a little here because we are a mere link in fact my_type = 'scheduler' properties = SatelliteLink.properties.copy() properties.update({ 'type': StringProp(default=u'scheduler', fill_brok=['full_status'], to_send=True), 'scheduler_name': StringProp(default='', fill_brok=['full_status']), 'port': IntegerProp(default=7768, fill_brok=['full_status'], to_send=True), 'weight': IntegerProp(default=1, fill_brok=['full_status']), 'skip_initial_broks': BoolProp(default=False, fill_brok=['full_status'], to_send=True), 'accept_passive_unknown_check_results': BoolProp(default=False, fill_brok=['full_status'], to_send=True), }) running_properties = SatelliteLink.running_properties.copy() running_properties.update({ # 'conf': # StringProp(default=None), # 'cfg': # DictProp(default={}), 'need_conf': StringProp(default=True), 'external_commands': StringProp(default=[]), }) def get_override_configuration(self): """ Some parameters can give as 'overridden parameters' like use_timezone so they will be mixed (in the scheduler) with the standard conf sent by the arbiter :return: dictionary of properties :rtype: dict """ res = {} properties = self.__class__.properties for prop, entry in list(properties.items()): if entry.override: res[prop] = getattr(self, prop) return res
class BrokerLink(SatelliteLink): """ Class to manage the broker information """ my_type = 'broker' properties = SatelliteLink.properties.copy() properties.update({ 'type': StringProp(default=u'broker', fill_brok=['full_status'], to_send=True), 'broker_name': StringProp(default='', fill_brok=['full_status']), 'port': IntegerProp(default=7772, fill_brok=['full_status'], to_send=True), 'initialized': BoolProp(default=False, fill_brok=['full_status'], to_send=True), }) def prepare_for_conf(self): """Initialize the pushed configuration dictionary with the inner properties that are to be propagated to the satellite link. :return: None """ super(BrokerLink, self).prepare_for_conf() self.cfg.update({ 'satellites': { 'receivers': {}, 'pollers': {}, 'reactionners': {} } })
class ArbiterLink(SatelliteLink): """ Class to manage the link to Arbiter daemon. With it, a master arbiter can communicate with a spare Arbiter daemon """ my_type = 'arbiter' properties = SatelliteLink.properties.copy() properties.update({ 'type': StringProp(default=u'arbiter', fill_brok=['full_status'], to_send=True), 'arbiter_name': StringProp(default='', fill_brok=['full_status']), 'host_name': StringProp(default=socket.gethostname(), to_send=True), 'port': IntegerProp(default=7770, to_send=True), 'last_master_speak': FloatProp(default=0.0) }) def is_me(self): # pragma: no cover, seems not to be used anywhere """Check if parameter name if same than name of this object TODO: is it useful? :return: true if parameter name if same than this name :rtype: bool """ logger.info( "And arbiter is launched with the hostname:%s " "from an arbiter point of view of addr:%s", self.host_name, socket.getfqdn()) return self.host_name == socket.getfqdn( ) or self.host_name == socket.gethostname() def do_not_run(self): """Check if satellite running or not If not, try to run :return: true if satellite not running :rtype: bool """ logger.debug("[%s] do_not_run", self.name) try: self.con.get('_do_not_run') return True except HTTPClientConnectionException as exp: # pragma: no cover, simple protection self.add_failed_check_attempt("Connection error when " "sending do not run: %s" % str(exp)) self.set_dead() except HTTPClientTimeoutException as exp: # pragma: no cover, simple protection self.add_failed_check_attempt("Connection timeout when " "sending do not run: %s" % str(exp)) except HTTPClientException as exp: self.add_failed_check_attempt("Error when " "sending do not run: %s" % str(exp)) return False
class Reactionner(Satellite): """ This class is an application that launches actions for the schedulers Actions can be: Notifications Event handlers When running the Reactionner will : Respond to pings from Arbiter Listen for new configurations from Arbiter The configuration consists of a list of Schedulers for which the Reactionner will launch actions for. """ do_checks = False # I do not do checks do_actions = True my_type = 'reactionner' properties = Satellite.properties.copy() properties.update({ 'type': StringProp(default='reactionner'), 'port': IntegerProp(default=7769) }) def __init__(self, **kwargs): """Reactionner daemon initialisation :param kwargs: command line arguments """ super(Reactionner, self).__init__(kwargs.get('daemon_name', 'Default-reactionner'), **kwargs)
class PollerLink(SatelliteLink): """ Class to manage the link between Arbiter and Poller. With this, an arbiter can communicate with a poller """ my_type = 'poller' # To_send: send or not to satellite conf properties = SatelliteLink.properties.copy() properties.update({ 'type': StringProp(default=u'poller', fill_brok=['full_status'], to_send=True), 'poller_name': StringProp(default='', fill_brok=['full_status']), 'port': IntegerProp(default=7771, fill_brok=['full_status'], to_send=True), # 'min_workers': # IntegerProp(default=0, fill_brok=['full_status'], to_send=True), # 'max_workers': # IntegerProp(default=30, fill_brok=['full_status'], to_send=True), # 'processes_by_worker': # IntegerProp(default=256, fill_brok=['full_status'], to_send=True), # 'worker_polling_interval': # IntegerProp(default=1, to_send=True), 'poller_tags': ListProp(default=['None'], to_send=True), })
class Businessimpactmodulation(Item): """Businessimpactmodulation class is simply a modulation of the business impact value (of a Host/Service) during a modulation period. """ my_type = 'businessimpactmodulation' properties = Item.properties.copy() properties.update({ 'business_impact_modulation_name': StringProp(), 'business_impact': IntegerProp(), 'modulation_period': StringProp(default=''), }) def __init__(self, params=None, parsing=True): super(Businessimpactmodulation, self).__init__(params, parsing=parsing) # Ok just put None as modulation_period, means 24x7 if not hasattr(self, 'modulation_period'): self.modulation_period = '24x7' def get_name(self): """Accessor to business_impact_modulation_name attribute :return: business impact modulation name :rtype: str """ if hasattr(self, 'business_impact_modulation_name'): return self.business_impact_modulation_name return 'Unnamed'
class Serviceescalation(Item): """Serviceescalation class is used to implement notification escalation for services TODO: Why this class does not inherit from fusionsupervision.objects.Escalation. Maybe we can merge it """ my_type = 'serviceescalation' properties = Item.properties.copy() properties.update({ 'host_name': StringProp(), 'hostgroup_name': StringProp(), 'service_description': StringProp(), 'first_notification': IntegerProp(), 'last_notification': IntegerProp(), 'notification_interval': IntegerProp(default=30), # like Nagios value 'escalation_period': StringProp(default=''), 'escalation_options': ListProp(default=['w', 'x', 'c', 'r'], split_on_comma=True), 'contacts': ListProp(default=[], merging='join', split_on_comma=True), 'contact_groups': ListProp(default=[], merging='join', split_on_comma=True), 'first_notification_time': IntegerProp(), 'last_notification_time': IntegerProp(), }) def __init__(self, params=None, parsing=True): if params is None: params = {} for prop in ['escalation_options']: if prop in params: params[prop] = [p.replace('u', 'x') for p in params[prop]] super(Serviceescalation, self).__init__(params, parsing=parsing)
class ReceiverLink(SatelliteLink): """ Class to manage the receiver information """ my_type = 'receiver' properties = SatelliteLink.properties.copy() properties.update({ 'type': StringProp(default='receiver', fill_brok=['full_status'], to_send=True), 'receiver_name': StringProp(default='', fill_brok=['full_status'], to_send=True), 'port': IntegerProp(default=7772, fill_brok=['full_status'], to_send=True), })
class Poller(Satellite): """Poller class. Referenced as "app" in most Interface """ do_checks = True # I do checks do_actions = False # but no actions my_type = 'poller' properties = Satellite.properties.copy() properties.update({ 'type': StringProp(default='poller'), 'port': IntegerProp(default=7771) }) def __init__(self, **kwargs): """Poller daemon initialisation :param kwargs: command line arguments """ super(Poller, self).__init__(kwargs.get('daemon_name', 'Default-poller'), **kwargs)
class ReactionnerLink(SatelliteLink): """ Class to manage the reactionner information """ my_type = 'reactionner' properties = SatelliteLink.properties.copy() properties.update({ 'type': StringProp(default='reactionner', fill_brok=['full_status'], to_send=True), 'reactionner_name': StringProp(default='', fill_brok=['full_status']), 'port': IntegerProp(default=7769, fill_brok=['full_status'], to_send=True), # 'min_workers': # IntegerProp(default=1, fill_brok=['full_status'], to_send=True), # 'max_workers': # IntegerProp(default=30, fill_brok=['full_status'], to_send=True), # 'processes_by_worker': # IntegerProp(default=256, fill_brok=['full_status'], to_send=True), # 'worker_polling_interval': # IntegerProp(default=1, to_send=True), 'reactionner_tags': ListProp(default=['None'], to_send=True), })
class ActionBase(FusionsupervisionObject): # pylint: disable=too-many-instance-attributes """ This abstract class is used to have a common base for both actions (event handlers and notifications) and checks. The Action may be on internal one if it does require to use a Worker process to run the action because the Scheduler is able to resolve the action by itseld. This class is specialized according to the running OS. Currently, only Linux/Unix like OSes are tested """ process = None properties = { 'is_a': StringProp(default=u''), 'type': StringProp(default=u''), 'internal': BoolProp(default=False), 'creation_time': FloatProp(default=0.0), '_is_orphan': BoolProp(default=False), '_in_timeout': BoolProp(default=False), 'status': StringProp(default=ACT_STATUS_SCHEDULED), 'exit_status': IntegerProp(default=3), 'output': StringProp(default=u'', fill_brok=['full_status']), 'long_output': StringProp(default=u'', fill_brok=['full_status']), 'perf_data': StringProp(default=u'', fill_brok=['full_status']), 't_to_go': FloatProp(default=0.0), 'check_time': IntegerProp(default=0), 'last_poll': IntegerProp(default=0), 'execution_time': FloatProp(default=0.0), 'wait_time': FloatProp(default=0.001), 'u_time': FloatProp(default=0.0), 's_time': FloatProp(default=0.0), 'reactionner_tag': StringProp(default=u'None'), 'env': DictProp(default={}), 'module_type': StringProp(default=u'fork', fill_brok=['full_status']), 'my_worker': StringProp(default=u'none'), 'command': StringProp(default=''), 'timeout': IntegerProp(default=10), 'ref': StringProp(default=u'unset'), 'ref_type': StringProp(default=u'unset'), 'my_scheduler': StringProp(default=u'unassigned'), } def __init__(self, params=None, parsing=False): super(ActionBase, self).__init__(params, parsing=parsing) # Set a creation time only if not provided if not params or 'creation_time' not in params: self.creation_time = time.time() # Set actions log only if not provided if not params or 'log_actions' not in params: self.log_actions = 'ALIGNAK_LOG_ACTIONS' in os.environ # Fill default parameters self.fill_default() def is_launchable(self, timestamp): """Check if this action can be launched based on current time :param timestamp: time to compare :type timestamp: int :return: True if timestamp >= self.t_to_go, False otherwise :rtype: bool """ if self.t_to_go is None: return False return timestamp >= self.t_to_go def get_local_environnement(self): """ Mix the environment and the environment variables into a new local environment dictionary Note: We cannot just update the global os.environ because this would effect all other checks. :return: local environment variables :rtype: dict """ # Do not use copy.copy() here, as the resulting copy still # changes the real environment (it is still a os._Environment # instance). local_env = os.environ.copy() for local_var in self.env: local_env[local_var] = self.env[local_var] return local_env def execute(self): """Start this action command in a subprocess. :raise: ActionError 'toomanyopenfiles' if too many opened files on the system 'no_process_launched' if arguments parsing failed 'process_launch_failed': if the process launch failed :return: reference to the started process :rtype: psutil.Process """ self.status = ACT_STATUS_LAUNCHED self.check_time = time.time() self.wait_time = 0.0001 self.last_poll = self.check_time # Get a local env variables with our additional values self.local_env = self.get_local_environnement() # Initialize stdout and stderr. self.stdoutdata = '' self.stderrdata = '' logger.debug("Launch command: '%s', ref: %s, timeout: %s", self.command, self.ref, self.timeout) if self.log_actions: if os.environ['ALIGNAK_LOG_ACTIONS'] == 'WARNING': logger.warning("Launch command: '%s'", self.command) else: logger.info("Launch command: '%s'", self.command) return self._execute() # OS specific part def get_outputs(self, out, max_plugins_output_length): """Get check outputs from single output (split perfdata etc). Updates output, perf_data and long_output attributes. :param out: output data of a check :type out: str :param max_output: max plugin data length :type max_output: int :return: None """ # Squeeze all output after max_plugins_output_length out = out[:max_plugins_output_length] # manage escaped pipes out = out.replace(r'\|', '___PROTECT_PIPE___') # Then cuts by lines elts = out.split('\n') # For perf data elts_line1 = elts[0].split('|') # First line before | is output, strip it self.output = elts_line1[0].strip().replace('___PROTECT_PIPE___', '|') try: self.output = self.output.decode('utf8', 'ignore') except UnicodeEncodeError: pass except AttributeError: pass # Init perfdata as empty self.perf_data = '' # After | it is perfdata, strip it if len(elts_line1) > 1: self.perf_data = elts_line1[1].strip().replace( '___PROTECT_PIPE___', '|') # Now manage others lines. Before the | it's long_output # And after it's all perf_data, \n joined long_output = [] in_perfdata = False for line in elts[1:]: # if already in perfdata, direct append if in_perfdata: self.perf_data += ' ' + line.strip().replace( '___PROTECT_PIPE___', '|') else: # not already in perf_data, search for the | part :) elts = line.split('|', 1) # The first part will always be long_output long_output.append(elts[0].strip().replace( '___PROTECT_PIPE___', '|')) if len(elts) > 1: in_perfdata = True self.perf_data += ' ' + elts[1].strip().replace( '___PROTECT_PIPE___', '|') # long_output is all non output and performance data, joined with \n self.long_output = '\n'.join(long_output) # Get sure the performance data are stripped self.perf_data = self.perf_data.strip() logger.debug("Command result for '%s': %d, %s", self.command, self.exit_status, self.output) if self.log_actions: if os.environ['ALIGNAK_LOG_ACTIONS'] == 'WARNING': logger.warning("Check result for '%s': %d, %s", self.command, self.exit_status, self.output) if self.perf_data: logger.warning("Performance data for '%s': %s", self.command, self.perf_data) else: logger.info("Check result for '%s': %d, %s", self.command, self.exit_status, self.output) if self.perf_data: logger.info("Performance data for '%s': %s", self.command, self.perf_data) def check_finished(self, max_plugins_output_length): # pylint: disable=too-many-branches """Handle action if it is finished (get stdout, stderr, exit code...) :param max_plugins_output_length: max plugin data length :type max_plugins_output_length: int :return: None """ self.last_poll = time.time() _, _, child_utime, child_stime, _ = os.times() # Not yet finished... if self.process.poll() is None: # We must wait, but checks are variable in time so we do not wait the same # for a little check or a long ping. So we do like TCP: slow start with a very # shot time (0.0001 s) increased *2 but do not wait more than 0.5 s. self.wait_time = min(self.wait_time * 2, 0.5) now = time.time() # This log is really spamming... uncomment if you really need this information :) # logger.debug("%s - Process pid=%d is still alive", now, self.process.pid) # Get standard outputs in non blocking mode from the process streams stdout = no_block_read(self.process.stdout) stderr = no_block_read(self.process.stderr) try: self.stdoutdata += stdout.decode("utf-8") self.stderrdata += stderr.decode("utf-8") except AttributeError: pass if (now - self.check_time) > self.timeout: logger.warning( "Process pid=%d spent too much time: %.2f seconds", self.process.pid, now - self.check_time) self._in_timeout = True self._kill() self.status = ACT_STATUS_TIMEOUT self.execution_time = now - self.check_time self.exit_status = 3 if self.log_actions: if os.environ['ALIGNAK_LOG_ACTIONS'] == 'WARNING': logger.warning("Action '%s' exited on timeout (%d s)", self.command, self.timeout) else: logger.info("Action '%s' exited on timeout (%d s)", self.command, self.timeout) # Do not keep the process objcet del self.process # Replace stdout with stderr if stdout is empty self.stdoutdata = self.stdoutdata.strip() if not self.stdoutdata: self.stdoutdata = self.stderrdata # Now grep what we want in the output self.get_outputs(self.stdoutdata, max_plugins_output_length) # We can clean the useless properties now del self.stdoutdata del self.stderrdata # Get the user and system time _, _, n_child_utime, n_child_stime, _ = os.times() self.u_time = n_child_utime - child_utime self.s_time = n_child_stime - child_stime return return logger.debug("Process pid=%d exited with %d", self.process.pid, self.process.returncode) if fcntl: # Get standard outputs in non blocking mode from the process streams stdout = no_block_read(self.process.stdout) stderr = no_block_read(self.process.stderr) else: # Get standard outputs from the communicate function (stdout, stderr) = self.process.communicate() try: self.stdoutdata += stdout.decode("utf-8") except (UnicodeDecodeError, AttributeError): self.stdoutdata += stdout try: self.stderrdata += stderr.decode("utf-8") except (UnicodeDecodeError, AttributeError): self.stderrdata += stderr self.exit_status = self.process.returncode if self.log_actions: if os.environ['ALIGNAK_LOG_ACTIONS'] == 'WARNING': logger.warning("Action '%s' exited with code %d", self.command, self.exit_status) else: logger.info("Action '%s' exited with code %d", self.command, self.exit_status) # We do not need the process now del self.process # check for bad syntax in command line: if (self.stderrdata.find('sh: -c: line 0: unexpected EOF') >= 0 or (self.stderrdata.find('sh: -c: ') >= 0 and self.stderrdata.find(': Syntax') >= 0 or self.stderrdata.find('Syntax error: Unterminated quoted string') >= 0)): logger.warning("Bad syntax in command line!") # Very, very ugly. But subprocess._handle_exitstatus does # not see a difference between a regular "exit 1" and a # bailing out shell. Strange, because strace clearly shows # a difference. (exit_group(1) vs. exit_group(257)) self.stdoutdata = self.stdoutdata + self.stderrdata self.exit_status = 3 # Make sure that exit code is a valid exit code if self.exit_status not in VALID_EXIT_STATUS: self.exit_status = 3 # Replace stdout with stderr if stdout is empty self.stdoutdata = self.stdoutdata.strip() if not self.stdoutdata: self.stdoutdata = self.stderrdata # Now grep what we want in the output self.get_outputs(self.stdoutdata, max_plugins_output_length) # We can clean the useless properties now del self.stdoutdata del self.stderrdata self.status = ACT_STATUS_DONE self.execution_time = time.time() - self.check_time # Also get the system and user times _, _, n_child_utime, n_child_stime, _ = os.times() self.u_time = n_child_utime - child_utime self.s_time = n_child_stime - child_stime def copy_shell__(self, new_i): """Create all attributes listed in 'ONLY_COPY_PROP' and return `self` with these attributes. :param new_i: object to :type new_i: object :return: object with new properties added :rtype: object """ for prop in ONLY_COPY_PROP: setattr(new_i, prop, getattr(self, prop)) return new_i def got_shell_characters(self): """Check if the command_attribute (command line) has shell characters Shell characters are : '!', '$', '^', '&', '*', '(', ')', '~', '[', ']', '|', '{', '}', ';', '<', '>', '?', '`' :return: True if one shell character is found, False otherwise :rtype: bool """ return any(c in SHELLCHARS for c in self.command) def _execute(self, force_shell=False): """Execute action in a subprocess :return: None """ pass def _kill(self): """Kill the action and close fds :return: None """ pass
class Acknowledge(FusionsupervisionObject): # pylint: disable=R0903 """ Allows you to acknowledge the current problem for the specified service. By acknowledging the current problem, future notifications (for the same service state) are disabled. If the acknowledge is "sticky", the acknowledgement will remain until the service returns to an OK state. Otherwise the acknowledgement will automatically be removed when the service state changes. If the acknowledge is "notify", a notification will be sent out to contacts indicating that the current service problem has been acknowledged and when the acknowledge is cleared. """ my_type = 'acknowledge' properties = { 'sticky': BoolProp(default=True), 'notify': BoolProp(default=False), 'end_time': IntegerProp(default=0), 'author': StringProp(default=u'FusionSupervision Engine'), 'comment': StringProp(default=u''), 'comment_id': StringProp(default=u'') } def __init__(self, params=None, parsing=False): super(Acknowledge, self).__init__(params, parsing=parsing) self.fill_default() def serialize(self): """This function serialize into a simple dict object. It is used when transferring data to other daemons over the network (http) Here we directly return all attributes :return: json representation of a Acknowledge :rtype: dict """ return { 'uuid': self.uuid, 'ref': self.ref, 'sticky': self.sticky, 'notify': self.notify, 'end_time': self.end_time, 'author': self.author, 'comment': self.comment } def get_raise_brok(self, host_name, service_name=''): """Get a start acknowledge brok :param host_name: :param service_name: :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ data = self.serialize() data['host'] = host_name if service_name != '': data['service'] = service_name return Brok({'type': 'acknowledge_raise', 'data': data}) def get_expire_brok(self, host_name, service_name=''): """Get an expire acknowledge brok :type item: item :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ data = self.serialize() data['host'] = host_name if service_name != '': data['service'] = service_name return Brok({'type': 'acknowledge_expire', 'data': data})
class Timeperiod(Item): """ Class to manage a timeperiod A timeperiod is defined with range time (hours) of week to do action and add day exceptions (like non working days) """ my_type = 'timeperiod' properties = Item.properties.copy() properties.update({ 'timeperiod_name': StringProp(fill_brok=['full_status']), 'alias': StringProp(default=u'', fill_brok=['full_status']), 'use': ListProp(default=[]), 'register': IntegerProp(default=1), # These are needed if a broker module calls methods on timeperiod objects 'dateranges': ListProp(default=[], fill_brok=['full_status']), 'exclude': ListProp(default=[], fill_brok=['full_status']), 'unresolved': ListProp(default=[], fill_brok=['full_status']), 'invalid_entries': ListProp(default=[], fill_brok=['full_status']), 'is_active': BoolProp(default=False), 'activated_once': BoolProp(default=False), }) running_properties = Item.running_properties.copy() def __init__(self, params=None, parsing=True): if params is None: params = {} # Get standard params standard_params = dict([(k, v) for k, v in list(params.items()) if k in self.__class__.properties]) # Get timeperiod params (monday, tuesday, ...) timeperiod_params = dict([(k, v) for k, v in list(params.items()) if k not in self.__class__.properties]) if 'dateranges' in standard_params and isinstance(standard_params['dateranges'], list) \ and standard_params['dateranges'] \ and isinstance(standard_params['dateranges'][0], dict): new_list = [] for elem in standard_params['dateranges']: cls = get_fusionsupervision_class( elem['__sys_python_module__']) if cls: new_list.append(cls(elem['content'])) # We recreate the object self.dateranges = new_list # And remove prop, to prevent from being overridden del standard_params['dateranges'] # Handle standard params super(Timeperiod, self).__init__(params=standard_params, parsing=parsing) self.cache = {} # For tuning purpose only self.invalid_cache = {} # same but for invalid search # We use the uuid presence to assume we are reserializing if 'uuid' in params: self.uuid = params['uuid'] else: # Initial creation here, uuid already created in super self.unresolved = [] self.dateranges = [] self.exclude = [] self.invalid_entries = [] self.is_active = False self.activated_once = False # Handle timeperiod params for key, value in list(timeperiod_params.items()): if isinstance(value, list): if value: value = value[-1] else: value = '' self.unresolved.append(key + ' ' + value) def serialize(self): """This function serialize into a simple dict object. It is used when transferring data to other daemons over the network (http) Here we directly return all attributes :return: json representation of a Timeperiod :rtype: dict """ res = super(Timeperiod, self).serialize() res['dateranges'] = [] for elem in self.dateranges: res['dateranges'].append({ '__sys_python_module__': "%s.%s" % (elem.__module__, elem.__class__.__name__), 'content': elem.serialize() }) return res def get_name(self): """ Get the name of the timeperiod :return: the timeperiod name string :rtype: str """ return getattr(self, 'timeperiod_name', 'unknown_timeperiod') def get_raw_import_values(self): # pragma: no cover, deprecation """ Get some properties of timeperiod (timeperiod is a bit different from classic item) TODO: never called anywhere, still useful? :return: a dictionnary of some properties :rtype: dict """ properties = ['timeperiod_name', 'alias', 'use', 'register'] res = {} for prop in properties: if hasattr(self, prop): val = getattr(self, prop) res[prop] = val # Now the unresolved one. The only way to get ride of same key things is to put # directly the full value as the key for other in self.unresolved: res[other] = '' return res def is_time_valid(self, timestamp): """ Check if a time is valid or not :return: time is valid or not :rtype: bool """ if hasattr(self, 'exclude'): for daterange in self.exclude: if daterange.is_time_valid(timestamp): return False for daterange in self.dateranges: if daterange.is_time_valid(timestamp): return True return False # will give the first time > t which is valid def get_min_from_t(self, timestamp): """ Get the first time > timestamp which is valid :param timestamp: number of seconds :type timestamp: int :return: number of seconds :rtype: int TODO: not used, so delete it """ mins_incl = [] for daterange in self.dateranges: mins_incl.append(daterange.get_min_from_t(timestamp)) return min(mins_incl) # will give the first time > t which is not valid def get_not_in_min_from_t(self, first): """ :return: None TODO: not used, so delete it """ pass def find_next_valid_time_from_cache(self, timestamp): """ Get the next valid time from cache :param timestamp: number of seconds :type timestamp: int :return: Nothing or time in seconds :rtype: None or int """ try: return self.cache[timestamp] except KeyError: return None def find_next_invalid_time_from_cache(self, timestamp): """ Get the next invalid time from cache :param timestamp: number of seconds :type timestamp: int :return: Nothing or time in seconds :rtype: None or int """ try: return self.invalid_cache[timestamp] except KeyError: return None def check_and_log_activation_change(self): """ Will look for active/un-active change of timeperiod. In case it change, we log it like: [1327392000] TIMEPERIOD TRANSITION: <name>;<from>;<to> States of is_active: -1: default value when start 0: when timeperiod end 1: when timeperiod start :return: None or a brok if TP changed """ now = int(time.time()) was_active = self.is_active self.is_active = self.is_time_valid(now) # If we got a change, log it! if self.is_active != was_active: _from = 0 _to = 0 # If it's the start, get a special value for was if not self.activated_once: _from = -1 self.activated_once = True if was_active: _from = 1 if self.is_active: _to = 1 # Now raise the log brok = make_monitoring_log( 'info', 'TIMEPERIOD TRANSITION: %s;%d;%d' % (self.get_name(), _from, _to)) return brok return None def clean_cache(self): """ Clean cache with entries older than now because not used in future ;) :return: None """ now = int(time.time()) t_to_del = [] for timestamp in self.cache: if timestamp < now: t_to_del.append(timestamp) for timestamp in t_to_del: del self.cache[timestamp] # same for the invalid cache t_to_del = [] for timestamp in self.invalid_cache: if timestamp < now: t_to_del.append(timestamp) for timestamp in t_to_del: del self.invalid_cache[timestamp] def get_next_valid_time_from_t(self, timestamp): # pylint: disable=too-many-branches """ Get next valid time. If it's in cache, get it, otherwise define it. The limit to find it is 1 year. :param timestamp: number of seconds :type timestamp: int or float :return: Nothing or time in seconds :rtype: None or int """ timestamp = int(timestamp) original_t = timestamp res_from_cache = self.find_next_valid_time_from_cache(timestamp) if res_from_cache is not None: return res_from_cache still_loop = True # Loop for all minutes... while still_loop: local_min = None # Ok, not in cache... dr_mins = [] for daterange in self.dateranges: dr_mins.append(daterange.get_next_valid_time_from_t(timestamp)) s_dr_mins = sorted([d for d in dr_mins if d is not None]) for t01 in s_dr_mins: if not self.exclude and still_loop: # No Exclude so we are good local_min = t01 still_loop = False else: for timeperiod in self.exclude: if not timeperiod.is_time_valid(t01) and still_loop: # OK we found a date that is not valid in any exclude timeperiod local_min = t01 still_loop = False if local_min is None: # Looking for next invalid date exc_mins = [] if s_dr_mins != []: for timeperiod in self.exclude: exc_mins.append( timeperiod.get_next_invalid_time_from_t( s_dr_mins[0])) s_exc_mins = sorted([d for d in exc_mins if d is not None]) if s_exc_mins != []: local_min = s_exc_mins[0] if local_min is None: still_loop = False else: timestamp = local_min # No loop more than one year if timestamp > original_t + 3600 * 24 * 366 + 1: still_loop = False local_min = None # Ok, we update the cache... self.cache[original_t] = local_min return local_min def get_next_invalid_time_from_t(self, timestamp): # pylint: disable=too-many-branches """ Get the next invalid time :param timestamp: timestamp in seconds (of course) :type timestamp: int or float :return: timestamp of next invalid time :rtype: int or float """ timestamp = int(timestamp) original_t = timestamp dr_mins = [] for daterange in self.dateranges: timestamp = original_t cont = True while cont: start = daterange.get_next_valid_time_from_t(timestamp) if start is not None: end = daterange.get_next_invalid_time_from_t(start) dr_mins.append((start, end)) timestamp = end else: cont = False if timestamp > original_t + (3600 * 24 * 365): cont = False periods = merge_periods(dr_mins) # manage exclude periods dr_mins = [] for exclude in self.exclude: for daterange in exclude.dateranges: timestamp = original_t cont = True while cont: start = daterange.get_next_valid_time_from_t(timestamp) if start is not None: end = daterange.get_next_invalid_time_from_t(start) dr_mins.append((start, end)) timestamp = end else: cont = False if timestamp > original_t + (3600 * 24 * 365): cont = False if not dr_mins: periods_exclude = [] else: periods_exclude = merge_periods(dr_mins) if len(periods) >= 1: # if first valid period is after original timestamp, the first invalid time # is the original timestamp if periods[0][0] > original_t: return original_t # check the first period + first period of exclude if len(periods_exclude) >= 1: if periods_exclude[0][0] < periods[0][1]: return periods_exclude[0][0] return periods[0][1] return original_t def is_correct(self): """Check if this object configuration is correct :: * Check if dateranges of timeperiod are valid * Call our parent class is_correct checker :return: True if the configuration is correct, otherwise False if at least one daterange is not correct :rtype: bool """ state = True for daterange in self.dateranges: good = daterange.is_correct() if not good: self.add_error("[timeperiod::%s] invalid daterange '%s'" % (self.get_name(), daterange)) state &= good # Warn about non correct entries for entry in self.invalid_entries: self.add_error("[timeperiod::%s] invalid entry '%s'" % (self.get_name(), entry)) return super(Timeperiod, self).is_correct() and state def __str__(self): # pragma: no cover """ Get readable object :return: this object in readable format :rtype: str """ string = '' string += str(self.__dict__) + '\n' for elt in self.dateranges: string += str(elt) (start, end) = elt.get_start_and_end_time() start = time.asctime(time.localtime(start)) end = time.asctime(time.localtime(end)) string += "\nStart and end:" + str((start, end)) string += '\nExclude' for elt in self.exclude: string += str(elt) return string def resolve_daterange(self, dateranges, entry): # pylint: disable=too-many-return-statements,too-many-statements, # pylint: disable=too-many-branches,too-many-locals """ Try to solve dateranges (special cases) :param dateranges: dateranges :type dateranges: list :param entry: property of timeperiod :type entry: string :return: None """ res = re.search( r'(\d{4})-(\d{2})-(\d{2}) - (\d{4})-(\d{2})-(\d{2}) / (\d+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (syear, smon, smday, eyear, emon, emday, skip_interval, other) = res.groups() data = { 'syear': syear, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': eyear, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': skip_interval, 'other': other } dateranges.append(CalendarDaterange(data)) return res = re.search(r'(\d{4})-(\d{2})-(\d{2}) / (\d+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (syear, smon, smday, skip_interval, other) = res.groups() eyear = syear emon = smon emday = smday data = { 'syear': syear, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': eyear, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': skip_interval, 'other': other } dateranges.append(CalendarDaterange(data)) return res = re.search( r'(\d{4})-(\d{2})-(\d{2}) - (\d{4})-(\d{2})-(\d{2})[\s\t]*([0-9:, -]+)', entry) if res is not None: (syear, smon, smday, eyear, emon, emday, other) = res.groups() data = { 'syear': syear, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': eyear, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(CalendarDaterange(data)) return res = re.search(r'(\d{4})-(\d{2})-(\d{2})[\s\t]*([0-9:, -]+)', entry) if res is not None: (syear, smon, smday, other) = res.groups() eyear = syear emon = smon emday = smday data = { 'syear': syear, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': eyear, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(CalendarDaterange(data)) return res = re.search( r'([a-z]*) ([\d-]+) ([a-z]*) - ([a-z]*) ([\d-]+) ([a-z]*) / (\d+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (swday, swday_offset, smon, ewday, ewday_offset, emon, skip_interval, other) = res.groups() smon_id = Daterange.get_month_id(smon) emon_id = Daterange.get_month_id(emon) swday_id = Daterange.get_weekday_id(swday) ewday_id = Daterange.get_weekday_id(ewday) data = { 'syear': 0, 'smon': smon_id, 'smday': 0, 'swday': swday_id, 'swday_offset': swday_offset, 'eyear': 0, 'emon': emon_id, 'emday': 0, 'ewday': ewday_id, 'ewday_offset': ewday_offset, 'skip_interval': skip_interval, 'other': other } dateranges.append(MonthWeekDayDaterange(data)) return res = re.search( r'([a-z]*) ([\d-]+) - ([a-z]*) ([\d-]+) / (\d+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (t00, smday, t01, emday, skip_interval, other) = res.groups() if t00 in Daterange.weekdays and t01 in Daterange.weekdays: swday = Daterange.get_weekday_id(t00) ewday = Daterange.get_weekday_id(t01) swday_offset = smday ewday_offset = emday data = { 'syear': 0, 'smon': 0, 'smday': 0, 'swday': swday, 'swday_offset': swday_offset, 'eyear': 0, 'emon': 0, 'emday': 0, 'ewday': ewday, 'ewday_offset': ewday_offset, 'skip_interval': skip_interval, 'other': other } dateranges.append(WeekDayDaterange(data)) return if t00 in Daterange.months and t01 in Daterange.months: smon = Daterange.get_month_id(t00) emon = Daterange.get_month_id(t01) data = { 'syear': 0, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': skip_interval, 'other': other } dateranges.append(MonthDateDaterange(data)) return if t00 == 'day' and t01 == 'day': data = { 'syear': 0, 'smon': 0, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': 0, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': skip_interval, 'other': other } dateranges.append(MonthDayDaterange(data)) return res = re.search( r'([a-z]*) ([\d-]+) - ([\d-]+) / (\d+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (t00, smday, emday, skip_interval, other) = res.groups() if t00 in Daterange.weekdays: swday = Daterange.get_weekday_id(t00) swday_offset = smday ewday = swday ewday_offset = emday data = { 'syear': 0, 'smon': 0, 'smday': 0, 'swday': swday, 'swday_offset': swday_offset, 'eyear': 0, 'emon': 0, 'emday': 0, 'ewday': ewday, 'ewday_offset': ewday_offset, 'skip_interval': skip_interval, 'other': other } dateranges.append(WeekDayDaterange(data)) return if t00 in Daterange.months: smon = Daterange.get_month_id(t00) emon = smon data = { 'syear': 0, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': skip_interval, 'other': other } dateranges.append(MonthDateDaterange(data)) return if t00 == 'day': data = { 'syear': 0, 'smon': 0, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': 0, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': skip_interval, 'other': other } dateranges.append(MonthDayDaterange(data)) return res = re.search( r'([a-z]*) ([\d-]+) ([a-z]*) - ([a-z]*) ([\d-]+) ([a-z]*) [\s\t]*([0-9:, -]+)', entry) if res is not None: (swday, swday_offset, smon, ewday, ewday_offset, emon, other) = res.groups() smon_id = Daterange.get_month_id(smon) emon_id = Daterange.get_month_id(emon) swday_id = Daterange.get_weekday_id(swday) ewday_id = Daterange.get_weekday_id(ewday) data = { 'syear': 0, 'smon': smon_id, 'smday': 0, 'swday': swday_id, 'swday_offset': swday_offset, 'eyear': 0, 'emon': emon_id, 'emday': 0, 'ewday': ewday_id, 'ewday_offset': ewday_offset, 'skip_interval': 0, 'other': other } dateranges.append(MonthWeekDayDaterange(data)) return res = re.search(r'([a-z]*) ([\d-]+) - ([\d-]+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (t00, smday, emday, other) = res.groups() if t00 in Daterange.weekdays: swday = Daterange.get_weekday_id(t00) swday_offset = smday ewday = swday ewday_offset = emday data = { 'syear': 0, 'smon': 0, 'smday': 0, 'swday': swday, 'swday_offset': swday_offset, 'eyear': 0, 'emon': 0, 'emday': 0, 'ewday': ewday, 'ewday_offset': ewday_offset, 'skip_interval': 0, 'other': other } dateranges.append(WeekDayDaterange(data)) return if t00 in Daterange.months: smon = Daterange.get_month_id(t00) emon = smon data = { 'syear': 0, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(MonthDateDaterange(data)) return if t00 == 'day': data = { 'syear': 0, 'smon': 0, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': 0, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(MonthDayDaterange(data)) return res = re.search( r'([a-z]*) ([\d-]+) - ([a-z]*) ([\d-]+)[\s\t]*([0-9:, -]+)', entry) if res is not None: (t00, smday, t01, emday, other) = res.groups() if t00 in Daterange.weekdays and t01 in Daterange.weekdays: swday = Daterange.get_weekday_id(t00) ewday = Daterange.get_weekday_id(t01) swday_offset = smday ewday_offset = emday data = { 'syear': 0, 'smon': 0, 'smday': 0, 'swday': swday, 'swday_offset': swday_offset, 'eyear': 0, 'emon': 0, 'emday': 0, 'ewday': ewday, 'ewday_offset': ewday_offset, 'skip_interval': 0, 'other': other } dateranges.append(WeekDayDaterange(data)) return if t00 in Daterange.months and t01 in Daterange.months: smon = Daterange.get_month_id(t00) emon = Daterange.get_month_id(t01) data = { 'syear': 0, 'smon': smon, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(MonthDateDaterange(data)) return if t00 == 'day' and t01 == 'day': data = { 'syear': 0, 'smon': 0, 'smday': smday, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': 0, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(MonthDayDaterange(data)) return res = re.search(r'([a-z]*) ([\d-]+) ([a-z]*)[\s\t]*([0-9:, -]+)', entry) if res is not None: (t00, t02, t01, other) = res.groups() if t00 in Daterange.weekdays and t01 in Daterange.months: swday = Daterange.get_weekday_id(t00) smon = Daterange.get_month_id(t01) emon = smon ewday = swday ewday_offset = t02 data = { 'syear': 0, 'smon': smon, 'smday': 0, 'swday': swday, 'swday_offset': t02, 'eyear': 0, 'emon': emon, 'emday': 0, 'ewday': ewday, 'ewday_offset': ewday_offset, 'skip_interval': 0, 'other': other } dateranges.append(MonthWeekDayDaterange(data)) return if not t01: if t00 in Daterange.weekdays: swday = Daterange.get_weekday_id(t00) swday_offset = t02 ewday = swday ewday_offset = swday_offset data = { 'syear': 0, 'smon': 0, 'smday': 0, 'swday': swday, 'swday_offset': swday_offset, 'eyear': 0, 'emon': 0, 'emday': 0, 'ewday': ewday, 'ewday_offset': ewday_offset, 'skip_interval': 0, 'other': other } dateranges.append(WeekDayDaterange(data)) return if t00 in Daterange.months: smon = Daterange.get_month_id(t00) emon = smon emday = t02 data = { 'syear': 0, 'smon': smon, 'smday': t02, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': emon, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(MonthDateDaterange(data)) return if t00 == 'day': emday = t02 data = { 'syear': 0, 'smon': 0, 'smday': t02, 'swday': 0, 'swday_offset': 0, 'eyear': 0, 'emon': 0, 'emday': emday, 'ewday': 0, 'ewday_offset': 0, 'skip_interval': 0, 'other': other } dateranges.append(MonthDayDaterange(data)) return res = re.search(r'([a-z]*)[\s\t]+([0-9:, -]+)', entry) if res is not None: (t00, other) = res.groups() if t00 in Daterange.weekdays: day = t00 data = {'day': day, 'other': other} dateranges.append(StandardDaterange(data)) return logger.info("[timeentry::%s] no match for %s", self.get_name(), entry) self.invalid_entries.append(entry) def apply_inheritance(self): """ Inherite no properties and no custom variables for timeperiod :return: None """ pass def explode(self): """ Try to resolve all unresolved elements :return: None """ for entry in self.unresolved: self.resolve_daterange(self.dateranges, entry) self.unresolved = [] def linkify(self, timeperiods): """ Will make timeperiod in exclude with id of the timeperiods :param timeperiods: Timeperiods object :type timeperiods: :return: None """ new_exclude = [] if hasattr(self, 'exclude') and self.exclude != []: logger.debug("[timeentry::%s] have excluded %s", self.get_name(), self.exclude) excluded_tps = self.exclude for tp_name in excluded_tps: timepriod = timeperiods.find_by_name(tp_name.strip()) if timepriod is not None: new_exclude.append(timepriod.uuid) else: msg = "[timeentry::%s] unknown %s timeperiod" % ( self.get_name(), tp_name) self.add_error(msg) self.exclude = new_exclude def check_exclude_rec(self): # pylint: disable=access-member-before-definition """ Check if this timeperiod is tagged :return: if tagged return false, if not true :rtype: bool """ if self.rec_tag: msg = "[timeentry::%s] is in a loop in exclude parameter" % ( self.get_name()) self.add_error(msg) return False self.rec_tag = True for timeperiod in self.exclude: timeperiod.check_exclude_rec() return True def fill_data_brok_from(self, data, brok_type): """ Add timeperiods from brok :param data: timeperiod dictionnary :type data: dict :param brok_type: brok type :type brok_type: string :return: None """ cls = self.__class__ # Now config properties for prop, entry in list(cls.properties.items()): # Is this property intended for broking? # if 'fill_brok' in entry: if brok_type in entry.fill_brok: if hasattr(self, prop): data[prop] = getattr(self, prop) elif entry.has_default: data[prop] = entry.default
class Module(Item): """ Class to manage a module """ my_type = 'module' properties = Item.properties.copy() properties.update({ 'name': StringProp(default=u'unset'), 'type': StringProp(default=u'unset'), 'daemon': StringProp(default=u'unset'), 'python_name': StringProp(), 'enabled': BoolProp(default=True), # Old "deprecated" property - replaced with name 'module_alias': StringProp(), # Old "deprecated" property - replaced with type 'module_types': ListProp(default=[u''], split_on_comma=True), # Allow a module to be related some other modules 'modules': ListProp(default=[''], split_on_comma=True), # Module log level 'log_level': StringProp(default=u'INFO'), # Local statsd daemon for collecting daemon metrics 'statsd_host': StringProp(default=u'localhost'), 'statsd_port': IntegerProp(default=8125), 'statsd_prefix': StringProp(default=u'fusionsupervision'), 'statsd_enabled': BoolProp(default=False) }) macros = {} def __init__(self, params=None, parsing=True): # Must be declared in this function rather than as class variable. This because the # modules may have some properties that are not the same from one instance to another. # Other objects very often have the same properties... but not the modules! self.properties = Item.properties.copy() self.properties.update({ 'name': StringProp(default=u'unset'), 'type': StringProp(default=u'unset'), 'daemon': StringProp(default=u'unset'), 'python_name': StringProp(), # Old "deprecated" property - replaced with name 'module_alias': StringProp(), # Old "deprecated" property - replaced with type 'module_types': ListProp(default=[''], split_on_comma=True), # Allow a module to be related some other modules 'modules': ListProp(default=[''], split_on_comma=True), 'enabled': BoolProp(default=True), # Module log level 'log_level': StringProp(default=u'INFO'), # Local statsd daemon for collecting daemon metrics 'statsd_host': StringProp(default=u'localhost'), 'statsd_port': IntegerProp(default=8125), 'statsd_prefix': StringProp(default=u'fusionsupervision'), 'statsd_enabled': BoolProp(default=False) }) # Manage the missing module name if params and 'name' not in params: if 'module_alias' in params: params['name'] = params['module_alias'] else: params['name'] = "Unnamed" if params and 'module_alias' not in params: if 'name' in params: params['module_alias'] = params['name'] else: params['module_alias'] = "Unnamed" super(Module, self).__init__(params, parsing=parsing) self.fill_default() # Remove extra Item base class properties... for prop in [ 'customs', 'plus', 'downtimes', 'old_properties', 'configuration_errors', 'configuration_warnings' ]: if getattr(self, prop, None): delattr(self, prop) def __repr__(self): # pragma: no cover return '<%r %r, module: %r, type(s): %r />' % \ (self.__class__.__name__, self.name, getattr(self, 'python_name', 'Unknown'), getattr(self, 'type', 'Unknown')) __str__ = __repr__ def get_name(self): """ Get name of module :return: Name of module :rtype: str """ return getattr(self, 'name', self.module_alias) def get_types(self): """ Get types of the module :return: Types of the module :rtype: str """ return getattr(self, 'module_types', 'Untyped module') def is_a_module(self, module_type): """ Is the module of the required type? :param module_type: module type to check :type: str :return: True / False """ if hasattr(self, 'type'): return module_type in self.type return module_type in self.module_types def serialize(self): """A module may have some properties that are not defined in the class properties list. Serializing a module is the same as serializing an Item but we also also include all the existing properties that are not defined in the properties or running_properties class list. We must also exclude the reference to the daemon that loaded the module! """ res = super(Module, self).serialize() cls = self.__class__ for prop in self.__dict__: if prop in cls.properties or prop in cls.running_properties or prop in [ 'properties', 'my_daemon' ]: continue res[prop] = getattr(self, prop) return res
def __init__(self, params=None, parsing=True): # Must be declared in this function rather than as class variable. This because the # modules may have some properties that are not the same from one instance to another. # Other objects very often have the same properties... but not the modules! self.properties = Item.properties.copy() self.properties.update({ 'name': StringProp(default=u'unset'), 'type': StringProp(default=u'unset'), 'daemon': StringProp(default=u'unset'), 'python_name': StringProp(), # Old "deprecated" property - replaced with name 'module_alias': StringProp(), # Old "deprecated" property - replaced with type 'module_types': ListProp(default=[''], split_on_comma=True), # Allow a module to be related some other modules 'modules': ListProp(default=[''], split_on_comma=True), 'enabled': BoolProp(default=True), # Module log level 'log_level': StringProp(default=u'INFO'), # Local statsd daemon for collecting daemon metrics 'statsd_host': StringProp(default=u'localhost'), 'statsd_port': IntegerProp(default=8125), 'statsd_prefix': StringProp(default=u'fusionsupervision'), 'statsd_enabled': BoolProp(default=False) }) # Manage the missing module name if params and 'name' not in params: if 'module_alias' in params: params['name'] = params['module_alias'] else: params['name'] = "Unnamed" if params and 'module_alias' not in params: if 'name' in params: params['module_alias'] = params['name'] else: params['module_alias'] = "Unnamed" super(Module, self).__init__(params, parsing=parsing) self.fill_default() # Remove extra Item base class properties... for prop in [ 'customs', 'plus', 'downtimes', 'old_properties', 'configuration_errors', 'configuration_warnings' ]: if getattr(self, prop, None): delattr(self, prop)
class Resultmodulation(Item): """Resultmodulation class is simply a modulation of a check result exit code during a modulation_period. """ my_type = 'resultmodulation' properties = Item.properties.copy() properties.update({ 'resultmodulation_name': StringProp(), 'exit_codes_match': IntListProp(default=[]), 'exit_code_modulation': IntegerProp(default=None), 'modulation_period': StringProp(default=None), }) special_properties = ('modulation_period', ) def get_name(self): """Accessor to resultmodulation_name attribute :return: result modulation name :rtype: str """ if hasattr(self, 'resultmodulation_name'): return self.resultmodulation_name return 'Unnamed' def is_active(self, timperiods): """ Know if this result modulation is active now :return: True is we are in the period, otherwise False :rtype: bool """ now = int(time.time()) timperiod = timperiods[self.modulation_period] if not timperiod or timperiod.is_time_valid(now): return True return False def module_return(self, return_code, timeperiods): """Module the exit code if necessary :: * modulation_period is legit * exit_code_modulation * return_code in exit_codes_match :param return_code: actual code returned by the check :type return_code: int :return: return_code modulated if necessary (exit_code_modulation) :rtype: int """ # Only if in modulation_period of modulation_period == None if self.is_active(timeperiods): # Try to change the exit code only if a new one is defined if self.exit_code_modulation is not None: # First with the exit_code_match if return_code in self.exit_codes_match: return_code = self.exit_code_modulation return return_code
class Command(Item): """ Class to manage a command A command is an external command that a poller module runs to check if something is ok or not """ __metaclass__ = AutoSlots my_type = "command" properties = Item.properties.copy() properties.update({ 'command_name': StringProp(fill_brok=['full_status']), 'command_line': StringProp(fill_brok=['full_status']), 'poller_tag': StringProp(default=u'None'), 'reactionner_tag': StringProp(default=u'None'), 'module_type': StringProp(default=None), 'timeout': IntegerProp(default=-1), 'enable_environment_macros': BoolProp(default=False), }) def __init__(self, params=None, parsing=True): if params is None: params = {} super(Command, self).__init__(params, parsing=parsing) if not hasattr(self, 'timeout'): self.timeout = -1 if not hasattr(self, 'enable_environment_macros'): self.enable_environment_macros = False if not hasattr(self, 'poller_tag'): self.poller_tag = u'None' if not hasattr(self, 'reactionner_tag'): self.reactionner_tag = u'None' if not hasattr(self, 'module_type'): # If the command start with a _, set the module_type # as the name of the command, without the _ if getattr(self, 'command_line', '').startswith('_'): # For an internal command... self.module_type = u'internal' # module_type = getattr(self, 'command_line', '').split(' ')[0] # # and we remove the first _ # self.module_type = module_type[1:] # If no command starting with _, be fork :) else: self.module_type = u'fork' def get_name(self): """ Get the name of the command :return: the command name string :rtype: str """ return self.command_name def fill_data_brok_from(self, data, brok_type): """ Add properties to data if fill_brok of these class properties is same as brok_type :param data: dictionnary of this command :type data: dict :param brok_type: type of brok :type brok_type: str :return: None """ cls = self.__class__ # Now config properties for prop, entry in list(cls.properties.items()): # Is this property intended for broking? # if 'fill_brok' in entry[prop]: if brok_type in entry.fill_brok: if hasattr(self, prop): data[prop] = getattr(self, prop) # elif 'default' in entry[prop]: # data[prop] = entry.default def is_correct(self): """Check if this object configuration is correct :: * Check our own specific properties * Call our parent class is_correct checker :return: True if the configuration is correct, otherwise False :rtype: bool """ state = True # _internal_host_check is for having an host check result # without running a check plugin if self.command_name.startswith('_internal_host_check'): # Command line may contain: [state_id][;output] parameters = self.command_line.split(';') if len(parameters) < 2: self.command_name = "_internal_host_check;0;Host assumed to be UP" self.add_warning( "[%s::%s] has no defined state nor output. Changed to %s" % (self.my_type, self.command_name, self.command_name)) elif len(parameters) < 3: state = 3 try: state = int(parameters[1]) except ValueError: self.add_warning( "[%s::%s] required a non integer state: %s. Using 3." % (self.my_type, self.command_name, parameters[1])) if state > 4: self.add_warning( "[%s::%s] required an impossible state: %d. Using 3." % (self.my_type, self.command_name, state)) output = { 0: "UP", 1: "DOWN", 2: "DOWN", 3: "UNKNOWN", 4: "UNREACHABLE", }[state] self.command_name = "_internal_host_check;Host assumed to be %s" % output self.add_warning( "[%s::%s] has no defined output. Changed to %s" % (self.my_type, self.command_name, self.command_name)) elif len(parameters) > 3: self.command_name = "%s;%s;%s" % (parameters[0], parameters[1], parameters[2]) self.add_warning( "[%s::%s] has too many parameters. Changed to %s" % (self.my_type, self.command_name, self.command_name)) return super(Command, self).is_correct() and state
class Notification(Action): # pylint: disable=R0902 """Notification class, inherits from action class. Used to notify contacts and execute notification command defined in configuration """ # AutoSlots create the __slots__ with properties and # running_properties names __metaclass__ = AutoSlots my_type = 'notification' properties = Action.properties.copy() properties.update({ 'is_a': StringProp(default=u'notification'), 'start_time': IntegerProp(default=0, fill_brok=['full_status']), 'end_time': IntegerProp(default=0, fill_brok=['full_status']), 'contact_name': StringProp(default=u'', fill_brok=['full_status']), 'host_name': StringProp(default=u'', fill_brok=['full_status']), 'service_description': StringProp(default=u'', fill_brok=['full_status']), 'reason_type': IntegerProp(default=1, fill_brok=['full_status']), 'state': IntegerProp(default=0, fill_brok=['full_status']), 'ack_author': StringProp(default=u'', fill_brok=['full_status']), 'ack_data': StringProp(default=u'', fill_brok=['full_status']), 'escalated': BoolProp(default=False, fill_brok=['full_status']), 'command_call': StringProp(default=None), 'contact': StringProp(default=None), 'notif_nb': IntegerProp(default=1), 'command': StringProp(default=u'UNSET'), 'enable_environment_macros': BoolProp(default=False), # Keep a list of currently active escalations 'already_start_escalations': SetProp(default=set()), 'type': StringProp(default=u'PROBLEM'), # For authored notifications (eg. downtime...) 'author': StringProp(default=u'n/a', fill_brok=['full_status']), 'author_name': StringProp(default=u'n/a', fill_brok=['full_status']), 'author_alias': StringProp(default=u'n/a', fill_brok=['full_status']), 'author_comment': StringProp(default=u'n/a', fill_brok=['full_status']), # All contacts that were notified 'recipients': ListProp(default=[]) }) macros = { 'NOTIFICATIONTYPE': 'type', 'NOTIFICATIONRECIPIENTS': 'recipients', 'NOTIFICATIONISESCALATED': 'escalated', 'NOTIFICATIONAUTHOR': 'author', 'NOTIFICATIONAUTHORNAME': 'author_name', 'NOTIFICATIONAUTHORALIAS': 'author_alias', 'NOTIFICATIONCOMMENT': 'author_comment', 'NOTIFICATIONNUMBER': 'notif_nb', 'NOTIFICATIONID': 'uuid', 'HOSTNOTIFICATIONNUMBER': 'notif_nb', 'HOSTNOTIFICATIONID': 'uuid', 'SERVICENOTIFICATIONNUMBER': 'notif_nb', 'SERVICENOTIFICATIONID': 'uuid' } def __init__(self, params=None, parsing=False): super(Notification, self).__init__(params, parsing=parsing) self.fill_default() def __str__(self): # pragma: no cover return "Notification %s, item: %s, type: %s, status: %s, command:'%s'" \ % (self.uuid, self.ref, self.type, self.status, self.command) def is_administrative(self): """Check if this notification is "administrative" :return: True in type not in ('PROBLEM', 'RECOVERY'), False otherwise :rtype: bool """ if self.type in ('PROBLEM', 'RECOVERY'): return False return True def get_return_from(self, notif): """Setter of exit_status and execution_time attributes :param notif: notification to get data from :type notif: fusionsupervision.notification.Notification :return: None """ self.exit_status = notif.exit_status self.execution_time = notif.execution_time def fill_data_brok_from(self, data, brok_type): """Fill data with info of item by looking at brok_type in props of properties or running_properties :param data: data to fill :type data: :param brok_type: type of brok :type brok_type: :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ cls = self.__class__ # Now config properties for prop, entry in list(cls.properties.items()): if brok_type in entry.fill_brok: data[prop] = getattr(self, prop) def get_initial_status_brok(self): """Get a initial status brok :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ data = {'uuid': self.uuid} self.fill_data_brok_from(data, 'full_status') return Brok({'type': 'notification_raise', 'data': data}) def serialize(self): """This function serialize into a simple dict object. It is used when transferring data to other daemons over the network (http) Here we directly return all attributes :return: json representation of a Timeperiod :rtype: dict """ res = super(Notification, self).serialize() if res['command_call'] is not None: if not isinstance(res['command_call'], string_types) and \ not isinstance(res['command_call'], dict): res['command_call'] = res['command_call'].serialize() return res
class Escalation(Item): """Escalation class is used to implement notification escalation """ my_type = 'escalation' properties = Item.properties.copy() properties.update({ 'escalation_name': StringProp(), 'host_name': StringProp(default=''), 'hostgroup_name': StringProp(''), 'service_description': StringProp(default=''), 'first_notification': IntegerProp(), 'last_notification': IntegerProp(), 'first_notification_time': IntegerProp(), 'last_notification_time': IntegerProp(), # As a default don't use the notification_interval defined in # the escalation, but the one defined in the object 'notification_interval': IntegerProp(default=-1), 'escalation_period': StringProp(default=''), 'escalation_options': ListProp(default=['d', 'x', 'r', 'w', 'c'], split_on_comma=True), 'contacts': ListProp(default=[], split_on_comma=True), 'contact_groups': ListProp(default=[], split_on_comma=True), }) running_properties = Item.running_properties.copy() running_properties.update({ 'time_based': BoolProp(default=False), }) special_properties = ('contacts', 'contact_groups', 'first_notification_time', 'last_notification_time') special_properties_time_based = ('contacts', 'contact_groups', 'first_notification', 'last_notification') def __init__(self, params=None, parsing=True): if params is None: params = {} for prop in ['escalation_options']: if prop in params: params[prop] = [p.replace('u', 'x') for p in params[prop]] super(Escalation, self).__init__(params, parsing=parsing) def get_name(self): """Accessor to escalation_name attribute :return: escalation name :rtype: str """ return self.escalation_name def is_eligible(self, timestamp, status, notif_number, in_notif_time, interval, escal_period): # pylint: disable=too-many-return-statements """Check if the escalation is eligible (notification is escalated or not) Escalation is NOT eligible in ONE of the following condition is fulfilled:: * escalation is not time based and notification number not in range [first_notification;last_notification] (if last_notif == 0, it's infinity) * escalation is time based and notification time not in range [first_notification_time;last_notification_time] (if last_notif_time == 0, it's infinity) * status does not matches escalation_options ('WARNING' <=> 'w' ...) * escalation_period is not legit for this time (now usually) :param timestamp: timestamp to check if timeperiod is valid :type timestamp: int :param status: item status (one of the small_states key) :type status: str :param notif_number: current notification number :type notif_number: int :param in_notif_time: current notification time :type in_notif_time: int :param interval: time interval length :type interval: int :return: True if no condition has been fulfilled, otherwise False :rtype: bool """ short_states = { u'WARNING': 'w', u'UNKNOWN': 'u', u'CRITICAL': 'c', u'RECOVERY': 'r', u'FLAPPING': 'f', u'DOWNTIME': 's', u'DOWN': 'd', u'UNREACHABLE': 'x', u'OK': 'o', u'UP': 'o' } # If we are not time based, we check notification numbers: if not self.time_based: # Begin with the easy cases if notif_number < self.first_notification: return False # self.last_notification = 0 mean no end if self.last_notification and notif_number > self.last_notification: return False # Else we are time based, we must check for the good value else: # Begin with the easy cases if in_notif_time < self.first_notification_time * interval: return False if self.last_notification_time and \ in_notif_time > self.last_notification_time * interval: return False # If our status is not good, we bail out too if status in short_states and short_states[status] not in self.escalation_options: return False # Maybe the time is not in our escalation_period if escal_period is not None and not escal_period.is_time_valid(timestamp): return False # Ok, I do not see why not escalade. So it's True :) return True def get_next_notif_time(self, t_wished, status, creation_time, interval, escal_period): """Get the next notification time for the escalation Only legit for time based escalation :param t_wished: time we would like to send a new notification (usually now) :type t_wished: :param status: status of the host or service :type status: :param creation_time: time the notification was created :type creation_time: :param interval: time interval length :type interval: int :return: timestamp for next notification or None :rtype: int | None """ short_states = {u'WARNING': 'w', u'UNKNOWN': 'u', u'CRITICAL': 'c', u'RECOVERY': 'r', u'FLAPPING': 'f', u'DOWNTIME': 's', u'DOWN': 'd', u'UNREACHABLE': 'u', u'OK': 'o', u'UP': 'o'} # If we are not time based, we bail out! if not self.time_based: return None # Check if we are valid if status in short_states and short_states[status] not in self.escalation_options: return None # Look for the min of our future validity start = self.first_notification_time * interval + creation_time # If we are after the classic next time, we are not asking for a smaller interval if start > t_wished: return None # Maybe the time we found is not a valid one.... if escal_period is not None and not escal_period.is_time_valid(start): return None # Ok so I ask for my start as a possibility for the next notification time return start def is_correct(self): """Check if this object configuration is correct :: * Check our own specific properties * Call our parent class is_correct checker :return: True if the configuration is correct, otherwise False :rtype: bool """ state = True # Internal checks before executing inherited function... # If we got the _time parameters, we are time based. Unless, we are not :) if hasattr(self, 'first_notification_time') or hasattr(self, 'last_notification_time'): self.time_based = True # Ok now we manage special cases... if not hasattr(self, 'contacts') and not hasattr(self, 'contact_groups'): self.add_error('%s: I do not have contacts nor contact_groups' % (self.get_name())) state = False # If time_based or not, we do not check all properties if self.time_based: if not hasattr(self, 'first_notification_time'): self.add_error('%s: I do not have first_notification_time' % (self.get_name())) state = False if not hasattr(self, 'last_notification_time'): self.add_error('%s: I do not have last_notification_time' % (self.get_name())) state = False else: # we check classical properties if not hasattr(self, 'first_notification'): self.add_error('%s: I do not have first_notification' % (self.get_name())) state = False if not hasattr(self, 'last_notification'): self.add_error('%s: I do not have last_notification' % (self.get_name())) state = False # Change the special_properties definition according to time_based ... save_special_properties = self.special_properties if self.time_based: self.special_properties = self.special_properties_time_based state_parent = super(Escalation, self).is_correct() if self.time_based: self.special_properties = save_special_properties return state_parent and state
class Check(Action): # pylint: disable=R0902 """Check class implements monitoring concepts of checks :(status, state, output) Check instance are used to store monitoring plugins data (exit status, output) and used by schedule to raise alert, reschedule check etc. """ # AutoSlots create the __slots__ with properties and # running_properties names # FIXME : re-enable AutoSlots if possible # __metaclass__ = AutoSlots my_type = 'check' properties = Action.properties.copy() properties.update({ 'is_a': StringProp(default=u'check'), 'state': IntegerProp(default=0), 'depend_on': ListProp(default=[]), 'depend_on_me': ListProp(default=[], split_on_comma=False), 'passive_check': BoolProp(default=False), 'freshness_expiry_check': BoolProp(default=False), 'poller_tag': StringProp(default=u'None'), 'dependency_check': BoolProp(default=False), }) def __init__(self, params=None, parsing=False): super(Check, self).__init__(params, parsing=parsing) if self.command.startswith('_'): self.internal = True def __str__(self): # pragma: no cover return "Check %s %s, item: %s, status: %s, command:'%s'" % \ (self.uuid, "active" if not self.passive_check else "passive", self.ref, self.status, self.command) def get_return_from(self, check): """Update check data from action (notification for instance) :param check: action to get data from :type check: fusionsupervision.action.Action :return: None """ for prop in [ 'exit_status', 'output', 'long_output', 'check_time', 'execution_time', 'perf_data', 'u_time', 's_time' ]: setattr(self, prop, getattr(check, prop)) def set_type_active(self): """Set this check as an active one (indeed, not passive) :return: None """ self.passive_check = False def set_type_passive(self): """Set this check as a passive one :return: None """ self.passive_check = True def is_dependent(self): """Getter for dependency_check attribute :return: True if this check was created for a dependent one, False otherwise :rtype: bool """ return self.dependency_check def serialize(self): """This function serializes into a simple dict object. The only usage is to send to poller, and it does not need to have the depend_on and depend_on_me properties. :return: json representation of a Check :rtype: dict """ res = super(Check, self).serialize() if 'depend_on' in res: del res['depend_on'] if 'depend_on_me' in res: del res['depend_on_me'] return res
class Contact(Item): """Host class implements monitoring concepts for contact. For example it defines host_notification_period, service_notification_period etc. """ my_type = 'contact' properties = Item.properties.copy() properties.update({ 'contact_name': StringProp(fill_brok=['full_status']), 'alias': StringProp(default=u'', fill_brok=['full_status']), 'contactgroups': ListProp(default=[], fill_brok=['full_status']), 'host_notifications_enabled': BoolProp(default=True, fill_brok=['full_status']), 'service_notifications_enabled': BoolProp(default=True, fill_brok=['full_status']), 'host_notification_period': StringProp(default='', fill_brok=['full_status']), 'service_notification_period': StringProp(default='', fill_brok=['full_status']), 'host_notification_options': ListProp(default=[''], fill_brok=['full_status'], split_on_comma=True), 'service_notification_options': ListProp(default=[''], fill_brok=['full_status'], split_on_comma=True), # To be consistent with notificationway object attributes 'host_notification_commands': ListProp(default=[], fill_brok=['full_status']), 'service_notification_commands': ListProp(default=[], fill_brok=['full_status']), 'min_business_impact': IntegerProp(default=0, fill_brok=['full_status']), 'email': StringProp(default=u'none', fill_brok=['full_status']), 'pager': StringProp(default=u'none', fill_brok=['full_status']), 'address1': StringProp(default=u'none', fill_brok=['full_status']), 'address2': StringProp(default=u'none', fill_brok=['full_status']), 'address3': StringProp(default=u'none', fill_brok=['full_status']), 'address4': StringProp(default=u'none', fill_brok=['full_status']), 'address5': StringProp(default=u'none', fill_brok=['full_status']), 'address6': StringProp(default=u'none', fill_brok=['full_status']), 'can_submit_commands': BoolProp(default=False, fill_brok=['full_status']), 'is_admin': BoolProp(default=False, fill_brok=['full_status']), 'expert': BoolProp(default=False, fill_brok=['full_status']), 'retain_status_information': BoolProp(default=True, fill_brok=['full_status']), 'notificationways': ListProp(default=[], fill_brok=['full_status']), 'password': StringProp(default=u'NOPASSWORDSET', fill_brok=['full_status']), }) running_properties = Item.running_properties.copy() running_properties.update({ 'modified_attributes': IntegerProp(default=0, fill_brok=['full_status'], retention=True), 'modified_host_attributes': IntegerProp(default=0, fill_brok=['full_status'], retention=True), 'modified_service_attributes': IntegerProp(default=0, fill_brok=['full_status'], retention=True), 'in_scheduled_downtime': BoolProp(default=False, fill_brok=['full_status', 'check_result'], retention=True), 'broks': ListProp(default=[]), # and here broks raised 'customs': DictProp(default={}, fill_brok=['full_status']), }) # This tab is used to transform old parameters name into new ones # so from Nagios2 format, to Nagios3 ones. # Or FusionSupervision Engine deprecated names like criticity old_properties = { 'min_criticity': 'min_business_impact', } macros = { 'CONTACTNAME': 'contact_name', 'CONTACTALIAS': 'alias', 'CONTACTEMAIL': 'email', 'CONTACTPAGER': 'pager', 'CONTACTADDRESS1': 'address1', 'CONTACTADDRESS2': 'address2', 'CONTACTADDRESS3': 'address3', 'CONTACTADDRESS4': 'address4', 'CONTACTADDRESS5': 'address5', 'CONTACTADDRESS6': 'address6', 'CONTACTGROUPNAME': 'get_groupname', 'CONTACTGROUPNAMES': 'get_groupnames' } special_properties = ('service_notification_commands', 'host_notification_commands', 'service_notification_period', 'host_notification_period', 'service_notification_options', 'host_notification_options', 'contact_name') simple_way_parameters = ('service_notification_period', 'host_notification_period', 'service_notification_options', 'host_notification_options', 'service_notification_commands', 'host_notification_commands', 'min_business_impact') def __init__(self, params=None, parsing=True): if params is None: params = {} # At deserialization, thoses are dict # TODO: Separate parsing instance from recreated ones for prop in [ 'service_notification_commands', 'host_notification_commands' ]: if prop in params and isinstance(params[prop], list) and params[prop] \ and isinstance(params[prop][0], dict): new_list = [ CommandCall(elem, parsing=parsing) for elem in params[prop] ] # We recreate the object setattr(self, prop, new_list) # And remove prop, to prevent from being overridden del params[prop] super(Contact, self).__init__(params, parsing=parsing) def __str__(self): # pragma: no cover return '<Contact %s, uuid=%s, use: %s />' \ % (self.get_name(), self.uuid, getattr(self, 'use', None)) __repr__ = __str__ def serialize(self): res = super(Contact, self).serialize() for prop in [ 'service_notification_commands', 'host_notification_commands' ]: if getattr(self, prop) is None: res[prop] = None else: res[prop] = [elem.serialize() for elem in getattr(self, prop)] return res def get_name(self): """Get contact name :return: contact name :rtype: str """ if self.is_tpl(): return "tpl-%s" % (getattr(self, 'name', 'unnamed')) return getattr(self, 'contact_name', 'unnamed') def get_groupname(self): """ Get the first group name whose contact belongs to :return: group name :rtype: str """ if self.contactgroups: return self.contactgroups[0] return 'Unknown' def get_groupnames(self): """ Get all the groups name whose contact belongs to :return: comma separated list of the groups names :rtype: str """ if self.contactgroups: return ', '.join(self.contactgroups) return 'Unknown' def want_service_notification(self, notifways, timeperiods, timestamp, state, n_type, business_impact, cmd=None): """Check if notification options match the state of the service :param timestamp: time we want to notify the contact (usually now) :type timestamp: int :param state: host or service state ("WARNING", "CRITICAL" ..) :type state: str :param n_type: type of notification ("PROBLEM", "RECOVERY" ..) :type n_type: str :param business_impact: impact of this service :type business_impact: int :param cmd: command launched to notify the contact :type cmd: str :return: True if contact wants notification, otherwise False :rtype: bool """ if not self.service_notifications_enabled: return False # If we are in downtime, we do not want notification for downtime_id in self.downtimes: downtime = self.downtimes[downtime_id] if downtime.is_in_effect: self.in_scheduled_downtime = True return False self.in_scheduled_downtime = False # Now the rest is for sub notificationways. If one is OK, we are ok # We will filter in another phase for notifway_id in self.notificationways: notifway = notifways[notifway_id] nw_b = notifway.want_service_notification(timeperiods, timestamp, state, n_type, business_impact, cmd) if nw_b: return True # Oh... no one is ok for it? so no, sorry return False def want_host_notification(self, notifways, timeperiods, timestamp, state, n_type, business_impact, cmd=None): """Check if notification options match the state of the host :param timestamp: time we want to notify the contact (usually now) :type timestamp: int :param state: host or service state ("UP", "DOWN" ..) :type state: str :param n_type: type of notification ("PROBLEM", "RECOVERY" ..) :type n_type: str :param business_impact: impact of this host :type business_impact: int :param cmd: command launch to notify the contact :type cmd: str :return: True if contact wants notification, otherwise False :rtype: bool """ if not self.host_notifications_enabled: return False # If we are in downtime, we do not want notification for downtime in self.downtimes: if downtime.is_in_effect: self.in_scheduled_downtime = True return False self.in_scheduled_downtime = False # Now it's all for sub notificationways. If one is OK, we are OK # We will filter in another phase for notifway_id in self.notificationways: notifway = notifways[notifway_id] nw_b = notifway.want_host_notification(timeperiods, timestamp, state, n_type, business_impact, cmd) if nw_b: return True # Oh, nobody..so NO :) return False def get_notification_commands(self, notifways, n_type, command_name=False): """Get notification commands for object type :param notifways: list of fusionsupervision.objects.NotificationWay objects :type notifways: NotificationWays :param n_type: object type (host or service) :type n_type: string :param command_name: True to update the inner property with the name of the command, False to update with the Command objects list :type command_name: bool :return: command list :rtype: list[fusionsupervision.objects.command.Command] """ res = [] for notifway_id in self.notificationways: notifway = notifways[notifway_id] res.extend(notifway.get_notification_commands(n_type)) # Update inner notification commands property with command name or command if command_name: setattr(self, n_type + '_notification_commands', [c.get_name() for c in res]) else: setattr(self, n_type + '_notification_commands', res) return res def is_correct(self): """Check if this object configuration is correct :: * Check our own specific properties * Call our parent class is_correct checker :return: True if the configuration is correct, otherwise False :rtype: bool """ state = True cls = self.__class__ # Internal checks before executing inherited function... # There is a case where there is no nw: when there is not special_prop defined # at all!! if not self.notificationways: for prop in self.special_properties: if not hasattr(self, prop): msg = "[contact::%s] %s property is missing" % ( self.get_name(), prop) self.add_error(msg) state = False if not hasattr(self, 'contact_name'): if hasattr(self, 'alias'): # Use the alias if we miss the contact_name self.contact_name = self.alias for char in cls.illegal_object_name_chars: if char not in self.contact_name: continue msg = "[contact::%s] %s character not allowed in contact_name" \ % (self.get_name(), char) self.add_error(msg) state = False return super(Contact, self).is_correct() and state def raise_enter_downtime_log_entry(self): """Raise CONTACT DOWNTIME ALERT entry (info level) Format is : "CONTACT DOWNTIME ALERT: *get_name()*;STARTED; Contact has entered a period of scheduled downtime" Example : "CONTACT DOWNTIME ALERT: test_contact;STARTED; Contact has entered a period of scheduled downtime" :return: None """ brok = make_monitoring_log( 'info', "CONTACT DOWNTIME ALERT: %s;STARTED; " "Contact has entered a period of scheduled downtime" % self.get_name()) self.broks.append(brok) def raise_exit_downtime_log_entry(self): """Raise CONTACT DOWNTIME ALERT entry (info level) Format is : "CONTACT DOWNTIME ALERT: *get_name()*;STOPPED; Contact has entered a period of scheduled downtime" Example : "CONTACT DOWNTIME ALERT: test_contact;STOPPED; Contact has entered a period of scheduled downtime" :return: None """ brok = make_monitoring_log( 'info', "CONTACT DOWNTIME ALERT: %s;STOPPED; " "Contact has exited from a period of scheduled downtime" % self.get_name()) self.broks.append(brok) def raise_cancel_downtime_log_entry(self): """Raise CONTACT DOWNTIME ALERT entry (info level) Format is : "CONTACT DOWNTIME ALERT: *get_name()*;CANCELLED; Contact has entered a period of scheduled downtime" Example : "CONTACT DOWNTIME ALERT: test_contact;CANCELLED; Contact has entered a period of scheduled downtime" :return: None """ brok = make_monitoring_log( 'info', "CONTACT DOWNTIME ALERT: %s;CANCELLED; " "Scheduled downtime for contact has been cancelled." % self.get_name()) self.broks.append(brok)
class Fusionsupervision(BaseSatellite): # pylint: disable=too-many-instance-attributes """Scheduler class. Referenced as "app" in most Interface """ properties = BaseSatellite.properties.copy() properties.update({ 'type': StringProp(default='scheduler'), 'port': IntegerProp(default=7768) }) def __init__(self, **kwargs): """Scheduler daemon initialisation :param kwargs: command line arguments """ super(Fusionsupervision, self).__init__(kwargs.get('daemon_name', 'Default-scheduler'), **kwargs) self.http_interface = SchedulerInterface(self) self.sched = Scheduler(self) # stats part # --- copied from scheduler.py self.nb_pulled_checks = 0 self.nb_pulled_actions = 0 # self.nb_checks_send = 0 self.nb_pushed_checks = 0 self.nb_pushed_actions = 0 self.nb_pulled_broks = 0 # --- # And possible links for satellites self.brokers = {} self.pollers = {} self.reactionners = {} self.receivers = {} # This because it is the Satellite that has thes properties and I am a Satellite # todo: change this? # Broks are stored in each broker link, not locally # self.broks = [] self.broks_lock = threading.RLock() # Modules are only loaded one time self.have_modules = False self.first_scheduling = False def get_broks(self, broker_name): """Send broks to a specific broker :param broker_name: broker name to send broks :type broker_name: str :greturn: dict of brok for this broker :rtype: dict[fusionsupervision.brok.Brok] """ logger.debug("Broker %s requests my broks list", broker_name) res = [] if not broker_name: return res for broker_link in list(self.brokers.values()): if broker_name == broker_link.name: for brok in sorted(broker_link.broks, key=lambda x: x.creation_time): # Only provide broks that did not yet sent to our external modules if getattr(brok, 'sent_to_externals', False): res.append(brok) brok.got = True broker_link.broks = [b for b in broker_link.broks if not getattr(b, 'got', False)] logger.debug("Providing %d broks to %s", len(res), broker_name) break else: logger.warning("Got a brok request from an unknown broker: %s", broker_name) return res def compensate_system_time_change(self, difference): # pragma: no cover, # pylint: disable=too-many-branches # not with unit tests """Compensate a system time change of difference for all hosts/services/checks/notifs :param difference: difference in seconds :type difference: int :return: None """ super(Fusionsupervision, self).compensate_system_time_change(difference) # We only need to change some value self.program_start = max(0, self.program_start + difference) if not hasattr(self.sched, "conf"): # Race condition where time change before getting conf return # Then we compensate all host/services for host in self.sched.hosts: host.compensate_system_time_change(difference) for serv in self.sched.services: serv.compensate_system_time_change(difference) # Now all checks and actions for chk in list(self.sched.checks.values()): # Already launch checks should not be touch if chk.status == u'scheduled' and chk.t_to_go is not None: t_to_go = chk.t_to_go ref = self.sched.find_item_by_id(chk.ref) new_t = max(0, t_to_go + difference) timeperiod = self.sched.timeperiods[ref.check_period] if timeperiod is not None: # But it's no so simple, we must match the timeperiod new_t = timeperiod.get_next_valid_time_from_t(new_t) # But maybe no there is no more new value! Not good :( # Say as error, with error output if new_t is None: chk.state = u'waitconsume' chk.exit_status = 2 chk.output = '(Error: there is no available check time after time change!)' chk.check_time = time.time() chk.execution_time = 0 else: chk.t_to_go = new_t ref.next_chk = new_t # Now all checks and actions for act in list(self.sched.actions.values()): # Already launch checks should not be touch if act.status == u'scheduled': t_to_go = act.t_to_go # Event handler do not have ref ref_id = getattr(act, 'ref', None) new_t = max(0, t_to_go + difference) # Notification should be check with notification_period if act.is_a == u'notification': ref = self.sched.find_item_by_id(ref_id) if ref.notification_period: # But it's no so simple, we must match the timeperiod notification_period = self.sched.timeperiods[ref.notification_period] new_t = notification_period.get_next_valid_time_from_t(new_t) # And got a creation_time variable too act.creation_time += difference # But maybe no there is no more new value! Not good :( # Say as error, with error output if new_t is None: act.state = 'waitconsume' act.exit_status = 2 act.output = '(Error: there is no available check time after time change!)' act.check_time = time.time() act.execution_time = 0 else: act.t_to_go = new_t def do_before_loop(self): """Stop the scheduling process""" if self.sched: self.sched.stop_scheduling() def do_loop_turn(self): """Scheduler loop turn Simply run the FusionSupervision Engine scheduler loop This is called when a configuration got received by the scheduler daemon. As of it, check if the first scheduling has been done... and manage this. :return: None """ if not self.first_scheduling: # Ok, now all is initialized, we can make the initial broks logger.info("First scheduling launched") _t0 = time.time() # Program start brok self.sched.initial_program_status() # First scheduling self.sched.schedule() statsmgr.timer('first_scheduling', time.time() - _t0) logger.info("First scheduling done") # Connect to our passive satellites if needed for satellite in [s for s in list(self.pollers.values()) if s.passive]: if not self.daemon_connection_init(satellite): logger.error("Passive satellite connection failed: %s", satellite) for satellite in [s for s in list(self.reactionners.values()) if s.passive]: if not self.daemon_connection_init(satellite): logger.error("Passive satellite connection failed: %s", satellite) # Ticks are for recurrent function call like consume, del zombies etc self.sched.ticks = 0 self.first_scheduling = True # Each loop turn, execute the daemon specific treatment... # only if the daemon has a configuration to manage if self.sched.pushed_conf: # If scheduling is not yet enabled, enable scheduling if not self.sched.must_schedule: self.sched.start_scheduling() self.sched.before_run() self.sched.run() else: logger.warning("#%d - No monitoring configuration to scheduler...", self.loop_count) def get_managed_configurations(self): """Get the configurations managed by this scheduler The configuration managed by a scheduler is the self configuration got by the scheduler during the dispatching. :return: a dict of scheduler links with instance_id as key and hash, push_flavor and configuration identifier as values :rtype: dict """ # for scheduler_link in list(self.schedulers.values()): # res[scheduler_link.instance_id] = { # 'hash': scheduler_link.hash, # 'push_flavor': scheduler_link.push_flavor, # 'managed_conf_id': scheduler_link.managed_conf_id # } res = {} if self.sched.pushed_conf and self.cur_conf and 'instance_id' in self.cur_conf: res[self.cur_conf['instance_id']] = { 'hash': self.cur_conf['hash'], 'push_flavor': self.cur_conf['push_flavor'], 'managed_conf_id': self.cur_conf['managed_conf_id'] } logger.debug("Get managed configuration: %s", res) return res def setup_new_conf(self): # pylint: disable=too-many-statements, too-many-branches, too-many-locals """Setup new conf received for scheduler :return: None """ # Execute the base class treatment... super(Fusionsupervision, self).setup_new_conf() # ...then our own specific treatment! with self.conf_lock: # self_conf is our own configuration from the fusionsupervision environment # self_conf = self.cur_conf['self_conf'] logger.debug("Got config: %s", self.cur_conf) if 'conf_part' not in self.cur_conf: self.cur_conf['conf_part'] = None conf_part = self.cur_conf['conf_part'] # Ok now we can save the retention data if self.sched.pushed_conf is not None: self.sched.update_retention() # Get the monitored objects configuration t00 = time.time() received_conf_part = None try: received_conf_part = unserialize(conf_part) assert received_conf_part is not None except AssertionError as exp: # This to indicate that no configuration is managed by this scheduler... logger.warning("No managed configuration received from arbiter") except FusionsupervisionClassLookupException as exp: # pragma: no cover # This to indicate that the new configuration is not managed... self.new_conf = { "_status": "Cannot un-serialize configuration received from arbiter", "_error": str(exp) } logger.error(self.new_conf) logger.error("Back trace of the error:\n%s", traceback.format_exc()) return except Exception as exp: # pylint: disable=broad-except # This to indicate that the new configuration is not managed... self.new_conf = { "_status": "Cannot un-serialize configuration received from arbiter", "_error": str(exp) } logger.error(self.new_conf) self.exit_on_exception(exp, str(self.new_conf)) # if not received_conf_part: # return logger.info("Monitored configuration %s received at %d. Un-serialized in %d secs", received_conf_part, t00, time.time() - t00) logger.info("Scheduler received configuration : %s", received_conf_part) # Now we create our pollers, reactionners and brokers for link_type in ['pollers', 'reactionners', 'brokers']: if link_type not in self.cur_conf['satellites']: logger.error("Missing %s in the configuration!", link_type) continue my_satellites = getattr(self, link_type, {}) received_satellites = self.cur_conf['satellites'][link_type] for link_uuid in received_satellites: rs_conf = received_satellites[link_uuid] logger.debug("- received %s - %s: %s", rs_conf['instance_id'], rs_conf['type'], rs_conf['name']) # Must look if we already had a configuration and save our broks already_got = rs_conf['instance_id'] in my_satellites broks = [] actions = {} wait_homerun = {} external_commands = {} running_id = 0 if already_got: logger.warning("I already got: %s", rs_conf['instance_id']) # Save some information running_id = my_satellites[link_uuid].running_id (broks, actions, wait_homerun, external_commands) = \ my_satellites[link_uuid].get_and_clear_context() # Delete the former link del my_satellites[link_uuid] # My new satellite link... new_link = SatelliteLink.get_a_satellite_link(link_type[:-1], rs_conf) my_satellites[new_link.uuid] = new_link logger.info("I got a new %s satellite: %s", link_type[:-1], new_link) new_link.running_id = running_id new_link.external_commands = external_commands new_link.broks = broks new_link.wait_homerun = wait_homerun new_link.actions = actions # Replacing the satellite address and port by those defined in satellite_map if new_link.name in self.cur_conf['override_conf'].get('satellite_map', {}): override_conf = self.cur_conf['override_conf'] overriding = override_conf.get('satellite_map')[new_link.name] logger.warning("Do not override the configuration for: %s, with: %s. " "Please check whether this is necessary!", new_link.name, overriding) # First mix conf and override_conf to have our definitive conf for prop in getattr(self.cur_conf, 'override_conf', []): logger.debug("Overriden: %s / %s ", prop, getattr(received_conf_part, prop, None)) logger.debug("Overriding: %s / %s ", prop, self.cur_conf['override_conf']) setattr(received_conf_part, prop, self.cur_conf['override_conf'].get(prop, None)) # Scheduler modules if not self.have_modules: try: logger.debug("Modules configuration: %s", self.cur_conf['modules']) self.modules = unserialize(self.cur_conf['modules'], no_load=True) except FusionsupervisionClassLookupException as exp: # pragma: no cover, simple protection logger.error('Cannot un-serialize modules configuration ' 'received from arbiter: %s', exp) if self.modules: logger.debug("I received some modules configuration: %s", self.modules) self.have_modules = True self.do_load_modules(self.modules) # and start external modules too self.modules_manager.start_external_instances() else: logger.info("I do not have modules") if received_conf_part: logger.info("Loading configuration...") # Propagate the global parameters to the configuration items received_conf_part.explode_global_conf() # We give the configuration to our scheduler self.sched.reset() self.sched.load_conf(self.cur_conf['instance_id'], self.cur_conf['instance_name'], received_conf_part) # Once loaded, the scheduler has an inner pushed_conf object logger.info("Loaded: %s", self.sched.pushed_conf) # Update the scheduler ticks according to the daemon configuration self.sched.update_recurrent_works_tick(self) # We must update our pushed configuration macros with correct values # from the configuration parameters # self.sched.pushed_conf.fill_resource_macros_names_macros() # Creating the Macroresolver Class & unique instance m_solver = MacroResolver() m_solver.init(received_conf_part) # Now create the external commands manager # We are an applyer: our role is not to dispatch commands, but to apply them ecm = ExternalCommandManager( received_conf_part, 'applyer', self.sched, received_conf_part.accept_passive_unknown_check_results, received_conf_part.log_external_commands) # Scheduler needs to know about this external command manager to use it if necessary self.sched.external_commands_manager = ecm # Ok now we can load the retention data self.sched.retention_load() # Log hosts/services initial states self.sched.log_initial_states() # Create brok new conf brok = Brok({'type': 'new_conf', 'data': {}}) self.sched.add_brok(brok) # Initialize connection with all our satellites logger.info("Initializing connection with my satellites:") my_satellites = self.get_links_of_type(s_type='') for satellite in list(my_satellites.values()): logger.info("- : %s/%s", satellite.type, satellite.name) if not self.daemon_connection_init(satellite): logger.error("Satellite connection failed: %s", satellite) if received_conf_part: # Enable the scheduling process logger.info("Loaded: %s", self.sched.pushed_conf) self.sched.start_scheduling() # Now I have a configuration! self.have_conf = True def clean_previous_run(self): """Clean variables from previous configuration :return: None """ # Execute the base class treatment... super(Fusionsupervision, self).clean_previous_run() # Clean all lists self.pollers.clear() self.reactionners.clear() self.brokers.clear() def get_daemon_stats(self, details=False): """Increase the stats provided by the Daemon base class :return: stats dictionary :rtype: dict """ # Call the base Daemon one res = super(Fusionsupervision, self).get_daemon_stats(details=details) res.update({'name': self.name, 'type': self.type, 'monitored_objects': {}}) counters = res['counters'] # Satellites counters counters['brokers'] = len(self.brokers) counters['pollers'] = len(self.pollers) counters['reactionners'] = len(self.reactionners) counters['receivers'] = len(self.receivers) if not self.sched: return res # # Hosts/services problems counters # m_solver = MacroResolver() # counters['hosts_problems'] = m_solver._get_total_host_problems() # counters['hosts_unhandled_problems'] = m_solver._get_total_host_problems_unhandled() # counters['services_problems'] = m_solver._get_total_service_problems() # counters['services_unhandled_problems'] = m_solver._get_total_service_problems_unhandled() # Get statistics from the scheduler scheduler_stats = self.sched.get_scheduler_stats(details=details) res['counters'].update(scheduler_stats['counters']) scheduler_stats.pop('counters') res.update(scheduler_stats) return res def get_monitoring_problems(self): """Get the current scheduler livesynthesis :return: live synthesis and problems dictionary :rtype: dict """ res = {} if not self.sched: return res # Get statistics from the scheduler scheduler_stats = self.sched.get_scheduler_stats(details=True) if 'livesynthesis' in scheduler_stats: res['livesynthesis'] = scheduler_stats['livesynthesis'] if 'problems' in scheduler_stats: res['problems'] = scheduler_stats['problems'] return res def main(self): """Main function for Scheduler, launch after the init:: * Init daemon * Load module manager * Launch main loop * Catch any Exception that occurs :return: None """ try: # Start the daemon mode if not self.do_daemon_init_and_start(): self.exit_on_error(message="Daemon initialization error", exit_code=3) # We wait for initial conf self.wait_for_initial_conf() if self.new_conf: # Setup the received configuration self.setup_new_conf() # Now the main loop self.do_main_loop() logger.info("Exited from the main loop.") # On main loop exit, call the scheduler after run process self.sched.after_run() self.request_stop() except Exception: # pragma: no cover, this should never happen indeed ;) self.exit_on_exception(traceback.format_exc()) raise
class Broker(BaseSatellite): """ Class to manage a Broker daemon A Broker is used to get data from Scheduler and send them to modules. These modules in most cases export to other software, databases... """ properties = BaseSatellite.properties.copy() properties.update({ 'type': StringProp(default='broker'), 'port': IntegerProp(default=7772), 'got_initial_broks': BoolProp(default=False) }) def __init__(self, **kwargs): """Broker daemon initialisation :param kwargs: command line arguments """ super(Broker, self).__init__(kwargs.get('daemon_name', 'Default-broker'), **kwargs) # Our schedulers and arbiters are initialized in the base class # Our pollers, reactionners and receivers self.pollers = {} self.reactionners = {} self.receivers = {} # Modules are load one time self.have_modules = False # All broks to manage self.external_broks = [] # broks to manage # broks raised internally by the broker self.internal_broks = [] # broks raised by the arbiters, we need a lock so the push can be in parallel # to our current activities and won't lock the arbiter self.arbiter_broks = [] self.arbiter_broks_lock = threading.RLock() self.timeout = 1.0 self.http_interface = BrokerInterface(self) def add(self, elt): """Generic function to add objects to the daemon internal lists. Manage Broks, External commands and Messages (from modules queues) :param elt: object to add :type elt: fusionsupervision.FusionsupervisionObject :return: None """ if isinstance(elt, Brok): # For brok, we tag the brok with our instance_id elt.instance_id = self.instance_id if elt.type == 'monitoring_log': # The brok is a monitoring event with self.events_lock: self.events.append(elt) statsmgr.counter('events', 1) else: with self.broks_lock: self.broks.append(elt) statsmgr.counter('broks.added', 1) elif isinstance(elt, ExternalCommand): logger.debug("Queuing an external command '%s'", str(elt.__dict__)) with self.external_commands_lock: self.external_commands.append(elt) statsmgr.counter('external-commands.added', 1) # Maybe we got a Message from the modules, it's way to ask something # like from now a full data from a scheduler for example. elif isinstance(elt, Message): # We got a message, great! logger.debug(str(elt.__dict__)) if elt.get_type() == 'NeedData': data = elt.get_data() # Full instance id means: I got no data for this scheduler # so give me all dumb-ass! if 'full_instance_id' in data: c_id = data['full_instance_id'] source = getattr(elt, 'source', getattr(elt, '_source', None)) logger.info( 'The module %s is asking me to get all initial data ' 'from the scheduler %d', source, c_id) # so we just reset the connection and the running_id, # it will just get all new things try: self.schedulers[c_id]['con'] = None self.schedulers[c_id]['running_id'] = 0 except KeyError: # maybe this instance was not known, forget it logger.warning( "the module %s ask me a full_instance_id " "for an unknown ID (%d)!", source, c_id) # Maybe a module tells me that it's dead, I must log its last words... if elt.get_type() == 'ICrash': data = elt.get_data() logger.error( 'the module %s just crash! Please look at the traceback:', data['name']) logger.error(data['trace']) statsmgr.counter('message.added', 1) # The module death will be looked for elsewhere and restarted. def manage_brok(self, brok): """Get a brok. We put brok data to the modules :param brok: object with data :type brok: object :return: None """ # Unserialize the brok before consuming it brok.prepare() for module in self.modules_manager.get_internal_instances(): try: _t0 = time.time() module.manage_brok(brok) statsmgr.timer('manage-broks.internal.%s' % module.get_name(), time.time() - _t0) except Exception as exp: # pylint: disable=broad-except logger.warning( "The module %s raised an exception: %s, " "I'm tagging it to restart later", module.get_name(), str(exp)) logger.exception(exp) self.modules_manager.set_to_restart(module) def get_internal_broks(self): """Get all broks from self.broks_internal_raised and append them to our broks to manage :return: None """ statsmgr.gauge('get-new-broks-count.broker', len(self.internal_broks)) # Add the broks to our global list self.external_broks.extend(self.internal_broks) self.internal_broks = [] def get_arbiter_broks(self): """Get the broks from the arbiters, but as the arbiter_broks list can be push by arbiter without Global lock, we must protect this with a lock TODO: really? check this arbiter behavior! :return: None """ with self.arbiter_broks_lock: statsmgr.gauge('get-new-broks-count.arbiter', len(self.arbiter_broks)) # Add the broks to our global list self.external_broks.extend(self.arbiter_broks) self.arbiter_broks = [] def get_new_broks(self): """Get new broks from our satellites :return: None """ for satellites in [ self.schedulers, self.pollers, self.reactionners, self.receivers ]: for satellite_link in list(satellites.values()): logger.debug("Getting broks from %s", satellite_link) _t0 = time.time() try: tmp_broks = satellite_link.get_broks(self.name) except LinkError: logger.warning( "Daemon %s connection failed, I could not get the broks!", satellite_link) else: if tmp_broks: logger.debug("Got %d Broks from %s in %s", len(tmp_broks), satellite_link.name, time.time() - _t0) statsmgr.gauge( 'get-new-broks-count.%s' % (satellite_link.name), len(tmp_broks)) statsmgr.timer( 'get-new-broks-time.%s' % (satellite_link.name), time.time() - _t0) for brok in tmp_broks: brok.instance_id = satellite_link.instance_id # Add the broks to our global list self.external_broks.extend(tmp_broks) # def do_stop(self): # """Stop all children of this process # # :return: None # """ # # my_active_children = active_children() # # for child in my_active_children: # # child.terminate() # # child.join(1) # super(Broker, self).do_stop() def setup_new_conf(self): # pylint: disable=too-many-branches, too-many-locals """Broker custom setup_new_conf method This function calls the base satellite treatment and manages the configuration needed for a broker daemon: - get and configure its pollers, reactionners and receivers relation - configure the modules :return: None """ # Execute the base class treatment... super(Broker, self).setup_new_conf() # ...then our own specific treatment! with self.conf_lock: # # self_conf is our own configuration from the fusionsupervision environment # self_conf = self.cur_conf['self_conf'] self.got_initial_broks = False # Now we create our pollers, reactionners and receivers for link_type in ['pollers', 'reactionners', 'receivers']: if link_type not in self.cur_conf['satellites']: logger.error("No %s in the configuration!", link_type) continue my_satellites = getattr(self, link_type, {}) received_satellites = self.cur_conf['satellites'][link_type] for link_uuid in received_satellites: rs_conf = received_satellites[link_uuid] logger.debug("- received %s - %s: %s", rs_conf['instance_id'], rs_conf['type'], rs_conf['name']) # Must look if we already had a configuration and save our broks already_got = rs_conf['instance_id'] in my_satellites broks = [] actions = {} wait_homerun = {} external_commands = {} running_id = 0 if already_got: logger.warning("I already got: %s", rs_conf['instance_id']) # Save some information running_id = my_satellites[link_uuid].running_id (broks, actions, wait_homerun, external_commands) = \ my_satellites[link_uuid].get_and_clear_context() # Delete the former link del my_satellites[link_uuid] # My new satellite link... new_link = SatelliteLink.get_a_satellite_link( link_type[:-1], rs_conf) my_satellites[new_link.uuid] = new_link logger.info("I got a new %s satellite: %s", link_type[:-1], new_link) new_link.running_id = running_id new_link.external_commands = external_commands new_link.broks = broks new_link.wait_homerun = wait_homerun new_link.actions = actions # Replace satellite address and port by those defined in satellite_map # todo: check if it is really necessary! Add a unit test for this # Not sure about this because of the daemons/satellites configuration # if new_link.name in self_conf.get('satellite_map', {}): # new_link = dict(new_link) # make a copy # new_link.update(self_conf.get('satellite_map', {})[new_link.name]) if not self.have_modules: try: self.modules = unserialize(self.cur_conf['modules'], no_load=True) except FusionsupervisionClassLookupException as exp: # pragma: no cover, simple protection logger.error( 'Cannot un-serialize modules configuration ' 'received from arbiter: %s', exp) if self.modules: logger.info("I received some modules configuration: %s", self.modules) self.have_modules = True # Ok now start, or restart them! # Set modules, init them and start external ones self.do_load_modules(self.modules) # and start external modules too self.modules_manager.start_external_instances() else: logger.info("I do not have modules") # Initialize connection with my schedulers first logger.info("Initializing connection with my schedulers:") my_satellites = self.get_links_of_type(s_type='scheduler') for satellite in list(my_satellites.values()): logger.info("- %s/%s", satellite.type, satellite.name) if not self.daemon_connection_init(satellite): logger.error("Satellite connection failed: %s", satellite) # Initialize connection with all our satellites logger.info("Initializing connection with my satellites:") for sat_type in ['arbiter', 'reactionner', 'poller', 'receiver']: my_satellites = self.get_links_of_type(s_type=sat_type) for satellite in list(my_satellites.values()): logger.info("- %s/%s", satellite.type, satellite.name) if not self.daemon_connection_init(satellite): logger.error("Satellite connection failed: %s", satellite) # Now I have a configuration! self.have_conf = True def clean_previous_run(self): """Clean all (when we received new conf) :return: None """ # Execute the base class treatment... super(Broker, self).clean_previous_run() # Clean all satellites relations self.pollers.clear() self.reactionners.clear() self.receivers.clear() # Clean our internal objects self.external_broks = self.external_broks[:] self.internal_broks = self.internal_broks[:] with self.arbiter_broks_lock: self.arbiter_broks = self.arbiter_broks[:] self.external_commands = self.external_commands[:] # And now modules # self.have_modules = False # self.modules_manager.clear_instances() def do_loop_turn(self): # pylint: disable=too-many-branches """Loop used to: * get initial status broks * check if modules are alive, if not restart them * get broks from ourself, the arbiters and our satellites * add broks to the queue of each external module * manage broks with each internal module If the internal broks management is longer than 0.8 seconds, postpone to hte next loop turn to avoid overloading the broker daemon. :return: None """ if not self.got_initial_broks: # Asking initial broks from my schedulers my_satellites = self.get_links_of_type(s_type='scheduler') for satellite in list(my_satellites.values()): logger.info("Asking my initial broks from '%s'", satellite.name) _t0 = time.time() try: my_initial_broks = satellite.get_initial_broks(self.name) statsmgr.timer('broks.initial.%s.time' % satellite.name, time.time() - _t0) if not my_initial_broks: logger.info("No initial broks were raised, " "my scheduler is not yet ready...") return self.got_initial_broks = True logger.debug("Got %d initial broks from '%s'", my_initial_broks, satellite.name) statsmgr.gauge('broks.initial.%s.count' % satellite.name, my_initial_broks) except LinkError as exp: logger.warning( "Scheduler connection failed, I could not get initial broks!" ) logger.debug("Begin Loop: still some old broks to manage (%d)", len(self.external_broks)) if self.external_broks: statsmgr.gauge('unmanaged.broks', len(self.external_broks)) # Try to see if one of my module is dead, and restart previously dead modules self.check_and_del_zombie_modules() # Call modules that manage a starting tick pass _t0 = time.time() self.hook_point('tick') statsmgr.timer('hook.tick', time.time() - _t0) # Maybe the last loop we did raised some broks internally self.get_internal_broks() # Also reap broks sent from the arbiters self.get_arbiter_broks() # Now get broks from our distant daemons self.get_new_broks() # Get the list of broks not yet sent to our external modules _t0 = time.time() broks_to_send = [ brok for brok in self.external_broks if getattr(brok, 'to_be_sent', True) ] statsmgr.gauge('get-new-broks-count.to_send', len(broks_to_send)) # Send the broks to all external modules to_q queue so they can get the whole packet # beware, the sub-process/queue can be die/close, so we put to restart the whole module # instead of killing ourselves :) for module in self.modules_manager.get_external_instances(): try: _t00 = time.time() queue_size = module.to_q.qsize() statsmgr.gauge( 'queues.external.%s.to.size' % module.get_name(), queue_size) module.to_q.put(broks_to_send) statsmgr.timer('queues.external.%s.to.put' % module.get_name(), time.time() - _t00) except Exception as exp: # pylint: disable=broad-except # first we must find the modules logger.warning( "Module %s queue exception: %s, I'm tagging it to restart later", module.get_name(), str(exp)) logger.exception(exp) self.modules_manager.set_to_restart(module) # No more need to send them for brok in broks_to_send: brok.to_be_sent = False logger.debug("Time to send %s broks (%d secs)", len(broks_to_send), time.time() - _t0) # Make the internal modules manage the broks start = time.time() while self.external_broks: now = time.time() # Do not 'manage' more than 0.8s, we must get new broks almost every second if now - start > 0.8: logger.info( "I did not yet managed all my broks, still %d broks", len(self.external_broks)) break # Get the first brok in the list brok = self.external_broks.pop(0) if self.modules_manager.get_internal_instances(): self.manage_brok(brok) # Make a very short pause to avoid overloading self.make_a_pause(0.01, check_time_change=False) else: if getattr(brok, 'to_be_sent', False): self.external_broks.append(brok) # Maybe our external modules raised 'objects', so get them if self.get_objects_from_from_queues(): statsmgr.gauge('external-commands.got.count', len(self.external_commands)) statsmgr.gauge('broks.got.count', len(self.external_broks)) def get_daemon_stats(self, details=False): """Increase the stats provided by the Daemon base class :return: stats dictionary :rtype: dict """ # Call the base Daemon one res = super(Broker, self).get_daemon_stats(details=details) res.update({'name': self.name, 'type': self.type}) counters = res['counters'] counters['broks-external'] = len(self.external_broks) counters['broks-internal'] = len(self.internal_broks) counters['broks-arbiter'] = len(self.arbiter_broks) counters['satellites.pollers'] = len(self.pollers) counters['satellites.reactionners'] = len(self.reactionners) counters['satellites.receivers'] = len(self.receivers) return res def main(self): """Main function, will loop forever :return: None """ try: # Start the daemon mode if not self.do_daemon_init_and_start(): self.exit_on_error(message="Daemon initialization error", exit_code=3) # We wait for initial conf self.wait_for_initial_conf() if self.new_conf: # Setup the received configuration self.setup_new_conf() # Restore retention data self.hook_point('load_retention') # Now the main loop self.do_main_loop() logger.info("Exited from the main loop.") self.request_stop() except Exception: # pragma: no cover, this should never happen indeed ;) self.exit_on_exception(traceback.format_exc()) raise
class Satellite(BaseSatellite): # pylint: disable=R0902 """Satellite class. Sub-classed by Receiver, Reactionner and Poller """ do_checks = False do_actions = False my_type = '' properties = BaseSatellite.properties.copy() properties.update({ 'passive': BoolProp(default=False), 'max_plugins_output_length': IntegerProp(default=8192), 'min_workers': IntegerProp(default=0, fill_brok=['full_status'], to_send=True), 'max_workers': IntegerProp(default=0, fill_brok=['full_status'], to_send=True), 'processes_by_worker': IntegerProp(default=256, fill_brok=['full_status'], to_send=True), 'worker_polling_interval': IntegerProp(default=1, to_send=True), 'poller_tags': ListProp(default=['None'], to_send=True), 'reactionner_tags': ListProp(default=['None'], to_send=True), }) def __init__(self, name, **kwargs): super(Satellite, self).__init__(name, **kwargs) # Move these properties to the base Daemon ? # todo: change this? # Keep broks so they can be eaten by a broker self.broks = [] self.broks_lock = threading.RLock() # My active workers self.workers = {} # May be we are a passive daemon if self.passive: self.pre_log.append(("INFO", "Passive mode enabled.")) # Our tags # ['None'] is the default tags if self.type in ['poller'] and self.poller_tags: self.pre_log.append(("INFO", "Poller tags: %s" % self.poller_tags)) if self.type in ['reactionner'] and self.reactionner_tags: self.pre_log.append(("INFO", "Reactionner tags: %s" % self.reactionner_tags)) # Now the limit part, 0 means the number of cpu of this machine :) cpu_count = psutil.cpu_count() # Do not use the logger in this function because it is not yet initialized... self.pre_log.append(("INFO", "Detected %d CPUs" % cpu_count)) if self.max_workers == 0: try: # Preserve one CPU if more than one detected self.max_workers = max(cpu_count - 1, 1) except NotImplementedError: # pragma: no cover, simple protection self.max_workers = 1 if self.min_workers == 0: try: self.min_workers = max(cpu_count - 1, 1) except NotImplementedError: # pragma: no cover, simple protection self.min_workers = 1 self.pre_log.append(("INFO", "Using minimum %d workers, maximum %d workers, %d processes/worker" % (self.min_workers, self.max_workers, self.processes_by_worker))) self.slave_q = None self.returns_queue = None self.q_by_mod = {} # Modules are only loaded one time self.have_modules = False # round robin queue ic self.rr_qid = 0 def manage_action_return(self, action): """Manage action return from Workers We just put them into the corresponding sched and we clean unused properties like my_scheduler :param action: the action to manage :type action: fusionsupervision.action.Action :return: None """ # Maybe our workers send us something else than an action # if so, just add this in other queues and return # todo: test a class instance if action.__class__.my_type not in ['check', 'notification', 'eventhandler']: self.add(action) return # Ok, it's a result. Get the concerned scheduler uuid scheduler_uuid = action.my_scheduler logger.debug("Got action return: %s / %s", scheduler_uuid, action.uuid) try: # Now that we know where to put the action result, we do not need any reference to # the scheduler nor the worker del action.my_scheduler del action.my_worker except AttributeError: # pragma: no cover, simple protection logger.error("AttributeError Got action return: %s / %s", scheduler_uuid, action) # And we remove it from the actions queue of the scheduler too try: del self.schedulers[scheduler_uuid].actions[action.uuid] except KeyError as exp: logger.error("KeyError del scheduler action: %s / %s - %s", scheduler_uuid, action.uuid, str(exp)) # We tag it as "return wanted", and move it in the wait return queue try: self.schedulers[scheduler_uuid].wait_homerun[action.uuid] = action except KeyError: # pragma: no cover, simple protection logger.error("KeyError Add home run action: %s / %s - %s", scheduler_uuid, action.uuid, str(exp)) def push_results(self): """Push the checks/actions results to our schedulers :return: None """ # For all schedulers, we check for wait_homerun # and we send back results for scheduler_link_uuid in self.schedulers: scheduler_link = self.schedulers[scheduler_link_uuid] if not scheduler_link.active: logger.warning("My scheduler '%s' is not active currently", scheduler_link.name) continue if not scheduler_link.wait_homerun: # Nothing to push back... continue # NB: it's **mostly** safe for us to not use some lock around # this 'results' / sched['wait_homerun']. # Because it can only be modified (for adding new values) by the # same thread running this function (that is the main satellite # thread), and this occurs exactly in self.manage_action_return(). # Another possibility is for the sched['wait_homerun'] to be # cleared within/by : # ISchedulers.get_results() -> Satelitte.get_return_for_passive() # This can so happen in an (http) client thread. results = scheduler_link.wait_homerun logger.debug("Pushing %d results to '%s'", len(results), scheduler_link.name) # So, at worst, some results would be received twice on the # scheduler level, which shouldn't be a problem given they are # indexed by their "action_id". scheduler_link.push_results(list(results.values()), self.name) results.clear() def create_and_launch_worker(self, module_name='fork'): """Create and launch a new worker, and put it into self.workers It can be mortal or not :param module_name: the module name related to the worker default is "fork" for no module Indeed, it is actually the module 'python_name' :type module_name: str :return: None """ logger.info("Allocating new '%s' worker...", module_name) # If we are in the fork module, we do not specify a target target = None __warned = [] if module_name == 'fork': target = None else: for module in self.modules_manager.instances: # First, see if the module name matches... if module.get_name() == module_name: # ... and then if is a 'worker' module one or not if not module.properties.get('worker_capable', False): raise NotWorkerMod target = module.work if target is None: if module_name not in __warned: logger.warning("No target found for %s, NOT creating a worker for it...", module_name) __warned.append(module_name) return # We give to the Worker the instance name of the daemon (eg. poller-master) # and not the daemon type (poller) queue = Queue() worker = Worker(module_name, queue, self.returns_queue, self.processes_by_worker, max_plugins_output_length=self.max_plugins_output_length, target=target, loaded_into=self.name) # worker.module_name = module_name # save this worker self.workers[worker.get_id()] = worker # And save the Queue of this worker, with key = worker id # self.q_by_mod[module_name][worker.uuid] = queue self.q_by_mod[module_name][worker.get_id()] = queue # Ok, all is good. Start it! worker.start() logger.info("Started '%s' worker: %s (pid=%d)", module_name, worker.get_id(), worker.get_pid()) def do_stop_workers(self): """Stop all workers :return: None """ logger.info("Stopping all workers (%d)", len(self.workers)) for worker in list(self.workers.values()): try: logger.info(" - stopping '%s'", worker.get_id()) worker.terminate() worker.join(timeout=1) logger.info(" - stopped") # A already dead worker or in a worker except (AttributeError, AssertionError): pass except Exception as exp: # pylint: disable=broad-except logger.error("exception: %s", str(exp)) def do_stop(self): """Stop my workers and stop :return: None """ self.do_stop_workers() super(Satellite, self).do_stop() def add(self, elt): """Generic function to add objects to the daemon internal lists. Manage Broks, External commands :param elt: object to add :type elt: fusionsupervision.FusionsupervisionObject :return: None """ if isinstance(elt, Brok): # For brok, we tag the brok with our instance_id elt.instance_id = self.instance_id if elt.type == 'monitoring_log': # The brok is a monitoring event with self.events_lock: self.events.append(elt) statsmgr.counter('events', 1) else: with self.broks_lock: self.broks.append(elt) statsmgr.counter('broks.added', 1) elif isinstance(elt, ExternalCommand): logger.debug("Queuing an external command '%s'", str(elt.__dict__)) with self.external_commands_lock: self.external_commands.append(elt) statsmgr.counter('external-commands.added', 1) def get_broks(self): """Get brok list from satellite :return: A copy of the broks list :rtype: list """ res = copy.copy(self.broks) del self.broks[:] return res def check_and_del_zombie_workers(self): # pragma: no cover, not with unit tests... # pylint: disable= not-callable """Check if worker are fine and kill them if not. Dispatch the actions in the worker to another one TODO: see if unit tests would allow to check this code? :return: None """ # Active children make a join with everyone, useful :) # active_children() for p in active_children(): logger.debug("got child: %s", p) w_to_del = [] for worker in list(self.workers.values()): # If a worker goes down and we did not ask him, it's not # good: we can think that we have a worker and it's not True # So we del it logger.debug("checking if worker %s (pid=%d) is alive", worker.get_id(), worker.get_pid()) if not self.interrupted and not worker.is_alive(): logger.warning("The worker %s (pid=%d) went down unexpectedly!", worker.get_id(), worker.get_pid()) # Terminate immediately worker.terminate() worker.join(timeout=1) w_to_del.append(worker.get_id()) # OK, now really del workers from queues # And requeue the actions it was managed for worker_id in w_to_del: worker = self.workers[worker_id] # Del the queue of the module queue del self.q_by_mod[worker.module_name][worker.get_id()] for scheduler_uuid in self.schedulers: sched = self.schedulers[scheduler_uuid] for act in list(sched.actions.values()): if act.status == ACT_STATUS_QUEUED and act.my_worker == worker_id: # Got a check that will NEVER return if we do not restart it self.assign_to_a_queue(act) # So now we can really forgot it del self.workers[worker_id] def adjust_worker_number_by_load(self): """Try to create the minimum workers specified in the configuration :return: None """ if self.interrupted: logger.debug("Trying to adjust worker number. Ignoring because we are stopping.") return to_del = [] logger.debug("checking worker count." " Currently: %d workers, min per module : %d, max per module : %d", len(self.workers), self.min_workers, self.max_workers) # I want at least min_workers by module then if I can, I add worker for load balancing for mod in self.q_by_mod: # At least min_workers todo = max(0, self.min_workers - len(self.q_by_mod[mod])) for _ in range(todo): try: self.create_and_launch_worker(module_name=mod) # Maybe this modules is not a true worker one. # if so, just delete if from q_by_mod except NotWorkerMod: to_del.append(mod) break for mod in to_del: logger.warning("The module %s is not a worker one, I remove it from the worker list.", mod) del self.q_by_mod[mod] # TODO: if len(workers) > 2*wish, maybe we can kill a worker? def _get_queue_for_the_action(self, action): """Find action queue for the action depending on the module. The id is found with action modulo on action id :param a: the action that need action queue to be assigned :type action: object :return: worker id and queue. (0, None) if no queue for the module_type :rtype: tuple """ # get the module name, if not, take fork mod = getattr(action, 'module_type', 'fork') queues = list(self.q_by_mod[mod].items()) # Maybe there is no more queue, it's very bad! if not queues: return (0, None) # if not get action round robin index to get action queue based # on the action id self.rr_qid = (self.rr_qid + 1) % len(queues) (worker_id, queue) = queues[self.rr_qid] # return the id of the worker (i), and its queue return (worker_id, queue) def add_actions(self, actions_list, scheduler_instance_id): """Add a list of actions to the satellite queues :param actions_list: Actions list to add :type actions_list: list :param scheduler_instance_id: sheduler link to assign the actions to :type scheduler_instance_id: SchedulerLink :return: None """ # We check for new check in each schedulers and put the result in new_checks scheduler_link = None for scheduler_id in self.schedulers: logger.debug("Trying to add an action, scheduler: %s", self.schedulers[scheduler_id]) if scheduler_instance_id == self.schedulers[scheduler_id].instance_id: scheduler_link = self.schedulers[scheduler_id] break else: logger.error("Trying to add actions from an unknwown scheduler: %s", scheduler_instance_id) return if not scheduler_link: logger.error("Trying to add actions, but scheduler link is not found for: %s, " "actions: %s", scheduler_instance_id, actions_list) return logger.debug("Found scheduler link: %s", scheduler_link) for action in actions_list: # First we look if the action is identified uuid = getattr(action, 'uuid', None) if uuid is None: try: action = unserialize(action, no_load=True) uuid = action.uuid except FusionsupervisionClassLookupException: logger.error('Cannot un-serialize action: %s', action) continue # If we already have this action, we are already working for it! if uuid in scheduler_link.actions: continue # Action is attached to a scheduler action.my_scheduler = scheduler_link.uuid scheduler_link.actions[action.uuid] = action self.assign_to_a_queue(action) def assign_to_a_queue(self, action): """Take an action and put it to a worker actions queue :param action: action to put :type action: fusionsupervision.action.Action :return: None """ (worker_id, queue) = self._get_queue_for_the_action(action) if not worker_id: return # Tag the action as "in the worker i" action.my_worker = worker_id action.status = ACT_STATUS_QUEUED msg = Message(_type='Do', data=action, source=self.name) logger.debug("Queuing message: %s", msg) queue.put_nowait(msg) logger.debug("Queued") def get_new_actions(self): """ Wrapper function for do_get_new_actions For stats purpose :return: None TODO: Use a decorator for timing this function """ try: _t0 = time.time() self.do_get_new_actions() statsmgr.timer('actions.got.time', time.time() - _t0) except RuntimeError: logger.error("Exception like issue #1007") def do_get_new_actions(self): """Get new actions from schedulers Create a Message and put into the module queue REF: doc/fusionsupervision-action-queues.png (1) :return: None """ # Here are the differences between a poller and a reactionner: # Poller will only do checks, # Reactionner will do actions (notifications and event handlers) do_checks = self.__class__.do_checks do_actions = self.__class__.do_actions # We check and get the new actions to execute in each of our schedulers for scheduler_link_uuid in self.schedulers: scheduler_link = self.schedulers[scheduler_link_uuid] if not scheduler_link.active: logger.warning("My scheduler '%s' is not active currently", scheduler_link.name) continue logger.debug("get new actions, scheduler: %s", scheduler_link.name) # OK, go for it :) _t0 = time.time() actions = scheduler_link.get_actions({'do_checks': do_checks, 'do_actions': do_actions, 'poller_tags': self.poller_tags, 'reactionner_tags': self.reactionner_tags, 'worker_name': self.name, 'module_types': list(self.q_by_mod.keys())}) if actions: logger.debug("Got %d actions from %s", len(actions), scheduler_link.name) # We 'tag' them with my_scheduler and put into queue for workers self.add_actions(actions, scheduler_link.instance_id) logger.debug("Got %d actions from %s in %s", len(actions), scheduler_link.name, time.time() - _t0) statsmgr.gauge('actions.added.count.%s' % (scheduler_link.name), len(actions)) def clean_previous_run(self): """Clean variables from previous configuration, such as schedulers, broks and external commands :return: None """ # Execute the base class treatment... super(Satellite, self).clean_previous_run() # Clean my lists del self.broks[:] del self.events[:] def do_loop_turn(self): # pylint: disable=too-many-branches """Satellite main loop:: * Check and delete zombies actions / modules * Get returns from queues * Adjust worker number * Get new actions :return: None """ # Try to see if one of my module is dead, and restart previously dead modules self.check_and_del_zombie_modules() # Also if some zombie workers exist... self.check_and_del_zombie_workers() # Call modules that manage a starting tick pass self.hook_point('tick') # Print stats for debug for _, sched in self.schedulers.items(): for mod in self.q_by_mod: # In workers we've got actions sent to queue - queue size for (worker_id, queue) in list(self.q_by_mod[mod].items()): try: actions_count = queue.qsize() results_count = self.returns_queue.qsize() logger.debug("[%s][%s][%s] actions queued: %d, results queued: %d", sched.name, mod, worker_id, actions_count, results_count) # Update the statistics statsmgr.gauge('worker.%s.actions-queue-size' % worker_id, actions_count) statsmgr.gauge('worker.%s.results-queue-size' % worker_id, results_count) except (IOError, EOFError): pass # todo temporaray deactivate all this stuff! # Before return or get new actions, see how we managed # the former ones: are they still in queue(s)? If so, we # must wait more or at least have more workers # wait_ratio = self.wait_ratio.get_load() # total_q = 0 # try: # for mod in self.q_by_mod: # for queue in list(self.q_by_mod[mod].values()): # total_q += queue.qsize() # except (IOError, EOFError): # pass # if total_q != 0 and wait_ratio < 2 * self.worker_polling_interval: # logger.debug("I decide to increase the wait ratio") # self.wait_ratio.update_load(wait_ratio * 2) # # self.wait_ratio.update_load(self.worker_polling_interval) # else: # # Go to self.worker_polling_interval on normal run, if wait_ratio # # was >2*self.worker_polling_interval, # # it make it come near 2 because if < 2, go up :) # self.wait_ratio.update_load(self.worker_polling_interval) # wait_ratio = self.wait_ratio.get_load() # statsmgr.timer('core.wait-ratio', wait_ratio) # if self.log_loop: # logger.debug("[%s] wait ratio: %f", self.name, wait_ratio) # Maybe we do not have enough workers, we check for it # and launch the new ones if needed self.adjust_worker_number_by_load() # Manage all messages we've got in the last timeout # for queue in self.return_messages: try: logger.debug("[%s] manage action results: %d results", self.name, self.returns_queue.qsize()) while self.returns_queue.qsize(): msg = self.returns_queue.get_nowait() if msg is None: continue if not isinstance(msg, Message): logger.warning("Should have received a Message, got a %s!", type(msg)) continue logger.debug("Got a message: %s", msg) if msg.get_type() == 'Done': logger.debug("Got (from %s) an action result: %s", msg.get_source(), msg.get_data()) self.manage_action_return(msg.get_data()) elif msg.get_type() == 'Stats': logger.debug("Got (from %s) stats: %s", msg.get_source(), msg.get_data()) if msg.get_source() in self.workers: self.workers[msg.get_source()].stats = msg.get_data() else: logger.warning("Ignoring message of type: %s", msg.get_type()) except Full: logger.warning("Returns queue is full") except Empty: logger.debug("Returns queue is empty") except (IOError, EOFError) as exp: logger.warning("My returns queue is no more available: %s", str(exp)) except Exception as exp: # pylint: disable=broad-except logger.error("Failed getting messages in returns queue: %s", str(exp)) logger.error(traceback.format_exc()) for _, sched in self.schedulers.items(): if sched.wait_homerun: logger.debug("scheduler home run: %d results", len(sched.wait_homerun)) if not self.passive: # If we are an active satellite, we do not initiate the check getting # and return try: # We send to our schedulers the results of all finished checks logger.debug("pushing results...") self.push_results() except LinkError as exp: logger.warning("Scheduler connection failed, I could not send my results!") try: # And we get the new actions from our schedulers logger.debug("getting new actions...") self.get_new_actions() except LinkError as exp: logger.warning("Scheduler connection failed, I could not get new actions!") # Get objects from our modules that are not Worker based if self.log_loop: logger.debug("[%s] get objects from queues", self.name) self.get_objects_from_from_queues() statsmgr.gauge('external-commands.count', len(self.external_commands)) statsmgr.gauge('broks.count', len(self.broks)) statsmgr.gauge('events.count', len(self.events)) def do_post_daemon_init(self): """Do this satellite (poller or reactionner) post "daemonize" init :return: None """ # We can open the Queue for fork AFTER self.q_by_mod['fork'] = {} # todo: check if this is always useful? self.returns_queue = Queue() def setup_new_conf(self): # pylint: disable=too-many-branches """Setup the new configuration received from Arbiter This function calls the base satellite treatment and manages the configuration needed for a simple satellite daemon that executes some actions (eg. poller or reactionner): - configure the passive mode - configure the workers - configure the tags - configure the modules :return: None """ # Execute the base class treatment... super(Satellite, self).setup_new_conf() # ...then our own specific treatment! with self.conf_lock: logger.info("Received a new configuration") # self_conf is our own configuration from the fusionsupervision environment # self_conf = self.cur_conf['self_conf'] # Now manage modules if not self.have_modules: try: self.modules = unserialize(self.cur_conf['modules'], no_load=True) except FusionsupervisionClassLookupException as exp: # pragma: no cover, simple protection logger.error('Cannot un-serialize modules configuration ' 'received from arbiter: %s', exp) if self.modules: logger.info("I received some modules configuration: %s", self.modules) self.have_modules = True for module in self.modules: if module.name not in self.q_by_mod: self.q_by_mod[module.name] = {} self.do_load_modules(self.modules) # and start external modules too self.modules_manager.start_external_instances() else: logger.info("I do not have modules") # Initialize connection with all our satellites logger.info("Initializing connection with my satellites:") my_satellites = self.get_links_of_type(s_type='') for satellite in list(my_satellites.values()): logger.info("- : %s/%s", satellite.type, satellite.name) if not self.daemon_connection_init(satellite): logger.error("Satellite connection failed: %s", satellite) # Now I have a configuration! self.have_conf = True def get_daemon_stats(self, details=False): """Increase the stats provided by the Daemon base class :return: stats dictionary :rtype: dict """ # call the daemon one res = super(Satellite, self).get_daemon_stats(details=details) counters = res['counters'] counters['broks'] = len(self.broks) counters['events'] = len(self.events) counters['workers'] = len(self.workers) if self.workers: res['workers'] = {} for worker in list(self.workers.values()): stats = getattr(self.workers[worker.get_id()], 'stats', None) if stats: res['workers'][worker.get_id()] = stats return res def main(self): """Main satellite function. Do init and then mainloop :return: None """ try: # Start the daemon mode if not self.do_daemon_init_and_start(): self.exit_on_error(message="Daemon initialization error", exit_code=3) self.do_post_daemon_init() # We wait for initial conf self.wait_for_initial_conf() if self.new_conf: # Setup the received configuration self.setup_new_conf() # Allocate Mortal Threads self.adjust_worker_number_by_load() # Now main loop self.do_main_loop() logger.info("Exited from the main loop.") self.request_stop() except Exception: # pragma: no cover, this should never happen indeed ;) self.exit_on_exception(traceback.format_exc()) raise
class SatelliteLink(Item): # pylint: disable=too-many-instance-attributes """SatelliteLink is a common Class for links between Arbiter and other satellites. Used by the Dispatcher object. """ # Next value used for auto generated instance_id _next_id = 1 # All the class properties that are 'to_send' are stored in the 'global' # configuration to be pushed to the satellite when the configuration is dispatched properties = Item.properties.copy() properties.update({ 'instance_id': StringProp(to_send=True), # When this property is set, the Arbiter will launch the corresponding daemon 'fusionsupervision_launched': BoolProp(default=False, fill_brok=['full_status'], to_send=True), # This property is set by the Arbiter when it detects that this daemon # is needed but not declared in the configuration 'missing_daemon': BoolProp(default=False, fill_brok=['full_status']), # Sent to the satellites and used to check the managed configuration # Those are not to_send=True because they are updated by the configuration Dispatcher # and set when the daemon receives its configuration 'managed_conf_id': StringProp(default=u''), 'push_flavor': StringProp(default=u''), 'hash': StringProp(default=u''), # A satellite link has the type/name of the daemon it is related to 'type': StringProp(default=u'', fill_brok=['full_status'], to_send=True), 'name': StringProp(default=u'', fill_brok=['full_status'], to_send=True), # Listening interface and address used by the other daemons 'host': StringProp(default=u'0.0.0.0', to_send=True), 'address': StringProp(default=u'127.0.0.1', fill_brok=['full_status'], to_send=True), 'active': BoolProp(default=True, fill_brok=['full_status'], to_send=True), 'short_timeout': IntegerProp(default=3, fill_brok=['full_status'], to_send=True), 'long_timeout': IntegerProp(default=120, fill_brok=['full_status'], to_send=True), # the delay (seconds) between two ping retries 'ping_period': IntegerProp(default=5), # The maximum number of retries before setting the daemon as dead 'max_check_attempts': IntegerProp(default=3, fill_brok=['full_status']), # For a spare daemon link 'spare': BoolProp(default=False, fill_brok=['full_status'], to_send=True), 'spare_check_interval': IntegerProp(default=5, fill_brok=['full_status']), 'spare_max_check_attempts': IntegerProp(default=3, fill_brok=['full_status']), 'manage_sub_realms': BoolProp(default=True, fill_brok=['full_status'], to_send=True), 'manage_arbiters': BoolProp(default=False, fill_brok=['full_status'], to_send=True), 'modules': ListProp(default=[''], split_on_comma=True), 'polling_interval': IntegerProp(default=5, fill_brok=['full_status'], to_send=True), 'use_timezone': StringProp(default=u'NOTSET', to_send=True), 'realm': StringProp(default=u'', fill_brok=['full_status'], brok_transformation=get_obj_name_two_args_and_void), 'realm_name': StringProp(default=u''), 'satellite_map': DictProp(default={}, elts_prop=AddrProp, to_send=True, override=True), 'use_ssl': BoolProp(default=False, fill_brok=['full_status'], to_send=True), 'hard_ssl_name_check': BoolProp(default=True, fill_brok=['full_status'], to_send=True), 'passive': BoolProp(default=False, fill_brok=['full_status'], to_send=True), }) running_properties = Item.running_properties.copy() running_properties.update( { 'con': StringProp(default=None), 'uri': StringProp(default=None), 'reachable': # Can be reached - assumed True as default ;) BoolProp(default=False, fill_brok=['full_status']), 'alive': # Is alive (attached process s launched...) BoolProp(default=False, fill_brok=['full_status']), 'valid': # Is valid (the daemon is the expected one) BoolProp(default=False, fill_brok=['full_status']), 'need_conf': # The daemon needs to receive a configuration BoolProp(default=True, fill_brok=['full_status']), 'have_conf': # The daemon has received a configuration BoolProp(default=False, fill_brok=['full_status']), 'stopping': # The daemon is requested to stop BoolProp(default=False, fill_brok=['full_status']), 'running_id': # The running identifier of my related daemon FloatProp(default=0, fill_brok=['full_status']), # the number of poll attempt from the arbiter dispatcher 'attempt': IntegerProp(default=0, fill_brok=['full_status']), # the last connection attempt timestamp 'last_connection': IntegerProp(default=0, fill_brok=['full_status']), # the number of failed attempt for the connection 'connection_attempt': IntegerProp(default=0, fill_brok=['full_status']), 'last_check': IntegerProp(default=0, fill_brok=['full_status']), 'cfg_managed': DictProp(default=None), 'cfg_to_manage': DictProp(default={}), 'configuration_sent': BoolProp(default=False), 'statistics': DictProp(default={}), }) def __init__(self, params=None, parsing=True): """Initialize a SatelliteLink If parsing is True, we are initializing from a configuration, else we are initializing from a copy of another satellite link data. This is used when the daemons receive their configuration from the arbiter. When initializing from an arbiter configuration, an instance_id property must exist else a LinkError exception is raised! If a satellite_map property exists in the provided parameters, it will update the default existing one """ super(SatelliteLink, self).__init__(params, parsing) logger.debug("Initialize a %s, params: %s", self.__class__.__name__, params) # My interface context self.broks = [] self.actions = {} self.wait_homerun = {} self.pushed_commands = [] self.init_running_properties() if parsing: # Create a new satellite link identifier self.instance_id = u'%s_%d' % (self.__class__.__name__, self.__class__._next_id) self.__class__._next_id += 1 elif 'instance_id' not in params: raise LinkError( "When not parsing a configuration, " "an instance_id must exist in the provided parameters") self.fill_default() # Hack for ascending compatibility with Shinken configuration try: # We received a configuration with a 'name' property... if self.name: setattr(self, "%s_name" % self.type, self.name) else: # We received a configuration without a 'name' property... old form! if getattr(self, "%s_name" % self.type, None): setattr(self, 'name', getattr(self, "%s_name" % self.type)) else: self.name = "Unnamed %s" % self.type setattr(self, "%s_name" % self.type, self.name) except KeyError: setattr(self, 'name', getattr(self, "%s_name" % self.type)) # Initialize our satellite map, and update if required self.set_arbiter_satellite_map(params.get('satellite_map', {})) self.cfg = {'self_conf': {}, 'schedulers': {}, 'arbiters': {}} # Create the daemon connection self.create_connection() def __repr__(self): # pragma: no cover return '<%s - %s/%s, %s//%s:%s, rid: %s, spare: %s, realm: %s, sub-realms: %s, ' \ 'managing: %s (%s) />' \ % (self.instance_id, self.type, self.name, self.scheme, self.address, self.port, self.running_id, self.spare, self.realm, self.manage_sub_realms, self.managed_conf_id, self.push_flavor) __str__ = __repr__ @property def scheme(self): """Daemon interface scheme :return: http or https if the daemon uses SSL :rtype: str """ _scheme = 'http' if self.use_ssl: _scheme = 'https' return _scheme @staticmethod def get_a_satellite_link(sat_type, sat_dict): """Get a SatelliteLink object for a given satellite type and a dictionary :param sat_type: type of satellite :param sat_dict: satellite configuration data :return: """ cls = get_fusionsupervision_class( 'fusionsupervision.objects.%slink.%sLink' % (sat_type, sat_type.capitalize())) return cls(params=sat_dict, parsing=False) def get_livestate(self): """Get the SatelliteLink live state. The live state is a tuple information containing a state identifier and a message, where: state is: - 0 for an up and running satellite - 1 if the satellite is not reachale - 2 if the satellite is dead - 3 else (not active) :return: tuple """ livestate = 0 if self.active: if not self.reachable: livestate = 1 elif not self.alive: livestate = 2 else: livestate = 3 livestate_output = "%s/%s is %s" % (self.type, self.name, [ "up and running.", "warning because not reachable.", "critical because not responding.", "not active by configuration." ][livestate]) return (livestate, livestate_output) def set_arbiter_satellite_map(self, satellite_map=None): """ satellite_map is the satellites map in current context: - A SatelliteLink is owned by an Arbiter - satellite_map attribute of a SatelliteLink is the map defined IN THE satellite configuration but for creating connections, we need to have the satellites map from the Arbiter point of view :return: None """ self.satellite_map = { 'address': self.address, 'port': self.port, 'use_ssl': self.use_ssl, 'hard_ssl_name_check': self.hard_ssl_name_check } if satellite_map: self.satellite_map.update(satellite_map) def get_and_clear_context(self): """Get and clean all of our broks, actions, external commands and homerun :return: list of all broks of the satellite link :rtype: list """ res = (self.broks, self.actions, self.wait_homerun, self.pushed_commands) self.broks = [] self.actions = {} self.wait_homerun = {} self.pushed_commands = [] return res def get_and_clear_broks(self): """Get and clean all of our broks :return: list of all broks of the satellite link :rtype: list """ res = self.broks self.broks = [] return res def prepare_for_conf(self): """Initialize the pushed configuration dictionary with the inner properties that are to be propagated to the satellite link. :return: None """ logger.debug("- preparing: %s", self) self.cfg = { 'self_conf': self.give_satellite_cfg(), 'schedulers': {}, 'arbiters': {} } logger.debug("- prepared: %s", self.cfg) def give_satellite_cfg(self): """Get the default information for a satellite. Overridden by the specific satellites links :return: dictionary of information common to all the links :rtype: dict """ # All the satellite link class properties that are 'to_send' are stored in a # dictionary to be pushed to the satellite when the configuration is dispatched res = {} properties = self.__class__.properties for prop, entry in list(properties.items()): if hasattr(self, prop) and entry.to_send: res[prop] = getattr(self, prop) return res def give_satellite_json(self): """Get the json information for a satellite. This to provide information that will be exposed by a daemon on its HTTP interface. :return: dictionary of information common to all the links :rtype: dict """ daemon_properties = [ 'type', 'name', 'uri', 'spare', 'configuration_sent', 'realm_name', 'manage_sub_realms', 'active', 'reachable', 'alive', 'passive', 'last_check', 'polling_interval', 'max_check_attempts' ] (livestate, livestate_output) = self.get_livestate() res = {"livestate": livestate, "livestate_output": livestate_output} for sat_prop in daemon_properties: res[sat_prop] = getattr(self, sat_prop, 'not_yet_defined') return res def manages(self, cfg_part): """Tell if the satellite is managing this configuration part The managed configuration is formed as a dictionary indexed on the link instance_id: { u'SchedulerLink_1': { u'hash': u'4d08630a3483e1eac7898e7a721bd5d7768c8320', u'push_flavor': u'4d08630a3483e1eac7898e7a721bd5d7768c8320', u'managed_conf_id': [u'Config_1'] } } Note that the managed configuration is a string array rather than a simple string... no special for this reason, probably due to the serialization when the configuration is pushed :/ :param cfg_part: configuration part as prepare by the Dispatcher :type cfg_part: Conf :return: True if the satellite manages this configuration :rtype: bool """ logger.debug( "Do I (%s/%s) manage: %s, my managed configuration(s): %s", self.type, self.name, cfg_part, self.cfg_managed) # If we do not yet manage a configuration if not self.cfg_managed: logger.info("I (%s/%s) do not manage (yet) any configuration!", self.type, self.name) return False # Check in the schedulers list configurations for managed_cfg in list(self.cfg_managed.values()): # If not even the cfg_id in the managed_conf, bail out if managed_cfg['managed_conf_id'] == cfg_part.instance_id \ and managed_cfg['push_flavor'] == cfg_part.push_flavor: logger.debug("I do manage this configuration: %s", cfg_part) break else: logger.warning("I (%s/%s) do not manage this configuration: %s", self.type, self.name, cfg_part) return False return True def create_connection(self): """Initialize HTTP connection with a satellite (con attribute) and set its uri attribute This is called on the satellite link initialization :return: None """ # Create the HTTP client for the connection try: self.con = HTTPClient( address=self.satellite_map['address'], port=self.satellite_map['port'], short_timeout=self.short_timeout, long_timeout=self.long_timeout, use_ssl=self.satellite_map['use_ssl'], strong_ssl=self.satellite_map['hard_ssl_name_check']) self.uri = self.con.uri except HTTPClientException as exp: # logger.error("Error with '%s' when creating client: %s", self.name, str(exp)) # Set the satellite as dead self.set_dead() raise LinkError("Error with '%s' when creating client: %s" % (self.name, str(exp))) def set_alive(self): """Set alive, reachable, and reset attempts. If we change state, raise a status brok update alive, means the daemon is prenset in the system reachable, means that the HTTP connection is valid With this function we confirm that the daemon is reachable and, thus, we assume it is alive! :return: None """ was_alive = self.alive self.alive = True self.reachable = True self.attempt = 0 # We came from dead to alive! We must propagate the good news if not was_alive: logger.info("Setting %s satellite as alive :)", self.name) self.broks.append(self.get_update_status_brok()) def set_dead(self): """Set the satellite into dead state: If we change state, raise a status brok update :return:None """ was_alive = self.alive self.alive = False self.reachable = False self.attempt = 0 # We will have to create a new connection... self.con = None # We are dead now! We must propagate the sad news... if was_alive and not self.stopping: logger.warning("Setting the satellite %s as dead :(", self.name) self.broks.append(self.get_update_status_brok()) def add_failed_check_attempt(self, reason=''): """Set the daemon as unreachable and add a failed attempt if we reach the maximum attempts, set the daemon as dead :param reason: the reason of adding an attempts (stack trace sometimes) :type reason: str :return: None """ self.reachable = False self.attempt = self.attempt + 1 logger.debug("Failed attempt for %s (%d/%d), reason: %s", self.name, self.attempt, self.max_check_attempts, reason) # Don't need to warn again and again if the satellite is already dead # Only warn when it is alive if self.alive: if not self.stopping: logger.warning("Add failed attempt for %s (%d/%d) - %s", self.name, self.attempt, self.max_check_attempts, reason) else: logger.info( "Stopping... failed attempt for %s (%d/%d) - also probably stopping", self.name, self.attempt, self.max_check_attempts) # If we reached the maximum attempts, set the daemon as dead if self.attempt >= self.max_check_attempts: if not self.stopping: logger.warning( "Set %s as dead, too much failed attempts (%d), last problem is: %s", self.name, self.max_check_attempts, reason) else: logger.info( "Stopping... set %s as dead, too much failed attempts (%d)", self.name, self.max_check_attempts) self.set_dead() def valid_connection(*outer_args, **outer_kwargs): # pylint: disable=unused-argument, no-method-argument """Check if the daemon connection is established and valid""" def decorator(func): # pylint: disable=missing-docstring def decorated(*args, **kwargs): # pylint: disable=missing-docstring # outer_args and outer_kwargs are the decorator arguments # args and kwargs are the decorated function arguments link = args[0] if not link.con: raise LinkError("The connection is not created for %s" % link.name) if not link.running_id: raise LinkError( "The connection is not initialized for %s" % link.name) return func(*args, **kwargs) return decorated return decorator def communicate(*outer_args, **outer_kwargs): # pylint: disable=unused-argument, no-method-argument """Check if the daemon connection is authorized and valid""" def decorator(func): # pylint: disable=missing-docstring def decorated(*args, **kwargs): # pylint: disable=missing-docstring # outer_args and outer_kwargs are the decorator arguments # args and kwargs are the decorated function arguments fn_name = func.__name__ link = args[0] if not link.alive: logger.warning("%s is not alive for %s", link.name, fn_name) return None try: if not link.reachable: raise LinkError("The %s %s is not reachable" % (link.type, link.name)) logger.debug("[%s] Calling: %s, %s, %s", link.name, fn_name, args, kwargs) return func(*args, **kwargs) except HTTPClientConnectionException as exp: # A Connection error is raised when the daemon connection cannot be established # No way with the configuration parameters! if not link.stopping: logger.warning( "A daemon (%s/%s) that we must be related with " "cannot be connected: %s", link.type, link.name, exp) else: logger.info( "Stopping... daemon (%s/%s) cannot be connected. " "It is also probably stopping or yet stopped.", link.type, link.name) link.set_dead() except (LinkError, HTTPClientTimeoutException) as exp: link.add_failed_check_attempt("Connection timeout " "with '%s': %s" % (fn_name, str(exp))) return False except HTTPClientDataException as exp: # A Data error is raised when the daemon HTTP reponse is not 200! # No way with the communication if some problems exist in the daemon interface! # Abort all err = "Some daemons that we must be related with " \ "have some interface problems. Sorry, I bail out" logger.error(err) os.sys.exit(err) except HTTPClientException as exp: link.add_failed_check_attempt("Error with '%s': %s" % (fn_name, str(exp))) return None return decorated return decorator @communicate() def get_running_id(self): """Send a HTTP request to the satellite (GET /identity) Used to get the daemon running identifier that allows to know if the daemon got restarted This is called on connection initialization or re-connection If the daemon is notreachable, this function will raise an exception and the caller will receive a False as return :return: Boolean indicating if the running id was received :type: bool """ former_running_id = self.running_id logger.info(" get the running identifier for %s %s.", self.type, self.name) # An exception is raised in this function if the daemon is not reachable self.running_id = self.con.get('identity') if isinstance(self.running_id, dict): self.running_id = self.running_id['running_id'] if former_running_id == 0: if self.running_id: logger.info(" -> got: %s.", self.running_id) former_running_id = self.running_id # If the daemon has just started or has been restarted: it has a new running_id. if former_running_id != self.running_id: if former_running_id: logger.info( " -> The %s %s running identifier changed: %s. " "The daemon was certainly restarted!", self.type, self.name, self.running_id) # So we clear all verifications, they are obsolete now. logger.info( "The running id of the %s %s changed (%s), " "we must clear its context.", self.type, self.name, self.running_id) (_, _, _, _) = self.get_and_clear_context() # Set the daemon as alive self.set_alive() return True @valid_connection() @communicate() def stop_request(self, stop_now=False): """Send a stop request to the daemon :param stop_now: stop now or go to stop wait mode :type stop_now: bool :return: the daemon response (True) """ logger.debug("Sending stop request to %s, stop now: %s", self.name, stop_now) res = self.con.get('stop_request', {'stop_now': '1' if stop_now else '0'}) return res @valid_connection() @communicate() def update_infos(self, forced=False, test=False): """Update satellite info each self.polling_interval seconds so we smooth arbiter actions for just useful actions. Raise a satellite update status Brok If forced is True, then ignore the ping period. This is used when the configuration has not yet been dispatched to the Arbiter satellites. If test is True, do not really ping the daemon (useful for the unit tests only) :param forced: ignore the ping smoothing :type forced: bool :param test: :type test: bool :return: None if the last request is too recent, False if a timeout was raised during the request, else the managed configurations dictionary """ logger.debug("Update informations, forced: %s", forced) # First look if it's not too early to ping now = time.time() if not forced and self.last_check and self.last_check + self.polling_interval > now: logger.debug( "Too early to ping %s, ping period is %ds!, last check: %d, now: %d", self.name, self.polling_interval, self.last_check, now) return None self.get_conf(test=test) # Update the daemon last check timestamp self.last_check = time.time() # Update the state of this element self.broks.append(self.get_update_status_brok()) return self.cfg_managed @valid_connection() @communicate() def get_daemon_stats(self, details=False): """Send a HTTP request to the satellite (GET /get_daemon_stats) :return: Daemon statistics :rtype: dict """ logger.debug("Get daemon statistics for %s, %s %s", self.name, self.alive, self.reachable) return self.con.get('stats%s' % ('?details=1' if details else '')) @valid_connection() @communicate() def get_initial_broks(self, broker_name): """Send a HTTP request to the satellite (GET /_initial_broks) Used to build the initial broks for a broker connecting to a scheduler :param broker_name: the concerned broker name :type broker_name: str :return: Boolean indicating if the running id changed :type: bool """ logger.debug("Getting initial broks for %s, %s %s", self.name, self.alive, self.reachable) return self.con.get('_initial_broks', {'broker_name': broker_name}, wait=True) @valid_connection() @communicate() def wait_new_conf(self): """Send a HTTP request to the satellite (GET /wait_new_conf) :return: True if wait new conf, otherwise False :rtype: bool """ logger.debug("Wait new configuration for %s, %s %s", self.name, self.alive, self.reachable) return self.con.get('_wait_new_conf') @valid_connection() @communicate() def put_conf(self, configuration, test=False): """Send the configuration to the satellite HTTP request to the satellite (POST /push_configuration) If test is True, store the configuration internally :param configuration: The conf to send (data depend on the satellite) :type configuration: :return: None """ logger.debug("Sending configuration to %s, %s %s", self.name, self.alive, self.reachable) # ---------- if test: setattr(self, 'unit_test_pushed_configuration', configuration) # print("*** unit tests - sent configuration %s: %s" % (self.name, configuration)) return True # ---------- return self.con.post('_push_configuration', {'conf': configuration}, wait=True) @valid_connection() @communicate() def has_a_conf(self, magic_hash=None): # pragma: no cover """Send a HTTP request to the satellite (GET /have_conf) Used to know if the satellite has a conf :param magic_hash: Config hash. Only used for HA arbiter communication :type magic_hash: int :return: Boolean indicating if the satellite has a (specific) configuration :type: bool """ logger.debug("Have a configuration for %s, %s %s", self.name, self.alive, self.reachable) self.have_conf = self.con.get('_have_conf', {'magic_hash': magic_hash}) return self.have_conf @valid_connection() @communicate() def get_conf(self, test=False): """Send a HTTP request to the satellite (GET /managed_configurations) and update the cfg_managed attribute with the new information Set to {} on failure the managed configurations are a dictionary which keys are the scheduler link instance id and the values are the push_flavor If test is True, returns the unit test internally stored configuration Returns False if a timeout is raised :return: see @communicate, or the managed configuration """ logger.debug("Get managed configuration for %s, %s %s", self.name, self.alive, self.reachable) # ---------- if test: self.cfg_managed = {} self.have_conf = True logger.debug("Get managed configuration test ...") if getattr(self, 'unit_test_pushed_configuration', None) is not None: # Note this is a dict not a SatelliteLink object ! for scheduler_link in self.unit_test_pushed_configuration[ 'schedulers'].values(): self.cfg_managed[scheduler_link['instance_id']] = { 'hash': scheduler_link['hash'], 'push_flavor': scheduler_link['push_flavor'], 'managed_conf_id': scheduler_link['managed_conf_id'] } # print("*** unit tests - get managed configuration %s: %s" # % (self.name, self.cfg_managed)) # ---------- else: self.cfg_managed = self.con.get('managed_configurations') logger.debug("My (%s) fresh managed configuration: %s", self.name, self.cfg_managed) self.have_conf = (self.cfg_managed != {}) return self.cfg_managed @valid_connection() @communicate() def push_broks(self, broks): """Send a HTTP request to the satellite (POST /push_broks) Send broks to the satellite :param broks: Brok list to send :type broks: list :return: True on success, False on failure :rtype: bool """ logger.debug("[%s] Pushing %d broks", self.name, len(broks)) return self.con.post('_push_broks', {'broks': broks}, wait=True) @valid_connection() @communicate() def push_actions(self, actions, scheduler_instance_id): """Post the actions to execute to the satellite. Indeed, a scheduler post its checks to a poller and its actions to a reactionner. :param actions: Action list to send :type actions: list :param scheduler_instance_id: Scheduler instance identifier :type scheduler_instance_id: uuid :return: True on success, False on failure :rtype: bool """ logger.debug("Pushing %d actions from %s", len(actions), scheduler_instance_id) return self.con.post('_push_actions', { 'actions': actions, 'scheduler_instance_id': scheduler_instance_id }, wait=True) @valid_connection() @communicate() def push_results(self, results, scheduler_name): """Send a HTTP request to the satellite (POST /put_results) Send actions results to the satellite :param results: Results list to send :type results: list :param scheduler_name: Scheduler name :type scheduler_name: uuid :return: True on success, False on failure :rtype: bool """ logger.debug("Pushing %d results", len(results)) result = self.con.post('put_results', { 'results': results, 'from': scheduler_name }, wait=True) return result @valid_connection() @communicate() def push_external_commands(self, commands): """Send a HTTP request to the satellite (POST /r_un_external_commands) to send the external commands to the satellite :param results: Results list to send :type results: list :return: True on success, False on failure :rtype: bool """ logger.debug("Pushing %d external commands", len(commands)) return self.con.post('_run_external_commands', {'cmds': commands}, wait=True) @valid_connection() @communicate() def get_external_commands(self): """Send a HTTP request to the satellite (GET /_external_commands) to get the external commands from the satellite. :return: External Command list on success, [] on failure :rtype: list """ res = self.con.get('_external_commands', wait=False) logger.debug("Got %d external commands from %s: %s", len(res), self.name, res) return unserialize(res, True) @valid_connection() @communicate() def get_broks(self, broker_name): """Send a HTTP request to the satellite (GET /_broks) Get broks from the satellite. Un-serialize data received. :param broker_name: the concerned broker link :type broker_name: BrokerLink :return: Broks list on success, [] on failure :rtype: list """ res = self.con.get('_broks', {'broker_name': broker_name}, wait=False) logger.debug("Got broks from %s: %s", self.name, res) return unserialize(res, True) @valid_connection() @communicate() def get_events(self): """Send a HTTP request to the satellite (GET /_events) Get monitoring events from the satellite. :return: Broks list on success, [] on failure :rtype: list """ res = self.con.get('_events', wait=False) logger.debug("Got events from %s: %s", self.name, res) return unserialize(res, True) @valid_connection() def get_results(self, scheduler_instance_id): """Send a HTTP request to the satellite (GET /_results) Get actions results from satellite (only passive satellites expose this method. :param scheduler_instance_id: scheduler instance identifier :type scheduler_instance_id: str :return: Results list on success, [] on failure :rtype: list """ res = self.con.get('_results', {'scheduler_instance_id': scheduler_instance_id}, wait=True) logger.debug("Got %d results from %s: %s", len(res), self.name, res) return res @valid_connection() def get_actions(self, params): """Send a HTTP request to the satellite (GET /_checks) Get actions from the scheduler. Un-serialize data received. :param params: the request parameters :type params: str :return: Actions list on success, [] on failure :rtype: list """ res = self.con.get('_checks', params, wait=True) logger.debug("Got checks to execute from %s: %s", self.name, res) return unserialize(res, True)
class Realm(Itemgroup): """Realm class is used to implement realm. It is basically a group of Hosts assigned to a specific Scheduler/Poller (other daemon are optional) """ my_type = 'realm' members_property = "members" group_members_property = "realm_members" properties = Itemgroup.properties.copy() properties.update({ 'realm_name': StringProp(default=u'', fill_brok=['full_status']), 'alias': StringProp(default=u'', fill_brok=['full_status']), 'realm_members': ListProp(default=[], split_on_comma=True), 'group_members': ListProp(default=[], split_on_comma=True), 'higher_realms': ListProp(default=[], split_on_comma=True), 'default': BoolProp(default=False) }) running_properties = Itemgroup.running_properties.copy() running_properties.update({ # Indicate if some only passively or actively checks host exist in the realm 'passively_checked_hosts': BoolProp(default=None), 'actively_checked_hosts': BoolProp(default=None), # Those lists contain only the uuid of the satellite link, not the whole object! 'arbiters': ListProp(default=[]), 'schedulers': ListProp(default=[]), 'brokers': ListProp(default=[]), 'pollers': ListProp(default=[]), 'reactionners': ListProp(default=[]), 'receivers': ListProp(default=[]), 'potential_schedulers': ListProp(default=[]), 'potential_brokers': ListProp(default=[]), 'potential_pollers': ListProp(default=[]), 'potential_reactionners': ListProp(default=[]), 'potential_receivers': ListProp(default=[]), # Once configuration is prepared, the count of the hosts in the realm 'hosts_count': IntegerProp(default=0), 'packs': DictProp(default={}), 'parts': DictProp(default={}), # Realm level in the realms hierarchy 'level': IntegerProp(default=-1), # All the sub realms (children and grand-children) 'all_sub_members': ListProp(default=[]), 'all_sub_members_names': ListProp(default=[]), }) macros = { 'REALMNAME': 'realm_name', 'REALMDEFAULT': 'default', 'REALMMEMBERS': 'members', 'REALMREALM_MEMBERS': 'realm_members', 'REALMGROUP_MEMBERS': 'group_members', 'REALMHOSTS_COUNT': 'hosts_count', } def __init__(self, params=None, parsing=True): super(Realm, self).__init__(params, parsing) self.fill_default() # Define a packs list for the configuration preparation self.packs = [] # Once the configuration got prepared, packs becomes a dictionary! # packs is a dictionary indexed with the configuration part # number and containing the list of hosts # List of satellites related to the realm self.to_satellites = { 'reactionner': {}, 'poller': {}, 'broker': {}, 'receiver': {} } # List of satellites that need a configuration dispatch self.to_satellites_need_dispatch = { 'reactionner': {}, 'poller': {}, 'broker': {}, 'receiver': {} } # List of satellites with their managed configuration self.to_satellites_managed_by = { 'reactionner': {}, 'poller': {}, 'broker': {}, 'receiver': {} } # Attributes depending of the satellite type for sat_type in [ 'arbiter', 'scheduler', 'reactionner', 'poller', 'broker', 'receiver' ]: # Minimum is to have one satellite setattr(self, "nb_%ss" % sat_type, 0) setattr(self, 'potential_%ss' % sat_type, []) def __repr__(self): res = '<%r %r (%d)' % (self.__class__.__name__, self.get_name(), self.level) if self.realm_members: res = res + ', %d sub-realms: %r' \ % (len(self.realm_members), ', '.join([str(s) for s in self.realm_members])) if self.all_sub_members_names: res = res + ', %d all sub-realms: %r' \ % (len(self.all_sub_members_names), ', '.join([str(s) for s in self.all_sub_members_names])) if self.hosts_count: res = res + ', %d hosts' % self.hosts_count if getattr(self, 'parts', None): res = res + ', %d parts' % len(self.parts) if getattr(self, 'packs', None): res = res + ', %d packs' % len(self.packs) return res + '/>' __str__ = __repr__ @property def name(self): """Get the realm name""" return self.get_name() def get_name(self): """Accessor to realm_name attribute :return: realm name :rtype: str """ return getattr(self, 'realm_name', 'unset') def add_group_members(self, members): """Add a new group member to the groups list :param members: member name :type members: str :return: None """ if not isinstance(members, list): members = [members] if not getattr(self, 'group_members', None): self.group_members = members else: self.group_members.extend(members) def prepare_satellites(self, satellites): """Update the following attributes of a realm:: * nb_*satellite type*s * self.potential_*satellite type*s (satellite types are scheduler, reactionner, poller, broker and receiver) :param satellites: dict of SatelliteLink objects :type satellites: dict :return: None """ for sat_type in [ "scheduler", "reactionner", "poller", "broker", "receiver" ]: # We get potential TYPE at realm level first for sat_link_uuid in getattr(self, "%ss" % sat_type): if sat_link_uuid not in satellites: continue sat_link = satellites[sat_link_uuid] # Found our declared satellite in the provided satellites if sat_link.active and not sat_link.spare: # Generic increment : realm.nb_TYPE += 1 setattr(self, "nb_%ss" % sat_type, getattr(self, "nb_%ss" % sat_type) + 1) break else: self.add_error( "Realm %s, satellite %s declared in the realm is not found " "in the allowed satellites!" % (self.name, sat_link.name)) logger.error( "Satellite %s declared in the realm %s not found " "in the allowed satellites!", sat_link.name, self.name) logger.info( " Realm %s: (in/potential) (schedulers:%d/%d) (pollers:%d/%d) " "(reactionners:%d/%d) (brokers:%d/%d) (receivers:%d/%d)", self.name, self.nb_schedulers, len(self.potential_schedulers), self.nb_pollers, len(self.potential_pollers), self.nb_reactionners, len(self.potential_reactionners), self.nb_brokers, len(self.potential_brokers), self.nb_receivers, len(self.potential_receivers)) def get_realms_by_explosion(self, realms): """Get all members of this realm including members of sub-realms on multi-levels :param realms: realms list, used to look for a specific one :type realms: fusionsupervision.objects.realm.Realms :return: list of members and add realm to realm_members attribute :rtype: list """ # If rec_tag is already set, then we detected a loop in the realms hierarchy! if getattr(self, 'rec_tag', False): self.add_error( "Error: there is a loop in the realm definition %s" % self.get_name()) return None # Ok, not in a loop, we tag the realm and parse its members self.rec_tag = True # Order realm members list by name self.realm_members = sorted(self.realm_members) for member in self.realm_members: realm = realms.find_by_name(member) if not realm: self.add_unknown_members(member) continue children = realm.get_realms_by_explosion(realms) if children is None: # We got a loop in our children definition self.all_sub_members = [] self.realm_members = [] return None # Return the list of all unique members return self.all_sub_members def set_level(self, level, realms): """Set the realm level in the realms hierarchy :return: None """ self.level = level if not self.level: logger.info("- %s", self.get_name()) else: logger.info(" %s %s", '+' * self.level, self.get_name()) self.all_sub_members = [] self.all_sub_members_names = [] for child in sorted(self.realm_members): child = realms.find_by_name(child) if not child: continue self.all_sub_members.append(child.uuid) self.all_sub_members_names.append(child.get_name()) grand_children = child.set_level(self.level + 1, realms) for grand_child in grand_children: if grand_child in self.all_sub_members_names: continue grand_child = realms.find_by_name(grand_child) if grand_child: self.all_sub_members_names.append(grand_child.get_name()) self.all_sub_members.append(grand_child.uuid) return self.all_sub_members_names def get_all_subs_satellites_by_type(self, sat_type, realms): """Get all satellites of the wanted type in this realm recursively :param sat_type: satellite type wanted (scheduler, poller ..) :type sat_type: :param realms: all realms :type realms: list of realm object :return: list of satellite in this realm :rtype: list """ res = copy.copy(getattr(self, sat_type)) for member in self.all_sub_members: res.extend(realms[member].get_all_subs_satellites_by_type( sat_type, realms)) return res def get_satellites_by_type(self, s_type): """Generic function to access one of the satellite attribute ie : self.pollers, self.reactionners ... :param s_type: satellite type wanted :type s_type: str :return: self.*type*s :rtype: list """ if hasattr(self, s_type + 's'): return getattr(self, s_type + 's') logger.debug("[realm %s] do not have this kind of satellites: %s", self.name, s_type) return [] def get_potential_satellites_by_type(self, satellites, s_type): """Generic function to access one of the potential satellite attribute ie : self.potential_pollers, self.potential_reactionners ... :param satellites: list of SatelliteLink objects :type satellites: SatelliteLink list :param s_type: satellite type wanted :type s_type: str :return: self.potential_*type*s :rtype: list """ if not hasattr(self, 'potential_' + s_type + 's'): logger.debug("[realm %s] do not have this kind of satellites: %s", self.name, s_type) return [] matching_satellites = [] for sat_link in satellites: if sat_link.uuid in getattr(self, s_type + 's'): matching_satellites.append(sat_link) if matching_satellites: logger.debug("- found %ss: %s", s_type, matching_satellites) return matching_satellites for sat_link in satellites: if sat_link.uuid in getattr(self, 'potential_' + s_type + 's'): matching_satellites.append(sat_link) # Do not limit to one satellite! # break logger.debug("- potential %ss: %s", s_type, matching_satellites) return matching_satellites def get_nb_of_must_have_satellites(self, s_type): """Generic function to access one of the number satellite attribute ie : self.nb_pollers, self.nb_reactionners ... :param s_type: satellite type wanted :type s_type: str :return: self.nb_*type*s :rtype: int """ if hasattr(self, 'nb_' + s_type + 's'): return getattr(self, 'nb_' + s_type + 's') logger.debug("[realm %s] do not have this kind of satellites: %s", self.name, s_type) return 0 def get_links_for_a_broker(self, pollers, reactionners, receivers, realms, manage_sub_realms=False): """Get a configuration dictionary with pollers, reactionners and receivers links for a broker :param pollers: pollers :type pollers: :param reactionners: reactionners :type reactionners: :param receivers: receivers :type receivers: :param realms: realms :type realms: :param manage_sub_realms: :type manage_sub_realms: True if the borker manages sub realms :return: dict containing pollers, reactionners and receivers links (key is satellite id) :rtype: dict """ # Create void satellite links cfg = { 'pollers': {}, 'reactionners': {}, 'receivers': {}, } # Our self.daemons are only identifiers... that we use to fill the satellite links for poller_id in self.pollers: poller = pollers[poller_id] cfg['pollers'][poller.uuid] = poller.give_satellite_cfg() for reactionner_id in self.reactionners: reactionner = reactionners[reactionner_id] cfg['reactionners'][ reactionner.uuid] = reactionner.give_satellite_cfg() for receiver_id in self.receivers: receiver = receivers[receiver_id] cfg['receivers'][receiver.uuid] = receiver.give_satellite_cfg() # If the broker manages sub realms, fill the satellite links... if manage_sub_realms: # Now pollers for poller_id in self.get_all_subs_satellites_by_type( 'pollers', realms): poller = pollers[poller_id] cfg['pollers'][poller.uuid] = poller.give_satellite_cfg() # Now reactionners for reactionner_id in self.get_all_subs_satellites_by_type( 'reactionners', realms): reactionner = reactionners[reactionner_id] cfg['reactionners'][ reactionner.uuid] = reactionner.give_satellite_cfg() # Now receivers for receiver_id in self.get_all_subs_satellites_by_type( 'receivers', realms): receiver = receivers[receiver_id] cfg['receivers'][receiver.uuid] = receiver.give_satellite_cfg() return cfg def get_links_for_a_scheduler(self, pollers, reactionners, brokers): """Get a configuration dictionary with pollers, reactionners and brokers links for a scheduler :return: dict containing pollers, reactionners and brokers links (key is satellite id) :rtype: dict """ # Create void satellite links cfg = { 'pollers': {}, 'reactionners': {}, 'brokers': {}, } # Our self.daemons are only identifiers... that we use to fill the satellite links try: for poller in self.pollers + self.get_potential_satellites_by_type( pollers, "poller"): if poller in pollers: poller = pollers[poller] cfg['pollers'][poller.uuid] = poller.give_satellite_cfg() for reactionner in self.reactionners + self.get_potential_satellites_by_type( reactionners, "reactionner"): if reactionner in reactionners: reactionner = reactionners[reactionner] cfg['reactionners'][ reactionner.uuid] = reactionner.give_satellite_cfg() for broker in self.brokers + self.get_potential_satellites_by_type( brokers, "broker"): if broker in brokers: broker = brokers[broker] cfg['brokers'][broker.uuid] = broker.give_satellite_cfg() except Exception as exp: # pylint: disable=broad-except logger.exception("realm.get_links_for_a_scheduler: %s", exp) # for poller in self.get_potential_satellites_by_type(pollers, "poller"): # logger.info("Poller: %s", poller) # cfg['pollers'][poller.uuid] = poller.give_satellite_cfg() # # for reactionner in self.get_potential_satellites_by_type(reactionners, "reactionner"): # cfg['reactionners'][reactionner.uuid] = reactionner.give_satellite_cfg() # # for broker in self.get_potential_satellites_by_type(brokers, "broker"): # cfg['brokers'][broker.uuid] = broker.give_satellite_cfg() return cfg
class Receiver(Satellite): """Receiver class. Referenced as "app" in most Interface """ my_type = 'receiver' properties = Satellite.properties.copy() properties.update({ 'type': StringProp(default='receiver'), 'port': IntegerProp(default=7773) }) def __init__(self, **kwargs): """Receiver daemon initialisation :param kwargs: command line arguments """ super(Receiver, self).__init__(kwargs.get('daemon_name', 'Default-receiver'), **kwargs) # Our schedulers and arbiters are initialized in the base class # Our related daemons # self.pollers = {} # self.reactionners = {} # Modules are load one time self.have_modules = False # Now an external commands manager and a list for the external_commands self.external_commands_manager = None # and the unprocessed one, a buffer self.unprocessed_external_commands = [] self.accept_passive_unknown_check_results = False self.http_interface = GenericInterface(self) def add(self, elt): """Generic function to add objects to the daemon internal lists. Manage Broks, External commands :param elt: object to add :type elt: fusionsupervision.FusionsupervisionObject :return: None """ # external commands may be received as a dictionary when pushed from the WebUI if isinstance( elt, dict ) and 'my_type' in elt and elt['my_type'] == "externalcommand": if 'cmd_line' not in elt: logger.debug( "Received a bad formated external command: %s. " "No cmd_line!", elt) return logger.debug("Received a dictionary external command: %s", elt) if 'creation_timestamp' not in elt: elt['creation_timestamp'] = None elt = ExternalCommand(elt['cmd_line'], elt['creation_timestamp']) if isinstance(elt, Brok): # For brok, we tag the brok with our instance_id elt.instance_id = self.instance_id if elt.type == 'monitoring_log': # The brok is a monitoring event with self.events_lock: self.events.append(elt) statsmgr.counter('events', 1) else: with self.broks_lock: self.broks.append(elt) statsmgr.counter('broks.added', 1) elif isinstance(elt, ExternalCommand): logger.debug("Queuing an external command: %s", str(ExternalCommand.__dict__)) self.unprocessed_external_commands.append(elt) statsmgr.counter('external-commands.added', 1) def setup_new_conf(self): """Receiver custom setup_new_conf method This function calls the base satellite treatment and manages the configuration needed for a receiver daemon: - get and configure its satellites - configure the modules :return: None """ # Execute the base class treatment... super(Receiver, self).setup_new_conf() # ...then our own specific treatment! with self.conf_lock: # self_conf is our own configuration from the fusionsupervision environment # self_conf = self.cur_conf['self_conf'] logger.debug("Got config: %s", self.cur_conf) # Configure and start our modules if not self.have_modules: try: self.modules = unserialize(self.cur_conf['modules'], no_load=True) except FusionsupervisionClassLookupException as exp: # pragma: no cover, simple protection logger.error( 'Cannot un-serialize modules configuration ' 'received from arbiter: %s', exp) if self.modules: logger.info("I received some modules configuration: %s", self.modules) self.have_modules = True self.do_load_modules(self.modules) # and start external modules too self.modules_manager.start_external_instances() else: logger.info("I do not have modules") # Now create the external commands manager # We are a receiver: our role is to get and dispatch commands to the schedulers global_conf = self.cur_conf.get('global_conf', None) if not global_conf: logger.error( "Received a configuration without any global_conf! " "This may hide a configuration problem with the " "realms and the manage_sub_realms of the satellites!") global_conf = { 'accept_passive_unknown_check_results': False, 'log_external_commands': True } self.external_commands_manager = \ ExternalCommandManager(None, 'receiver', self, global_conf.get( 'accept_passive_unknown_check_results', False), global_conf.get( 'log_external_commands', False)) # Initialize connection with all our satellites logger.info("Initializing connection with my satellites:") my_satellites = self.get_links_of_type(s_type='') for satellite in list(my_satellites.values()): logger.info("- : %s/%s", satellite.type, satellite.name) if not self.daemon_connection_init(satellite): logger.error("Satellite connection failed: %s", satellite) # Now I have a configuration! self.have_conf = True def get_external_commands_from_arbiters(self): """Get external commands from our arbiters As of now, only the arbiter are requested to provide their external commands that the receiver will push to all the known schedulers to make them being executed. :return: None """ for arbiter_link_uuid in self.arbiters: link = self.arbiters[arbiter_link_uuid] if not link.active: logger.debug( "The arbiter '%s' is not active, it is not possible to get " "its external commands!", link.name) continue try: logger.debug("Getting external commands from: %s", link.name) external_commands = link.get_external_commands() if external_commands: logger.debug("Got %d commands from: %s", len(external_commands), link.name) else: # Simple protection against None value external_commands = [] for external_command in external_commands: self.add(external_command) except LinkError: logger.warning( "Arbiter connection failed, I could not get external commands!" ) except Exception as exp: # pylint: disable=broad-except logger.error( "Arbiter connection failed, I could not get external commands!" ) logger.exception("Exception: %s", exp) def push_external_commands_to_schedulers(self): """Push received external commands to the schedulers :return: None """ if not self.unprocessed_external_commands: return # Those are the global external commands commands_to_process = self.unprocessed_external_commands self.unprocessed_external_commands = [] logger.debug("Commands: %s", commands_to_process) # Now get all external commands and put them into the good schedulers logger.debug("Commands to process: %d commands", len(commands_to_process)) for ext_cmd in commands_to_process: cmd = self.external_commands_manager.resolve_command(ext_cmd) logger.debug("Resolved command: %s, result: %s", ext_cmd.cmd_line, cmd) if cmd and cmd['global']: # Send global command to all our schedulers for scheduler_link_uuid in self.schedulers: self.schedulers[ scheduler_link_uuid].pushed_commands.append(ext_cmd) # Now for all active schedulers, send the commands count_pushed_commands = 0 count_failed_commands = 0 for scheduler_link_uuid in self.schedulers: link = self.schedulers[scheduler_link_uuid] if not link.active: logger.debug( "The scheduler '%s' is not active, it is not possible to push " "external commands to its connection!", link.name) continue # If there are some commands for this scheduler... commands = [ext_cmd.cmd_line for ext_cmd in link.pushed_commands] if not commands: logger.debug("The scheduler '%s' has no commands.", link.name) continue logger.debug("Sending %d commands to scheduler %s", len(commands), link.name) sent = [] try: sent = link.push_external_commands(commands) except LinkError: logger.warning( "Scheduler connection failed, I could not push external commands!" ) # Whether we sent the commands or not, clean the scheduler list link.pushed_commands = [] # If we didn't sent them, add the commands to the arbiter list if sent: statsmgr.gauge('external-commands.pushed.%s' % link.name, len(commands)) count_pushed_commands = count_pushed_commands + len(commands) else: count_failed_commands = count_failed_commands + len(commands) statsmgr.gauge('external-commands.failed.%s' % link.name, len(commands)) # Kepp the not sent commands... for a next try self.external_commands.extend(commands) statsmgr.gauge('external-commands.pushed.all', count_pushed_commands) statsmgr.gauge('external-commands.failed.all', count_failed_commands) def do_loop_turn(self): """Receiver daemon main loop :return: None """ # Begin to clean modules self.check_and_del_zombie_modules() # Maybe the arbiter pushed a new configuration... if self.watch_for_new_conf(timeout=0.05): logger.info("I got a new configuration...") # Manage the new configuration self.setup_new_conf() # Maybe external modules raised 'objects' # we should get them _t0 = time.time() self.get_objects_from_from_queues() statsmgr.timer('core.get-objects-from-queues', time.time() - _t0) # Get external commands from the arbiters... _t0 = time.time() self.get_external_commands_from_arbiters() statsmgr.timer('external-commands.got.time', time.time() - _t0) statsmgr.gauge('external-commands.got.count', len(self.unprocessed_external_commands)) _t0 = time.time() self.push_external_commands_to_schedulers() statsmgr.timer('external-commands.pushed.time', time.time() - _t0) # Say to modules it's a new tick :) _t0 = time.time() self.hook_point('tick') statsmgr.timer('hook.tick', time.time() - _t0) def get_daemon_stats(self, details=False): """Increase the stats provided by the Daemon base class :return: stats dictionary :rtype: dict """ # Call the base Daemon one res = super(Receiver, self).get_daemon_stats(details=details) res.update({'name': self.name, 'type': self.type}) counters = res['counters'] counters['external-commands'] = len(self.external_commands) counters['external-commands-unprocessed'] = len( self.unprocessed_external_commands) return res def main(self): """Main receiver function Init daemon and loop forever :return: None """ try: # Start the daemon mode if not self.do_daemon_init_and_start(): self.exit_on_error(message="Daemon initialization error", exit_code=3) # We wait for initial conf self.wait_for_initial_conf() if self.new_conf: # Setup the received configuration self.setup_new_conf() # Now the main loop self.do_main_loop() logger.info("Exited from the main loop.") self.request_stop() except Exception: # pragma: no cover, this should never happen indeed ;) self.exit_on_exception(traceback.format_exc()) raise
class NotificationWay(Item): """NotificationWay class is used to implement way of sending notifications (command, periods..) """ my_type = 'notificationway' properties = Item.properties.copy() properties.update({ 'notificationway_name': StringProp(fill_brok=['full_status']), 'host_notifications_enabled': BoolProp(default=True, fill_brok=['full_status']), 'service_notifications_enabled': BoolProp(default=True, fill_brok=['full_status']), 'host_notification_period': StringProp(fill_brok=['full_status']), 'service_notification_period': StringProp(fill_brok=['full_status']), 'host_notification_options': ListProp(default=[''], fill_brok=['full_status'], split_on_comma=True), 'service_notification_options': ListProp(default=[''], fill_brok=['full_status'], split_on_comma=True), 'host_notification_commands': ListProp(default=[], fill_brok=['full_status']), 'service_notification_commands': ListProp(default=[], fill_brok=['full_status']), 'min_business_impact': IntegerProp(default=0, fill_brok=['full_status']), }) running_properties = Item.running_properties.copy() # This tab is used to transform old parameters name into new ones # so from Nagios2 format, to Nagios3 ones. # Or FusionSupervision Engine deprecated names like criticity old_properties = { 'min_criticity': 'min_business_impact', } macros = {} special_properties = ('service_notification_commands', 'host_notification_commands', 'service_notification_period', 'host_notification_period') def __init__(self, params=None, parsing=True): if params is None: params = {} # At deserialization, thoses are dict # TODO: Separate parsing instance from recreated ones for prop in ['service_notification_commands', 'host_notification_commands']: if prop in params and isinstance(params[prop], list) and params[prop] \ and isinstance(params[prop][0], dict): new_list = [CommandCall(elem, parsing=parsing) for elem in params[prop]] # We recreate the object setattr(self, prop, new_list) # And remove prop, to prevent from being overridden del params[prop] super(NotificationWay, self).__init__(params, parsing=parsing) def serialize(self): res = super(NotificationWay, self).serialize() for prop in ['service_notification_commands', 'host_notification_commands']: if getattr(self, prop) is None: res[prop] = None else: res[prop] = [elem.serialize() for elem in getattr(self, prop)] return res def get_name(self): """Accessor to notificationway_name attribute :return: notificationway name :rtype: str """ return self.notificationway_name def want_service_notification(self, timeperiods, timestamp, state, n_type, business_impact, cmd=None): # pylint: disable=too-many-return-statements """Check if notification options match the state of the service Notification is NOT wanted in ONE of the following case:: * service notifications are disabled * cmd is not in service_notification_commands * business_impact < self.min_business_impact * service_notification_period is not valid * state does not match service_notification_options for problem, recovery and flapping * state does not match host_notification_options for downtime :param timestamp: time we want to notify the contact (usually now) :type timestamp: int :param state: host or service state ("WARNING", "CRITICAL" ..) :type state: str :param n_type: type of notification ("PROBLEM", "RECOVERY" ..) :type n_type: str :param business_impact: impact of this service :type business_impact: int :param cmd: command launched to notify the contact :type cmd: str :return: True if no condition is matched, otherwise False :rtype: bool TODO: Simplify function """ if not self.service_notifications_enabled: return False # Maybe the command we ask for are not for us, but for another notification ways # on the same contact. If so, bail out if cmd and cmd not in self.service_notification_commands: return False # If the business_impact is not high enough, we bail out if business_impact < self.min_business_impact: return False notif_period = timeperiods[self.service_notification_period] in_notification_period = notif_period.is_time_valid(timestamp) if 'n' in self.service_notification_options: return False if in_notification_period: short_states = { u'WARNING': 'w', u'UNKNOWN': 'u', u'CRITICAL': 'c', u'RECOVERY': 'r', u'FLAPPING': 'f', u'DOWNTIME': 's' } if n_type == u'PROBLEM' and state in short_states: return short_states[state] in self.service_notification_options if n_type == u'RECOVERY' and n_type in short_states: return short_states[n_type] in self.service_notification_options if n_type == u'ACKNOWLEDGEMENT': return in_notification_period if n_type in (u'FLAPPINGSTART', u'FLAPPINGSTOP', u'FLAPPINGDISABLED'): return 'f' in self.service_notification_options if n_type in (u'DOWNTIMESTART', u'DOWNTIMEEND', u'DOWNTIMECANCELLED'): # No notification when a downtime was cancelled. Is that true?? # According to the documentation we need to look at _host_ options return 's' in self.host_notification_options return False def want_host_notification(self, timperiods, timestamp, state, n_type, business_impact, cmd=None): # pylint: disable=too-many-return-statements """Check if notification options match the state of the host Notification is NOT wanted in ONE of the following case:: * host notifications are disabled * cmd is not in host_notification_commands * business_impact < self.min_business_impact * host_notification_period is not valid * state does not match host_notification_options for problem, recovery, flapping and dt :param timestamp: time we want to notify the contact (usually now) :type timestamp: int :param state: host or service state ("WARNING", "CRITICAL" ..) :type state: str :param n_type: type of notification ("PROBLEM", "RECOVERY" ..) :type n_type: str :param business_impact: impact of this service :type business_impact: int :param cmd: command launched to notify the contact :type cmd: str :return: True if no condition is matched, otherwise False :rtype: bool TODO: Simplify function """ if not self.host_notifications_enabled: return False # If the business_impact is not high enough, we bail out if business_impact < self.min_business_impact: return False # Maybe the command we ask for are not for us, but for another notification ways # on the same contact. If so, bail out if cmd and cmd not in self.host_notification_commands: return False notif_period = timperiods[self.host_notification_period] in_notification_period = notif_period.is_time_valid(timestamp) if 'n' in self.host_notification_options: return False if in_notification_period: short_states = { u'DOWN': 'd', u'UNREACHABLE': 'u', u'RECOVERY': 'r', u'FLAPPING': 'f', u'DOWNTIME': 's' } if n_type == u'PROBLEM' and state in short_states: return short_states[state] in self.host_notification_options if n_type == u'RECOVERY' and n_type in short_states: return short_states[n_type] in self.host_notification_options if n_type == u'ACKNOWLEDGEMENT': return in_notification_period if n_type in (u'FLAPPINGSTART', u'FLAPPINGSTOP', u'FLAPPINGDISABLED'): return 'f' in self.host_notification_options if n_type in (u'DOWNTIMESTART', u'DOWNTIMEEND', u'DOWNTIMECANCELLED'): return 's' in self.host_notification_options return False def get_notification_commands(self, o_type): """Get notification commands for object type :param o_type: object type (host or service) :type o_type: str :return: command list :rtype: list[fusionsupervision.objects.command.Command] """ # service_notification_commands for service notif_commands_prop = o_type + '_notification_commands' notif_commands = getattr(self, notif_commands_prop) return notif_commands def is_correct(self): # pylint: disable=too-many-branches """Check if this object configuration is correct :: * Check our own specific properties * Call our parent class is_correct checker :return: True if the configuration is correct, otherwise False :rtype: bool """ state = True # Do not execute checks if notifications are disabled if (hasattr(self, 'service_notification_options') and self.service_notification_options == ['n']): if (hasattr(self, 'host_notification_options') and self.host_notification_options == ['n']): return True # Internal checks before executing inherited function... # Service part if not hasattr(self, 'service_notification_commands'): msg = "[notificationway::%s] do not have any service_notification_commands defined" % ( self.get_name() ) self.add_error(msg) state = False else: for cmd in self.service_notification_commands: if cmd is None: msg = "[notificationway::%s] a service_notification_command is missing" % ( self.get_name() ) self.add_error(msg) state = False elif not cmd.is_valid(): msg = "[notificationway::%s] a service_notification_command is invalid" % ( self.get_name() ) self.add_error(msg) state = False if getattr(self, 'service_notification_period', None) is None: msg = "[notificationway::%s] the service_notification_period is invalid" % ( self.get_name() ) self.add_error(msg) state = False # Now host part if not hasattr(self, 'host_notification_commands'): msg = "[notificationway::%s] do not have any host_notification_commands defined" % ( self.get_name() ) self.add_error(msg) state = False else: for cmd in self.host_notification_commands: if cmd is None: msg = "[notificationway::%s] a host_notification_command is missing" % ( self.get_name() ) self.add_error(msg) state = False elif not cmd.is_valid(): msg = "[notificationway::%s] a host_notification_command is invalid (%s)" % ( cmd.get_name(), str(cmd.__dict__) ) self.add_error(msg) state = False if getattr(self, 'host_notification_period', None) is None: msg = "[notificationway::%s] the host_notification_period is invalid" % ( self.get_name() ) self.add_error(msg) state = False return super(NotificationWay, self).is_correct() and state
class Downtime(FusionsupervisionObject): """ Schedules downtime for a specified service. If the "fixed" argument is set to one (1), downtime will start and end at the times specified by the "start" and "end" arguments. Otherwise, downtime will begin between the "start" and "end" times and last for "duration" seconds. The "start" and "end" arguments are specified in time_t format (seconds since the UNIX epoch). The specified service downtime can be triggered by another downtime entry if the "trigger_id" is set to the ID of another scheduled downtime entry. Set the "trigger_id" argument to zero (0) if the downtime for the specified service should not be triggered by another downtime entry. """ my_type = 'downtime' properties = { 'activate_me': StringProp(default=u''), 'entry_time': IntegerProp(default=0, fill_brok=['full_status']), 'fixed': BoolProp(default=True, fill_brok=['full_status']), 'start_time': IntegerProp(default=0, fill_brok=['full_status']), 'duration': IntegerProp(default=0, fill_brok=['full_status']), 'trigger_id': StringProp(default=u''), 'end_time': IntegerProp(default=0, fill_brok=['full_status']), 'real_end_time': IntegerProp(default=0), 'author': StringProp(default=u'FusionSupervision Engine', fill_brok=['full_status']), 'comment': StringProp(default=u''), 'is_in_effect': BoolProp(default=False), 'has_been_triggered': BoolProp(default=False), 'can_be_deleted': BoolProp(default=False), 'ref': StringProp(default=u'unset'), 'ref_type': StringProp(default=u'unset'), 'comment_id': StringProp(default=u''), } def __init__(self, params, parsing=False): creating = 'uuid' not in params super(Downtime, self).__init__(params, parsing=parsing) self.fill_default() if creating: self.activate_me = [] # The other downtimes i need to activate self.entry_time = int(time.time()) if self.trigger_id not in [ '', '0' ]: # triggered plus fixed makes no sense self.fixed = False if self.fixed: self.duration = self.end_time - self.start_time # This is important for flexible downtimes. Here start_time and # end_time mean: in this time interval it is possible to trigger # the beginning of the downtime which lasts for duration. # Later, when a non-ok event happens, real_end_time will be # recalculated from now+duration # end_time will be displayed in the web interface, but real_end_time # is used internally self.real_end_time = self.end_time self.is_in_effect = False self.has_been_triggered = False # another downtime has triggered me self.can_be_deleted = False def __str__(self): # pragma: no cover if self.is_in_effect is True: active = "active" else: active = "inactive" if self.fixed is True: d_type = "fixed" else: d_type = "flexible" return "%s %s Downtime id=%s %s - %s" % (active, d_type, self.uuid, time.ctime(self.start_time), time.ctime(self.end_time)) def trigger_me(self, other_downtime): """Wrapper to activate_me.append function Used to add another downtime to activate :param other_downtime: other downtime to activate/cancel :type other_downtime: :return: None """ self.activate_me.append(other_downtime) def in_scheduled_downtime(self): """Getter for is_in_effect attribute :return: True if downtime is in effect, False otherwise :rtype: bool """ return self.is_in_effect def enter(self, timeperiods, hosts, services): """Set ref in scheduled downtime and raise downtime log entry (start) :param hosts: hosts objects to get item ref :type hosts: fusionsupervision.objects.host.Hosts :param services: services objects to get item ref :type services: fusionsupervision.objects.service.Services :return: broks :rtype: list of broks """ if self.ref in hosts: item = hosts[self.ref] else: item = services[self.ref] broks = [] self.is_in_effect = True if self.fixed is False: now = time.time() self.real_end_time = now + self.duration item.scheduled_downtime_depth += 1 item.in_scheduled_downtime = True if item.scheduled_downtime_depth == 1: item.raise_enter_downtime_log_entry() notification_period = None if getattr(item, 'notification_period', None) is not None: notification_period = timeperiods[item.notification_period] # Notification author data # todo: note that alias and name are not implemented yet author_data = { 'author': self.author, 'author_name': u'Not available', 'author_alias': u'Not available', 'author_comment': self.comment } item.create_notifications('DOWNTIMESTART', notification_period, hosts, services, author_data=author_data) if self.ref in hosts: broks.append(self.get_raise_brok(item.get_name())) # For an host, acknowledge the host problem (and its services problems) # Acknowledge the host with a sticky ack and notifications # The acknowledge will expire at the same time as the downtime end item.acknowledge_problem( notification_period, hosts, services, 2, 1, "FusionSupervision Engine", "Acknowledged because of an host downtime") else: broks.append( self.get_raise_brok(item.host_name, item.get_name())) for downtime_id in self.activate_me: for host in hosts: if downtime_id in host.downtimes: downtime = host.downtimes[downtime_id] broks.extend(downtime.enter(timeperiods, hosts, services)) for service in services: if downtime_id in service.downtimes: downtime = service.downtimes[downtime_id] broks.extend(downtime.enter(timeperiods, hosts, services)) return broks def exit(self, timeperiods, hosts, services): """Remove ref in scheduled downtime and raise downtime log entry (exit) :param hosts: hosts objects to get item ref :type hosts: fusionsupervision.objects.host.Hosts :param services: services objects to get item ref :type services: fusionsupervision.objects.service.Services :return: [], always | None :rtype: list """ if self.ref in hosts: item = hosts[self.ref] else: item = services[self.ref] broks = [] # If not is_in_effect means that ot was probably a flexible downtime which was # not triggered. In this case, nothing special to do... if self.is_in_effect is True: # This was a fixed or a flexible+triggered downtime self.is_in_effect = False item.scheduled_downtime_depth -= 1 if item.scheduled_downtime_depth == 0: item.raise_exit_downtime_log_entry() notification_period = timeperiods[item.notification_period] # Notification author data # todo: note that alias and name are not implemented yet author_data = { 'author': self.author, 'author_name': u'Not available', 'author_alias': u'Not available', 'author_comment': self.comment } item.create_notifications(u'DOWNTIMEEND', notification_period, hosts, services, author_data=author_data) item.in_scheduled_downtime = False if self.ref in hosts: broks.append(self.get_expire_brok(item.get_name())) else: broks.append( self.get_expire_brok(item.host_name, item.get_name())) item.del_comment(self.comment_id) self.can_be_deleted = True # when a downtime ends and the concerned item was a problem # a notification should be sent with the next critical check # So we should set a flag here which informs the consume_result function # to send a notification item.in_scheduled_downtime_during_last_check = True return broks def cancel(self, timeperiods, hosts, services): """Remove ref in scheduled downtime and raise downtime log entry (cancel) :param hosts: hosts objects to get item ref :type hosts: fusionsupervision.objects.host.Hosts :param services: services objects to get item ref :type services: fusionsupervision.objects.service.Services :return: [], always :rtype: list """ if self.ref in hosts: item = hosts[self.ref] else: item = services[self.ref] broks = [] self.is_in_effect = False item.scheduled_downtime_depth -= 1 if item.scheduled_downtime_depth == 0: item.raise_cancel_downtime_log_entry() item.in_scheduled_downtime = False if self.ref in hosts: broks.append(self.get_expire_brok(item.get_name())) else: broks.append( self.get_expire_brok(item.host_name, item.get_name())) self.del_automatic_comment(item) self.can_be_deleted = True item.in_scheduled_downtime_during_last_check = True # Nagios does not notify on canceled downtimes # res.extend(self.ref.create_notifications('DOWNTIMECANCELLED')) # Also cancel other downtimes triggered by me for downtime in self.activate_me: broks.extend(downtime.cancel(timeperiods, hosts, services)) return broks def add_automatic_comment(self, ref): """Add comment on ref for downtime :param ref: the host/service we want to link a comment to :type ref: fusionsupervision.objects.schedulingitem.SchedulingItem :return: None """ if self.fixed is True: text = ( DOWNTIME_FIXED_MESSAGE % (ref.my_type, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.start_time)), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.end_time)), ref.my_type)) else: hours, remainder = divmod(self.duration, 3600) minutes, _ = divmod(remainder, 60) text = (DOWNTIME_FLEXIBLE_MESSAGE % (ref.my_type, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.start_time)), time.strftime( "%Y-%m-%d %H:%M:%S", time.localtime( self.end_time)), hours, minutes, ref.my_type)) data = { 'comment': text, 'comment_type': 1 if ref.my_type == 'host' else 2, 'entry_type': 2, 'source': 0, 'expires': False, 'ref': ref.uuid } comment = Comment(data) self.comment_id = comment.uuid ref.comments[comment.uuid] = comment return comment def del_automatic_comment(self, item): """Remove automatic comment on ref previously created :param item: item service or host :type item: object :return: None """ item.del_comment(self.comment_id) self.comment_id = '' def fill_data_brok_from(self, data, brok_type): """Fill data with info of item by looking at brok_type in props of properties or running_properties :param data: data to fill :type data: :param brok_type: type of brok :type brok_type: str :return: None TODO: Duplicate from Notification.fill_data_brok_from """ cls = self.__class__ # Now config properties for prop, entry in list(cls.properties.items()): if hasattr(prop, 'fill_brok'): if brok_type in entry['fill_brok']: data[prop] = getattr(self, prop) def get_raise_brok(self, host_name, service_name=''): """Get a start downtime brok :param host_name: host concerned by the downtime :type host_name :param service_name: service concerned by the downtime :type service_name :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ data = self.serialize() data['host'] = host_name if service_name != '': data['service'] = service_name return Brok({'type': 'downtime_raise', 'data': data}) def get_expire_brok(self, host_name, service_name=''): """Get an expire downtime brok :param host_name: host concerned by the downtime :type host_name :param service_name: service concerned by the downtime :type service_name :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ data = self.serialize() data['host'] = host_name if service_name != '': data['service'] = service_name return Brok({'type': 'downtime_expire', 'data': data})
class Comment(FusionsupervisionObject): """Comment class implements comments for monitoring purpose. It contains data like author, type etc.. """ my_type = 'comment' properties = { 'entry_time': IntegerProp(default=0), 'entry_type': IntegerProp(), 'author': StringProp(default=u'FusionSupervision Engine'), 'comment': StringProp(default=u''), 'comment_type': IntegerProp(), 'source': IntegerProp(default=0), 'expires': BoolProp(default=False), 'ref': StringProp(default=u'unset'), 'ref_type': StringProp(default=u'unset'), } def __init__(self, params, parsing=False): """Adds a comment to a particular service. :param ref: reference object (host / service) :type ref: fusionsupervision.object.schedulingitem.SchedulingItem :param author: Author of this comment :type author: str :param comment: text comment itself :type comment: str :param comment_type: comment type :: * 1 <=> HOST_COMMENT * 2 <=> SERVICE_COMMENT :type comment_type: int :param entry_type: type of entry linked to this comment :: * 1 <=> USER_COMMENT * 2 <=>DOWNTIME_COMMENT * 3 <=>FLAPPING_COMMENT * 4 <=>ACKNOWLEDGEMENT_COMMENT :type entry_type: int :param source: source of this comment :: * 0 <=> COMMENTSOURCE_INTERNAL * 1 <=> COMMENTSOURCE_EXTERNAL :type source: int :param expires: comment expires or not :type expires: bool :return: None """ super(Comment, self).__init__(params, parsing) if not hasattr(self, 'entry_time') or not self.entry_time: self.entry_time = int(time.time()) self.fill_default() def __str__(self): # pragma: no cover return "Comment id=%s %s" % (self.uuid, self.comment) def get_comment_brok(self, host_name, service_name=''): """Get a comment brok :param host_name: :param service_name: :return: brok with wanted data :rtype: fusionsupervision.brok.Brok """ data = self.serialize() data['host'] = host_name if service_name: data['service'] = service_name return Brok({'type': 'comment', 'data': data})