def initialize(self, conf_reader, msgQlist, product):
    """Set up the parent thread, IEM bookkeeping and the systemd manager proxy.

    Args:
        conf_reader: Forwarded unchanged to the parent ``initialize``.
        msgQlist: Forwarded unchanged to the parent ``initialize_msgQ``.
        product: Accepted for interface compatibility; not read here.

    Returns:
        bool: Always ``True``.
    """
    # Parent set-up: ScheduledMonitorThread first, then the internal
    # message queues for this module.
    parent = super(ServiceMonitor, self)
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    # IEM helper and the event code used for kafka alerts.
    self.iem = Iem()
    self.iem.check_exsisting_fault_iems()
    kafka_active_entry = self.iem.EVENT_CODE["KAFKA_ACTIVE"]
    self.KAFKA = kafka_active_entry[1]

    # Integrate into the main dbus loop to catch events.
    DBusGMainLoop(set_as_default=True)

    # System bus connection and the systemd Manager interface on top of it.
    self._bus = SystemBus()
    systemd_proxy = self._bus.get_object(
        "org.freedesktop.systemd1",
        "/org/freedesktop/systemd1",
    )
    self._manager = Interface(
        systemd_proxy,
        dbus_interface='org.freedesktop.systemd1.Manager',
    )

    self.remove_disabled_services()
    return True
def initialize(self, conf_reader, msgQlist, product):
    """Set up the parent thread, IEM state, D-Bus access and the persisted cache.

    Besides the parent/IEM/D-Bus set-up, this restores the three tracking
    structures (``not_active_services``, ``failed_services``,
    ``service_status``) from the per-node data file when that file holds
    data, and otherwise writes the current in-memory values to it.

    Args:
        conf_reader: Forwarded unchanged to the parent ``initialize``.
        msgQlist: Forwarded unchanged to the parent ``initialize_msgQ``.
        product: Accepted for interface compatibility; not read here.

    Returns:
        bool: Always ``True``.
    """
    # Parent set-up: ScheduledMonitorThread first, then the internal
    # message queues for this module.
    parent = super(ServiceMonitor, self)
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    # IEM helper and the event code used for kafka alerts.
    self.iem = Iem()
    self.iem.check_exsisting_fault_iems()
    self.KAFKA = self.iem.EVENT_CODE["KAFKA_ACTIVE"][1]

    # Integrate into the main dbus loop to catch events.
    DBusGMainLoop(set_as_default=True)

    # System bus connection and the systemd Manager interface on top of it.
    self._bus = SystemBus()
    systemd_proxy = self._bus.get_object(
        "org.freedesktop.systemd1",
        "/org/freedesktop/systemd1",
    )
    self._manager = Interface(
        systemd_proxy,
        dbus_interface='org.freedesktop.systemd1.Manager',
    )

    # Location of this node's persisted service data.
    self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
    cache_dir = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.SERVICE_MONITOR_DATA_PATH = os.path.join(
        cache_dir, f'SERVICE_MONITOR_DATA_{self._node_id}')

    # Load whatever was stored by a previous run, if the file exists.
    self.persistent_service_data = {}
    if os.path.isfile(self.SERVICE_MONITOR_DATA_PATH):
        self.persistent_service_data = store.get(
            self.SERVICE_MONITOR_DATA_PATH)

    tracked_attrs = (
        'not_active_services',
        'failed_services',
        'service_status',
    )
    if self.persistent_service_data:
        # Stored data wins over the freshly constructed defaults.
        for attr in tracked_attrs:
            setattr(self, attr, self.persistent_service_data[attr])
    else:
        # Nothing stored yet: seed the file from the in-memory values.
        self.persistent_service_data = {
            attr: getattr(self, attr) for attr in tracked_attrs
        }
        store.put(self.persistent_service_data,
                  self.SERVICE_MONITOR_DATA_PATH)

    self.remove_disabled_services()
    return True
def initialize(self, conf_reader, msgQlist, product):
    """Set up the parent thread, IEM state, D-Bus and every configured service.

    Args:
        conf_reader: Forwarded unchanged to the parent ``initialize``.
        msgQlist: Forwarded unchanged to the parent ``initialize_msgQ``.
        product: Accepted for interface compatibility; not read here.

    Returns:
        bool: Always ``True``.
    """
    # Parent set-up: ScheduledMonitorThread first, then the internal
    # message queues for this module.
    parent = super(ServiceMonitor, self)
    parent.initialize(conf_reader)
    parent.initialize_msgQ(msgQlist)

    # IEM helper and the event code used for kafka alerts.
    iem_helper = Iem()
    self.iem = iem_helper
    iem_helper.check_exsisting_fault_iems()
    self.KAFKA = iem_helper.EVENT_CODE["KAFKA_ACTIVE"][1]

    # The bus connection must exist before any service is initialized.
    self.initialize_dbus()
    for unit_name in self.services_to_monitor:
        self.initialize_service(unit_name)

    self.subscribe_unit_file_changed_signal()
    return True
def test_iem_alerts(self):
    """Raise an 'ipmitool' fault IEM and check the received alert's fields."""
    check_sspl_ll_is_running()
    Iem().iem_fault("IPMITOOL_ERROR")
    time.sleep(10)

    EventMessage.subscribe(component='sspl')
    fault_alert = EventMessage.receive()
    print(f"IEM Received:{fault_alert}")

    assert fault_alert is not None

    # (section, field) pairs under fault_alert["iem"] that must be populated;
    # checked in this order.
    required_fields = (
        ("info", "severity"),
        ("info", "type"),
        ("info", "event_time"),
        ("source", "module"),
        ("contents", "event"),
    )
    for section, field in required_fields:
        assert fault_alert["iem"][section][field] is not None
class ServiceMonitor(SensorThread, InternalMsgQ):
    """Sensor to monitor state change events of services.

    The sensor keeps one ``Service`` object per monitored unit in
    ``self.services``, pumps the GLib main context for D-Bus signals every
    ``thread_sleep`` seconds, and every ``polling_frequency`` seconds retries
    failed initializations and runs the active / non-active threshold checks.
    Alerts are forwarded to ``ServiceMsgHandler``.
    """

    SENSOR_NAME = "ServiceMonitor"
    PRIORITY = 2

    # Section and keys in configuration file
    SERVICEMONITOR = SENSOR_NAME.upper()
    MONITORED_SERVICES = 'monitored_services'
    THREAD_SLEEP = 'thread_sleep'
    POLLING_FREQUENCY = 'polling_frequency'

    # Dependency list
    # NOTE(review): "SeviceMsgHandler" looks like a misspelling of the
    # ServiceMsgHandler that raise_alert() writes to. The value is left
    # untouched because its consumer is not visible here - confirm and fix.
    DEPENDENCIES = {"plugins": ["SeviceMsgHandler"]}

    RESOURCE_TYPE = "node:sw:os:service"

    @staticmethod
    def name():
        """@return: name of the module."""
        return ServiceMonitor.SENSOR_NAME

    def __init__(self):
        """Initialize the relevant datastructures."""
        super(ServiceMonitor, self).__init__(self.SENSOR_NAME,
                                             self.PRIORITY)

        # Names of the units to monitor, as configured.
        self.services_to_monitor = set(Conf.get(
            SSPL_CONF, f"{self.SERVICEMONITOR}>{self.MONITORED_SERVICES}", []))

        # service name -> Service object, filled by initialize_service().
        self.services = {}

        # Seconds between two passes of the event loop in run().
        self.thread_sleep = int(Conf.get(
            SSPL_CONF, f"{self.SERVICEMONITOR}>{self.THREAD_SLEEP}", "1"))

        # Seconds between two rounds of the periodic checks in run().
        self.polling_frequency = int(Conf.get(
            SSPL_CONF, f"{self.SERVICEMONITOR}>{self.POLLING_FREQUENCY}",
            "30"))

    def read_data(self):
        """Return the dict of service status.

        This class never assigns a ``service_status`` attribute, so the
        mapping is derived from the ``Service`` objects in ``self.services``.

        Returns:
            dict: ``{service_name: {"state", "substate", "pid"}}``.
        """
        return {
            name: {
                "state": service.state,
                "substate": service.substate,
                "pid": service.pid,
            }
            for name, service in self.services.items()
        }

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues.

        Args:
            conf_reader: Forwarded to the parent ``initialize``.
            msgQlist: Forwarded to the parent ``initialize_msgQ``.
            product: Not read by this method.

        Returns:
            bool: Always ``True``.
        """
        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(ServiceMonitor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(ServiceMonitor, self).initialize_msgQ(msgQlist)

        self.iem = Iem()
        self.iem.check_exsisting_fault_iems()
        self.KAFKA = self.iem.EVENT_CODE["KAFKA_ACTIVE"][1]

        self.initialize_dbus()
        for service in self.services_to_monitor:
            self.initialize_service(service)

        self.subscribe_unit_file_changed_signal()

        return True

    def run(self):
        """Run the monitoring loop until the thread stops running.

        Raises:
            ThreadException: On ``GLib.Error``, ``DBusException`` or any
                other exception escaping the loop.
        """
        try:
            logger.info(f"Monitoring Services : {self.services.keys()}")
            if not self.services_to_monitor:
                logger.info(
                    "No service to monitor, shutting down {}".format(
                        self.name()))
                self.shutdown()

            # WHILE LOOP FUNCTION : every second we check for
            # properties change event if any generated (using context
            # iteration) and after a delay of polling frequency we
            # check for non_active processes.
            iterations = 0
            # Number of loop passes between two periodic checks (at least 1).
            non_active_check = int(
                self.polling_frequency / self.thread_sleep) or 1
            while self.is_running():
                # At interval of 'thread_sleep' check for events occurred for
                # services and process them
                self.process_events()
                self.process_alerts()
                if not iterations % non_active_check:
                    # Initialize errored service again
                    for service in self.services_to_monitor.difference(
                            self.services):
                        self.initialize_service(service)
                    for service in Service.monitoring_disabled.copy():
                        self.services[service].new_unit_state(EnabledState)
                    # Check for services in intermediate state(not active)
                    self.check_nonactive_services()
                    self.check_active_services()
                time.sleep(self.thread_sleep)
                iterations += 1
            logger.info("ServiceMonitor gracefully breaking out " +
                        "of dbus Loop, not restarting.")
        except GLib.Error as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of "
                "GLib.MainLoop() with error: %s" % err) from err
        except DBusException as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of dbus loop "
                "with error: %s" % err) from err
        except Exception as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of "
                "ServiceMonitor:run() with error: %s" % err) from err

    def initialize_service(self, service_name):
        """Create (or restore from cache) the Service object for a unit.

        On ``DBusException`` the error is logged and the service is left out
        of ``self.services``; ``run`` retries such services periodically.

        Args:
            service_name: Unit name passed to the manager's ``LoadUnit``.
        """
        try:
            unit = self._bus.get_object(
                SYSTEMD_BUS, self._manager.LoadUnit(service_name))
            if Service.cache_exists(service_name):
                service = Service.from_cache(service_name, unit)
            else:
                service = Service(unit)
            service.handle_unit_state_change()
            self.services[service_name] = service
        except DBusException as err:
            # Report the caught exception instance, not the exception class.
            logger.error("Error: {} Failed to initialize service {}, "
                         "initialization will be retried in "
                         "{} seconds".format(err, service_name,
                                             self.polling_frequency))

    def subscribe_unit_file_changed_signal(self):
        """Connect ``unit_file_state_change_handler`` to 'UnitFilesChanged'."""
        self._manager.connect_to_signal(
            'UnitFilesChanged',
            self.unit_file_state_change_handler,
            dbus_interface=MANAGER_IFACE)

    @staticmethod
    def subscribe_properties_changed_signal(service):
        """Connect the service's handler to its unit's 'PropertiesChanged'.

        The signal match returned by ``connect_to_signal`` is stored on
        ``service.properties_changed_signal``.
        """
        service.properties_changed_signal = Interface(
            object=service.unit,
            dbus_interface=MANAGER_IFACE).connect_to_signal(
                'PropertiesChanged',
                service.properties_changed_handler,
                PROPERTIES_IFACE)

    def unit_file_state_change_handler(self):
        """Let every known service re-evaluate its unit state."""
        for service in self.services.values():
            service.handle_unit_state_change()

    def process_events(self):
        """Run main-context iterations until no event is pending."""
        while self.context.pending():
            self.context.iteration(False)

    def process_alerts(self):
        """Raise every alert currently queued in ``Service.alerts``."""
        while not Service.alerts.empty():
            message = Service.alerts.get()
            self.raise_alert(message)

    def initialize_dbus(self):
        """Create the bus connection, manager interface and GLib context."""
        # Integrate into the main dbus loop to catch events
        DBusGMainLoop(set_as_default=True)

        # Initialize SystemBus and get Manager Interface
        self._bus = SystemBus()
        systemd = self._bus.get_object(SYSTEMD_BUS,
                                       "/org/freedesktop/systemd1")
        self._manager = Interface(systemd, dbus_interface=MANAGER_IFACE)

        # Retrieve the main loop which will be called in the run method
        self._loop = GLib.MainLoop()
        self.context = self._loop.get_context()

    def check_nonactive_services(self):
        """
            Monitor non-active Services.

            Raise FAULT Alert if any of the not-active services has exceeded
            the threshold time for inactivity.
        """
        for service in Service.non_active.copy():
            if self.services[service].is_nonactive_for_threshold_time():
                self.services[service].new_service_state(FailedState)
                self.raise_alert(
                    self.get_alert(self.services[service], InactiveAlert))

    def check_active_services(self):
        """
            Monitor active services.

            Raise fault resolved alert for active services if service
            stays in active state for threshold time.
        """
        for service in Service.active_services.copy():
            if self.services[service].is_active_for_threshold_time():
                self.raise_alert(
                    self.get_alert(self.services[service], ResolvedAlert))
                Service.active_services.discard(service)

    def raise_iem(self, service, alert_type):
        """Raise iem alert for kafka service."""
        if service == "kafka.service" and alert_type == "fault":
            self.iem.iem_fault("KAFKA_NOT_ACTIVE")
            if (self.KAFKA not in self.iem.fault_iems):
                self.iem.fault_iems.append(self.KAFKA)
        elif (service == "kafka.service"
              and alert_type == "fault_resolved"
              and self.KAFKA in self.iem.fault_iems):
            self.iem.iem_fault_resolved("KAFKA_ACTIVE")
            self.iem.fault_iems.remove(self.KAFKA)

    @classmethod
    def get_alert(cls, service, alert):
        """Build the alert message dict for a service.

        Args:
            service: Object exposing ``name``, ``state``, ``substate``,
                ``pid``, their ``previous_*`` counterparts and the
                ``active_threshold`` / ``nonactive_threshold`` values.
            alert: Object exposing ``description``, ``impact``,
                ``recommendation`` and ``alert_type``.

        Returns:
            dict: Message nested under
            ``["sensor_request_type"]["service_status_alert"]``.
        """
        if service.state == "active":
            threshold = service.active_threshold
        else:
            threshold = service.nonactive_threshold
        description = alert.description.format(
            service.name, service.state, threshold)

        # Read the clock once so alert_id and event_time always agree.
        epoch_time = str(int(time.time()))

        return {
            "sensor_request_type": {
                "service_status_alert": {
                    "host_id": socket.getfqdn(),
                    "severity": SeverityReader().map_severity(
                        alert.alert_type),
                    "alert_id": get_alert_id(epoch_time),
                    "alert_type": alert.alert_type,
                    "info": {
                        "resource_type": cls.RESOURCE_TYPE,
                        "resource_id": service.name,
                        "event_time": epoch_time,
                        "description": description,
                        "impact": alert.impact.format(service.name),
                        "recommendation": alert.recommendation,
                    },
                    "specific_info": {
                        "service_name": service.name,
                        "previous_state": service.previous_state,
                        "state": service.state,
                        "previous_substate": service.previous_substate,
                        "substate": service.substate,
                        "previous_pid": service.previous_pid,
                        "pid": service.pid,
                    }
                }
            }
        }

    def raise_alert(self, message):
        """Send the alert to ServiceMsgHandler and cache the service state.

        Args:
            message: Alert dict shaped like the return value of
                ``get_alert``.
        """
        alert = message["sensor_request_type"]["service_status_alert"]
        service = alert["info"]["resource_id"]
        alert_type = alert["alert_type"]
        self.raise_iem(service, alert_type)
        self._write_internal_msgQ(ServiceMsgHandler.name(), message)
        self.services[service].dump_to_cache()

    def suspend(self):
        """Suspend the module thread. It should be non-blocking."""
        super(ServiceMonitor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking."""
        super(ServiceMonitor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread."""
        super(ServiceMonitor, self).shutdown()
def _send_msg(self, iem_components, log_timestamp):
    """Creates JSON message from iem components and sends to message bus.

    The message is dropped (with a log entry for the first three checks)
    when the severity or source id is unknown, the timestamp cannot be
    converted, or the component/module/event ids are out of range.

    Args:
        iem_components: Indexable sequence whose first six items are
            severity, source id, component id, module id, event id and
            description, in that order.
        log_timestamp: Timestamp handed to
            ``_get_epoch_time_from_timestamp``.

    Returns:
        None.
    """
    impact = "NA"
    recommendation = "NA"

    # IEM format is IEC:DESCRIPTION
    # IEC format is SEVERITY|SOURCEID|COMPONENTID|MODULEID|EVENTID
    # Field lengths ----1---|---1----|------3----|----3---|---4---
    # Example IEM -> "IEC: BO1001000001:Error in connecting to controller"
    # Actual IEC doesn't contain separator between fields. It is shown
    # here just for readability. Each field has fixed length.
    severity, source_id, component_id, module_id, event_id, description = \
        [iem_components[i] for i in range(6)]

    # Check if severity level is valid
    if severity not in self.SEVERITY_LEVELS:
        logger.warning(f"Invalid Severity level: {severity}")
        return

    # Check for valid source id
    if source_id not in self.SOURCE_IDS:
        logger.warning(f"Invalid Source ID level: {source_id}")
        return

    # Check for valid event time
    event_time = self._get_epoch_time_from_timestamp(log_timestamp)
    if not event_time:
        logger.error(
            "Timestamp is not in required format, discarding the message")
        return

    # Check for other components
    args = {
        "_comp_id": component_id,
        "_module_id": module_id,
        "_event_id": event_id
    }
    if not self._are_components_in_range(**args):
        return

    # component-id for sspl=005
    if component_id == "005":
        event_code = component_id + module_id + event_id
        try:
            # One lookup serves both fields; if either index is missing
            # neither name is rebound and both keep their "NA" default.
            event_strings = Iem().EVENT_STRING[event_code]
            impact, recommendation = event_strings[1], event_strings[2]
        except (KeyError, IndexError):
            # An event code without impact/recommendation text should not
            # discard an otherwise valid IEM.
            logger.warning(
                f"No impact/recommendation found for event code: "
                f"{event_code}")

    # Update severity and source_id
    alert_type = iem_severity_to_alert_mapping.get(severity)
    severity = iem_severity_types.get(severity, severity)
    source_id = iem_source_types.get(source_id, source_id)

    # Decode component_id, module_id and event_id
    component_id, module_id, event_id = self._decode_msg(
        f"{component_id}{module_id}{event_id}")

    info = {
        "source_id": source_id,
        "component_id": component_id,
        "module_id": module_id,
        "event_id": event_id,
        "severity": severity,
        "description": description,
        "impact": impact,
        "recommendation": recommendation,
        "alert_type": alert_type,
        "event_time": event_time,
        # Every component except the last one, joined without separator.
        "IEC": "".join(iem_components[:-1])
    }
    iem_data_msg = IEMDataMsg(info)
    json_msg = iem_data_msg.getJson()
    self._write_internal_msgQ(EgressProcessor.name(), json_msg)
def run(self):
    """Run the sensor on its own thread.

    One pass: check the internal queue for the global shutdown message,
    then try to re-send every message held in ``self.store_queue``. The
    method re-schedules itself to run again in 30 seconds.

    A message is removed from the store queue only by
    ``self.store_queue.delete()``. Whenever the head message cannot be
    handled right now, the loop stops and leaves it for the next pass
    instead of retrying it in a tight loop.
    """
    logger.debug("Consul accumulated messages processing started")
    if not self._is_my_msgQ_empty():
        # Check for shut down message from sspl_ll_d and set a flag to
        # shutdown once our message queue is empty
        self._jsonMsg, _ = self._read_my_msgQ()
        # Missing levels are treated as empty so a message without these
        # keys cannot abort the run before it is re-scheduled.
        message_body = self._jsonMsg.get("message") or {}
        actuator_response = message_body.get("actuator_response_type") or {}
        thread_controller = actuator_response.get("thread_controller") or {}
        if thread_controller.get("thread_response") == \
                "SSPL-LL is shutting down":
            logger.info("EgressAccumulatedMsgsProcessor, run, received "
                        "global shutdown message from sspl_ll_d")
            self.shutdown()

    try:
        # TODO : Fix accumulated message processor when message bus changes
        # are available to error out in case of failure (EOS-17626)
        if not self.store_queue.is_empty():
            logger.debug(
                "Found accumulated messages, trying to send again")
        while not self.store_queue.is_empty():
            message = self.store_queue.get()
            if isinstance(message, bytes):
                message = message.decode()
            dict_msg = json.loads(message)

            if dict_msg.get("iem"):
                try:
                    Iem.raise_iem_event(
                        module=dict_msg["iem"]["module"],
                        event_code=dict_msg["iem"]["event_code"],
                        severity=dict_msg["iem"]["severity"],
                        description=dict_msg["iem"]["description"])
                    logger.info("Accumulated IEM sent. %s" % dict_msg)
                    self.store_queue.delete()
                except Exception as e:
                    logger.error(f"Failed to send IEM. ERROR: {e}")
                    # The IEM is still at the head of the queue; stop and
                    # retry on the next scheduled pass.
                    break
                continue

            if "actuator_response_type" in dict_msg["message"]:
                event_time = dict_msg["message"][
                    "actuator_response_type"]["info"]["event_time"]
                time_diff = int(time.time()) - int(event_time)
                if time_diff > self.MSG_TIMEOUT:
                    # Expired actuator response: remove it, otherwise it
                    # would be fetched again on every iteration.
                    self.store_queue.delete()
                    continue
            if "sensor_response_type" in dict_msg["message"]:
                logger.info(
                    f"Publishing Accumulated Alert: {message}")
            if isinstance(self._producer, MessageProducer):
                self._producer.send([message])
                logger.info(
                    f"Published Accumulated Message {message}")
                self.store_queue.delete()
            else:
                self.create_MsgProducer_obj()
                if not isinstance(self._producer, MessageProducer):
                    # Still no usable producer; keep the message queued
                    # and try again on the next scheduled pass.
                    break
    except MessageBusError as e:
        logger.error("EgressAccumulatedMsgsProcessor, run, %r" % e)
    except Exception as e:
        logger.error(e)
    finally:
        logger.debug("Consul accumulated processing ended")
        self._scheduler.enter(30, self._priority, self.run, ())
class ServiceMonitor(SensorThread, InternalMsgQ):
    """Sensor to monitor state change events of services.

    Each monitored unit is bound to its 'PropertiesChanged' signal. State
    is tracked in three structures:

    * ``service_status``: service -> {"state", "substate", "pid"}
    * ``not_active_services``: service -> [start_time, prev_state,
      prev_substate] for services in an intermediate state
    * ``failed_services``: services for which a fault is outstanding

    Alerts are sent to ``ServiceMsgHandler``.
    """

    SENSOR_NAME = "ServiceMonitor"
    PRIORITY = 2

    # Section and keys in configuration file
    SERVICEMONITOR = SENSOR_NAME.upper()
    RESOURCE_TYPE = "node:sw:os:service"

    MONITORED_SERVICES = 'monitored_services'
    THREAD_SLEEP = 'thread_sleep'
    POLLING_FREQUENCY = 'polling_frequency'
    MAX_WAIT_TIME = 'threshold_inactive_time'

    # Dependency list
    # NOTE(review): "SeviceMsgHandler" looks like a misspelling of the
    # ServiceMsgHandler that raise_alert() writes to. The value is left
    # untouched because its consumer is not visible here - confirm and fix.
    DEPENDENCIES = {"plugins": ["SeviceMsgHandler"]}

    @staticmethod
    def name():
        """@return: name of the module."""
        return ServiceMonitor.SENSOR_NAME

    def __init__(self):
        """Initialize the relevant datastructures."""
        super(ServiceMonitor, self).__init__(self.SENSOR_NAME,
                                             self.PRIORITY)

        # Services not yet bound to the 'PropertiesChanged' signal.
        self.services_to_monitor = copy.deepcopy(
            Conf.get(SSPL_CONF,
                     f"{self.SERVICEMONITOR}>{self.MONITORED_SERVICES}", []))

        self.not_active_services = {}
        self.failed_services = []

        self.service_status = {}

        # Seconds between two passes of the event loop in run().
        self.thread_sleep = \
            int(Conf.get(SSPL_CONF,
                         f"{self.SERVICEMONITOR}>{self.THREAD_SLEEP}", 1))

        # Seconds between two rounds of the periodic checks in run().
        self.polling_frequency = \
            int(Conf.get(SSPL_CONF,
                         f"{self.SERVICEMONITOR}>{self.POLLING_FREQUENCY}",
                         30))

        # Seconds a service may stay not-active before a fault is raised.
        self.max_wait_time = \
            int(Conf.get(SSPL_CONF,
                         f"{self.SERVICEMONITOR}>{self.MAX_WAIT_TIME}", 60))

    def read_data(self):
        """Return the dict of service status."""
        return self.service_status

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues.

        Args:
            conf_reader: Forwarded to the parent ``initialize``.
            msgQlist: Forwarded to the parent ``initialize_msgQ``.
            product: Not read by this method.

        Returns:
            bool: Always ``True``.
        """
        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(ServiceMonitor, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(ServiceMonitor, self).initialize_msgQ(msgQlist)

        self.iem = Iem()
        self.iem.check_exsisting_fault_iems()
        self.KAFKA = self.iem.EVENT_CODE["KAFKA_ACTIVE"][1]

        # Integrate into the main dbus loop to catch events
        DBusGMainLoop(set_as_default=True)

        # Initialize SystemBus and get Manager Interface
        self._bus = SystemBus()
        systemd = self._bus.get_object("org.freedesktop.systemd1",
                                       "/org/freedesktop/systemd1")
        self._manager = Interface(
            systemd, dbus_interface='org.freedesktop.systemd1.Manager')

        self.remove_disabled_services()

        return True

    def remove_disabled_services(self):
        """Remove `disabled` services from the list of services to monitor."""
        # Iterate over a copy because the list is modified in the loop.
        temp = copy.deepcopy(self.services_to_monitor)
        for service in temp:
            try:
                if 'disabled' in str(self._manager.GetUnitFileState(service)):
                    self.services_to_monitor.remove(service)
            except DBusException as err:
                # If a service is enabled then it definitely has
                # 'UnitFileState', but for disabled both presence or absence
                # of UnitFileState is possible. So if 'UnitFileState' is not
                # present for the service, it is definitely disabled.
                logger.debug(f"{service} is not getting monitored due "\
                             f"to an error : {err}")
                self.services_to_monitor.remove(service)

    def run(self):
        """Bind services to their signals and run the monitoring loop.

        Raises:
            ThreadException: On ``GLib.Error``, ``DBusException`` or any
                other exception escaping the loop.
        """
        logger.info(f"Monitoring Services : {self.services_to_monitor}")
        try:
            # Register all the services to signal of 'PropertiesChanged' and
            # raise an alert if some service is not active initially or if
            # Unit is not found for the service
            services_to_monitor_copy = copy.deepcopy(self.services_to_monitor)
            for service in services_to_monitor_copy:
                err = self.connect_to_prop_changed_signal(service)
                if err:
                    self.raise_alert(service, "N/A", "N/A", "N/A", "N/A",
                                     "N/A", "N/A", 0)
                    logger.error(
                        f"{service} is not active initially. \n Error {err}")
                else:
                    self.services_to_monitor.remove(service)

            logger.debug(f"failed_services : {self.failed_services}")
            logger.debug(f"services_to_monitor : {self.services_to_monitor}")

            # Retrieve the main loop which will be called in the run method
            self._loop = GLib.MainLoop()

            # Initialize the gobject threads and get its context
            GLib.threads_init()
            context = self._loop.get_context()

            time_to_check_lists = self.current_time() + self.polling_frequency

            # WHILE LOOP FUNCTION : every second we check for
            # properties change event if any generated (using context
            # iteration) and after a delay of polling frequency we
            # check for inactive processes.
            while self.is_running():
                # At interval of 'thread_sleep' check for events occurred for
                # registered services and process them (on_prop_changed()).
                # Keep iterating while events are pending, so that one burst
                # of signals is handled in a single pass rather than one
                # iteration per sleep interval.
                while context.pending():
                    context.iteration(False)
                time.sleep(self.thread_sleep)

                # At interval of 'polling_frequency' process unregistered
                # services and services with not-active (intermediate) state.
                if time_to_check_lists <= self.current_time():
                    time_to_check_lists = self.current_time() + \
                        self.polling_frequency

                    # Try to bind the enabled services on the node to the
                    # signal whose Unit was earlier not found. On successfully
                    # registering for service state change signal, remove from
                    # local list as monitoring enabled through SystemD
                    # and to avoid re-registration.
                    services_to_monitor_copy = copy.deepcopy(
                        self.services_to_monitor)
                    for service in services_to_monitor_copy:
                        if not self.connect_to_prop_changed_signal(service):
                            self.services_to_monitor.remove(service)

                    # Check for services in intermediate state(not active)
                    self.check_notactive_services()

            logger.info("ServiceMonitor gracefully breaking out " +\
                        "of dbus Loop, not restarting.")
        except GLib.Error as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of GLib.MainLoop() "
                "with error: %s" % err) from err
        except DBusException as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of dbus loop "
                "with error: %s" % err) from err
        except Exception as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of ServiceMonitor:run() "\
                "with error: %s" % err) from err

    def current_time(self):
        """Return the current epoch time in whole seconds."""
        return int(time.time())

    def get_service_status(self, service=None, unit=None):
        """Returns tuple of unit, service name, state, substate and pid.

        Args:
            service: Unit name; used to load the unit when ``unit`` is not
                given. When omitted it is read from the unit's 'Id'.
            unit: D-Bus object of the unit; loaded from ``service`` when
                not given.
        """
        if not unit:
            unit = self._bus.get_object('org.freedesktop.systemd1',\
                                        self._manager.LoadUnit(service))

        Iunit = Interface(unit,
                          dbus_interface='org.freedesktop.DBus.Properties')

        if not service:
            service = str(Iunit.Get('org.freedesktop.systemd1.Unit', 'Id'))

        state = str(Iunit.Get('org.freedesktop.systemd1.Unit', 'ActiveState'))
        substate = str(Iunit.Get('org.freedesktop.systemd1.Unit', 'SubState'))
        pid = str(Iunit.Get('org.freedesktop.systemd1.Service',
                            'ExecMainPID'))

        return (unit, service, state, substate, pid)

    def connect_to_prop_changed_signal(self, service):
        """
           Bind the service to a signal('PropertiesChanged').

           Fetch the service unit from systemd and its state, substate,
           pid etc. Bind the service to the signal which will be triggered
           whenever the service changes its state/substate. Also raise
           an alert if service is in failed/inactive state.

           Returns:
               None on success, the caught DBusException otherwise.
        """
        try:
            unit, _, state, substate, pid = self.get_service_status(
                service=service)

            self.update_status_local_cache(service, state, substate, pid)

            Iunit2 = Interface(
                unit, dbus_interface='org.freedesktop.systemd1.Manager')

            # p=unit binds the unit at definition time, so each handler
            # keeps its own unit.
            Iunit2.connect_to_signal(
                'PropertiesChanged',
                lambda a, b, c, p=unit: self.on_prop_changed(a, b, c, p),
                dbus_interface=PROPERTIES_IFACE)

            logger.debug(f"{service}({pid}) state is {state}:{substate}")

            if state in ["activating", "reloading", "deactivating"]:
                self.not_active_services[service] = \
                    [self.current_time(), "N/A", "N/A"]
            elif state != "active":
                self.failed_services.append(service)
                self.raise_alert(service, "N/A", state, "N/A", substate,
                                 "N/A", pid, 0)
                logger.error(
                    f"{service} is not active initially. state = {state}:{substate}"
                )

            return None
        except DBusException as err:
            return err

    def check_notactive_services(self):
        """
           Monitor non-active Services.

           Raise FAULT Alert if any of the not-active services has exceeded
           the threshold time for inactivity.
        """
        # Iterate over a copy because entries are removed in the loop.
        not_active_services_copy = copy.deepcopy(self.not_active_services)
        for service, [start_time, prev_state, prev_substate]\
                in not_active_services_copy.items():

            if self.current_time() - start_time > self.max_wait_time:
                state = self.service_status[service]["state"]
                substate = self.service_status[service]["substate"]
                pid = self.service_status[service]["pid"]

                self.not_active_services.pop(service)
                self.failed_services.append(service)

                self.raise_alert(service, prev_state, state, prev_substate,
                                 substate, pid, pid, 1)
                logger.warning(f"{service} in {state}:{substate} for "\
                               f"more than {self.max_wait_time} seconds.")

    def update_status_local_cache(self, service, state, substate, pid):
        """Record the latest state, substate and pid of a service."""
        self.service_status[service] = {
            "state": state,
            "substate": substate,
            "pid": pid
        }

    def on_prop_changed(self, interface, changed_properties,
                        invalidated_properties, unit):
        """Handler to process the service state change signal.

        The signal arguments are not inspected; the current status is read
        from ``unit`` and compared with the cached one. Nothing happens when
        the state did not change.
        """
        _, service, state, substate, pid = self.get_service_status(unit=unit)

        prev_state = self.service_status[service]["state"]
        prev_substate = self.service_status[service]["substate"]
        prev_pid = self.service_status[service]["pid"]

        logger.debug(f"Event for {service}, properties changed from "\
                     f"{prev_state}:{prev_substate} to {state}:{substate}")

        if prev_state == state:
            return

        logger.info(f"{service} changed state from " + \
                    f"{prev_state}:{prev_substate} to {state}:{substate}")

        self.update_status_local_cache(service, state, substate, pid)

        self.action_per_transition(service, prev_state, state,
                                   prev_substate, substate, prev_pid, pid)

    def action_per_transition(self, service, prev_state, state,
                              prev_substate, substate, prev_pid, pid):
        """Take action according to the state change of the service."""
        # alert_info_index : index pointing to alert_info table from
        #               ServiceMonitor:raise_alert() representing alert
        #               description, type, impact etc. to be sent.
        #               -1 means "no alert"; 3 means "transition not
        #               handled" and has no entry in that table.
        alert_info_index = -1

        logger.debug(f"ServiceMonitor:action_per_transition for {service} : " + \
                     f"({prev_state}:{prev_substate}) -> ({state}:{substate})")

        if prev_state in ["active", "reloading"]:
            if state == "active":
                # reloading -> active
                # The entry may already have been removed by
                # check_notactive_services(), hence the default.
                self.not_active_services.pop(service, None)
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
            elif state != "failed":
                # active -> deactivating/inactive/reloading/activating
                # or
                # reloading -> deactivating/inactive/activating
                self.not_active_services[service] = \
                    [self.current_time(), prev_state, prev_substate]
            else:
                # active/reloading -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
        elif prev_state == "deactivating":
            if state in ["inactive", "activating"]:
                # deactivating -> inactive/activating
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                        [self.current_time(), prev_state, prev_substate]
            elif state == "failed":
                # deactivating -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
            elif state == "active":
                # deactivating -> active
                self.not_active_services.pop(service, None)
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
            else:
                alert_info_index = 3
        elif prev_state in ["inactive", "failed"]:
            if state == "activating":
                # inactive/failed -> activating
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                        [self.current_time(), prev_state, prev_substate]
            elif state == "active":
                # inactive/failed -> active
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
                self.not_active_services.pop(service, None)
            elif state == "failed":
                # inactive -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
            else:
                alert_info_index = 3
        elif prev_state == "activating":
            self.not_active_services.pop(service, None)
            if state in ["inactive", "deactivating"]:
                # activating -> inactive/deactivating
                # Keep a single entry per service: remove() drops only one
                # occurrence, so a duplicate would later look like an
                # outstanding fault.
                if service not in self.failed_services:
                    self.failed_services.append(service)
                alert_info_index = 0
            elif state == "active":
                # activating -> active
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = 2
                else:
                    # its a restart.
                    pass
            elif state == "failed":
                # activating -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = 0
            else:
                alert_info_index = 3

        if alert_info_index == 3:
            # No alert text exists for unhandled transitions; log only.
            logger.warning(f"{service} service state transition from "\
                           f"{prev_state} to {state} is not handled.")
        elif alert_info_index != -1:
            self.raise_alert(service, prev_state, state,
                             prev_substate, substate, prev_pid, pid,
                             alert_info_index)

    def raise_alert(self, service, prev_state, state, prev_substate,
                    substate, prev_pid, pid, alert_info_index):
        """Send the alert to ServiceMsgHandler.

        Args:
            service: Service name, used as the alert's resource id.
            prev_state, state, prev_substate, substate, prev_pid, pid:
                Values copied into the alert's ``specific_info``.
            alert_info_index: 0 (fault), 1 (fault after exceeding
                ``max_wait_time``) or 2 (fault_resolved). Any other value
                is logged and no alert is sent.
        """
        # Each alert info contains 4 fields
        # 1.Description | 2.Alert Type | 3.Impact | 4.Recommendation
        alert_info = [
            [
                f"{service} in {state} state.",  # index 0
                "fault",
                f"{service} service is unavailable.",
                "Try to restart the service"
            ],
            [
                f"{service} in a {state} state for more than {self.max_wait_time} seconds.",
                "fault",  # index 1
                f"{service} service is unavailable.",
                "Try to restart the service"
            ],
            [
                f"{service} in {state} state.",
                "fault_resolved",  # index 2
                f"{service} service is available now.",
                ""
            ],
        ]

        if not 0 <= alert_info_index < len(alert_info):
            logger.error(f"{service}: no alert is defined for index "
                         f"{alert_info_index}, alert is not raised.")
            return

        description, alert_type, impact, recommendation = \
            alert_info[alert_info_index]

        severity = SeverityReader().map_severity(alert_type)
        epoch_time = str(self.current_time())
        alert_id = get_alert_id(epoch_time)
        host_name = socket.getfqdn()

        self._site_id = Conf.get(GLOBAL_CONF,
                                 f"{CLUSTER}>{SRVNODE}>{SITE_ID}", 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF,
                                 f"{CLUSTER}>{SRVNODE}>{RACK_ID}", 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF,
                                 f"{CLUSTER}>{SRVNODE}>{NODE_ID}", 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF,
                                    f'{CLUSTER}>{CLUSTER_ID}', 'CC01')

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": service,
            "event_time": epoch_time,
            "description": description,
            "impact": impact,
            "recommendation": recommendation,
        }

        alert_msg = {
            "sensor_request_type": {
                "service_status_alert": {
                    "host_id": host_name,
                    "severity": severity,
                    "alert_id": alert_id,
                    "alert_type": alert_type,
                    "info": info,
                    "specific_info": {
                        "service_name": service,
                        "previous_state": prev_state,
                        "state": state,
                        "previous_substate": prev_substate,
                        "substate": substate,
                        "previous_pid": prev_pid,
                        "pid": pid,
                    }
                }
            }
        }
        self.raise_iem(service, alert_type)
        self._write_internal_msgQ(ServiceMsgHandler.name(), alert_msg)

    def raise_iem(self, service, alert_type):
        """Raise iem alert for kafka service."""
        if service == "kafka.service" and alert_type == "fault":
            self.iem.iem_fault("KAFKA_NOT_ACTIVE")
            if (self.KAFKA not in self.iem.fault_iems):
                self.iem.fault_iems.append(self.KAFKA)
        elif (service == "kafka.service"
              and alert_type == "fault_resolved"
              and self.KAFKA in self.iem.fault_iems):
            self.iem.iem_fault_resolved("KAFKA_ACTIVE")
            self.iem.fault_iems.remove(self.KAFKA)

    def suspend(self):
        """Suspend the module thread. It should be non-blocking."""
        super(ServiceMonitor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking."""
        super(ServiceMonitor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread."""
        super(ServiceMonitor, self).shutdown()