def initialize(self, conf_reader, msgQlist, product):
        """Prepare the sensor: config reader, message queues, IEM and D-Bus.

        `product` is accepted but not used in this method.
        Always returns True.
        """
        parent = super(ServiceMonitor, self)
        # Base-class setup: configuration reader first, then internal queues.
        parent.initialize(conf_reader)
        parent.initialize_msgQ(msgQlist)

        iem = Iem()
        self.iem = iem
        iem.check_exsisting_fault_iems()
        self.KAFKA = iem.EVENT_CODE["KAFKA_ACTIVE"][1]

        # Install the dbus GLib main loop as the default before opening the bus
        DBusGMainLoop(set_as_default=True)

        # Open the system bus and keep a systemd Manager interface on it
        bus_name = "org.freedesktop.systemd1"
        object_path = "/org/freedesktop/systemd1"
        self._bus = SystemBus()
        systemd_proxy = self._bus.get_object(bus_name, object_path)
        self._manager = Interface(
            systemd_proxy,
            dbus_interface='org.freedesktop.systemd1.Manager')

        self.remove_disabled_services()

        return True
Exemple #2
0
    def initialize(self, conf_reader, msgQlist, product):
        """Prepare the sensor and restore any persisted service bookkeeping.

        Sets up the config reader, internal message queues, IEM helper and
        the systemd Manager interface, then loads (or creates) the per-node
        service data file. `product` is accepted but not used here.
        Always returns True.
        """
        parent = super(ServiceMonitor, self)
        # Base-class setup: configuration reader first, then internal queues.
        parent.initialize(conf_reader)
        parent.initialize_msgQ(msgQlist)

        iem = Iem()
        self.iem = iem
        iem.check_exsisting_fault_iems()
        self.KAFKA = iem.EVENT_CODE["KAFKA_ACTIVE"][1]

        # Install the dbus GLib main loop as the default before opening the bus
        DBusGMainLoop(set_as_default=True)

        # Open the system bus and keep a systemd Manager interface on it
        self._bus = SystemBus()
        systemd_proxy = self._bus.get_object("org.freedesktop.systemd1",
                                             "/org/freedesktop/systemd1")
        self._manager = Interface(
            systemd_proxy,
            dbus_interface='org.freedesktop.systemd1.Manager')

        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')

        # Data file lives under DATA_PATH/<CACHE_DIR_NAME>/ and is per node
        self.SERVICE_MONITOR_DATA_PATH = os.path.join(
            DATA_PATH, self.CACHE_DIR_NAME,
            f'SERVICE_MONITOR_DATA_{self._node_id}')
        data_path = self.SERVICE_MONITOR_DATA_PATH

        # Read back the previously stored service info, if the file exists
        self.persistent_service_data = {}
        if os.path.isfile(data_path):
            self.persistent_service_data = store.get(data_path)

        persisted = self.persistent_service_data
        if not persisted:
            # Nothing stored yet: seed the file from the in-memory state
            self.persistent_service_data = {
                'not_active_services': self.not_active_services,
                'failed_services': self.failed_services,
                'service_status': self.service_status
            }
            store.put(self.persistent_service_data, data_path)
        else:
            # Stored state replaces the in-memory containers
            self.not_active_services = persisted['not_active_services']
            self.failed_services = persisted['failed_services']
            self.service_status = persisted['service_status']

        self.remove_disabled_services()

        return True
    def initialize(self, conf_reader, msgQlist, product):
        """Prepare the sensor: config reader, queues, IEM, D-Bus and services.

        `product` is accepted but not used in this method.
        Always returns True.
        """
        parent = super(ServiceMonitor, self)
        # Base-class setup: configuration reader first, then internal queues.
        parent.initialize(conf_reader)
        parent.initialize_msgQ(msgQlist)

        iem = Iem()
        self.iem = iem
        iem.check_exsisting_fault_iems()
        kafka_active = iem.EVENT_CODE["KAFKA_ACTIVE"]
        self.KAFKA = kafka_active[1]

        # The bus must be up before any service can be initialized on it
        self.initialize_dbus()
        for service_name in self.services_to_monitor:
            self.initialize_service(service_name)
        self.subscribe_unit_file_changed_signal()

        return True
def test_iem_alerts(self):
    """Raise an 'ipmitool' fault IEM and check the alert that is received.

    The received alert must be non-None and carry non-None values for a
    fixed set of fields under its "iem" section.
    """
    check_sspl_ll_is_running()
    Iem().iem_fault("IPMITOOL_ERROR")
    # Fixed pause between raising the fault and subscribing for it
    time.sleep(10)
    EventMessage.subscribe(component='sspl')
    fault_alert = EventMessage.receive()
    print(f"IEM Received:{fault_alert}")

    assert fault_alert is not None

    # (section, field) pairs under fault_alert["iem"], checked in this order
    required_fields = (
        ("info", "severity"),
        ("info", "type"),
        ("info", "event_time"),
        ("source", "module"),
        ("contents", "event"),
    )
    for section, field in required_fields:
        assert fault_alert["iem"][section][field] is not None
class ServiceMonitor(SensorThread, InternalMsgQ):
    """Sensor to monitor state change events of services.

    The class-level constants hold the sensor identity, the configuration
    section and key names read in __init__, and the resource type placed in
    generated alerts.
    """

    # Name and priority handed to the base-class constructor in __init__
    SENSOR_NAME = "ServiceMonitor"
    PRIORITY = 2

    # Section and keys in configuration file
    SERVICEMONITOR = SENSOR_NAME.upper()

    MONITORED_SERVICES = 'monitored_services'
    THREAD_SLEEP = 'thread_sleep'
    POLLING_FREQUENCY = 'polling_frequency'

    # Dependency list
    # NOTE(review): "SeviceMsgHandler" looks like a misspelling of
    # "ServiceMsgHandler" (the handler raise_alert writes to) - confirm how
    # this list is consumed before changing the string.
    DEPENDENCIES = {"plugins": ["SeviceMsgHandler"]}

    # Reported as "resource_type" in the alerts built by get_alert
    RESOURCE_TYPE = "node:sw:os:service"

    @staticmethod
    def name():
        """Return the name of the module (the class-level SENSOR_NAME)."""
        return ServiceMonitor.SENSOR_NAME

    def __init__(self):
        """Register the sensor with its base class and load its settings.

        Reads the monitored service list, the thread sleep interval and the
        polling frequency from the SERVICEMONITOR section of SSPL_CONF.
        """
        super(ServiceMonitor, self).__init__(self.SENSOR_NAME,
                                             self.PRIORITY)

        section = self.SERVICEMONITOR

        monitored = Conf.get(
            SSPL_CONF, f"{section}>{self.MONITORED_SERVICES}", [])
        self.services_to_monitor = set(monitored)

        # Maps service name -> Service object; filled by initialize_service
        self.services = {}

        sleep_setting = Conf.get(
            SSPL_CONF, f"{section}>{self.THREAD_SLEEP}", "1")
        self.thread_sleep = int(sleep_setting)

        frequency_setting = Conf.get(
            SSPL_CONF, f"{section}>{self.POLLING_FREQUENCY}", "30")
        self.polling_frequency = int(frequency_setting)

    def read_data(self):
        """Return the dict of service status.

        Returns `self.service_status` when that attribute exists. This
        class never assigns it, so when it is missing a snapshot is built
        from the tracked Service objects instead of raising AttributeError.

        Returns:
            dict mapping service name to its status. The fallback snapshot
            has the shape {name: {"state", "substate", "pid"}}.
        """
        try:
            return self.service_status
        except AttributeError:
            # Fallback: derive the status from the objects in self.services
            return {
                service_name: {
                    "state": monitored.state,
                    "substate": monitored.substate,
                    "pid": monitored.pid,
                }
                for service_name, monitored in self.services.items()
            }

    def initialize(self, conf_reader, msgQlist, product):
        """Prepare the sensor: config reader, queues, IEM, D-Bus and services.

        `product` is accepted but not used in this method.
        Always returns True.
        """
        parent = super(ServiceMonitor, self)
        # Base-class setup: configuration reader first, then internal queues.
        parent.initialize(conf_reader)
        parent.initialize_msgQ(msgQlist)

        iem = Iem()
        self.iem = iem
        iem.check_exsisting_fault_iems()
        kafka_active = iem.EVENT_CODE["KAFKA_ACTIVE"]
        self.KAFKA = kafka_active[1]

        # The bus must be up before any service can be initialized on it
        self.initialize_dbus()
        for service_name in self.services_to_monitor:
            self.initialize_service(service_name)
        self.subscribe_unit_file_changed_signal()

        return True

    def run(self):
        """Process dbus events and poll service states until the thread stops.

        Every `thread_sleep` seconds pending dbus events and queued alerts
        are processed. Every `polling_frequency / thread_sleep` iterations
        (at least every iteration) services that failed to initialize are
        retried, services with monitoring disabled are re-evaluated, and
        the non-active and active service sets are checked.

        Raises:
            ThreadException: wraps any GLib.Error, DBusException or other
                exception raised inside the loop.
        """
        try:
            logger.info(f"Monitoring Services : {self.services.keys()}")
            if not self.services_to_monitor:
                logger.info(
                    "No service to monitor, shutting down {}".format(
                        self.name()))
                self.shutdown()
            # WHILE LOOP FUNCTION : every second we check for
            # properties change event if any generated (using context
            # iteration) and after a delay of polling frequency we
            # check for non_active processes.
            iterations = 0
            # Number of loop iterations between two polling passes
            non_active_check = int(
                self.polling_frequency / self.thread_sleep) or 1
            while self.is_running():
                # At interval of 'thread_sleep' check for events occurred for
                # services and process them
                self.process_events()
                self.process_alerts()
                if not iterations % non_active_check:
                    # Initialize errored service again
                    for service in self.services_to_monitor - set(
                            self.services.keys()):
                        self.initialize_service(service)
                    for service in Service.monitoring_disabled.copy():
                        self.services[service].new_unit_state(EnabledState)
                    # Check for services in intermediate state(not active)
                    self.check_nonactive_services()
                    self.check_active_services()
                time.sleep(self.thread_sleep)
                iterations += 1
            logger.info("ServiceMonitor gracefully breaking out " +
                        "of dbus Loop, not restarting.")
        except GLib.Error as err:
            # Adjacent literals are joined verbatim, so each fragment needs
            # its own trailing space.
            raise ThreadException(self.SENSOR_NAME,
                                  "Ungracefully breaking out of "
                                  "GLib.MainLoop() with error: %s"
                                  % err) from err
        except DBusException as err:
            raise ThreadException(self.SENSOR_NAME,
                                  "Ungracefully breaking out of dbus loop "
                                  "with error: %s" % err) from err
        except Exception as err:
            raise ThreadException(self.SENSOR_NAME,
                                  "Ungracefully breaking out of "
                                  "ServiceMonitor:run() with error: %s"
                                  % err) from err

    def initialize_service(self, service_name):
        """Load a systemd unit and start tracking it as a Service.

        The Service is restored from cache when one exists for
        `service_name`, otherwise created from the unit. On success it is
        stored in `self.services`. A DBusException is logged and swallowed;
        the service then stays out of `self.services`.

        Args:
            service_name: unit name passed to the systemd Manager's LoadUnit.
        """
        try:
            unit = self._bus.get_object(SYSTEMD_BUS,
                                        self._manager.LoadUnit(service_name))
            if Service.cache_exists(service_name):
                service = Service.from_cache(service_name, unit)
            else:
                service = Service(unit)
            service.handle_unit_state_change()
            self.services[service_name] = service
        except DBusException as err:
            # Log the caught error instance (not the exception class) and
            # keep a space between the concatenated fragments.
            logger.error("Error: {} Failed to initialize service {}, "
                         "initialization will be retried in "
                         "{} seconds".format(err, service_name,
                                             self.polling_frequency))

    def subscribe_unit_file_changed_signal(self):
        """Connect unit_file_state_change_handler to 'UnitFilesChanged'."""
        handler = self.unit_file_state_change_handler
        self._manager.connect_to_signal('UnitFilesChanged', handler,
                                        dbus_interface=MANAGER_IFACE)

    @staticmethod
    def subscribe_properties_changed_signal(service):
        """Connect the service's handler to its unit's 'PropertiesChanged'.

        The object returned by connect_to_signal is stored on the service
        as `properties_changed_signal`.
        """
        unit_iface = Interface(object=service.unit,
                               dbus_interface=MANAGER_IFACE)
        signal_match = unit_iface.connect_to_signal(
            'PropertiesChanged', service.properties_changed_handler,
            PROPERTIES_IFACE)
        service.properties_changed_signal = signal_match

    def unit_file_state_change_handler(self):
        """Ask every tracked service to re-evaluate its unit state."""
        for monitored in self.services.values():
            monitored.handle_unit_state_change()

    def process_events(self):
        """Run non-blocking context iterations until nothing is pending."""
        main_context = self.context
        while main_context.pending():
            main_context.iteration(False)

    def process_alerts(self):
        """Raise every alert currently queued in Service.alerts."""
        queued = Service.alerts
        while not queued.empty():
            self.raise_alert(queued.get())

    def initialize_dbus(self):
        """Open the system bus and keep the handles this sensor polls.

        Stores the bus, the systemd Manager interface, a GLib main loop and
        that loop's context on the instance.
        """
        # Install the dbus GLib main loop as the default before opening the bus
        DBusGMainLoop(set_as_default=True)

        bus = SystemBus()
        self._bus = bus
        systemd_proxy = bus.get_object(SYSTEMD_BUS,
                                       "/org/freedesktop/systemd1")
        self._manager = Interface(systemd_proxy,
                                  dbus_interface=MANAGER_IFACE)

        # The context of this loop is what process_events iterates over
        main_loop = GLib.MainLoop()
        self._loop = main_loop
        self.context = main_loop.get_context()

    def check_nonactive_services(self):
        """
           Check the services listed in Service.non_active.

           A service that has been non-active for its threshold time is
           moved to FailedState and an InactiveAlert is raised for it.
        """
        for service_name in Service.non_active.copy():
            monitored = self.services[service_name]
            if not monitored.is_nonactive_for_threshold_time():
                continue
            monitored.new_service_state(FailedState)
            self.raise_alert(self.get_alert(monitored, InactiveAlert))

    def check_active_services(self):
        """
            Check the services listed in Service.active_services.

            A service that has stayed active for its threshold time gets a
            ResolvedAlert raised and is dropped from Service.active_services.
        """
        for service_name in Service.active_services.copy():
            monitored = self.services[service_name]
            if not monitored.is_active_for_threshold_time():
                continue
            self.raise_alert(self.get_alert(monitored, ResolvedAlert))
            Service.active_services.discard(service_name)

    def raise_iem(self, service, alert_type):
        """Raise the matching IEM when the alert concerns kafka.service.

        A "fault" raises KAFKA_NOT_ACTIVE and records the kafka code in
        `self.iem.fault_iems`. A "fault_resolved" raises KAFKA_ACTIVE and
        removes the code, but only if the code was recorded. Any other
        service or alert type is ignored.
        """
        if service != "kafka.service":
            return

        if alert_type == "fault":
            self.iem.iem_fault("KAFKA_NOT_ACTIVE")
            if self.KAFKA not in self.iem.fault_iems:
                self.iem.fault_iems.append(self.KAFKA)
        elif alert_type == "fault_resolved" and \
                self.KAFKA in self.iem.fault_iems:
            self.iem.iem_fault_resolved("KAFKA_ACTIVE")
            self.iem.fault_iems.remove(self.KAFKA)

    @classmethod
    def get_alert(cls, service, alert):
        """Build the alert message dict for a service.

        Args:
            service: object exposing name, state, substate, pid, their
                previous_* counterparts and the active/nonactive thresholds.
            alert: object exposing alert_type, description, impact and
                recommendation; description and impact are format strings.

        Returns:
            dict with a single "sensor_request_type" key wrapping the
            "service_status_alert" payload.
        """
        # The description reports the threshold matching the current state
        if service.state == "active":
            threshold = service.active_threshold
        else:
            threshold = service.nonactive_threshold
        description = alert.description.format(
            service.name, service.state, threshold)

        # Read the clock once so alert_id and event_time cannot disagree
        # when the call straddles a second boundary.
        epoch_time = str(int(time.time()))

        return {
            "sensor_request_type": {
                "service_status_alert": {
                    "host_id": socket.getfqdn(),
                    "severity": SeverityReader().map_severity(
                        alert.alert_type),
                    "alert_id": get_alert_id(epoch_time),
                    "alert_type": alert.alert_type,
                    "info": {
                        "resource_type": cls.RESOURCE_TYPE,
                        "resource_id": service.name,
                        "event_time": epoch_time,
                        "description": description,
                        "impact": alert.impact.format(service.name),
                        "recommendation": alert.recommendation,
                    },
                    "specific_info": {
                        "service_name": service.name,
                        "previous_state": service.previous_state,
                        "state": service.state,
                        "previous_substate": service.previous_substate,
                        "substate": service.substate,
                        "previous_pid": service.previous_pid,
                        "pid": service.pid,
                    }
                }
            }
        }

    def raise_alert(self, message):
        """Send an alert message and persist the affected service.

        Raises the kafka IEM if applicable, writes the message to the
        ServiceMsgHandler queue and dumps the service to cache.
        """
        alert = message["sensor_request_type"]["service_status_alert"]
        service = alert["info"]["resource_id"]
        alert_type = alert["alert_type"]

        self.raise_iem(service, alert_type)
        self._write_internal_msgQ(ServiceMsgHandler.name(), message)
        self.services[service].dump_to_cache()

    def suspend(self):
        """Suspend the module thread. It should be non-blocking.

        Delegates to the base class, then sets the `_suspended` flag.
        """
        super(ServiceMonitor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resume the module thread. It should be non-blocking.

        Delegates to the base class, then clears the `_suspended` flag.
        """
        super(ServiceMonitor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread.

        This override only delegates to the base-class shutdown.
        """
        super(ServiceMonitor, self).shutdown()
    def _send_msg(self, iem_components, log_timestamp):
        """Build a JSON IEM message from its components and queue it.

        The components are validated first. An invalid severity, source id
        or timestamp, or a component/module/event id rejected by
        `_are_components_in_range`, drops the message: the method returns
        None without writing to the queue.

        Args:
            iem_components: indexable sequence whose first six items are
                severity, source id, component id, module id, event id and
                description, in that order.
            log_timestamp: timestamp of the log entry, converted with
                `_get_epoch_time_from_timestamp`.
        """
        impact = "NA"
        recommendation = "NA"
        # IEM format is IEC:DESCRIPTION
        # IEC format is SEVERITY|SOURCEID|COMPONENTID|MODULEID|EVENTID
        # Field lengths ----1---|---1----|------3----|----3---|---4---
        # Example IEM -> "IEC: BO1001000001:Error in connecting to controller"
        # Actual IEC doesn't contain separator between fields. It is shown
        # here just for readability. Each field has fixed length.
        severity, source_id, component_id, module_id, event_id, description = \
            [iem_components[i] for i in range(6)]

        # Check if severity level is valid
        if severity not in self.SEVERITY_LEVELS:
            # Logger.warn is a deprecated alias; use warning()
            logger.warning(f"Invalid Severity level: {severity}")
            return

        # Check for valid source id
        if source_id not in self.SOURCE_IDS:
            logger.warning(f"Invalid Source ID level: {source_id}")
            return

        # Check for valid event time
        event_time = self._get_epoch_time_from_timestamp(log_timestamp)
        if not event_time:
            logger.error("Timestamp is not in required format, discarding the message")
            return

        # Check for other components
        args = {
            "_comp_id": component_id,
            "_module_id": module_id,
            "_event_id": event_id
        }
        if not self._are_components_in_range(**args):
            return

        # component-id for sspl=005
        if component_id == "005":
            # Impact and recommendation come from the Iem event table,
            # keyed by the concatenated component, module and event ids.
            event_code = component_id + module_id + event_id
            impact = Iem().EVENT_STRING[event_code][1]
            recommendation = Iem().EVENT_STRING[event_code][2]

        # Update severity and source_id
        alert_type = iem_severity_to_alert_mapping.get(severity)
        severity = iem_severity_types.get(severity, severity)
        source_id = iem_source_types.get(source_id, source_id)

        # Decode component_id, module_id and event_id
        component_id, module_id, event_id = self._decode_msg(
            f"{component_id}{module_id}{event_id}")

        info = {
            "source_id": source_id,
            "component_id": component_id,
            "module_id": module_id,
            "event_id": event_id,
            "severity": severity,
            "description": description,
            "impact": impact,
            "recommendation": recommendation,
            "alert_type": alert_type,
            "event_time": event_time,
            # Everything except the trailing description, joined unseparated
            "IEC": "".join(iem_components[:-1])
        }
        iem_data_msg = IEMDataMsg(info)
        json_msg = iem_data_msg.getJson()
        self._write_internal_msgQ(EgressProcessor.name(), json_msg)
Exemple #7
0
 def run(self):
     """Resend messages accumulated in the store queue, then reschedule.

     First reads one message from this module's queue (if any) and shuts
     down when it is the global "SSPL-LL is shutting down" notification.
     Then walks the store queue: IEM entries are re-raised through
     Iem.raise_iem_event, other entries are published with the message
     producer. Errors are logged, never propagated. The method always
     re-enters itself on the scheduler after 30 seconds.
     """
     logger.debug("Consul accumulated messages processing started")
     if not self._is_my_msgQ_empty():
         # Check for shut down message from sspl_ll_d and set a flag to shutdown
         #  once our message queue is empty
         self._jsonMsg, _ = self._read_my_msgQ()
         response = self._jsonMsg.get("message").get(
             "actuator_response_type")
         controller = None
         if response is not None:
             controller = response.get("thread_controller")
         if controller is not None and \
                 controller.get("thread_response") == \
                 "SSPL-LL is shutting down":
             # Adjacent literals are joined verbatim; keep the space.
             logger.info("EgressAccumulatedMsgsProcessor, run, received "
                         "global shutdown message from sspl_ll_d")
             self.shutdown()
     try:
         # TODO : Fix accumulated message processor when message bus changes are available to
         # error out in case of failure (EOS-17626)
         if not self.store_queue.is_empty():
             logger.debug(
                 "Found accumulated messages, trying to send again")
             while not self.store_queue.is_empty():
                 message = self.store_queue.get()
                 if isinstance(message, bytes):
                     message = message.decode()
                 dict_msg = json.loads(message)
                 if dict_msg.get("iem"):
                     try:
                         Iem.raise_iem_event(
                             module=dict_msg["iem"]["module"],
                             event_code=dict_msg["iem"]["event_code"],
                             severity=dict_msg["iem"]["severity"],
                             description=dict_msg["iem"]["description"])
                         logger.info("Accumulated IEM sent. %s" % dict_msg)
                         self.store_queue.delete()
                     except (EventMessageError, Exception) as e:
                         # NOTE(review): the entry is not deleted on
                         # failure; if get() does not remove it, the loop
                         # retries the same entry immediately - confirm
                         # the store queue semantics.
                         logger.error(f"Failed to send IEM. ERROR: {e}")
                 else:
                     if "actuator_response_type" in dict_msg["message"]:
                         event_time = dict_msg["message"] \
                             ["actuator_response_type"]["info"]["event_time"]
                         time_diff = int(time.time()) - int(event_time)
                         if time_diff > self.MSG_TIMEOUT:
                             # NOTE(review): skips delete(); if get() does
                             # not remove the entry, an expired message is
                             # fetched again on the next pass - confirm.
                             continue
                     if "sensor_response_type" in dict_msg["message"]:
                         logger.info(
                             f"Publishing Accumulated Alert: {message}")
                     if isinstance(self._producer, MessageProducer):
                         self._producer.send([message])
                         logger.info(
                             f"Published Accumulated Message {message}")
                         self.store_queue.delete()
                     else:
                         # No usable producer yet: try to create one and
                         # leave the message in the queue for the next pass
                         self.create_MsgProducer_obj()
     except MessageBusError as e:
         logger.error("EgressAccumulatedMsgsProcessor, run, %r" % e)
     except Exception as e:
         logger.error(e)
     finally:
         logger.debug("Consul accumulated processing ended")
         # Run again in 30 seconds regardless of the outcome
         self._scheduler.enter(30, self._priority, self.run, ())
class ServiceMonitor(SensorThread, InternalMsgQ):
    """Sensor to monitor state change events of services.

    The class-level constants hold the sensor identity and the
    configuration section and key names read in __init__.
    """

    # Name and priority handed to the base-class constructor in __init__
    SENSOR_NAME = "ServiceMonitor"
    PRIORITY = 2

    # Section and keys in configuration file
    SERVICEMONITOR = SENSOR_NAME.upper()
    RESOURCE_TYPE = "node:sw:os:service"
    MONITORED_SERVICES = 'monitored_services'
    THREAD_SLEEP = 'thread_sleep'
    POLLING_FREQUENCY = 'polling_frequency'
    # Key whose value is read into `max_wait_time` in __init__
    MAX_WAIT_TIME = 'threshold_inactive_time'

    # Dependency list
    # NOTE(review): "SeviceMsgHandler" looks like a misspelling of
    # "ServiceMsgHandler" - confirm how this list is consumed before
    # changing the string.
    DEPENDENCIES = {"plugins": ["SeviceMsgHandler"]}

    @staticmethod
    def name():
        """Return the name of the module (the class-level SENSOR_NAME)."""
        return ServiceMonitor.SENSOR_NAME

    def __init__(self):
        """Register the sensor with its base class and load its settings.

        Reads the monitored service list, thread sleep interval, polling
        frequency and inactivity threshold from the SERVICEMONITOR section
        of SSPL_CONF, and creates the empty bookkeeping containers.
        """
        super(ServiceMonitor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        def setting(key, default):
            # Look up "<SERVICEMONITOR>><key>" in the SSPL configuration
            return Conf.get(SSPL_CONF, f"{self.SERVICEMONITOR}>{key}",
                            default)

        self.services_to_monitor = copy.deepcopy(
            setting(self.MONITORED_SERVICES, []))

        # service name -> [start_time, previous state, previous substate]
        self.not_active_services = {}
        self.failed_services = []

        # service name -> {"state", "substate", "pid"}
        self.service_status = {}

        self.thread_sleep = int(setting(self.THREAD_SLEEP, 1))
        self.polling_frequency = int(setting(self.POLLING_FREQUENCY, 30))
        self.max_wait_time = int(setting(self.MAX_WAIT_TIME, 60))

    def read_data(self):
        """Return the dict of service status.

        This is the `service_status` dict itself (not a copy), keyed by
        service name.
        """
        return self.service_status

    def initialize(self, conf_reader, msgQlist, product):
        """Prepare the sensor: config reader, message queues, IEM and D-Bus.

        `product` is accepted but not used in this method.
        Always returns True.
        """
        parent = super(ServiceMonitor, self)
        # Base-class setup: configuration reader first, then internal queues.
        parent.initialize(conf_reader)
        parent.initialize_msgQ(msgQlist)

        iem = Iem()
        self.iem = iem
        iem.check_exsisting_fault_iems()
        self.KAFKA = iem.EVENT_CODE["KAFKA_ACTIVE"][1]

        # Install the dbus GLib main loop as the default before opening the bus
        DBusGMainLoop(set_as_default=True)

        # Open the system bus and keep a systemd Manager interface on it
        bus_name = "org.freedesktop.systemd1"
        object_path = "/org/freedesktop/systemd1"
        self._bus = SystemBus()
        systemd_proxy = self._bus.get_object(bus_name, object_path)
        self._manager = Interface(
            systemd_proxy,
            dbus_interface='org.freedesktop.systemd1.Manager')

        self.remove_disabled_services()

        return True

    def remove_disabled_services(self):
        """Drop `disabled` services from the list of services to monitor.

        A service whose unit file state cannot be queried (DBusException)
        is treated as disabled and dropped as well.
        """
        for service in copy.deepcopy(self.services_to_monitor):
            try:
                unit_file_state = str(
                    self._manager.GetUnitFileState(service))
                is_disabled = 'disabled' in unit_file_state
            except DBusException as err:
                # If a service is enabled then it definitely has
                # 'UnitFileState', but for a disabled one it may or may not
                # be present. So a missing 'UnitFileState' means disabled.
                logger.debug(f"{service} is not getting monitored due "
                             f"to an error : {err}")
                is_disabled = True

            if is_disabled:
                self.services_to_monitor.remove(service)

    def run(self):
        """Register services for state-change signals and poll until stopped.

        Services that cannot be registered stay in `services_to_monitor`
        and are retried every `polling_frequency` seconds; services stuck
        in a not-active state are checked on the same schedule.

        Raises:
            ThreadException: wraps any GLib.Error, DBusException or other
                exception raised while registering or polling.
        """
        logger.info(f"Monitoring Services : {self.services_to_monitor}")
        try:
            # First pass: bind each service to 'PropertiesChanged'. A
            # service that cannot be bound gets an alert and stays in the
            # list; a bound one is removed from the list.
            for service in copy.deepcopy(self.services_to_monitor):
                err = self.connect_to_prop_changed_signal(service)
                if not err:
                    self.services_to_monitor.remove(service)
                    continue
                self.raise_alert(service, "N/A", "N/A", "N/A", "N/A",
                                 "N/A", "N/A", 0)
                logger.error(
                    f"{service} is not active initially. \n Error {err}")

            logger.debug(f"failed_services : {self.failed_services}")
            logger.debug(f"services_to_monitor : {self.services_to_monitor}")

            # Main loop whose context is iterated below
            self._loop = GLib.MainLoop()

            # Initialize the gobject threads and get the loop context
            GLib.threads_init()
            context = self._loop.get_context()

            next_list_check = self.current_time() + self.polling_frequency

            while self.is_running():
                # Every 'thread_sleep' seconds run one non-blocking
                # iteration so registered handlers get their events
                context.iteration(False)
                time.sleep(self.thread_sleep)

                # The remaining work runs once per 'polling_frequency'
                if self.current_time() < next_list_check:
                    continue
                next_list_check = self.current_time() + \
                    self.polling_frequency

                # Retry services that could not be bound earlier; once
                # bound, drop them from the list to avoid re-registration
                for service in copy.deepcopy(self.services_to_monitor):
                    if not self.connect_to_prop_changed_signal(service):
                        self.services_to_monitor.remove(service)

                # Check for services in an intermediate (not active) state
                self.check_notactive_services()

            logger.info("ServiceMonitor gracefully breaking out "
                        "of dbus Loop, not restarting.")
        except GLib.Error as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungrecefully breaking out of GLib.MainLoop() with error: %s"
                % err)
        except DBusException as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of dbus loop with error: %s" % err)
        except Exception as err:
            raise ThreadException(
                self.SENSOR_NAME,
                "Ungracefully breaking out of ServiceMonitor:run() "
                "with error: %s" % err)

    def current_time(self):
        """Return the current time as whole seconds since the epoch.

        The value is `time.time()` truncated to an int.
        """
        return int(time.time())

    def get_service_status(self, service=None, unit=None):
        """Return (unit, service name, state, substate, pid) for a service.

        When `unit` is not given it is loaded by `service` name through the
        systemd Manager. When `service` is not given it is read from the
        unit's 'Id' property. State, substate and pid are returned as str.
        """
        unit_iface = 'org.freedesktop.systemd1.Unit'

        if not unit:
            unit_path = self._manager.LoadUnit(service)
            unit = self._bus.get_object('org.freedesktop.systemd1',
                                        unit_path)

        properties = Interface(
            unit, dbus_interface='org.freedesktop.DBus.Properties')

        def read(interface, prop):
            # Fetch one property and normalise it to a plain str
            return str(properties.Get(interface, prop))

        if not service:
            service = read(unit_iface, 'Id')

        state = read(unit_iface, 'ActiveState')
        substate = read(unit_iface, 'SubState')
        pid = read('org.freedesktop.systemd1.Service', 'ExecMainPID')

        return (unit, service, state, substate, pid)

    def connect_to_prop_changed_signal(self, service):
        """
           Bind the service to the 'PropertiesChanged' signal.

           Fetches the service's unit, state, substate and pid, caches
           them, and connects on_prop_changed to the unit's signal. A
           service in a transitional state is recorded in
           `not_active_services`; any other non-active state is recorded
           as failed and an alert is raised.

           Returns None on success, or the DBusException that occurred.
        """
        try:
            unit, _, state, substate, pid = self.get_service_status(
                service=service)

            self.update_status_local_cache(service, state, substate, pid)

            unit_iface = Interface(
                unit, dbus_interface='org.freedesktop.systemd1.Manager')

            def _on_change(a, b, c, p=unit):
                # Forward the signal arguments together with this unit
                return self.on_prop_changed(a, b, c, p)

            unit_iface.connect_to_signal(
                'PropertiesChanged', _on_change,
                dbus_interface=PROPERTIES_IFACE)

            logger.debug(f"{service}({pid}) state is {state}:{substate}")

            if state in ("activating", "reloading", "deactivating"):
                # Transitional: remember when the wait started
                self.not_active_services[service] = \
                    [self.current_time(), "N/A", "N/A"]
            elif state != "active":
                self.failed_services.append(service)
                self.raise_alert(service, "N/A", state, "N/A", substate,
                                 "N/A", pid, 0)
                logger.error(
                    f"{service} is not active initially. state = {state}:{substate}"
                )
        except DBusException as err:
            return err

        return None

    def check_notactive_services(self):
        """
           Check the services recorded in `not_active_services`.

           A service that has been not-active for more than
           `max_wait_time` seconds is moved to `failed_services` and a
           fault alert is raised for it.
        """
        snapshot = copy.deepcopy(self.not_active_services)
        for service, (start_time, prev_state, prev_substate) \
                in snapshot.items():
            waited = self.current_time() - start_time
            if waited <= self.max_wait_time:
                continue

            status = self.service_status[service]
            state = status["state"]
            substate = status["substate"]
            pid = status["pid"]

            self.not_active_services.pop(service)
            self.failed_services.append(service)
            self.raise_alert(service, prev_state, state, prev_substate,
                             substate, pid, pid, 1)
            logger.warning(f"{service} in {state}:{substate} for "
                           f"more than {self.max_wait_time} seconds.")

    def update_status_local_cache(self, service, state, substate, pid):
        """Store the given state, substate and pid as the cached status of the service."""
        snapshot = dict(state=state, substate=substate, pid=pid)
        self.service_status[service] = snapshot

    def on_prop_changed(self, interface, changed_properties,
                        invalidated_properties, unit):
        """
           Handle a 'PropertiesChanged' signal of a monitored unit.

           Reads the current status of the unit and compares its state
           with the cached one. Only when the state differs is the cache
           refreshed and the transition passed to action_per_transition().
           The signal payload arguments themselves are not inspected.
        """
        status = self.get_service_status(unit=unit)
        _unit, service, state, substate, pid = status

        prev_state = self.service_status[service]["state"]
        prev_substate = self.service_status[service]["substate"]
        prev_pid = self.service_status[service]["pid"]

        logger.debug(
            f"Event for {service}, properties changed from "
            f"{prev_state}:{prev_substate} to {state}:{substate}"
        )

        if prev_state != state:
            logger.info(
                f"{service} changed state from "
                f"{prev_state}:{prev_substate} to {state}:{substate}"
            )
            self.update_status_local_cache(service, state, substate, pid)
            self.action_per_transition(service, prev_state, state,
                                       prev_substate, substate, prev_pid,
                                       pid)

    def action_per_transition(self, service, prev_state, state, prev_substate,
                              substate, prev_pid, pid):
        """
           Take action according to the state change of the service.

           Keeps two structures up to date:
             not_active_services - services that left the active state and
                                   are given max_wait_time to settle,
             failed_services     - services a fault was already raised for.

           A fault alert is raised when a service is newly recorded as
           failed, a fault_resolved alert when a failed service becomes
           active again. A transition without a rule here is only logged.
        """
        # Indexes into the alert_info table built by raise_alert().
        no_alert = -1
        fault = 0
        fault_resolved = 2
        # Marker for a transition without a rule; raise_alert() has no
        # alert_info entry for it, so it must never be passed on.
        unhandled = 3

        alert_info_index = no_alert

        logger.debug(f"ServiceMonitor:action_per_transition for {service} : " + \
            f"({prev_state}:{prev_substate}) -> ({state}:{substate})")

        if prev_state in ["active", "reloading"]:
            if state == "active":
                # reloading -> active
                # The entry may be gone already: check_notactive_services()
                # removes it once the wait time is exceeded.
                self.not_active_services.pop(service, None)
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = fault_resolved
            elif state == "failed":
                # active/reloading -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = fault
            else:
                # active -> deactivating/inactive/reloading/activating
                # or
                # reloading -> deactivating/inactive/activating
                self.not_active_services[service] = \
                    [self.current_time(), prev_state, prev_substate]
        elif prev_state == "deactivating":
            if state in ["inactive", "activating"]:
                # deactivating -> inactive/activating
                # Keep an existing entry so its start time is not reset.
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                        [self.current_time(), prev_state, prev_substate]
            elif state == "failed":
                # deactivating -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = fault
            elif state == "active":
                # deactivating -> active
                self.not_active_services.pop(service, None)
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = fault_resolved
            else:
                alert_info_index = unhandled
        elif prev_state in ["inactive", "failed"]:
            if state == "activating":
                # inactive/failed -> activating
                if service not in self.not_active_services:
                    self.not_active_services[service] = \
                        [self.current_time(), prev_state, prev_substate]
            elif state == "active":
                # inactive/failed -> active
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = fault_resolved
                self.not_active_services.pop(service, None)
            elif state == "failed":
                # inactive -> failed
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = fault
            else:
                alert_info_index = unhandled
        elif prev_state == "activating":
            # Activation is over, whatever the outcome.
            self.not_active_services.pop(service, None)
            if state in ["inactive", "deactivating", "failed"]:
                # activating -> inactive/deactivating/failed
                # Record and report the fault only once; a duplicate entry
                # would outlive the single remove() done on recovery.
                if service not in self.failed_services:
                    self.failed_services.append(service)
                    alert_info_index = fault
            elif state == "active":
                # activating -> active
                # Without a recorded fault this is a plain restart and
                # nothing has to be reported.
                if service in self.failed_services:
                    self.failed_services.remove(service)
                    alert_info_index = fault_resolved
            else:
                alert_info_index = unhandled

        if alert_info_index == unhandled:
            logger.warning(f"{service} service state transition from "
                           f"{prev_state} to {state} is not handled.")
        elif alert_info_index != no_alert:
            self.raise_alert(service, prev_state, state, prev_substate,
                             substate, prev_pid, pid, alert_info_index)

    def raise_alert(self, service, prev_state, state, prev_substate, substate,
                    prev_pid, pid, alert_info_index):
        """
           Send the alert to ServiceMsgHandler.

           Builds the alert message from the alert_info entry selected by
           alert_info_index, raises the matching IEM through raise_iem()
           and writes the message to the internal message queue of
           ServiceMsgHandler.

           alert_info_index values:
               0 - fault: service is in a non-active state
               1 - fault: service was non-active for more than
                   max_wait_time seconds
               2 - fault_resolved: service is active again

           Any other index is logged and ignored; no alert is sent.
        """
        # Each alert info contains 4 fields
        # 1.Description | 2.Alert Type | 3.Impact | 4.Recommendation
        alert_info = [
            [  # index 0
                f"{service} in {state} state.",
                "fault",
                f"{service} service is unavailable.",
                "Try to restart the service"
            ],
            [  # index 1
                f"{service} in a {state} state for more than {self.max_wait_time} seconds.",
                "fault",
                f"{service} service is unavailable.",
                "Try to restart the service"
            ],
            [  # index 2
                f"{service} in {state} state.",
                "fault_resolved",
                f"{service} service is available now.",
                ""
            ],
        ]

        # Refuse indexes without an entry instead of failing with an
        # IndexError in the middle of handling a state change.
        if not 0 <= alert_info_index < len(alert_info):
            logger.error(
                f"ServiceMonitor:raise_alert: no alert is defined for index "
                f"{alert_info_index}, alert for {service} is not sent.")
            return

        description, alert_type, impact, recommendation = \
            alert_info[alert_info_index]

        severity = SeverityReader().map_severity(alert_type)
        epoch_time = str(self.current_time())
        alert_id = get_alert_id(epoch_time)
        host_name = socket.getfqdn()

        # The identifiers are read from the configuration on every alert;
        # the last argument of each call is the fallback value.
        self._site_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{SITE_ID}",
                                 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{RACK_ID}",
                                 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{NODE_ID}",
                                 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, f'{CLUSTER}>{CLUSTER_ID}',
                                    'CC01')

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": service,
            "event_time": epoch_time,
            "description": description,
            "impact": impact,
            "recommendation": recommendation,
        }

        specific_info = {
            "service_name": service,
            "previous_state": prev_state,
            "state": state,
            "previous_substate": prev_substate,
            "substate": substate,
            "previous_pid": prev_pid,
            "pid": pid,
        }

        alert_msg = {
            "sensor_request_type": {
                "service_status_alert": {
                    "host_id": host_name,
                    "severity": severity,
                    "alert_id": alert_id,
                    "alert_type": alert_type,
                    "info": info,
                    "specific_info": specific_info,
                }
            }
        }

        self.raise_iem(service, alert_type)
        self._write_internal_msgQ(ServiceMsgHandler.name(), alert_msg)

    def raise_iem(self, service, alert_type):
        """
           Raise or resolve the IEM of the kafka service.

           Alerts of any other service are ignored. A fault always raises
           the KAFKA_NOT_ACTIVE IEM and records it in the fault list of the
           IEM helper; a fault_resolved raises KAFKA_ACTIVE only while such
           a fault is recorded, and clears the record.
        """
        if service != "kafka.service":
            return

        if alert_type == "fault":
            self.iem.iem_fault("KAFKA_NOT_ACTIVE")
            already_recorded = self.KAFKA in self.iem.fault_iems
            if not already_recorded:
                self.iem.fault_iems.append(self.KAFKA)
            return

        if alert_type != "fault_resolved":
            return

        if self.KAFKA in self.iem.fault_iems:
            self.iem.iem_fault_resolved("KAFKA_ACTIVE")
            self.iem.fault_iems.remove(self.KAFKA)

    def suspend(self):
        """Suspend the module thread. It should be non-blocking."""
        # Run the parent class's suspend handling first, then flag this
        # monitor as suspended.
        super(ServiceMonitor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resume the module thread. It should be non-blocking."""
        # Run the parent class's resume handling first, then clear the
        # suspended flag of this monitor.
        super(ServiceMonitor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread."""
        # The work is delegated to the parent class's shutdown().
        super(ServiceMonitor, self).shutdown()