Example 1
class MemFaultSensor(SensorThread, InternalMsgQ):
    """Memory fault Sensor which runs on its own thread once every power cycle and
       is responsible for identifying total RAM memory on the node and any errors in it using
       available tool/utility"""

    SENSOR_NAME = "MemFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:memory"

    # section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    SITE_ID_KEY = "site_id"
    CLUSTER_ID_KEY = "cluster_id"
    NODE_ID_KEY = "node_id"
    RACK_ID_KEY = "rack_id"
    POLLING_INTERVAL_KEY = "polling_interval"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "0"
    DEFAULT_POLLING_INTERVAL = '0'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return MemFaultSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(MemFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance
        self.total_mem = None
        self.mem_path_file = None
        self.prev_mem = None
        self.fault_alert_state = "Neutral State"
        # Flag to indicate suspension of module
        self._suspended = False

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(MemFaultSensor, self).initialize(conf_reader)

        super(MemFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.SITE_ID_KEY), '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.CLUSTER_ID_KEY), '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.RACK_ID_KEY), '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.NODE_ID_KEY), '001')

        # get the mem fault implementor from configuration
        mem_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, "procfs")

        self.polling_interval = int(
            self._conf_reader._get_value_with_default(
                self.SENSOR_NAME.upper(), self.POLLING_INTERVAL_KEY,
                self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(mem_fault_utility)
#            self._utility_instance.initialize()
        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. "
                "Hence shutting down the sensor {}".format(
                    mem_fault_utility, MemFaultSensor.SENSOR_NAME))
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.MEM_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'MEM_FAULT_SENSOR_DATA_{self._node_id}')

        return True

    def get_stored_mem_info(self):
        """ Get the memory info from consul"""

        if store.exists(self.MEM_FAULT_SENSOR_DATA):
            consul_data = (store.get(self.MEM_FAULT_SENSOR_DATA)).split(":")
            self.prev_mem = consul_data[0].strip()
            self.fault_alert_state = consul_data[1].strip()

    def put_mem_info(self, total_memory_size):
        """ Store the current memory in Consul"""

        store.put(f"{total_memory_size}:{self.fault_alert_state}",
                  self.MEM_FAULT_SENSOR_DATA)

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = "fault"

        mem_path = self._utility_instance.get_proc_memory('meminfo')
        if mem_path.is_file():
            self.mem_path_file = mem_path.read_text()
            mem_info_fields = self.mem_path_file.split()

            if mem_info_fields[0] == 'MemTotal:':
                self.total_mem = mem_info_fields[1]

                # Get data from store if available and compare to the current value
                self.get_stored_mem_info()

                if self.prev_mem is not None:
                    # Both fault and fault_resolved conditions are handled.
                    if int(self.prev_mem) > int(self.total_mem):
                        # update the store with new value, raise an alert of type "fault"
                        if self.fault_alert_state == "Neutral State":
                            self.fault_alert_state = "Fault Generated"
                            self._generate_alert(alert_type)
                            self.put_mem_info(self.prev_mem)

                    elif (int(self.prev_mem) <= int(self.total_mem)) and (
                            self.fault_alert_state == "Fault Generated"):
                        self.fault_alert_state = "Neutral State"
                        alert_type = "fault_resolved"
                        self._generate_alert(alert_type)
                        self.put_mem_info(self.total_mem)
                else:
                    self.put_mem_info(self.total_mem)
            else:
                logger.error(
                    "MemFaultSensor: invalid file, shutting down the sensor")
                self.shutdown()
                return True
        else:
            logger.error(
                "MemFaultSensor: file does not exist, shutting down the sensor"
            )
            self.shutdown()
            return True

        # Do not proceed if module is suspended
        # The memory sensor triggers only during SSPL reboot; at reboot time a
        # sensor cannot be in a suspended state.
        # The commented code below is retained in case the sensor needs to be
        # made periodic in the future.
        #if self._suspended is True:
        #    self._scheduler.enter(self.polling_interval, self._priority, self.run, ())
        #    return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # self scheduling is commented so that the process runs only once per SSPL reboot
        # Enable with correct polling_interval if in future memory sensor needs to run periodically
        #self._scheduler.enter(self.polling_interval, self._priority, self.run, ())

    def _create_json_message(self, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []
        if alert_type == "fault":
            specific_info["event"] = \
                    "Total available main memory value decreased from {} kB to {} kB"\
                    .format(self.prev_mem, self.total_mem)
        elif alert_type == "fault_resolved":
            specific_info["event"] = \
                    "Total main memory value available {} kB"\
                    .format(self.total_mem)

        # populate all the data from /proc/meminfo
        split_strs = [
            s.split(maxsplit=1) for s in self.mem_path_file.splitlines()
        ]
        dictionary_str = dict(split_strs)
        specific_info["meminfo"] = dictionary_str
        specific_info_list.append(specific_info)

        alert_specific_info = specific_info_list

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID,
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(MemFaultSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(MemFaultSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(MemFaultSensor, self).shutdown()
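# --- Illustration (not part of the sensor code) ---
# A minimal sketch of MemFaultSensor's "total:state" state machine, using a
# plain dict in place of the Consul-backed `store`. `fake_store`,
# `evaluate_memory` and the sample sizes are hypothetical names/values.
fake_store = {}

def evaluate_memory(total_mem_kb, key="MEM_FAULT_SENSOR_DATA_001"):
    """Return the alert to raise ("fault", "fault_resolved") or None."""
    prev_mem, state = None, "Neutral State"
    if key in fake_store:
        prev_mem, state = fake_store[key].split(":")
    if prev_mem is None:
        # First run: record the current size as the baseline, no alert.
        fake_store[key] = f"{total_mem_kb}:{state}"
        return None
    if int(prev_mem) > int(total_mem_kb) and state == "Neutral State":
        # Memory shrank: raise a fault and keep the original size stored.
        fake_store[key] = f"{prev_mem}:Fault Generated"
        return "fault"
    if int(prev_mem) <= int(total_mem_kb) and state == "Fault Generated":
        # Memory restored: resolve the fault and store the new size.
        fake_store[key] = f"{total_mem_kb}:Neutral State"
        return "fault_resolved"
    return None

print(evaluate_memory(16384))  # None (baseline recorded)
print(evaluate_memory(8192))   # fault
print(evaluate_memory(16384))  # fault_resolved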
Example 2
class CPUFaultSensor(SensorThread, InternalMsgQ):
    """CPU Fault Sensor which runs on its own thread on each boot up and
       is responsible for sensing changes in online CPUs using
       available tool/utility"""

    SENSOR_NAME = "CPUFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:cpu:core"

    # Section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    SITE_ID_KEY = "site_id"
    CLUSTER_ID_KEY = "cluster_id"
    NODE_ID_KEY = "node_id"
    RACK_ID_KEY = "rack_id"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "CPU-"

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return CPUFaultSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(CPUFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        # CPU info
        self.stored_cpu_info = None
        self.prev_cpu_info = None
        self.current_cpu_info = None

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(CPUFaultSensor, self).initialize(conf_reader)

        super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.SITE_ID_KEY), '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.CLUSTER_ID_KEY), '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.RACK_ID_KEY), '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.NODE_ID_KEY), '001')

        # get the cpu fault implementor from configuration
        cpu_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, 'sysfs')

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(cpu_fault_utility)
        except Exception as e:
            logger.error(
                f"Error while initializing, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.CPU_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')

        return True

    def read_stored_cpu_info(self):
        """Read the most recent stored cpu info"""
        try:
            if self.stored_cpu_info is None:
                self.stored_cpu_info = store.get(self.CPU_FAULT_SENSOR_DATA)
            if self.stored_cpu_info is not None and \
                    self._node_id in self.stored_cpu_info:
                self.prev_cpu_info = \
                    self.stored_cpu_info[self._node_id]['CPU_LIST']
        except Exception as e:
            logger.error(
                f"Error while reading stored cpu info, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

    def read_current_cpu_info(self):
        """Read current cpu info"""
        try:
            self.current_cpu_info = self._utility_instance.get_cpu_info()
        except Exception as e:
            logger.error(
                f"Error while reading current cpu info, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

    def run(self):
        """Run the sensor on its own thread"""

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()
        # Read recent stored cpu info
        self.read_stored_cpu_info()
        # Store alerts to be sent here
        self.alerts_for = {}
        # Specific info field for alerts
        self.specific_info = []
        # Read current cpu info
        self.read_current_cpu_info()

        to_update = False
        # Compare with previous cpu info
        # If a cpu is present in prev_cpu_info and not present in current_cpu_info : fault alert is generated
        # If a cpu is present in current_cpu_info and not present in prev_cpu_info : two possibilities
        #   1) if cpu has an outstanding fault alert : it is a repaired cpu, hence generate fault_resolved
        #   2) if cpu has no outstanding alert : it is a newly added cpu, do not do anything
        try:
            if self.prev_cpu_info:
                if self.current_cpu_info != self.prev_cpu_info:
                    # Create a set of all relevant cpus
                    cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
                    # Iterate through the set
                    for cpu in cpu_list:
                        if cpu not in self.current_cpu_info and cpu not in self.stored_cpu_info[
                                self._node_id]['FAULT_LIST']:
                            # This is a failed cpu
                            self.stored_cpu_info[
                                self._node_id]['FAULT_LIST'].append(cpu)
                            self.alerts_for[cpu] = "fault"
                        elif cpu not in self.prev_cpu_info and cpu in self.stored_cpu_info[
                                self._node_id]['FAULT_LIST']:
                            # This is a repaired cpu
                            self.alerts_for[cpu] = "fault_resolved"
                    # Update stored cpu info for next run
                    self.stored_cpu_info[
                        self._node_id]['CPU_LIST'] = self.current_cpu_info
                    to_update = True
            else:
                # Previous cpu info not available, need to store current info
                if not self.stored_cpu_info:
                    # No info is available
                    self.stored_cpu_info = {}
                # Add info for the current node
                self.stored_cpu_info[self._node_id] = {}
                self.stored_cpu_info[
                    self._node_id]['CPU_LIST'] = self.current_cpu_info
                self.stored_cpu_info[self._node_id]['FAULT_LIST'] = []
                # Update stored cpu info
                to_update = True

        except Exception as e:
            logger.error(
                f"Error while processing cpu info, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

        # Send alerts
        for cpu, alert_type in self.alerts_for.items():
            if self._generate_alert(cpu, alert_type) and \
                    alert_type == "fault_resolved":
                # Delete from the FAULT_LIST
                self.stored_cpu_info[self._node_id]['FAULT_LIST'].remove(cpu)

        # Update stored cpu info
        if to_update:
            store.put(self.stored_cpu_info, self.CPU_FAULT_SENSOR_DATA)

    def fill_specific_info(self):
        """Fills the specific info to be sent via alert"""
        if not self.specific_info:
            # Create a set of all relevant cpus
            cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
            # Iterate through the set
            for cpu in cpu_list:
                item = {}
                item['resource_id'] = self.RESOURCE_ID + str(cpu)
                # Keep default state online
                item['state'] = "online"
                if cpu in self.alerts_for.keys():
                    if self.alerts_for[cpu] == "fault":
                        item['state'] = "offline"
                self.specific_info.append(item)

    def _create_json_message(self, cpu, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        # Populate specific info
        self.fill_specific_info()
        alert_specific_info = self.specific_info

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID + str(cpu),
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, cpu, alert_type):
        """Queues the message to NodeData Message Handler"""
        try:
            json_msg = self._create_json_message(cpu, alert_type)
            if json_msg:
                # RAAL stands for "RAise ALert"
                logger.info(f"RAAL: {json_msg}")
                self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)
            return True
        except Exception as e:
            logger.error(f"Exception while sending alert : {e}")
            return False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(CPUFaultSensor, self).shutdown()
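# --- Illustration (not part of the sensor code) ---
# A small pure-function sketch of the CPU comparison rules documented in
# CPUFaultSensor.run(). `diff_cpu_alerts` and the sample lists are
# hypothetical; the real sensor also persists FAULT_LIST via the store.
def diff_cpu_alerts(prev_cpus, current_cpus, fault_list):
    """Map cpu id -> "fault" / "fault_resolved" per the rules above."""
    alerts = {}
    for cpu in set(prev_cpus) | set(current_cpus):
        if cpu not in current_cpus and cpu not in fault_list:
            alerts[cpu] = "fault"            # cpu disappeared: failed
        elif cpu not in prev_cpus and cpu in fault_list:
            alerts[cpu] = "fault_resolved"   # faulted cpu came back
        # A new cpu with no outstanding fault alert is ignored.
    return alerts

print(diff_cpu_alerts([0, 1, 2, 3], [0, 1, 3], []))    # {2: 'fault'}
print(diff_cpu_alerts([0, 1, 3], [0, 1, 2, 3], [2]))   # {2: 'fault_resolved'}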
Example 3
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
       is responsible for sensing changes in SAS ports/cables using
       available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    SITE_ID = "site_id"
    CLUSTER_ID = "cluster_id"
    NODE_ID = "node_id"
    RACK_ID = "rack_id"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "SASHBA-0"
    DEFAULT_POLLING_INTERVAL = '30'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    # Number of SAS Ports
    NUM_SAS_PORTS = 4
    # Number of Phys in a Port
    NUM_PHYS_PER_PORT = 4
    # Current Data Version
    CURRENT_DATA_VERSION = 1

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0
        self.sas_ports_status = {}
        self.port_phy_list_dict = {}
        self.sas_phy_stored_alert = None

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, 'CC01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self.polling_interval = int(
            Conf.get(SSPL_CONF,
                     f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
                     self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Call to the sas phy directory, which will return a dictionary
            # mapping phy_name to its negotiated link rate
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over the populated dictionary and restructure it
            # Ex: if phy-0:0 is at 12.0/6.0/3.0 Gbit, it is considered UP.
            # {"phy-0:0": ("link_rate", <Up/Down>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'gbit' in value.strip().lower():
                    phy_status = 'up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[
                    phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert()

        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. "
                "Hence shutting down the sensor".format(sas_port_utility))
            self.shutdown()
        except Exception as e:
            # Check the errno attribute of the underlying OSError, if any,
            # to report the specific cause.
            if getattr(e, 'errno', None) == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: the directory path doesn't exist. "
                    "Hence shutting down the sensor")
            elif getattr(e, 'errno', None) == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: not enough permission to read from the "
                    "directory. Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: {0}. Hence shutting down the sensor".format(e))
            self.shutdown()

        return True

    def update_sas_ports_status(self):
        """
        Reads current phy status and updates port connectivity status
        Assumption : phys will be present in multiples of 4
        """
        phy_list = [*self.phy_dir_to_linkrate_mapping]
        phy_list = sort_phy_list(phy_list)

        # Now we have a sorted list of phys
        # Phys 0-3 for the 0th sas port, and so on in groups of 4 phys
        # List containing status of all phys
        hba = []
        for phy in phy_list:
            if self.phy_dir_to_linkrate_mapping[phy][1] == 'up':
                hba.append(1)
            else:
                hba.append(0)

        for i in range(0, self.NUM_SAS_PORTS):
            # Save phy names forming this port for future use
            self.port_phy_list_dict[i] = \
                phy_list[self.NUM_PHYS_PER_PORT * i:
                         self.NUM_PHYS_PER_PORT * (i + 1)]
            # Check port status
            s = set(hba[self.NUM_PHYS_PER_PORT * i:self.NUM_PHYS_PER_PORT * i +
                        self.NUM_PHYS_PER_PORT])
            if len(s) == 1 and 0 in s:
                port_status = 'down'
            elif len(s) == 1 and 1 in s:
                port_status = 'up'
            else:
                port_status = 'degraded'
            # Store the data
            self.sas_ports_status[i] = port_status

    def check_and_send_conn_alert(self):
        """
        Sends conn fault alert if all phys go down
        Sends conn fault_resolved alert if at least 1 sas port (4 phys) comes up
        """
        # Case 1 : all fault for fault alert
        cur_all_fault = True

        # Case 2 : all fault_resolved for fault_resolved alert
        cur_all_fault_resolved = True

        # Previous conn alert that was sent
        prev_conn_alert = self.sas_phy_stored_alert['conn']

        # Scan the current per-port statuses
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # Metadata keys, not SAS ports; skip
                continue

            # Case 1 : All faults in current status
            if value != 'fault':
                cur_all_fault = False

            # Case 2 : All fault_resolved in current status
            elif value != 'fault_resolved':
                cur_all_fault_resolved = False

        if prev_conn_alert == 'fault_resolved' and cur_all_fault:
            # Send conn fault alert
            alert_type = 'fault'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type

        elif prev_conn_alert == 'fault' and cur_all_fault_resolved:
            # Send conn fault_resolved alert
            alert_type = 'fault_resolved'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type

    def handle_current_version_data(self):
        """Contains logic to check and send alert if data has version == 1."""
        # Compare current status of each port with previous alert_type
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # Skip
                continue
            if value == 'fault_resolved' and \
                        self.sas_ports_status[port] == 'down':
                alert_type = 'fault'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
            elif value == 'fault' and \
                        self.sas_ports_status[port] == 'up':
                alert_type = 'fault_resolved'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
        # See if conn failure/conn resolved alert needs to be sent
        self.check_and_send_conn_alert()
        # Save data to store
        store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    def check_and_send_alert(self):
        """Checks whether conditions are met and sends alert if required
        Alerts will be sent if -
        1. All 4 phys of a sas port go up -> down : fault alert
        2. All 4 phys of a sas port come down -> up : fault_resolved alert
        Sensor data stored in persistent storage is a dict of { sas_port_number : alert_type }
        """
        # Update sas ports status
        self.update_sas_ports_status()

        # Check the version of stored alert
        version = None
        try:
            # Try to get the version
            # Exception will be raised if stored alert is None or no Version is available
            version = self.sas_phy_stored_alert['version']
        except Exception:
            logger.warning(
                "Found no data or old data format for SASPortSensor, "
                f"updating data format to version {self.CURRENT_DATA_VERSION}")
            # Versioning is not implemented or there is no data, write new data
            # Initialize dummy fault_resolved for all sas ports and conn
            self.sas_phy_stored_alert = {}
            self.sas_phy_stored_alert['version'] = self.CURRENT_DATA_VERSION
            self.sas_phy_stored_alert['conn'] = 'fault_resolved'
            for i in range(0, self.NUM_SAS_PORTS):
                self.sas_phy_stored_alert[i] = 'fault_resolved'
            # Save data to store
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

        if version == self.CURRENT_DATA_VERSION:
            self.handle_current_version_data()

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare local dict wrt global dictionary for change in the
                    # negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If current link rate has no value like 12/6/3 Gbit
                        # and previously it was up, then it's a fault condition
                        if 'gbit' not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment count for new phy down which were up previously
                            new_phy_down += 1

                            # Make respective phy_status as fault
                            status = 'fault'

                        # Check if 12/6/3 Gbit is there in the current link rate and
                        # the previous alert_type is fault. If so, means phy is Up again
                        elif 'gbit' in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':

                            # Mark respective phy_status as Up
                            status = 'up'

                            # Increment count for new phy up
                            new_phy_up += 1

                        # Finally update the global dict with the current link
                        # rate and respective phy status
                        self.phy_dir_to_linkrate_mapping[key] = (link_rate,
                                                                 status)

                # Get current phy status i.e number of Up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                self.sas_phy_stored_alert = store.get(
                    self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert()
                # Update current active phy count for next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            logger.exception(ae)

        # Fire every 30 seconds to see if there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority, self.run,
                              ())

    def _create_json_message(self, alert_type, port):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []
        description = "N/A"

        # specific_info will contain all 16 phys for conn level alert
        # Only 4 phys for port level alert
        for key, val in self.phy_dir_to_linkrate_mapping.items():
            if port != -1:
                # This is a port level alert, skip phys that are not relevant
                if key not in self.port_phy_list_dict[port]:
                    # Skip adding this phy
                    continue
            # Key will be like phy-0:0, so split it on ':'
            # The resulting structure will be SASHBA-0:phy-0
            phy_number = key.split(":")[1]
            specific_info[
                "resource_id"] = self.RESOURCE_ID + ':' + "phy-" + phy_number
            specific_info[
                "negotiated_link_rate"] = self.phy_dir_to_linkrate_mapping[
                    key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        if port == -1:
            # This is a SAS HBA level connection alert
            if alert_type == 'fault':
                description = "SAS connection error detected in SAS HBA %s." % self.RESOURCE_ID
            elif alert_type == 'fault_resolved':
                description = "SAS connection re-established in SAS HBA %s." % self.RESOURCE_ID

            info = {
                "site_id": self._site_id,
                "cluster_id": self._cluster_id,
                "rack_id": self._rack_id,
                "node_id": self._node_id,
                "resource_type": self.RESOURCE_TYPE,  # node:interface:sas
                "resource_id": self.RESOURCE_ID,  # SASHBA-0
                "event_time": epoch_time,
                "description": description
            }
        else:
            # This is a port level alert
            if alert_type == 'fault':
                description = (
                    "No connectivity detected on the SAS port %s, possible "
                    "causes could be missing SAS cable, bad cable connection, "
                    "faulty cable or SAS port failure." % port)
            elif alert_type == 'fault_resolved':
                description = "Connection established on SAS port."

            info = {
                "site_id": self._site_id,
                "cluster_id": self._cluster_id,
                "rack_id": self._rack_id,
                "node_id": self._node_id,
                "resource_type":
                self.RESOURCE_TYPE + ':port',  # node:interface:sas:port
                "resource_id":
                self.RESOURCE_ID + f'-port-{port}',  # SASHBA-0-port-0
                "event_time": epoch_time,
                "description": description
            }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type, port):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type, port)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()
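# --- Illustration (not part of the sensor code) ---
# A stripped-down sketch of the phy -> port reduction performed by
# update_sas_ports_status(): phys are assumed sorted and grouped four per
# port. `port_statuses` and the sample flags are hypothetical.
def port_statuses(phy_up_flags, phys_per_port=4):
    """phy_up_flags: one 1 (up) / 0 (down) entry per sorted phy."""
    statuses = {}
    for i in range(len(phy_up_flags) // phys_per_port):
        group = set(phy_up_flags[phys_per_port * i:phys_per_port * (i + 1)])
        if group == {0}:
            statuses[i] = 'down'        # every phy in the port is down
        elif group == {1}:
            statuses[i] = 'up'          # every phy in the port is up
        else:
            statuses[i] = 'degraded'    # mixed phy states
    return statuses

# 16 phys -> 4 ports: up, down, degraded, up
print(port_statuses([1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]))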
Example 4
class NodeData(Debug):
    """Obtains data about the node and makes it available"""

    SENSOR_NAME = "NodeData"

    # conf attribute initialization
    PROBE = 'probe'

    @staticmethod
    def name():
        """@return: name of the module."""
        return NodeData.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return ("Server CPU, network, disk space, process and local mount "
                "data can not be monitored.")

    def __init__(self):
        super(NodeData, self).__init__()

        self.os_utils = OSUtils()
        self._epoch_time = str(int(time.time()))
        # Total number of CPUs
        self.cpus = psutil.cpu_count()
        self.host_id = self.os_utils.get_fqdn()

        # Calculate the load averages on separate blocking threads
        self.load_1min_average = []
        self.load_5min_average = []
        self.load_15min_average = []
        self.prev_bmcip = None
        threading.Thread(target=self._load_1min_avg).start()
        threading.Thread(target=self._load_5min_avg).start()
        threading.Thread(target=self._load_15min_avg).start()

        self.conf_reader = ConfigReader()

        nw_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self._utility_instance = None

        try:
            # Creating the instance of ToolFactory class
            self.tool_factory = ToolFactory()
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(nw_fault_utility)
            if self._utility_instance:
                # Initialize the path as /sys/class/net/
                self.nw_interface_path = self._utility_instance.get_sys_dir_path(
                    'net')
        except KeyError as key_error:
            logger.error(
                f'NodeData, Unable to get the instance of {nw_fault_utility} '
                f'Utility: {key_error}')
        except Exception as err:
            logger.error(
                f'NodeData, Problem occurred while getting the instance of '
                f'{nw_fault_utility}: {err}')

    def read_data(self, subset, debug, units="MB"):
        """Updates data based on a subset"""
        self._set_debug(debug)
        self._log_debug("read_data, subset: %s, units: %s" % (subset, units))

        try:
            # Determine the units factor value
            self.units_factor = 1
            if units == "GB":
                self.units_factor = 1000000000
            elif units == "MB":
                self.units_factor = 1000000
            elif units == "KB":
                self.units_factor = 1000

            self.host_id = self.os_utils.get_fqdn()
            # get_fqdn() first checks socket.gethostname() for the host name; if
            # that is not available, it falls back to
            # socket.gethostbyaddr(socket.gethostname())[0] to return a
            # meaningful host name.

            self.local_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')

            # Branch off and gather data based upon value sent into subset
            if subset == "host_update":
                self._get_host_update_data()

            elif subset == "local_mount_data":
                self._get_local_mount_data()

            elif subset == "cpu_data":
                self._get_cpu_data()

            elif subset == "if_data":
                self._get_if_data()

            elif subset == "disk_space_alert":
                self._get_disk_space_alert_data()

        except Exception as e:
            raise Exception(f"Failed to read data, {e}")

        return True

    def _get_host_update_data(self):
        """Retrieves node information for the host_update json message"""
        logged_in_users = []
        uname_keys = ("sysname", "nodename", "version", "release", "machine")
        self.up_time = int(psutil.boot_time())
        self.boot_time = self._epoch_time
        self.uname = dict(zip(uname_keys, os.uname()))
        self.total_memory = dict(psutil.virtual_memory()._asdict())
        self.process_count = len(psutil.pids())
        for users in psutil.users():
            logged_in_users.append(dict(users._asdict()))
        self.logged_in_users = logged_in_users
        # Calculate the current number of running processes at this moment
        total_running_proc = 0
        for proc in psutil.process_iter():
            try:
                pinfo = proc.as_dict(attrs=['status'])
                if pinfo['status'] not in (psutil.STATUS_ZOMBIE,
                                           psutil.STATUS_DEAD,
                                           psutil.STATUS_STOPPED,
                                           psutil.STATUS_IDLE,
                                           psutil.STATUS_SLEEPING):
                    total_running_proc += 1
            except psutil.NoSuchProcess:
                # proc.name() could raise again for a dead process; pid is a
                # cached attribute and is always safe to read.
                logger.warning(
                    f"(psutil) Process with pid {proc.pid} exited unexpectedly.")
        self.running_process_count = total_running_proc

    def _get_local_mount_data(self):
        """Retrieves node information for the local_mount_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(
            self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(
            self.units_factor)
        self.total_swap = int(psutil.swap_memory()[0]) // int(
            self.units_factor)
        self.free_swap = int(psutil.swap_memory()[2]) // int(self.units_factor)
        statvfs = os.statvfs("/")
        self.free_inodes = int(100 - math.ceil(
            (float(statvfs.f_files - statvfs.f_ffree) / statvfs.f_files) * 100))

    def _get_cpu_data(self):
        """Retrieves node information for the cpu_data json message"""
        cpu_core_usage_dict = dict()
        cpu_data = psutil.cpu_times_percent()
        self._log_debug(
            "_get_cpu_data, cpu_data: %s %s %s %s %s %s %s %s %s %s" %
            cpu_data)

        self.csps = 0  # What the hell is csps - cycles per second?
        self.user_time = int(cpu_data[0])
        self.nice_time = int(cpu_data[1])
        self.system_time = int(cpu_data[2])
        self.idle_time = int(cpu_data[3])
        self.iowait_time = int(cpu_data[4])
        self.interrupt_time = int(cpu_data[5])
        self.softirq_time = int(cpu_data[6])
        self.steal_time = int(cpu_data[7])

        self.cpu_usage = psutil.cpu_percent(interval=1, percpu=False)
        # Array to hold data about each CPU core
        self.cpu_core_data = []
        index = 0
        while index < self.cpus:
            self._log_debug(
                "_get_cpu_data, index: %s, 1 min: %s, 5 min: %s, 15 min: %s" %
                (index, self.load_1min_average[index],
                 self.load_5min_average[index],
                 self.load_15min_average[index]))

            cpu_core_data = {
                "coreId": index,
                "load1MinAvg": int(self.load_1min_average[index]),
                "load5MinAvg": int(self.load_5min_average[index]),
                "load15MinAvg": int(self.load_15min_average[index]),
                "ips": 0
            }
            self.cpu_core_data.append(cpu_core_data)
            index += 1

    def _get_if_data(self):
        """Retrieves node information for the if_data json message"""
        net_data = psutil.net_io_counters(pernic=True)
        # Array to hold data about each network interface
        self.if_data = []
        bmc_data = self._get_bmc_info()
        for interface, if_data in net_data.items():
            self._log_debug("_get_if_data, interface: %s %s" %
                            (interface, net_data))
            nw_status = self._fetch_nw_status()
            nw_cable_conn_status = self.fetch_nw_cable_conn_status(interface)
            if_data = {
                "ifId":
                interface,
                "networkErrors":
                (net_data[interface].errin + net_data[interface].errout),
                "droppedPacketsIn":
                net_data[interface].dropin,
                "packetsIn":
                net_data[interface].packets_recv,
                "trafficIn":
                net_data[interface].bytes_recv,
                "droppedPacketsOut":
                net_data[interface].dropout,
                "packetsOut":
                net_data[interface].packets_sent,
                "trafficOut":
                net_data[interface].bytes_sent,
                "nwStatus":
                nw_status[interface][0],
                "ipV4":
                nw_status[interface][1],
                "nwCableConnStatus":
                nw_cable_conn_status
            }
            self.if_data.append(if_data)
        self.if_data.append(bmc_data)

    def _fetch_nw_status(self):
        nw_dict = {}
        nws = os.popen("ip --br a | awk '{print $1, $2, $3}'").read().split(
            '\n')[:-1]
        for nw in nws:
            fields = nw.split(' ')
            # The third column is empty when the interface has no address
            ip = fields[2].split("/")[0] if fields[2] else ""
            nw_dict[fields[0]] = [fields[1], ip]
        logger.debug("network info gathered: {}".format(nw_dict))
        return nw_dict

    def fetch_nw_cable_conn_status(self, interface):
        carrier_status = None
        try:
            carrier_status = Network().get_link_state(interface)
        except NetworkError as err:
            # NetworkError, i.e. all OSError exceptions, indicates that
            # the carrier file is not available to access, which
            # constitutes the UNKNOWN status for the network cable.
            logger.debug(err)
            carrier_status = "UNKNOWN"
        except Exception as e:
            # All other exceptions are unexpected and are logged as errors.
            logger.exception(
                "Problem occurred while reading from the nw carrier file:"
                f" {self.nw_interface_path}/{interface}/carrier. Error: {e}")
        return carrier_status

    def _get_bmc_info(self):
        """
        nwCableConnection will be default UNKNOWN,
        Until solution to find bmc eth port cable connection status is found.
        """
        try:
            bmcdata = {
                'ifId': 'ebmc0',
                'ipV4Prev': "",
                'ipV4': "",
                'nwStatus': "DOWN",
                'nwCableConnStatus': 'UNKNOWN'
            }
            ipdata = sp.Popen(
                "sudo ipmitool lan print",
                shell=True,
                stdout=sp.PIPE,
                stderr=sp.PIPE).communicate()[0].decode().strip()
            bmcip = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ipdata)
            if bmcip:
                bmcip = bmcip[0]
                pingbmchost = "ping -c1 -W1 -q " + bmcip
                child = sp.Popen(pingbmchost.split(), stdout=sp.PIPE)
                streamdata = child.communicate(
                )[0]  #child must be communicated before fetching return code.
                retcode = child.returncode
                if self.prev_bmcip is not None and self.prev_bmcip != bmcip:
                    bmcdata['ipV4Prev'] = self.prev_bmcip
                    bmcdata['ipV4'] = bmcip
                    self.prev_bmcip = bmcip
                else:
                    self.prev_bmcip = bmcdata['ipV4Prev'] = bmcdata[
                        'ipV4'] = bmcip
                if retcode == 0:
                    bmcdata['nwStatus'] = "UP"
                else:
                    logger.warn("BMC Host:{0} is not reachable".format(bmcip))
        except Exception as e:
            logger.error(
                "Exception occurred while fetching bmc_info: {}".format(e))
        return bmcdata

    def _get_disk_space_alert_data(self):
        """Retrieves node information for the disk_space_alert_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(
            self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(
            self.units_factor)
        self.disk_used_percentage = psutil.disk_usage("/")[3]

    def _load_1min_avg(self):
        """Loop forever calculating the one minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_1min_average.append(-1)
            index += 1

        while True:
            # The call blocks for the 1-second interval and then returns
            # the per-CPU usage for that window
            self.load_1min_average = psutil.cpu_percent(interval=1,
                                                        percpu=True)

    def _load_5min_avg(self):
        """Loop forever calculating the five minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_5min_average.append(-1)
            index += 1

        while True:
            # The call blocks for the 5-second interval and then returns
            # the per-CPU usage for that window
            self.load_5min_average = psutil.cpu_percent(interval=5,
                                                        percpu=True)

    def _load_15min_avg(self):
        """Loop forever calculating the fifteen minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_15min_average.append(-1)
            index += 1

        while True:
            # The call blocks for the 15-second interval and then returns
            # the per-CPU usage for that window
            self.load_15min_average = psutil.cpu_percent(interval=15,
                                                         percpu=True)
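# --- Illustration (not part of the sensor code) ---
# A sketch of the `ip --br a` parsing done in _fetch_nw_status(), run
# against a captured sample instead of the live command. The sample lines
# and `parse_brief_ip` are hypothetical.
sample = """lo UNKNOWN 127.0.0.1/8
eth0 UP 10.0.0.5/24
eth1 DOWN """

def parse_brief_ip(output):
    """Return {interface: [state, ipv4]} from `ip --br a` brief output."""
    nw_dict = {}
    for line in output.splitlines():
        fields = line.split(' ')
        # The third column is empty when the interface has no address.
        ip = fields[2].split('/')[0] if len(fields) > 2 and fields[2] else ""
        nw_dict[fields[0]] = [fields[1], ip]
    return nw_dict

print(parse_brief_ip(sample))
# {'lo': ['UNKNOWN', '127.0.0.1'], 'eth0': ['UP', '10.0.0.5'], 'eth1': ['DOWN', '']}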
Example 5
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
       is responsible for sensing changes in SAS ports/cables using
       available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    SITE_ID = "site_id"
    CLUSTER_ID = "cluster_id"
    NODE_ID = "node_id"
    RACK_ID = "rack_id"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "SASHBA-0"
    DEFAULT_POLLING_INTERVAL = '30'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    MIN_PHY_COUNT = 4

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.SITE_ID),
            '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.CLUSTER_ID),
            '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.RACK_ID),
            '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.NODE_ID),
            '001')

        # Get the sas port implementor from configuration
        sas_port_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, "sysfs")

        self.polling_interval = int(
            self._conf_reader._get_value_with_default(
                self.SENSOR_NAME.upper(), self.POLLING_INTERVAL,
                self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Call to the sas phy directory which will return a dictionary
            # mapping phy_name to its negotiated link rate
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over the populated dictionary and restructure it
            # Ex: if phy-0:0 reports 12.0/6.0/3.0 Gbit, it is considered Up.
            # {"phy-0:0": ("link_rate", <Up/fault>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'gbit' in value.strip().lower():
                    phy_status = 'Up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[
                    phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert(self.phy_link_count)

        except KeyError:
            logger.error(
                "Unable to get the instance of {} Utility. "
                "Hence shutting down the sensor".format(sas_port_utility))
            self.shutdown()
        except OSError as os_error:
            # An exception instance never compares equal to an errno code;
            # inspect the errno attribute instead
            if os_error.errno == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: the directory path doesn't exist. "
                    "Hence shutting down the sensor")
            elif os_error.errno == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: not enough permission to read from it. "
                    "Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: {0}. Hence shutting down the sensor".format(
                        os_error))
            self.shutdown()
        except Exception as e:
            logger.error(
                "Problem occurred while reading from the sas_phy "
                "directory: {0}. Hence shutting down the sensor".format(e))
            self.shutdown()

        return True

    def check_and_send_alert(self, new_phy_link_count):
        """Check whether alert conditions are met and send an alert if required.
        Alerts are sent if:
        1. All phys are down -> fault alert
        2. 4 phys are up -> fault_resolved alert
        3. The next group of 4 phys comes up -> informational alert

        Sensor data stored in Consul is a tuple (alert_type, phy_link_count)
        """
        if self.sas_phy_stored_alert is None:
            # No info is stored for this node in Consul
            # Initialize alert_type to dummy fault_resolved
            self.sas_phy_stored_alert = ('fault_resolved', new_phy_link_count)
            # Save data to Consul
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
        elif self.sas_phy_stored_alert[0] == 'fault':
            # Previous alert sent for this node was fault, check if fault is resolved
            if new_phy_link_count >= self.MIN_PHY_COUNT:
                alert_type = 'fault_resolved'
                # Send alert
                self._generate_alert(alert_type)
                # Save data to Consul
                self.sas_phy_stored_alert = (alert_type, new_phy_link_count)
                store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
        elif self.sas_phy_stored_alert[0] in ['fault_resolved', 'insertion']:
            # Check if we need to send informational alert
            if new_phy_link_count > self.sas_phy_stored_alert[
                    1] and new_phy_link_count % self.MIN_PHY_COUNT == 0:
                alert_type = 'insertion'
                # Send alert
                self._generate_alert(alert_type)
                # Save data to Consul
                self.sas_phy_stored_alert = (alert_type, new_phy_link_count)
                store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
            # Check to see if we need to send fault alert
            if new_phy_link_count == 0:
                alert_type = 'fault'
                # Send alert
                self._generate_alert(alert_type)
                # Save data to Consul
                self.sas_phy_stored_alert = (alert_type, new_phy_link_count)
                store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
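
    # Illustrative walk-through of the state machine above (counts are
    # hypothetical; MIN_PHY_COUNT == 4):
    #   stored=None,                  count=8 -> seed ('fault_resolved', 8), no alert
    #   stored=('fault_resolved', 8), count=0 -> send 'fault', store ('fault', 0)
    #   stored=('fault', 0),          count=4 -> send 'fault_resolved', store it
    #   stored=('fault_resolved', 4), count=8 -> send 'insertion', store it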

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare the local dict against the global dictionary
                    # for a change in the negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If current link rate has no value like 12/6/3 Gbit
                        # and previously it was up, then it's a fault condition
                        if 'gbit' not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment count for new phy down which were up previously
                            new_phy_down += 1

                            # Make respective phy_status as fault
                            status = 'fault'

                        # Check if 12/6/3 Gbit is there in the current link rate and
                        # the previous alert_type is fault. If so, means phy is Up again
                        elif 'gbit' in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':

                            # Mark respective phy_status as Up
                            status = 'Up'

                            # Increment count for new phy up
                            new_phy_up += 1

                        # Finally update the global dict with the current
                        # link rate and respective phy status
                        self.phy_dir_to_linkrate_mapping[key] = (link_rate,
                                                                 status)

                # Get current phy status i.e number of Up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down
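                # e.g. previously phy_link_count was 8; if one phy dropped
                # (new_phy_down=1) and none recovered (new_phy_up=0), the
                # new count becomes 7 (hypothetical values)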

                # Get the last sent alert info
                # It is a tuple of (alert_type, phy_link_count)
                self.sas_phy_stored_alert = store.get(
                    self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert(new_phy_link_count)
                # Update current active phy count for next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            logger.exception(ae)

        # Re-arm the scheduler every polling_interval seconds (30 by default)
        # to see if there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority, self.run,
                              ())

    def _create_json_message(self, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []

        for key, val in self.phy_dir_to_linkrate_mapping.items():
            # key is like "phy-0:0", so split it on ':'
            # The resulting resource_id will be like "SASHBA-0:phy-0"
            phy_number = key.split(":")[1]
            specific_info[
                "resource_id"] = self.RESOURCE_ID + ':' + "phy-" + phy_number
            specific_info[
                "negotiated_link_rate"] = self.phy_dir_to_linkrate_mapping[
                    key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID,
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg
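
    # Illustrative shape of the emitted message (all field values below are
    # hypothetical, including the severity string):
    # {"sensor_request_type": {"node_data": {
    #     "status": "update", "host_id": "node-1", "alert_type": "fault",
    #     "severity": "critical", "alert_id": "1693910000<hex-salt>",
    #     "info": {..., "resource_id": "SASHBA-0"},
    #     "specific_info": [{"resource_id": "SASHBA-0:phy-0",
    #                        "negotiated_link_rate": "12.0 Gbit"}, ...]}}}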

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id
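
    # Example (hypothetical values): epoch "1693910000" plus the 32-character
    # hex salt from uuid4() yields an alert id like "16939100009f1c2e...".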

    def _generate_alert(self, alert_type):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()
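

# --- Standalone sketch (illustrative only, not part of SSPL) ----------------
# A minimal reproduction of the Up/fault classification that SASPortSensor
# applies to negotiated link-rate strings: a phy counts as "Up" when the
# string contains "Gbit" (e.g. "12.0 Gbit"), otherwise "fault". The function
# name and the sample dictionary are hypothetical.
def classify_phy_links(phy_to_linkrate):
    """Return ({phy: (link_rate, status)}, up_count), mirroring the sensor."""
    classified = {}
    up_count = 0
    for phy, rate in phy_to_linkrate.items():
        status = 'Up' if 'gbit' in rate.strip().lower() else 'fault'
        if status == 'Up':
            up_count += 1
        classified[phy] = (rate.strip(), status)
    return classified, up_count


if __name__ == "__main__":
    # Two phys: one negotiated at 12 Gbit (Up), one with no link (fault)
    sample = {"phy-0:0": "12.0 Gbit", "phy-0:1": "Unknown"}
    print(classify_phy_links(sample))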