Code Example #1
    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(CPUFaultSensor, self).initialize(conf_reader)

        super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

        # get the cpu fault implementor from configuration
        cpu_fault_utility = Conf.get(SSPL_CONF,
                                     f"{self.name().upper()}>{self.PROBE}",
                                     'sysfs')

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(cpu_fault_utility)
        except Exception as err:
            raise Exception(
                "Error while initializing. "
                f"Unable to get the instance of {cpu_fault_utility} Utility, {err}"
            )

        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.CPU_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')

        return True
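
The `self._utility_instance or self.tool_factory.get_instance(...)` line is a small lazy-initialization idiom: an instance injected through the constructor (for example by a test) takes precedence, otherwise the factory builds one on first use. A minimal standalone sketch of the same idiom, using a hypothetical stub factory rather than the real SSPL ToolFactory:

class _StubFactory:
    """Hypothetical stand-in for ToolFactory; returns a dummy utility."""
    def get_instance(self, name):
        return {"utility": name}

_utility_instance = None

def get_utility(factory, utility_name):
    global _utility_instance
    # An already-injected instance wins; otherwise build one on first use.
    _utility_instance = _utility_instance or factory.get_instance(utility_name)
    return _utility_instance

print(get_utility(_StubFactory(), "sysfs"))  # {'utility': 'sysfs'}
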
Code Example #2
    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(CPUFaultSensor, self).initialize(conf_reader)

        super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, 'CC01')

        # get the cpu fault implementor from configuration
        cpu_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", 'sysfs')

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(cpu_fault_utility)
        except Exception as e:
            logger.error(f"Error while initializing, shutting down CPUFaultSensor : {e}")
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.CPU_FAULT_SENSOR_DATA = os.path.join(cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')

        return True
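
Keys passed to Conf.get above follow a `SECTION>key` path convention (for example `Cpufaultsensor>probe`) with a trailing default value. A rough dict-backed sketch of how such a lookup behaves; the real cortx Conf store is index-based and its API may differ:

# Hypothetical stand-in for the Conf store, illustrating only the
# "SECTION>key" path convention and the default fallback.
_FAKE_CONF = {"CPUFAULTSENSOR": {"probe": "sysfs"}}

def conf_get(index, key, default=None):
    section, _, leaf = key.partition(">")
    return _FAKE_CONF.get(section.upper(), {}).get(leaf, default)

assert conf_get("sspl", "Cpufaultsensor>probe", "sysfs") == "sysfs"
assert conf_get("sspl", "Cpufaultsensor>missing", "procfs") == "procfs"
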
Code Example #3
 def __init__(self):
     """Initialize server."""
     super().__init__()
     self.log = CustomLog(const.HEALTH_SVC_NAME)
     self.validate_server_type_support()
     self.sysfs = ToolFactory().get_instance('sysfs')
     self.sysfs.initialize()
     self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
     self.cpu_path = self.sysfs_base_path + const.CPU_PATH
     hw_resources = {
         'cpu': self.get_cpu_info,
         'platform_sensors': self.get_platform_sensors_info,
         'memory': self.get_mem_info,
         'fans': self.get_fans_info,
         'nw_ports': self.get_nw_ports_info,
         'sas_hba': self.get_sas_hba_info,
         'sas_ports': self.get_sas_ports_info,
         'disks': self.get_disks_info,
         'psus': self.get_psu_info
     }
     sw_resources = {
         'cortx_sw_services': self.get_cortx_service_info,
         'external_sw_services': self.get_external_service_info,
         'raid': self.get_raid_info
     }
     self.server_resources = {"hw": hw_resources, "sw": sw_resources}
     self._ipmi = IpmiFactory().get_implementor("ipmitool")
     self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']
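
hw_resources and sw_resources map resource names to bound collector methods, so callers can dispatch by name rather than branching per resource. A self-contained sketch of that dispatch pattern; the collector bodies are placeholders, not the real implementations:

class ResourceMapSketch:
    """Name-to-collector dispatch, mirroring server_resources above."""

    def get_cpu_info(self):
        return {"online": 8}        # placeholder collector

    def get_mem_info(self):
        return {"total_kb": 16384}  # placeholder collector

    def __init__(self):
        self.server_resources = {
            "hw": {"cpu": self.get_cpu_info, "memory": self.get_mem_info},
        }

    def collect(self, kind, name):
        collector = self.server_resources.get(kind, {}).get(name)
        return collector() if collector else {}

print(ResourceMapSketch().collect("hw", "cpu"))  # {'online': 8}
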
Code Example #4
 def __init__(self):
     """Initialize server."""
     super().__init__()
     self.log = CustomLog(const.HEALTH_SVC_NAME)
     server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
     Platform.validate_server_type_support(self.log, ResourceMapError,
                                           server_type)
     self.sysfs = ToolFactory().get_instance('sysfs')
     self.sysfs.initialize()
     self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
     self.cpu_path = self.sysfs_base_path + const.CPU_PATH
     hw_resources = {
         'cpu': self.get_cpu_info,
         'platform_sensor': self.get_platform_sensors_info,
         'memory': self.get_mem_info,
         'fan': self.get_fans_info,
         'nw_port': self.get_nw_ports_info,
         'sas_hba': self.get_sas_hba_info,
         'sas_port': self.get_sas_ports_info,
         'disk': self.get_disks_info,
         'psu': self.get_psu_info
     }
     sw_resources = {
         'cortx_sw_services': self.get_cortx_service_info,
         'external_sw_services': self.get_external_service_info,
         'raid': self.get_raid_info
     }
     self.server_resources = {"hw": hw_resources, "sw": sw_resources}
     self._ipmi = IpmiFactory().get_implementor("ipmitool")
     self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']
     self.service = Service()
     self.resource_indexing_map = ServerResourceMap.resource_indexing_map\
         ["health"]
Code Example #5
    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(MemFaultSensor, self).initialize(conf_reader)

        super(MemFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.SITE_ID_KEY), '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.CLUSTER_ID_KEY), '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.RACK_ID_KEY), '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.NODE_ID_KEY), '001')

        # get the mem fault implementor from configuration
        mem_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, "procfs")

        self.polling_interval = int(
            self._conf_reader._get_value_with_default(
                self.SENSOR_NAME.upper(), self.POLLING_INTERVAL_KEY,
                self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(mem_fault_utility)
#            self._utility_instance.initialize()
        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. Hence shutting "
                "down the sensor {}".format(
                    mem_fault_utility, MemFaultSensor.SENSOR_NAME))
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.MEM_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'MEM_FAULT_SENSOR_DATA_{self._node_id}')

        return True
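
This older variant reads settings through `_conf_reader._get_value_with_default(section, key, default)` instead of Conf.get. A minimal dict-backed stand-in showing the same section/key/default contract; the real ConfigReader is file-backed:

class _StubConfigReader:
    """Dict-backed stand-in for the legacy SSPL config reader."""
    def __init__(self, data):
        self._data = data

    def _get_value_with_default(self, section, key, default):
        return self._data.get(section, {}).get(key, default)

reader = _StubConfigReader({"SYSTEM_INFORMATION": {"node_id": "SN01"}})
node_id = reader._get_value_with_default("SYSTEM_INFORMATION", "node_id", "001")
interval = int(reader._get_value_with_default("MEMFAULTSENSOR",
                                              "polling_interval", "0"))
print(node_id, interval)  # SN01 0
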
Code Example #6
    def __init__(self):
        super(NodeData, self).__init__()

        self.os_utils = OSUtils()
        self._epoch_time = str(int(time.time()))
        # Total number of CPUs
        self.cpus = psutil.cpu_count()
        self.host_id = self.os_utils.get_fqdn()

        # Calculate the load averages on separate blocking threads
        self.load_1min_average = []
        self.load_5min_average = []
        self.load_15min_average = []
        self.prev_bmcip = None
        threading.Thread(target=self._load_1min_avg).start()
        threading.Thread(target=self._load_5min_avg).start()
        threading.Thread(target=self._load_15min_avg).start()

        self.conf_reader = ConfigReader()

        nw_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self._utility_instance = None

        try:
            # Creating the instance of ToolFactory class
            self.tool_factory = ToolFactory()
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(nw_fault_utility)
            if self._utility_instance:
                # Initialize the path as /sys/class/net/
                self.nw_interface_path = self._utility_instance.get_sys_dir_path(
                    'net')
        except KeyError as key_error:
            logger.error(
                f'NodeData, Unable to get the instance of {nw_fault_utility} Utility'
            )
        except Exception as err:
            logger.error(
                f'NodeData, Problem occurred while getting the instance of {nw_fault_utility}'
            )
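
Each loader thread blocks inside psutil.cpu_percent(interval=..., percpu=True) and then replaces the shared list wholesale, so readers always see the most recently completed window. A runnable sketch of that pattern with a shortened interval; daemon=True is an addition here so the sketch can exit, the original threads do not set it:

import threading
import time

import psutil

load_1s_average = []

def _load_1s_avg():
    global load_1s_average
    while True:
        # Blocks for `interval` seconds, then returns per-CPU percentages.
        load_1s_average = psutil.cpu_percent(interval=1, percpu=True)

threading.Thread(target=_load_1s_avg, daemon=True).start()
time.sleep(2.5)
print(load_1s_average)  # e.g. [3.0, 1.0, ...] once the first window completes
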
Code Example #7
class MemFaultSensor(SensorThread, InternalMsgQ):
    """Memory fault Sensor which runs on its own thread once every power cycle and
       is responsible for identifying total RAM memory on the node and any errors in it using
       available tool/utility"""

    SENSOR_NAME = "MemFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:memory"

    # section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    SITE_ID_KEY = "site_id"
    CLUSTER_ID_KEY = "cluster_id"
    NODE_ID_KEY = "node_id"
    RACK_ID_KEY = "rack_id"
    POLLING_INTERVAL_KEY = "polling_interval"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "0"
    DEFAULT_POLLING_INTERVAL = '0'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return MemFaultSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(MemFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance
        self.total_mem = None
        self.mem_path_file = None
        self.prev_mem = None
        self.fault_alert_state = "Neutral State"
        # Flag to indicate suspension of module
        self._suspended = False

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(MemFaultSensor, self).initialize(conf_reader)

        super(MemFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.SITE_ID_KEY), '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.CLUSTER_ID_KEY), '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.RACK_ID_KEY), '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.NODE_ID_KEY), '001')

        # get the mem fault implementor from configuration
        mem_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, "procfs")

        self.polling_interval = int(
            self._conf_reader._get_value_with_default(
                self.SENSOR_NAME.upper(), self.POLLING_INTERVAL_KEY,
                self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(mem_fault_utility)
#            self._utility_instance.initialize()
        except KeyError as key_error:
            logger.error(
                "Unable to get the instance of {} Utility. Hence shutting "
                "down the sensor {}".format(
                    mem_fault_utility, MemFaultSensor.SENSOR_NAME))
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.MEM_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'MEM_FAULT_SENSOR_DATA_{self._node_id}')

        return True

    def get_stored_mem_info(self):
        """ Get the memory info from consul"""

        if store.exists(self.MEM_FAULT_SENSOR_DATA):
            consul_data = (store.get(self.MEM_FAULT_SENSOR_DATA)).split(":")
            self.prev_mem = consul_data[0].strip()
            self.fault_alert_state = consul_data[1].strip()

    def put_mem_info(self, total_memory_size):
        """ Store the current memory in Consul"""

        store.put(f"{total_memory_size}:{self.fault_alert_state}",
                  self.MEM_FAULT_SENSOR_DATA)

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = "fault"

        mem_path = self._utility_instance.get_proc_memory('meminfo')
        if mem_path.is_file():
            self.mem_path_file = mem_path.read_text()
            mem_info_fields = self.mem_path_file.split()

            if mem_info_fields[0] == 'MemTotal:':
                self.total_mem = mem_info_fields[1]

                # Get data from store if available and compare to the current value
                self.get_stored_mem_info()

                if self.prev_mem is not None:
                    # Fault and Fault_resolved Both conditions are handled.
                    if int(self.prev_mem) > int(self.total_mem):
                        # update the store with new value, raise an alert of type "fault"
                        if self.fault_alert_state == "Neutral State":
                            self.fault_alert_state = "Fault Generated"
                            self._generate_alert(alert_type)
                            self.put_mem_info(self.prev_mem)

                    elif (int(self.prev_mem) <= int(self.total_mem)) and (
                            self.fault_alert_state == "Fault Generated"):
                        self.fault_alert_state = "Neutral State"
                        alert_type = "fault_resolved"
                        self._generate_alert(alert_type)
                        self.put_mem_info(self.total_mem)
                else:
                    self.put_mem_info(self.total_mem)
            else:
                logger.error(
                    "MemFaultSensor: invalid file, shutting down the sensor")
                self.shutdown()
                return True
        else:
            logger.error(
                "MemFaultSensor: file does not exist, shutting down the sensor"
            )
            self.shutdown()
            return True

        # Do not proceed if module is suspended
        # Memory sensor is going to trigger only during SSPL reboot; at reboot time a sensor
        # can not be in suspended state.
        # Commented code is retained if in future we want to make the sensor periodic,
        # this piece will be needed
        #if self._suspended is True:
        #    self._scheduler.enter(self.polling_interval, self._priority, self.run, ())
        #    return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        # self scheduling is commented so that the process runs only once per SSPL reboot
        # Enable with correct polling_interval if in future memory sensor needs to run periodically
        #self._scheduler.enter(self.polling_interval, self._priority, self.run, ())

    def _create_json_message(self, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []
        if alert_type == "fault":
            specific_info["event"] = \
                    "Total available main memory value decreased from {} kB to {} kB"\
                    .format(self.prev_mem, self.total_mem)
        elif alert_type == "fault_resolved":
            specific_info["event"] = \
                    "Total main memory value available {} kB"\
                    .format(self.total_mem)

        # populate all the data from /proc/meminfo
        split_strs = [
            s.split(maxsplit=1) for s in self.mem_path_file.splitlines()
        ]
        dictionary_str = dict(split_strs)
        specific_info["meminfo"] = dictionary_str
        specific_info_list.append(specific_info)

        alert_specific_info = specific_info_list

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID,
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(MemFaultSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(MemFaultSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(MemFaultSensor, self).shutdown()
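
MemFaultSensor persists a single "<total_kb>:<state>" string and flips between "Neutral State" and "Fault Generated" depending on whether total memory shrank. The transition logic in run() distills to a small state machine; a store-free sketch under the same two-state model:

def next_state(prev_kb, curr_kb, state):
    """Return (new_state, alert_type) for one run of the memory check."""
    if prev_kb is None:
        return state, None                        # first run: just record
    if prev_kb > curr_kb and state == "Neutral State":
        return "Fault Generated", "fault"         # total memory shrank
    if prev_kb <= curr_kb and state == "Fault Generated":
        return "Neutral State", "fault_resolved"  # memory restored
    return state, None

print(next_state(16384, 8192, "Neutral State"))    # ('Fault Generated', 'fault')
print(next_state(8192, 16384, "Fault Generated"))  # ('Neutral State', 'fault_resolved')
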
Code Example #8
class CPUFaultSensor(SensorThread, InternalMsgQ):
    """CPU Fault Sensor which runs on its own thread on each boot up and
       is responsible for sensing changes in online CPUs using
       available tool/utility"""

    SENSOR_NAME = "CPUFaultSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:os:cpu:core"

    # Section in the configuration store
    SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION"
    SITE_ID_KEY = "site_id"
    CLUSTER_ID_KEY = "cluster_id"
    NODE_ID_KEY = "node_id"
    RACK_ID_KEY = "rack_id"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "CPU-"

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    @staticmethod
    def name():
        """@return: name of the module."""
        return CPUFaultSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(CPUFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        # CPU info
        self.stored_cpu_info = None
        self.prev_cpu_info = None
        self.current_cpu_info = None

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(CPUFaultSensor, self).initialize(conf_reader)

        super(CPUFaultSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.SITE_ID_KEY), '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.CLUSTER_ID_KEY), '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.RACK_ID_KEY), '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION_KEY,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION_KEY).get(
                self.NODE_ID_KEY), '001')

        # get the cpu fault implementor from configuration
        cpu_fault_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, 'sysfs')

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(cpu_fault_utility)
        except Exception as e:
            logger.error(
                f"Error while initializing, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.CPU_FAULT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}')

        return True

    def read_stored_cpu_info(self):
        """Read the most recent stored cpu info"""
        try:
            if self.stored_cpu_info is None:
                self.stored_cpu_info = store.get(self.CPU_FAULT_SENSOR_DATA)
            if self.stored_cpu_info is not None and \
                    self._node_id in self.stored_cpu_info:
                self.prev_cpu_info = self.stored_cpu_info[
                    self._node_id]['CPU_LIST']
        except Exception as e:
            logger.error(
                f"Error while reading stored cpu info, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

    def read_current_cpu_info(self):
        """Read current cpu info"""
        try:
            self.current_cpu_info = self._utility_instance.get_cpu_info()
        except Exception as e:
            logger.error(
                f"Error while reading current cpu info, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

    def run(self):
        """Run the sensor on its own thread"""

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()
        # Read recent stored cpu info
        self.read_stored_cpu_info()
        # Store alerts to be sent here
        self.alerts_for = {}
        # Specific info field for alerts
        self.specific_info = []
        # Read current cpu info
        self.read_current_cpu_info()

        to_update = False
        # Compare with previous cpu info
        # If a cpu is present in prev_cpu_info and not present in current_cpu_info : fault alert is generated
        # If a cpu is present in current_cpu_info and not present in prev_cpu_info : two possibilities
        #   1) if cpu has an outstanding fault alert : it is a repaired cpu, hence generate fault_resolved
        #   2) if cpu has no outstanding alert : it is a newly added cpu, do not do anything
        try:
            if self.prev_cpu_info:
                if self.current_cpu_info != self.prev_cpu_info:
                    # Create a set of all relevant cpus
                    cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
                    # Iterate through the set
                    for cpu in cpu_list:
                        if cpu not in self.current_cpu_info and cpu not in self.stored_cpu_info[
                                self._node_id]['FAULT_LIST']:
                            # This is a failed cpu
                            self.stored_cpu_info[
                                self._node_id]['FAULT_LIST'].append(cpu)
                            self.alerts_for[cpu] = "fault"
                        elif cpu not in self.prev_cpu_info and cpu in self.stored_cpu_info[
                                self._node_id]['FAULT_LIST']:
                            # This is a repaired cpu
                            self.alerts_for[cpu] = "fault_resolved"
                    # Update stored cpu info for next run
                    self.stored_cpu_info[
                        self._node_id]['CPU_LIST'] = self.current_cpu_info
                    to_update = True
            else:
                # Previous cpu info not available, need to store current info
                if not self.stored_cpu_info:
                    # No info is available
                    self.stored_cpu_info = {}
                # Add info for the current node
                self.stored_cpu_info[self._node_id] = {}
                self.stored_cpu_info[
                    self._node_id]['CPU_LIST'] = self.current_cpu_info
                self.stored_cpu_info[self._node_id]['FAULT_LIST'] = []
                # Update stored cpu info
                to_update = True

        except Exception as e:
            logger.error(
                f"Error while processing cpu info, shutting down CPUFaultSensor : {e}"
            )
            self.shutdown()

        # Send alerts
        for cpu, alert_type in self.alerts_for.items():
            if self._generate_alert(cpu, alert_type) and \
                    alert_type == "fault_resolved":
                # Delete from the FAULT_LIST
                self.stored_cpu_info[self._node_id]['FAULT_LIST'].remove(cpu)

        # Update stored cpu info
        if to_update:
            store.put(self.stored_cpu_info, self.CPU_FAULT_SENSOR_DATA)

    def fill_specific_info(self):
        """Fills the specific info to be sent via alert"""
        if not self.specific_info:
            # Create a set of all relevant cpus
            cpu_list = set(self.prev_cpu_info + self.current_cpu_info)
            # Iterate through the set
            for cpu in cpu_list:
                item = {}
                item['resource_id'] = self.RESOURCE_ID + str(cpu)
                # Keep default state online
                item['state'] = "online"
                if cpu in self.alerts_for.keys():
                    if self.alerts_for[cpu] == "fault":
                        item['state'] = "offline"
                self.specific_info.append(item)

    def _create_json_message(self, cpu, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        # Populate specific info
        self.fill_specific_info()
        alert_specific_info = self.specific_info

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID + str(cpu),
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, cpu, alert_type):
        """Queues the message to NodeData Message Handler"""
        try:
            json_msg = self._create_json_message(cpu, alert_type)
            if json_msg:
                # RAAL stands for - RAise ALert
                logger.info(f"RAAL: {json_msg}")
                self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)
            return True
        except Exception as e:
            logger.error(f"Exception while sending alert : {e}")
            return False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(CPUFaultSensor, self).shutdown()
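
The comparison in run() reduces to set arithmetic over CPU id lists plus an outstanding-fault list: a CPU that disappears without an outstanding fault is failed, and one that reappears with an outstanding fault is repaired. A distilled sketch of that decision table:

def diff_cpu_state(prev_list, curr_list, fault_list):
    """Return {cpu: alert_type}, mirroring CPUFaultSensor.run()'s comparison."""
    alerts = {}
    for cpu in set(prev_list) | set(curr_list):
        if cpu not in curr_list and cpu not in fault_list:
            alerts[cpu] = "fault"            # disappeared: failed CPU
        elif cpu not in prev_list and cpu in fault_list:
            alerts[cpu] = "fault_resolved"   # reappeared after a fault
    return alerts

print(diff_cpu_state([0, 1, 2, 3], [0, 1, 3], []))   # {2: 'fault'}
print(diff_cpu_state([0, 1, 3], [0, 1, 2, 3], [2]))  # {2: 'fault_resolved'}
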
Code Example #9
class NodeData(Debug):
    """Obtains data about the node and makes it available"""

    SENSOR_NAME = "NodeData"

    # conf attribute initialization
    PROBE = 'probe'

    @staticmethod
    def name():
        """@return: name of the module."""
        return NodeData.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return ("Server CPU, network, disk space, process and local mount "
                "data can not be monitored.")

    def __init__(self):
        super(NodeData, self).__init__()

        self.os_utils = OSUtils()
        self._epoch_time = str(int(time.time()))
        # Total number of CPUs
        self.cpus = psutil.cpu_count()
        self.host_id = self.os_utils.get_fqdn()

        # Calculate the load averages on separate blocking threads
        self.load_1min_average = []
        self.load_5min_average = []
        self.load_15min_average = []
        self.prev_bmcip = None
        threading.Thread(target=self._load_1min_avg).start()
        threading.Thread(target=self._load_5min_avg).start()
        threading.Thread(target=self._load_15min_avg).start()

        self.conf_reader = ConfigReader()

        nw_fault_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self._utility_instance = None

        try:
            # Creating the instance of ToolFactory class
            self.tool_factory = ToolFactory()
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(nw_fault_utility)
            if self._utility_instance:
                # Initialize the path as /sys/class/net/
                self.nw_interface_path = self._utility_instance.get_sys_dir_path(
                    'net')
        except KeyError as key_error:
            logger.error(
                f'NodeData, Unable to get the instance of {nw_fault_utility} Utility'
            )
        except Exception as err:
            logger.error(
                f'NodeData, Problem occurred while getting the instance of {nw_fault_utility}'
            )

    def read_data(self, subset, debug, units="MB"):
        """Updates data based on a subset"""
        self._set_debug(debug)
        self._log_debug("read_data, subset: %s, units: %s" % (subset, units))

        try:
            # Determine the units factor value
            self.units_factor = 1
            if units == "GB":
                self.units_factor = 1000000000
            elif units == "MB":
                self.units_factor = 1000000
            elif units == "KB":
                self.units_factor = 1000

            self.host_id = self.os_utils.get_fqdn()
            # get_fqdn() first checks socket.gethostname() for the host name; if that
            # is not available, it tries socket.gethostbyaddr(socket.gethostname())[0]
            # and returns a meaningful host name.

            self.local_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')

            # Branch off and gather data based upon value sent into subset
            if subset == "host_update":
                self._get_host_update_data()

            elif subset == "local_mount_data":
                self._get_local_mount_data()

            elif subset == "cpu_data":
                self._get_cpu_data()

            elif subset == "if_data":
                self._get_if_data()

            elif subset == "disk_space_alert":
                self._get_disk_space_alert_data()

        except Exception as e:
            raise Exception(f"Failed to read data, {e}")

        return True

    def _get_host_update_data(self):
        """Retrieves node information for the host_update json message"""
        logged_in_users = []
        uname_keys = ("sysname", "nodename", "version", "release", "machine")
        self.up_time = int(psutil.boot_time())
        self.boot_time = self._epoch_time
        self.uname = dict(zip(uname_keys, os.uname()))
        self.total_memory = dict(psutil.virtual_memory()._asdict())
        self.process_count = len(psutil.pids())
        for users in psutil.users():
            logged_in_users.append(dict(users._asdict()))
        self.logged_in_users = logged_in_users
        # Calculate the current number of running processes at this moment
        total_running_proc = 0
        for proc in psutil.process_iter():
            try:
                pinfo = proc.as_dict(attrs=['status'])
                if pinfo['status'] not in (psutil.STATUS_ZOMBIE,
                                           psutil.STATUS_DEAD,
                                           psutil.STATUS_STOPPED,
                                           psutil.STATUS_IDLE,
                                           psutil.STATUS_SLEEPING):
                    total_running_proc += 1
            except psutil.NoSuchProcess:
                logger.warn(
                    f"(psutil) Process '{proc.name()}' exited unexpectedly.")
        self.running_process_count = total_running_proc

    def _get_local_mount_data(self):
        """Retrieves node information for the local_mount_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(
            self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(
            self.units_factor)
        self.total_swap = int(psutil.swap_memory()[0]) // int(
            self.units_factor)
        self.free_swap = int(psutil.swap_memory()[2]) // int(self.units_factor)
        self.free_inodes = int(100 - math.ceil((float(os.statvfs("/").f_files - os.statvfs("/").f_ffree) \
                             / os.statvfs("/").f_files) * 100))

    def _get_cpu_data(self):
        """Retrieves node information for the cpu_data json message"""
        cpu_core_usage_dict = dict()
        cpu_data = psutil.cpu_times_percent()
        self._log_debug(
            "_get_cpu_data, cpu_data: %s %s %s %s %s %s %s %s %s %s" %
            cpu_data)

        self.csps = 0  # csps is not measured (cycles per second?); left at 0
        self.user_time = int(cpu_data[0])
        self.nice_time = int(cpu_data[1])
        self.system_time = int(cpu_data[2])
        self.idle_time = int(cpu_data[3])
        self.iowait_time = int(cpu_data[4])
        self.interrupt_time = int(cpu_data[5])
        self.softirq_time = int(cpu_data[6])
        self.steal_time = int(cpu_data[7])

        self.cpu_usage = psutil.cpu_percent(interval=1, percpu=False)
        # Array to hold data about each CPU core
        self.cpu_core_data = []
        index = 0
        while index < self.cpus:
            self._log_debug(
                "_get_cpu_data, index: %s, 1 min: %s, 5 min: %s, 15 min: %s" %
                (index, self.load_1min_average[index],
                 self.load_5min_average[index],
                 self.load_15min_average[index]))

            cpu_core_data = {
                "coreId": index,
                "load1MinAvg": int(self.load_1min_average[index]),
                "load5MinAvg": int(self.load_5min_average[index]),
                "load15MinAvg": int(self.load_15min_average[index]),
                "ips": 0
            }
            self.cpu_core_data.append(cpu_core_data)
            index += 1

    def _get_if_data(self):
        """Retrieves node information for the if_data json message"""
        net_data = psutil.net_io_counters(pernic=True)
        # Array to hold data about each network interface
        self.if_data = []
        bmc_data = self._get_bmc_info()
        for interface, if_data in net_data.items():
            self._log_debug("_get_if_data, interface: %s %s" %
                            (interface, net_data))
            nw_status = self._fetch_nw_status()
            nw_cable_conn_status = self.fetch_nw_cable_conn_status(interface)
            if_data = {
                "ifId":
                interface,
                "networkErrors":
                (net_data[interface].errin + net_data[interface].errout),
                "droppedPacketsIn":
                net_data[interface].dropin,
                "packetsIn":
                net_data[interface].packets_recv,
                "trafficIn":
                net_data[interface].bytes_recv,
                "droppedPacketsOut":
                net_data[interface].dropout,
                "packetsOut":
                net_data[interface].packets_sent,
                "trafficOut":
                net_data[interface].bytes_sent,
                "nwStatus":
                nw_status[interface][0],
                "ipV4":
                nw_status[interface][1],
                "nwCableConnStatus":
                nw_cable_conn_status
            }
            self.if_data.append(if_data)
        self.if_data.append(bmc_data)

    def _fetch_nw_status(self):
        nw_dict = {}
        nws = os.popen("ip --br a | awk '{print $1, $2, $3}'").read().split(
            '\n')[:-1]
        for nw in nws:
            if nw.split(' ')[2]:
                ip = nw.split(' ')[2].split("/")[0]
            else:
                ip = ""
            nw_dict[nw.split(' ')[0]] = [nw.split(' ')[1], ip]
        logger.debug("network info going is : {}".format(nw_dict))
        return nw_dict

    def fetch_nw_cable_conn_status(self, interface):
        carrier_status = None
        try:
            carrier_status = Network().get_link_state(interface)
        except NetworkError as err:
            # NetworkError, i.e. any OSError, indicates that the carrier
            # file is not available to access, which constitutes the
            # UNKNOWN status for the network cable.
            logger.debug(err)
            carrier_status = "UNKNOWN"
        except Exception as e:
            # All other exceptions are unexpected and are logged as errors.
            logger.exception(
                "Problem occurred while reading from nw carrier file:"
                f" {self.nw_interface_path}/{interface}/carrier. Error: {e}")
        return carrier_status

    def _get_bmc_info(self):
        """
        nwCableConnection will be default UNKNOWN,
        Until solution to find bmc eth port cable connection status is found.
        """
        try:
            bmcdata = {
                'ifId': 'ebmc0',
                'ipV4Prev': "",
                'ipV4': "",
                'nwStatus': "DOWN",
                'nwCableConnStatus': 'UNKNOWN'
            }
            ipdata = sp.Popen(
                "sudo ipmitool lan print",
                shell=True,
                stdout=sp.PIPE,
                stderr=sp.PIPE).communicate()[0].decode().strip()
            bmcip = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ipdata)
            if bmcip:
                bmcip = bmcip[0]
                pingbmchost = "ping -c1 -W1 -q " + bmcip
                child = sp.Popen(pingbmchost.split(), stdout=sp.PIPE)
                # child must be communicated with before fetching the return code.
                streamdata = child.communicate()[0]
                retcode = child.returncode
                if self.prev_bmcip is not None and self.prev_bmcip != bmcip:
                    bmcdata['ipV4Prev'] = self.prev_bmcip
                    bmcdata['ipV4'] = bmcip
                    self.prev_bmcip = bmcip
                else:
                    self.prev_bmcip = bmcdata['ipV4Prev'] = bmcdata[
                        'ipV4'] = bmcip
                if retcode == 0:
                    bmcdata['nwStatus'] = "UP"
                else:
                    logger.warn("BMC Host:{0} is not reachable".format(bmcip))
        except Exception as e:
            logger.error(
                "Exception occurred while fetching bmc_info: {}".format(e))
        return bmcdata

    def _get_disk_space_alert_data(self):
        """Retrieves node information for the disk_space_alert_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(
            self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(
            self.units_factor)
        self.disk_used_percentage = psutil.disk_usage("/")[3]

    def _load_1min_avg(self):
        """Loop forever calculating the one minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_1min_average.append(-1)
            index += 1

        while True:
            # API call blocks for one minute and then returns the value
            self.load_1min_average = psutil.cpu_percent(interval=1,
                                                        percpu=True)

    def _load_5min_avg(self):
        """Loop forever calculating the five minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_5min_average.append(-1)
            index += 1

        while True:
            # API call blocks for five minutes and then returns the value
            self.load_5min_average = psutil.cpu_percent(interval=5,
                                                        percpu=True)

    def _load_15min_avg(self):
        """Loop forever calculating the fifteen minute average load"""
        # Initialize list to -1 indicating the time interval has not occurred yet
        index = 0
        while index < self.cpus:
            self.load_15min_average.append(-1)
            index += 1

        while True:
            # API call blocks for fifteen minutes and then returns the value
            self.load_15min_average = psutil.cpu_percent(interval=15,
                                                         percpu=True)
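
_fetch_nw_status splits each `ip --br a` line on single spaces, which breaks when the command pads its columns. A hedged sketch of the same parse using whitespace-tolerant splitting; the sample output is illustrative, not taken from the source:

SAMPLE = """lo               UNKNOWN        127.0.0.1/8
eth0             UP             10.230.244.17/22
eth1             DOWN"""

def fetch_nw_status(raw):
    nw_dict = {}
    for line in raw.splitlines():
        parts = line.split()  # tolerant of padded columns
        name, state = parts[0], parts[1]
        ip = parts[2].split("/")[0] if len(parts) > 2 else ""
        nw_dict[name] = [state, ip]
    return nw_dict

print(fetch_nw_status(SAMPLE))
# {'lo': ['UNKNOWN', '127.0.0.1'], 'eth0': ['UP', '10.230.244.17'], 'eth1': ['DOWN', '']}
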
Code Example #10
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
       is responsible for sensing changes in SAS ports/cables using
       available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    SITE_ID = "site_id"
    CLUSTER_ID = "cluster_id"
    NODE_ID = "node_id"
    RACK_ID = "rack_id"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "SASHBA-0"
    DEFAULT_POLLING_INTERVAL = '30'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    # Number of SAS Ports
    NUM_SAS_PORTS = 4
    # Number of Phys in a Port
    NUM_PHYS_PER_PORT = 4
    # Current Data Version
    CURRENT_DATA_VERSION = 1

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0
        self.sas_ports_status = {}
        self.port_phy_list_dict = {}
        self.sas_phy_stored_alert = None

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, 'CC01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self.polling_interval = int(
            Conf.get(SSPL_CONF,
                     f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
                     self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Call to the sas phy directory which will return a dictionary
            # which has phy_name to negotiated link rate mapping
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over populated dictionary and restructure it
            # Ex: if phy-0:0 is 12.0/6.0/3.0, considered as UP.
            # {"phy-0:0": ("link_rate", <Up/Down>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'Gbit'.lower() in value.strip().lower():
                    phy_status = 'up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[
                    phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert()

        except KeyError as key_error:
            logger.error("Unable to get the instance of {} "
                         "Utility. Hence shutting down the sensor".format(
                             sas_port_utility))
            self.shutdown()
        except Exception as e:
            if getattr(e, 'errno', None) == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: the directory path doesn't exist. Hence "
                    "shutting down the sensor")
            elif getattr(e, 'errno', None) == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: not enough permission to read from the "
                    "directory. Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: {0}. Hence shutting down the sensor".format(e))
            self.shutdown()

        return True

    def update_sas_ports_status(self):
        """
        Reads current phy status and updates port connectivity status
        Assumption : phys will be present in multiples of 4
        """
        phy_list = [*self.phy_dir_to_linkrate_mapping]
        phy_list = sort_phy_list(phy_list)

        # Now we have a sorted list of phys
        # Phys 0-3 for the 0th sas port, and so on in groups of 4 phys
        # List containing status of all phys
        hba = []
        for phy in phy_list:
            if self.phy_dir_to_linkrate_mapping[phy][1] == 'up':
                hba.append(1)
            else:
                hba.append(0)

        for i in range(0, self.NUM_SAS_PORTS):
            # Save phy names forming this port for future use
            self.port_phy_list_dict[i] = phy_list[ self.NUM_PHYS_PER_PORT * i : \
                                                        self.NUM_PHYS_PER_PORT * i + self.NUM_PHYS_PER_PORT ]
            # Check port status
            s = set(hba[self.NUM_PHYS_PER_PORT * i:self.NUM_PHYS_PER_PORT * i +
                        self.NUM_PHYS_PER_PORT])
            if len(s) == 1 and 0 in s:
                port_status = 'down'
            elif len(s) == 1 and 1 in s:
                port_status = 'up'
            else:
                port_status = 'degraded'
            # Store the data
            self.sas_ports_status[i] = port_status

    def check_and_send_conn_alert(self):
        """
        Sends conn fault alert if all phys go down
        Sends conn fault_resolved alert if at least 1 sas port (4 phys) comes up
        """
        # Case 1 : all fault for fault alert
        cur_all_fault = True

        # Case 2 : all fault_resolved for fault_resolved alert
        cur_all_fault_resolved = True

        # Previous conn alert that was sent
        prev_conn_alert = self.sas_phy_stored_alert['conn']

        # Current
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # This is key for conn alert, skip
                continue

            # Case 1 : All faults in current status
            if value != 'fault':
                cur_all_fault = False

            # Case 2 : All fault_resolved in current status
            elif value != 'fault_resolved':
                cur_all_fault_resolved = False

        if prev_conn_alert == 'fault_resolved' and cur_all_fault:
            # Send conn fault alert
            alert_type = 'fault'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type

        elif prev_conn_alert == 'fault' and cur_all_fault_resolved:
            # Send conn fault_resolved alert
            alert_type = 'fault_resolved'
            self._generate_alert(alert_type, -1)
            self.sas_phy_stored_alert['conn'] = alert_type

    def handle_current_version_data(self):
        """Contains logic to check and send alert if data has version == 1."""
        # Compare current status of each port with previous alert_type
        for port, value in self.sas_phy_stored_alert.items():
            if port in ['version', 'conn']:
                # Skip
                continue
            if value == 'fault_resolved' and \
                        self.sas_ports_status[port] == 'down':
                alert_type = 'fault'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
            elif value == 'fault' and \
                        self.sas_ports_status[port] == 'up':
                alert_type = 'fault_resolved'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
        # See if conn failure/conn resolved alert needs to be sent
        self.check_and_send_conn_alert()
        # Save data to store
        store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    def check_and_send_alert(self):
        """Checks whether conditions are met and sends alert if required
        Alerts will be sent if -
        1. All 4 phys of a sas port go up -> down : fault alert
        2. All 4 phys of a sas port come down -> up : fault_resolved alert
        Sensor data stored in persistent storage is a dict of { sas_port_number : alert_type }
        """
        # Update sas ports status
        self.update_sas_ports_status()

        # Check the version of stored alert
        version = None
        try:
            # Try to get the version
            # Exception will be raised if stored alert is None or no Version is available
            version = self.sas_phy_stored_alert['version']
        except Exception:
            logger.warning(
                "Found no data or old data format for SASPortSensor, "
                f"updating data format to version {self.CURRENT_DATA_VERSION}")
            # Versioning is not implemented or there is no data, write new data
            # Initialize dummy fault_resolved for all sas ports and conn
            self.sas_phy_stored_alert = {}
            self.sas_phy_stored_alert['version'] = self.CURRENT_DATA_VERSION
            self.sas_phy_stored_alert['conn'] = 'fault_resolved'
            for i in range(0, self.NUM_SAS_PORTS):
                self.sas_phy_stored_alert[i] = 'fault_resolved'
            # Save data to store
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

        if version == self.CURRENT_DATA_VERSION:
            self.handle_current_version_data()

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare local dict wrt global dictionary for change in the
                    # negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If current link rate has no value like 12/6/3 Gbit
                        # and previously it was up, then it's a fault condition
                        if 'Gbit'.lower() not in link_rate.lower(
                        ) and prev_alert_type.lower() == 'up':
                            # Increment count for new phy down which were up previously
                            new_phy_down += 1

                            # Make respective phy_status as fault
                            status = 'fault'

                        # Check if 12/6/3 Gbit is there in the current link rate and
                        # the previous alert_type is fault. If so, means phy is Up again
                        elif 'gbit' in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':

                            # Mark respective phy_status as Up
                            status = 'up'

                            # Increment count for new phy up
                            new_phy_up += 1

                        # Finally update the global dict with current link rate
                        # and respective phy status
                        self.phy_dir_to_linkrate_mapping[key] = (link_rate,
                                                                 status)

                # Get current phy status i.e. number of Up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                self.sas_phy_stored_alert = store.get(
                    self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert()
                # Update current active phy count for next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            logger.exception(ae)

        # Fire every polling interval to see if there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority, self.run,
                              ())

    def _create_json_message(self, alert_type, port):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []
        description = "N/A"

        # specific_info will contain all 16 phys for conn level alert
        # Only 4 phys for port level alert
        for key, val in self.phy_dir_to_linkrate_mapping.items():
            if port != -1:
                # This is a port level alert, skip phys that are not relevant
                if key not in self.port_phy_list_dict[port]:
                    # Skip adding this phy
                    continue
            # Key will be like phy-0:0, so split it on ':'
            # The resulting resource_id structure will be SASHBA-0:phy-0
            phy_number = key.split(":")[1]
            specific_info["resource_id"] = \
                self.RESOURCE_ID + ':' + "phy-" + phy_number
            specific_info["negotiated_link_rate"] = \
                self.phy_dir_to_linkrate_mapping[key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        if port == -1:
            # This is a SAS HBA level connection alert
            if alert_type == 'fault':
                description = "SAS connection error detected in SAS HBA %s." % self.RESOURCE_ID
            elif alert_type == 'fault_resolved':
                description = "SAS connection re-established in SAS HBA %s." % self.RESOURCE_ID

            info = {
                "site_id": self._site_id,
                "cluster_id": self._cluster_id,
                "rack_id": self._rack_id,
                "node_id": self._node_id,
                "resource_type": self.RESOURCE_TYPE,  # node:interface:sas
                "resource_id": self.RESOURCE_ID,  # SASHBA-0
                "event_time": epoch_time,
                "description": description
            }
        else:
            # This is a port level alert
            if alert_type == 'fault':
                description = (
                    "No connectivity detected on the SAS port %s, possible "
                    "causes could be a missing SAS cable, bad cable connection, "
                    "faulty cable or SAS port failure." % port)
            elif alert_type == 'fault_resolved':
                description = "Connection established on SAS port."

            info = {
                "site_id": self._site_id,
                "cluster_id": self._cluster_id,
                "rack_id": self._rack_id,
                "node_id": self._node_id,
                "resource_type":
                self.RESOURCE_TYPE + ':port',  # node:interface:sas:port
                "resource_id":
                self.RESOURCE_ID + f'-port-{port}',  # SASHBA-0-port-0
                "event_time": epoch_time,
                "description": description
            }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type, port):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type, port)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()
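
For reference, the up/down transition rule that run() applies to each phy can be distilled into a small pure function. This is an illustrative sketch only; classify_phy is not part of the sensor's API.

def classify_phy(prev_rate, prev_status, new_rate):
    """Return the new phy status, given the previous state and a new link rate."""
    if new_rate.lower() == prev_rate.lower():
        return prev_status      # no change in negotiated rate
    if 'gbit' not in new_rate.lower() and prev_status.lower() == 'up':
        return 'fault'          # was up, lost its 12/6/3 Gbit rate
    if 'gbit' in new_rate.lower() and prev_status.lower() == 'fault':
        return 'up'             # regained a valid Gbit rate
    return prev_status

assert classify_phy('12.0 Gbit', 'up', 'Unknown') == 'fault'
assert classify_phy('Unknown', 'fault', '6.0 Gbit') == 'up'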
Code example #11
    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, 'DC01')
        self._rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, 'RC01')
        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')
        self._cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, 'CC01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self.polling_interval = int(
            Conf.get(SSPL_CONF,
                     f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
                     self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Query the sas phy directory, which returns a dictionary
            # mapping phy_name to negotiated link rate
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over populated dictionary and restructure it
            # Ex: if phy-0:0 is 12.0/6.0/3.0, considered as UP.
            # {"phy-0:0": ("link_rate", <Up/Down>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'Gbit'.lower() in value.strip().lower():
                    phy_status = 'up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[
                    phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert()

        except KeyError as key_error:
            logger.error(
                f"Unable to get the instance of {sas_port_utility} utility, "
                f"{key_error}. Hence shutting down the sensor")
            self.shutdown()
        except Exception as e:
            # errno must be compared via the exception's errno attribute,
            # not against the exception object itself
            err_no = getattr(e, 'errno', None)
            if err_no == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: the directory path doesn't exist. Hence "
                    "shutting down the sensor")
            elif err_no == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: not enough permission to read from the "
                    "directory. Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    f"directory: {e}. Hence shutting down the sensor")
            self.shutdown()

        return True
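
The handler above distinguishes ENOENT from EACCES; note that this only works when comparing OSError.errno, not the exception object itself. A minimal standalone sketch of the pattern (read_sysfs and the attribute path are hypothetical):

import errno

def read_sysfs(path):
    """Read a sysfs attribute, reporting the distinct failure modes."""
    try:
        with open(path) as f:
            return f.read().strip()
    except OSError as e:
        if e.errno == errno.ENOENT:
            print(f"{path} does not exist")
        elif e.errno == errno.EACCES:
            print(f"not enough permission to read {path}")
        else:
            print(f"unexpected error reading {path}: {e}")
        return None

read_sysfs("/sys/class/sas_phy/phy-0:0/negotiated_linkrate")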
Code example #12
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
       is responsible for sensing changes is SAS ports/cable using
       available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    SITE_ID = "site_id"
    CLUSTER_ID = "cluster_id"
    NODE_ID = "node_id"
    RACK_ID = "rack_id"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"

    RESOURCE_ID = "SASHBA-0"
    DEFAULT_POLLING_INTERVAL = '30'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {
        "plugins": ["NodeDataMsgHandler", "LoggingMsgHandler"],
        "rpms": []
    }

    MIN_PHY_COUNT = 4

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)

        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._site_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.SITE_ID),
            '001')
        self._cluster_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.CLUSTER_ID),
            '001')
        self._rack_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.RACK_ID),
            '001')
        self._node_id = self._conf_reader._get_value_with_default(
            self.SYSTEM_INFORMATION,
            COMMON_CONFIGS.get(self.SYSTEM_INFORMATION).get(self.NODE_ID),
            '001')

        # Get the sas port implementor from configuration
        sas_port_utility = self._conf_reader._get_value_with_default(
            self.name().capitalize(), self.PROBE, "sysfs")

        self.polling_interval = int(
            self._conf_reader._get_value_with_default(
                self.SENSOR_NAME.upper(), self.POLLING_INTERVAL,
                self.DEFAULT_POLLING_INTERVAL))

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()

        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()
            phy_status = None

            link_value_phy_status_collection = ()

            # Query the sas phy directory, which returns a dictionary
            # mapping phy_name to negotiated link rate
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                    self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over populated dictionary and restructure it
            # Ex: if phy-0:0 is 12.0/6.0/3.0, considered as UP.
            # {"phy-0:0": ("link_rate", <Up/Down>)}
            for phy, value in self.phy_dir_to_linkrate_mapping.items():
                if 'Gbit'.lower() in value.strip().lower():
                    phy_status = 'Up'
                    # Increment global phy_link count for UP status
                    self.phy_link_count += 1
                else:
                    phy_status = 'fault'
                link_value_phy_status_collection = (value, phy_status)
                self.phy_dir_to_linkrate_mapping[
                    phy] = link_value_phy_status_collection

            # Get the stored previous alert info
            self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
            self.check_and_send_alert(self.phy_link_count)

        except KeyError as key_error:
            logger.error(
                f"Unable to get the instance of {sas_port_utility} utility, "
                f"{key_error}. Hence shutting down the sensor")
            self.shutdown()
        except Exception as e:
            # errno must be compared via the exception's errno attribute,
            # not against the exception object itself
            err_no = getattr(e, 'errno', None)
            if err_no == errno.ENOENT:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: the directory path doesn't exist. Hence "
                    "shutting down the sensor")
            elif err_no == errno.EACCES:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    "directory: not enough permission to read from the "
                    "directory. Hence shutting down the sensor")
            else:
                logger.error(
                    "Problem occurred while reading from the sas_phy "
                    f"directory: {e}. Hence shutting down the sensor")
            self.shutdown()

        return True

    def check_and_send_alert(self, new_phy_link_count):
        """Checks whether conditions are met and sends alert if required
        Alerts will be sent if -
        1. All phys are down -> fault alert
        2. 4 phys are up -> fault_resolved alert
        3. Next group of 4 phys comes up -> informational alert

        Sensor data stored in Consul is a tuple (alert_type, phy_link_count)
        """
        if self.sas_phy_stored_alert is None:
            # No info is stored for this node in Consul
            # Initialize alert_type to dummy fault_resolved
            self.sas_phy_stored_alert = ('fault_resolved', new_phy_link_count)
            # Save data to Consul
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
        elif self.sas_phy_stored_alert[0] == 'fault':
            # Previous alert sent for this node was fault, check if fault is resolved
            if new_phy_link_count >= self.MIN_PHY_COUNT:
                alert_type = 'fault_resolved'
                # Send alert
                self._generate_alert(alert_type)
                # Save data to Consul
                self.sas_phy_stored_alert = (alert_type, new_phy_link_count)
                store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
        elif self.sas_phy_stored_alert[0] in ['fault_resolved', 'insertion']:
            # Check if we need to send informational alert
            if new_phy_link_count > self.sas_phy_stored_alert[1] and \
                    new_phy_link_count % self.MIN_PHY_COUNT == 0:
                alert_type = 'insertion'
                # Send alert
                self._generate_alert(alert_type)
                # Save data to Consul
                self.sas_phy_stored_alert = (alert_type, new_phy_link_count)
                store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)
            # Check to see if we need to send fault alert
            if new_phy_link_count == 0:
                alert_type = 'fault'
                # Send alert
                self._generate_alert(alert_type)
                # Save data to Consul
                self.sas_phy_stored_alert = (alert_type, new_phy_link_count)
                store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    def run(self):
        """Run the sensor on its own thread"""

        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare local dict wrt global dictionary for change in the
                    # negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If current link rate has no value like 12/6/3 Gbit
                        # and previously it was up, then it's a fault condition
                        if 'gbit' not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment count for new phy down which were up previously
                            new_phy_down += 1

                            # Make respective phy_status as fault
                            status = 'fault'

                        # Check if 12/6/3 Gbit is there in the current link rate and
                        # the previous alert_type is fault. If so, means phy is Up again
                        elif 'gbit' in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':

                            # Mark respective phy_status as Up
                            status = 'Up'

                            # Increment count for new phy up
                            new_phy_up += 1

                        # Finally update the global dict with current link rate
                        # and respective phy status
                        self.phy_dir_to_linkrate_mapping[key] = (link_rate,
                                                                 status)

                # Get current phy status i.e. number of Up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                # It is a tuple of (alert_type, phy_link_count)
                self.sas_phy_stored_alert = store.get(
                    self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert(new_phy_link_count)
                # Update current active phy count for next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            logger.exception(ae)

        # Fire every polling interval to see if there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority, self.run,
                              ())

    def _create_json_message(self, alert_type):
        """Creates a defined json message structure which can flow inside SSPL
           modules"""

        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = socket.gethostname()

        specific_info = {}
        specific_info_list = []

        for key, val in self.phy_dir_to_linkrate_mapping.items():
            # key will be like phy-0:0, so split it on ':'
            # The resulting resource_id structure will be SASHBA-0:phy-0
            phy_number = key.split(":")[1]
            specific_info["resource_id"] = \
                self.RESOURCE_ID + ':' + "phy-" + phy_number
            specific_info["negotiated_link_rate"] = \
                self.phy_dir_to_linkrate_mapping[key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID,
            "event_time": epoch_time
        }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of
           epoch_time and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type):
        """Queues the message to NodeData Message Handler"""

        json_msg = self._create_json_message(alert_type)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()
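
The alert cadence encoded in check_and_send_alert() above can be summarized as a small state function. An illustrative sketch, assuming MIN_PHY_COUNT = 4; next_alert is not part of the sensor:

MIN_PHY_COUNT = 4

def next_alert(stored, new_count):
    """Return the alert to raise, given the stored (alert_type, count) tuple."""
    alert_type, old_count = stored
    if alert_type == 'fault' and new_count >= MIN_PHY_COUNT:
        return 'fault_resolved'
    if alert_type in ('fault_resolved', 'insertion'):
        if new_count > old_count and new_count % MIN_PHY_COUNT == 0:
            return 'insertion'
        if new_count == 0:
            return 'fault'
    return None

assert next_alert(('fault', 0), 4) == 'fault_resolved'
assert next_alert(('fault_resolved', 4), 8) == 'insertion'
assert next_alert(('insertion', 8), 0) == 'fault'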
Code example #13
class ServerMap(ResourceMap):
    """ServerMap class provides resource map and related information
    like health, manifest, etc.
    """

    name = "server"

    def __init__(self):
        """Initialize server."""
        super().__init__()
        self.log = CustomLog(const.HEALTH_SVC_NAME)
        self.validate_server_type_support()
        self.sysfs = ToolFactory().get_instance('sysfs')
        self.sysfs.initialize()
        self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
        self.cpu_path = self.sysfs_base_path + const.CPU_PATH
        hw_resources = {
            'cpu': self.get_cpu_info,
            'platform_sensors': self.get_platform_sensors_info,
            'memory': self.get_mem_info,
            'fans': self.get_fans_info,
            'nw_ports': self.get_nw_ports_info,
            'sas_hba': self.get_sas_hba_info,
            'sas_ports': self.get_sas_ports_info,
            'disks': self.get_disks_info,
            'psus': self.get_psu_info
        }
        sw_resources = {
            'cortx_sw_services': self.get_cortx_service_info,
            'external_sw_services': self.get_external_service_info,
            'raid': self.get_raid_info
        }
        self.server_resources = {"hw": hw_resources, "sw": sw_resources}
        self._ipmi = IpmiFactory().get_implementor("ipmitool")
        self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']

    def validate_server_type_support(self):
        """Check for supported server type."""
        server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
        logger.debug(self.log.svc_log(f"Server Type:{server_type}"))
        if not server_type:
            msg = "ConfigError: server type is unknown."
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)
        if server_type.lower() not in \
                const.RESOURCE_MAP["server_type_supported"]:
            msg = f"Health provider is not supported for server type '{server_type}'"
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)

    def get_health_info(self, rpath):
        """
        Fetch health information for given rpath.

        rpath: Resource path to fetch its health
               Examples:
                    node>compute[0]
                    node>compute[0]>hw
                    node>compute[0]>hw>disks
        """
        logger.info(self.log.svc_log(f"Get Health data for rpath:{rpath}"))
        info = {}
        resource_found = False
        nodes = rpath.strip().split(">")
        leaf_node, _ = self.get_node_details(nodes[-1])

        # Fetch health information for all sub nodes
        if leaf_node == "compute":
            info = self.get_server_health_info()
            resource_found = True
        elif leaf_node in self.server_resources:
            for resource, method in self.server_resources[leaf_node].items():
                try:
                    info.update({resource: method()})
                    resource_found = True
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}: {err}"))
                    info = None
        else:
            for node in nodes:
                resource, _ = self.get_node_details(node)
                for res_type in self.server_resources:
                    method = self.server_resources[res_type].get(resource)
                    if not method:
                        logger.error(
                            self.log.svc_log(
                                f"No mapping function found for {res_type}"))
                        continue
                    try:
                        info = method()
                        resource_found = True
                    except Exception as err:
                        logger.error(
                            self.log.svc_log(
                                f"{err.__class__.__name__}: {err}"))
                        info = None
                if resource_found:
                    break

        if not resource_found:
            msg = f"Invalid rpath or health provider doesn't have support for'{rpath}'."
            logger.error(self.log.svc_log(f"{msg}"))
            raise ResourceMapError(errno.EINVAL, msg)

        return info

    @staticmethod
    def _is_any_resource_unhealthy(fru, data):
        """Check for any unhealthy resource at child level."""
        for child in data[fru]:
            if isinstance(child, dict):
                if child.get("health") and \
                    child["health"]["status"].lower() != "ok":
                    return True
        return False

    def get_server_health_info(self):
        """Returns overall server information"""
        unhealthy_resource_found = False
        server_details = Platform().get_server_details()
        # Currently only one instance of server is considered
        server = []
        info = {}
        info["make"] = server_details["Board Mfg"]
        info["model"] = server_details["Product Name"]
        try:
            build_instance = BuildInfo()
            info["product_family"] = build_instance.get_attribute("NAME")
            info["version"] = build_instance.get_attribute("VERSION")
            info["build"] = build_instance.get_attribute("BUILD")
        except Exception as err:
            logger.error(
                self.log.svc_log(f"Unable to get build info due to {err}"))
        info["resource_usage"] = {}
        info["resource_usage"]["cpu_usage"] = self.get_cpu_overall_usage()
        info["resource_usage"]["disk_usage"] = self.get_disk_overall_usage()
        info["resource_usage"]["memory_usage"] = self.get_memory_overall_usage(
        )

        for res_type in self.server_resources:
            info.update({res_type: {}})
            for fru, method in self.server_resources[res_type].items():
                try:
                    info[res_type].update({fru: method()})
                    # Accumulate so that any single unhealthy FRU degrades
                    # the overall status
                    unhealthy_resource_found = unhealthy_resource_found or \
                        self._is_any_resource_unhealthy(fru, info[res_type])
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}:{err}"))
                    info[res_type].update({fru: None})

        info["uid"] = socket.getfqdn()
        info["last_updated"] = int(time.time())
        info["health"] = {}
        info["health"][
            "status"] = "OK" if not unhealthy_resource_found else "Degraded"
        health_desc = 'good' if info["health"]["status"] == 'OK' else 'bad'
        info["health"]["description"] = f"Server is in {health_desc} health."
        info["health"]["recommendation"] = const.DEFAULT_RECOMMENDATION \
            if info["health"]["status"] != "OK" else "NA"
        info["health"]["specifics"] = []
        server.append(info)
        return server

    @staticmethod
    def get_cpu_usage(index=2, percpu=False):
        """Get CPU usage list."""
        i = 0
        cpu_usage = None
        while i < index:
            cpu_usage = psutil.cpu_percent(interval=None, percpu=percpu)
            time.sleep(1)
            i = i + 1
        return cpu_usage

    def get_cpu_list(self, mode):
        """Returns the CPU list as per specified mode."""
        cpu_info_path = Path(self.cpu_path + mode)
        # Read the text from the /cpu/<mode> sysfs file (e.g. online, present)
        cpu_info = cpu_info_path.read_text()
        # Drop the \n character from the end of string
        cpu_info = cpu_info.rstrip('\n')
        # Convert the string to list of indexes
        cpu_list = self.sysfs.convert_cpu_info_list(cpu_info)
        return cpu_list

    def get_cpu_info(self, add_overall_usage=False):
        """Update and return CPU information in specific format."""
        per_cpu_data = []
        cpu_present = self.get_cpu_list("present")
        cpu_online = self.get_cpu_list("online")
        cpu_usage = self.get_cpu_usage(percpu=True)
        cpu_usage_dict = dict(zip(cpu_online, cpu_usage))
        overall_cpu_usage = list(psutil.getloadavg())
        cpu_count = len(cpu_present)
        overall_usage = {
            "current": self.get_cpu_usage(percpu=False),
            "1_min_avg": overall_cpu_usage[0],
            "5_min_avg": overall_cpu_usage[1],
            "15_min_avg": overall_cpu_usage[2]
        }

        for cpu_id in range(0, cpu_count):
            uid = f"CPU-{cpu_id}"
            cpu_dict = self.get_health_template(uid, is_fru=False)
            online_status = "Online" if cpu_id in cpu_online else "Offline"
            health_status = "OK" if online_status == "Online" else "NA"
            usage = "NA" if health_status == "NA" \
                else cpu_usage_dict[cpu_id]
            specifics = [{"cpu_usage": usage, "state": online_status}]
            self.set_health_data(cpu_dict,
                                 status=health_status,
                                 specifics=specifics)
            per_cpu_data.append(cpu_dict)

        cpu_data = [{
            "overall_usage": overall_usage,
            "cpu_count": cpu_count,
            "last_updated": int(time.time()),
            "cpus": per_cpu_data
        }]
        if not add_overall_usage:
            cpu_data = per_cpu_data

        logger.debug(self.log.svc_log(f"CPU Health Data:{cpu_data}"))
        return cpu_data

    def get_cpu_overall_usage(self):
        """Returns CPU overall usage."""
        overall_usage = None
        cpu_data = self.get_cpu_info(add_overall_usage=True)
        if cpu_data[0].get("overall_usage"):
            overall_usage = cpu_data[0].get("overall_usage")
        else:
            logger.error(self.log.svc_log("Failed to get overall cpu usage"))
        return overall_usage

    def get_disk_info(self, add_overall_usage=False):
        """Update and return Disk information in specific format."""
        per_disk_data = []
        overall_usage = None
        disk_data = [{
            "overall_usage": overall_usage,
            "last_updated": int(time.time()),
            "disks": per_disk_data
        }]
        if not add_overall_usage:
            disk_data = per_disk_data

        logger.debug(self.log.svc_log(f"Disk Health Data:{disk_data}"))
        return disk_data

    def format_ipmi_platform_sensor_reading(self, reading):
        """builds json resposne from ipmi tool response.
        reading arg sample: ('CPU1 Temp', '01', 'ok', '3.1', '36 degrees C')
        """
        uid = '_'.join(reading[0].split())
        sensor_id = reading[0]
        sensor_props = self._ipmi.get_sensor_props(sensor_id)
        lower_critical = sensor_props[1].get('Lower Critical', 'NA')
        upper_critical = sensor_props[1].get('Upper Critical', 'NA')
        lower_non_recoverable = sensor_props[1].get('Lower Non-Recoverable',
                                                    'NA')
        upper_non_recoverable = sensor_props[1].get('Upper Non-Recoverable',
                                                    'NA')
        status = 'OK' if reading[2] == 'ok' else 'NA'
        health_desc = 'good' if status == 'OK' else 'bad'
        description = f"{uid} sensor is in {health_desc} health."
        recommendation = const.DEFAULT_RECOMMENDATION if status != 'OK' else 'NA'
        specifics = [{
            "Sensor Reading": f"{reading[-1]}",
            "lower_critical_threshold": lower_critical,
            "upper_critical_threshold": upper_critical,
            "lower_non_recoverable": lower_non_recoverable,
            "upper_non_recoverable": upper_non_recoverable,
        }]
        resp = self.get_health_template(uid, is_fru=False)
        self.set_health_data(resp, status, description, recommendation,
                             specifics)
        return resp

    def get_platform_sensors_info(self):
        """Get the sensor information based on sensor_type and instance."""
        response = {sensor: [] for sensor in self.platform_sensor_list}
        for sensor in self.platform_sensor_list:
            sensor_reading = self._ipmi.get_sensor_list_by_type(sensor)
            if not sensor_reading:
                logger.debug(
                    self.log.svc_log(f"No sensor data received for :{sensor}"))
                continue
            for reading in sensor_reading:
                response[sensor].append(
                    self.format_ipmi_platform_sensor_reading(reading))
        logger.debug(
            self.log.svc_log(f"Platform Sensor Health Data:{response}"))
        return response

    def get_mem_info(self):
        """Collect & return system memory info in specific format."""
        default_mem_usage_threshold = int(
            Conf.get(SSPL_CONF,
                     "NODEDATAMSGHANDLER>host_memory_usage_threshold", 80))
        data = []
        status = "OK"
        description = "Host memory is in good health."
        self.mem_info = dict(psutil.virtual_memory()._asdict())
        curr_mem_usage = int(self.mem_info['percent'])
        if curr_mem_usage > default_mem_usage_threshold:
            status = "Overloaded"
            description = (
                f"Current host memory usage is {curr_mem_usage}%, "
                f"beyond the configured threshold of "
                f"{default_mem_usage_threshold}%.")

        memory_dict = self.prepare_mem_json(status, description)
        data.append(memory_dict)
        logger.debug(self.log.svc_log(f"Memory Health Data:{data}"))
        return data

    def prepare_mem_json(self, status, description):
        """Update and return memory information dict."""
        total_memory = {}
        for key, value in self.mem_info.items():
            if key == 'percent':
                total_memory['percent'] = str(self.mem_info['percent']) + '%'
            else:
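                # Right-shifting by 20 converts bytes to MiB (2 ** 20),
                # reported here with an 'MB' suffix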
                total_memory[key] = str(self.mem_info[key] >> 20) + 'MB'
        uid = "main_memory"
        specifics = [{
            "total": total_memory['total'],
            "available": total_memory['available'],
            "percent": total_memory['percent'],
            "used": total_memory['used'],
            "free": total_memory['free'],
            "active": total_memory['active'],
            "inactive": total_memory['inactive'],
            "buffers": total_memory['buffers'],
            "cached": total_memory['cached'],
            "shared": total_memory['shared'],
            "slab": total_memory['slab']
        }]
        memory_dict = self.get_health_template(uid, is_fru=False)
        self.set_health_data(memory_dict,
                             status=status,
                             description=description,
                             specifics=specifics)
        return memory_dict

    def get_memory_overall_usage(self):
        """Returns Memory overall usage."""
        overall_usage = None
        mem_info = self.get_mem_info()
        if mem_info[0].get("health"):
            overall_usage = mem_info[0]["health"]["specifics"]
        else:
            logger.error(
                self.log.svc_log("Failed to get memory overall usage"))
        return overall_usage

    def get_fans_info(self):
        """Get the Fan sensor information using ipmitool."""
        data = []
        sensor_reading = self._ipmi.get_sensor_list_by_type('Fan')
        if sensor_reading is None:
            msg = f"Failed to get Fan sensor reading using ipmitool"
            logger.error(self.log.svc_log(msg))
            return
        for fan_reading in sensor_reading:
            sensor_id = fan_reading[0]
            fan_dict = self.get_health_template(sensor_id, is_fru=True)
            sensor_props = self._ipmi.get_sensor_props(sensor_id)
            status = 'OK' if fan_reading[2] == 'ok' else 'NA'
            lower_critical = sensor_props[1].get('Lower Critical', 'NA')
            upper_critical = sensor_props[1].get('Upper Critical', 'NA')
            specifics = [{
                "Sensor Reading": f"{fan_reading[-1]}",
                "lower_critical_threshold": lower_critical,
                "upper_critical_threshold": upper_critical
            }]

            self.set_health_data(fan_dict, status=status, specifics=specifics)

            data.append(fan_dict)
            logger.debug(self.log.svc_log(f"Fan Health Data:{fan_dict}"))
        return data

    def get_sas_hba_info(self):
        """Return SAS-HBA current health."""
        sas_hba_data = []
        sas_instance = SAS()
        try:
            hosts = sas_instance.get_host_list()  # ['host1']
        except SASError as err:
            hosts = []
            logger.error(self.log.svc_log(err))
        except Exception as err:
            hosts = []
            logger.exception(self.log.svc_log(err))

        for host in hosts:
            host_id = const.SAS_RESOURCE_ID + host.replace('host', '')
            host_data = self.get_health_template(host_id, False)
            try:
                ports = sas_instance.get_port_list(host)
                # ports = ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3']
            except SASError as err:
                ports = []
                logger.error(self.log.svc_log(err))
            except Exception as err:
                ports = []
                logger.exception(self.log.svc_log(err))
            health = "OK"
            specifics = {'num_ports': len(ports), 'ports': []}
            for port in ports:
                try:
                    port_data = sas_instance.get_port_data(port)
                except SASError as err:
                    port_data = []
                    logger.error(self.log.svc_log(err))
                except Exception as err:
                    port_data = []
                    logger.exception(self.log.svc_log(err))
                specifics['ports'].append(port_data)
                if not port_data or port_data['state'] != 'running':
                    health = "NA"
            self.set_health_data(host_data, health, specifics=[specifics])
            sas_hba_data.append(host_data)
        return sas_hba_data

    def get_sas_ports_info(self):
        """Return SAS Ports current health."""
        sas_ports_data = []
        sas_instance = SAS()
        try:
            ports = sas_instance.get_port_list()
            # eg: ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3']
        except SASError as err:
            ports = []
            logger.error(self.log.svc_log(err))
        except Exception as err:
            ports = []
            logger.exception(self.log.svc_log(err))

        for port in ports:
            port_id = 'sas_' + port
            port_data = self.get_health_template(port_id, False)
            try:
                phys = sas_instance.get_phy_list_for_port(port)
                # eg: [ 'phy-1:0', 'phy-1:1', 'phy-1:2', 'phy-1:3']
            except SASError as err:
                phys = []
                logger.error(self.log.svc_log(err))
            except Exception as err:
                phys = []
                logger.exception(self.log.svc_log(err))
            specifics = {'num_phys': len(phys), 'phys': []}
            health = "OK"
            for phy in phys:
                try:
                    phy_data = sas_instance.get_phy_data(phy)
                except SASError as err:
                    phy_data = {}
                    logger.error(self.log.svc_log(err))
                except Exception as err:
                    phy_data = {}
                    logger.exception(self.log.svc_log(err))
                specifics['phys'].append(phy_data)
                if not phy_data or phy_data['state'] != 'enabled' or \
                   'Gbit' not in phy_data['negotiated_linkrate']:
                    health = "NA"
            self.set_health_data(port_data, health, specifics=[specifics])
            sas_ports_data.append(port_data)
        return sas_ports_data

    def get_nw_ports_info(self):
        """Return the Network ports information."""
        network_cable_data = []
        io_counters = psutil.net_io_counters(pernic=True)

        nw_instance = Network()
        for interface, addrs in psutil.net_if_addrs().items():
            nic_info = self.get_health_template(interface, False)
            specifics = {}
            for addr in addrs:
                if addr.family == socket.AF_INET:
                    specifics["ipV4"] = addr.address
            if interface in io_counters:
                io_info = io_counters[interface]
                # Update, rather than reassign, so the ipV4 entry collected
                # above is preserved
                specifics.update({
                    "networkErrors": io_info.errin + io_info.errout,
                    "droppedPacketsIn": io_info.dropin,
                    "droppedPacketsOut": io_info.dropout,
                    "packetsIn": io_info.packets_recv,
                    "packetsOut": io_info.packets_sent,
                    "trafficIn": io_info.bytes_recv,
                    "trafficOut": io_info.bytes_sent
                })
            # Get the interface health status.
            nw_status, nw_cable_conn_status = \
                self.get_nw_status(nw_instance, interface)
            specifics["nwStatus"] = nw_status
            specifics["nwCableConnStatus"] = nw_cable_conn_status
            # Map and set the interface health status and description.
            map_status = {
                "CONNECTED": "OK",
                "DISCONNECTED": "Disabled/Failed",
                "UNKNOWN": "NA"
            }
            health_status = map_status[nw_cable_conn_status]
            desc = "Network Interface '%s' is %sin good health." % (
                interface, '' if health_status == "OK" else 'not ')
            self.set_health_data(nic_info,
                                 health_status,
                                 description=desc,
                                 specifics=[specifics])
            network_cable_data.append(nic_info)
        return network_cable_data

    def get_nw_status(self, nw_interface, interface):
        """Read & Return the latest network status from sysfs files."""
        try:
            nw_status = nw_interface.get_operational_state(interface)
        except NetworkError as err:
            nw_status = "UNKNOWN"
            logger.error(self.log.svc_log(err))
        except Exception as err:
            nw_status = "UNKNOWN"
            logger.exception(self.log.svc_log(err))
        try:
            nw_cable_conn_status = nw_interface.get_link_state(interface)
        except NetworkError as err:
            nw_cable_conn_status = "UNKNOWN"
            logger.exception(self.log.svc_log(err))
        except Exception as err:
            nw_cable_conn_status = "UNKNOWN"
            logger.exception(self.log.svc_log(err))
        return nw_status, nw_cable_conn_status

    def get_cortx_service_info(self):
        """Get cortx service info in required format."""
        service_info = []
        cortx_services = Service().get_cortx_service_list()
        for service in cortx_services:
            response = self.get_systemd_service_info(service)
            if response is not None:
                service_info.append(response)
        return service_info

    def get_external_service_info(self):
        """Get external service info in required format."""
        service_info = []
        external_services = Service().get_external_service_list()
        for service in external_services:
            response = self.get_systemd_service_info(service)
            if response is not None:
                service_info.append(response)
        return service_info

    def get_systemd_service_info(self, service_name):
        """Get info of specified service using dbus API."""
        try:
            unit = Service()._bus.get_object(
                const.SYSTEMD_BUS,
                Service()._manager.LoadUnit(service_name))
            properties_iface = Interface(unit, dbus_interface=PROPERTIES_IFACE)
        except DBusException as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to initialize {service_name} due to {err}"))
            return None
        path_array = properties_iface.Get(const.SERVICE_IFACE, 'ExecStart')
        try:
            command_line_path = str(path_array[0][0])
        except IndexError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to find {service_name} path due to {err}"))
            command_line_path = "NA"

        is_installed = (command_line_path != "NA" or 'invalid' in
                        properties_iface.Get(const.UNIT_IFACE, 'UnitFileState'))
        uid = str(properties_iface.Get(const.UNIT_IFACE, 'Id'))
        if not is_installed:
            health_status = "NA"
            health_description = f"Software enabling {uid} is not installed"
            recommendation = "NA"
            specifics = [{
                "service_name": uid,
                "description": "NA",
                "installed": str(is_installed).lower(),
                "pid": "NA",
                "state": "NA",
                "substate": "NA",
                "status": "NA",
                "license": "NA",
                "version": "NA",
                "command_line_path": "NA"
            }]
        else:
            service_license = "NA"
            version = "NA"
            service_description = str(
                properties_iface.Get(const.UNIT_IFACE, 'Description'))
            state = str(properties_iface.Get(const.UNIT_IFACE, 'ActiveState'))
            substate = str(properties_iface.Get(const.UNIT_IFACE, 'SubState'))
            service_status = 'enabled' if 'disabled' not in properties_iface.Get(
                const.UNIT_IFACE, 'UnitFileState') else 'disabled'
            pid = "NA" if state == "inactive" else str(
                properties_iface.Get(const.SERVICE_IFACE, 'ExecMainPID'))
            try:
                version = Service().get_service_info_from_rpm(uid, "VERSION")
            except ServiceError as err:
                logger.error(
                    self.log.svc_log(
                        f"Unable to get service version due to {err}"))
            try:
                service_license = Service().get_service_info_from_rpm(
                    uid, "LICENSE")
            except ServiceError as err:
                logger.error(
                    self.log.svc_log(
                        f"Unable to get service license due to {err}"))

            specifics = [{
                "service_name": uid,
                "description": service_description,
                "installed": str(is_installed).lower(),
                "pid": pid,
                "state": state,
                "substate": substate,
                "status": service_status,
                "license": service_license,
                "version": version,
                "command_line_path": command_line_path
            }]
            if service_status == 'enabled' and state == 'active' \
                    and substate == 'running':
                health_status = 'OK'
                health_description = f"{uid} is in good health"
                recommendation = "NA"
            else:
                health_status = state
                health_description = f"{uid} is not in good health"
                recommendation = const.DEFAULT_RECOMMENDATION

        service_info = self.get_health_template(uid, is_fru=False)
        self.set_health_data(service_info, health_status, health_description,
                             recommendation, specifics)
        return service_info

    def get_raid_info(self):
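        """Update and return RAID information in specific format."""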
        raids_data = []
        for raid in RAIDs.get_configured_raids():
            raid_data = self.get_health_template(raid.id, False)
            health, description = raid.get_health()
            devices = raid.get_devices()
            specifics = [{
                "location": raid.raid,
                "data_integrity_status": raid.get_data_integrity_status(),
                "devices": devices
            }]
            self.set_health_data(raid_data,
                                 health,
                                 specifics=specifics,
                                 description=description)
            raids_data.append(raid_data)
        return raids_data

    @staticmethod
    def get_disk_overall_usage():
        units_factor_GB = 1000000000
        overall_usage = {
            "totalSpace": f'{psutil.disk_usage("/")[0] // units_factor_GB} GB',
            "usedSpace": f'{psutil.disk_usage("/")[1] // units_factor_GB} GB',
            "freeSpace": f'{psutil.disk_usage("/")[2] // units_factor_GB} GB',
            "diskUsedPercentage": psutil.disk_usage("/")[3],
        }
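        # e.g. {'totalSpace': '447 GB', 'usedSpace': '210 GB',
        #       'freeSpace': '237 GB', 'diskUsedPercentage': 47.1}  (hypothetical)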
        return overall_usage

    def get_disks_info(self):
        """Update and return server drive information in specific format."""
        disks = []
        for disk in Disk.get_disks():
            uid = disk.path if disk.path else disk.id
            disk_health = self.get_health_template(uid, True)
            health_data = disk.get_health()
            health = "OK" if (health_data['SMART_health']
                              == "PASSED") else "Fault"
            self.set_health_data(disk_health,
                                 health,
                                 specifics=[{
                                     "SMART": health_data
                                 }])
            disks.append(disk_health)
        logger.debug(self.log.svc_log(f"Disk Health Data:{disks}"))
        return disks

    def get_psu_info(self):
        """Update and return PSU information in specific format."""
        psus_health_data = []
        for psu in self.get_psus():
            data = self.get_health_template(f'{psu["Location"]}', True)
            health = "OK" if (psu["Status"] == "Present, OK") else "Fault"
            self.set_health_data(data, health, specifics=psu)
            psus_health_data.append(data)
        logger.debug(self.log.svc_log(f"PSU Health Data:{psus_health_data}"))
        return psus_health_data

    @staticmethod
    def get_psus():
        response, _, _ = SimpleProcess("dmidecode -t 39").run()
        matches = re.findall(
            "System Power Supply|Power Unit Group:.*|"
            "Location:.*|Name:.*|Serial Number:.*|"
            "Max Power Capacity:.*|Status: .*|"
            "Plugged:.*|Hot Replaceable:.*", response.decode())
        psus = []
        stack = []
        while matches:
            item = matches.pop()
            while item != "System Power Supply":
                stack.append(item)
                item = matches.pop()
            psu = {}
            while stack:
                key, value = stack.pop().strip().split(":")
                psu[key] = value.strip()
            psus.append(psu)
        return psus
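
For context, get_psus() above folds the flat regex matches from dmidecode -t 39 back into one dict per power supply. A sketch of the expected shape (values hypothetical):

# Hypothetical dmidecode -t 39 fragment:
#
#   System Power Supply
#           Power Unit Group: 1
#           Location: PSU1
#           Status: Present, OK
#
# get_psus() would then return:
#   [{'Power Unit Group': '1', 'Location': 'PSU1', 'Status': 'Present, OK'}]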