def __init__(self):
    super(RAIDsensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
    # Current RAID status information
    self._RAID_status = None
    # Location of hpi data directory populated by dcs-collector
    self._start_delay = 10
    # Flag to indicate suspension of module
    self._suspended = False
    self.os_utils = OSUtils()
def get_alert(cls, service, alert):
    if service.state in ["active", "failed"]:
        description = alert.description.format(
            service.name, service.state, service.threshold_waiting_time)
    else:
        description = alert.description.format(
            service.name, service.state, service.nonactive_threshold)
    return {
        "sensor_request_type": {
            "service_status_alert": {
                "host_id": OSUtils.get_fqdn(),
                "severity": SeverityReader().map_severity(alert.alert_type),
                "alert_id": MonUtils.get_alert_id(str(int(time.time()))),
                "alert_type": alert.alert_type,
                "info": {
                    "resource_type": cls.RESOURCE_TYPE,
                    "resource_id": service.name,
                    "event_time": str(int(time.time())),
                    "description": description,
                    "impact": alert.impact.format(service.name),
                    "recommendation": alert.recommendation,
                },
                "specific_info": {
                    "service_name": service.name,
                    "previous_state": service.previous_state,
                    "state": service.state,
                    "previous_substate": service.previous_substate,
                    "substate": service.substate,
                    "previous_pid": service.previous_pid,
                    "pid": service.pid,
                }
            }
        }
    }
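# Illustrative sketch (not part of the module): how the service/alert objects
# consumed by get_alert() above drive the description text. The field values and
# the format string below are hypothetical stand-ins; the real objects come from
# the service monitor, together with OSUtils/SeverityReader/MonUtils.
from types import SimpleNamespace

_service = SimpleNamespace(name="statsd.service", state="failed",
                           threshold_waiting_time=60, nonactive_threshold=60)
_alert = SimpleNamespace(description="{} is in {} state for more than {} seconds.")

# Same branching as get_alert(): active/failed states use the waiting-time
# threshold, any other state uses the non-active threshold.
if _service.state in ["active", "failed"]:
    _description = _alert.description.format(
        _service.name, _service.state, _service.threshold_waiting_time)
else:
    _description = _alert.description.format(
        _service.name, _service.state, _service.nonactive_threshold)
print(_description)  # statsd.service is in failed state for more than 60 seconds.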
def __init__(self, utility_instance=None):
    """init method"""
    super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)
    # Initialize the utility instance
    self._utility_instance = utility_instance
    self.phy_dir_to_linkrate_mapping = None
    # Flag to indicate suspension of module
    self._suspended = False
    self._count = 0
    self.phy_link_count = 0
    self.sas_ports_status = {}
    self.port_phy_list_dict = {}
    self.sas_phy_stored_alert = None
    self.os_utils = OSUtils()
def __init__(self):
    super(RealStorLogicalVolumeSensor, self).__init__(
        self.SENSOR_NAME, self.PRIORITY)
    self._faulty_disk_group_file_path = None
    self._faulty_logical_volume_file_path = None
    self.rssencl = singleton_realstorencl

    # logical volumes persistent cache
    self._logical_volume_prcache = None
    # disk groups persistent cache
    self._disk_group_prcache = None

    # Holds Disk Groups with faults. Used for future reference.
    self._previously_faulty_disk_groups = {}
    # Holds Logical Volumes with faults. Used for future reference.
    self._previously_faulty_logical_volumes = {}

    self.pollfreq_DG_logical_volume_sensor = \
        int(Conf.get(SSPL_CONF,
            f"{self.rssencl.CONF_REALSTORLOGICALVOLUMESENSOR}>{POLLING_FREQUENCY_OVERRIDE}", 10))
    if self.pollfreq_DG_logical_volume_sensor == 0:
        self.pollfreq_DG_logical_volume_sensor = self.rssencl.pollfreq

    # Flag to indicate suspension of module
    self._suspended = False
    self._event = Event()
    self.os_utils = OSUtils()

    cvg_info = Conf.get(GLOBAL_CONF, CVG_INFO_KEY)
    self.cvg_info_dict = {}
    if cvg_info:
        self.cvg_info_dict = {cvg['name']: idx for idx, cvg in
                              enumerate(cvg_info) if 'name' in cvg}
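# Illustrative sketch (not part of the module): the cvg name -> index mapping
# built at the end of __init__ above, run against a hypothetical cvg_info
# payload (the real one is read from GLOBAL_CONF under CVG_INFO_KEY).
_cvg_info = [{"name": "cvg-01", "dg": "dg00"}, {"dg": "dg01"}, {"name": "cvg-02"}]
_cvg_info_dict = {cvg['name']: idx for idx, cvg in enumerate(_cvg_info) if 'name' in cvg}
print(_cvg_info_dict)  # {'cvg-01': 0, 'cvg-02': 2} -- entries without a 'name' key are skipped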
class NodeData(Debug): """Obtains data about the node and makes it available""" SENSOR_NAME = "NodeData" # conf attribute initialization PROBE = 'probe' @staticmethod def name(): """@return: name of the module.""" return NodeData.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return ("Server CPU, network, disk space, process and local mount " "data can not be monitored.") def __init__(self): super(NodeData, self).__init__() self.os_utils = OSUtils() self._epoch_time = str(int(time.time())) # Total number of CPUs self.cpus = psutil.cpu_count() self.host_id = self.os_utils.get_fqdn() # Calculate the load averages on separate blocking threads self.load_1min_average = [] self.load_5min_average = [] self.load_15min_average = [] self.prev_bmcip = None load_1min_avg = threading.Thread(target=self._load_1min_avg).start() load_5min_avg = threading.Thread(target=self._load_5min_avg).start() load_15min_avg = threading.Thread(target=self._load_15min_avg).start() self.conf_reader = ConfigReader() nw_fault_utility = Conf.get( SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs") self._utility_instance = None try: # Creating the instance of ToolFactory class self.tool_factory = ToolFactory() # Get the instance of the utility using ToolFactory self._utility_instance = self._utility_instance or \ self.tool_factory.get_instance(nw_fault_utility) if self._utility_instance: # Initialize the path as /sys/class/net/ self.nw_interface_path = self._utility_instance.get_sys_dir_path( 'net') except KeyError as key_error: logger.error( f'NodeData, Unable to get the instance of {nw_fault_utility} Utility' ) except Exception as err: logger.error( f'NodeData, Problem occured while getting the instance of {nw_fault_utility}' ) def read_data(self, subset, debug, units="MB"): """Updates data based on a subset""" self._set_debug(debug) self._log_debug("read_data, subset: %s, units: %s" % (subset, units)) try: # Determine the units factor value self.units_factor = 1 if units == "GB": self.units_factor = 1000000000 elif units == "MB": self.units_factor = 1000000 elif units == "KB": self.units_factor = 1000 self.host_id = self.os_utils.get_fqdn() # get_fqdn() function checks the socket.gethostname() to get the host name if it not available # then it try to find host name from socket.gethostbyaddr(socket.gethostname())[0] and return the # meaningful host name. 
            self.local_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S %Z')

            # Branch off and gather data based upon value sent into subset
            if subset == "host_update":
                self._get_host_update_data()
            elif subset == "local_mount_data":
                self._get_local_mount_data()
            elif subset == "cpu_data":
                self._get_cpu_data()
            elif subset == "if_data":
                self._get_if_data()
            elif subset == "disk_space_alert":
                self._get_disk_space_alert_data()
        except Exception as e:
            raise Exception(f"Failed to read data, {e}")
        return True

    def _get_host_update_data(self):
        """Retrieves node information for the host_update json message"""
        logged_in_users = []
        uname_keys = ("sysname", "nodename", "version", "release", "machine")
        self.up_time = int(psutil.boot_time())
        self.boot_time = self._epoch_time
        self.uname = dict(zip(uname_keys, os.uname()))
        self.total_memory = dict(psutil.virtual_memory()._asdict())
        self.process_count = len(psutil.pids())
        for users in psutil.users():
            logged_in_users.append(dict(users._asdict()))
        self.logged_in_users = logged_in_users

        # Calculate the current number of running processes at this moment
        total_running_proc = 0
        for proc in psutil.process_iter():
            try:
                pinfo = proc.as_dict(attrs=['status'])
                if pinfo['status'] not in (psutil.STATUS_ZOMBIE,
                                           psutil.STATUS_DEAD,
                                           psutil.STATUS_STOPPED,
                                           psutil.STATUS_IDLE,
                                           psutil.STATUS_SLEEPING):
                    total_running_proc += 1
            except psutil.NoSuchProcess:
                logger.warn(f"(psutil) Process '{proc.name()}' exited unexpectedly.")
        self.running_process_count = total_running_proc

    def _get_local_mount_data(self):
        """Retrieves node information for the local_mount_data json message"""
        self.total_space = int(psutil.disk_usage("/")[0]) // int(self.units_factor)
        self.free_space = int(psutil.disk_usage("/")[2]) // int(self.units_factor)
        self.total_swap = int(psutil.swap_memory()[0]) // int(self.units_factor)
        self.free_swap = int(psutil.swap_memory()[2]) // int(self.units_factor)
        self.free_inodes = int(100 - math.ceil(
            (float(os.statvfs("/").f_files - os.statvfs("/").f_ffree)
             / os.statvfs("/").f_files) * 100))

    def _get_cpu_data(self):
        """Retrieves node information for the cpu_data json message"""
        cpu_core_usage_dict = dict()
        cpu_data = psutil.cpu_times_percent()
        self._log_debug(
            "_get_cpu_data, cpu_data: %s %s %s %s %s %s %s %s %s %s" % cpu_data)

        # csps (cycles per second) is not measured; report a default of 0
        self.csps = 0
        self.user_time = int(cpu_data[0])
        self.nice_time = int(cpu_data[1])
        self.system_time = int(cpu_data[2])
        self.idle_time = int(cpu_data[3])
        self.iowait_time = int(cpu_data[4])
        self.interrupt_time = int(cpu_data[5])
        self.softirq_time = int(cpu_data[6])
        self.steal_time = int(cpu_data[7])
        self.cpu_usage = psutil.cpu_percent(interval=1, percpu=False)

        # Array to hold data about each CPU core
        self.cpu_core_data = []
        index = 0
        while index < self.cpus:
            self._log_debug(
                "_get_cpu_data, index: %s, 1 min: %s, 5 min: %s, 15 min: %s" %
                (index, self.load_1min_average[index],
                 self.load_5min_average[index],
                 self.load_15min_average[index]))
            cpu_core_data = {
                "coreId": index,
                "load1MinAvg": int(self.load_1min_average[index]),
                "load5MinAvg": int(self.load_5min_average[index]),
                "load15MinAvg": int(self.load_15min_average[index]),
                "ips": 0
            }
            self.cpu_core_data.append(cpu_core_data)
            index += 1

    def _get_if_data(self):
        """Retrieves node information for the if_data json message"""
        net_data = psutil.net_io_counters(pernic=True)
        # Array to hold data about each network interface
        self.if_data = []
        bmc_data = self._get_bmc_info()
        for interface, if_data in net_data.items():
            self._log_debug("_get_if_data, interface: %s %s" % (interface, net_data))
            nw_status = self._fetch_nw_status()
            nw_cable_conn_status = self.fetch_nw_cable_conn_status(interface)
            if_data = {
                "ifId": interface,
                "networkErrors": (net_data[interface].errin +
                                  net_data[interface].errout),
                "droppedPacketsIn": net_data[interface].dropin,
                "packetsIn": net_data[interface].packets_recv,
                "trafficIn": net_data[interface].bytes_recv,
                "droppedPacketsOut": net_data[interface].dropout,
                "packetsOut": net_data[interface].packets_sent,
                "trafficOut": net_data[interface].bytes_sent,
                "nwStatus": nw_status[interface][0],
                "ipV4": nw_status[interface][1],
                "nwCableConnStatus": nw_cable_conn_status
            }
            self.if_data.append(if_data)
        self.if_data.append(bmc_data)

    def _fetch_nw_status(self):
        nw_dict = {}
        nws = os.popen("ip --br a | awk '{print $1, $2, $3}'").read().split('\n')[:-1]
        for nw in nws:
            if nw.split(' ')[2]:
                ip = nw.split(' ')[2].split("/")[0]
            else:
                ip = ""
            nw_dict[nw.split(' ')[0]] = [nw.split(' ')[1], ip]
        logger.debug("network info going is : {}".format(nw_dict))
        return nw_dict

    def fetch_nw_cable_conn_status(self, interface):
        carrier_status = None
        try:
            carrier_status = Network().get_link_state(interface)
        except NetworkError as err:
            # NetworkError i.e. all OSError exceptions indicate that
            # the carrier file is not available to access, which
            # constitutes the UNKNOWN status for the network cable.
            logger.debug(err)
            carrier_status = "UNKNOWN"
        except Exception as e:
            # All other exceptions are unexpected and are logged as errors.
            logger.exception(
                "Problem occurred while reading from nw carrier file:"
                f" {self.nw_interface_path}/{interface}/carrier. Error: {e}")
        return carrier_status

    def _get_bmc_info(self):
        """
        nwCableConnection will be default UNKNOWN,
        Until solution to find bmc eth port cable connection status is found.
        """
        try:
            bmcdata = {'ifId': 'ebmc0', 'ipV4Prev': "", 'ipV4': "",
                       'nwStatus': "DOWN", 'nwCableConnStatus': 'UNKNOWN'}
            ipdata = sp.Popen("sudo ipmitool lan print", shell=True,
                              stdout=sp.PIPE, stderr=sp.PIPE).communicate()[0].decode().strip()
            bmcip = re.findall("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ipdata)
            if bmcip:
                bmcip = bmcip[0]
                pingbmchost = "ping -c1 -W1 -q " + bmcip
                child = sp.Popen(pingbmchost.split(), stdout=sp.PIPE)
                streamdata = child.communicate()[0]  # child must be communicated before fetching return code.
retcode = child.returncode if self.prev_bmcip is not None and self.prev_bmcip != bmcip: bmcdata['ipV4Prev'] = self.prev_bmcip bmcdata['ipV4'] = bmcip self.prev_bmcip = bmcip else: self.prev_bmcip = bmcdata['ipV4Prev'] = bmcdata[ 'ipV4'] = bmcip if retcode == 0: bmcdata['nwStatus'] = "UP" else: logger.warn("BMC Host:{0} is not reachable".format(bmcip)) except Exception as e: logger.error( "Exception occurs while fetching bmc_info:{}".format(e)) return bmcdata def _get_disk_space_alert_data(self): """Retrieves node information for the disk_space_alert_data json message""" self.total_space = int(psutil.disk_usage("/")[0]) // int( self.units_factor) self.free_space = int(psutil.disk_usage("/")[2]) // int( self.units_factor) self.disk_used_percentage = psutil.disk_usage("/")[3] def _load_1min_avg(self): """Loop forever calculating the one minute average load""" # Initialize list to -1 indicating the time interval has not occurred yet index = 0 while index < self.cpus: self.load_1min_average.append(-1) index += 1 while True: # API call blocks for one minute and then returns the value self.load_1min_average = psutil.cpu_percent(interval=1, percpu=True) def _load_5min_avg(self): """Loop forever calculating the five minute average load""" # Initialize list to -1 indicating the time interval has not occurred yet index = 0 while index < self.cpus: self.load_5min_average.append(-1) index += 1 while True: # API call blocks for five minutes and then returns the value self.load_5min_average = psutil.cpu_percent(interval=5, percpu=True) def _load_15min_avg(self): """Loop forever calculating the fifteen minute average load""" # Initialize list to -1 indicating the time interval has not occurred yet index = 0 while index < self.cpus: self.load_15min_average.append(-1) index += 1 while True: # API call blocks for fifteen minutes and then returns the value self.load_15min_average = psutil.cpu_percent(interval=15, percpu=True)
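# Illustrative sketch (not part of the module): the parsing performed by
# NodeData._fetch_nw_status() above, run against a hard-coded sample instead of
# live `ip --br a` output so it can be executed anywhere. It splits on any
# whitespace rather than the awk-normalized single spaces used in the method.
_sample = (
    "lo               UNKNOWN        127.0.0.1/8\n"
    "eth0             UP             10.230.244.11/22\n"
    "eth1             DOWN\n"
)
_nw_dict = {}
for _nw in _sample.split('\n')[:-1]:
    _fields = _nw.split()
    _ip = _fields[2].split('/')[0] if len(_fields) > 2 else ""
    _nw_dict[_fields[0]] = [_fields[1], _ip]
print(_nw_dict)  # {'lo': ['UNKNOWN', '127.0.0.1'], 'eth0': ['UP', '10.230.244.11'], 'eth1': ['DOWN', '']}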
class RealStorFanSensor(SensorThread, InternalMsgQ): SENSOR_NAME = "RealStorFanSensor" SENSOR_TYPE = "enclosure_fan_module_alert" RESOURCE_TYPE = "enclosure:hw:fan" PRIORITY = 1 # Fan Modules directory name FAN_MODULES_DIR = "fanmodules" # Dependency list DEPENDENCIES = { "plugins": ["RealStorEnclMsgHandler"], "rpms": [] } @staticmethod def name(): """@return: name of the monitoring module.""" return RealStorFanSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Fan modules in storage enclosure can not be monitored." @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. """ return RealStorFanSensor.DEPENDENCIES def __init__(self): super(RealStorFanSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) self.rssencl = singleton_realstorencl self._faulty_fan_file_path = None self._faulty_fan_modules_list = {} self._fan_modules_list = {} # fan modules psus persistent cache self._fanmodule_prcache = None self.pollfreq_fansensor = \ int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORFANSENSOR}>{POLLING_FREQUENCY_OVERRIDE}", 0)) if self.pollfreq_fansensor == 0: self.pollfreq_fansensor = self.rssencl.pollfreq # Flag to indicate suspension of module self._suspended = False self._event = Event() self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, products): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RealStorFanSensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RealStorFanSensor, self).initialize_msgQ(msgQlist) self._fanmodule_prcache = os.path.join(self.rssencl.frus, \ self.FAN_MODULES_DIR) # Persistence file location. This file stores faulty FanModule data self._faulty_fan_file_path = os.path.join( self._fanmodule_prcache, "fanmodule_data.json") # Load faulty Fan Module data from file if available self._faulty_fan_modules_list = store.get(\ self._faulty_fan_file_path) if self._faulty_fan_modules_list is None: self._faulty_fan_modules_list = {} store.put(self._faulty_fan_modules_list,\ self._faulty_fan_file_path) return True def read_data(self): """Return the Current fan_module information""" return self._fan_modules_list def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(30, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() # Periodically check if there is any fault in the fan_module self._check_for_fan_module_fault() self._scheduler.enter(self.pollfreq_fansensor, self._priority, self.run, ()) def _check_for_fan_module_fault(self): """Iterates over fan modules list. 
        maintains a dictionary in order to keep track of the previous health
        of the FRU, which is used to set alert_type"""

        self._fan_modules_list = self._get_fan_modules_list()
        alert_type = None

        if not self._fan_modules_list:
            return

        try:
            for fan_module in self._fan_modules_list:
                fru_status = fan_module.get("health").lower()
                durable_id = fan_module.get("durable-id").lower()
                health_reason = fan_module.get("health-reason").lower()

                if fru_status == self.rssencl.HEALTH_FAULT and \
                        self._check_if_fan_module_is_installed(health_reason):
                    if durable_id not in self._faulty_fan_modules_list:
                        alert_type = self.rssencl.FRU_MISSING
                        self._faulty_fan_modules_list[durable_id] = alert_type
                    else:
                        prev_alert_type = self._faulty_fan_modules_list[durable_id]
                        if prev_alert_type != self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_MISSING
                            self._faulty_fan_modules_list[durable_id] = alert_type
                elif fru_status == self.rssencl.HEALTH_FAULT or \
                        fru_status == self.rssencl.HEALTH_DEGRADED:
                    if durable_id not in self._faulty_fan_modules_list:
                        alert_type = self.rssencl.FRU_FAULT
                        self._faulty_fan_modules_list[durable_id] = alert_type
                    else:
                        prev_alert_type = self._faulty_fan_modules_list[durable_id]
                        if prev_alert_type != self.rssencl.FRU_FAULT:
                            alert_type = self.rssencl.FRU_FAULT
                            self._faulty_fan_modules_list[durable_id] = alert_type
                elif fru_status == self.rssencl.HEALTH_OK:
                    if durable_id in self._faulty_fan_modules_list:
                        prev_alert_type = self._faulty_fan_modules_list[durable_id]
                        if prev_alert_type == self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_INSERTION
                        else:
                            alert_type = self.rssencl.FRU_FAULT_RESOLVED
                        del self._faulty_fan_modules_list[durable_id]

                # Persist faulty Fan Module list to file only if there is any
                # type of alert generated
                if alert_type:
                    internal_json_message = \
                        self._create_internal_json_msg(fan_module, alert_type)
                    self._send_json_message(internal_json_message)
                    # Wait till msg is sent to message bus or added in consul for resending.
                    # If timed out, do not update cache and revert in-memory cache.
                    # So, in the next iteration the change can be detected again.
                    if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                        store.put(self._faulty_fan_modules_list,
                                  self._faulty_fan_file_path)
                    else:
                        self._faulty_fan_modules_list = store.get(self._faulty_fan_file_path)
                    alert_type = None
        except Exception as e:
            logger.exception(e)

    def _check_if_fan_module_is_installed(self, health_reason):
        """ This function returns True if the given string contains the
        substring; otherwise, it returns False.
To achieve this, it uses search method of python re module""" not_installed_health_string = "not installed" return bool(re.search(not_installed_health_string, health_reason)) def _get_fan_modules_list(self): """Returns fan module list using API /show/fan-modules""" url = self.rssencl.build_url( self.rssencl.URI_CLIAPI_SHOWFANMODULES) response = self.rssencl.ws_request( url, self.rssencl.ws.HTTP_GET) if not response: logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Fan-modules status unavailable as ws request {url} failed") return if response.status_code != self.rssencl.ws.HTTP_OK: if url.find(self.rssencl.ws.LOOPBACK) == -1: raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"to get fan-modules failed with http err {response.status_code}") return response_data = json.loads(response.text) fan_modules_list = response_data["fan-modules"] return fan_modules_list def _get_fan_attributes(self, fan_module): """Returns individual fan attributes from each fan-module""" fan_list = [] fans = {} fan_key = "" fan_attribute_list = [ 'status', 'name', 'speed', 'durable-id', 'health', 'fw-revision', 'health-reason', 'serial-number', 'location', 'position', 'part-number', 'health-recommendation', 'hw-revision', 'locator-led' ] fru_fans = fan_module.get("fan", []) for fan in fru_fans: for fan_key in filter(lambda common_key: common_key in fan_attribute_list, fan): fans[fan_key] = fan.get(fan_key) fan_list.append(fans) return fan_list def _create_internal_json_msg(self, fan_module, alert_type): """Creates internal json structure which is sent to realstor_msg_handler for further processing""" fan_module_info_key_list = \ ['name', 'location', 'status', 'health', 'health-reason', 'health-recommendation', 'enclosure-id', 'durable-id', 'position'] fan_module_info_dict = {} fan_module_extended_info_dict = {} fans_list = self._get_fan_attributes(fan_module) for fan_module_key, fan_module_value in fan_module.items(): if fan_module_key in fan_module_info_key_list: fan_module_info_dict[fan_module_key] = fan_module_value fan_module_info_dict["fans"] = fans_list severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) fru = self.rssencl.is_storage_fru('FAN MODULE') resource_id = fan_module_info_dict.get("name", "") host_name = self.os_utils.get_fqdn() info = { "resource_type": self.RESOURCE_TYPE, "fru": fru, "resource_id": resource_id, "event_time": epoch_time } # Creates internal json message request structure. # this message will be passed to the StorageEnclHandler internal_json_msg = json.dumps( {"sensor_request_type": { "enclosure_alert": { "status": "update", "host_id": host_name, "alert_type": alert_type, "severity": severity, "alert_id": alert_id, "info": info, "specific_info": fan_module_info_dict } }}) return internal_json_msg def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def _send_json_message(self, json_msg): """Transmit data to RealStorMsgHandler to be processed and sent out""" self._event.clear() # Send the event to real stor message handler # to generate json message and send out self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event) def suspend(self): """Suspends the module thread. It should be non-blocking""" super(RealStorFanSensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. 
It should be non-blocking""" super(RealStorFanSensor, self).resume() self._suspended = False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RealStorFanSensor, self).shutdown()
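# Illustrative sketch (not part of the module): the alert-type transitions
# applied per fan module in _check_for_fan_module_fault() above, modeled with a
# plain dict and strings standing in for the rssencl HEALTH_*/FRU_* constants.
def _next_fan_alert(health, health_reason, previously_faulty, durable_id):
    """Return the alert to raise (or None) and update the faulty-module cache."""
    if health == "fault" and "not installed" in health_reason:
        if previously_faulty.get(durable_id) != "missing":
            previously_faulty[durable_id] = "missing"
            return "missing"
    elif health in ("fault", "degraded"):
        if previously_faulty.get(durable_id) != "fault":
            previously_faulty[durable_id] = "fault"
            return "fault"
    elif health == "ok" and durable_id in previously_faulty:
        prev = previously_faulty.pop(durable_id)
        return "insertion" if prev == "missing" else "fault_resolved"
    return None

_cache = {}
print(_next_fan_alert("fault", "not installed", _cache, "fan_module_0.0"))  # missing
print(_next_fan_alert("ok", "", _cache, "fan_module_0.0"))                  # insertion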
class CPUFaultSensor(SensorThread, InternalMsgQ): """CPU Fault Sensor which runs on its own thread on each boot up and is responsible for sensing changes in online CPUs using available tool/utility""" SENSOR_NAME = "CPUFaultSensor" PRIORITY = 1 RESOURCE_TYPE = "node:os:cpu:core" # Section in the configuration store SYSTEM_INFORMATION_KEY = "SYSTEM_INFORMATION" CACHE_DIR_NAME = "server" RESOURCE_ID = "CPU-" PROBE = "probe" # Dependency list DEPENDENCIES = {"plugins": ["NodeDataMsgHandler"], "rpms": []} @staticmethod def name(): """@return: name of the module.""" return CPUFaultSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Server CPU presence and status change can not be monitored." def __init__(self, utility_instance=None): """init method""" super(CPUFaultSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) # Initialize the utility instance self._utility_instance = utility_instance # CPU info self.stored_cpu_info = None self.prev_cpu_info = None self.current_cpu_info = None self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, product): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(CPUFaultSensor, self).initialize(conf_reader) super(CPUFaultSensor, self).initialize_msgQ(msgQlist) # get the cpu fault implementor from configuration cpu_fault_utility = Conf.get(SSPL_CONF, f"{self.name().upper()}>{self.PROBE}", 'sysfs') # Creating the instance of ToolFactory class self.tool_factory = ToolFactory() try: # Get the instance of the utility using ToolFactory self._utility_instance = self._utility_instance or \ self.tool_factory.get_instance(cpu_fault_utility) except Exception as err: raise Exception( "Error while initializing. 
" f"Unable to get the instance of {cpu_fault_utility} Utility, {err}" ) self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01') cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME) self.CPU_FAULT_SENSOR_DATA = os.path.join( cache_dir_path, f'CPU_FAULT_SENSOR_DATA_{self._node_id}') return True def read_stored_cpu_info(self): """Read the most recent stored cpu info""" try: if self.stored_cpu_info is None: self.stored_cpu_info = store.get(self.CPU_FAULT_SENSOR_DATA) if self.stored_cpu_info is not None and self._node_id in self.stored_cpu_info.keys( ): self.prev_cpu_info = self.stored_cpu_info[ self._node_id]['CPU_LIST'] except Exception as e: raise Exception(f"Error while reading stored cpu info, {e}") def read_current_cpu_info(self): """Read current cpu info""" try: self.current_cpu_info = self._utility_instance.get_cpu_info() except Exception as e: raise Exception(f"Error while reading current cpu info, {e}") def run(self): """Run the sensor on its own thread""" # Check for debug mode being activated self._read_my_msgQ_noWait() # Read recent stored cpu info self.read_stored_cpu_info() # Store alerts to be sent here self.alerts_for = {} # Specific info field for alerts self.specific_info = [] # Read current cpu info self.read_current_cpu_info() to_update = False # Compare with previous cpu info # If a cpu is present in prev_cpu_info and not present in current_cpu_info : fault alert is generated # If a cpu is present in current_cpu_info and not present in prev_cpu_info : two possibilities # 1) if cpu has an outstanding fault alert : it is a repaired cpu, hence generate fault_resolved # 2) if cpu has no outstanding alert : it is a newly added cpu, do not do anything try: if self.prev_cpu_info: if self.current_cpu_info != self.prev_cpu_info: # Create a set of all relevant cpus cpu_list = set(self.prev_cpu_info + self.current_cpu_info) # Iterate through the set for cpu in cpu_list: if cpu not in self.current_cpu_info and cpu not in self.stored_cpu_info[ self._node_id]['FAULT_LIST']: # This is a failed cpu self.stored_cpu_info[ self._node_id]['FAULT_LIST'].append(cpu) self.alerts_for[cpu] = "fault" elif cpu not in self.prev_cpu_info and cpu in self.stored_cpu_info[ self._node_id]['FAULT_LIST']: # This is a repaired cpu self.alerts_for[cpu] = "fault_resolved" # Update stored cpu info for next run self.stored_cpu_info[ self._node_id]['CPU_LIST'] = self.current_cpu_info to_update = True else: # Previous cpu info not available, need to store current info if not self.stored_cpu_info: # No info is available self.stored_cpu_info = {} # Add info for the current node self.stored_cpu_info[self._node_id] = {} self.stored_cpu_info[ self._node_id]['CPU_LIST'] = self.current_cpu_info self.stored_cpu_info[self._node_id]['FAULT_LIST'] = [] # Update stored cpu info to_update = True except Exception as e: raise Exception(f"Failed while processing cpu info, {e}") # Send alerts for cpu, alert_type in self.alerts_for.items(): if self._generate_alert( cpu, alert_type) == True and alert_type == "fault_resolved": # Delete from the FAULT_LIST self.stored_cpu_info[self._node_id]['FAULT_LIST'].remove(cpu) # Update stored cpu info if to_update: store.put(self.stored_cpu_info, self.CPU_FAULT_SENSOR_DATA) def fill_specific_info(self): """Fills the specific info to be sent via alert""" if not self.specific_info: # Create a set of all relevant cpus cpu_list = set(self.prev_cpu_info + self.current_cpu_info) # Iterate through the set for cpu in cpu_list: item = {} item['resource_id'] = self.RESOURCE_ID + str(cpu) # 
Keep default state online item['state'] = "online" if cpu in self.alerts_for.keys(): if self.alerts_for[cpu] == "fault": item['state'] = "offline" self.specific_info.append(item) def _create_json_message(self, cpu, alert_type): """Creates a defined json message structure which can flow inside SSPL modules""" internal_json_msg = None severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) host_name = self.os_utils.get_fqdn() # Populate specific info self.fill_specific_info() alert_specific_info = self.specific_info res_id = self.RESOURCE_ID + str(cpu) for item in alert_specific_info: if item['resource_id'] == res_id: if alert_type == "fault": description = "Faulty CPU detected, %s state is %s" % ( item['resource_id'], item["state"]) else: description = "Fault resolved for CPU, %s state is %s" % ( item['resource_id'], item["state"]) info = { "resource_type": self.RESOURCE_TYPE, "resource_id": self.RESOURCE_ID + str(cpu), "event_time": epoch_time, "description": description } internal_json_msg = json.dumps({ "sensor_request_type": { "node_data": { "status": "update", "host_id": host_name, "alert_type": alert_type, "severity": severity, "alert_id": alert_id, "info": info, "specific_info": alert_specific_info } } }) return internal_json_msg def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def _generate_alert(self, cpu, alert_type): """Queues the message to NodeData Message Handler""" try: json_msg = self._create_json_message(cpu, alert_type) if json_msg: self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg) return True except Exception as e: logger.error(f"Exception while sending alert : {e}") return False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(CPUFaultSensor, self).shutdown()
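# Illustrative sketch (not part of the module): the prev/current comparison done
# in CPUFaultSensor.run() above, with plain lists standing in for the persisted
# store. A CPU that disappears raises a fault; a CPU that reappears while still
# on the fault list raises fault_resolved.
_prev_cpus, _fault_list = [0, 1, 3], [2]
_current_cpus = [0, 1, 2]      # CPU 3 went offline, previously faulty CPU 2 is back

_alerts = {}
for _cpu in set(_prev_cpus + _current_cpus):
    if _cpu not in _current_cpus and _cpu not in _fault_list:
        _fault_list.append(_cpu)
        _alerts[_cpu] = "fault"
    elif _cpu not in _prev_cpus and _cpu in _fault_list:
        _alerts[_cpu] = "fault_resolved"
print(_alerts)      # {2: 'fault_resolved', 3: 'fault'}
print(_fault_list)  # [2, 3] -- CPU 2 is removed only after its alert is actually sent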
class RealStorDiskSensor(SensorThread, InternalMsgQ): """Monitors RealStor enclosure disks state and raise sspl events for detected faults, insertion,removal events """ SENSOR_NAME = "RealStorDiskSensor" RESOURCE_TYPE = "enclosure:hw:disk" PRIORITY = 1 RSS_DISK_GET_ALL = "all" # Mandatory attributes in disk json data disk_generic_info = [ "enclosure-id", "enclosure-wwn", "slot", "description", "architecture", "interface", "serial-number", "size", "vendor", "model", "revision", "temperature", "status", "LED-status", "locator-LED", "blink", "smart", "health", "health-reason", "health-recommendation" ] # local resource cache latest_disks = {} memcache_disks = {} DISK_IDENTIFIER = "Disk 0." NUMERIC_IDENTIFIER = "numeric" invalidate_latest_disks_info = False # Dependency list DEPENDENCIES = { "plugins": ["RealStorEnclMsgHandler"], "rpms": [] } @staticmethod def name(): """@return: name of the module.""" return RealStorDiskSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Disks in storage enclosure can not be monitored." @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. """ return RealStorDiskSensor.DEPENDENCIES def __init__(self): super(RealStorDiskSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) self.last_alert = None self.rssencl = singleton_realstorencl # disks persistent cache self.disks_prcache = f"{self.rssencl.frus}disks/" self.pollfreq_disksensor = \ int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORDISKSENSOR}>{POLLING_FREQUENCY_OVERRIDE}", 0)) if self.pollfreq_disksensor == 0: self.pollfreq_disksensor = self.rssencl.pollfreq # Flag to indicate suspension of module self._suspended = False self._event = None self._event_wait_results = set() self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, products): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RealStorDiskSensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RealStorDiskSensor, self).initialize_msgQ(msgQlist) return True def read_data(self): """Return the last raised alert, none otherwise""" return self.last_alert def run(self): """Run disk monitoring periodically on its own thread.""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(self.pollfreq_disksensor, self._priority, self.run, ()) return # Allow RealStor Encl MC to start services. 
#time.sleep(self.rssencl.REALSTOR_MC_BOOTWAIT) # Check for debug mode being activated self._read_my_msgQ_noWait() # poll all disk status and raise events if # insertion/removal detected self._rss_check_disks_presence() #Do not proceed further if latest disks info can't be validated due to store function error if not self.invalidate_latest_disks_info: # Polling system status self.rssencl.get_system_status() # check for disk faults & raise if found self._rss_check_disk_faults() else: logger.warn("Can not validate disk faults or presence due to persistence store error") # Reset debug mode if persistence is not enabled self._disable_debug_if_persist_false() # Fire every configured seconds to poll disks status self._scheduler.enter(self.pollfreq_disksensor, self._priority, self.run, ()) def _rss_raise_disk_alert(self, alert_type, disk_info): """Raise disk alert with supported alert type""" #logger.debug("Raise - alert type {0}, info {1}".format(alert_type,disk_info)) if not disk_info: logger.warn("disk_info None, ignoring") return if alert_type not in self.rssencl.fru_alerts: logger.error(f"Supplied alert type [{alert_type}] not supported") return # form json with default values disk = dict.fromkeys(self.disk_generic_info, "NA") disk['slot'] = -1 disk['blink'] = 0 disk['enclosure-id'] = 0 # Build data for must fields in fru disk data for item in self.disk_generic_info: if item in disk_info: disk[item] = disk_info[item] encl = self.rssencl.ENCL_FAMILY disk[encl] = self.rssencl.LDR_R1_ENCL # Build data for platform specific fields in fru disk data # get remaining extra key value pairs from passed disk_info extended_info = {key:disk_info[key] for key in disk_info if key not in\ disk and self.NUMERIC_IDENTIFIER not in key} # notify realstor encl msg handler self._send_json_msg(alert_type, disk, extended_info) def _rss_check_disks_presence(self): """Match cached realstor disk info with latest retrieved disks info """ self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL) if not self.memcache_disks: if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK: logger.warn("Last polled drives info in-memory cache " "unavailable , unable to check drive presence change") return if not self.latest_disks: if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK: logger.warn("Latest polled drives info in-memory cache " "unavailable, unable to check drive presence change") return # keys are disk slot numbers removed_disks = set(self.memcache_disks.keys()) - set(self.latest_disks.keys()) inserted_disks = set(self.latest_disks.keys()) - set(self.memcache_disks.keys()) # get populated slots in both caches populated = set(self.memcache_disks.keys()) & set(self.latest_disks.keys()) # check for replaced disks for slot in populated: if self.memcache_disks[slot]['serial-number'] != self.latest_disks[slot]['serial-number']: if slot not in removed_disks: removed_disks.add(slot) if slot not in inserted_disks: inserted_disks.add(slot) # If no difference seen between cached & latest set of disk list, # means no disk removal or insertion happened if not (removed_disks or inserted_disks): #logger.info("Disk presence state _NOT_ changed !!!") return self._event = Event() for slot in removed_disks: #get removed drive data from disk cache disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev" path_exists, _ = store.exists(disk_datafile) if not path_exists: disk_datafile = f"{self.disks_prcache}disk_{slot}.json" disk_info = store.get(disk_datafile) #raise alert for missing drive self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, 
disk_info) # Wait till msg is sent to message bus or added in consul for resending. # If timed out, do not update cache if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT): store.delete(disk_datafile) self._event.clear() self._event = None for slot in inserted_disks: #get inserted drive data from disk cache disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json") #raise alert for added drive self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info) # Update health status for inserted disk in memfault cache, # to raise fault alert after insertion if inserted disk status is not OK. if disk_info["health"] != "OK": for id_fault, cached_fault in enumerate(self.rssencl.memcache_faults): #fetch disk slot from component_id present in memcache_faults. try: component_id = cached_fault["component-id"] if component_id.startswith('Disk 0'): disk_id = int(cached_fault["component-id"].split()[1].split('.')[1]) if disk_id == slot: self.rssencl.memcache_faults[id_fault]['health'] = "OK" except Exception as e: logger.error(f"Error in updating health status for \ inserted disk in memfault cache {e}") # Update cached disk data after comparison self.memcache_disks = self.latest_disks self.rssencl.memcache_frus.update({"disks":self.memcache_disks}) return def rss_cliapi_poll_disks(self, disk): """Retreive realstor disk info using cli api /show/disks""" # make ws request url = self.rssencl.build_url( self.rssencl.URI_CLIAPI_SHOWDISKS) if(disk != self.RSS_DISK_GET_ALL): diskId = disk.partition("0.")[2] if(diskId.isdigit()): url = f"{url}/{disk}" url = f"{url}/detail" response = self.rssencl.ws_request( url, self.rssencl.ws.HTTP_GET) if not response: logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable as ws request {url} failed") return if response.status_code != self.rssencl.ws.HTTP_OK: if url.find(self.rssencl.ws.LOOPBACK) == -1: raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"to poll disks failed with err {response.status_code}") return try: jresponse = json.loads(response.content) except ValueError as badjson: logger.error(f"{url} returned mal-formed json:\n{badjson}") if jresponse: api_resp = self.rssencl.get_api_status(jresponse['status']) #logger.debug("%s api response:%d" % (url.format(),api_resp)) if ((api_resp == -1) and (response.status_code == self.rssencl.ws.HTTP_OK)): logger.warn("/show/disks api response unavailable, " "marking success as http code is 200") api_resp = 0 if api_resp == 0: drives = jresponse['drives'] # reset latest drive cache to build new self.latest_disks = {} self.invalidate_latest_disks_info = False for drive in drives: slot = drive.get("slot", -1) sn = drive.get("serial-number", "NA") health = drive.get("health", "NA") if slot != -1: self.latest_disks[slot] = {"serial-number":sn, "health":health} #dump drive data to persistent cache dcache_path = f"{self.disks_prcache}disk_{slot}.json" # If drive is replaced, previous drive info needs # to be retained in disk_<slot>.json.prev file and # then only dump new data to disk_<slot>.json path_exists, ret_val = store.exists(dcache_path) if path_exists and ret_val == "Success": prevdrive = store.get(dcache_path) if prevdrive is not None: prevsn = prevdrive.get("serial-number","NA") prevhealth = prevdrive.get("health", "NA") if prevsn != sn or prevhealth != health: # Rename path store.put(store.get(dcache_path), dcache_path + ".prev") store.delete(dcache_path) store.put(drive, dcache_path) elif not path_exists and ret_val == "Success": store.put(drive, dcache_path) else: # 
Invalidate latest disks info if persistence store error encountered logger.warn(f"store.exists {dcache_path} return value {ret_val}") self.invalidate_latest_disks_info = True break if self.invalidate_latest_disks_info is True: # Reset latest disks info self.latest_disks = {} #If no in-memory cache, build from persistent cache if not self.memcache_disks: self._rss_build_disk_cache_from_persistent_cache() # if no memory cache still if not self.memcache_disks: self.memcache_disks = self.latest_disks def _rss_build_disk_cache_from_persistent_cache(self): """Retreive realstor system state info using cli api /show/system""" files = store.get_keys_with_prefix(self.disks_prcache) if not files: logger.debug("No files in Disk cache folder, ignoring") return for filename in files: if filename.startswith('disk_') and filename.endswith('.json'): if f"{filename}.prev" in files: filename = f"{filename}.prev" drive = store.get(self.disks_prcache + filename) slotstr = re.findall("disk_(\d+).json", filename)[0] if not slotstr.isdigit(): logger.debug(f"slot {slotstr} not numeric, ignoring") continue slot = int(slotstr) if drive : sn = drive.get("serial-number","NA") self.memcache_disks[slot] = {"serial-number":sn} #logger.debug("Disk cache built from persistent cache {0}". # format(self.memcache_disks)) def _rss_check_disk_faults(self): """Retreive realstor system state info using cli api /show/system""" if not self.rssencl.check_system_faults_changed(): #logger.debug("System faults state _NOT_ changed !!! ") return try: # Extract new system faults faults = self.rssencl.latest_faults # TODO optimize to avoid nested 'for' loops. # Second 'for' loop in check_new_fault() self._event = Event() if faults: for fault in faults: #logger.debug("Faulty component-id {0}, IDENT {1}"\ # .format(fault["component-id"], self.DISK_IDENTIFIER)) # Check faulting component type if self.DISK_IDENTIFIER in fault["component-id"]: # If fault on disk, get disk full info including health if self.rssencl.check_new_fault(fault): # Extract slot from "component-id":"Disk 0.39" slot = fault["component-id"].split()[1].split('.')[1] # Alert send only if disks_prcache updated with latest disk data if self.latest_disks[int(slot)]["health"] != "OK": #get drive data from disk cache disk_info = store.get( self.disks_prcache+"disk_{0}.json".format(slot)) # raise alert for disk fault self._rss_raise_disk_alert(self.rssencl.FRU_FAULT, disk_info) # To ensure all msg is sent to message bus or added in consul for resending. self._event_wait_results.add( self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT)) self._event.clear() # Check for resolved faults for cached in self.rssencl.memcache_faults: if not any(d.get("component-id", None) == cached["component-id"] \ for d in self.rssencl.latest_faults) and self.DISK_IDENTIFIER in cached["component-id"]: # Extract slot from "component-id":"Disk 0.39" logger.info(f"Found resolved disk fault for {cached['component-id']}") slot = cached["component-id"].split()[1].split('.')[1] # Alert send only if disks_prcache updated with latest disk data if self.latest_disks[int(slot)]["health"] == "OK": # get drive data from disk cache disk_info = store.get( self.disks_prcache+"disk_{0}.json".format(slot)) # raise alert for resolved disk fault self._rss_raise_disk_alert(self.rssencl.FRU_FAULT_RESOLVED, disk_info) # To ensure all msg is sent to message bus or added in consul for resending. 
self._event_wait_results.add( self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT)) self._event.clear() # If all messages are sent to message bus or added in consul for resending. # then only update cache if self._event_wait_results and all(self._event_wait_results): self.rssencl.update_memcache_faults() self._event_wait_results.clear() self._event = None except Exception as e: logger.exception(f"Error in _rss_check_disk_faults {e}") def _gen_json_msg(self, alert_type, details, ext): """ Generate json message""" severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) fru = self.rssencl.is_storage_fru('disk') resource_id = ext.get("durable-id") host_name = self.os_utils.get_fqdn() info = { "resource_type": self.RESOURCE_TYPE, "fru": fru, "resource_id": resource_id, "event_time": epoch_time } specific_info = dict() specific_info.update(details) specific_info.update(ext) for k in specific_info.keys(): if specific_info[k] == "": specific_info[k] = "N/A" json_msg = json.dumps( {"sensor_request_type" : { "enclosure_alert" : { "status": "update", "host_id": host_name, "alert_type": alert_type, "severity": severity, "alert_id": alert_id, "info": info, "specific_info": specific_info }, }}) return json_msg def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def _send_json_msg(self, alert_type, details, ext): """Transmit alert data to RealStorEnclMsgHandler to be processed and sent out """ internal_json_msg = self._gen_json_msg(alert_type, details, ext) self.last_alert = internal_json_msg # Send the event to storage encl message handler to generate json message and send out self._write_internal_msgQ(RealStorEnclMsgHandler.name(), internal_json_msg, self._event) def suspend(self): """Suspends the module thread. It should be non-blocking""" super(RealStorDiskSensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. It should be non-blocking""" super(RealStorDiskSensor, self).resume() self._suspended = False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RealStorDiskSensor, self).shutdown()
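# Illustrative sketch (not part of the module): the slot-set arithmetic used by
# _rss_check_disks_presence() above, with hand-written dictionaries standing in
# for the in-memory and latest disk caches.
_memcache_disks = {0: {"serial-number": "SN-A"}, 1: {"serial-number": "SN-B"},
                   2: {"serial-number": "SN-C"}}
_latest_disks = {0: {"serial-number": "SN-A"}, 2: {"serial-number": "SN-X"},
                 3: {"serial-number": "SN-D"}}

_removed = set(_memcache_disks) - set(_latest_disks)    # slot 1 was pulled out
_inserted = set(_latest_disks) - set(_memcache_disks)   # slot 3 was added
# A replaced drive appears in both caches with a different serial number and is
# reported as a removal followed by an insertion for the same slot.
for _slot in set(_memcache_disks) & set(_latest_disks):
    if _memcache_disks[_slot]["serial-number"] != _latest_disks[_slot]["serial-number"]:
        _removed.add(_slot)
        _inserted.add(_slot)
print(sorted(_removed), sorted(_inserted))  # [1, 2] [2, 3]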
class RealStorPSUSensor(SensorThread, InternalMsgQ): """Monitors PSU data using RealStor API""" SENSOR_NAME = "RealStorPSUSensor" RESOURCE_CATEGORY = "enclosure:hw:psu" PRIORITY = 1 # PSUs directory name PSUS_DIR = "psus" # Dependency list DEPENDENCIES = { "plugins": ["RealStorEnclMsgHandler"], "rpms": [] } @staticmethod def name(): """@return: name of the monitoring module.""" return RealStorPSUSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "PSUs in storage enclosure can not be monitored." @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. """ return RealStorPSUSensor.DEPENDENCIES def __init__(self): super(RealStorPSUSensor, self).__init__( self.SENSOR_NAME, self.PRIORITY) self._faulty_psu_file_path = None self.rssencl = singleton_realstorencl # psus persistent cache self.psu_prcache = None # Holds PSUs with faults. Used for future reference. self._previously_faulty_psus = {} self.pollfreq_psusensor = \ int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORPSUSENSOR}>{POLLING_FREQUENCY_OVERRIDE}", 0)) if self.pollfreq_psusensor == 0: self.pollfreq_psusensor = self.rssencl.pollfreq # Flag to indicate suspension of module self._suspended = False self._event = Event() self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, products): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RealStorPSUSensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RealStorPSUSensor, self).initialize_msgQ(msgQlist) self.psu_prcache = os.path.join(self.rssencl.frus, self.PSUS_DIR) # Persistence file location. This file stores faulty PSU data self._faulty_psu_file_path = os.path.join( self.psu_prcache, "psudata.json") self._log_debug( f"_faulty_psu_file_path: {self._faulty_psu_file_path}") # Load faulty PSU data from file if available self._previously_faulty_psus = store.get(\ self._faulty_psu_file_path) if self._previously_faulty_psus is None: self._previously_faulty_psus = {} store.put(self._previously_faulty_psus,\ self._faulty_psu_file_path) return True def read_data(self): """This method is part of interface. Currently it is not in use. """ return {} def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(10, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() psus = None psus = self._get_psus() if psus: self._get_msgs_for_faulty_psus(psus) # Reset debug mode if persistence is not enabled self._disable_debug_if_persist_false() # Fire every 10 seconds to see if We have a faulty PSU self._scheduler.enter(self.pollfreq_psusensor, self._priority, self.run, ()) def _get_psus(self): """Receives list of PSUs from API. 
URL: http://<host>/api/show/power-supplies """ url = self.rssencl.build_url( self.rssencl.URI_CLIAPI_SHOWPSUS) response = self.rssencl.ws_request( url, self.rssencl.ws.HTTP_GET) if not response: logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: PSUs status unavailable as ws request {url} failed") return if response.status_code != self.rssencl.ws.HTTP_OK: if url.find(self.rssencl.ws.LOOPBACK) == -1: raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"to get power-supplies failed with err {response.status_code}") return response_data = json.loads(response.text) psus = response_data.get("power-supplies") return psus def _get_msgs_for_faulty_psus(self, psus, send_message = True): """Checks for health of psus and returns list of messages to be sent to handler if there are any. """ self._log_debug( f"RealStorPSUSensor._get_msgs_for_faulty_psus -> {psus} {send_message}") faulty_psu_messages = [] internal_json_msg = None psu_health = None durable_id = None alert_type = "" # Flag to indicate if there is a change in _previously_faulty_psus state_changed = False if not psus: return for psu in psus: psu_health = psu["health"].lower() durable_id = psu["durable-id"] psu_health_reason = psu["health-reason"] # Check for missing and fault case if psu_health == self.rssencl.HEALTH_FAULT: self._log_debug("Found fault in PSU {0}".format(durable_id)) alert_type = self.rssencl.FRU_FAULT # Check for removal if self._check_if_psu_not_installed(psu_health_reason): alert_type = self.rssencl.FRU_MISSING state_changed = not (durable_id in self._previously_faulty_psus and self._previously_faulty_psus[durable_id]["alert_type"] == alert_type) if state_changed: self._previously_faulty_psus[durable_id] = { "health": psu_health, "alert_type": alert_type} internal_json_msg = self._create_internal_msg( psu, alert_type) faulty_psu_messages.append(internal_json_msg) # Send message to handler if send_message: self._send_json_msg(internal_json_msg) # Check for fault case elif psu_health == self.rssencl.HEALTH_DEGRADED: self._log_debug("Found degraded in PSU {0}".format(durable_id)) state_changed = durable_id not in self._previously_faulty_psus if state_changed: alert_type = self.rssencl.FRU_FAULT self._previously_faulty_psus[durable_id] = { "health": psu_health, "alert_type": alert_type} internal_json_msg = self._create_internal_msg( psu, alert_type) faulty_psu_messages.append(internal_json_msg) # Send message to handler if send_message: self._send_json_msg(internal_json_msg) # Check for healthy case elif psu_health == self.rssencl.HEALTH_OK: self._log_debug("Found ok in PSU {0}".format(durable_id)) state_changed = durable_id in self._previously_faulty_psus if state_changed: # Send message to handler if send_message: previous_alert_type = \ self._previously_faulty_psus[durable_id]["alert_type"] alert_type = self.rssencl.FRU_FAULT_RESOLVED if previous_alert_type == self.rssencl.FRU_MISSING: alert_type = self.rssencl.FRU_INSERTION internal_json_msg = self._create_internal_msg( psu, alert_type) faulty_psu_messages.append(internal_json_msg) if send_message: self._send_json_msg(internal_json_msg) del self._previously_faulty_psus[durable_id] # Persist faulty PSU list to file only if something is changed if state_changed: # Wait till msg is sent to message bus or added in consul for resending. # If timed out, do not update cache and revert in-memory cache. 
                # So, in next iteration change can be detected
                if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_psus,
                              self._faulty_psu_file_path)
                else:
                    self._previously_faulty_psus = store.get(self._faulty_psu_file_path)
                state_changed = False
            alert_type = ""

        return faulty_psu_messages

    def _get_hostname(self):
        try:
            return self.os_utils.get_fqdn()
        except Exception as e:
            logger.exception("Got exception {} when trying to get hostname"
                             " using getfqdn().".format(e))
            logger.info(" Trying with ip addr command")
        try:
            from subprocess import run, PIPE
            IP_CMD = "ip -f inet addr show scope global up | grep inet"
            IP_REGEX = rb'\b(\d{1,3}(?:\.\d{1,3}){3})/\d{1,2}\b'
            ip_out = run(IP_CMD, stdout=PIPE, shell=True, check=True)
            ip_list = re.findall(IP_REGEX, ip_out.stdout)
            if ip_list:
                return ip_list[0]
        except Exception as e:
            logger.exception("Got exception {} when trying to get hostname"
                             " using ip addr command.".format(e))
        # Ultimate fallback, when we are completely out of options
        logger.info("Using localhost")
        return "localhost"

    def _create_internal_msg(self, psu_detail, alert_type):
        """Forms a dictionary containing info about PSUs to send to
        message handler.
        """
        self._log_debug(
            f"RealStorPSUSensor._create_internal_msg -> {psu_detail} {alert_type}")
        if not psu_detail:
            return {}

        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))
        alert_id = self._get_alert_id(epoch_time)
        fru = self.rssencl.is_storage_fru('POWER_SUPPLY')
        resource_id = psu_detail.get("durable-id")
        host_name = self._get_hostname()

        info = {
            "resource_type": self.RESOURCE_CATEGORY,
            "fru": fru,
            "resource_id": resource_id,
            "event_time": epoch_time
        }

        specific_info = {
            "enclosure-id": psu_detail.get("enclosure-id"),
            "serial-number": psu_detail.get("serial-number"),
            "description": psu_detail.get("description"),
            "revision": psu_detail.get("revision"),
            "model": psu_detail.get("model"),
            "vendor": psu_detail.get("vendor"),
            "location": psu_detail.get("location"),
            "part-number": psu_detail.get("part-number"),
            "fru-shortname": psu_detail.get("fru-shortname"),
            "mfg-date": psu_detail.get("mfg-date"),
            "mfg-vendor-id": psu_detail.get("mfg-vendor-id"),
            "dc12v": psu_detail.get("dc12v"),
            "dc5v": psu_detail.get("dc5v"),
            "dc33v": psu_detail.get("dc33v"),
            "dc12i": psu_detail.get("dc12i"),
            "dc5i": psu_detail.get("dc5i"),
            "dctemp": psu_detail.get("dctemp"),
            "health": psu_detail.get("health"),
            "health-reason": psu_detail.get("health-reason"),
            "health-recommendation": psu_detail.get("health-recommendation"),
            "status": psu_detail.get("status"),
            "durable-id": psu_detail.get("durable-id"),
            "position": psu_detail.get("position"),
        }

        for k in specific_info.keys():
            if specific_info[k] == "":
                specific_info[k] = "N/A"

        # Creates internal json message request structure.
# this message will be passed to the StorageEnclHandler internal_json_msg = json.dumps( {"sensor_request_type": { "enclosure_alert": { "status": "update", "host_id": host_name, "alert_type": alert_type, "severity": severity, "alert_id": alert_id, "info": info, "specific_info": specific_info } }}) return internal_json_msg def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def _send_json_msg(self, json_msg): """Sends JSON message to Handler""" self._log_debug( "RealStorPSUSensor._send_json_msg -> {0}".format(json_msg)) if not json_msg: return self._event.clear() self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event) def _check_if_psu_not_installed(self, health_reason): """Checks if PSU is not installed by checking <not installed> line in health-reason key. It uses re.findall method to check if desired string exists in health-reason. Returns boolean based on length of the list of substrings found in health-reason. So if length is 0, it returns False, else True. """ return bool(re.findall("not installed", health_reason)) def suspend(self): """Suspends the module thread. It should be non-blocking""" super(RealStorPSUSensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. It should be non-blocking""" super(RealStorPSUSensor, self).resume() self._suspended = False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RealStorPSUSensor, self).shutdown()
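# Illustrative sketch, separate from the sensor class above: how the alert id is
# composed and how the "not installed" health-reason check behaves. These helper
# names are hypothetical examples, not part of SSPL.
import re
import time
import uuid

def example_alert_id():
    # Epoch prefix keeps ids roughly time-ordered; the uuid4 hex salt makes them unique.
    return str(int(time.time())) + uuid.uuid4().hex

def example_psu_missing(health_reason):
    # Mirrors the substring check used to map a PSU fault to FRU_MISSING.
    return bool(re.findall("not installed", health_reason))

# example_alert_id()  -> e.g. '1700000000' followed by 32 hex characters
# example_psu_missing("The power supply is not installed.")  -> True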
class RealStorEnclosureSensor(SensorThread, InternalMsgQ): """Monitors Enclosure""" # Dependency list DEPENDENCIES = {"plugins": ["RealStorEnclMsgHandler"], "rpms": []} SENSOR_NAME = "RealStorEnclosureSensor" SENSOR_RESP_TYPE = "enclosure_alert" RESOURCE_CATEGORY = "hw" RESOURCE_TYPE = "enclosure" ENCL_FAULT_RESOLVED_EVENTS = ["The network-port Ethernet link is down for controller A",\ "The network-port Ethernet link is down for controller B",\ "The Management Controller IP address changed",\ "The Management Controller booted up.",\ "Both controllers have shut down; no restart",\ "Storage Controller booted up (cold boot - power up).",\ "Management Controller configuration parameters were set"] PRIORITY = 1 alert_type = None previous_alert_type = None fault_alert = False encl_status = None system_status = None @staticmethod def name(): """@return: name of the monitoring module.""" return RealStorEnclosureSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Storage enclosure can not be monitored." @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. """ return RealStorEnclosureSensor.DEPENDENCIES def __init__(self): super(RealStorEnclosureSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) self.rssencl = singleton_realstorencl # Flag to indicate suspension of module self._suspended = False self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, products): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RealStorEnclosureSensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RealStorEnclosureSensor, self).initialize_msgQ(msgQlist) self.ENCL_SENSOR_DATA_PATH = os.path.join(self.rssencl.encl_cache, 'enclosure_data.json') # Get the stored previous alert info self.persistent_encl_data = store.get(self.ENCL_SENSOR_DATA_PATH) if self.persistent_encl_data: if self.persistent_encl_data['fault_alert'].lower() == "true": self.fault_alert = True else: self.fault_alert = False self.previous_alert_type = self.persistent_encl_data[ 'previous_alert_type'] else: self.persistent_encl_data = { 'fault_alert': str(self.fault_alert), 'previous_alert_type': str(self.previous_alert_type), } store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH) return True def read_data(self): """This method is part of interface. Currently it is not in use. """ return {} def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(10, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() # Timeout counter for controller login failed and ws request failed mc_timeout_counter = self.rssencl.mc_timeout_counter # mc_timeout_counter==0, fault_alert==True & prev_alert_type!=FAULT_RESOLVED # all can be met True with a sspl restart & persistent cache, so ws_response # status finally decides whether to send FAULT_RESOLVED alert or not. ws_response_status = self.rssencl.ws_response_status if mc_timeout_counter > 10 and self.fault_alert is False: self.alert_type = self.rssencl.FRU_FAULT self.encl_status = "Storage Enclosure unreachable,"+\ "Possible causes : Enclosure / Storage Controller /"+\ "Management Controller rebooting,"+\ "Network port blocked by firewall,"+\ "Network outage or Power outage." 
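            # Remember that an unreachable-enclosure fault is now active; this flag
            # is persisted in send_json_msg(), so the matching fault_resolved alert
            # can still be raised after an SSPL restart once the enclosure is
            # reachable again.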
self.fault_alert = True elif mc_timeout_counter == 0 and ws_response_status == self.rssencl.ws.HTTP_OK \ and self.previous_alert_type != self.rssencl.FRU_FAULT_RESOLVED \ and self.fault_alert == True: # Check system status self.system_status = self.check_system_status() if self.system_status is not None: self.alert_type = self.rssencl.FRU_FAULT_RESOLVED enclosure_status = self.system_status[0:5] for status in enclosure_status: if status["severity"] == "INFORMATIONAL": msg = status["message"] for event in self.ENCL_FAULT_RESOLVED_EVENTS: if event in msg: self.encl_status = event break self.fault_alert = False if self.alert_type is not None: self.send_json_msg(self.alert_type, self.encl_status) self.alert_type = None self._scheduler.enter(30, self._priority, self.run, ()) def check_system_status(self): """Returns system staus using API /show/events""" url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWEVENTS) # apply filter to fetch last 20 events url = url + " last 20" response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET) if not response: logger.warn("System status unavailable as ws request failed") return if response.status_code != self.rssencl.ws.HTTP_OK: if url.find(self.rssencl.ws.LOOPBACK) == -1: raise Exception( f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"failed with http err {response.status_code}") return response_data = json.loads(response.text) enclosure_status = response_data["events"] return enclosure_status def send_json_msg(self, alert_type, encl_status): severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) fru = self.rssencl.is_storage_fru('enclosure') resource_id = "0" host_name = self.os_utils.get_fqdn() info = { "resource_type": self.RESOURCE_TYPE, "fru": fru, "resource_id": resource_id, "event_time": epoch_time, "description": encl_status } internal_json_msg = json.dumps({ "sensor_request_type": { "enclosure_alert": { "host_id": host_name, "severity": severity, "alert_id": alert_id, "alert_type": alert_type, "status": "update", "info": info, "specific_info": { "event": encl_status } } } }) self.previous_alert_type = alert_type self._write_internal_msgQ(RealStorEnclMsgHandler.name(), internal_json_msg) self.persistent_encl_data = { 'fault_alert': str(self.fault_alert), 'previous_alert_type': str(self.previous_alert_type), } store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH) def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def suspend(self): """Suspend the module thread. It should be non-blocking""" super(RealStorEnclosureSensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. It should be non-blocking""" super(RealStorEnclosureSensor, self).resume() self._suspended = False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RealStorEnclosureSensor, self).shutdown()
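# Illustrative sketch, separate from the class above: the shape of the
# fault-resolved decision in run(). Given the newest events returned by
# /api/show/events, pick the first INFORMATIONAL message that matches one of the
# known recovery events. The event data shown below is hypothetical example input.
def example_pick_resolved_event(events, resolved_events):
    for status in events[0:5]:
        if status.get("severity") == "INFORMATIONAL":
            msg = status.get("message", "")
            for event in resolved_events:
                if event in msg:
                    return event
    return None

# example_pick_resolved_event(
#     [{"severity": "INFORMATIONAL",
#       "message": "Event 312: The Management Controller booted up."}],
#     ["The Management Controller booted up."])
# -> "The Management Controller booted up."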
class RAIDsensor(SensorThread, InternalMsgQ): SENSOR_NAME = "RAIDsensor" PRIORITY = 1 RESOURCE_TYPE = "node:os:raid_data" # Section and keys in configuration file RAIDSENSOR = SENSOR_NAME.upper() RAID_STATUS_FILE = 'RAID_status_file' RAID_CONF_FILE = '/etc/mdadm.conf' RAID_DOWN_DRIVE_STATUS = [{ "status": "Down/Missing" }, { "status": "Down/Missing" }] SYSTEM_INFORMATION = "SYSTEM_INFORMATION" prev_alert_type = {} alert_type = None # alerts FAULT_RESOLVED = "fault_resolved" FAULT = "fault" MISSING = "missing" INSERTION = "insertion" CACHE_DIR_NAME = "server" # Dependency list DEPENDENCIES = { "init": ["DiskMonitor"], } @staticmethod def name(): """@return: name of the monitoring module.""" return RAIDsensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Server RAID disks can not be monitored." def __init__(self): super(RAIDsensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) # Current RAID status information self._RAID_status = None # Location of hpi data directory populated by dcs-collector self._start_delay = 10 # Flag to indicate suspension of module self._suspended = False self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, product): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RAIDsensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RAIDsensor, self).initialize_msgQ(msgQlist) self._RAID_status_file = self._get_RAID_status_file() logger.info(f"Monitoring RAID status file: {self._RAID_status_file}") # The status file contents self._RAID_status_contents = "N/A" # The mdX status line in the status file self._RAID_status = {} self._faulty_drive_list = {} self._faulty_device_list = set() self._drives = {} self._total_drives = {} self._devices = [] self._missing_drv = {} self._prev_drive_dict = {} self.prev_alert_type = {} self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01') # Allow systemd to process all the drives so we can map device name to serial numbers #time.sleep(120) cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME) self.RAID_SENSOR_DATA_PATH = os.path.join( cache_dir_path, f'RAID_SENSOR_DATA_{self._node_id}') # Get the stored previous alert info self.persistent_raid_data = {} if os.path.isfile(self.RAID_SENSOR_DATA_PATH): self.persistent_raid_data = store.get(self.RAID_SENSOR_DATA_PATH) if self.persistent_raid_data: self._RAID_status_contents = self.persistent_raid_data[ '_RAID_status_contents'] self._RAID_status = self.persistent_raid_data['_RAID_status'] self._faulty_drive_list = self.persistent_raid_data[ '_faulty_drive_list'] self._faulty_device_list = self.persistent_raid_data[ '_faulty_device_list'] self._drives = self.persistent_raid_data['_drives'] self._total_drives = self.persistent_raid_data['_total_drives'] self._devices = self.persistent_raid_data['_devices'] self._missing_drv = self.persistent_raid_data['_missing_drv'] self._prev_drive_dict = self.persistent_raid_data[ '_prev_drive_dict'] self.prev_alert_type = self.persistent_raid_data['prev_alert_type'] else: self.persistent_raid_data = { '_RAID_status_contents': self._RAID_status_contents, '_RAID_status': self._RAID_status, '_faulty_drive_list': self._faulty_drive_list, '_faulty_device_list': self._faulty_device_list, '_drives': self._drives, '_total_drives': self._total_drives, '_devices': self._devices, '_missing_drv': self._missing_drv, '_prev_drive_dict': self._prev_drive_dict, 'prev_alert_type': self.prev_alert_type, 
} store.put(self.persistent_raid_data, self.RAID_SENSOR_DATA_PATH) return True def read_data(self): """Return the Current RAID status information""" return self._RAID_status def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(30, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() # self._set_debug(True) # self._set_debug_persist(True) # Check for a change in status file and notify the node data msg handler self._notify_NodeDataMsgHandler() # Reset debug mode if persistence is not enabled self._disable_debug_if_persist_false() # Fire every 30 seconds to see if there's a change in RAID status file self._scheduler.enter(30, self._priority, self.run, ()) def _notify_NodeDataMsgHandler(self): """See if the status files changed and notify node data message handler for generating JSON message""" self._drive_state_changed = False # resource_id for drive alerts resource_id = None if not os.path.isfile(self._RAID_status_file): logger.warn( f"status_file: {self._RAID_status_file} does not exist, ignoring." ) return # Read in status and see if it has changed with open(self._RAID_status_file, "r") as datafile: status = datafile.read() # Do nothing if the RAID status file has not changed if self._RAID_status_contents == status: self._log_debug( f"_notify_NodeDataMsgHandler status unchanged, ignoring: {status}" ) return # Update the RAID status contents of file self._RAID_status_contents = status # Process mdstat file and send json msg to NodeDataMsgHandler md_device_list, drive_dict, drive_status_changed = self._process_mdstat( ) # checks mdadm conf file for missing raid array and send json message to NodeDataMsgHandler self._process_missing_md_devices(md_device_list, drive_dict) for device in md_device_list: if drive_dict: if len(drive_dict[device]) < self._total_drives[device] and \ device in self.prev_alert_type and self.prev_alert_type[device] != self.MISSING: self.alert_type = self.MISSING if device in self._prev_drive_dict: missing_drive = set( self._prev_drive_dict[device]).difference( set(drive_dict[device])) try: missing_drive = "/dev/" + list(missing_drive)[0] except IndexError: missing_drive = "NA" else: missing_drive = "NA" resource_id = device + ":" + missing_drive self._missing_drv = { "path": missing_drive, "serialNumber": "None" } self._map_drive_status(device, drive_dict, "Missing") self._drive_state_changed = True elif len(drive_dict[device]) >= self._total_drives[device] and \ device in self.prev_alert_type and self.prev_alert_type[device] == self.MISSING: self.alert_type = self.INSERTION resource_id = device + ":/dev/" + drive_dict[device][0] self._map_drive_status(device, drive_dict[device][0], "Down/Recovery") self._drive_state_changed = True if self.alert_type is not None and self._drive_state_changed == True: self._prev_drive_dict[device] = drive_dict[device] self._send_json_msg(self.alert_type, resource_id, device, self._drives[device]) if drive_status_changed[device]: for drive in self._drives[device]: if drive.get("identity") is not None: drive_path = drive.get("identity").get("path") drive_name = drive_path[5:] resource_id = device + ":/dev/" + drive_name drive_status = drive.get("status") if drive_status not in ["U", "UP"] and device in self._faulty_drive_list and \ drive_name not in self._faulty_drive_list[device] and \ self.prev_alert_type[device] != self.MISSING: self.alert_type = self.FAULT self._map_drive_status(device, 
drive_name, "Down") self._drive_state_changed = True self._faulty_drive_list[device][ drive_name] = self.alert_type elif drive_status in ["U", "UP", "Down/Recovery"] and device in self._faulty_drive_list and \ drive_name in self._faulty_drive_list[device]: self.alert_type = self.FAULT_RESOLVED self._map_drive_status(device, drive_name, "UP") self._drive_state_changed = True del self._faulty_drive_list[device][drive_name] if self.alert_type is not None and self._drive_state_changed == True: self._prev_drive_dict[device] = drive_dict[ device] self._send_json_msg(self.alert_type, resource_id, device, self._drives[device]) def _process_mdstat(self): """Parse out status' and path info for each drive""" # Replace new line chars with spaces mdstat = self._RAID_status_contents.strip().split("\n") md_device_list = [] drive_dict = {} monitored_device = mdstat drive_status_changed = {} self._devices.clear() # Array of optional identity json sections for drives in array self._identity = {} # Read in each line looking for a 'mdXXX' value md_line_parsed = False for line in monitored_device: # The line following the mdXXX : ... contains the [UU] status that we need if md_line_parsed is True: # Format is [x/y][UUUU____...] drive_status_changed[self._device] = self._parse_raid_status( line, self._device) # Reset in case their are multiple configs in file md_line_parsed = False # Break the line apart into separate fields fields = line.split(" ") # Parse out status' and path info for each drive if "md" in fields[0]: self._device = f"/dev/{fields[0]}" self._devices.append(self._device) self._log_debug(f"md device found: {self._device}") md_device_list.append(self._device) drive_dict[self._device] = [] if self._device not in self.prev_alert_type: self.prev_alert_type[self._device] = None if self._device not in self._faulty_drive_list: self._faulty_drive_list[self._device] = {} # Parse out raid drive paths if they're present self._identity[self._device] = {} for field in fields: if "[" in field: if field not in drive_dict[self._device]: index = field.find("[") drive_name = field[:index] drive_dict[self._device].append(drive_name) self._add_drive(field, self._device) md_line_parsed = True return md_device_list, drive_dict, drive_status_changed def _add_drive(self, field, device): """Adds a drive to the list""" first_bracket_index = field.find('[') # Parse out the drive path drive_path = f"/dev/{field[: first_bracket_index]}" # Parse out the drive index into [UU] status which is Device Role field detail_command = f"/usr/sbin/mdadm --examine {drive_path} | grep 'Device Role'" response, error = self._run_command(detail_command) if error: self._log_debug( f"_add_drive, Error retrieving drive index into status, example: [U_]: {str(error)}" ) try: drive_index = int(response.split(" ")[-1]) except Exception as ae: self._log_debug(f"_add_drive, get drive_index error: {str(ae)}") return self._log_debug( f"_add_drive, drive index: {drive_index}, path: {drive_path}") # Create the json msg, serial number will be filled in by NodeDataMsgHandler identity_data = {"path": drive_path, "serialNumber": "None"} self._identity[device][drive_index] = identity_data def _parse_raid_status(self, status_line, device): """Parses the status of each drive denoted by U & _ for drive being Up or Down in raid """ # Parse out x for total number of drives first_bracket_index = status_line.find('[') # If no '[' found, return if first_bracket_index == -1: return False self._total_drives[device] = int(status_line[first_bracket_index + 1]) 
self._log_debug("_parse_raid_status, total_drives: %d" % self._total_drives[device]) # Break the line apart into separate fields fields = status_line.split(" ") # The last field is the list of U & _ status = fields[-1] self._log_debug("_parse_raid_status, status: %s, total drives: %d" % (status, self._total_drives[device])) # Array of raid drives in json format based on schema self._drives[device] = [] drive_index = 0 while drive_index < self._total_drives[device]: # Create the json msg and append it to the list if self._identity.get(device).get(drive_index) is not None: path = self._identity.get(device).get(drive_index).get("path") drive_status_msg = { "status": status[drive_index + 1], # Move past '[' "identity": { "path": path, "serialNumber": "None" } } else: drive_status_msg = { "status": status[drive_index + 1] } # Move past '[' self._log_debug(f"_parse_raid_status, drive_index: {drive_index}") self._log_debug( f"_parse_raid_status, drive_status_msg: {drive_status_msg}") self._drives[device].append(drive_status_msg) drive_index = drive_index + 1 # See if the status line has changed, if not there's nothing to do if device in self._RAID_status and self._RAID_status[device] == status: self._log_debug(f"RAID status has not changed, ignoring: {status}") return False else: self._log_debug( f"RAID status has changed, old: {self._RAID_status}, new: {status}" ) self._RAID_status[device] = status return True def _process_missing_md_devices(self, md_device_list, drive_dict): """ checks the md raid configuration file, compares all it's entries with list of arrays from mdstat file and sends missing entry """ if not os.path.isfile(self.RAID_CONF_FILE): logger.warn( f"_process_missing_md_devices, MDRaid configuration file {self.RAID_CONF_FILE} is missing" ) return conf_device_list = [] with open(self.RAID_CONF_FILE, 'r') as raid_conf_file: raid_conf_data = raid_conf_file.read().strip().split("\n") for line in raid_conf_data: try: raid_conf_field = line.split(" ") if "#" not in raid_conf_field[0] and "ARRAY" in raid_conf_field[0] and \ "/md" in raid_conf_field[1]: # Mapped the device i.e. /dev/md/1 and /dev/md1 will be the same device. map_device = raid_conf_field[1].split('md/') if len(map_device) > 1: conf_device_list.append(map_device[0] + 'md' + map_device[1]) else: conf_device_list.append(raid_conf_field[1]) except Exception as ae: self._log_debug( f"_process_missing_md_devices, error retrieving raid entry \ from {self.RAID_CONF_FILE} file: {str(ae)}") return # compare conf file raid array list with mdstat raid array list for device in conf_device_list: if device not in md_device_list and device not in self._faulty_device_list: # add that missing raid array entry into the list of raid devices self.alert_type = self.FAULT self._faulty_device_list.add(device) self._send_json_msg(self.alert_type, device, device, self.RAID_DOWN_DRIVE_STATUS) elif device in md_device_list and device in self._faulty_device_list: # add that missing raid array entry into the list of raid devices self.alert_type = self.FAULT_RESOLVED self._map_drive_status(device, drive_dict, "Down/Recovery") self._faulty_device_list.remove(device) self._send_json_msg(self.alert_type, device, device, self._drives[device]) def _map_drive_status(self, device, drives, drv_status): for drv in self._drives[device]: if isinstance(drives, str): if drv["status"] not in [ "U", "UP" ] and drv["identity"]["path"] == '/dev/' + drives: drv["status"] = drv_status else: for drive in drives[device]: # Drive info is not available in missing case. 
if drv_status == "Missing" and drv["status"] == "_": drv["status"] = drv_status drv["identity"] = self._missing_drv elif drv["status"] not in [ "U", "UP" ] and drv["identity"]["path"] == '/dev/' + drive: drv["status"] = drv_status if drv["status"] == "U": drv["status"] = "UP" def _send_json_msg(self, alert_type, resource_id, device, drives): """Transmit data to NodeDataMsgHandler to be processed and sent out""" epoch_time = str(int(time.time())) severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) self._alert_id = self._get_alert_id(epoch_time) host_name = self.os_utils.get_fqdn() if alert_type == self.MISSING: description = "RAID array or drive from RAID array is missing." elif alert_type == self.FAULT: description = "RAID array or drive from RAID array is faulty." elif alert_type == self.INSERTION: description = "Inserted drive in RAID array." elif alert_type == self.FAULT_RESOLVED: description = "Fault for RAID array or RAID drive is resolved" else: description = "Raid array alert" info = { "resource_type": self.RESOURCE_TYPE, "resource_id": resource_id, "event_time": epoch_time, "description": description } specific_info = {"device": device, "drives": drives} internal_json_msg = json.dumps({ "sensor_request_type": { "node_data": { "status": "update", "sensor_type": "node:os:raid_data", "host_id": host_name, "alert_type": alert_type, "alert_id": self._alert_id, "severity": severity, "info": info, "specific_info": specific_info } } }) self.prev_alert_type[device] = alert_type self.alert_type = None # Send the event to node data message handler to generate json message and send out self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg) # Save the state to Persistent Cache. self.persistent_raid_data = { '_RAID_status_contents': self._RAID_status_contents, '_RAID_status': self._RAID_status, '_faulty_drive_list': self._faulty_drive_list, '_faulty_device_list': self._faulty_device_list, '_drives': self._drives, '_total_drives': self._total_drives, '_devices': self._devices, '_missing_drv': self._missing_drv, '_prev_drive_dict': self._prev_drive_dict, 'prev_alert_type': self.prev_alert_type, } store.put(self.persistent_raid_data, self.RAID_SENSOR_DATA_PATH) def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def suspend(self): """Suspends the module thread. It should be non-blocking""" super(RAIDsensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. It should be non-blocking""" super(RAIDsensor, self).resume() self._suspended = False def _run_command(self, command): """Run the command and get the response and error returned""" self._log_debug(f"_run_command: {command}") process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) response, error = process.communicate() if response: self._log_debug(f"_run_command, response: {str(response)}") if error: self._log_debug(f"_run_command: error: {str(error)}") return response.decode().rstrip('\n'), error.decode().rstrip('\n') def _get_RAID_status_file(self): """Retrieves the file containing the RAID status information""" return Conf.get(SSPL_CONF, f"{self.RAIDSENSOR}>{self.RAID_STATUS_FILE}", '/proc/mdstat') def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RAIDsensor, self).shutdown()
class RealStorLogicalVolumeSensor(SensorThread, InternalMsgQ): """Monitors Logical Volume data using RealStor API""" SENSOR_NAME = "RealStorLogicalVolumeSensor" SENSOR_RESP_TYPE = "enclosure_logical_volume_alert" RESOURCE_CATEGORY = "cortx" RESOURCE_TYPE_LVOL = "enclosure:cortx:logical_volume" RESOURCE_TYPE_DG = "enclosure:cortx:disk_group" PRIORITY = 1 # Dependency list DEPENDENCIES = { "plugins": ["RealStorEnclMsgHandler"], "rpms": [] } disk_groups_generic = ["object-name", "name", "size", "freespace", "storage-type", "pool", "pool-serial-number", "pool-percentage", "owner", "raidtype", "status", "create-date", "disk-description", "serial-number", "pool-sector-format", "health", "health-reason", "health-recommendation"] disk_groups_extended = ['blocksize', 'size-numeric', 'freespace-numeric', 'raw-size', 'raw-size-numeric', 'storage-type-numeric', 'storage-tier', 'storage-tier-numeric', 'total-pages', 'allocated-pages', 'available-pages', 'performance-rank', 'owner-numeric', 'preferred-owner', 'preferred-owner-numeric', 'raidtype-numeric', 'diskcount', 'sparecount', 'chunksize', 'status-numeric', 'lun', 'min-drive-size', 'min-drive-size-numeric', 'create-date-numeric', 'cache-read-ahead', 'cache-read-ahead-numeric', 'cache-flush-period', 'read-ahead-enabled', 'read-ahead-enabled-numeric', 'write-back-enabled', 'write-back-enabled-numeric', 'job-running', 'current-job', 'current-job-numeric', 'current-job-completion', 'num-array-partitions', 'largest-free-partition-space', 'largest-free-partition-space-numeric', 'num-drives-per-low-level-array', 'num-expansion-partitions', 'num-partition-segments', 'new-partition-lba', 'new-partition-lba-numeric', 'array-drive-type', 'array-drive-type-numeric', 'disk-description-numeric', 'is-job-auto-abortable', 'is-job-auto-abortable-numeric', 'blocks', 'disk-dsd-enable-vdisk', 'disk-dsd-enable-vdisk-numeric', 'disk-dsd-delay-vdisk', 'scrub-duration-goal', 'adapt-target-spare-capacity', 'adapt-target-spare-capacity-numeric', 'adapt-actual-spare-capacity', 'adapt-actual-spare-capacity-numeric', 'adapt-critical-capacity', 'adapt-critical-capacity-numeric', 'adapt-degraded-capacity', 'adapt-degraded-capacity-numeric', 'adapt-linear-volume-boundary', 'pool-sector-format-numeric', 'health-numeric'] volumes_generic = ["volume-description", "blocks", "health", "size", "volume-name", "wwn", "storage-pool-name", "total-size", "volume-class", "allocated-size", "owner", "object-name", "raidtype", "health-reason", "progress", "blocksize", "serial-number", "virtual-disk-serial", "write-policy", "volume-type", "health-recommendation", "virtual-disk-name", "storage-type", "capabilities"] volumes_extended = ["cache-optimization", "container-serial", "cs-primary", "replication-set", "attributes", "preferred-owner", "volume-parent", "allowed-storage-tiers", "cs-copy-dest", "cs-copy-src", "container-name", "group-key", "snapshot-retention-priority", "pi-format", "reserved-size-in-pages", "cs-secondary", "volume-group", "health-numeric", "large-virtual-extents", "cs-replication-role", "durable-id", "threshold-percent-of-pool", "tier-affinity", "volume-qualifier", "snapshot", "snap-pool", "read-ahead-size", "zero-init-page-on-allocation", "allocate-reserved-pages-first"] # Logical Volumes directory name LOGICAL_VOLUMES_DIR = "logical_volumes" # Disk Groups directory name DISK_GROUPS_DIR = "disk_groups" @staticmethod def name(): """@return: name of the monitoring module.""" return RealStorLogicalVolumeSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the 
module.""" return "Disk groups and logical volumes of storage enclosure can not be monitored." @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. """ return RealStorLogicalVolumeSensor.DEPENDENCIES def __init__(self): super(RealStorLogicalVolumeSensor, self).__init__( self.SENSOR_NAME, self.PRIORITY) self._faulty_disk_group_file_path = None self._faulty_logical_volume_file_path = None self.rssencl = singleton_realstorencl # logical volumes persistent cache self._logical_volume_prcache = None # disk groups persistent cache self._disk_group_prcache = None # Holds Disk Groups with faults. Used for future reference. self._previously_faulty_disk_groups = {} # Holds Logical Volumes with faults. Used for future reference. self._previously_faulty_logical_volumes = {} self.pollfreq_DG_logical_volume_sensor = \ int(Conf.get(SSPL_CONF, f"{self.rssencl.CONF_REALSTORLOGICALVOLUMESENSOR}>{POLLING_FREQUENCY_OVERRIDE}", 10)) if self.pollfreq_DG_logical_volume_sensor == 0: self.pollfreq_DG_logical_volume_sensor = self.rssencl.pollfreq # Flag to indicate suspension of module self._suspended = False self._event = Event() self.os_utils = OSUtils() cvg_info = Conf.get(GLOBAL_CONF, CVG_INFO_KEY) self.cvg_info_dict = {} if cvg_info: self.cvg_info_dict = {cvg['name']: idx for idx, cvg in \ enumerate(cvg_info) if 'name' in cvg} def initialize(self, conf_reader, msgQlist, products): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RealStorLogicalVolumeSensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RealStorLogicalVolumeSensor, self).initialize_msgQ(msgQlist) self._logical_volume_prcache = os.path.join(self.rssencl.frus,\ self.LOGICAL_VOLUMES_DIR) self._disk_group_prcache = os.path.join(self.rssencl.frus,\ self.DISK_GROUPS_DIR) # Persistence file location. This file stores faulty Logical Volume data self._faulty_logical_volume_file_path = os.path.join( self._logical_volume_prcache, "logical_volume_data.json") # Persistence file location. This file stores faulty Disk Group data self._faulty_disk_group_file_path = os.path.join( self._disk_group_prcache, "disk_group_data.json") # Load faulty Logical Volume data from file if available self._previously_faulty_logical_volumes = store.get(\ self._faulty_logical_volume_file_path) # Load faulty Disk Group data from file if available self._previously_faulty_disk_groups = store.get(\ self._faulty_disk_group_file_path) if self._previously_faulty_logical_volumes is None: self._previously_faulty_logical_volumes = {} store.put(self._previously_faulty_logical_volumes,\ self._faulty_logical_volume_file_path) if self._previously_faulty_disk_groups is None: self._previously_faulty_disk_groups = {} store.put(self._previously_faulty_disk_groups,\ self._faulty_disk_group_file_path) return True def read_data(self): """This method is part of interface. Currently it is not in use. 
""" return {} def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(10, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() disk_groups = None logical_volumes = None disk_groups = self._get_disk_groups() if disk_groups: self._get_msgs_for_faulty_disk_groups(disk_groups) for disk_group in disk_groups: pool_serial_number = disk_group["pool-serial-number"] logical_volumes = self._get_logical_volumes(pool_serial_number) if logical_volumes: self._get_msgs_for_faulty_logical_volumes(logical_volumes, disk_group) # Reset debug mode if persistence is not enabled self._disable_debug_if_persist_false() # Fire every 10 seconds to see if We have a faulty Logical Volume self._scheduler.enter(self.pollfreq_DG_logical_volume_sensor, self._priority, self.run, ()) def _get_disk_groups(self): """Receives list of Disk Groups from API. URL: http://<host>/api/show/disk-groups """ url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKGROUPS) response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET) if not response: logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Disk Groups status unavailable as ws request {url} failed") return if response.status_code != self.rssencl.ws.HTTP_OK: if url.find(self.rssencl.ws.LOOPBACK) == -1: raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"to get disk groups failed with err {response.status_code}") return response_data = json.loads(response.text) disk_groups = response_data.get("disk-groups") return disk_groups def _get_logical_volumes(self, pool_serial_number): """Receives list of Logical Volumes from API. URL: http://<host>/api/show/volumes/pool/<pool_serial_number> """ url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWVOLUMES) url = f"{url}/pool/{pool_serial_number}" response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET) if not response: logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Logical Volume status unavailable as ws request {url}" " failed") return if response.status_code != self.rssencl.ws.HTTP_OK: raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"to get logical volumes failed with err {response.status_code}") return response_data = json.loads(response.text) logical_volumes = response_data.get("volumes") return logical_volumes def _get_msgs_for_faulty_disk_groups(self, disk_groups, send_message=True): """Checks for health of disk groups and returns list of messages to be sent to handler if there are any. 
""" faulty_disk_group_messages = [] internal_json_msg = None disk_group_health = None serial_number = None alert_type = "" # Flag to indicate if there is a change in _previously_faulty_disk_groups state_changed = False if not disk_groups: return for disk_group in disk_groups: disk_group_health = disk_group["health"].lower() serial_number = disk_group["serial-number"] # Check for missing and fault case if disk_group_health == self.rssencl.HEALTH_FAULT: # Status change from Degraded ==> Fault or OK ==> Fault if (serial_number in self._previously_faulty_disk_groups and \ self._previously_faulty_disk_groups[serial_number]['health']=="degraded") or \ (serial_number not in self._previously_faulty_disk_groups): alert_type = self.rssencl.FRU_FAULT self._previously_faulty_disk_groups[serial_number] = { "health": disk_group_health, "alert_type": alert_type} state_changed = True # Check for fault case elif disk_group_health == self.rssencl.HEALTH_DEGRADED: # Status change from Fault ==> Degraded or OK ==> Degraded if (serial_number in self._previously_faulty_disk_groups and \ self._previously_faulty_disk_groups[serial_number]['health']=="fault") or \ (serial_number not in self._previously_faulty_disk_groups): alert_type = self.rssencl.FRU_FAULT self._previously_faulty_disk_groups[serial_number] = { "health": disk_group_health, "alert_type": alert_type} state_changed = True # Check for healthy case elif disk_group_health == self.rssencl.HEALTH_OK: # Status change from Fault ==> OK or Degraded ==> OK if serial_number in self._previously_faulty_disk_groups: # Send message to handler if send_message: alert_type = self.rssencl.FRU_FAULT_RESOLVED del self._previously_faulty_disk_groups[serial_number] state_changed = True # Persist faulty Disk Group list to file only if something is changed if state_changed: # Generate the alert contents internal_json_msg = self._create_internal_msg_dg(alert_type, disk_group) faulty_disk_group_messages.append(internal_json_msg) # Send message to handler if send_message: self._send_json_msg(internal_json_msg) # Wait till msg is sent to message bus or added in consul for resending. # If timed out, do not update cache and revert in-memory cache. # So, in next iteration change can be detected if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT): store.put(self._previously_faulty_disk_groups,\ self._faulty_disk_group_file_path) else: self._previously_faulty_disk_groups = store.get(self._faulty_disk_group_file_path) state_changed = False alert_type = "" return faulty_disk_group_messages def _get_msgs_for_faulty_logical_volumes(self, logical_volumes, disk_group, send_message=True): """Checks for health of logical volumes and returns list of messages to be sent to handler if there are any. 
""" faulty_logical_volume_messages = [] internal_json_msg = None logical_volume_health = None serial_number = None alert_type = "" # Flag to indicate if there is a change in _previously_faulty_logical_volumes state_changed = False if not logical_volumes: return for logical_volume in logical_volumes: logical_volume_health = logical_volume["health"].lower() serial_number = logical_volume["serial-number"] # Check for missing and fault case if logical_volume_health == self.rssencl.HEALTH_FAULT: # Status change from Degraded ==> Fault or OK ==> Fault if (serial_number in self._previously_faulty_logical_volumes and \ self._previously_faulty_logical_volumes[serial_number]['health']=="degraded") or \ (serial_number not in self._previously_faulty_logical_volumes): alert_type = self.rssencl.FRU_FAULT self._previously_faulty_logical_volumes[serial_number] = { "health": logical_volume_health, "alert_type": alert_type} state_changed = True # Check for degraded case elif logical_volume_health == self.rssencl.HEALTH_DEGRADED: # Status change from Fault ==> Degraded or OK ==> Degraded if (serial_number in self._previously_faulty_logical_volumes and \ self._previously_faulty_logical_volumes[serial_number]['health']=="fault") or \ (serial_number not in self._previously_faulty_logical_volumes): alert_type = self.rssencl.FRU_FAULT self._previously_faulty_logical_volumes[serial_number] = { "health": logical_volume_health, "alert_type": alert_type} state_changed = True # Check for healthy case elif logical_volume_health == self.rssencl.HEALTH_OK: # Status change from Fault ==> OK or Degraded ==> OK if serial_number in self._previously_faulty_logical_volumes: # Send message to handler alert_type = self.rssencl.FRU_FAULT_RESOLVED del self._previously_faulty_logical_volumes[serial_number] state_changed = True if state_changed: # Generate the alert contents internal_json_msg = self._create_internal_msg_lvol( logical_volume, alert_type, disk_group) faulty_logical_volume_messages.append(internal_json_msg) # Send message to handler if send_message: self._send_json_msg(internal_json_msg) # Persist faulty Logical Volume list to file only if something is changed # Wait till msg is sent to message bus or added in consul for resending. # If timed out, do not update cache and revert in-memory cache. # So, in next iteration change can be detected if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT): store.put(self._previously_faulty_logical_volumes,\ self._faulty_logical_volume_file_path) else: self._previously_faulty_logical_volumes = store.get(self._faulty_logical_volume_file_path) state_changed = False alert_type = "" return faulty_logical_volume_messages def _create_internal_msg_lvol(self, logical_volume_detail, alert_type, disk_group): """Forms a dictionary containing info about Logical Volumes to send to message handler. 
""" if not logical_volume_detail: return {} generic_info = dict.fromkeys(self.volumes_generic, "NA") extended_info = dict.fromkeys(self.volumes_extended, "NA") disk_groups_info = dict.fromkeys(self.disk_groups_generic, "NA") severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) resource_id = logical_volume_detail.get("volume-name", "") host_name = self.os_utils.get_fqdn() for key, value in logical_volume_detail.items(): if key in self.volumes_generic: generic_info.update({key : value}) elif key in self.volumes_extended: extended_info.update({key : value}) for key, value in disk_group.items(): if key in self.disk_groups_generic: disk_groups_info.update({key : value}) generic_info['disk-group'] = [disk_groups_info] generic_info.update(extended_info) info = { "resource_type": self.RESOURCE_TYPE_LVOL, "resource_id": resource_id, "event_time": epoch_time } internal_json_msg = json.dumps( {"sensor_request_type": { "enclosure_alert": { "host_id": host_name, "severity": severity, "alert_id": alert_id, "alert_type": alert_type, "status": "update", "info": info, "specific_info": generic_info } }}) return internal_json_msg def _create_internal_msg_dg(self, alert_type, disk_group_detail): """Forms a dictionary containing info about Disk Groups to send to message handler. """ if not disk_group_detail: return {} generic_info = dict.fromkeys(self.disk_groups_generic, "NA") extended_info = dict.fromkeys(self.disk_groups_extended, "NA") severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) resource_id = disk_group_detail.get("name", "") host_name = self.os_utils.get_fqdn() for key, value in disk_group_detail.items(): if key in self.disk_groups_generic: generic_info.update({key : value}) elif key in self.disk_groups_extended: extended_info.update({key : value}) generic_info.update(extended_info) cvg_info = { "cvg_name": resource_id if resource_id in self.cvg_info_dict else "NA", "cvg_id": self.cvg_info_dict.get(resource_id, "NA") } generic_info.update(cvg_info) info = { "resource_type": self.RESOURCE_TYPE_DG, "resource_id": resource_id, "event_time": epoch_time } internal_json_msg = json.dumps( {"sensor_request_type": { "enclosure_alert": { "host_id": host_name, "severity": severity, "alert_id": alert_id, "alert_type": alert_type, "status": "update", "info": info, "specific_info": generic_info } }}) return internal_json_msg def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def _send_json_msg(self, json_msg): """Sends JSON message to Handler""" if not json_msg: return self._event.clear() self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event) def suspend(self): """Suspends the module thread. It should be non-blocking""" super(RealStorLogicalVolumeSensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. It should be non-blocking""" super(RealStorLogicalVolumeSensor, self).resume() self._suspended = False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RealStorLogicalVolumeSensor, self).shutdown()
class RealStorControllerSensor(SensorThread, InternalMsgQ): """Monitors Controller data using RealStor API""" # Dependency list DEPENDENCIES = { "plugins": ["RealStorEnclMsgHandler"], "rpms": [] } SENSOR_NAME = "RealStorControllerSensor" SENSOR_RESP_TYPE = "enclosure_controller_alert" RESOURCE_CATEGORY = "hw" RESOURCE_TYPE = "enclosure:hw:controller" PRIORITY = 1 # Controllers directory name CONTROLLERS_DIR = "controllers" @staticmethod def name(): """@return: name of the monitoring module.""" return RealStorControllerSensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Controllers in storage enclosure can not be monitored." @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. """ return RealStorControllerSensor.DEPENDENCIES def __init__(self): super(RealStorControllerSensor, self).__init__( self.SENSOR_NAME, self.PRIORITY) self._faulty_controller_file_path = None self.rssencl = singleton_realstorencl # controllers persistent cache self._controller_prcache = None # Holds Controllers with faults. Used for future reference. self._previously_faulty_controllers = {} self.pollfreq_controllersensor = \ int(Conf.get(SSPL_CONF,f"{self.rssencl.CONF_REALSTORCONTROLLERSENSOR}>{POLLING_FREQUENCY_OVERRIDE}", 0)) if self.pollfreq_controllersensor == 0: self.pollfreq_controllersensor = self.rssencl.pollfreq # Flag to indicate suspension of module self._suspended = False self._event = Event() self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, products): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RealStorControllerSensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RealStorControllerSensor, self).initialize_msgQ(msgQlist) self._controller_prcache = os.path.join(self.rssencl.frus,\ self.CONTROLLERS_DIR) # Persistence file location. This file stores faulty Controller data self._faulty_controller_file_path = os.path.join( self._controller_prcache, "controllerdata.json") # Load faulty Controller data from file if available self._previously_faulty_controllers = store.get(\ self._faulty_controller_file_path) if self._previously_faulty_controllers is None: self._previously_faulty_controllers = {} store.put(self._previously_faulty_controllers,\ self._faulty_controller_file_path) return True def read_data(self): """This method is part of interface. Currently it is not in use. """ return {} def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(10, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() controllers = None controllers = self._get_controllers() if controllers: self._get_msgs_for_faulty_controllers(controllers) # Reset debug mode if persistence is not enabled self._disable_debug_if_persist_false() # Fire every 10 seconds to see if We have a faulty Controller self._scheduler.enter(self.pollfreq_controllersensor, self._priority, self.run, ()) def _get_controllers(self): """Receives list of Controllers from API. 
URL: http://<host>/api/show/controllers """ url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWCONTROLLERS) response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET) if not response: logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Controllers status unavailable as ws request {url}") return if response.status_code != self.rssencl.ws.HTTP_OK: if url.find(self.rssencl.ws.LOOPBACK) == -1: raise Exception(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} " f"to get controllers failed with err {response.status_code}") return response_data = json.loads(response.text) controllers = response_data.get("controllers") return controllers def _get_msgs_for_faulty_controllers(self, controllers, send_message=True): """Checks for health of controllers and returns list of messages to be sent to handler if there are any. """ faulty_controller_messages = [] internal_json_msg = None controller_health = None durable_id = None alert_type = "" # Flag to indicate if there is a change in _previously_faulty_controllers state_changed = False prev_alert_type = None if not controllers: return for controller in controllers: controller_health = controller["health"].lower() controller_status = controller["status"].lower() durable_id = controller["durable-id"] # Check for missing and fault case if controller_health == self.rssencl.HEALTH_FAULT: # Status change from Degraded ==> Fault or OK ==> Fault if (durable_id in self._previously_faulty_controllers and \ self._previously_faulty_controllers[durable_id]['health']=="degraded") or \ (durable_id not in self._previously_faulty_controllers): alert_type = self.rssencl.FRU_FAULT # Check for removal if controller_status == self.rssencl.STATUS_NOTINSTALLED: alert_type = self.rssencl.FRU_MISSING self._previously_faulty_controllers[durable_id] = { "health": controller_health, "alert_type": alert_type} state_changed = True internal_json_msg = self._create_internal_msg( controller, alert_type) faulty_controller_messages.append(internal_json_msg) # Send message to handler if send_message: self._send_json_msg(internal_json_msg) # Check for fault case elif controller_health == self.rssencl.HEALTH_DEGRADED: # Status change from Fault ==> Degraded or OK ==> Degraded # Controller can also go into degraded state after installation as well # So, Degrade state can be after missing alert as well. 
if (durable_id in self._previously_faulty_controllers and \ self._previously_faulty_controllers[durable_id]['health']=="fault") or \ (durable_id not in self._previously_faulty_controllers): if self._previously_faulty_controllers and \ self._previously_faulty_controllers.get(durable_id).get('alert_type'): prev_alert_type = self._previously_faulty_controllers[durable_id]["alert_type"] # If prev_alert_type is missing, then the next alert type will be insertion first if prev_alert_type and prev_alert_type.lower() == self.rssencl.FRU_MISSING: alert_type = self.rssencl.FRU_INSERTION internal_json_msg = self._create_internal_msg( controller, alert_type) # send the message to the handler if send_message: self._send_json_msg(internal_json_msg) # And set alert_type as fault alert_type = self.rssencl.FRU_FAULT self._previously_faulty_controllers[durable_id] = { "health": controller_health, "alert_type": alert_type} internal_json_msg = self._create_internal_msg(controller, alert_type) faulty_controller_messages.append(internal_json_msg) state_changed = True # send the message to the handler if send_message: self._send_json_msg(internal_json_msg) # Check for healthy case elif controller_health == self.rssencl.HEALTH_OK: # Status change from Fault ==> OK or Degraded ==> OK if durable_id in self._previously_faulty_controllers: # Send message to handler if send_message: previous_alert_type = \ self._previously_faulty_controllers[durable_id]["alert_type"] alert_type = self.rssencl.FRU_FAULT_RESOLVED if previous_alert_type == self.rssencl.FRU_MISSING: alert_type = self.rssencl.FRU_INSERTION internal_json_msg = self._create_internal_msg( controller, alert_type) faulty_controller_messages.append(internal_json_msg) if send_message: self._send_json_msg(internal_json_msg) del self._previously_faulty_controllers[durable_id] state_changed = True # Persist faulty Controller list to file only if something is changed if state_changed: # Wait till msg is sent to message bus or added in consul for resending. # If timed out, do not update cache and revert in-memory cache. # So, in next iteration change can be detected if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT): store.put(self._previously_faulty_controllers,\ self._faulty_controller_file_path) else: self._previously_faulty_controllers = store.get(self._faulty_controller_file_path) state_changed = False alert_type = "" return faulty_controller_messages def _create_internal_msg(self, controller_detail, alert_type): """Forms a dictionary containing info about Controllers to send to message handler. 
""" if not controller_detail: return {} severity_reader = SeverityReader() severity = severity_reader.map_severity(alert_type) epoch_time = str(int(time.time())) alert_id = self._get_alert_id(epoch_time) fru = self.rssencl.is_storage_fru('controller') resource_id = controller_detail.get("durable-id", "") host_name = self.os_utils.get_fqdn() info = { "resource_type": self.RESOURCE_TYPE, "fru": fru, "resource_id": resource_id, "event_time": epoch_time } internal_json_msg = json.dumps( {"sensor_request_type": { "enclosure_alert": { "host_id": host_name, "severity": severity, "alert_id": alert_id, "alert_type": alert_type, "status": "update", "info": info, "specific_info": controller_detail } }}) return internal_json_msg def _get_alert_id(self, epoch_time): """Returns alert id which is a combination of epoch_time and salt value """ salt = str(uuid.uuid4().hex) alert_id = epoch_time + salt return alert_id def _send_json_msg(self, json_msg): """Sends JSON message to Handler""" if not json_msg: return self._event.clear() self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json_msg, self._event) def suspend(self): """Suspends the module thread. It should be non-blocking""" super(RealStorControllerSensor, self).suspend() self._suspended = True def resume(self): """Resumes the module thread. It should be non-blocking""" super(RealStorControllerSensor, self).resume() self._suspended = False def shutdown(self): """Clean up scheduler queue and gracefully shutdown thread""" super(RealStorControllerSensor, self).shutdown()
class RAIDIntegritySensor(SensorThread, InternalMsgQ): SENSOR_NAME = "RAIDIntegritySensor" PRIORITY = 1 RESOURCE_TYPE = "node:os:raid_integrity" # Section and keys in configuration file RAIDIntegritySensor = SENSOR_NAME.upper() SYSTEM_INFORMATION = "SYSTEM_INFORMATION" SCAN_FREQUENCY = "polling_interval" RETRY_INTERVAL = "retry_interval" TIMESTAMP_FILE_PATH_KEY = "timestamp_file_path" # Scan for RAID integrity error every 2 weeks (1209600 seconds) DEFAULT_SCAN_FREQUENCY = "1209600" # Minimum allowed frequency for RAID integrity scans is 1 day # (86400 seconds ), as frequent scans affect disk i/o performance MIN_SCAN_FREQUENCY = 86400 DEFAULT_RAID_DATA_PATH = RaidDataConfig.RAID_RESULT_DIR.value DEFAULT_TIMESTAMP_FILE_PATH = DEFAULT_RAID_DATA_PATH + "last_execution_time" alert_type = None # alerts FAULT_RESOLVED = "fault_resolved" FAULT = "fault" MISSING = "missing" SUCCESS = "success" FAILED = "failed" @staticmethod def name(): """@return: name of the monitoring module.""" return RAIDIntegritySensor.SENSOR_NAME @staticmethod def impact(): """Returns impact of the module.""" return "Server RAID integrity can not be monitored." def __init__(self): super(RAIDIntegritySensor, self).__init__(self.SENSOR_NAME, self.PRIORITY) self._cache_state = None self.os_utils = OSUtils() def initialize(self, conf_reader, msgQlist, product): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread and InternalMsgQ super(RAIDIntegritySensor, self).initialize(conf_reader) # Initialize internal message queues for this module super(RAIDIntegritySensor, self).initialize_msgQ(msgQlist) self._alert_msg = None self._fault_state = None self._suspended = False self._timestamp_file_path = Conf.get( SSPL_CONF, f"{self.RAIDIntegritySensor}>{self.TIMESTAMP_FILE_PATH_KEY}", self.DEFAULT_TIMESTAMP_FILE_PATH) self._scan_frequency = Conf.get( SSPL_CONF, f"{self.RAIDIntegritySensor}>{self.SCAN_FREQUENCY}", self.DEFAULT_SCAN_FREQUENCY) self._next_scheduled_time = self._scan_frequency if self._scan_frequency < self.MIN_SCAN_FREQUENCY: self._scan_frequency = self.MIN_SCAN_FREQUENCY sysfs_path = Conf.get(SSPL_CONF, f'{SYSTEM_INFORMATION}>{SYSFS_PATH}') self.raid_dir = sysfs_path + BLOCK_DIR self.retry_interval = int( Conf.get(SSPL_CONF, f'{self.RAIDIntegritySensor}>{self.RETRY_INTERVAL}')) # Create DEFAULT_RAID_DATA_PATH if already not exist. 
self._create_file(self.DEFAULT_RAID_DATA_PATH) return True def read_data(self): return self._cache_state def run(self): """Run the sensor on its own thread""" # Do not proceed if module is suspended if self._suspended == True: if os.path.exists(self._timestamp_file_path): with open(self._timestamp_file_path, "r") as timestamp_file: last_processed_log_timestamp = timestamp_file.read().strip( ) current_time = int(time.time()) if current_time > int(last_processed_log_timestamp): self._next_scheduled_time = self._scan_frequency - \ (current_time - int(last_processed_log_timestamp)) logger.info("Scheduling RAID validate again after: %s seconds" % self._next_scheduled_time) self._scheduler.enter(self._next_scheduled_time, self._priority, self.run, ()) return # Check for debug mode being activated self._read_my_msgQ_noWait() #cleanup self._cleanup() # Log RAIDIntegritySensor execution timestamp self._create_file(self._timestamp_file_path) self._log_timestamp() # Validate the raid data files and notify the node data msg handler self._raid_health_monitor() with open(self._timestamp_file_path, "r") as timestamp_file: last_processed_log_timestamp = timestamp_file.read().strip() current_time = int(time.time()) if current_time > int(last_processed_log_timestamp): self._next_scheduled_time = self._scan_frequency - \ (current_time - int(last_processed_log_timestamp)) logger.info("Scheduling RAID validate again after: %s seconds" % self._next_scheduled_time) self._scheduler.enter(self._next_scheduled_time, self._priority, self.run, ()) def _raid_health_monitor(self): try: devices = self._get_devices() if len(devices) == 0: return logger.debug("Fetched devices:{}".format(devices)) for device in devices: # Update the state as 'check' for RAID device file result = self._update_raid_device_file(device) if result == "failed": self._retry_execution(self._update_raid_device_file, device) logger.info("RAID device state is changed to 'check'") # Check RAID device array state is 'idle' or not result = self._check_raid_state(device) if result == "failed": logger.warn( "'Idle' state not found for RAID device:{}".format( device)) # Retry to check RAID state self._retry_execution(self._check_raid_state, device) logger.info( "'idle' state is found in Raid device:{}.".format(device)) # Check Mismatch count in RAID device files. result = self._check_mismatch_count(device) if result == "failed": # Persist RAID device fault state and send alert fault_status_file = self.DEFAULT_RAID_DATA_PATH + device + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value if os.path.exists(fault_status_file): with open(fault_status_file, 'r') as fs: data = fs.read().rstrip() if self.FAULT_RESOLVED in data: self.alert_type = self.FAULT self._alert_msg = "RAID disks present in %s RAID array"\ ", needs synchronization. If fault persists for "\ "more than 2 days, Please contact Seagate support."%device self._send_json_msg(self.alert_type, device, self._alert_msg) self._update_fault_state_file( device, self.FAULT, fault_status_file) self._scan_frequency = self.MIN_SCAN_FREQUENCY else: self.alert_type = self.FAULT self._alert_msg = "RAID disks present in %s RAID array"\ ", needs synchronization. 
                        self._send_json_msg(self.alert_type, device,
                                            self._alert_msg)
                        self._update_fault_state_file(device, self.FAULT,
                                                      fault_status_file)
                        self._scan_frequency = self.MIN_SCAN_FREQUENCY

                    # Retry to check mismatch_cnt
                    self._retry_execution(self._check_mismatch_count, device)
                logger.debug("No mismatch count is found in Raid device:{}"
                             .format(device))
        except Exception as ae:
            raise Exception(f"Failed in monitoring RAID health, {ae}")

    def _get_devices(self):
        try:
            mdstat_file = RaidDataConfig.MDSTAT_FILE.value
            with open(mdstat_file, 'r') as fp:
                content = fp.readlines()
            device_array = []
            for line in content:
                if "active" in line:
                    device = line.split(":")[0].rstrip()
                    device_array.append(device)
            if len(device_array) == 0:
                logger.error("No RAID device found in mdstat file.")
            return device_array
        except Exception as ae:
            raise Exception(f"Failed to get the device array, {ae}")

    def _check_mismatch_count(self, device):
        try:
            status = None
            mismatch_cnt_file = RaidDataConfig.MISMATCH_COUNT_FILE.value
            MISMATCH_COUNT_COMMAND = 'cat ' + self.raid_dir + device + \
                mismatch_cnt_file
            logger.debug('Executing MISMATCH_CNT_COMMAND:{}'
                         .format(MISMATCH_COUNT_COMMAND))
            response, error = self._run_command(MISMATCH_COUNT_COMMAND)
            if error:
                logger.error("Error in cmd {} in raid health monitor"
                             .format(MISMATCH_COUNT_COMMAND))
            if response == RaidDataConfig.MISMATCH_COUNT_RESPONSE.value:
                logger.debug("No mismatch count is found")
                status = "success"
                with open(self.output_file, 'a') as raid_file:
                    raid_file.write(RaidDataConfig.MISMATCH_COUNT_RESPONSE.value)
                fault_status_file = self.DEFAULT_RAID_DATA_PATH + device \
                    + "_" + RaidDataConfig.RAID_MISMATCH_FAULT_STATUS.value
                if os.path.exists(fault_status_file):
                    with open(fault_status_file, 'r') as fs:
                        data = fs.read().rstrip()
                    if self.FAULT in data:
                        faulty_device = data.split(":")[0].rstrip()
                        if device == faulty_device:
                            self.alert_type = self.FAULT_RESOLVED
                            self._alert_msg = ("RAID disks present in %s RAID "
                                               "array are synchronized."
                                               % device)
                            self._send_json_msg(self.alert_type, device,
                                                self._alert_msg)
                            self._update_fault_state_file(
                                device, self.FAULT_RESOLVED, fault_status_file)
                            self._scan_frequency = int(Conf.get(
                                SSPL_CONF,
                                f"{self.RAIDIntegritySensor}>{self.SCAN_FREQUENCY}",
                                self.DEFAULT_SCAN_FREQUENCY))
                            self._scan_frequency = max(self._scan_frequency,
                                                       self.MIN_SCAN_FREQUENCY)
            else:
                status = "failed"
                logger.debug("Mismatch found in {} file in "
                             "raid_integrity_data!".format(mismatch_cnt_file))
            return status
        except Exception as ae:
            logger.error("Failed in checking mismatch_cnt in RAID file. "
                         "ERROR:{}".format(str(ae)))
            raise

    def _check_raid_state(self, device):
        try:
            status = None
            raid_check = 0
            sync_action_file = RaidDataConfig.SYNC_ACTION_FILE.value
            while raid_check <= RaidDataConfig.MAX_RETRIES.value:
                self.output_file = self._get_unique_filename(
                    RaidDataConfig.RAID_RESULT_FILE_PATH.value, device)
                STATE_COMMAND = 'cat ' + self.raid_dir + device + \
                    sync_action_file
                logger.debug('Executing STATE_COMMAND:{}'.format(STATE_COMMAND))
                response, error = self._run_command(STATE_COMMAND)
                if error:
                    logger.warn("Error in cmd {} in raid health monitor"
                                .format(STATE_COMMAND))
                    raid_check += 1
                else:
                    if response == RaidDataConfig.STATE_COMMAND_RESPONSE.value:
                        status = "success"
                        with open(self.output_file, 'w') as raid_file:
                            raid_file.write(
                                RaidDataConfig.STATE_COMMAND_RESPONSE.value + "\n")
                        break
                    else:
                        status = "failed"
                        raid_check += 1
                        time.sleep(WAIT_BEFORE_RETRY)
            return status
        except Exception as ae:
            logger.error("Failed in checking RAID device state. ERROR:{}"
                         .format(str(ae)))
            raise

    def _update_raid_device_file(self, device):
        try:
            status = "failed"
            raid_check = 0
            sync_action_file = RaidDataConfig.SYNC_ACTION_FILE.value
            while raid_check <= RaidDataConfig.MAX_RETRIES.value:
                CHECK_COMMAND = "echo 'check' |sudo tee " + self.raid_dir + \
                    device + sync_action_file + " > /dev/null"
                logger.debug('Executing CHECK_COMMAND:{}'.format(CHECK_COMMAND))
                response, error = self._run_command(CHECK_COMMAND)
                if error:
                    logger.warn("Failed in executing command:{}.".format(error))
                    raid_check += 1
                    time.sleep(1)
                else:
                    logger.debug("RAID device state is changed to 'check' "
                                 "with response : {}".format(response))
                    status = "success"
                    break
            return status
        except Exception as ae:
            logger.error("Failed to update RAID File. ERROR:{}".format(str(ae)))
            raise

    def _retry_execution(self, function_call, device):
        while True:
            logger.debug("Executing function:{} after {} seconds"
                         .format(function_call, self.retry_interval))
            time.sleep(self.retry_interval)
            result = function_call(device)
            if result == self.SUCCESS:
                return

    def _get_unique_filename(self, filename, device):
        unique_timestamp = datetime.now().strftime("%d-%m-%Y_%I-%M-%S-%p")
        unique_filename = f"{filename}_{device}_{unique_timestamp}.txt"
        return unique_filename

    def _send_json_msg(self, alert_type, resource_id, error_msg):
        """Transmit data to NodeDataMsgHandler to be processed and sent out"""
        epoch_time = str(int(time.time()))
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        self._alert_id = self._get_alert_id(epoch_time)
        host_name = self.os_utils.get_fqdn()

        info = {
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": resource_id,
            "event_time": epoch_time,
            "description": error_msg
        }
        specific_info = {"error": error_msg}

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "sensor_type": "node:os:raid_integrity",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "alert_id": self._alert_id,
                    "severity": severity,
                    "info": info,
                    "specific_info": specific_info
                }
            }
        })
        self.alert_type = None

        # Send the event to the node data message handler to generate a JSON
        # message and send it out
        self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg)

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of epoch_time
        and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(RAIDIntegritySensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(RAIDIntegritySensor, self).resume()
        self._suspended = False

    def _run_command(self, command):
        """Run the command and get the response and error returned"""
        logger.debug(f"_run_command: {command}")
        process = subprocess.Popen(command, shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        response, error = process.communicate()
        if response:
            logger.debug(f"_run_command, response: {str(response)}")
        if error:
            logger.debug(f"_run_command: error: {str(error)}")
        return response.decode().rstrip('\n'), error.decode().rstrip('\n')

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(RAIDIntegritySensor, self).shutdown()

    def _create_file(self, path):
        dir_path = path[:path.rindex("/")]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            logger.debug("{} in creation of dir path : {}".format(
                self.SUCCESS, dir_path))
        if not os.path.exists(path):
            file = open(path, "w+")
            file.close()

    def _log_timestamp(self):
        current_time = str(int(time.time()))
        with open(self._timestamp_file_path, "w") as timestamp_file:
            timestamp_file.write(current_time)

    def _update_fault_state_file(self, device, fstate, fault_state_file):
        self._fault_state = fstate
        data = device + ":" + self._fault_state
        self._create_file(fault_state_file)
        with open(fault_state_file, 'w') as fs:
            fs.write(data)

    def _cleanup(self):
        """Clean up the validate raid result files"""
        if os.path.exists(self._timestamp_file_path):
            os.remove(self._timestamp_file_path)
        path = RaidDataConfig.RAID_RESULT_DIR.value
        if os.path.exists(path):
            current_time = time.time()
            result_files = [
                file for file in os.listdir(path) if file.endswith(".txt")
            ]
            for file in result_files:
                if os.path.getmtime(os.path.join(path, file)) < \
                        (current_time - 24 * 60 * 60):
                    if os.path.isfile(os.path.join(path, file)):
                        os.remove(os.path.join(path, file))
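# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the alert emitted by
# RAIDIntegritySensor._send_json_msg() above is an internal JSON message of
# the following shape. The concrete values below are made up for the example;
# only the key layout follows the code.
#
#   {
#     "sensor_request_type": {
#       "node_data": {
#         "status": "update",
#         "sensor_type": "node:os:raid_integrity",
#         "host_id": "node-1.example.com",
#         "alert_type": "fault",
#         "alert_id": "<epoch_time + uuid4 hex salt>",
#         "severity": "<mapped by SeverityReader>",
#         "info": {
#           "resource_type": "node:os:raid_integrity",
#           "resource_id": "md0",
#           "event_time": "<epoch_time>",
#           "description": "<alert message>"
#         },
#         "specific_info": {"error": "<alert message>"}
#       }
#     }
#   }
# ---------------------------------------------------------------------------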
class SASPortSensor(SensorThread, InternalMsgQ):
    """SAS Port Sensor which runs on its own thread periodically and
    is responsible for sensing changes in SAS ports/cables using the
    available tool/utility"""

    SENSOR_NAME = "SASPortSensor"
    PRIORITY = 1
    RESOURCE_TYPE = "node:interface:sas"

    # Section in the configuration store
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    POLLING_INTERVAL = "polling_interval"
    CACHE_DIR_NAME = "server"
    DEFAULT_POLLING_INTERVAL = '30'

    PROBE = "probe"

    # Dependency list
    DEPENDENCIES = {"plugins": ["NodeDataMsgHandler"], "rpms": []}

    # Number of SAS ports
    NUM_SAS_PORTS = 4
    # Number of phys in a port
    NUM_PHYS_PER_PORT = 4
    # Current data version
    CURRENT_DATA_VERSION = 1

    @staticmethod
    def name():
        """@return: name of the module."""
        return SASPortSensor.SENSOR_NAME

    @staticmethod
    def impact():
        """Returns impact of the module."""
        return "Server SAS ports can not be monitored."

    def __init__(self, utility_instance=None):
        """init method"""
        super(SASPortSensor, self).__init__(self.SENSOR_NAME, self.PRIORITY)

        # Initialize the utility instance
        self._utility_instance = utility_instance

        self.phy_dir_to_linkrate_mapping = None

        # Flag to indicate suspension of module
        self._suspended = False
        self._count = 0
        self.phy_link_count = 0
        self.sas_ports_status = {}
        self.port_phy_list_dict = {}
        self.sas_phy_stored_alert = None
        self.os_utils = OSUtils()

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread and InternalMsgQ
        super(SASPortSensor, self).initialize(conf_reader)
        super(SASPortSensor, self).initialize_msgQ(msgQlist)

        self._node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, 'SN01')

        # Get the sas port implementor from configuration
        sas_port_utility = Conf.get(
            SSPL_CONF, f"{self.name().capitalize()}>{self.PROBE}", "sysfs")

        self.polling_interval = int(Conf.get(
            SSPL_CONF, f"{self.SENSOR_NAME.upper()}>{self.POLLING_INTERVAL}",
            self.DEFAULT_POLLING_INTERVAL))

        self.HOST_ID = SAS().get_host_list()[0].replace('host', '')
        self.RESOURCE_ID = SAS_RESOURCE_ID + self.HOST_ID  # eg. SASHBA-0 if host_id=0

        # Creating the instance of ToolFactory class
        self.tool_factory = ToolFactory()
        cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
        self.SAS_PORT_SENSOR_DATA = os.path.join(
            cache_dir_path, f'SAS_PORT_SENSOR_DATA_{self._node_id}')

        alert_type = None

        try:
            # Get the instance of the utility using ToolFactory
            self._utility_instance = self._utility_instance or \
                self.tool_factory.get_instance(sas_port_utility)
            self._utility_instance.initialize()

            phy_status = None
            link_value_phy_status_collection = ()

            # Call to the sas phy directory which will return a dictionary
            # which has phy_name to negotiated link rate mapping
            # Ex: {"phy-0:0": "<12.0, Unknown>"}
            self.phy_dir_to_linkrate_mapping = \
                self._utility_instance.get_phy_negotiated_link_rate()

            # Iterate over populated dictionary and restructure it
            # Ex: if phy-0:0 is 12.0/6.0/3.0, it is considered as UP.
# {"phy-0:0": ("link_rate", <Up/Down>)} for phy, value in self.phy_dir_to_linkrate_mapping.items(): if 'Gbit'.lower() in value.strip().lower(): phy_status = 'up' # Increment global phy_link count for UP status self.phy_link_count += 1 else: phy_status = 'fault' link_value_phy_status_collection = (value, phy_status) self.phy_dir_to_linkrate_mapping[ phy] = link_value_phy_status_collection # Get the stored previous alert info self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA) self.check_and_send_alert() except KeyError as key_error: raise Exception( f"Unable to get the instance of {sas_port_utility} " f"utility, {key_error}") except Exception as e: if e == errno.ENOENT: raise Exception("Problem occurred while reading from sas_phy \ directory. directory path doesn't directory.") elif e == errno.EACCES: raise Exception( "Problem occurred while reading from sas_phy directory. \ Not enough permission to read from the directory.") else: raise Exception( f"Problem occurred while reading from sas_phy directory. {e}" ) return True def update_sas_ports_status(self): """ Reads current phy status and updates port connectivity status Assumption : phys will be present in multiples of 4 """ phy_list = [*self.phy_dir_to_linkrate_mapping] phy_list = sort_phy_list(phy_list) # Now we have a sorted list of phys # Phys 0-3 for the 0th sas port, and so on in groups of 4 phys # List containing status of all phys hba = [] for phy in phy_list: if self.phy_dir_to_linkrate_mapping[phy][1] == 'up': hba.append(1) else: hba.append(0) for i in range(0, self.NUM_SAS_PORTS): # Save phy names forming this port for future use self.port_phy_list_dict[i] = phy_list[ self.NUM_PHYS_PER_PORT * i : \ self.NUM_PHYS_PER_PORT * i + self.NUM_PHYS_PER_PORT ] # Check port status s = set(hba[self.NUM_PHYS_PER_PORT * i:self.NUM_PHYS_PER_PORT * i + self.NUM_PHYS_PER_PORT]) if len(s) == 1 and 0 in s: port_status = 'down' elif len(s) == 1 and 1 in s: port_status = 'up' else: port_status = 'degraded' # Store the data self.sas_ports_status[i] = port_status def check_and_send_conn_alert(self): """ Sends conn fault alert if all phys go down Sends conn fault_resolved alert if at least 1 sas port (4 phys) comes up """ # Case 1 : all fault for fault alert cur_all_fault = True # Case 2 : all fault_resolved for fault_resolved alert cur_all_fault_resolved = True # Previous conn alert that was sent prev_conn_alert = self.sas_phy_stored_alert['conn'] # Current for port, value in self.sas_phy_stored_alert.items(): if port in ['version', 'conn']: # This is key for conn alert, skip continue # Case 1 : All faults in current status if value != 'fault': cur_all_fault = False # Case 2 : All fault_resolved in current status elif value != 'fault_resolved': cur_all_fault_resolved = False if prev_conn_alert == 'fault_resolved' and cur_all_fault: # Send conn fault alert alert_type = 'fault' self._generate_alert(alert_type, -1) self.sas_phy_stored_alert['conn'] = alert_type elif prev_conn_alert == 'fault' and cur_all_fault_resolved: # Send conn fault_resolved alert alert_type = 'fault_resolved' self._generate_alert(alert_type, -1) self.sas_phy_stored_alert['conn'] = alert_type def handle_current_version_data(self): """Contains logic to check and send alert if data has version == 1.""" # Compare current status of each port with previous alert_type for port, value in self.sas_phy_stored_alert.items(): if port in ['version', 'conn']: # Skip continue if value == 'fault_resolved' and \ self.sas_ports_status[port] == 'down': alert_type = 'fault' 
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type
            elif value == 'fault' and \
                    self.sas_ports_status[port] == 'up':
                alert_type = 'fault_resolved'
                self._generate_alert(alert_type, port)
                self.sas_phy_stored_alert[port] = alert_type

        # See if conn failure/conn resolved alert needs to be sent
        self.check_and_send_conn_alert()
        # Save data to store
        store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

    def check_and_send_alert(self):
        """Checks whether conditions are met and sends alert if required
        Alerts will be sent if -
        1. All 4 phys of a sas port go up -> down : fault alert
        2. All 4 phys of a sas port come down -> up : fault_resolved alert
        Sensor data stored in persistent storage is a dict of
        { sas_port_number : alert_type }
        """
        # Update sas ports status
        self.update_sas_ports_status()

        # Check the version of stored alert
        version = None
        try:
            # Try to get the version
            # Exception will be raised if stored alert is None or no version
            # is available
            version = self.sas_phy_stored_alert['version']
        except Exception:
            logger.warn(f"Found no data or old data format for SASPortSensor, "
                        f"updating data format to version "
                        f"{self.CURRENT_DATA_VERSION}")
            # Versioning is not implemented or there is no data, write new data
            # Initialize dummy fault_resolved for all sas ports and conn
            self.sas_phy_stored_alert = {}
            self.sas_phy_stored_alert['version'] = self.CURRENT_DATA_VERSION
            self.sas_phy_stored_alert['conn'] = 'fault_resolved'
            for i in range(0, self.NUM_SAS_PORTS):
                self.sas_phy_stored_alert[i] = 'fault_resolved'
            # Save data to store
            store.put(self.sas_phy_stored_alert, self.SAS_PORT_SENSOR_DATA)

        if version == self.CURRENT_DATA_VERSION:
            self.handle_current_version_data()

    def run(self):
        """Run the sensor on its own thread"""
        alert_type = None
        status = None

        new_phy_up = 0
        new_phy_down = 0

        # Do not proceed if module is suspended
        if self._suspended:
            self._scheduler.enter(self.polling_interval, self._priority,
                                  self.run, ())
            return

        # Check for debug mode being activated
        self._read_my_msgQ_noWait()

        try:
            phy_link_rate_dict = \
                self._utility_instance.get_phy_negotiated_link_rate()
            if phy_link_rate_dict:
                for key, value in phy_link_rate_dict.items():
                    link_rate = value.strip()
                    prev_linkrate_value = \
                        self.phy_dir_to_linkrate_mapping[key][0].strip()
                    prev_alert_type = \
                        self.phy_dir_to_linkrate_mapping[key][1].strip()
                    status = prev_alert_type

                    # Compare local dict wrt global dictionary for change in
                    # the negotiated link rate
                    if link_rate.lower() != prev_linkrate_value.lower():
                        # If the current link rate has no value like 12/6/3
                        # Gbit and previously it was up, then it's a fault
                        # condition
                        if 'Gbit'.lower() not in link_rate.lower() and \
                                prev_alert_type.lower() == 'up':
                            # Increment count for new phys that went down and
                            # were up previously
                            new_phy_down += 1

                            # Mark respective phy_status as fault
                            status = 'fault'

                        # Check if 12/6/3 Gbit is there in the current link
                        # rate and the previous alert_type is fault. If so, it
                        # means the phy is up again
                        elif 'Gbit'.lower() in link_rate.lower() and \
                                prev_alert_type.lower() == 'fault':
                            # Mark respective phy_status as up
                            status = 'up'
                            # Increment count for new phy up
                            new_phy_up += 1

                    # Finally update the global dict with current link rate
                    # and respective phy status
                    self.phy_dir_to_linkrate_mapping[key] = (link_rate, status)

                # Get current phy status i.e. number of up phys
                new_phy_link_count = self.phy_link_count + new_phy_up - new_phy_down

                # Get the last sent alert info
                self.sas_phy_stored_alert = store.get(self.SAS_PORT_SENSOR_DATA)
                self.check_and_send_alert()
                # Update current active phy count for next iteration
                self.phy_link_count = new_phy_link_count

        except Exception as ae:
            raise Exception(ae)

        # Fire every polling_interval seconds (30 by default) to see if
        # there's a change in the phy status
        self._scheduler.enter(self.polling_interval, self._priority,
                              self.run, ())

    def _create_json_message(self, alert_type, port):
        """Creates a defined json message structure which can flow inside SSPL
        modules"""
        internal_json_msg = None
        severity_reader = SeverityReader()
        severity = severity_reader.map_severity(alert_type)
        epoch_time = str(int(time.time()))

        alert_id = self._get_alert_id(epoch_time)
        host_name = self.os_utils.get_fqdn()

        specific_info = {}
        specific_info_list = []
        description = "N/A"

        # specific_info will contain all 16 phys for a conn level alert and
        # only 4 phys for a port level alert
        for key, val in self.phy_dir_to_linkrate_mapping.items():
            if port != -1:
                # This is a port level alert, skip phys that are not relevant
                if key not in self.port_phy_list_dict[port]:
                    # Skip adding this phy
                    continue
            # Key will be phy-1:0.
            # Here phy-1:0 represents the 0th phy for SASHBA-1.
            specific_info["resource_id"] = key
            specific_info["negotiated_link_rate"] = \
                self.phy_dir_to_linkrate_mapping[key][0].strip()
            specific_info_list.append(specific_info)
            specific_info = {}

        alert_specific_info = specific_info_list

        if port == -1:
            # This is a SAS HBA level connection alert
            if alert_type == 'fault':
                description = ("SAS connection error detected in SAS HBA %s."
                               % self.RESOURCE_ID)
            elif alert_type == 'fault_resolved':
                description = ("SAS connection re-established in SAS HBA %s."
                               % self.RESOURCE_ID)

            info = {
                "resource_type": self.RESOURCE_TYPE,  # node:interface:sas
                "resource_id": self.RESOURCE_ID,      # eg. SASHBA-1
                "event_time": epoch_time,
                "description": description
            }
        else:
            # This is a port level alert
            if alert_type == 'fault':
                description = (
                    "No connectivity detected on the SAS port %s, possible "
                    "causes could be missing SAS cable, bad cable connection, "
                    "faulty cable or SAS port failure." % port)
            elif alert_type == 'fault_resolved':
                description = "Connection established on SAS port."
            info = {
                # node:interface:sas:port
                "resource_type": self.RESOURCE_TYPE + ':port',
                # eg. sas_port-1:0 represents the 0th port of SASHBA-1
                "resource_id": f'sas_port-{self.HOST_ID}:{port}',
                "event_time": epoch_time,
                "description": description
            }

        internal_json_msg = json.dumps({
            "sensor_request_type": {
                "node_data": {
                    "status": "update",
                    "host_id": host_name,
                    "alert_type": alert_type,
                    "severity": severity,
                    "alert_id": alert_id,
                    "info": info,
                    "specific_info": alert_specific_info
                }
            }
        })

        return internal_json_msg

    def _get_alert_id(self, epoch_time):
        """Returns alert id which is a combination of epoch_time
        and salt value
        """
        salt = str(uuid.uuid4().hex)
        alert_id = epoch_time + salt
        return alert_id

    def _generate_alert(self, alert_type, port):
        """Queues the message to NodeData Message Handler"""
        json_msg = self._create_json_message(alert_type, port)
        if json_msg:
            self._write_internal_msgQ(NodeDataMsgHandler.name(), json_msg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(SASPortSensor, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(SASPortSensor, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(SASPortSensor, self).shutdown()
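# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): with the defaults
# above (4 ports x 4 phys), update_sas_ports_status() groups the sorted phy
# list in blocks of four and derives one status per port. The sample values
# below are hypothetical:
#
#   phy link states : [1, 1, 1, 1,  0, 0, 0, 0,  1, 0, 1, 1,  1, 1, 1, 1]
#   sas_ports_status: {0: 'up',     1: 'down',   2: 'degraded', 3: 'up'}
#
# A port level 'fault' alert is sent when a port whose last recorded alert was
# 'fault_resolved' is now 'down', and 'fault_resolved' when a previously
# faulted port is back 'up'. The connection level alert (port == -1) is sent
# only when the per-port alert states have all become 'fault' (conn fault) or
# all become 'fault_resolved' (conn fault_resolved), relative to the last conn
# alert stored under the 'conn' key.
# ---------------------------------------------------------------------------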