def __init__(self):
    self._max_size = int(Conf.get(
        SSPL_CONF,
        f"{self.RABBITMQPROCESSOR}>{self.LIMIT_CONSUL_MEMORY}",
        50000000))
    self.cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.SSPL_MEMORY_USAGE = os.path.join(
        self.cache_dir_path, 'SSPL_MEMORY_USAGE')
    self._current_size = store.get(self.SSPL_MEMORY_USAGE)
    if self._current_size is None:
        store.put(0, self.SSPL_MEMORY_USAGE)
    self.SSPL_MESSAGE_HEAD_INDEX = os.path.join(
        self.cache_dir_path, 'SSPL_MESSAGE_HEAD_INDEX')
    self._head = store.get(self.SSPL_MESSAGE_HEAD_INDEX)
    if self._head is None:
        store.put(0, self.SSPL_MESSAGE_HEAD_INDEX)
    self.SSPL_MESSAGE_TAIL_INDEX = os.path.join(
        self.cache_dir_path, 'SSPL_MESSAGE_TAIL_INDEX')
    self._tail = store.get(self.SSPL_MESSAGE_TAIL_INDEX)
    if self._tail is None:
        store.put(0, self.SSPL_MESSAGE_TAIL_INDEX)
    self.SSPL_UNSENT_MESSAGES = os.path.join(self.cache_dir_path, 'MESSAGES')
def initialize(self, conf_reader, msgQlist, products):
    """Initialize configuration reader and internal msg queues."""
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(RealStorFanSensor, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(RealStorFanSensor, self).initialize_msgQ(msgQlist)

    self._fanmodule_prcache = os.path.join(
        self.rssencl.frus, self.FAN_MODULES_DIR)

    # Persistence file location. This file stores faulty FanModule data
    self._faulty_fan_file_path = os.path.join(
        self._fanmodule_prcache, "fanmodule_data.json")

    # Load faulty Fan Module data from file if available
    self._faulty_fan_modules_list = store.get(self._faulty_fan_file_path)
    if self._faulty_fan_modules_list is None:
        self._faulty_fan_modules_list = {}
        store.put(self._faulty_fan_modules_list, self._faulty_fan_file_path)

    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize configuration reader and internal msg queues."""
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(RealStorPSUSensor, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(RealStorPSUSensor, self).initialize_msgQ(msgQlist)

    self.psu_prcache = os.path.join(self.rssencl.frus, self.PSUS_DIR)

    # Persistence file location. This file stores faulty PSU data
    self._faulty_psu_file_path = os.path.join(
        self.psu_prcache, "psudata.json")
    self._log_debug(
        f"_faulty_psu_file_path: {self._faulty_psu_file_path}")

    # Load faulty PSU data from file if available
    self._previously_faulty_psus = store.get(self._faulty_psu_file_path)
    if self._previously_faulty_psus is None:
        self._previously_faulty_psus = {}
        store.put(self._previously_faulty_psus, self._faulty_psu_file_path)

    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize configuration reader and internal msg queues."""
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(RealStorControllerSensor, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(RealStorControllerSensor, self).initialize_msgQ(msgQlist)

    self._controller_prcache = os.path.join(
        self.rssencl.frus, self.CONTROLLERS_DIR)

    # Persistence file location. This file stores faulty Controller data
    self._faulty_controller_file_path = os.path.join(
        self._controller_prcache, "controllerdata.json")

    # Load faulty Controller data from file if available
    self._previously_faulty_controllers = store.get(
        self._faulty_controller_file_path)
    if self._previously_faulty_controllers is None:
        self._previously_faulty_controllers = {}
        store.put(self._previously_faulty_controllers,
                  self._faulty_controller_file_path)

    return True
def _rss_build_disk_cache_from_persistent_cache(self):
    """Build the in-memory disk cache from the persistent disk cache files."""
    files = store.get_keys_with_prefix(self.disks_prcache)
    if not files:
        logger.debug("No files in Disk cache folder, ignoring")
        return
    for filename in files:
        if filename.startswith('disk_') and filename.endswith('.json'):
            # Prefer the preserved pre-replacement data if available
            if f"{filename}.prev" in files:
                filename = f"{filename}.prev"
            drive = store.get(self.disks_prcache + filename)
            slotstr = re.findall(r"disk_(\d+).json", filename)[0]
            if not slotstr.isdigit():
                logger.debug(f"slot {slotstr} not numeric, ignoring")
                continue
            slot = int(slotstr)
            if drive:
                sn = drive.get("serial-number", "NA")
                self.memcache_disks[slot] = {"serial-number": sn}
def initialize(self, conf_reader, msgQlist, products):
    """Initialize configuration reader and internal msg queues."""
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(RealStorLogicalVolumeSensor, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(RealStorLogicalVolumeSensor, self).initialize_msgQ(msgQlist)

    self._logical_volume_prcache = os.path.join(
        self.rssencl.frus, self.LOGICAL_VOLUMES_DIR)

    # Persistence file location. This file stores faulty Logical Volume data
    self._faulty_logical_volume_file_path = os.path.join(
        self._logical_volume_prcache, "logicalvolumedata.json")

    # Load faulty Logical Volume data from file if available
    self._previously_faulty_logical_volumes = store.get(
        self._faulty_logical_volume_file_path)
    if self._previously_faulty_logical_volumes is None:
        self._previously_faulty_logical_volumes = {}
        store.put(self._previously_faulty_logical_volumes,
                  self._faulty_logical_volume_file_path)

    return True
def delete(self):
    """Dequeue: drop the message at the head index and release its size."""
    if self.is_empty():
        return
    item = store.get(f"{self.SSPL_UNSENT_MESSAGES}/{self.head}")
    store.delete(f"{self.SSPL_UNSENT_MESSAGES}/{self.head}")
    self.head += 1
    self.current_size -= sys.getsizeof(item)
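# Note: is_empty() is called here and in get() below but is not part of this
# excerpt. A minimal sketch, an assumption rather than the source
# implementation: the queue is empty once head has caught up with tail.
def is_empty(self):
    return self.head == self.tail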
def initialize(self, conf_reader, msgQlist, products):
    """Initialize configuration reader and internal msg queues."""
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(RealStorSideplaneExpanderSensor, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(RealStorSideplaneExpanderSensor, self).initialize_msgQ(msgQlist)

    self._sideplane_exp_prcache = os.path.join(
        self.rssencl.frus, self.SIDEPLANE_EXPANDERS_DIR)

    # Persistence file location.
    # This file stores faulty sideplane expander data
    self._faulty_sideplane_expander_file_path = os.path.join(
        self._sideplane_exp_prcache, "sideplane_expanders_data.json")

    # Load faulty sideplane expander data from file if available
    self._faulty_sideplane_expander_dict = store.get(
        self._faulty_sideplane_expander_file_path)
    if self._faulty_sideplane_expander_dict is None:
        self._faulty_sideplane_expander_dict = {}
        store.put(self._faulty_sideplane_expander_dict,
                  self._faulty_sideplane_expander_file_path)

    return True
def initialize(self, conf_reader, msgQlist, products):
    """Initialize configuration reader and internal msg queues."""
    # Initialize ScheduledMonitorThread and InternalMsgQ
    super(RealStorEnclosureSensor, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(RealStorEnclosureSensor, self).initialize_msgQ(msgQlist)

    self.ENCL_SENSOR_DATA_PATH = os.path.join(
        self.rssencl.encl_cache, 'enclosure_data.json')

    # Get the stored previous alert info
    self.persistent_encl_data = store.get(self.ENCL_SENSOR_DATA_PATH)
    if self.persistent_encl_data:
        # fault_alert is persisted as a string; parse it back to a bool
        self.fault_alert = \
            self.persistent_encl_data['fault_alert'].lower() == "true"
        self.previous_alert_type = \
            self.persistent_encl_data['previous_alert_type']
    else:
        self.persistent_encl_data = {
            'fault_alert': str(self.fault_alert),
            'previous_alert_type': str(self.previous_alert_type),
        }
        store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)

    return True
def __init__(self):
    self._conf_reader = ConfigReader()
    self._max_size = int(
        self._conf_reader._get_value_with_default(
            self.RABBITMQPROCESSOR, self.LIMIT_CONSUL_MEMORY, 50000000))
    self._current_size = store.get("SSPL_MEMORY_USAGE")
    if self._current_size is None:
        store.put(0, "SSPL_MEMORY_USAGE")
    self._head = store.get("SSPL_MESSAGE_HEAD_INDEX")
    if self._head is None:
        store.put(0, "SSPL_MESSAGE_HEAD_INDEX")
    self._tail = store.get("SSPL_MESSAGE_TAIL_INDEX")
    if self._tail is None:
        store.put(0, "SSPL_MESSAGE_TAIL_INDEX")
@classmethod
def from_cache(cls, service_name, unit):
    """Initialize service from cache."""
    data = store.get(f"{CACHE_PATH}/{service_name}")
    service = cls(unit)
    service.new_service_state(data["service_monitor_state"])
    service.state = data["service_state"]
    service.nonactive_enter_timestamp = data["nonactive_enter_timestamp"]
    service.active_enter_timestamp = data["active_enter_timestamp"]
    return service
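# Note: the write side of this cache is not shown in this excerpt. A hedged
# sketch of how a service could be persisted so that from_cache() can restore
# it, assuming the same key layout; the method name cache_service and the
# attribute names read from `service` are hypothetical.
@staticmethod
def cache_service(service_name, service):
    data = {
        "service_monitor_state": service.service_monitor_state,
        "service_state": service.state,
        "nonactive_enter_timestamp": service.nonactive_enter_timestamp,
        "active_enter_timestamp": service.active_enter_timestamp,
    }
    store.put(data, f"{CACHE_PATH}/{service_name}")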
def _check_module_recovered(module):
    """
    Once SSPL is restarted, check the current status of the module after
    a certain recovery cycle time. If the module is running and its
    previous state is fault, raise a fault_resolved alert and update
    the cache.
    """
    module_name = module.name()
    # Wait till the sensor module completes a few run cycles, then
    # raise the module recovery fault_resolved alert.
    polling_cycle_time = Conf.get(
        SSPL_CONF, f"{SSPL_LL_SETTING}>sensor_polling_cycle_time", 60)
    time.sleep(polling_cycle_time)
    if not module.is_running():
        return
    curr_state = "fault_resolved"
    per_data_path = os.path.join(
        module_cache_dir, f'{module_name.upper()}_{node_id}')
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)
    # Check previous state before sending fault resolved alert
    module_persistent_data[module_name] = store.get(per_data_path)
    prev_state = module_persistent_data[module_name].get('prev_state')
    if prev_state and curr_state != prev_state:
        module_persistent_data[module_name] = {"prev_state": curr_state}
        store.put(module_persistent_data[module_name], per_data_path)
        specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
        info = {
            "module_name": module_name,
            "alert_type": curr_state,
            "description": f"{module_name} is recovered",
            "impact": "",
            "recommendation": "",
            "severity": "info",
            "specific_info": specific_info
        }
        jsonMsg = ThreadMonitorMsg(info).getJson()
        module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
@property
def head(self):
    """Store-backed index of the next message to consume."""
    return store.get(self.SSPL_MESSAGE_HEAD_INDEX)
@property
def current_size(self):
    """Store-backed running total of queued message sizes in bytes."""
    return store.get(self.SSPL_MEMORY_USAGE)
def _rss_check_disk_faults(self):
    """Check realstor drive faults and raise fault / fault_resolved alerts."""
    if not self.rssencl.check_system_faults_changed():
        #logger.debug("System faults state _NOT_ changed !!! ")
        return

    try:
        # Extract new system faults
        faults = self.rssencl.latest_faults
        # TODO optimize to avoid nested 'for' loops.
        # Second 'for' loop in check_new_fault()
        self._event = Event()
        if faults:
            for fault in faults:
                #logger.debug("Faulty component-id {0}, IDENT {1}"
                #    .format(fault["component-id"], self.DISK_IDENTIFIER))

                # Check faulting component type
                if self.DISK_IDENTIFIER in fault["component-id"]:
                    # If fault on disk, get disk full info including health
                    if self.rssencl.check_new_fault(fault):
                        # Extract slot from "component-id":"Disk 0.39"
                        slot = fault["component-id"].split()[1].split('.')[1]
                        # Alert sent only if disks_prcache is updated with
                        # latest disk data
                        if self.latest_disks[int(slot)]["health"] != "OK":
                            # get drive data from disk cache
                            disk_info = store.get(
                                self.disks_prcache + f"disk_{slot}.json")
                            # raise alert for disk fault
                            self._rss_raise_disk_alert(
                                self.rssencl.FRU_FAULT, disk_info)
                            # Ensure the msg is sent to rabbitmq or added
                            # in consul for resending.
                            self._event_wait_results.add(
                                self._event.wait(
                                    self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT))
                            self._event.clear()

        # Check for resolved faults
        for cached in self.rssencl.memcache_faults:
            if not any(d.get("component-id", None) == cached["component-id"]
                       for d in self.rssencl.latest_faults) \
                    and self.DISK_IDENTIFIER in cached["component-id"]:
                # Extract slot from "component-id":"Disk 0.39"
                logger.info(
                    f"Found resolved disk fault for {cached['component-id']}")
                slot = cached["component-id"].split()[1].split('.')[1]
                # Alert sent only if disks_prcache is updated with latest
                # disk data
                if self.latest_disks[int(slot)]["health"] == "OK":
                    # get drive data from disk cache
                    disk_info = store.get(
                        self.disks_prcache + f"disk_{slot}.json")
                    # raise alert for resolved disk fault
                    self._rss_raise_disk_alert(
                        self.rssencl.FRU_FAULT_RESOLVED, disk_info)
                    # Ensure the msg is sent to rabbitmq or added in
                    # consul for resending.
                    self._event_wait_results.add(
                        self._event.wait(
                            self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT))
                    self._event.clear()

        # Update the cache only if all messages were sent to rabbitmq
        # or added in consul for resending
        if self._event_wait_results and all(self._event_wait_results):
            self.rssencl.update_memcache_faults()
        self._event_wait_results.clear()
        self._event = None
    except Exception as e:
        logger.exception(f"Error in _rss_check_disk_faults {e}")
def execute_thread(module, msgQlist, conf_reader, product, resume=True):
    """
    Run module as a thread. Recover the module from any error during its
    initialization and run time. If recovery count > 0, the module will be
    recovered from failure up to the maximum number of recovery attempts.
    If it is not recoverable, the module will be shut down and a failure
    alert raised due to its impact. If recovery count = 0, no recovery
    attempt will be made.
    """
    module_name = module.name()
    # Suspend module threads
    if not resume:
        module.suspend()

    # Initialize persistent cache for sensor status
    per_data_path = os.path.join(
        module_cache_dir, f"{module_name.upper()}_{node_id}")
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)

    is_sensor_thread = False
    recovery_count = recovery_interval = 0
    if isinstance(module, SensorThread):
        recovery_count, recovery_interval = _get_recovery_config(module_name)
        is_sensor_thread = True

    attempt = 0
    while attempt <= recovery_count:
        attempt += 1
        try:
            # Each module is passed a reference list to message queues so it
            # can transmit internal messages to other modules as desired
            module.start_thread(conf_reader, msgQlist, product)
        except Exception as err:
            curr_state = "fault"
            err_msg = f"{module_name}, {err}"
            logger.error(err_msg)
            if attempt > recovery_count:
                logger.debug(traceback.format_exc())
                description = f"{module_name} is stopped and unrecoverable. {err_msg}"
                impact = module.impact()
                recommendation = "Restart SSPL service"
                logger.critical(
                    f"{description}. Impact: {impact} "
                    f"Recommendation: {recommendation}")
                # Check previous state of the module and send fault alert
                if os.path.isfile(per_data_path):
                    module_persistent_data[module_name] = store.get(per_data_path)
                    prev_state = module_persistent_data[module_name].get('prev_state')
                    if is_sensor_thread and curr_state != prev_state:
                        module_persistent_data[module_name] = {"prev_state": curr_state}
                        store.put(module_persistent_data[module_name], per_data_path)
                        specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
                        info = {
                            "module_name": module_name,
                            "alert_type": curr_state,
                            "description": description,
                            "impact": impact,
                            "recommendation": recommendation,
                            "severity": "critical",
                            "specific_info": specific_info
                        }
                        jsonMsg = ThreadMonitorMsg(info).getJson()
                        module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
            else:
                logger.debug(f"Recovering {module_name} from failure, "
                             f"attempt: {attempt}")
                time.sleep(recovery_interval)

    # Shut down once no recovery attempt remains
    logger.info(f"Terminating monitoring thread {module_name}")
    module.shutdown()
    retry = 5
    while module.is_running():
        module.shutdown()
        retry -= 1
        if not retry:
            break
        time.sleep(2)
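# Note: _get_recovery_config() is referenced above but not included in this
# excerpt. A plausible sketch, assuming hypothetical recovery_count and
# recovery_interval keys under each module's config section; the real key
# names and defaults may differ.
def _get_recovery_config(module_name):
    recovery_count = int(Conf.get(
        SSPL_CONF, f"{module_name.upper()}>recovery_count", 0))
    recovery_interval = int(Conf.get(
        SSPL_CONF, f"{module_name.upper()}>recovery_interval", 0))
    return recovery_count, recovery_interval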
def rss_cliapi_poll_disks(self, disk):
    """Retrieve realstor disk info using cli api /show/disks"""
    # make ws request
    url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKS)
    if disk != self.RSS_DISK_GET_ALL:
        diskId = disk.partition("0.")[2]
        if diskId.isdigit():
            url = f"{url}/{disk}"
    url = f"{url}/detail"

    response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)
    if not response:
        logger.warn(f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable "
                    f"as ws request {url} failed")
        return
    if response.status_code != self.rssencl.ws.HTTP_OK:
        if url.find(self.rssencl.ws.LOOPBACK) == -1:
            logger.error(f"{self.rssencl.LDR_R1_ENCL}:: http request {url} "
                         f"to poll disks failed with err {response.status_code}")
        return

    # Initialize jresponse so a malformed payload cannot leave it unbound
    jresponse = None
    try:
        jresponse = json.loads(response.content)
    except ValueError as badjson:
        logger.error(f"{url} returned mal-formed json:\n{badjson}")

    if jresponse:
        api_resp = self.rssencl.get_api_status(jresponse['status'])
        #logger.debug("%s api response:%d" % (url.format(), api_resp))
        if api_resp == -1 and response.status_code == self.rssencl.ws.HTTP_OK:
            logger.warn("/show/disks api response unavailable, "
                        "marking success as http code is 200")
            api_resp = 0

        if api_resp == 0:
            drives = jresponse['drives']
            # reset latest drive cache to build new
            self.latest_disks = {}
            self.invalidate_latest_disks_info = False
            for drive in drives:
                slot = drive.get("slot", -1)
                sn = drive.get("serial-number", "NA")
                health = drive.get("health", "NA")
                if slot != -1:
                    self.latest_disks[slot] = {
                        "serial-number": sn,
                        "health": health
                    }
                    # dump drive data to persistent cache
                    dcache_path = f"{self.disks_prcache}disk_{slot}.json"
                    # If drive is replaced, previous drive info needs
                    # to be retained in disk_<slot>.json.prev file and
                    # then only dump new data to disk_<slot>.json
                    path_exists, ret_val = store.exists(dcache_path)
                    if path_exists and ret_val == "Success":
                        prevdrive = store.get(dcache_path)
                        if prevdrive is not None:
                            prevsn = prevdrive.get("serial-number", "NA")
                            prevhealth = prevdrive.get("health", "NA")
                            if prevsn != sn or prevhealth != health:
                                # Rename path
                                store.put(store.get(dcache_path),
                                          dcache_path + ".prev")
                                store.delete(dcache_path)
                                store.put(drive, dcache_path)
                    elif not path_exists and ret_val == "Success":
                        store.put(drive, dcache_path)
                    else:
                        # Invalidate latest disks info if a persistence
                        # store error is encountered
                        logger.warn(f"store.exists {dcache_path} "
                                    f"return value {ret_val}")
                        self.invalidate_latest_disks_info = True
                        break

            if self.invalidate_latest_disks_info is True:
                # Reset latest disks info
                self.latest_disks = {}

    # If no in-memory cache, build from persistent cache
    if not self.memcache_disks:
        self._rss_build_disk_cache_from_persistent_cache()

    # if still no memory cache
    if not self.memcache_disks:
        self.memcache_disks = self.latest_disks
def _check_for_fan_module_fault(self):
    """Iterate over the fan modules list and maintain a dictionary to keep
    track of the previous health of each FRU, so that alert_type can be
    set accordingly."""
    self._fan_modules_list = self._get_fan_modules_list()
    alert_type = None
    if not self._fan_modules_list:
        return

    try:
        for fan_module in self._fan_modules_list:
            fru_status = fan_module.get("health").lower()
            durable_id = fan_module.get("durable-id").lower()
            health_reason = fan_module.get("health-reason").lower()

            if fru_status == self.rssencl.HEALTH_FAULT and \
                    self._check_if_fan_module_is_installed(health_reason):
                if durable_id not in self._faulty_fan_modules_list:
                    alert_type = self.rssencl.FRU_MISSING
                    self._faulty_fan_modules_list[durable_id] = alert_type
                else:
                    prev_alert_type = self._faulty_fan_modules_list[durable_id]
                    if prev_alert_type != self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_MISSING
                        self._faulty_fan_modules_list[durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_FAULT or \
                    fru_status == self.rssencl.HEALTH_DEGRADED:
                if durable_id not in self._faulty_fan_modules_list:
                    alert_type = self.rssencl.FRU_FAULT
                    self._faulty_fan_modules_list[durable_id] = alert_type
                else:
                    prev_alert_type = self._faulty_fan_modules_list[durable_id]
                    if prev_alert_type != self.rssencl.FRU_FAULT:
                        alert_type = self.rssencl.FRU_FAULT
                        self._faulty_fan_modules_list[durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_OK:
                if durable_id in self._faulty_fan_modules_list:
                    prev_alert_type = self._faulty_fan_modules_list[durable_id]
                    if prev_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    else:
                        alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    del self._faulty_fan_modules_list[durable_id]

            # Persist the faulty Fan Module list to file only if any
            # type of alert is generated
            if alert_type:
                internal_json_message = \
                    self._create_internal_json_msg(fan_module, alert_type)
                self._send_json_message(internal_json_message)
                # Wait till msg is sent to message bus or added in consul
                # for resending. If timed out, do not update the cache and
                # revert the in-memory cache, so that the change can be
                # detected in the next iteration.
                if self._event.wait(
                        self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._faulty_fan_modules_list,
                              self._faulty_fan_file_path)
                else:
                    self._faulty_fan_modules_list = store.get(
                        self._faulty_fan_file_path)
                alert_type = None
    except Exception as e:
        logger.exception(e)
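# Note: the wait-then-persist-or-revert idiom above recurs in the PSU,
# controller, logical volume, and sideplane expander sensors below. Reduced
# to its essentials as a sketch for illustration, not project code; the
# egress side is expected to set `event` once the alert is published or
# queued for resend.
def persist_if_acked(event, timeout, data, path):
    if event.wait(timeout):
        # Ack received in time: persist the new in-memory state.
        store.put(data, path)
        return data
    # Timed out: revert to the last persisted state so the change is
    # detected again on the next polling iteration.
    return store.get(path)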
def _get_msgs_for_faulty_psus(self, psus, send_message=True):
    """Check health of psus and return list of messages to be sent
    to handler if there are any.
    """
    self._log_debug(
        f"RealStorPSUSensor._get_msgs_for_faulty_psus -> {psus} {send_message}")
    faulty_psu_messages = []
    internal_json_msg = None
    psu_health = None
    durable_id = None
    alert_type = ""
    # Flag to indicate if there is a change in _previously_faulty_psus
    state_changed = False

    if not psus:
        return

    for psu in psus:
        psu_health = psu["health"].lower()
        durable_id = psu["durable-id"]
        psu_health_reason = psu["health-reason"]

        # Check for missing and fault case
        if psu_health == self.rssencl.HEALTH_FAULT:
            self._log_debug(f"Found fault in PSU {durable_id}")
            alert_type = self.rssencl.FRU_FAULT
            # Check for removal
            if self._check_if_psu_not_installed(psu_health_reason):
                alert_type = self.rssencl.FRU_MISSING
            state_changed = not (
                durable_id in self._previously_faulty_psus and
                self._previously_faulty_psus[durable_id]["alert_type"] == alert_type)
            if state_changed:
                self._previously_faulty_psus[durable_id] = {
                    "health": psu_health, "alert_type": alert_type}
                internal_json_msg = self._create_internal_msg(psu, alert_type)
                faulty_psu_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)
        # Check for degraded case
        elif psu_health == self.rssencl.HEALTH_DEGRADED:
            self._log_debug(f"Found degraded in PSU {durable_id}")
            state_changed = durable_id not in self._previously_faulty_psus
            if state_changed:
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_psus[durable_id] = {
                    "health": psu_health, "alert_type": alert_type}
                internal_json_msg = self._create_internal_msg(psu, alert_type)
                faulty_psu_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)
        # Check for healthy case
        elif psu_health == self.rssencl.HEALTH_OK:
            self._log_debug(f"Found ok in PSU {durable_id}")
            state_changed = durable_id in self._previously_faulty_psus
            if state_changed:
                # Send message to handler
                if send_message:
                    previous_alert_type = \
                        self._previously_faulty_psus[durable_id]["alert_type"]
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(psu, alert_type)
                    faulty_psu_messages.append(internal_json_msg)
                    self._send_json_msg(internal_json_msg)
                del self._previously_faulty_psus[durable_id]

        # Persist the faulty PSU list to file only if something changed
        if state_changed:
            # Wait till msg is sent to message bus or added in consul for
            # resending. If timed out, do not update the cache and revert
            # the in-memory cache, so that the change can be detected in
            # the next iteration.
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.put(self._previously_faulty_psus,
                          self._faulty_psu_file_path)
            else:
                self._previously_faulty_psus = store.get(
                    self._faulty_psu_file_path)
            state_changed = False
            alert_type = ""

    return faulty_psu_messages
def _rss_check_disks_presence(self):
    """Match cached realstor disk info with latest retrieved disks info."""
    self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

    if not self.memcache_disks:
        if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn("Last polled drives info in-memory cache "
                        "unavailable, unable to check drive presence change")
        return

    if not self.latest_disks:
        if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn("Latest polled drives info in-memory cache "
                        "unavailable, unable to check drive presence change")
        return

    # keys are disk slot numbers
    removed_disks = set(self.memcache_disks.keys()) - set(self.latest_disks.keys())
    inserted_disks = set(self.latest_disks.keys()) - set(self.memcache_disks.keys())

    # get populated slots in both caches
    populated = set(self.memcache_disks.keys()) & set(self.latest_disks.keys())

    # check for replaced disks
    for slot in populated:
        if self.memcache_disks[slot]['serial-number'] != \
                self.latest_disks[slot]['serial-number']:
            removed_disks.add(slot)
            inserted_disks.add(slot)

    # If no difference is seen between the cached & latest disk lists,
    # no disk removal or insertion happened
    if not (removed_disks or inserted_disks):
        #logger.info("Disk presence state _NOT_ changed !!!")
        return

    self._event = Event()

    for slot in removed_disks:
        # get removed drive data from disk cache
        disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"
        path_exists, _ = store.exists(disk_datafile)
        if not path_exists:
            disk_datafile = f"{self.disks_prcache}disk_{slot}.json"
        disk_info = store.get(disk_datafile)
        # raise alert for missing drive
        self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, disk_info)
        # Wait till msg is sent to message bus or added in consul for
        # resending. If timed out, do not update the cache.
        if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
            store.delete(disk_datafile)
        self._event.clear()
    self._event = None

    for slot in inserted_disks:
        # get inserted drive data from disk cache
        disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json")
        # raise alert for added drive
        self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info)
        # Update health status for inserted disk in memfault cache,
        # to raise a fault alert after insertion if the inserted disk
        # status is not OK.
        if disk_info["health"] != "OK":
            for id_fault, cached_fault in enumerate(self.rssencl.memcache_faults):
                # fetch disk slot from component-id present in memcache_faults
                try:
                    component_id = cached_fault["component-id"]
                    if component_id.startswith('Disk 0'):
                        disk_id = int(component_id.split()[1].split('.')[1])
                        if disk_id == slot:
                            self.rssencl.memcache_faults[id_fault]['health'] = "OK"
                except Exception as e:
                    logger.error("Error in updating health status for "
                                 f"inserted disk in memfault cache {e}")

    # Update cached disk data after comparison
    self.memcache_disks = self.latest_disks
    self.rssencl.memcache_frus.update({"disks": self.memcache_disks})
    return
def _get_msgs_for_faulty_controllers(self, controllers, send_message=True):
    """Check health of controllers and return list of messages to be
    sent to handler if there are any.
    """
    faulty_controller_messages = []
    internal_json_msg = None
    controller_health = None
    durable_id = None
    alert_type = ""
    # Flag to indicate if there is a change in _previously_faulty_controllers
    state_changed = False
    prev_alert_type = None

    if not controllers:
        return

    for controller in controllers:
        controller_health = controller["health"].lower()
        controller_status = controller["status"].lower()
        durable_id = controller["durable-id"]

        # Check for missing and fault case
        if controller_health == self.rssencl.HEALTH_FAULT:
            # Status change from Degraded ==> Fault or OK ==> Fault
            if (durable_id in self._previously_faulty_controllers and
                    self._previously_faulty_controllers[durable_id]['health'] == "degraded") or \
                    (durable_id not in self._previously_faulty_controllers):
                alert_type = self.rssencl.FRU_FAULT
                # Check for removal
                if controller_status == self.rssencl.STATUS_NOTINSTALLED:
                    alert_type = self.rssencl.FRU_MISSING
                self._previously_faulty_controllers[durable_id] = {
                    "health": controller_health, "alert_type": alert_type}
                state_changed = True
                internal_json_msg = self._create_internal_msg(controller, alert_type)
                faulty_controller_messages.append(internal_json_msg)
                # Send message to handler
                if send_message:
                    self._send_json_msg(internal_json_msg)
        # Check for degraded case
        elif controller_health == self.rssencl.HEALTH_DEGRADED:
            # Status change from Fault ==> Degraded or OK ==> Degraded.
            # A controller can also go into degraded state right after
            # installation, so the degraded state can follow a missing alert.
            if (durable_id in self._previously_faulty_controllers and
                    self._previously_faulty_controllers[durable_id]['health'] == "fault") or \
                    (durable_id not in self._previously_faulty_controllers):
                # Use a defaulted lookup so an unseen durable_id cannot
                # raise AttributeError on the chained .get()
                prev_alert_type = self._previously_faulty_controllers.get(
                    durable_id, {}).get('alert_type')
                # If prev_alert_type is missing, raise an insertion alert first
                if prev_alert_type and prev_alert_type.lower() == self.rssencl.FRU_MISSING:
                    alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(controller, alert_type)
                    # send the message to the handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
                # And set alert_type as fault
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_controllers[durable_id] = {
                    "health": controller_health, "alert_type": alert_type}
                internal_json_msg = self._create_internal_msg(controller, alert_type)
                faulty_controller_messages.append(internal_json_msg)
                state_changed = True
                # send the message to the handler
                if send_message:
                    self._send_json_msg(internal_json_msg)
        # Check for healthy case
        elif controller_health == self.rssencl.HEALTH_OK:
            # Status change from Fault ==> OK or Degraded ==> OK
            if durable_id in self._previously_faulty_controllers:
                # Send message to handler
                if send_message:
                    previous_alert_type = \
                        self._previously_faulty_controllers[durable_id]["alert_type"]
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    internal_json_msg = self._create_internal_msg(controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)
                    self._send_json_msg(internal_json_msg)
                del self._previously_faulty_controllers[durable_id]
                state_changed = True
        # Persist the faulty Controller list to file only if something changed
        if state_changed:
            # Wait till msg is sent to message bus or added in consul for
            # resending. If timed out, do not update the cache and revert
            # the in-memory cache, so that the change can be detected in
            # the next iteration.
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.put(self._previously_faulty_controllers,
                          self._faulty_controller_file_path)
            else:
                self._previously_faulty_controllers = store.get(
                    self._faulty_controller_file_path)
            state_changed = False
            alert_type = ""

    return faulty_controller_messages
def get(self):
    """Peek: return the message at the head index without removing it."""
    if self.is_empty():
        return
    item = store.get(f"{self.SSPL_UNSENT_MESSAGES}/{self.head}")
    return item
def _check_for_sideplane_expander_fault(self):
    """Iterate over the sideplane expander list which has some fault.
    Maintain a dictionary to keep track of the previous health of the
    FRU, so that alert_type can be set accordingly."""
    self.unhealthy_components = {}
    self._sideplane_expander_list = self._get_sideplane_expander_list()
    alert_type = None
    # Declare health_recommendation with default value None.
    health_recommendation = None

    missing_health = " ".join(
        ("Check that all I/O modules and power supplies in the enclosure "
         "are fully seated in their slots and that their latches are "
         "locked").split())

    if not self._sideplane_expander_list:
        return

    for sideplane_expander in self._sideplane_expander_list:
        try:
            self.unhealthy_components = \
                sideplane_expander.get("unhealthy-component", [])
            fru_status = sideplane_expander.get("health").lower()
            durable_id = sideplane_expander.get("durable-id").lower()

            if self.unhealthy_components:
                health_recommendation = str(
                    self.unhealthy_components[0]["health-recommendation"])

            # Check for the missing-health text only when a fault response
            # supplies a health recommendation.
            if fru_status == self.rssencl.HEALTH_FAULT and health_recommendation:
                if missing_health.strip(" ") in health_recommendation:
                    if durable_id not in self._faulty_sideplane_expander_dict:
                        alert_type = self.rssencl.FRU_MISSING
                        self._faulty_sideplane_expander_dict[durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_FAULT:
                if durable_id not in self._faulty_sideplane_expander_dict:
                    alert_type = self.rssencl.FRU_FAULT
                    self._faulty_sideplane_expander_dict[durable_id] = alert_type
            elif fru_status == self.rssencl.HEALTH_OK:
                if durable_id in self._faulty_sideplane_expander_dict:
                    previous_alert_type = \
                        self._faulty_sideplane_expander_dict.get(durable_id)
                    alert_type = self.rssencl.FRU_FAULT_RESOLVED
                    if previous_alert_type == self.rssencl.FRU_MISSING:
                        alert_type = self.rssencl.FRU_INSERTION
                    del self._faulty_sideplane_expander_dict[durable_id]

            if alert_type:
                internal_json_message = \
                    self._create_internal_json_message(
                        sideplane_expander, self.unhealthy_components,
                        alert_type)
                self._send_json_message(internal_json_message)
                # Wait till msg is sent to rabbitmq or added in consul for
                # resending. If timed out, do not update the cache and
                # revert the in-memory cache, so that the change can be
                # detected in the next iteration.
                if self._event.wait(
                        self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._faulty_sideplane_expander_dict,
                              self._faulty_sideplane_expander_file_path)
                else:
                    self._faulty_sideplane_expander_dict = store.get(
                        self._faulty_sideplane_expander_file_path)
                alert_type = None
        except Exception as ae:
            logger.exception(ae)
@property
def tail(self):
    """Store-backed index at which the next message will be enqueued."""
    return store.get(self.SSPL_MESSAGE_TAIL_INDEX)
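# Note: delete() assigns to self.head and self.current_size, which implies
# matching property setters that write through to the store. A minimal
# sketch of the write side, assuming the store.put(value, key) signature
# used throughout this section; the put() method is a hypothetical enqueue
# counterpart to get()/delete() and is not part of this excerpt.
@head.setter
def head(self, value):
    store.put(value, self.SSPL_MESSAGE_HEAD_INDEX)

@tail.setter
def tail(self, value):
    store.put(value, self.SSPL_MESSAGE_TAIL_INDEX)

@current_size.setter
def current_size(self, value):
    store.put(value, self.SSPL_MEMORY_USAGE)

def put(self, message):
    # Append at the tail, then advance the tail index and size counter.
    store.put(message, f"{self.SSPL_UNSENT_MESSAGES}/{self.tail}")
    self.tail += 1
    self.current_size += sys.getsizeof(message)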
def _rss_check_disks_presence(self):
    """Match cached realstor disk info with latest retrieved disks info."""
    self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

    if not self.memcache_disks:
        if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn("Last polled drives info in-memory cache "
                        "unavailable, unable to check drive presence change")
        return

    if not self.latest_disks:
        if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn("Latest polled drives info in-memory cache "
                        "unavailable, unable to check drive presence change")
        return

    # keys are disk slot numbers
    removed_disks = set(self.memcache_disks.keys()) - set(self.latest_disks.keys())
    inserted_disks = set(self.latest_disks.keys()) - set(self.memcache_disks.keys())

    # get populated slots in both caches
    populated = set(self.memcache_disks.keys()) & set(self.latest_disks.keys())

    # check for replaced disks
    for slot in populated:
        if self.memcache_disks[slot]['serial-number'] != \
                self.latest_disks[slot]['serial-number']:
            removed_disks.add(slot)
            inserted_disks.add(slot)

    # If no difference is seen between the cached & latest disk lists,
    # no disk removal or insertion happened
    if not (removed_disks or inserted_disks):
        #logger.info("Disk presence state _NOT_ changed !!!")
        return

    self._event = Event()

    for slot in removed_disks:
        # get removed drive data from disk cache
        disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"
        path_exists, _ = store.exists(disk_datafile)
        if not path_exists:
            disk_datafile = f"{self.disks_prcache}disk_{slot}.json"
        disk_info = store.get(disk_datafile)
        # raise alert for missing drive
        self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, disk_info)
        # Wait till msg is sent to rabbitmq or added in consul for
        # resending. If timed out, do not update the cache.
        if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
            store.delete(disk_datafile)
        self._event.clear()
    self._event = None

    for slot in inserted_disks:
        # get inserted drive data from disk cache
        disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json")
        # raise alert for added drive
        self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info)

    # Update cached disk data after comparison
    self.memcache_disks = self.latest_disks
    self.rssencl.memcache_frus.update({"disks": self.memcache_disks})
    return
def _get_msgs_for_faulty_logical_volumes(self, logical_volumes, disk_group,
                                         send_message=True):
    """Check health of logical volumes and return list of messages to be
    sent to handler if there are any.
    """
    faulty_logical_volume_messages = []
    internal_json_msg = None
    logical_volume_health = None
    serial_number = None
    alert_type = ""
    # Flag to indicate if there is a change in _previously_faulty_logical_volumes
    state_changed = False

    if not logical_volumes:
        return

    for logical_volume in logical_volumes:
        logical_volume_health = logical_volume["health"].lower()
        serial_number = logical_volume["serial-number"]

        # Check for missing and fault case
        if logical_volume_health == self.rssencl.HEALTH_FAULT:
            # Status change from Degraded ==> Fault or OK ==> Fault
            if (serial_number in self._previously_faulty_logical_volumes and
                    self._previously_faulty_logical_volumes[serial_number]['health'] == "degraded") or \
                    (serial_number not in self._previously_faulty_logical_volumes):
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_logical_volumes[serial_number] = {
                    "health": logical_volume_health,
                    "alert_type": alert_type
                }
                state_changed = True
        # Check for degraded case
        elif logical_volume_health == self.rssencl.HEALTH_DEGRADED:
            # Status change from Fault ==> Degraded or OK ==> Degraded
            if (serial_number in self._previously_faulty_logical_volumes and
                    self._previously_faulty_logical_volumes[serial_number]['health'] == "fault") or \
                    (serial_number not in self._previously_faulty_logical_volumes):
                alert_type = self.rssencl.FRU_FAULT
                self._previously_faulty_logical_volumes[serial_number] = {
                    "health": logical_volume_health,
                    "alert_type": alert_type
                }
                state_changed = True
        # Check for healthy case
        elif logical_volume_health == self.rssencl.HEALTH_OK:
            # Status change from Fault ==> OK or Degraded ==> OK
            if serial_number in self._previously_faulty_logical_volumes:
                alert_type = self.rssencl.FRU_FAULT_RESOLVED
                del self._previously_faulty_logical_volumes[serial_number]
                state_changed = True

        # Persist the faulty Logical Volume list to file only if
        # something changed
        if state_changed:
            # Generate the alert contents
            internal_json_msg = self._create_internal_msg_lvol(
                logical_volume, alert_type, disk_group)
            faulty_logical_volume_messages.append(internal_json_msg)
            # Send message to handler
            if send_message:
                self._send_json_msg(internal_json_msg)
            # Wait till msg is sent to rabbitmq or added in consul for
            # resending. If timed out, do not update the cache and revert
            # the in-memory cache, so that the change can be detected in
            # the next iteration.
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.put(self._previously_faulty_logical_volumes,
                          self._faulty_logical_volume_file_path)
            else:
                self._previously_faulty_logical_volumes = store.get(
                    self._faulty_logical_volume_file_path)
            state_changed = False
        alert_type = ""

    return faulty_logical_volume_messages
def get_system_status(self):
    """Retrieve realstor system state info using cli api /show/system"""

    # poll system gets invoked through multiple realstor sensors with
    # less frequency compared to the configured polling frequency;
    # this check enforces compliance with the polling frequency
    elapsed = time.time() - self.poll_system_ts
    if elapsed < self.pollfreq:
        logger.warn("/show/system request came in {0} seconds, "
                    "while configured polling frequency is {1} seconds, "
                    "ignoring".format(elapsed, self.pollfreq))
        return

    system = None

    # make ws request
    url = self.build_url(self.URI_CLIAPI_SHOWSYSTEM)
    #logger.info("show system url: %s" % url)

    response = self.ws_request(url, self.ws.HTTP_GET)
    if not response:
        logger.warn("System status unavailable as ws request failed")
        return
    if response.status_code != self.ws.HTTP_OK:
        logger.info("{0}:: http request {1} polling system status failed"
                    " with http err {2}".format(
                        self.LDR_R1_ENCL, url, response.status_code))
        return

    self.poll_system_ts = time.time()

    # Initialize jresponse so a malformed payload cannot leave it unbound
    jresponse = None
    try:
        jresponse = json.loads(response.content)
    except ValueError as badjson:
        logger.error("%s returned mal-formed json:\n%s" % (url, badjson))

    if jresponse:
        api_resp = self.get_api_status(jresponse['status'])
        if api_resp == -1 and response.status_code == self.ws.HTTP_OK:
            logger.warn("/show/system api response unavailable, "
                        "marking success as http code is 200")
            api_resp = 0

        if api_resp == 0:
            system = jresponse['system'][0]
            self.memcache_system = system

        if system:
            # Check if fault exists
            # TODO: use `self.FAULT_KEY in system`; system.keys() generates
            # a list, find the item in that.
            if self.FAULT_KEY not in system.keys():
                logger.debug("{0} Healthy, no faults seen".format(
                    self.LDR_R1_ENCL))
                self.latest_faults = {}
                return

            # Extract system faults
            self.latest_faults = system[self.FAULT_KEY]

            # If no in-memory fault cache is built yet
            if not self.memcache_faults:
                # build from persistent cache if available
                logger.info(
                    "No cached faults, building from persistent cache {0}"
                    .format(self.faults_persistent_cache))
                self.memcache_faults = store.get(self.faults_persistent_cache)

                # if still none, build from latest faults & persist
                if not self.memcache_faults:
                    logger.info("No persistent faults cache, building "
                                "cache from latest faults")
                    self.memcache_faults = self.latest_faults

                    # On SSPL boot, run through existing faults as there
                    # is no cache to verify new faults against
                    self.existing_faults = True
                    #logger.debug("existing_faults {0}"
                    #    .format(self.existing_faults))
                    store.put(self.memcache_faults,
                              self.faults_persistent_cache)
            else:
                # Reset flag as existing faults are processed by now
                # and cached faults are already built
                self.existing_faults = False
        else:
            logger.error("poll system failed with err %d" % api_resp)
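# Note: a hypothetical sketch of how a sensor's periodic run cycle could tie
# the pieces in this section together; the _scheduler attribute and
# _priority value are assumptions for illustration, not the project's
# actual run() method.
def run(self):
    self.rssencl.get_system_status()     # refresh system state and faults
    self._rss_check_disks_presence()     # insertion / removal alerts
    self._rss_check_disk_faults()        # fault / fault_resolved alerts
    # Re-schedule the next poll after the configured interval.
    self._scheduler.enter(self.pollfreq, self._priority, self.run, ())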