Beispiel #1
0
    def __init__(self):
        """Load the size limit and the persisted bookkeeping values.

        The maximum size is read from the SSPL configuration (default
        50000000). The memory usage, head index and tail index are read
        from the store; a value that is not stored yet is persisted as 0
        and the matching in-memory attribute is set to 0 as well, instead
        of being left as None.
        """
        self._max_size = int(
            Conf.get(SSPL_CONF,
                     f"{self.RABBITMQPROCESSOR}>{self.LIMIT_CONSUL_MEMORY}",
                     50000000))

        self.cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)

        def _load_or_init(key):
            """Return the value stored under key, persisting 0 if absent."""
            value = store.get(key)
            if value is None:
                value = 0
                store.put(value, key)
            return value

        self.SSPL_MEMORY_USAGE = os.path.join(self.cache_dir_path,
                                              'SSPL_MEMORY_USAGE')
        self._current_size = _load_or_init(self.SSPL_MEMORY_USAGE)

        self.SSPL_MESSAGE_HEAD_INDEX = os.path.join(self.cache_dir_path,
                                                    'SSPL_MESSAGE_HEAD_INDEX')
        self._head = _load_or_init(self.SSPL_MESSAGE_HEAD_INDEX)

        self.SSPL_MESSAGE_TAIL_INDEX = os.path.join(self.cache_dir_path,
                                                    'SSPL_MESSAGE_TAIL_INDEX')
        self._tail = _load_or_init(self.SSPL_MESSAGE_TAIL_INDEX)

        # Key prefix under which the individual messages are stored
        self.SSPL_UNSENT_MESSAGES = os.path.join(self.cache_dir_path,
                                                 'MESSAGES')
Beispiel #2
0
    def initialize(self, conf_reader, msgQlist, products):
        """Prepare the sensor and load the persisted faulty fan module data.

        Runs the base-class initialization for the configuration reader
        and the internal message queues, derives the persistent cache
        paths and loads the stored faulty fan module data. When nothing
        is stored yet, an empty dict is used and written to the store.

        The products argument is not used here. Returns True.
        """
        # Base-class setup (ScheduledMonitorThread and InternalMsgQ)
        super(RealStorFanSensor, self).initialize(conf_reader)

        # Internal message queues for this module
        super(RealStorFanSensor, self).initialize_msgQ(msgQlist)

        cache_dir = os.path.join(self.rssencl.frus, self.FAN_MODULES_DIR)
        self._fanmodule_prcache = cache_dir

        # Persistence file that stores the faulty fan module data
        data_path = os.path.join(cache_dir, "fanmodule_data.json")
        self._faulty_fan_file_path = data_path

        persisted = store.get(data_path)
        self._faulty_fan_modules_list = {} if persisted is None else persisted
        if persisted is None:
            # Nothing stored yet: persist the empty default
            store.put(self._faulty_fan_modules_list, data_path)

        return True
    def initialize(self, conf_reader, msgQlist, products):
        """Prepare the sensor and load the persisted faulty PSU data.

        Runs the base-class initialization for the configuration reader
        and the internal message queues, derives the persistent cache
        paths and loads the stored faulty PSU data. When nothing is
        stored yet, an empty dict is used and written to the store.

        The products argument is not used here. Returns True.
        """
        # Base-class setup (ScheduledMonitorThread and InternalMsgQ)
        super(RealStorPSUSensor, self).initialize(conf_reader)

        # Internal message queues for this module
        super(RealStorPSUSensor, self).initialize_msgQ(msgQlist)

        cache_dir = os.path.join(self.rssencl.frus, self.PSUS_DIR)
        self.psu_prcache = cache_dir

        # Persistence file that stores the faulty PSU data
        self._faulty_psu_file_path = os.path.join(cache_dir, "psudata.json")
        self._log_debug(
            f"_faulty_psu_file_path: {self._faulty_psu_file_path}")

        persisted = store.get(self._faulty_psu_file_path)
        self._previously_faulty_psus = {} if persisted is None else persisted
        if persisted is None:
            # Nothing stored yet: persist the empty default
            store.put(self._previously_faulty_psus,
                      self._faulty_psu_file_path)

        return True
Beispiel #4
0
    def initialize(self, conf_reader, msgQlist, products):
        """Prepare the sensor and load the persisted faulty controller data.

        Runs the base-class initialization for the configuration reader
        and the internal message queues, derives the persistent cache
        paths and loads the stored faulty controller data. When nothing
        is stored yet, an empty dict is used and written to the store.

        The products argument is not used here. Returns True.
        """
        # Base-class setup (ScheduledMonitorThread and InternalMsgQ)
        super(RealStorControllerSensor, self).initialize(conf_reader)

        # Internal message queues for this module
        super(RealStorControllerSensor, self).initialize_msgQ(msgQlist)

        cache_dir = os.path.join(self.rssencl.frus, self.CONTROLLERS_DIR)
        self._controller_prcache = cache_dir

        # Persistence file that stores the faulty controller data
        data_path = os.path.join(cache_dir, "controllerdata.json")
        self._faulty_controller_file_path = data_path

        persisted = store.get(data_path)
        self._previously_faulty_controllers = (
            {} if persisted is None else persisted)
        if persisted is None:
            # Nothing stored yet: persist the empty default
            store.put(self._previously_faulty_controllers, data_path)

        return True
    def _rss_build_disk_cache_from_persistent_cache(self):
        """Rebuild the in-memory disk cache from the persistent disk cache.

        Scans the keys found under self.disks_prcache for entries named
        disk_<slot>.json and records each drive's serial number in
        self.memcache_disks, keyed by the integer slot. When a matching
        disk_<slot>.json.prev entry exists, it is read instead of the
        current entry. Entries whose slot part is not numeric are logged
        and skipped.
        """
        files = store.get_keys_with_prefix(self.disks_prcache)

        if not files:
            logger.debug("No files in Disk cache folder, ignoring")
            return

        for filename in files:
            if not (filename.startswith('disk_')
                    and filename.endswith('.json')):
                continue

            # Validate the slot before touching the store. The previous
            # findall(...)[0] raised IndexError for names such as
            # disk_x.json, so the "not numeric" path was never reached.
            slot_match = re.fullmatch(r"disk_(\d+)\.json", filename)
            if slot_match is None:
                logger.debug(f"slot in {filename} not numeric, ignoring")
                continue
            slot = int(slot_match.group(1))

            # Prefer the retained previous drive data when it exists
            cache_key = filename
            if f"{filename}.prev" in files:
                cache_key = f"{filename}.prev"

            drive = store.get(self.disks_prcache + cache_key)
            if drive:
                sn = drive.get("serial-number", "NA")
                self.memcache_disks[slot] = {"serial-number": sn}
    def initialize(self, conf_reader, msgQlist, products):
        """Prepare the sensor and load the persisted faulty disk group data.

        Runs the base-class initialization for the configuration reader
        and the internal message queues, derives the persistent cache
        paths and loads the stored faulty logical volume data. When
        nothing is stored yet, an empty dict is used and written to the
        store.

        The products argument is not used here. Returns True.
        """
        # Base-class setup (ScheduledMonitorThread and InternalMsgQ)
        super(RealStorLogicalVolumeSensor, self).initialize(conf_reader)

        # Internal message queues for this module
        super(RealStorLogicalVolumeSensor, self).initialize_msgQ(msgQlist)

        cache_dir = os.path.join(self.rssencl.frus, self.LOGICAL_VOLUMES_DIR)
        self._logical_volume_prcache = cache_dir

        # Persistence file that stores the faulty logical volume data
        data_path = os.path.join(cache_dir, "logicalvolumedata.json")
        self._faulty_disk_group_file_path = data_path

        persisted = store.get(data_path)
        self._previously_faulty_disk_groups = (
            {} if persisted is None else persisted)
        if persisted is None:
            # Nothing stored yet: persist the empty default
            store.put(self._previously_faulty_disk_groups, data_path)

        return True
 def delete(self):
     """Remove the entry at the head index and update the bookkeeping.

     Does nothing when is_empty() is true. Otherwise the head entry is
     read and deleted from the store, the head index is advanced by one
     and the tracked size is reduced by sys.getsizeof() of the removed
     entry.
     """
     if not self.is_empty():
         # Read the entry first so its size can be subtracted afterwards
         removed = store.get(f"{self.SSPL_UNSENT_MESSAGES}/{self.head}")
         store.delete(f"{self.SSPL_UNSENT_MESSAGES}/{self.head}")
         self.head = self.head + 1
         self.current_size = self.current_size - sys.getsizeof(removed)
    def initialize(self, conf_reader, msgQlist, products):
        """Prepare the sensor and load the persisted sideplane expander data.

        Runs the base-class initialization for the configuration reader
        and the internal message queues, derives the persistent cache
        paths and loads the stored faulty sideplane expander data. When
        nothing is stored yet, an empty dict is used and written to the
        store.

        The products argument is not used here. Returns True.
        """
        # Base-class setup (ScheduledMonitorThread and InternalMsgQ)
        super(RealStorSideplaneExpanderSensor, self).initialize(conf_reader)

        # Internal message queues for this module
        super(RealStorSideplaneExpanderSensor, self).initialize_msgQ(msgQlist)

        cache_dir = os.path.join(self.rssencl.frus,
                                 self.SIDEPLANE_EXPANDERS_DIR)
        self._sideplane_exp_prcache = cache_dir

        # Persistence file that stores the faulty sideplane expander data
        data_path = os.path.join(cache_dir, "sideplane_expanders_data.json")
        self._faulty_sideplane_expander_file_path = data_path

        persisted = store.get(data_path)
        self._faulty_sideplane_expander_dict = (
            {} if persisted is None else persisted)
        if persisted is None:
            # Nothing stored yet: persist the empty default
            store.put(self._faulty_sideplane_expander_dict, data_path)

        return True
    def initialize(self, conf_reader, msgQlist, products):
        """Prepare the sensor and restore the previously stored alert info.

        Runs the base-class initialization for the configuration reader
        and the internal message queues, then loads enclosure_data.json
        from the enclosure cache. If data is stored, fault_alert and
        previous_alert_type are restored from it; otherwise the current
        values of both attributes are written to the store as strings.

        The products argument is not used here. Returns True.
        """
        # Base-class setup (ScheduledMonitorThread and InternalMsgQ)
        super(RealStorEnclosureSensor, self).initialize(conf_reader)

        # Internal message queues for this module
        super(RealStorEnclosureSensor, self).initialize_msgQ(msgQlist)

        self.ENCL_SENSOR_DATA_PATH = os.path.join(self.rssencl.encl_cache,
                                                  'enclosure_data.json')

        self.persistent_encl_data = store.get(self.ENCL_SENSOR_DATA_PATH)
        if not self.persistent_encl_data:
            # No previous alert info: persist the current in-memory values
            self.persistent_encl_data = {
                'fault_alert': str(self.fault_alert),
                'previous_alert_type': str(self.previous_alert_type),
            }
            store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)
            return True

        # The flag is stored as text; only "true" (any case) means set
        stored = self.persistent_encl_data
        self.fault_alert = stored['fault_alert'].lower() == "true"
        self.previous_alert_type = stored['previous_alert_type']

        return True
    def __init__(self):
        """Load the size limit and the persisted bookkeeping values.

        The maximum size is read through a ConfigReader (default
        50000000). The memory usage, head index and tail index are read
        from the store; a value that is not stored yet is persisted as 0
        and the matching in-memory attribute is set to 0 as well, instead
        of being left as None.
        """
        self._conf_reader = ConfigReader()
        self._max_size = int(
            self._conf_reader._get_value_with_default(self.RABBITMQPROCESSOR,
                                                      self.LIMIT_CONSUL_MEMORY,
                                                      50000000))

        def _load_or_init(key):
            """Return the value stored under key, persisting 0 if absent."""
            value = store.get(key)
            if value is None:
                value = 0
                store.put(value, key)
            return value

        self._current_size = _load_or_init("SSPL_MEMORY_USAGE")
        self._head = _load_or_init("SSPL_MESSAGE_HEAD_INDEX")
        self._tail = _load_or_init("SSPL_MESSAGE_TAIL_INDEX")
 def from_cache(cls, service_name, unit):
     """
     Build a service object for `unit` from its cached state.

     Reads the entry stored under CACHE_PATH/<service_name>, creates a
     new instance with cls(unit) and copies the monitor state, service
     state and both enter timestamps from the cached entry onto it.
     """
     cached = store.get(f"{CACHE_PATH}/{service_name}")
     instance = cls(unit)
     # The monitor state goes through its setter method, the rest are
     # plain attributes
     instance.new_service_state(cached["service_monitor_state"])
     instance.state = cached["service_state"]
     instance.nonactive_enter_timestamp = cached["nonactive_enter_timestamp"]
     instance.active_enter_timestamp = cached["active_enter_timestamp"]
     return instance
Beispiel #12
0
def _check_module_recovered(module):
    """
    Once SSPL is restarted, check current status of the module after
    certain recovery cycle time. If module is running and its previous
    state is fault, raise fault_resolved alert and update cache.

    The function sleeps for the configured sensor polling cycle time
    (default 60) before checking. Nothing is sent when the module is not
    running, when no previous state is stored, or when the previous state
    is already "fault_resolved".
    """
    module_name = module.name()
    # Wait till sensor module completes few run cycle. Then
    # raise module recovery fault_resolved alert.
    polling_cycle_time = Conf.get(
        SSPL_CONF, f"{SSPL_LL_SETTING}>sensor_polling_cycle_time", 60)
    # float() also accepts a value the configuration returns as text
    time.sleep(float(polling_cycle_time))
    if not module.is_running():
        return

    curr_state = "fault_resolved"
    per_data_path = os.path.join(
        module_cache_dir, f'{module_name.upper()}_{node_id}')
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)

    # Check previous state before sending fault resolved alert.
    # store.get() may return None; fall back to an empty dict so the
    # lookup below cannot fail with AttributeError.
    module_persistent_data[module_name] = store.get(per_data_path) or {}
    prev_state = module_persistent_data[module_name].get('prev_state')
    if not prev_state or curr_state == prev_state:
        return

    # Persist the new state before the alert is queued
    module_persistent_data[module_name] = {"prev_state": curr_state}
    store.put(module_persistent_data[module_name], per_data_path)
    specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
    info = {
        "module_name": module_name,
        "alert_type": curr_state,
        "description": f"{module_name} is recovered",
        "impact": "",
        "recommendation": "",
        "severity": "info",
        "specific_info": specific_info
    }
    jsonMsg = ThreadMonitorMsg(info).getJson()
    module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
Beispiel #13
0
 def head(self):
     """Return the head index as currently held in the store."""
     head_index = store.get(self.SSPL_MESSAGE_HEAD_INDEX)
     return head_index
Beispiel #14
0
 def current_size(self):
     """Return the memory usage value as currently held in the store."""
     usage = store.get(self.SSPL_MEMORY_USAGE)
     return usage
    def _rss_check_disk_faults(self):
        """Raise alerts for new and resolved disk faults.

        Does nothing unless self.rssencl.check_system_faults_changed()
        reports a change. For each latest fault whose component-id
        contains DISK_IDENTIFIER and that check_new_fault() accepts, a
        FRU_FAULT alert is raised when the latest polled health of that
        slot is not "OK". For each entry in memcache_faults that no
        longer appears in latest_faults, a FRU_FAULT_RESOLVED alert is
        raised when the slot's latest health is "OK".

        After each alert the method waits on self._event for up to
        PERSISTENT_DATA_UPDATE_TIMEOUT; the fault memcache is updated
        only when at least one alert was raised and every wait succeeded.
        Exceptions are logged and not propagated. The wait results and
        the event are reset on every exit path, so a failure in one
        cycle cannot leak stale results into the next one.
        """
        if not self.rssencl.check_system_faults_changed():
            return

        def _raise_alert_and_wait(alert_type, slot):
            """Raise alert_type with the cached drive data and record the wait."""
            # Get drive data from the persistent disk cache
            disk_info = store.get(
                self.disks_prcache + "disk_{0}.json".format(slot))
            self._rss_raise_disk_alert(alert_type, disk_info)
            # To ensure all msg is sent to rabbitmq or added in consul
            # for resending.
            self._event_wait_results.add(
                self._event.wait(
                    self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT))
            self._event.clear()

        try:
            # Extract new system faults
            faults = self.rssencl.latest_faults
            # TODO optimize to avoid nested 'for' loops.
            # Second 'for' loop in check_new_fault()
            self._event = Event()
            for fault in faults or ():
                # Check faulting component type; only new disk faults
                # are handled here
                if self.DISK_IDENTIFIER not in fault["component-id"]:
                    continue
                if not self.rssencl.check_new_fault(fault):
                    continue

                # Extract slot from "component-id":"Disk 0.39"
                slot = fault["component-id"].split()[1].split('.')[1]

                # Alert send only if disks_prcache updated with latest
                # disk data
                if self.latest_disks[int(slot)]["health"] != "OK":
                    _raise_alert_and_wait(self.rssencl.FRU_FAULT, slot)

            # Check for resolved faults
            for cached in self.rssencl.memcache_faults:
                still_faulty = any(
                    d.get("component-id", None) == cached["component-id"]
                    for d in self.rssencl.latest_faults)
                if still_faulty or \
                        self.DISK_IDENTIFIER not in cached["component-id"]:
                    continue

                # Extract slot from "component-id":"Disk 0.39"
                logger.info(
                    f"Found resolved disk fault for {cached['component-id']}"
                )
                slot = cached["component-id"].split()[1].split('.')[1]

                # Alert send only if disks_prcache updated with latest
                # disk data
                if self.latest_disks[int(slot)]["health"] == "OK":
                    _raise_alert_and_wait(
                        self.rssencl.FRU_FAULT_RESOLVED, slot)

            # If all messages are sent to rabbitmq or added in consul for
            # resending, then only update cache
            if self._event_wait_results and all(self._event_wait_results):
                self.rssencl.update_memcache_faults()

        except Exception as e:
            logger.exception(f"Error in _rss_check_disk_faults {e}")
        finally:
            # Previously skipped when an exception was raised, leaving
            # stale wait results behind for the next cycle
            self._event_wait_results.clear()
            self._event = None
Beispiel #16
0
def execute_thread(module, msgQlist, conf_reader, product, resume=True):
    """
    Run module as a thread. Recover the module if any error during
    initialization and run time of the module.

    If recovery count>0,
        module will be recovered from failure until the maximum recovery
        attempt. If not recoverable, corresponding module will be shutdown
        and failure alert will be raised due to its impact.
    If recovery count=0,
        no recovery attempt will be made.

    The recovery count and interval are only read for SensorThread
    modules; for any other module both are 0. After every failure the
    module is shut down, retrying the shutdown up to 5 times while it
    still reports itself as running.
    """
    module_name = module.name()
    # Suspend module threads. The explicit comparison is kept on purpose:
    # only a false-valued flag suspends the module, None does not.
    if resume == False:
        module.suspend()

    # Initialize persistent cache for sensor status
    per_data_path = os.path.join(
        module_cache_dir, f"{module_name.upper()}_{node_id}")
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)

    is_sensor_thread = False
    recovery_count = recovery_interval = 0
    if isinstance(module, SensorThread):
        recovery_count, recovery_interval = _get_recovery_config(module_name)
        is_sensor_thread = True

    attempt = 0

    while attempt <= recovery_count:
        attempt += 1
        try:
            # Each module is passed a reference list to message queues so it
            # can transmit internal messages to other modules as desired
            module.start_thread(conf_reader, msgQlist, product)
        except Exception as err:
            curr_state = "fault"
            err_msg = f"{module_name}, {err}"
            logger.error(err_msg)
            if attempt > recovery_count:
                logger.debug(traceback.format_exc())
                description = f"{module_name} is stopped and unrecoverable. {err_msg}"
                impact = module.impact()
                recommendation = "Restart SSPL service"
                logger.critical(
                    f"{description}. Impact: {impact} Recommendation: {recommendation}")
                # Check previous state of the module and send fault alert
                if os.path.isfile(per_data_path):
                    module_persistent_data[module_name] = store.get(per_data_path)
                # The entry may be missing (the file existed at start-up
                # but is gone now) or None (store.get() found nothing).
                # Treat both as "no previous state" so this handler does
                # not raise again and suppress the fault alert.
                persisted = module_persistent_data.get(module_name) or {}
                prev_state = persisted.get('prev_state')
                if is_sensor_thread and curr_state != prev_state:
                    module_persistent_data[module_name] = {"prev_state": curr_state}
                    store.put(module_persistent_data[module_name], per_data_path)
                    specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
                    info = {
                        "module_name": module_name,
                        "alert_type": curr_state,
                        "description": description,
                        "impact": impact,
                        "recommendation": recommendation,
                        "severity": "critical",
                        "specific_info": specific_info
                    }
                    jsonMsg = ThreadMonitorMsg(info).getJson()
                    module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
            else:
                logger.debug(f"Recovering {module_name} from failure, "
                             f"attempt: {attempt}")
                time.sleep(recovery_interval)

            # Shut the failed module down before the next attempt (or
            # before giving up); retry a few times while it is running
            logger.info(f"Terminating monitoring thread {module_name}")
            module.shutdown()
            retry = 5
            while module.is_running():
                module.shutdown()
                retry -= 1
                if not retry:
                    break
                time.sleep(2)
    def rss_cliapi_poll_disks(self, disk):
        """Retrieve realstor disk info using cli api /show/disks.

        Polls either all disks (disk == self.RSS_DISK_GET_ALL) or a single
        disk, rebuilds self.latest_disks from the response and mirrors
        each drive's data into the persistent cache as disk_<slot>.json.
        When a drive's serial number or health differs from the cached
        copy, the old copy is first saved as disk_<slot>.json.prev.

        Always returns None. A failed request, a non-OK HTTP status, a
        malformed JSON body or an empty response body ends the poll
        early; the first three are logged (non-OK statuses only for
        non-loopback URLs).
        """
        # make ws request
        url = self.rssencl.build_url(self.rssencl.URI_CLIAPI_SHOWDISKS)

        if disk != self.RSS_DISK_GET_ALL:
            diskId = disk.partition("0.")[2]

            if diskId.isdigit():
                url = f"{url}/{disk}"
        url = f"{url}/detail"

        response = self.rssencl.ws_request(url, self.rssencl.ws.HTTP_GET)

        if not response:
            logger.warn(
                f"{self.rssencl.LDR_R1_ENCL}:: Disks status unavailable as ws request {url} failed"
            )
            return

        if response.status_code != self.rssencl.ws.HTTP_OK:
            if url.find(self.rssencl.ws.LOOPBACK) == -1:
                logger.error(
                    f"{self.rssencl.LDR_R1_ENCL}:: http request {url} to poll disks failed with \
                       err {response.status_code}")
            return

        try:
            jresponse = json.loads(response.content)
        except ValueError as badjson:
            logger.error(f"{url} returned mal-formed json:\n{badjson}")
            # Without a parsed body there is nothing to process. Falling
            # through used to hit an unbound jresponse below.
            return

        if not jresponse:
            return

        api_resp = self.rssencl.get_api_status(jresponse['status'])

        if api_resp == -1 and \
                response.status_code == self.rssencl.ws.HTTP_OK:
            logger.warn("/show/disks api response unavailable, "
                        "marking success as http code is 200")
            api_resp = 0

        if api_resp == 0:
            drives = jresponse['drives']

            # reset latest drive cache to build new
            self.latest_disks = {}
            self.invalidate_latest_disks_info = False

            for drive in drives:
                slot = drive.get("slot", -1)
                sn = drive.get("serial-number", "NA")
                health = drive.get("health", "NA")

                if slot == -1:
                    continue

                self.latest_disks[slot] = {
                    "serial-number": sn,
                    "health": health
                }

                # dump drive data to persistent cache
                dcache_path = f"{self.disks_prcache}disk_{slot}.json"

                # If drive is replaced, previous drive info needs
                # to be retained in disk_<slot>.json.prev file and
                # then only dump new data to disk_<slot>.json
                path_exists, ret_val = store.exists(dcache_path)
                if path_exists and ret_val == "Success":
                    prevdrive = store.get(dcache_path)

                    if prevdrive is not None:
                        prevsn = prevdrive.get("serial-number", "NA")
                        prevhealth = prevdrive.get("health", "NA")

                        if prevsn != sn or prevhealth != health:
                            # Rename path: reuse the copy just read
                            # instead of fetching it a second time
                            store.put(prevdrive, dcache_path + ".prev")
                            store.delete(dcache_path)

                            store.put(drive, dcache_path)
                elif not path_exists and ret_val == "Success":
                    store.put(drive, dcache_path)
                else:
                    # Invalidate latest disks info if persistence store
                    # error encountered
                    logger.warn(
                        f"store.exists {dcache_path} return value {ret_val}"
                    )
                    self.invalidate_latest_disks_info = True
                    break

            if self.invalidate_latest_disks_info is True:
                # Reset latest disks info
                self.latest_disks = {}

        # If no in-memory cache, build from persistent cache
        if not self.memcache_disks:
            self._rss_build_disk_cache_from_persistent_cache()

        # if no memory cache still
        if not self.memcache_disks:
            self.memcache_disks = self.latest_disks
Beispiel #18
0
    def _check_for_fan_module_fault(self):
        """Check every fan module and raise an alert on a health change.

        self._faulty_fan_modules_list maps a module's durable id to the
        alert type last recorded for it. A module whose health is
        HEALTH_FAULT and for which _check_if_fan_module_is_installed()
        returns true is recorded as FRU_MISSING; any other HEALTH_FAULT
        or HEALTH_DEGRADED module as FRU_FAULT. An alert is raised only
        when the recorded type changes. A HEALTH_OK module that is still
        recorded is removed and reported as FRU_INSERTION (if it was
        FRU_MISSING) or FRU_FAULT_RESOLVED.

        After each alert the method waits on self._event; on success the
        record is persisted, on timeout the in-memory record is reloaded
        from the store. Exceptions are logged and not propagated.
        """
        alert_type = None
        self._fan_modules_list = self._get_fan_modules_list()

        if not self._fan_modules_list:
            return

        def _record_if_changed(module_id, kind):
            """Record kind for module_id; return it only if that is a change."""
            known = self._faulty_fan_modules_list
            if module_id not in known or known[module_id] != kind:
                known[module_id] = kind
                return kind
            return None

        try:
            for fan_module in self._fan_modules_list:
                fru_status = fan_module.get("health").lower()
                durable_id = fan_module.get("durable-id").lower()
                health_reason = fan_module.get("health-reason").lower()

                if fru_status == self.rssencl.HEALTH_FAULT and \
                        self._check_if_fan_module_is_installed(health_reason):
                    alert_type = _record_if_changed(
                        durable_id, self.rssencl.FRU_MISSING)
                elif fru_status == self.rssencl.HEALTH_FAULT or \
                        fru_status == self.rssencl.HEALTH_DEGRADED:
                    alert_type = _record_if_changed(
                        durable_id, self.rssencl.FRU_FAULT)
                elif fru_status == self.rssencl.HEALTH_OK:
                    if durable_id in self._faulty_fan_modules_list:
                        previous = self._faulty_fan_modules_list.pop(
                            durable_id)
                        if previous == self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_INSERTION
                        else:
                            alert_type = self.rssencl.FRU_FAULT_RESOLVED

                # Nothing to send or persist unless an alert was generated
                if not alert_type:
                    continue

                internal_json_message = \
                    self._create_internal_json_msg(fan_module, alert_type)
                self._send_json_message(internal_json_message)
                # Wait till msg is sent to message bus or added in consul
                # for resending. If timed out, do not update cache and
                # revert in-memory cache. So, in next iteration the change
                # can be detected again.
                if self._event.wait(
                        self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._faulty_fan_modules_list,
                              self._faulty_fan_file_path)
                else:
                    self._faulty_fan_modules_list = store.get(
                        self._faulty_fan_file_path)
                alert_type = None
        except Exception as e:
            logger.exception(e)
    def _get_msgs_for_faulty_psus(self, psus, send_message = True):
        """Check the health of each PSU and build the messages to report.

        self._previously_faulty_psus maps a PSU's durable id to the
        health and alert type last recorded for it. A message is created
        when a PSU newly becomes faulty or missing, when a degraded PSU
        is not yet recorded, or (only when send_message is true) when a
        recorded PSU is healthy again. Messages are passed to
        _send_json_msg() when send_message is true.

        After each state change the method waits on self._event; on
        success the record is persisted, on timeout the in-memory record
        is reloaded from the store.

        Returns the list of created messages, or None when psus is empty.

        NOTE(review): for a healthy PSU the message is only created when
        send_message is true, unlike the fault branches - confirm whether
        that asymmetry is intended.
        """
        self._log_debug(
            f"RealStorPSUSensor._get_msgs_for_faulty_psus -> {psus} {send_message}")
        faulty_psu_messages = []

        if not psus:
            return

        def _emit(psu_entry, alert):
            """Create the message for alert, collect it and optionally send it."""
            message = self._create_internal_msg(psu_entry, alert)
            faulty_psu_messages.append(message)
            # Send message to handler
            if send_message:
                self._send_json_msg(message)

        def _record_and_emit(psu_entry, psu_id, health, alert):
            """Remember the faulty state of psu_id, then emit its message."""
            self._previously_faulty_psus[psu_id] = {
                "health": health, "alert_type": alert}
            _emit(psu_entry, alert)

        for psu in psus:
            psu_health = psu["health"].lower()
            durable_id = psu["durable-id"]
            psu_health_reason = psu["health-reason"]
            # Set when _previously_faulty_psus has to change for this PSU
            state_changed = False

            if psu_health == self.rssencl.HEALTH_FAULT:
                # Missing and fault case
                self._log_debug(f"Found fault in PSU {durable_id}")
                if self._check_if_psu_not_installed(psu_health_reason):
                    alert_type = self.rssencl.FRU_MISSING
                else:
                    alert_type = self.rssencl.FRU_FAULT
                known = self._previously_faulty_psus
                state_changed = (
                    durable_id not in known
                    or known[durable_id]["alert_type"] != alert_type)
                if state_changed:
                    _record_and_emit(psu, durable_id, psu_health, alert_type)
            elif psu_health == self.rssencl.HEALTH_DEGRADED:
                # Degraded is reported as a fault, once
                self._log_debug(f"Found degraded in PSU {durable_id}")
                state_changed = durable_id not in self._previously_faulty_psus
                if state_changed:
                    _record_and_emit(psu, durable_id, psu_health,
                                     self.rssencl.FRU_FAULT)
            elif psu_health == self.rssencl.HEALTH_OK:
                # Healthy case
                self._log_debug(f"Found ok in PSU {durable_id}")
                state_changed = durable_id in self._previously_faulty_psus
                if state_changed:
                    if send_message:
                        previous_alert_type = \
                            self._previously_faulty_psus[durable_id]["alert_type"]
                        if previous_alert_type == self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_INSERTION
                        else:
                            alert_type = self.rssencl.FRU_FAULT_RESOLVED
                        _emit(psu, alert_type)
                    del self._previously_faulty_psus[durable_id]

            # Persist faulty PSU list to file only if something is changed
            if state_changed:
                # Wait till msg is sent to message bus or added in consul
                # for resending. If timed out, do not update cache and
                # revert in-memory cache. So, in next iteration change can
                # be detected
                if self._event.wait(
                        self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_psus,
                              self._faulty_psu_file_path)
                else:
                    self._previously_faulty_psus = store.get(
                        self._faulty_psu_file_path)
        return faulty_psu_messages
Beispiel #20
0
    def _rss_check_disks_presence(self):
        """Diff the cached disk inventory against a fresh poll and raise alerts.

        Polls all disks, then compares ``self.memcache_disks`` (previous
        poll) with ``self.latest_disks`` (current poll); both are keyed by
        disk slot number. A slot whose serial number changed is handled as a
        removal followed by an insertion. FRU_MISSING is raised for every
        removed disk and FRU_INSERTION for every inserted one, after which
        the in-memory disk caches are replaced with the latest poll.

        For an inserted disk whose health is not "OK", matching entries in
        ``self.rssencl.memcache_faults`` are reset to "OK" so that a fault
        alert can be raised for it after the insertion alert.
        """
        self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

        if not self.memcache_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn("Last polled drives info in-memory cache "
                    "unavailable , unable to check drive presence change")
                return
            # NOTE(review): on loopback with an empty cache execution falls
            # through instead of returning; preserved as-is, confirm intent.

        if not self.latest_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn("Latest polled drives info in-memory cache "
                    "unavailable, unable to check drive presence change")
            return

        # keys are disk slot numbers
        previous_slots = set(self.memcache_disks.keys())
        current_slots = set(self.latest_disks.keys())
        removed_disks = previous_slots - current_slots
        inserted_disks = current_slots - previous_slots

        # A slot populated in both polls but with a different serial number
        # holds a replaced disk: report it as removed and as inserted.
        for slot in previous_slots & current_slots:
            if self.memcache_disks[slot]['serial-number'] != \
                    self.latest_disks[slot]['serial-number']:
                removed_disks.add(slot)
                inserted_disks.add(slot)

        # If no difference seen between cached & latest set of disk list,
        # means no disk removal or insertion happened
        if not (removed_disks or inserted_disks):
            return

        self._event = Event()
        for slot in removed_disks:
            # get removed drive data from disk cache, preferring the
            # ".prev" copy when it exists
            disk_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"

            path_exists, _ = store.exists(disk_datafile)
            if not path_exists:
                disk_datafile = f"{self.disks_prcache}disk_{slot}.json"

            disk_info = store.get(disk_datafile)

            # raise alert for missing drive
            self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, disk_info)
            # Wait till msg is sent to message bus or added in consul for resending.
            # If timed out, do not update cache
            if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                store.delete(disk_datafile)
            self._event.clear()
        self._event = None

        for slot in inserted_disks:
            # get inserted drive data from disk cache
            disk_info = store.get(f"{self.disks_prcache}disk_{slot}.json")

            # raise alert for added drive
            self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, disk_info)

            # Update health status for inserted disk in memfault cache,
            # to raise fault alert after insertion if inserted disk status is not OK.
            if disk_info["health"] != "OK":
                for id_fault, cached_fault in enumerate(self.rssencl.memcache_faults):
                    # fetch disk slot from component_id present in memcache_faults.
                    try:
                        component_id = cached_fault["component-id"]
                        if component_id.startswith('Disk 0'):
                            disk_id = int(component_id.split()[1].split('.')[1])
                            if disk_id == slot:
                                self.rssencl.memcache_faults[id_fault]['health'] = "OK"
                    except Exception as e:
                        # Adjacent literals instead of a backslash inside the
                        # string, so no indentation leaks into the message.
                        logger.error("Error in updating health status for "
                                     f"inserted disk in memfault cache {e}")

        # Update cached disk data after comparison
        self.memcache_disks = self.latest_disks
        self.rssencl.memcache_frus.update({"disks": self.memcache_disks})
Beispiel #21
0
    def _get_msgs_for_faulty_controllers(self, controllers, send_message=True):
        """Detect controller health transitions and build the matching alerts.

        Each controller's current health is compared with the entry cached in
        ``self._previously_faulty_controllers`` (keyed by durable id). On a
        transition an internal message is created, optionally sent, and the
        cache is updated and persisted.

        Args:
            controllers: iterable of controller dicts providing the keys
                "health", "status" and "durable-id".
            send_message: when True, each created message is also passed to
                ``self._send_json_msg``.

        Returns:
            The list of created messages, or None when ``controllers`` is
            empty.
        """
        faulty_controller_messages = []

        if not controllers:
            return
        for controller in controllers:
            controller_health = controller["health"].lower()
            controller_status = controller["status"].lower()
            durable_id = controller["durable-id"]
            # Per-controller state; nothing may leak from a previous iteration.
            alert_type = ""
            state_changed = False

            # Check for missing and fault case
            if controller_health == self.rssencl.HEALTH_FAULT:
                cached = self._previously_faulty_controllers.get(durable_id)
                # Status change from Degraded ==> Fault or OK ==> Fault
                if cached is None or cached['health'] == "degraded":
                    alert_type = self.rssencl.FRU_FAULT
                    # Check for removal
                    if controller_status == self.rssencl.STATUS_NOTINSTALLED:
                        alert_type = self.rssencl.FRU_MISSING
                    self._previously_faulty_controllers[durable_id] = {
                        "health": controller_health, "alert_type": alert_type}
                    state_changed = True
                    internal_json_msg = self._create_internal_msg(
                        controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)
                    # Send message to handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)
            # Check for degraded case
            elif controller_health == self.rssencl.HEALTH_DEGRADED:
                cached = self._previously_faulty_controllers.get(durable_id)
                # Status change from Fault ==> Degraded or OK ==> Degraded
                # Controller can also go into degraded state after installation as well
                # So, Degrade state can be after missing alert as well.
                if cached is None or cached['health'] == "fault":
                    # Look up the previous alert for THIS controller only; a
                    # controller that is not cached has no previous alert.
                    prev_alert_type = cached.get('alert_type') if cached else None

                    # If prev_alert_type is missing, then the next alert type will be insertion first
                    if prev_alert_type and prev_alert_type.lower() == self.rssencl.FRU_MISSING:
                        # The insertion message is sent but not added to the
                        # returned list (unchanged from the previous behavior).
                        insertion_msg = self._create_internal_msg(
                            controller, self.rssencl.FRU_INSERTION)

                        # send the message to the handler
                        if send_message:
                            self._send_json_msg(insertion_msg)

                    # And set alert_type as fault
                    alert_type = self.rssencl.FRU_FAULT
                    self._previously_faulty_controllers[durable_id] = {
                        "health": controller_health, "alert_type": alert_type}

                    internal_json_msg = self._create_internal_msg(controller, alert_type)
                    faulty_controller_messages.append(internal_json_msg)

                    state_changed = True

                    # send the message to the handler
                    if send_message:
                        self._send_json_msg(internal_json_msg)

            # Check for healthy case
            elif controller_health == self.rssencl.HEALTH_OK:
                # Status change from Fault ==> OK or Degraded ==> OK
                if durable_id in self._previously_faulty_controllers:
                    # NOTE(review): the recovery message is only built when
                    # send_message is True, so callers passing False do not
                    # get it in the returned list. Preserved; confirm intent.
                    if send_message:
                        previous_alert_type = \
                            self._previously_faulty_controllers[durable_id]["alert_type"]
                        if previous_alert_type == self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_INSERTION
                        else:
                            alert_type = self.rssencl.FRU_FAULT_RESOLVED
                        internal_json_msg = self._create_internal_msg(
                            controller, alert_type)
                        faulty_controller_messages.append(internal_json_msg)
                        self._send_json_msg(internal_json_msg)
                    del self._previously_faulty_controllers[durable_id]
                    state_changed = True
            # Persist faulty Controller list to file only if something is changed
            if state_changed:
                # Wait till msg is sent to message bus or added in consul for resending.
                # If timed out, do not update cache and revert in-memory cache.
                # So, in next iteration change can be detected
                if self._event.wait(self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                    store.put(self._previously_faulty_controllers,
                        self._faulty_controller_file_path)
                else:
                    self._previously_faulty_controllers = store.get(self._faulty_controller_file_path)
        return faulty_controller_messages
 def get(self):
     """Return the item stored under the current head index.

     Returns None when ``self.is_empty()`` is true; otherwise the value read
     from the store at ``<SSPL_UNSENT_MESSAGES>/<head>``.
     """
     if not self.is_empty():
         return store.get(f"{self.SSPL_UNSENT_MESSAGES}/{self.head}")
     return None
    def _check_for_sideplane_expander_fault(self):
        """Raise alerts for sideplane expanders whose health state changed.

        Fetches the sideplane expander list and, for each expander, compares
        its health with ``self._faulty_sideplane_expander_dict`` (durable id
        mapped to the last raised alert type):

        * fault, not yet cached: FRU_MISSING when the first unhealthy
          component's recommendation contains the "not fully seated" text,
          otherwise FRU_FAULT.
        * ok, currently cached: FRU_INSERTION when the cached alert was
          FRU_MISSING, otherwise FRU_FAULT_RESOLVED.

        When an alert is raised the message is sent and the dict is persisted
        once the send is acknowledged; on timeout the in-memory dict is
        reloaded from the persisted copy. Any exception raised while handling
        one expander is logged and processing continues with the next one.
        """
        self.unhealthy_components = {}
        self._sideplane_expander_list = \
            self._get_sideplane_expander_list()

        # Recommendation text that is treated as "FRU missing".
        missing_health = (
            "Check that all I/O modules and power supplies in the enclosure "
            "are fully seated in their slots and that their latches are locked")

        if not self._sideplane_expander_list:
            return

        for sideplane_expander in self._sideplane_expander_list:
            # Reset per-expander state so neither a previous expander's
            # recommendation nor an alert type left behind by an exception
            # can influence this one.
            alert_type = None
            health_recommendation = None
            try:
                self.unhealthy_components = \
                    sideplane_expander.get("unhealthy-component", [])
                fru_status = sideplane_expander.get("health").lower()
                durable_id = sideplane_expander.get("durable-id").lower()

                if self.unhealthy_components:
                    health_recommendation = \
                        str(self.unhealthy_components[0]
                            ["health-recommendation"])

                if fru_status == self.rssencl.HEALTH_FAULT:
                    if durable_id not in self._faulty_sideplane_expander_dict:
                        # A fault is "missing" only when the recommendation
                        # says so; every other fault (with or without a
                        # recommendation) is reported as a plain fault.
                        if health_recommendation and \
                                missing_health in health_recommendation:
                            alert_type = self.rssencl.FRU_MISSING
                        else:
                            alert_type = self.rssencl.FRU_FAULT
                        self._faulty_sideplane_expander_dict[
                            durable_id] = alert_type
                elif fru_status == self.rssencl.HEALTH_OK:
                    if durable_id in self._faulty_sideplane_expander_dict:
                        previous_alert_type = \
                            self._faulty_sideplane_expander_dict.get(durable_id)
                        alert_type = self.rssencl.FRU_FAULT_RESOLVED
                        if previous_alert_type == self.rssencl.FRU_MISSING:
                            alert_type = self.rssencl.FRU_INSERTION
                        del self._faulty_sideplane_expander_dict[durable_id]
                if alert_type:
                    internal_json_message = \
                        self._create_internal_json_message(
                            sideplane_expander, self.unhealthy_components,
                            alert_type)
                    self._send_json_message(internal_json_message)
                    # Wait till msg is sent to rabbitmq or added in consul for resending.
                    # If timed out, do not update cache and revert in-memory cache.
                    # So, in next iteration change can be detected
                    if self._event.wait(
                            self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT):
                        store.put(
                            self._faulty_sideplane_expander_dict,
                            self._faulty_sideplane_expander_file_path)
                    else:
                        self._faulty_sideplane_expander_dict = store.get(
                            self._faulty_sideplane_expander_file_path)

            except Exception as ae:
                logger.exception(ae)
Beispiel #24
0
 def tail(self):
     """Return the value stored under ``self.SSPL_MESSAGE_TAIL_INDEX``."""
     tail_index = store.get(self.SSPL_MESSAGE_TAIL_INDEX)
     return tail_index
    def _rss_check_disks_presence(self):
        """Compare the previous and latest disk polls and raise presence alerts.

        After polling all disks, ``self.memcache_disks`` (previous poll) is
        diffed against ``self.latest_disks`` (current poll), both keyed by
        disk slot number. A slot whose serial number changed counts as both
        removed and inserted. FRU_MISSING is raised per removed disk and
        FRU_INSERTION per inserted disk; finally the in-memory disk caches
        are replaced with the latest poll.
        """
        self.rss_cliapi_poll_disks(self.RSS_DISK_GET_ALL)

        # Without a previous poll there is nothing to compare against
        # (the early exit only applies when not running on loopback).
        if not self.memcache_disks and \
                self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
            logger.warn(
                "Last polled drives info in-memory cache "
                "unavailable , unable to check drive presence change")
            return

        if not self.latest_disks:
            if self.rssencl.active_ip != self.rssencl.ws.LOOPBACK:
                logger.warn(
                    "Latest polled drives info in-memory cache "
                    "unavailable, unable to check drive presence change")
            return

        # Slot numbers seen in the previous and in the current poll
        cached_slots = set(self.memcache_disks.keys())
        polled_slots = set(self.latest_disks.keys())
        removed_disks = cached_slots - polled_slots
        inserted_disks = polled_slots - cached_slots

        # Same slot, different serial number: the disk was replaced
        for slot in cached_slots & polled_slots:
            old_serial = self.memcache_disks[slot]['serial-number']
            if old_serial != self.latest_disks[slot]['serial-number']:
                removed_disks.add(slot)
                inserted_disks.add(slot)

        # Identical inventories: no removal or insertion to report
        if not removed_disks and not inserted_disks:
            return

        self._event = Event()
        for slot in removed_disks:
            # Prefer the ".prev" cache file for the removed drive's data
            prev_datafile = f"{self.disks_prcache}disk_{slot}.json.prev"
            prev_found, _ = store.exists(prev_datafile)
            if prev_found:
                disk_datafile = prev_datafile
            else:
                disk_datafile = f"{self.disks_prcache}disk_{slot}.json"

            removed_info = store.get(disk_datafile)
            self._rss_raise_disk_alert(self.rssencl.FRU_MISSING, removed_info)

            # Drop the cache file only once the message was sent to rabbitmq
            # or queued in consul for resending; on timeout keep it.
            acknowledged = self._event.wait(
                self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT)
            if acknowledged:
                store.delete(disk_datafile)
            self._event.clear()
        self._event = None

        for slot in inserted_disks:
            # Alert for the added drive using its cached data
            inserted_info = store.get(f"{self.disks_prcache}disk_{slot}.json")
            self._rss_raise_disk_alert(self.rssencl.FRU_INSERTION, inserted_info)

        # The latest poll becomes the reference for the next comparison
        self.memcache_disks = self.latest_disks
        self.rssencl.memcache_frus.update({"disks": self.memcache_disks})
Beispiel #26
0
    def _get_msgs_for_faulty_logical_volumes(self,
                                             logical_volumes,
                                             disk_group,
                                             send_message=True):
        """Build alert messages for logical volumes whose health changed.

        Every volume's health is compared with its entry (keyed by serial
        number) in ``self._previously_faulty_logical_volumes``. For each
        transition a message is created via ``_create_internal_msg_lvol``,
        collected, optionally sent, and the cache is persisted.

        Returns the list of created messages, or None when
        ``logical_volumes`` is empty.
        """
        if not logical_volumes:
            return

        collected_msgs = []
        for volume in logical_volumes:
            health = volume["health"].lower()
            serial = volume["serial-number"]
            new_alert = ""
            changed = False

            if health == self.rssencl.HEALTH_FAULT:
                # OK ==> Fault or Degraded ==> Fault
                if serial not in self._previously_faulty_logical_volumes or \
                        self._previously_faulty_logical_volumes[serial]['health'] == "degraded":
                    new_alert = self.rssencl.FRU_FAULT
                    self._previously_faulty_logical_volumes[serial] = {
                        "health": health, "alert_type": new_alert}
                    changed = True

            elif health == self.rssencl.HEALTH_DEGRADED:
                # OK ==> Degraded or Fault ==> Degraded
                if serial not in self._previously_faulty_logical_volumes or \
                        self._previously_faulty_logical_volumes[serial]['health'] == "fault":
                    new_alert = self.rssencl.FRU_FAULT
                    self._previously_faulty_logical_volumes[serial] = {
                        "health": health, "alert_type": new_alert}
                    changed = True

            elif health == self.rssencl.HEALTH_OK:
                # Fault ==> OK or Degraded ==> OK
                if serial in self._previously_faulty_logical_volumes:
                    new_alert = self.rssencl.FRU_FAULT_RESOLVED
                    del self._previously_faulty_logical_volumes[serial]
                    changed = True

            if not changed:
                continue

            # Generate the alert and hand it to the handler if requested
            message = self._create_internal_msg_lvol(
                volume, new_alert, disk_group)
            collected_msgs.append(message)
            if send_message:
                self._send_json_msg(message)

            # Persist the cache only after the message was sent to rabbitmq
            # or queued in consul for resending. On timeout reload the
            # persisted copy so the change is detected again next iteration.
            acknowledged = self._event.wait(
                self.rssencl.PERSISTENT_DATA_UPDATE_TIMEOUT)
            if acknowledged:
                store.put(self._previously_faulty_logical_volumes,
                          self._faulty_logical_volume_file_path)
            else:
                self._previously_faulty_logical_volumes = store.get(
                    self._faulty_logical_volume_file_path)

        return collected_msgs
    def get_system_status(self):
        """Retrieve realstor system state info using cli api /show/system.

        Skips the request when called again before ``self.pollfreq`` seconds
        have elapsed since the last successful poll. On a successful response
        the first entry of the "system" list is stored in
        ``self.memcache_system`` and the faults found under
        ``self.FAULT_KEY`` are stored in ``self.latest_faults`` (an empty
        dict when the key is absent). When no in-memory fault cache exists it
        is built from the persistent cache, or failing that from the latest
        faults, which are then persisted.

        Returns:
            None in all cases; results are published through instance
            attributes.
        """

        # poll system would get invoked through multiple realstor sensors
        # with less frequency compared to configured polling frequency
        # adding check to comply with polling frequency
        elapsed = time.time() - self.poll_system_ts

        if elapsed < self.pollfreq:
            logger.warn("/show/system request came in {0} seconds,"
                        "while configured polling frequency is {1} seconds,"
                        "ignoring".format(elapsed, self.pollfreq))
            return

        system = None
        # Stays None when the response body is not valid JSON, so the
        # processing below is skipped instead of hitting an unbound name.
        jresponse = None

        # make ws request
        url = self.build_url(self.URI_CLIAPI_SHOWSYSTEM)

        response = self.ws_request(url, self.ws.HTTP_GET)

        if not response:
            logger.warn("System status unavailable as ws request failed")
            return

        if response.status_code != self.ws.HTTP_OK:
            logger.info("{0}:: http request {1} polling system status failed"
                " with http err {2}".format(self.LDR_R1_ENCL, url, \
                response.status_code))
            return

        self.poll_system_ts = time.time()

        try:
            jresponse = json.loads(response.content)
        except ValueError as badjson:
            logger.error("%s returned mal-formed json:\n%s" % (url, badjson))

        if jresponse:
            api_resp = self.get_api_status(jresponse['status'])

            if ((api_resp == -1)
                    and (response.status_code == self.ws.HTTP_OK)):
                logger.warn("/show/system api response unavailable, "
                            "marking success as http code is 200")
                api_resp = 0

            if api_resp == 0:
                system = jresponse['system'][0]
                self.memcache_system = system

            if system:
                # Check if fault exists
                if self.FAULT_KEY not in system:
                    logger.debug("{0} Healthy, no faults seen".format(
                        self.LDR_R1_ENCL))
                    self.latest_faults = {}
                    return

                # Extract system faults
                self.latest_faults = system[self.FAULT_KEY]

                # If no in-memory fault cache built yet!
                if not self.memcache_faults:
                    # build from persistent cache if available
                    logger.info(
                        "No cached faults, building from  persistent cache {0}"\
                        .format(self.faults_persistent_cache))

                    self.memcache_faults = store.get(
                        self.faults_persistent_cache)

                    # still if none, build from latest faults & persist
                    if not self.memcache_faults:
                        logger.info("No persistent faults cache, building "
                                    "cache from latest faults")

                        self.memcache_faults = self.latest_faults

                        # On SSPL boot, run through existing faults as no cache to
                        # verify with for new faults
                        self.existing_faults = True

                        store.put(self.memcache_faults,
                                  self.faults_persistent_cache)
                else:
                    # Reset flag as existing faults processed by now
                    # and cached faults are built already
                    self.existing_faults = False
            else:
                logger.error("poll system failed with err %d" % api_resp)