Example #1
    def _load_json_file(self, key):
        """ Load dict obj from json in given absolute file path"""
        value = None
        absfilepath = key

        # Check if directory exists
        directory_path = os.path.join(os.path.dirname(absfilepath), "")
        if not os.path.isdir(directory_path):
            logger.critical("Path doesn't exists: {0}".format(directory_path))
            return

        try:
            with open(absfilepath, "rb") as fh:
                try:
                    value = pickle.load(fh)
                except Exception:
                    # Not a pickled file; rewind and fall back to the raw bytes
                    fh.seek(0)
                    value = fh.read()
        except IOError as err:
            logger.warn("I/O error[{0}] while loading data from file {1}: {2}"\
                .format(err.errno, absfilepath, err))
        except ValueError as jsonerr:
            logger.warn("JSON error {0} while loading from {1}".format(jsonerr, absfilepath))
            value = None
        except OSError as oserr:
            logger.warn("OS error {0} while loading from {1}".format(oserr, absfilepath))
        except Exception as gerr:
            logger.warn("Error {0} while reading data from file {1}"\
                .format(gerr, absfilepath))

        return value
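The read path above pairs pickle deserialization with a raw-bytes fallback. Below is a minimal standalone sketch of that pattern using only the standard library; the function name, logger name, and the dropped class context are assumptions for illustration, not the original API.

import logging
import os
import pickle

logger = logging.getLogger("file_store_sketch")

def load_value(absfilepath):
    """Return the unpickled object from absfilepath, or the raw bytes if unpickling fails."""
    if not os.path.isfile(absfilepath):
        logger.critical("Path doesn't exist: %s", absfilepath)
        return None
    try:
        with open(absfilepath, "rb") as fh:
            try:
                return pickle.load(fh)
            except Exception:
                # Rewind so the fallback returns the whole file, not the remainder
                fh.seek(0)
                return fh.read()
    except OSError as err:
        logger.warning("I/O error [%s] while loading data from %s: %s",
                       err.errno, absfilepath, err)
        return None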
Example #2
    def put(self, value, key, pickled=True):
        """ Dump value to given absolute file path"""

        absfilepath = key
        directory_path = os.path.join(os.path.dirname(absfilepath), "")

        # If the directory does not exist, create it
        if not os.path.isdir(directory_path):
            try:
                os.makedirs(directory_path, exist_ok=True)
            except OSError as exc:
                if exc.errno == errno.EACCES:
                    logger.critical(f"Permission denied while creating dir: {directory_path}")
            except Exception as err:
                logger.warn(f"{directory_path} creation failed with error {err}, "
                            "alerts may get missed on sspl restart or failover!!")

        try:
            with open(absfilepath, "wb") as fh:
                if pickled:
                    pickle.dump(value, fh)
                else:
                    fh.write(value)
        except IOError as err:
            logger.warn("I/O error[{0}] while dumping data to file {1}: {2}"\
                .format(err.errno, absfilepath, err))
        except Exception as gerr:
            logger.warn("Error[{0}] while dumping data to file {1}"\
                .format(gerr, absfilepath))
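A short round-trip sketch of the write path above followed by a read-back, assuming the same pickle-based on-disk format; the temporary directory and cache file name below are made up for illustration.

import os
import pickle
import tempfile

def put_value(value, absfilepath, pickled=True):
    """Dump value to absfilepath, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(absfilepath), exist_ok=True)
    with open(absfilepath, "wb") as fh:
        if pickled:
            pickle.dump(value, fh)
        else:
            fh.write(value)

cache_path = os.path.join(tempfile.mkdtemp(), "SENSOR_CACHE_demo")
put_value({"prev_state": "fault"}, cache_path)
with open(cache_path, "rb") as fh:
    assert pickle.load(fh) == {"prev_state": "fault"}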
Example #3
def _run_thread_capture_errors(curr_module, msgQlist, conf_reader, product):
    """Run the given thread and log any errors that happen on it.
    Will stop all sspl_modules if one of them fails."""
    try:
        # Each module is passed a reference list to message queues so it can transmit
        #  internal messages to other modules as desired
        curr_module.initialize(conf_reader, msgQlist, product)
        curr_module.start()

    except BaseException as ex:
        logger.critical(
            "SSPL-Tests encountered a fatal error, terminating service Error: %s"
            % ex)
        logger.exception(ex)

        # Populate an actuator response message and transmit back to HAlon
        error_msg = "SSPL-Tests encountered an error, terminating service Error: " + \
                    ", Exception: " + logger.exception(ex)
        jsonMsg = ThreadControllerMsg(curr_module, error_msg).getJson()
        curr_module._write_internal_msgQ(EgressProcessorTests.name(), jsonMsg)

        # Shut it down, error is non-recoverable
        for name, other_module in list(world.sspl_modules.items()):
            if other_module is not curr_module:
                other_module.shutdown()
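A reduced, self-contained sketch of the pattern above: run a module and, if it raises, shut down every other registered module. The Module class and module registry are stand-ins; ThreadControllerMsg, EgressProcessorTests and world are not reproduced here.

import logging

logger = logging.getLogger("thread_runner_sketch")

class Module:
    """Stand-in for an SSPL module thread."""
    def __init__(self, name):
        self.name = name
    def start(self):
        pass
    def shutdown(self):
        logger.info("%s shut down", self.name)

def run_capture_errors(curr_module, all_modules):
    try:
        curr_module.start()
    except Exception as ex:
        logger.critical("Fatal error in %s, terminating: %s", curr_module.name, ex)
        logger.exception(ex)
        # Error is non-recoverable: stop every other registered module
        for other in all_modules.values():
            if other is not curr_module:
                other.shutdown()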
Example #4
    def _parse_fru_info(self, fru):
        """Parses FRU specific information"""
        specific_info = None
        specifics = []

        for sensor_id, fru_info in self.fru_specific_info.items():
            specific_info = dict()
            for fru_key,fru_value in fru_info.items():
                specific_info[fru_key] = fru_value
            specific_info["resource_id"] = sensor_id
            specifics.append(specific_info)

        if (fru == "Power Supply") or (fru == "Fan") or (fru == "Drive Slot / Bay"):
            if not specifics:
                manufacturer = self._executor.get_manufacturer_name()
                msg = "'%s' sensors not seen in %s node server" % (
                    fru, manufacturer)
                specifics.append({"ERROR": msg})
                logger.critical(msg)
            else:
                for each in specifics:
                    if each.get('States Asserted'):
                        each['States Asserted'] = ' '.join(
                            x.strip() for x in each['States Asserted'].split())

        self.fru_specific_info = {}
        return specifics
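The 'States Asserted' clean-up above simply collapses internal whitespace into single spaces. A tiny illustration with made-up sensor data:

specifics = [{"States Asserted": "Power Supply\n        [Presence detected]"}]
for each in specifics:
    if each.get('States Asserted'):
        each['States Asserted'] = ' '.join(
            x.strip() for x in each['States Asserted'].split())
assert specifics[0]['States Asserted'] == "Power Supply [Presence detected]"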
Example #5
def _run_thread_capture_errors(curr_module, sspl_modules, msgQlist,
                               conf_reader, product):
    """Run the given thread and log any errors that happen on it.
    Will stop all sspl_modules if one of them fails."""
    try:
        # Each module is passed a reference list to message queues so it can transmit
        #  internal messages to other modules as desired
        curr_module.start_thread(conf_reader, msgQlist, product)

    except BaseException as ex:
        logger.critical(
            "SSPL-LL encountered a fatal error, terminating service Error: %s"
            % ex)
        logger.exception(ex)

        # Populate an actuator response message and transmit back to HAlon
        error_msg = "SSPL-LL encountered an error, terminating service Error: " + \
                    ", Exception: " + logger.exception(ex)
        json_msg = ThreadControllerMsg(curr_module.name(), error_msg).getJson()

        if product.lower() in [x.lower() for x in enabled_products]:
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), json_msg)
        elif product.lower() in [x.lower() for x in cs_legacy_products]:
            self._write_internal_msgQ(PlaneCntrlRMQegressProcessor.name(),
                                      json_msg)

        # Shut it down, error is non-recoverable
        for name, other_module in list(sspl_modules.items()):
            if other_module is not curr_module:
                other_module.shutdown()
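The only difference from Example #3 is the product-based choice of egress processor. A minimal sketch of that routing, with made-up product lists and processor names standing in for the real SSPL configuration:

enabled_products = ["LDR_R1", "LDR_R2"]   # hypothetical values
cs_legacy_products = ["CS-A"]             # hypothetical values

def pick_egress_processor(product):
    """Return the name of the egress processor a message would be routed to."""
    if product.lower() in [x.lower() for x in enabled_products]:
        return "RabbitMQegressProcessor"
    elif product.lower() in [x.lower() for x in cs_legacy_products]:
        return "PlaneCntrlRMQegressProcessor"
    return None

assert pick_egress_processor("ldr_r1") == "RabbitMQegressProcessor"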
Example #6
def execute_thread(module, msgQlist, conf_reader, product, resume=True):
    """
    Run module as a thread. Recover the module if any error during
    initialization and run time of the module.

    If recovery count>0,
        module will be recovered from failure until the maximum recovery
        attempt. If not recoverable, corresponding module will be shutdown
        and failure alert will be raised due to its impact.
    If recovery count=0,
        no recovery attempt will be made.
    """
    module_name = module.name()
    # Suspend module threads
    if not resume:
        module.suspend()

    # Initialize persistent cache for sensor status
    per_data_path = os.path.join(
        module_cache_dir, f"{module_name.upper()}_{node_id}")
    if not os.path.isfile(per_data_path):
        module_persistent_data[module_name] = {}
        store.put(module_persistent_data[module_name], per_data_path)

    is_sensor_thread = False
    recovery_count = recovery_interval = 0
    if isinstance(module, SensorThread):
        recovery_count, recovery_interval = _get_recovery_config(module_name)
        is_sensor_thread = True

    attempt = 0

    while attempt <= recovery_count:
        attempt += 1
        try:
            # Each module is passed a reference list to message queues so it
            # can transmit internal messages to other modules as desired
            module.start_thread(conf_reader, msgQlist, product)
        except Exception as err:
            curr_state = "fault"
            err_msg = f"{module_name}, {err}"
            logger.error(err_msg)
            if attempt > recovery_count:
                logger.debug(traceback.format_exc())
                description = f"{module_name} is stopped and unrecoverable. {err_msg}"
                impact = module.impact()
                recommendation = "Restart SSPL service"
                logger.critical(
                    f"{description}. Impact: {impact} Recommendation: {recommendation}")
                # Check previous state of the module and send fault alert
                if os.path.isfile(per_data_path):
                    module_persistent_data[module_name] = store.get(per_data_path)
                prev_state = module_persistent_data[module_name].get('prev_state')
                if is_sensor_thread and curr_state != prev_state:
                    module_persistent_data[module_name] = {"prev_state": curr_state}
                    store.put(module_persistent_data[module_name], per_data_path)
                    specific_info = Conf.get(SSPL_CONF, f"{module_name.upper()}")
                    info = {
                        "module_name": module_name,
                        "alert_type": curr_state,
                        "description": description,
                        "impact": impact,
                        "recommendation": recommendation,
                        "severity": "critical",
                        "specific_info": specific_info
                    }
                    jsonMsg = ThreadMonitorMsg(info).getJson()
                    module._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
            else:
                logger.debug(f"Recovering {module_name} from failure, "
                             f"attempt: {attempt}")
                time.sleep(recovery_interval)

            # Shut down the failed module thread (before retrying or giving up)
            logger.info(f"Terminating monitoring thread {module_name}")
            module.shutdown()
            retry = 5
            while module.is_running():
                module.shutdown()
                retry -= 1
                if not retry:
                    break
                time.sleep(2)
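A stripped-down sketch of the recovery loop above, keeping only the retry arithmetic: a start callable is attempted recovery_count + 1 times, with recovery_interval seconds between attempts; the persistence, alerting and shutdown details are omitted, and the names here are illustrative.

import time

def run_with_recovery(start, recovery_count=2, recovery_interval=1):
    """Return True if start() eventually succeeds, False if it stays broken."""
    attempt = 0
    while attempt <= recovery_count:
        attempt += 1
        try:
            start()
            return True
        except Exception as err:
            if attempt > recovery_count:
                print(f"Unrecoverable after {attempt} attempts: {err}")
                return False
            print(f"Recovering from failure, attempt: {attempt} ({err})")
            time.sleep(recovery_interval)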