def _send_json_msg(self, alert_type, resource_id, error_msg):
    """Transmit data to NodeDataMsgHandler to be processed and sent out."""
    event_time = str(int(time.time()))
    severity = SeverityReader().map_severity(alert_type)
    self._alert_id = self._get_alert_id(event_time)
    fqdn = self.os_utils.get_fqdn()

    info = {
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": resource_id,
        "event_time": event_time,
        "description": error_msg
    }

    message = {
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "sensor_type": "node:os:raid_integrity",
                "host_id": fqdn,
                "alert_type": alert_type,
                "alert_id": self._alert_id,
                "severity": severity,
                "info": info,
                "specific_info": {"error": error_msg}
            }
        }
    }

    # Reset pending alert state now that the alert is being dispatched.
    self.alert_type = None

    # Send the event to node data message handler to generate json message
    # and send out.
    self._write_internal_msgQ(NodeDataMsgHandler.name(), json.dumps(message))
def send_json_msg(self, alert_type, encl_status):
    """Build an enclosure status alert and queue it to RealStorEnclMsgHandler."""
    event_time = str(int(time.time()))
    severity = SeverityReader().map_severity(alert_type)
    alert_id = self._get_alert_id(event_time)
    fqdn = socket.getfqdn()

    info = {
        "site_id": self.rssencl.site_id,
        "cluster_id": self.rssencl.cluster_id,
        "rack_id": self.rssencl.rack_id,
        "node_id": self.rssencl.node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": "0",
        "event_time": event_time
    }

    message = {
        "sensor_request_type": {
            "enclosure_alert": {
                "host_id": fqdn,
                "severity": severity,
                "alert_id": alert_id,
                "alert_type": alert_type,
                "status": "update",
                "info": info,
                "specific_info": {"event": encl_status}
            }
        }
    }

    # Track the last alert type that was sent.
    self.previous_alert_type = alert_type
    self._write_internal_msgQ(RealStorEnclMsgHandler.name(), json.dumps(message))
def _create_internal_msg(self, controller_detail, alert_type):
    """Forms a dictionary containing info about Controllers to send to
       message handler.

    Returns a JSON string, or an empty dict when no controller data is given.
    """
    if not controller_detail:
        return {}

    event_time = str(int(time.time()))
    alert_body = {
        "host_id": socket.gethostname(),
        "severity": SeverityReader().map_severity(alert_type),
        "alert_id": self._get_alert_id(event_time),
        "alert_type": alert_type,
        "status": "update",
        "info": {
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": controller_detail.get("durable-id", ""),
            "event_time": event_time
        },
        # Raw controller attributes are forwarded untouched.
        "specific_info": controller_detail
    }

    return json.dumps({"sensor_request_type": {"enclosure_alert": alert_body}})
def _create_internal_json_message(self, sideplane_expander, unhealthy_components,
                                  alert_type):
    """Creates internal json structure which is sent to
       realstor_msg_handler for further processing"""

    # Attributes copied verbatim from the sideplane expander response.
    sideplane_expander_info_key_list = \
        ['name', 'status', 'location', 'health', 'health-reason',
         'health-recommendation', 'enclosure-id', 'durable-id',
         'drawer-id', 'position']

    sideplane_expander_info_dict = {}

    if unhealthy_components:
        # NOTE(review): the processed result below is never used; the raw
        # `unhealthy_components` list is attached to the message instead.
        # Confirm whether `sideplane_unhealthy_components` was meant to be
        # attached here.
        sideplane_unhealthy_components = \
            self._get_unhealthy_components(unhealthy_components)

    for exp_key, exp_val in sideplane_expander.items():
        if exp_key in sideplane_expander_info_key_list:
            sideplane_expander_info_dict[exp_key] = exp_val

    sideplane_expander_info_dict["unhealthy_components"] = \
        unhealthy_components

    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)

    # Resource id is built as "drawer <drawer-id> <expander name>".
    drawer_id = "drawer" + ' ' + str(
        sideplane_expander_info_dict.get("drawer-id"))
    name = sideplane_expander_info_dict.get("name", "")
    resource_id = drawer_id + ' ' + name
    host_name = socket.gethostname()

    info = {
        "site_id": self.rssencl.site_id,
        "cluster_id": self.rssencl.cluster_id,
        "rack_id": self.rssencl.rack_id,
        "node_id": self.rssencl.node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": resource_id,
        "event_time": epoch_time
    }

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "enclosure_alert": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "alert_id": alert_id,
                "severity": severity,
                "info": info,
                "specific_info": sideplane_expander_info_dict
            }
        }
    })

    return internal_json_msg
def _create_json_message(self, alert_type):
    """Creates a defined json message structure which can flow inside SSPL
       modules.

    alert_type is expected to be "fault" or "fault_resolved"; any other
    value now falls back to a generic description instead of raising
    NameError (previously `description` was left unbound).
    """
    internal_json_msg = None
    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    host_name = socket.gethostname()

    specific_info = {}
    specific_info_list = []

    if alert_type == "fault":
        description = \
            "Total available main memory value decreased from {} kB to {} kB"\
            .format(self.prev_mem, self.total_mem)
    elif alert_type == "fault_resolved":
        description = \
            "Total main memory value available {} kB"\
            .format(self.total_mem)
    else:
        # Guard: keep the message well-formed for unexpected alert types.
        description = \
            "Total main memory value available {} kB"\
            .format(self.total_mem)

    # populate all the data from /proc/meminfo
    split_strs = [
        s.split(maxsplit=1) for s in self.mem_path_file.splitlines()
    ]
    dictionary_str = dict(split_strs)
    specific_info["meminfo"] = dictionary_str
    specific_info_list.append(specific_info)
    alert_specific_info = specific_info_list

    info = {
        "site_id": self._site_id,
        "cluster_id": self._cluster_id,
        "rack_id": self._rack_id,
        "node_id": self._node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": self.RESOURCE_ID,
        "event_time": epoch_time,
        "description": description
    }

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": alert_specific_info
            }
        }
    })

    return internal_json_msg
def _create_internal_msg_lvol(self, logical_volume_detail, alert_type,
                              disk_group):
    """Forms a dictionary containing info about Logical Volumes to send to
       message handler.

    Returns a JSON string, or an empty dict when no volume detail is given.
    """
    if not logical_volume_detail:
        return {}

    # Pre-seed every expected key with "NA" so consumers always see the
    # full schema even when the enclosure response omits fields.
    generic_info = dict.fromkeys(self.volumes_generic, "NA")
    extended_info = dict.fromkeys(self.volumes_extended, "NA")
    disk_groups_info = dict.fromkeys(self.disk_groups_generic, "NA")

    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    resource_id = logical_volume_detail.get("volume-name", "")
    host_name = socket.gethostname()

    # Split the raw volume attributes into generic vs extended buckets.
    for key, value in logical_volume_detail.items():
        if key in self.volumes_generic:
            generic_info.update({key: value})
        elif key in self.volumes_extended:
            extended_info.update({key: value})

    # Attach the owning disk group's generic attributes.
    for key, value in disk_group.items():
        if key in self.disk_groups_generic:
            disk_groups_info.update({key: value})

    generic_info['disk-group'] = [disk_groups_info]
    generic_info.update(extended_info)

    info = {
        "site_id": self.rssencl.site_id,
        "cluster_id": self.rssencl.cluster_id,
        "rack_id": self.rssencl.rack_id,
        "node_id": self.rssencl.node_id,
        "resource_type": self.RESOURCE_TYPE_LVOL,
        "resource_id": resource_id,
        "event_time": epoch_time
    }

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "enclosure_alert": {
                "host_id": host_name,
                "severity": severity,
                "alert_id": alert_id,
                "alert_type": alert_type,
                "status": "update",
                "info": info,
                "specific_info": generic_info
            }
        }
    })

    return internal_json_msg
def _create_internal_json_msg(self, fan_module, alert_type):
    """Creates internal json structure which is sent to
       realstor_msg_handler for further processing.

    (Removed an unused local, `fan_module_extended_info_dict`, that was
    never populated or referenced.)
    """
    # Attributes copied verbatim from the fan module response.
    fan_module_info_key_list = \
        ['name', 'location', 'status', 'health',
         'health-reason', 'health-recommendation', 'enclosure-id',
         'durable-id', 'position']

    fan_module_info_dict = {}

    # Per-fan attribute list for this module.
    fans_list = self._get_fan_attributes(fan_module)

    for fan_module_key, fan_module_value in fan_module.items():
        if fan_module_key in fan_module_info_key_list:
            fan_module_info_dict[fan_module_key] = fan_module_value

    fan_module_info_dict["fans"] = fans_list

    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    resource_id = fan_module_info_dict.get("name", "")
    host_name = socket.gethostname()

    info = {
        "site_id": self.rssencl.site_id,
        "cluster_id": self.rssencl.cluster_id,
        "rack_id": self.rssencl.rack_id,
        "node_id": self.rssencl.node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": resource_id,
        "event_time": epoch_time
    }

    # Creates internal json message request structure.
    # this message will be passed to the StorageEnclHandler
    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "enclosure_alert": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": fan_module_info_dict
            }
        }
    })

    return internal_json_msg
def _send_json_msg(self, alert_type, resource_id, device, drives):
    """Transmit data to NodeDataMsgHandler to be processed and sent out.

    Maps the alert type to a human-readable description, builds the
    node:os:raid_data sensor message and queues it for egress.
    """
    epoch_time = str(int(time.time()))
    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    self._alert_id = self._get_alert_id(epoch_time)
    host_name = socket.getfqdn()

    # Description per alert type; a generic fallback keeps the message
    # well-formed for unrecognized types.
    if alert_type == self.MISSING:
        description = "RAID array or drive from RAID array is missing."
    elif alert_type == self.FAULT:
        description = "RAID array or drive from RAID array is faulty."
    elif alert_type == self.INSERTION:
        description = "Inserted drive in RAID array."
    elif alert_type == self.FAULT_RESOLVED:
        description = "Fault for RAID array or RAID drive is resolved"
    else:
        description = "Raid array alert"

    info = {
        "site_id": self._site_id,
        "cluster_id": self._cluster_id,
        "rack_id": self._rack_id,
        "node_id": self._node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": resource_id,
        "event_time": epoch_time,
        "description": description
    }
    specific_info = {
        "device": device,
        "drives": drives
    }

    internal_json_msg = json.dumps(
        {"sensor_request_type" : {
            "node_data": {
                "status": "update",
                "sensor_type" : "node:os:raid_data",
                "host_id": host_name,
                "alert_type": alert_type,
                "alert_id": self._alert_id,
                "severity": severity,
                "info": info,
                "specific_info": specific_info
            }
        }
        })

    # Remember the last alert raised for this device, then clear the
    # pending alert state.
    self.prev_alert_type[device] = alert_type
    self.alert_type = None

    # Send the event to node data message handler to generate json message and send out
    self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg)
def _create_json_message(self, alert_type):
    """Creates a defined json message structure which can flow inside SSPL
       modules"""
    severity = SeverityReader().map_severity(alert_type)
    event_time = str(int(time.time()))
    alert_id = self._get_alert_id(event_time)
    host_name = socket.gethostname()

    alert_specific_info = []
    for phy_dir, link_data in self.phy_dir_to_linkrate_mapping.items():
        # phy_dir looks like "phy-0:0"; split on ':' so the reported
        # resource id becomes e.g. "SASHBA-0:phy-0".
        phy_number = phy_dir.split(":")[1]
        alert_specific_info.append({
            "resource_id": self.RESOURCE_ID + ':' + "phy-" + phy_number,
            "negotiated_link_rate": link_data[0].strip()
        })

    info = {
        "site_id": self._site_id,
        "cluster_id": self._cluster_id,
        "rack_id": self._rack_id,
        "node_id": self._node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": self.RESOURCE_ID,
        "event_time": event_time
    }

    return json.dumps({
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": alert_specific_info
            }
        }
    })
def _create_json_message(self, cpu, alert_type):
    """Creates a defined json message structure which can flow inside SSPL
       modules.

    Fixes:
    - info["resource_type"] now carries self.RESOURCE_TYPE; previously it
      was set to the resource *id* (e.g. "CPU-3"), inconsistent with the
      sibling CPU message builder.
    - `description` gets a default so it is never unbound when no entry in
      the specific info matches the requested CPU.
    """
    internal_json_msg = None
    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    host_name = socket.gethostname()

    # Populate specific info
    self.fill_specific_info()
    alert_specific_info = self.specific_info
    res_id = self.RESOURCE_ID + str(cpu)

    # Fallback description in case no matching CPU entry is found below.
    if alert_type == "fault":
        description = "Faulty CPU detected, %s" % res_id
    else:
        description = "Fault resolved for CPU, %s" % res_id

    for item in alert_specific_info:
        if item['resource_id'] == res_id:
            if alert_type == "fault":
                description = "Faulty CPU detected, %s state is %s" % (
                    item['resource_id'], item["state"])
            else:
                description = "Fault resolved for CPU, %s state is %s" % (
                    item['resource_id'], item["state"])

    info = {
        "site_id": self._site_id,
        "cluster_id": self._cluster_id,
        "rack_id": self._rack_id,
        "node_id": self._node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": res_id,
        "event_time": epoch_time,
        "description": description
    }

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": alert_specific_info
            }
        }
    })

    return internal_json_msg
def _create_internal_msg_dg(self, alert_type, disk_group_detail):
    """Forms a dictionary containing info about Disk Groups to send to
       message handler.

    Returns a JSON string, or an empty dict when no disk group detail is
    given.
    """
    if not disk_group_detail:
        return {}

    # Pre-seed every expected key with "NA" so consumers always see the
    # full schema even when the enclosure response omits fields.
    generic_info = dict.fromkeys(self.disk_groups_generic, "NA")
    extended_info = dict.fromkeys(self.disk_groups_extended, "NA")

    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    resource_id = disk_group_detail.get("name", "")
    host_name = self.os_utils.get_fqdn()

    # Split the raw disk group attributes into generic vs extended buckets.
    for key, value in disk_group_detail.items():
        if key in self.disk_groups_generic:
            generic_info.update({key : value})
        elif key in self.disk_groups_extended:
            extended_info.update({key : value})
    generic_info.update(extended_info)

    # Attach CVG mapping when the disk group name is known; "NA" otherwise.
    cvg_info = {
        "cvg_name": resource_id if resource_id in self.cvg_info_dict else "NA",
        "cvg_id": self.cvg_info_dict.get(resource_id, "NA")
    }
    generic_info.update(cvg_info)

    info = {
        "resource_type": self.RESOURCE_TYPE_DG,
        "resource_id": resource_id,
        "event_time": epoch_time
    }

    internal_json_msg = json.dumps(
        {"sensor_request_type": {
            "enclosure_alert": {
                "host_id": host_name,
                "severity": severity,
                "alert_id": alert_id,
                "alert_type": alert_type,
                "status": "update",
                "info": info,
                "specific_info": generic_info
            }
        }})

    return internal_json_msg
def get_alert(cls, service, alert):
    """Build a service-status alert dict for the given service/alert pair.

    The description template is filled with the threshold matching the
    service's current state. A single timestamp is sampled so alert_id
    and event_time always agree (previously time.time() was read twice
    and the two values could differ across a second boundary).
    """
    if service.state == "active":
        description = alert.description.format(
            service.name, service.state, service.active_threshold)
    else:
        description = alert.description.format(
            service.name, service.state, service.nonactive_threshold)

    epoch_time = str(int(time.time()))
    return {
        "sensor_request_type": {
            "service_status_alert": {
                "host_id": socket.getfqdn(),
                "severity": SeverityReader().map_severity(
                    alert.alert_type),
                "alert_id": get_alert_id(epoch_time),
                "alert_type": alert.alert_type,
                "info": {
                    "resource_type": cls.RESOURCE_TYPE,
                    "resource_id": service.name,
                    "event_time": epoch_time,
                    "description": description,
                    "impact": alert.impact.format(service.name),
                    "recommendation": alert.recommendation,
                },
                "specific_info": {
                    "service_name": service.name,
                    "previous_state": service.previous_state,
                    "state": service.state,
                    "previous_substate": service.previous_substate,
                    "substate": service.substate,
                    "previous_pid": service.previous_pid,
                    "pid": service.pid,
                }
            }
        }
    }
def _send_json_msg(self, alert_type, resource_id, device, drives):
    """Transmit data to NodeDataMsgHandler to be processed and sent out."""
    event_time = str(int(time.time()))
    severity = SeverityReader().map_severity(alert_type)
    self._alert_id = self._get_alert_id(event_time)

    message = {
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "sensor_type": "node:os:raid_data",
                "host_id": socket.getfqdn(),
                "alert_type": alert_type,
                "alert_id": self._alert_id,
                "severity": severity,
                "info": {
                    "site_id": self._site_id,
                    "cluster_id": self._cluster_id,
                    "rack_id": self._rack_id,
                    "node_id": self._node_id,
                    "resource_type": self.RESOURCE_TYPE,
                    "resource_id": resource_id,
                    "event_time": event_time
                },
                "specific_info": {
                    "device": device,
                    "drives": drives
                }
            }
        }
    }
    internal_json_msg = json.dumps(message)

    # Remember the last alert raised for this device, then clear the
    # pending alert state.
    self.prev_alert_type[device] = alert_type
    self.alert_type = None

    self._log_debug("_send_json_msg, internal_json_msg: %s" % (internal_json_msg))

    # Send the event to node data message handler to generate json message
    # and send out.
    self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg)
def _gen_json_msg(self, alert_type, details, ext):
    """ Generate json message"""
    severity = SeverityReader().map_severity(alert_type)
    event_time = str(int(time.time()))
    alert_id = self._get_alert_id(event_time)
    host_name = socket.gethostname()

    info = {
        "site_id": self.rssencl.site_id,
        "cluster_id": self.rssencl.cluster_id,
        "rack_id": self.rssencl.rack_id,
        "node_id": self.rssencl.node_id,
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": ext.get("durable-id"),
        "event_time": event_time
    }

    # Merge both attribute dicts (ext wins on key collision) and replace
    # empty string values with "N/A".
    merged = {**details, **ext}
    specific_info = {k: ("N/A" if v == "" else v) for k, v in merged.items()}

    return json.dumps({
        "sensor_request_type": {
            "enclosure_alert": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": specific_info
            },
        }
    })
def _create_json_message(self, cpu, alert_type):
    """Creates a defined json message structure which can flow inside SSPL
       modules"""
    severity = SeverityReader().map_severity(alert_type)
    event_time = str(int(time.time()))
    alert_id = self._get_alert_id(event_time)

    # Refresh the cached per-CPU details before attaching them.
    self.fill_specific_info()

    node_data = {
        "status": "update",
        "host_id": socket.gethostname(),
        "alert_type": alert_type,
        "severity": severity,
        "alert_id": alert_id,
        "info": {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,
            "resource_id": self.RESOURCE_ID + str(cpu),
            "event_time": event_time
        },
        "specific_info": self.specific_info
    }

    return json.dumps({"sensor_request_type": {"node_data": node_data}})
def send_json_msg(self, alert_type, encl_status):
    """Build an enclosure alert, queue it to RealStorEnclMsgHandler and
       persist the latest alert state to the on-disk store."""
    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))
    alert_id = self._get_alert_id(epoch_time)
    fru = self.rssencl.is_storage_fru('enclosure')
    resource_id = "0"
    host_name = self.os_utils.get_fqdn()

    info = {
        "resource_type": self.RESOURCE_TYPE,
        "fru": fru,
        "resource_id": resource_id,
        "event_time": epoch_time,
        "description": encl_status
    }

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "enclosure_alert": {
                "host_id": host_name,
                "severity": severity,
                "alert_id": alert_id,
                "alert_type": alert_type,
                "status": "update",
                "info": info,
                "specific_info": {
                    "event": encl_status
                }
            }
        }
    })

    self.previous_alert_type = alert_type
    self._write_internal_msgQ(RealStorEnclMsgHandler.name(), internal_json_msg)

    # Persist the latest fault/alert state — presumably reloaded at sensor
    # startup so state survives restarts; confirm against the initializer.
    self.persistent_encl_data = {
        'fault_alert': str(self.fault_alert),
        'previous_alert_type': str(self.previous_alert_type),
    }
    store.put(self.persistent_encl_data, self.ENCL_SENSOR_DATA_PATH)
def initialize(self, conf_reader, msgQlist, product):
    """initialize configuration reader and internal msg queues"""

    # Initialize ScheduledMonitorThread
    super(NodeDataMsgHandler, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(NodeDataMsgHandler, self).initialize_msgQ(msgQlist)

    # Periodic transmit interval and high-usage wait thresholds,
    # each defaulting to 60 when absent from SSPL config.
    self._transmit_interval = int(
        Conf.get(SSPL_CONF,
                 f"{self.NODEDATAMSGHANDLER}>{self.TRANSMIT_INTERVAL}",
                 60))
    self._high_cpu_usage_wait_threshold = int(
        Conf.get(
            SSPL_CONF,
            f"{self.NODEDATAMSGHANDLER}>{self.HIGH_CPU_USAGE_WAIT_THRESHOLD}",
            60))
    self._high_memory_usage_wait_threshold = int(
        Conf.get(
            SSPL_CONF,
            f"{self.NODEDATAMSGHANDLER}>{self.HIGH_MEMORY_USAGE_WAIT_THRESHOLD}",
            60))
    self._units = Conf.get(SSPL_CONF,
                           f"{self.NODEDATAMSGHANDLER}>{self.UNITS}",
                           "MB")
    # Usage thresholds (config value or class-level default).
    self._disk_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.DISK_USAGE_THRESHOLD}",
        self.DEFAULT_DISK_USAGE_THRESHOLD)
    self._cpu_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.CPU_USAGE_THRESHOLD}",
        self.DEFAULT_CPU_USAGE_THRESHOLD)
    self._host_memory_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.HOST_MEMORY_USAGE_THRESHOLD}",
        self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD)
    self.node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, "SN01")

    self.bmcNwStatus = None
    self.severity_reader = SeverityReader()
    self._node_sensor = None
    self._login_actuator = None
    # Latest payload per OS sensor category; all None until messages arrive.
    self.disk_sensor_data = None
    self.host_sensor_data = None
    self.if_sensor_data = None
    self.cpu_sensor_data = None
    self.raid_sensor_data = None
    self.sensor_type = None
    self._epoch_time = str(int(time.time()))
    self._raid_drives = []
    self._raid_device = "N/A"
    # NOTE(review): values here are snapshots of the attributes above (all
    # None at this point), not live references — confirm how consumers
    # refresh this mapping.
    self.os_sensor_type = {
        "disk_space": self.disk_sensor_data,
        "memory_usage": self.host_sensor_data,
        "nw": self.if_sensor_data,
        "cpu_usage": self.cpu_sensor_data,
        "raid_data": self.raid_sensor_data
    }

    # UUID used in json msgs
    self._uuid = None

    # Dict of drives by device name from systemd
    self._drive_by_device_name = {}

    # Dict of drive path by-ids by serial number from systemd
    self._drive_byid_by_serial_number = {}

    self._import_products(product)
    self.cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    # Per-category store of previously raised alerts, reloaded from disk.
    self.persistent_data = {'cpu': {}, 'disk': {}, 'memory': {}, 'nw': {}}
    # Persistent Cache for High CPU usage
    self.init_from_persistent_cache('cpu', 'CPU_USAGE_DATA')
    # Persistent Cache for High Disk Usage
    self.init_from_persistent_cache('disk', 'DISK_USAGE_DATA')
    # Persistent Cache for High Memory Usage
    self.init_from_persistent_cache('memory', 'MEMORY_USAGE_DATA')
    # Persistent Cache for Network sensor
    self.init_from_persistent_cache('nw', 'NW_SENSOR_DATA')
def initialize(self, conf_reader, msgQlist, product):
    """initialize configuration reader and internal msg queues"""

    # Initialize ScheduledMonitorThread
    super(NodeDataMsgHandler, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(NodeDataMsgHandler, self).initialize_msgQ(msgQlist)

    # Periodic transmit interval, defaulting to 60 when absent from config.
    self._transmit_interval = int(
        Conf.get(SSPL_CONF,
                 f"{self.NODEDATAMSGHANDLER}>{self.TRANSMIT_INTERVAL}",
                 60))
    self._units = Conf.get(SSPL_CONF,
                           f"{self.NODEDATAMSGHANDLER}>{self.UNITS}",
                           "MB")
    # Usage thresholds (config value or class-level default).
    self._disk_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.DISK_USAGE_THRESHOLD}",
        self.DEFAULT_DISK_USAGE_THRESHOLD)
    self._cpu_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.CPU_USAGE_THRESHOLD}",
        self.DEFAULT_CPU_USAGE_THRESHOLD)
    self._host_memory_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.HOST_MEMORY_USAGE_THRESHOLD}",
        self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD)

    # Cluster topology identifiers from the global configuration.
    self.site_id = Conf.get(GLOBAL_CONF, SITE_ID_KEY, "DC01")
    self.rack_id = Conf.get(GLOBAL_CONF, RACK_ID_KEY, "RC01")
    self.node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, "SN01")
    self.cluster_id = Conf.get(GLOBAL_CONF, CLUSTER_ID_KEY, "CC01")

    self.bmcNwStatus = None
    self.severity_reader = SeverityReader()
    self._node_sensor = None
    self._login_actuator = None
    # Latest payload per OS sensor category; all None until messages arrive.
    self.disk_sensor_data = None
    self.host_sensor_data = None
    self.if_sensor_data = None
    self.cpu_sensor_data = None
    self.raid_sensor_data = None
    self.sensor_type = None
    self._epoch_time = str(int(time.time()))
    self._raid_drives = []
    self._raid_device = "N/A"
    # NOTE(review): values here are snapshots of the attributes above (all
    # None at this point), not live references — confirm how consumers
    # refresh this mapping.
    self.os_sensor_type = {
        "disk_space": self.disk_sensor_data,
        "memory_usage": self.host_sensor_data,
        "nw": self.if_sensor_data,
        "cpu_usage": self.cpu_sensor_data,
        "raid_data": self.raid_sensor_data
    }

    # UUID used in json msgs
    self._uuid = None

    # Dict of drives by device name from systemd
    self._drive_by_device_name = {}

    # Dict of drive path by-ids by serial number from systemd
    self._drive_byid_by_serial_number = {}

    self._import_products(product)
    cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME)
    self.NW_SENSOR_DATA_PATH = os.path.join(
        cache_dir_path, f'NW_SENSOR_DATA_{self.node_id}')
    # Get the stored previous alert info
    self.persistent_nw_data = store.get(self.NW_SENSOR_DATA_PATH)
    if self.persistent_nw_data is None:
        # First run (or cache wiped): start with empty state and seed
        # the on-disk cache.
        self.prev_nw_status = {}
        self.prev_cable_cnxns = {}
        # Dir to maintain fault detected state for interface
        # in case of cable fault detection
        self.interface_fault_state = {}
        persistent_nw_data = {
            'prev_nw_status': self.prev_nw_status,
            'prev_cable_cnxns': self.prev_cable_cnxns,
            'interface_fault_state': self.interface_fault_state
        }
        store.put(persistent_nw_data, self.NW_SENSOR_DATA_PATH)
    else:
        # Restore previously persisted network-sensor state.
        self.prev_nw_status = self.persistent_nw_data.get(
            'prev_nw_status', {})
        self.prev_cable_cnxns = self.persistent_nw_data.get(
            'prev_cable_cnxns', {})
        self.interface_fault_state = self.persistent_nw_data.get(
            'interface_fault_state', {})
def initialize(self, conf_reader, msgQlist, product):
    """initialize configuration reader and internal msg queues"""

    # Initialize ScheduledMonitorThread
    super(NodeDataMsgHandler, self).initialize(conf_reader)

    # Initialize internal message queues for this module
    super(NodeDataMsgHandler, self).initialize_msgQ(msgQlist)

    # Periodic transmit interval, defaulting to 60 when absent from config.
    self._transmit_interval = int(
        Conf.get(SSPL_CONF,
                 f"{self.NODEDATAMSGHANDLER}>{self.TRANSMIT_INTERVAL}",
                 60))
    self._units = Conf.get(SSPL_CONF,
                           f"{self.NODEDATAMSGHANDLER}>{self.UNITS}",
                           "MB")
    # Usage thresholds (config value or class-level default).
    self._disk_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.DISK_USAGE_THRESHOLD}",
        self.DEFAULT_DISK_USAGE_THRESHOLD)
    self._cpu_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.CPU_USAGE_THRESHOLD}",
        self.DEFAULT_CPU_USAGE_THRESHOLD)
    self._host_memory_usage_threshold = Conf.get(
        SSPL_CONF,
        f"{self.NODEDATAMSGHANDLER}>{self.HOST_MEMORY_USAGE_THRESHOLD}",
        self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD)

    # Cluster topology identifiers read from the CLUSTER>SRVNODE section
    # of the global configuration.
    self.site_id = Conf.get(GLOBAL_CONF,
                            f'{CLUSTER}>{SRVNODE}>{self.SITE_ID}',
                            'DC01')
    self.rack_id = Conf.get(GLOBAL_CONF,
                            f'{CLUSTER}>{SRVNODE}>{self.RACK_ID}',
                            'RC01')
    self.node_id = Conf.get(GLOBAL_CONF,
                            f'{CLUSTER}>{SRVNODE}>{self.NODE_ID}',
                            'SN01')
    self.cluster_id = Conf.get(GLOBAL_CONF,
                               f'{CLUSTER}>{self.CLUSTER_ID}',
                               'CC01')

    self.prev_nw_status = {}
    self.bmcNwStatus = None
    self.severity_reader = SeverityReader()
    self.prev_cable_cnxns = {}
    self._node_sensor = None
    self._login_actuator = None
    # Latest payload per OS sensor category; all None until messages arrive.
    self.disk_sensor_data = None
    self.host_sensor_data = None
    self.if_sensor_data = None
    self.cpu_sensor_data = None
    self.raid_sensor_data = None
    self.sensor_type = None
    self._epoch_time = str(int(time.time()))
    self._raid_drives = []
    self._raid_device = "N/A"
    # NOTE(review): values here are snapshots of the attributes above (all
    # None at this point), not live references — confirm how consumers
    # refresh this mapping.
    self.os_sensor_type = {
        "disk_space": self.disk_sensor_data,
        "memory_usage": self.host_sensor_data,
        "nw": self.if_sensor_data,
        "cpu_usage": self.cpu_sensor_data,
        "raid_data": self.raid_sensor_data
    }

    # Dir to maintain fault detected state for interface
    # in case of cable fault detection
    self.interface_fault_state = {}

    # UUID used in json msgs
    self._uuid = None

    # Dict of drives by device name from systemd
    self._drive_by_device_name = {}

    # Dict of drive path by-ids by serial number from systemd
    self._drive_byid_by_serial_number = {}

    self._import_products(product)
class NodeDataMsgHandler(ScheduledModuleThread, InternalMsgQ):
    """Message Handler for generic node requests and generating
        host update messages on a regular interval"""

    MODULE_NAME = "NodeDataMsgHandler"
    PRIORITY = 2

    # Section and keys in configuration file
    NODEDATAMSGHANDLER = MODULE_NAME.upper()
    TRANSMIT_INTERVAL = 'transmit_interval'
    UNITS = 'units'
    DISK_USAGE_THRESHOLD = 'disk_usage_threshold'
    DEFAULT_DISK_USAGE_THRESHOLD = 80
    CPU_USAGE_THRESHOLD = 'cpu_usage_threshold'
    DEFAULT_CPU_USAGE_THRESHOLD = 80
    HOST_MEMORY_USAGE_THRESHOLD = 'host_memory_usage_threshold'
    DEFAULT_HOST_MEMORY_USAGE_THRESHOLD = 80
    SYSTEM_INFORMATION = "SYSTEM_INFORMATION"
    SITE_ID = "site_id"
    CLUSTER_ID = "cluster_id"
    NODE_ID = "node_id"
    RACK_ID = "rack_id"

    # Resource type strings used in outgoing IPMI FRU/sensor messages.
    IPMI_RESOURCE_TYPE_PSU = "node:fru:psu"
    IPMI_RESOURCE_TYPE_FAN = "node:fru:fan"
    IPMI_RESOURCE_TYPE_DISK = "node:fru:disk"
    IPMI_RESOURCE_TYPE_TEMPERATURE = "node:sensor:temperature"
    IPMI_RESOURCE_TYPE_VOLTAGE = "node:sensor:voltage"
    # TODO: Enable this code once Intel servers become available
    # to test the current sensor
    # IPMI_RESOURCE_TYPE_CURRENT = "node:sensor:current"
    NW_RESOURCE_TYPE = "node:interface:nw"
    NW_CABLE_RESOURCE_TYPE = "node:interface:nw:cable"

    # Fault latches: set on threshold breach, cleared when the reading
    # drops back under the threshold (prevents repeated fault alerts).
    host_fault = False
    cpu_fault = False
    disk_fault = False
    if_fault = False
    FAULT = "fault"
    FAULT_RESOLVED = "fault_resolved"
    INTERFACE_FAULT_DETECTED = False

    # Dependency list
    DEPENDENCIES = {"plugins": ["RabbitMQegressProcessor"], "rpms": []}

    @staticmethod
    def name():
        """ @return: name of the module."""
        return NodeDataMsgHandler.MODULE_NAME

    def __init__(self):
        super(NodeDataMsgHandler, self).__init__(self.MODULE_NAME,
                                                 self.PRIORITY)
        # Flag to indicate suspension of module
        self._suspended = False

    @staticmethod
    def dependencies():
        """Returns a list of plugins and RPMs this module requires
           to function.
        """
        return NodeDataMsgHandler.DEPENDENCIES

    def initialize(self, conf_reader, msgQlist, product):
        """initialize configuration reader and internal msg queues"""

        # Initialize ScheduledMonitorThread
        super(NodeDataMsgHandler, self).initialize(conf_reader)

        # Initialize internal message queues for this module
        super(NodeDataMsgHandler, self).initialize_msgQ(msgQlist)

        # Pull tunables from the SSPL config, falling back to class defaults.
        self._transmit_interval = int(
            Conf.get(SSPL_CONF,
                     f"{self.NODEDATAMSGHANDLER}>{self.TRANSMIT_INTERVAL}",
                     60))
        self._units = Conf.get(SSPL_CONF,
                               f"{self.NODEDATAMSGHANDLER}>{self.UNITS}",
                               "MB")
        self._disk_usage_threshold = Conf.get(
            SSPL_CONF,
            f"{self.NODEDATAMSGHANDLER}>{self.DISK_USAGE_THRESHOLD}",
            self.DEFAULT_DISK_USAGE_THRESHOLD)
        self._cpu_usage_threshold = Conf.get(
            SSPL_CONF,
            f"{self.NODEDATAMSGHANDLER}>{self.CPU_USAGE_THRESHOLD}",
            self.DEFAULT_CPU_USAGE_THRESHOLD)
        self._host_memory_usage_threshold = Conf.get(
            SSPL_CONF,
            f"{self.NODEDATAMSGHANDLER}>{self.HOST_MEMORY_USAGE_THRESHOLD}",
            self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD)

        # Cluster topology identifiers from the global config.
        self.site_id = Conf.get(GLOBAL_CONF, f'{CLUSTER}>{SRVNODE}>{self.SITE_ID}', 'DC01')
        self.rack_id = Conf.get(GLOBAL_CONF, f'{CLUSTER}>{SRVNODE}>{self.RACK_ID}', 'RC01')
        self.node_id = Conf.get(GLOBAL_CONF, f'{CLUSTER}>{SRVNODE}>{self.NODE_ID}', 'SN01')
        self.cluster_id = Conf.get(GLOBAL_CONF, f'{CLUSTER}>{self.CLUSTER_ID}', 'CC01')

        # Per-resource previous-state caches used for duplicate-alert
        # suppression across polling cycles.
        self.prev_nw_status = {}
        self.bmcNwStatus = None
        self.severity_reader = SeverityReader()
        self.prev_cable_cnxns = {}
        self._node_sensor = None
        self._login_actuator = None

        # Last generated JSON message per sensor category (None until the
        # first alert of that category is produced).
        self.disk_sensor_data = None
        self.host_sensor_data = None
        self.if_sensor_data = None
        self.cpu_sensor_data = None
        self.raid_sensor_data = None
        self.sensor_type = None
        self._epoch_time = str(int(time.time()))
        self._raid_drives = []
        self._raid_device = "N/A"
        self.os_sensor_type = {
            "disk_space": self.disk_sensor_data,
            "memory_usage": self.host_sensor_data,
            "nw": self.if_sensor_data,
            "cpu_usage": self.cpu_sensor_data,
            "raid_data": self.raid_sensor_data
        }

        # Dir to maintain fault detected state for interface
        # in case of cable fault detection
        self.interface_fault_state = {}

        # UUID used in json msgs
        self._uuid = None

        # Dict of drives by device name from systemd
        self._drive_by_device_name = {}

        # Dict of drive path by-ids by serial number from systemd
        self._drive_byid_by_serial_number = {}

        self._import_products(product)

    def _import_products(self, product):
        """Import classes based on which product is being used"""
        if product.lower() in [x.lower() for x in enabled_products]:
            from zope.component import queryUtility
            self._queryUtility = queryUtility

    def run(self):
        """Run the module periodically on its own thread."""
        self._log_debug("Start accepting requests")

        # Do not proceed if module is suspended
        if self._suspended == True:
            self._scheduler.enter(1, self._priority, self.run, ())
            return

        # self._set_debug(True)
        # self._set_debug_persist(True)

        try:
            # Query the Zope GlobalSiteManager for an object implementing INodeData
            if self._node_sensor is None:
                from sensors.INode_data import INodeData
                self._node_sensor = self._queryUtility(INodeData)()
                self._log_debug("_node_sensor name: %s" %
                                self._node_sensor.name())

            # Delay for the desired interval if it's greater than zero
            if self._transmit_interval > 0:
                logger.debug("self._transmit_interval:{}".format(
                    self._transmit_interval))
                timer = self._transmit_interval
                # Sleep in 1s slices so queued requests are still serviced
                # while waiting out the transmit interval.
                while timer > 0:
                    # See if the message queue contains an entry and process
                    jsonMsg, _ = self._read_my_msgQ_noWait()
                    if jsonMsg is not None:
                        self._process_msg(jsonMsg)

                    time.sleep(1)
                    timer -= 1

                # Generate the JSON messages with data from the node and transmit on regular interval
                self._generate_host_update()
                self._generate_cpu_data()
                self._generate_if_data()
                self._generate_disk_space_alert()

            # If the timer is zero then block for incoming requests notifying to transmit data
            else:
                # Block on message queue until it contains an entry
                jsonMsg, _ = self._read_my_msgQ()
                if jsonMsg is not None:
                    self._process_msg(jsonMsg)

                # Keep processing until the message queue is empty
                while not self._is_my_msgQ_empty():
                    jsonMsg, _ = self._read_my_msgQ()
                    if jsonMsg is not None:
                        self._process_msg(jsonMsg)

        except Exception as ae:
            # Log it and restart the whole process when a failure occurs
            logger.exception("NodeDataMsgHandler restarting: %s" % ae)

        # Reschedule ourselves; the module runs forever on the scheduler.
        self._scheduler.enter(1, self._priority, self.run, ())
        self._log_debug("Finished processing successfully")

    def _process_msg(self, jsonMsg):
        """Parses the incoming message and generate the desired data message"""
        self._log_debug("_process_msg, jsonMsg: %s" % jsonMsg)

        if isinstance(jsonMsg, dict) is False:
            jsonMsg = json.loads(jsonMsg)

        # Parse out the uuid so that it can be sent back in response message
        self._uuid = None
        if jsonMsg.get("sspl_ll_msg_header") is not None and \
           jsonMsg.get("sspl_ll_msg_header").get("uuid") is not None:
            self._uuid = jsonMsg.get("sspl_ll_msg_header").get("uuid")
            self._log_debug("_processMsg, uuid: %s" % self._uuid)

        # Sensor-type requests: regenerate the requested category and replay
        # the cached message (if any) to the egress processor.
        if jsonMsg.get("sensor_request_type") is not None and \
           jsonMsg.get("sensor_request_type").get("node_data") is not None and \
           jsonMsg.get("sensor_request_type").get("node_data").get("sensor_type") is not None:
            # sensor_type is e.g. "node:os:memory_usage" — keep the last field.
            self.sensor_type = jsonMsg.get("sensor_request_type").get(
                "node_data").get("sensor_type").split(":")[2]
            self._log_debug("_processMsg, sensor_type: %s" % self.sensor_type)

            if self.sensor_type == "memory_usage":
                self._generate_host_update()
                sensor_message_type = self.os_sensor_type.get(
                    self.sensor_type, "")
                if sensor_message_type:
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                              sensor_message_type)
                else:
                    self._log_debug(f"NodeDataMsgHandler, _process_msg, \
                        No past data found for {self.sensor_type} sensor type")

            elif self.sensor_type == "cpu_usage":
                self._generate_cpu_data()
                sensor_message_type = self.os_sensor_type.get(
                    self.sensor_type, "")
                if sensor_message_type:
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                              sensor_message_type)
                else:
                    self._log_debug(f"NodeDataMsgHandler, _process_msg, \
                        No past data found for {self.sensor_type} sensor type")

            elif self.sensor_type == "nw":
                self._generate_if_data()
                sensor_message_type = self.os_sensor_type.get(
                    self.sensor_type, "")
                if sensor_message_type:
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                              sensor_message_type)
                else:
                    self._log_debug(f"NodeDataMsgHandler, _process_msg, \
                        No past data found for {self.sensor_type} sensor type")

            elif self.sensor_type == "disk_space":
                self._generate_disk_space_alert()
                sensor_message_type = self.os_sensor_type.get(
                    self.sensor_type, "")
                if sensor_message_type:
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                              sensor_message_type)
                else:
                    self._log_debug(f"NodeDataMsgHandler, _process_msg, \
                        No past data found for {self.sensor_type} sensor type")

            elif self.sensor_type == "raid_data":
                self._generate_raid_data(jsonMsg)
                sensor_message_type = self.os_sensor_type.get(
                    self.sensor_type, "")
                if sensor_message_type:
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                              sensor_message_type)
                else:
                    self._log_debug(
                        "NodeDataMsgHandler, _process_msg " +
                        f"No past data found for {self.sensor_type} sensor type"
                    )

            elif self.sensor_type == "raid_integrity":
                self._generate_raid_integrity_data(jsonMsg)
                sensor_message_type = self.os_sensor_type.get(
                    self.sensor_type, "")
                if sensor_message_type:
                    self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                              sensor_message_type)
                else:
                    self._log_debug(
                        "NodeDataMsgHandler, _process_msg " +
                        f"No past data found for {self.sensor_type} sensor type"
                    )

        # Update mapping of device names to serial numbers for global use
        elif jsonMsg.get("sensor_response_type") is not None:
            if jsonMsg.get(
                    "sensor_response_type") == "devicename_serialnumber":
                self._update_devicename_sn_dict(jsonMsg)

        elif jsonMsg.get("sensor_request_type") is not None and \
             jsonMsg.get("sensor_request_type").get("node_data") is not None and \
             jsonMsg.get("sensor_request_type").get("node_data").get("info") is not None and \
             jsonMsg.get("sensor_request_type").get("node_data").get("info").get("resource_type") is not None:
            self._generate_node_fru_data(jsonMsg)

        # ... handle other node sensor message types

    def _update_devicename_sn_dict(self, jsonMsg):
        """Update the dict of device names to serial numbers"""
        drive_byid = jsonMsg.get("drive_byid")
        device_name = jsonMsg.get("device_name")
        serial_number = jsonMsg.get("serial_number")

        self._drive_by_device_name[device_name] = serial_number
        self._drive_byid_by_serial_number[serial_number] = drive_byid

        self._log_debug(
            "NodeDataMsgHandler, device_name: %s, serial_number: %s, drive_byid: %s"
            % (device_name, serial_number, drive_byid))

    def _generate_host_update(self):
        """Create & transmit a host update message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the host_update message
        successful = self._node_sensor.read_data("host_update",
                                                 self._get_debug(),
                                                 self._units)
        if not successful:
            logger.error(
                "NodeDataMsgHandler, _generate_host_update was NOT successful."
            )

        # The configured threshold may be an int or a float string; coerce it.
        self._host_memory_usage_threshold = str(
            self._host_memory_usage_threshold)
        try:
            if self._host_memory_usage_threshold.isdigit():
                self._host_memory_usage_threshold = int(
                    self._host_memory_usage_threshold)
            else:
                self._host_memory_usage_threshold = float(
                    self._host_memory_usage_threshold)
        except ValueError:
            logger.warning(
                "Host Memory Alert, Invalid host_memory_usage_threshold value are entered in config."
            )
            # Assigning default value to _disk_usage_threshold
            self._host_memory_usage_threshold = self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD

        if self._node_sensor.total_memory[
                "percent"] >= self._host_memory_usage_threshold:
            # Create the disk space data message and hand it over to the egress processor to transmit
            if not self.host_fault:
                self.host_fault = True
                # Create the disk space data message and hand it over to the egress processor to transmit
                fault_event = "Host memory usage increased to %s, beyond configured threshold of %s" \
                                %(self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold)

                logger.warning(fault_event)

                logged_in_users = []  # NOTE(review): unused local
                # Create the host update message and hand it over to the egress processor to transmit
                hostUpdateMsg = HostUpdateMsg(
                    self._node_sensor.host_id, self._epoch_time,
                    self._node_sensor.boot_time, self._node_sensor.up_time,
                    self._node_sensor.uname, self._units, self.site_id,
                    self.rack_id, self.node_id, self.cluster_id,
                    self._node_sensor.total_memory,
                    self._node_sensor.logged_in_users,
                    self._node_sensor.process_count,
                    self._node_sensor.running_process_count, self.FAULT,
                    fault_event)
                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    hostUpdateMsg.set_uuid(self._uuid)
                jsonMsg = hostUpdateMsg.getJson()
                # Transmit it out over rabbitMQ channel
                self.host_sensor_data = jsonMsg
                self.os_sensor_type["memory_usage"] = self.host_sensor_data

                self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                          jsonMsg)

        if (self._node_sensor.total_memory["percent"] <
                self._host_memory_usage_threshold) and (self.host_fault == True):
            fault_resolved_event = "Host memory usage decreased to %s, lesser than configured threshold of %s" \
                                    %(self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold)
            logger.warning(fault_resolved_event)
            logged_in_users = []  # NOTE(review): unused local
            # Create the host update message and hand it over to the egress processor to transmit
            hostUpdateMsg = HostUpdateMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.boot_time, self._node_sensor.up_time,
                self._node_sensor.uname, self._units, self.site_id,
                self.rack_id, self.node_id, self.cluster_id,
                self._node_sensor.total_memory,
                self._node_sensor.logged_in_users,
                self._node_sensor.process_count,
                self._node_sensor.running_process_count, self.FAULT_RESOLVED,
                fault_resolved_event)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                hostUpdateMsg.set_uuid(self._uuid)
            jsonMsg = hostUpdateMsg.getJson()
            # Transmit it out over rabbitMQ channel
            self.host_sensor_data = jsonMsg
            self.os_sensor_type["memory_usage"] = self.host_sensor_data

            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
            self.host_fault = False

    def _generate_local_mount_data(self):
        """Create & transmit a local_mount_data message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the local_mount_data message
        successful = self._node_sensor.read_data("local_mount_data",
                                                 self._get_debug(),
                                                 self._units)
        if not successful:
            logger.error(
                "NodeDataMsgHandler, _generate_local_mount_data was NOT successful."
            )

        # Create the local mount data message and hand it over to the egress processor to transmit
        localMountDataMsg = LocalMountDataMsg(
            self._node_sensor.host_id, self._epoch_time,
            self._node_sensor.free_space, self._node_sensor.free_inodes,
            self._node_sensor.free_swap, self._node_sensor.total_space,
            self._node_sensor.total_swap, self._units)

        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            localMountDataMsg.set_uuid(self._uuid)
        jsonMsg = localMountDataMsg.getJson()

        # Transmit it out over rabbitMQ channel
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

    def _generate_cpu_data(self):
        """Create & transmit a cpu_data message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the cpu_data message
        successful = self._node_sensor.read_data("cpu_data", self._get_debug())
        if not successful:
            logger.error(
                "NodeDataMsgHandler, _generate_cpu_data was NOT successful.")

        # The configured threshold may be an int or a float string; coerce it.
        self._cpu_usage_threshold = str(self._cpu_usage_threshold)
        try:
            if self._cpu_usage_threshold.isdigit():
                self._cpu_usage_threshold = int(self._cpu_usage_threshold)
            else:
                self._cpu_usage_threshold = float(self._cpu_usage_threshold)
        except ValueError:
            # NOTE(review): message names host_memory_usage_threshold but this
            # is the CPU threshold path — likely a copy/paste slip in the text.
            logger.warning(
                "CPU Usage Alert, Invalid host_memory_usage_threshold value are entered in config."
            )
            # Assigning default value to _cpu_usage_threshold
            self._cpu_usage_threshold = self.DEFAULT_CPU_USAGE_THRESHOLD

        if self._node_sensor.cpu_usage >= self._cpu_usage_threshold:
            if not self.cpu_fault:
                self.cpu_fault = True
                # Create the cpu usage data message and hand it over to the egress processor to transmit
                fault_event = "CPU usage increased to %s, beyond configured threshold of %s" \
                                %(self._node_sensor.cpu_usage, self._cpu_usage_threshold)

                logger.warning(fault_event)

                # Create the local mount data message and hand it over to the egress processor to transmit
                cpuDataMsg = CPUdataMsg(
                    self._node_sensor.host_id, self._epoch_time,
                    self._node_sensor.csps, self._node_sensor.idle_time,
                    self._node_sensor.interrupt_time,
                    self._node_sensor.iowait_time, self._node_sensor.nice_time,
                    self._node_sensor.softirq_time,
                    self._node_sensor.steal_time,
                    self._node_sensor.system_time, self._node_sensor.user_time,
                    self._node_sensor.cpu_core_data,
                    self._node_sensor.cpu_usage, self.site_id, self.rack_id,
                    self.node_id, self.cluster_id, self.FAULT, fault_event)

                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    cpuDataMsg.set_uuid(self._uuid)
                jsonMsg = cpuDataMsg.getJson()
                self.cpu_sensor_data = jsonMsg
                self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data

                # Transmit it out over rabbitMQ channel
                self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                          jsonMsg)

        if (self._node_sensor.cpu_usage <=
                self._cpu_usage_threshold) and (self.cpu_fault == True):
            # Create the cpu usage data message and hand it over to the egress processor to transmit
            fault_resolved_event = "CPU usage decreased to %s, lesser than configured threshold of %s" \
                                    %(self._node_sensor.cpu_usage, self._cpu_usage_threshold)
            logger.warning(fault_resolved_event)

            # Create the local mount data message and hand it over to the egress processor to transmit
            cpuDataMsg = CPUdataMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.csps, self._node_sensor.idle_time,
                self._node_sensor.interrupt_time,
                self._node_sensor.iowait_time, self._node_sensor.nice_time,
                self._node_sensor.softirq_time, self._node_sensor.steal_time,
                self._node_sensor.system_time, self._node_sensor.user_time,
                self._node_sensor.cpu_core_data, self._node_sensor.cpu_usage,
                self.site_id, self.rack_id, self.node_id, self.cluster_id,
                self.FAULT_RESOLVED, fault_resolved_event)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                cpuDataMsg.set_uuid(self._uuid)
            jsonMsg = cpuDataMsg.getJson()
            self.cpu_sensor_data = jsonMsg
            self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data

            # Transmit it out over rabbitMQ channel
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
            self.cpu_fault = False

    def _send_ifdata_json_msg(self,
                              sensor_type,
                              resource_id,
                              resource_type,
                              state,
                              severity,
                              event=""):
        """A resuable method for transmitting IFDataMsg to RMQ and IEM logging"""
        ifDataMsg = IFdataMsg(self._node_sensor.host_id,
                              self._node_sensor.local_time,
                              self._node_sensor.if_data, resource_id,
                              resource_type, self.site_id, self.node_id,
                              self.cluster_id, self.rack_id, state, severity,
                              event)
        # Add in uuid if it was present in the json request
        if self._uuid is not None:
            ifDataMsg.set_uuid(self._uuid)
        jsonMsg = ifDataMsg.getJson()
        self.if_sensor_data = jsonMsg
        self.os_sensor_type[sensor_type] = self.if_sensor_data

        # Send the event to logging msg handler to send IEM message to journald
        #internal_json_msg=json.dumps({
        #                 'actuator_request_type': {
        #                    'logging': {
        #                        'log_level': 'LOG_WARNING',
        #                        'log_type': 'IEM',
        #                        'log_msg': '{}'.format(jsonMsg)}}})
        #self._write_internal_msgQ(LoggingMsgHandler.name(), internal_json_msg)

        # Transmit it out over rabbitMQ channel
        self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

    def _generate_if_data(self):
        """Create & transmit a network interface data message as defined
            by the sensor response json schema"""
        event_field = ""
        # Notify the node sensor to update its data required for the if_data message
        successful = self._node_sensor.read_data("if_data", self._get_debug())
        if not successful:
            logger.error(
                "NodeDataMsgHandler, _generate_if_data was NOT successful.")

        interfaces = self._node_sensor.if_data
        nw_alerts = self._get_nwalert(interfaces)

        # Get all cable connections state and generate alert on
        # cables identified for fault detected and resolved state
        nw_cable_alerts = self._nw_cable_alert_exists(interfaces)
        for nw_cable_resource_id, state in nw_cable_alerts.items():
            severity = self.severity_reader.map_severity(state)
            # Check if any nw interface fault is there because of cable pull
            # NOTE(review): direct indexing nw_alerts[nw_cable_resource_id]
            # raises KeyError if the cable alert has no matching nw alert —
            # confirm whether .get() was intended.
            if nw_alerts and nw_alerts[nw_cable_resource_id] == state:
                if state == self.FAULT:
                    self.INTERFACE_FAULT_DETECTED = True
                    # if yes, then mark the flag detection True for the respective interface
                    self.interface_fault_state[
                        nw_cable_resource_id] = self.INTERFACE_FAULT_DETECTED
                    event_field = f'Network interface: {nw_cable_resource_id}' + ' ' \
                                    'is also down because of cable fault'
                else:
                    event_field = f'Network interface: {nw_cable_resource_id}' + ' ' \
                                    'is also up after cable insertion'

            # Send the cable alert
            self._send_ifdata_json_msg("nw", nw_cable_resource_id,
                                       self.NW_CABLE_RESOURCE_TYPE, state,
                                       severity, event_field)

        # Check for Nw interface fault
        for nw_resource_id, nw_state in nw_alerts.items():
            # Check if nw interface fault is resolved. If resolved, check whether its
            # resolved by cable insertion by checking the self.interface_fault_state
            # dictionary.
            if (self.interface_fault_state and nw_state == self.FAULT_RESOLVED and not \
                    self.interface_fault_state.get(nw_resource_id)):
                # delete the entry for that interface from the interface
                # directory specifically maintaned to track interface
                # fault in case of cable fault. This is imp because otherwise
                # if fault occurs for the same nw interface after cable insertion case,
                # fault_resolved alert for the same nw interface will not be seen.
                del self.interface_fault_state[nw_resource_id]
                continue
            elif self.interface_fault_state.get(nw_resource_id):
                # If yes, then don't repeat the alert.
                continue

            if nw_state == self.FAULT:
                event_field = f'Network interface {nw_resource_id} is down'
            else:
                event_field = f'Network interface {nw_resource_id} is up'

            # If no or for othe interface, send the alert
            severity = self.severity_reader.map_severity(nw_state)
            self._send_ifdata_json_msg("nw", nw_resource_id,
                                       self.NW_RESOURCE_TYPE, nw_state,
                                       severity, event_field)

    def _get_nwalert(self, interfaces):
        """
        Get network interfaces with fault/OK state for each interface.
        Parameters:
                    interfaces(list) : List of availabel network interfaces
        Returns: Dictionary of network interfaces having key as interface name
                 and value as fault state.
        Return type: dict
        """
        nw_alerts = {}
        try:
            for interface in interfaces:
                interface_name = interface.get("ifId")
                nw_status = interface.get("nwStatus")
                logger.debug("{0}:{1}".format(interface_name, nw_status))
                # fault detected (Down/UNKNOWN, Up/UNKNOWN to Down, Up/Down to UNKNOWN)
                if nw_status == 'DOWN' or nw_status == 'UNKNOWN':
                    if self.prev_nw_status.get(interface_name) != nw_status:
                        # Only alert if the interface was previously UP.
                        if self.prev_nw_status.get(
                                interface_name) and self.prev_nw_status.get(
                                    interface_name) == 'UP':
                            logger.warning(
                                f"Network connection fault is detected for interface:'{interface_name}'"
                            )
                            nw_alerts[interface_name] = self.FAULT
                        self.prev_nw_status[interface_name] = nw_status
                # fault resolved (Down to Up)
                elif nw_status == 'UP':
                    if self.prev_nw_status.get(interface_name) != nw_status:
                        if self.prev_nw_status.get(interface_name):
                            logger.info(
                                f"Network connection fault is resolved for interface:'{interface_name}'"
                            )
                            nw_alerts[interface_name] = self.FAULT_RESOLVED
                        self.prev_nw_status[interface_name] = nw_status
                else:
                    logger.warning(
                        f"Network connection state is:'{nw_status}', for interface:'{interface_name}'"
                    )
        except Exception as e:
            logger.error(
                f"Exception occurs while checking for network alert condition:'{e}'"
            )
        logger.debug("nw_alerts existed for:{}".format(nw_alerts))
        return nw_alerts

    def _nw_cable_alert_exists(self, interfaces):
        """Checks cable connection status with physical link(carrier) state
        and avoids duplicate alert reporting by comparing with its previous state.
        Fault detection is identified by physical link state Down.
        Fault resolved is identified by physical link state changed from Down to Up.
        """
        identified_cables = {}

        for interface in interfaces:
            interface_name = interface.get("ifId")
            phy_link_status = interface.get("nwCableConnStatus")

            # fault detected (Down, Up to Down)
            if phy_link_status == 'DOWN':
                if self.prev_cable_cnxns.get(
                        interface_name) != phy_link_status:
                    if self.prev_cable_cnxns.get(interface_name):
                        logger.warning(
                            f"Cable connection fault is detected with '{interface_name}'"
                        )
                        identified_cables[interface_name] = self.FAULT
                    self.prev_cable_cnxns[interface_name] = phy_link_status
            # fault resolved (Down to Up)
            elif phy_link_status == 'UP':
                if self.prev_cable_cnxns.get(
                        interface_name) != phy_link_status:
                    if self.prev_cable_cnxns.get(interface_name):
                        logger.info(
                            f"Cable connection fault is resolved with '{interface_name}'"
                        )
                        identified_cables[interface_name] = self.FAULT_RESOLVED

                        if self.interface_fault_state and interface_name in self.interface_fault_state:
                            # After the cable fault is resolved, unset the flag for interface
                            # So that, it can be tracked further for any failure
                            self.INTERFACE_FAULT_DETECTED = False
                            self.interface_fault_state[
                                interface_name] = self.INTERFACE_FAULT_DETECTED
                            # Also clear the global nw interface dictionary
                            self.prev_nw_status[
                                interface_name] = phy_link_status
                    self.prev_cable_cnxns[interface_name] = phy_link_status
            else:
                logger.debug(
                    f"Cable connection state is unknown with '{interface_name}'"
                )

        return identified_cables

    def _generate_disk_space_alert(self):
        """Create & transmit a disk_space_alert message as defined
            by the sensor response json schema"""

        # Notify the node sensor to update its data required for the disk_space_data message
        successful = self._node_sensor.read_data("disk_space_alert",
                                                 self._get_debug(),
                                                 self._units)
        if not successful:
            logger.error(
                "NodeDataMsgHandler, _generate_disk_space_alert was NOT successful."
            )
            return

        # Changing disk_usage_threshold type according to what value type entered in config file
        self._disk_usage_threshold = str(self._disk_usage_threshold)
        try:
            if self._disk_usage_threshold.isdigit():
                self._disk_usage_threshold = int(self._disk_usage_threshold)
            else:
                self._disk_usage_threshold = float(self._disk_usage_threshold)
        except ValueError:
            logger.warning(
                "Disk Space Alert, Invalid disk_usage_threshold value are entered in config."
            )
            # Assigning default value to _disk_usage_threshold
            self._disk_usage_threshold = self.DEFAULT_DISK_USAGE_THRESHOLD

        if self._node_sensor.disk_used_percentage >= self._disk_usage_threshold:
            if not self.disk_fault:
                self.disk_fault = True
                # Create the disk space data message and hand it over to the egress processor to transmit
                fault_event = "Disk usage increased to %s, beyond configured threshold of %s" \
                                %(self._node_sensor.disk_used_percentage, self._disk_usage_threshold)
                logger.warning(fault_event)
                diskSpaceAlertMsg = DiskSpaceAlertMsg(
                    self._node_sensor.host_id, self._epoch_time,
                    self._node_sensor.total_space, self._node_sensor.free_space,
                    self._node_sensor.disk_used_percentage, self._units,
                    self.site_id, self.rack_id, self.node_id, self.cluster_id,
                    self.FAULT, fault_event)

                # Add in uuid if it was present in the json request
                if self._uuid is not None:
                    diskSpaceAlertMsg.set_uuid(self._uuid)
                jsonMsg = diskSpaceAlertMsg.getJson()
                self.disk_sensor_data = jsonMsg
                self.os_sensor_type["disk_space"] = self.disk_sensor_data

                # Transmit it out over rabbitMQ channel
                self._write_internal_msgQ(RabbitMQegressProcessor.name(),
                                          jsonMsg)

        if (self._node_sensor.disk_used_percentage <=
                self._disk_usage_threshold) and (self.disk_fault == True):
            # Create the disk space data message and hand it over to the egress processor to transmit
            fault_resolved_event = "Disk usage decreased to %s, lesser than configured threshold of %s" \
                                    %(self._node_sensor.disk_used_percentage, self._disk_usage_threshold)
            logger.warning(fault_resolved_event)
            diskSpaceAlertMsg = DiskSpaceAlertMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.total_space, self._node_sensor.free_space,
                self._node_sensor.disk_used_percentage, self._units,
                self.site_id, self.rack_id, self.node_id, self.cluster_id,
                self.FAULT_RESOLVED, fault_resolved_event)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                diskSpaceAlertMsg.set_uuid(self._uuid)
            jsonMsg = diskSpaceAlertMsg.getJson()
            self.disk_sensor_data = jsonMsg
            self.os_sensor_type["disk_space"] = self.disk_sensor_data

            # Transmit it out over rabbitMQ channel
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)
            self.disk_fault = False

    def _generate_raid_data(self, jsonMsg):
        """Create & transmit a RAID status data message as defined
            by the sensor response json schema"""
        successful = self._node_sensor.read_data("raid", self._get_debug(),
                                                 self._units)
        if not successful:
            logger.error(
                "NodeDataMsgHandler, updating RAID information was NOT successful."
            )
            return

        # See if status is in the msg; ie it's an internal msg from the RAID sensor
        if jsonMsg.get("sensor_request_type").get("node_data").get(
                "status") is not None:
            sensor_request = jsonMsg.get("sensor_request_type").get(
                "node_data")
            host_name = sensor_request.get("host_id")
            alert_type = sensor_request.get("alert_type")
            alert_id = sensor_request.get("alert_id")
            severity = sensor_request.get("severity")
            info = sensor_request.get("info")
            specific_info = sensor_request.get("specific_info")
            self._raid_device = jsonMsg.get("sensor_request_type").get(
                "node_data").get("specific_info").get("device")
            self._raid_drives = list(
                jsonMsg.get("sensor_request_type").get("node_data").get(
                    "specific_info").get("drives"))
            raidDataMsg = RAIDdataMsg(host_name, alert_type, alert_id,
                                      severity, info, specific_info)
            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                raidDataMsg.set_uuid(self._uuid)
            jsonMsg = raidDataMsg.getJson()
            self.raid_sensor_data = jsonMsg
            self.os_sensor_type["raid_data"] = self.raid_sensor_data

            # Loop thru each index of drives containing only paths and fill in with s/n
            for drive in self._raid_drives:
                self._log_debug("drive: %s" % str(drive))

                if drive.get("identity") is not None:
                    path = drive.get("identity").get("path")
                    self._log_debug("path: %s" % str(path))

                    # Lookup the serial number from the path
                    serial_number = str(self._drive_by_device_name.get(path))
                    self._log_debug("serial_number: %s" % str(serial_number))
                    if serial_number != "None":
                        drive["identity"]["serialNumber"] = serial_number

                    # Change device path to path-byid
                    drive_byid = str(
                        self._drive_byid_by_serial_number.get(serial_number))
                    if drive_byid != "None":
                        drive["identity"]["path"] = drive_byid

            self._log_debug(
                "_generate_raid_data, host_id: %s, device: %s, drives: %s" %
                (self._node_sensor.host_id, self._raid_device,
                 str(self._raid_drives)))

    def _generate_raid_integrity_data(self, jsonMsg):
        """Create & transmit a Validate RAID result data message as defined
            by the sensor response json schema"""
        logger.debug("NodeDataMsgHandler, Validating RAID information")

        # See if status is in the msg; ie it's an internal msg from the RAID sensor
        if jsonMsg.get("sensor_request_type").get("node_data").get(
                "status") is not None:
            sensor_request = jsonMsg.get("sensor_request_type").get(
                "node_data")
            host_name = sensor_request.get("host_id")
            alert_type = sensor_request.get("alert_type")
            alert_id = sensor_request.get("alert_id")
            severity = sensor_request.get("severity")
            info = sensor_request.get("info")
            specific_info = sensor_request.get("specific_info")
            self._alert = jsonMsg.get("sensor_request_type").get(
                "node_data").get("specific_info").get("error")
            RAIDintegrityMsg = RAIDIntegrityMsg(host_name, alert_type,
                                                alert_id, severity, info,
                                                specific_info)
            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                RAIDintegrityMsg.set_uuid(self._uuid)
            jsonMsg = RAIDintegrityMsg.getJson()
            self.raid_integrity_data = jsonMsg
            self.os_sensor_type["raid_integrity"] = self.raid_integrity_data

            self._log_debug("_generate_raid_integrity_data, host_id: %s" %
                            (self._node_sensor.host_id))

    def _generate_node_fru_data(self, jsonMsg):
        """Create & transmit a FRU IPMI data message as defined
            by the sensor response json schema"""

        if self._node_sensor.host_id is None:
            successful = self._node_sensor.read_data("None",
                                                     self._get_debug(),
                                                     self._units)
            if not successful:
                logger.error(
                    "NodeDataMsgHandler, updating host information was NOT successful."
                )

        if jsonMsg.get("sensor_request_type").get("node_data") is not None:
            self._fru_info = jsonMsg.get("sensor_request_type").get(
                "node_data")
            node_ipmi_data_msg = NodeIPMIDataMsg(self._fru_info)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                node_ipmi_data_msg.set_uuid(self._uuid)
            jsonMsg = node_ipmi_data_msg.getJson()
            self._write_internal_msgQ(RabbitMQegressProcessor.name(), jsonMsg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(NodeDataMsgHandler, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(NodeDataMsgHandler, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(NodeDataMsgHandler, self).shutdown()
def _create_json_message(self, alert_type, port):
    """Create the internal JSON alert message for a SAS HBA or SAS port.

    alert_type : 'fault' or 'fault_resolved'.
    port       : SAS port number for a port-level alert, or -1 for an
                 HBA (connection) level alert.

    Returns the serialized JSON string that flows between SSPL modules.

    Fix: the port-level fault description was assembled from adjacent
    string literals with no separating spaces, producing the garbled text
    "...possiblecauses could be ... connection,faulty cable...".
    """
    internal_json_msg = None
    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))

    alert_id = self._get_alert_id(epoch_time)
    host_name = socket.gethostname()

    specific_info = {}
    specific_info_list = []
    description = "N/A"

    # specific_info will contain all 16 phys for conn level alert.
    # Only 4 phys for port level alert.
    for key, val in self.phy_dir_to_linkrate_mapping.items():
        if port != -1:
            # This is a port level alert, skip phys that are not relevant
            if key not in self.port_phy_list_dict[port]:
                # Skip adding this phy
                continue
        # Key will be like 'phy-0:0', so split it using ':'.
        # Resulting structure will be e.g. 'SASHBA-0:phy-0'.
        phy_number = key.split(":")[1]
        specific_info["resource_id"] = \
            self.RESOURCE_ID + ':' + "phy-" + phy_number
        specific_info["negotiated_link_rate"] = \
            self.phy_dir_to_linkrate_mapping[key][0].strip()
        specific_info_list.append(specific_info)
        specific_info = {}

    alert_specific_info = specific_info_list

    if port == -1:
        # This is a SAS HBA level connection alert
        if alert_type == 'fault':
            description = "SAS connection error detected in SAS HBA %s." % self.RESOURCE_ID
        elif alert_type == 'fault_resolved':
            description = "SAS connection re-established in SAS HBA %s." % self.RESOURCE_ID
        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE,  # node:interface:sas
            "resource_id": self.RESOURCE_ID,  # SASHBA-0
            "event_time": epoch_time,
            "description": description
        }
    else:
        # This is a port level alert
        if alert_type == 'fault':
            # NOTE: spaces restored between the concatenated literals so the
            # emitted description reads as a proper sentence.
            description = (
                "No connectivity detected on the SAS port %s, possible "
                "causes could be missing SAS cable, bad cable connection, "
                "faulty cable or SAS port failure." % port)
        elif alert_type == 'fault_resolved':
            description = "Connection established on SAS port."
        info = {
            "site_id": self._site_id,
            "cluster_id": self._cluster_id,
            "rack_id": self._rack_id,
            "node_id": self._node_id,
            "resource_type": self.RESOURCE_TYPE + ':port',  # node:interface:sas:port
            "resource_id": self.RESOURCE_ID + f'-port-{port}',  # SASHBA-0-port-0
            "event_time": epoch_time,
            "description": description
        }

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": alert_specific_info
            }
        }
    })

    return internal_json_msg
def _send_json_msg(self, alert_type, resource_id, device, drives):
    """Transmit data to NodeDataMsgHandler to be processed and sent out"""
    event_time = str(int(time.time()))
    severity = SeverityReader().map_severity(alert_type)
    self._alert_id = self._get_alert_id(event_time)
    fqdn = socket.getfqdn()

    # Pick the human-readable description for the alert type; the scan
    # order mirrors the precedence of the original if/elif chain.
    known_descriptions = (
        (self.MISSING, "RAID array or drive from RAID array is missing."),
        (self.FAULT, "RAID array or drive from RAID array is faulty."),
        (self.INSERTION, "Inserted drive in RAID array."),
        (self.FAULT_RESOLVED, "Fault for RAID array or RAID drive is resolved"),
    )
    for candidate, text in known_descriptions:
        if alert_type == candidate:
            description = text
            break
    else:
        description = "Raid array alert"

    info = {
        "resource_type": self.RESOURCE_TYPE,
        "resource_id": resource_id,
        "event_time": event_time,
        "description": description
    }
    specific_info = {"device": device, "drives": drives}

    internal_json_msg = json.dumps({
        "sensor_request_type": {
            "node_data": {
                "status": "update",
                "sensor_type": "node:os:raid_data",
                "host_id": fqdn,
                "alert_type": alert_type,
                "alert_id": self._alert_id,
                "severity": severity,
                "info": info,
                "specific_info": specific_info
            }
        }
    })

    self.prev_alert_type[device] = alert_type
    self.alert_type = None

    # Send the event to node data message handler to generate json message and send out
    self._write_internal_msgQ(NodeDataMsgHandler.name(), internal_json_msg)

    # Save the state to Persistent Cache.
    self.persistent_raid_data = {
        '_RAID_status_contents': self._RAID_status_contents,
        '_RAID_status': self._RAID_status,
        '_faulty_drive_list': self._faulty_drive_list,
        '_faulty_device_list': self._faulty_device_list,
        '_drives': self._drives,
        '_total_drives': self._total_drives,
        '_devices': self._devices,
        '_missing_drv': self._missing_drv,
        '_prev_drive_dict': self._prev_drive_dict,
        'prev_alert_type': self.prev_alert_type,
    }
    store.put(self.persistent_raid_data, self.RAID_SENSOR_DATA_PATH)
def _create_internal_msg(self, psu_detail, alert_type):
    """Forms a dictionary containing info about PSUs to send to
    message handler.

    psu_detail : dict of PSU attributes as reported by the enclosure
                 (durable-id, voltages, health, ...).
    alert_type : alert type string used to derive the severity.

    Returns the serialized JSON request, or {} when psu_detail is empty.

    Fix: the "dc5v" field previously copied the "dc12v" reading
    (copy-paste bug), so the 5 V rail value was never reported.
    """
    self._log_debug(
        f"RealStorPSUSensor._create_internal_msg -> {psu_detail} {alert_type}")
    if not psu_detail:
        return {}

    severity_reader = SeverityReader()
    severity = severity_reader.map_severity(alert_type)
    epoch_time = str(int(time.time()))

    alert_id = self._get_alert_id(epoch_time)
    fru = self.rssencl.is_storage_fru('POWER_SUPPLY')
    resource_id = psu_detail.get("durable-id")
    host_name = self._get_hostname()

    info = {
        "resource_type": self.RESOURCE_CATEGORY,
        "fru": fru,
        "resource_id": resource_id,
        "event_time": epoch_time
    }

    specific_info = {
        "enclosure-id": psu_detail.get("enclosure-id"),
        "serial-number": psu_detail.get("serial-number"),
        "description": psu_detail.get("description"),
        "revision": psu_detail.get("revision"),
        "model": psu_detail.get("model"),
        "vendor": psu_detail.get("vendor"),
        "location": psu_detail.get("location"),
        "part-number": psu_detail.get("part-number"),
        "fru-shortname": psu_detail.get("fru-shortname"),
        "mfg-date": psu_detail.get("mfg-date"),
        "mfg-vendor-id": psu_detail.get("mfg-vendor-id"),
        "dc12v": psu_detail.get("dc12v"),
        # Fixed: was psu_detail.get("dc12v"), duplicating the 12 V value.
        "dc5v": psu_detail.get("dc5v"),
        "dc33v": psu_detail.get("dc33v"),
        "dc12i": psu_detail.get("dc12i"),
        "dc5i": psu_detail.get("dc5i"),
        "dctemp": psu_detail.get("dctemp"),
        "health": psu_detail.get("health"),
        "health-reason": psu_detail.get("health-reason"),
        "health-recommendation": psu_detail.get("health-recommendation"),
        "status": psu_detail.get("status"),
        "durable-id": psu_detail.get("durable-id"),
        "position": psu_detail.get("position"),
    }

    # Normalize empty strings to "N/A" so downstream consumers always
    # see a value (values mutated in place; keys unchanged).
    for key, value in specific_info.items():
        if value == "":
            specific_info[key] = "N/A"

    # Creates internal json message request structure.
    # this message will be passed to the StorageEnclHandler
    internal_json_msg = json.dumps(
        {"sensor_request_type": {
            "enclosure_alert": {
                "status": "update",
                "host_id": host_name,
                "alert_type": alert_type,
                "severity": severity,
                "alert_id": alert_id,
                "info": info,
                "specific_info": specific_info
            }
        }})
    return internal_json_msg
class NodeDataMsgHandler(ScheduledModuleThread, InternalMsgQ): """Message Handler for generic node requests and generating host update messages on a regular interval""" MODULE_NAME = "NodeDataMsgHandler" PRIORITY = 2 # Section and keys in configuration file NODEDATAMSGHANDLER = MODULE_NAME.upper() TRANSMIT_INTERVAL = 'transmit_interval' HIGH_CPU_USAGE_WAIT_THRESHOLD = 'high_cpu_usage_wait_threshold' HIGH_MEMORY_USAGE_WAIT_THRESHOLD = 'high_memory_usage_wait_threshold' UNITS = 'units' DISK_USAGE_THRESHOLD = 'disk_usage_threshold' DEFAULT_DISK_USAGE_THRESHOLD = 80 CPU_USAGE_THRESHOLD = 'cpu_usage_threshold' DEFAULT_CPU_USAGE_THRESHOLD = 80 HOST_MEMORY_USAGE_THRESHOLD = 'host_memory_usage_threshold' DEFAULT_HOST_MEMORY_USAGE_THRESHOLD = 80 SYSTEM_INFORMATION = "SYSTEM_INFORMATION" IPMI_RESOURCE_TYPE_PSU = "node:hw:psu" IPMI_RESOURCE_TYPE_FAN = "node:hw:fan" IPMI_RESOURCE_TYPE_DISK = "node:hw:disk" IPMI_RESOURCE_TYPE_TEMPERATURE = "node:sensor:temperature" IPMI_RESOURCE_TYPE_VOLTAGE = "node:sensor:voltage" IPMI_RESOURCE_TYPE_CURRENT = "node:sensor:current" NW_RESOURCE_TYPE = "node:interface:nw" NW_CABLE_RESOURCE_TYPE = "node:interface:nw:cable" high_usage = {'cpu': False, 'disk': False, 'memory': False} usage_time_map = {'cpu': -1, 'memory': -1, 'disk': -1} fault_resolved_iterations = {'cpu': 0, 'memory': 0, 'disk': 0} prev_nw_if_status = {} prev_cable_status = {} FAULT = "fault" FAULT_RESOLVED = "fault_resolved" INTERFACE_FAULT_DETECTED = False CACHE_DIR_NAME = "server" # Dependency list DEPENDENCIES = {"plugins": ["EgressProcessor"], "rpms": []} @staticmethod def name(): """ @return: name of the module.""" return NodeDataMsgHandler.MODULE_NAME def __init__(self): super(NodeDataMsgHandler, self).__init__(self.MODULE_NAME, self.PRIORITY) # Flag to indicate suspension of module self._suspended = False @staticmethod def dependencies(): """Returns a list of plugins and RPMs this module requires to function. 
""" return NodeDataMsgHandler.DEPENDENCIES def initialize(self, conf_reader, msgQlist, product): """initialize configuration reader and internal msg queues""" # Initialize ScheduledMonitorThread super(NodeDataMsgHandler, self).initialize(conf_reader) # Initialize internal message queues for this module super(NodeDataMsgHandler, self).initialize_msgQ(msgQlist) self._transmit_interval = int( Conf.get(SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.TRANSMIT_INTERVAL}", 60)) self._high_cpu_usage_wait_threshold = int( Conf.get( SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.HIGH_CPU_USAGE_WAIT_THRESHOLD}", 60)) self._high_memory_usage_wait_threshold = int( Conf.get( SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.HIGH_MEMORY_USAGE_WAIT_THRESHOLD}", 60)) self._units = Conf.get(SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.UNITS}", "MB") self._disk_usage_threshold = Conf.get( SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.DISK_USAGE_THRESHOLD}", self.DEFAULT_DISK_USAGE_THRESHOLD) self._cpu_usage_threshold = Conf.get( SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.CPU_USAGE_THRESHOLD}", self.DEFAULT_CPU_USAGE_THRESHOLD) self._host_memory_usage_threshold = Conf.get( SSPL_CONF, f"{self.NODEDATAMSGHANDLER}>{self.HOST_MEMORY_USAGE_THRESHOLD}", self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD) self.node_id = Conf.get(GLOBAL_CONF, NODE_ID_KEY, "SN01") self.bmcNwStatus = None self.severity_reader = SeverityReader() self._node_sensor = None self._login_actuator = None self.disk_sensor_data = None self.host_sensor_data = None self.if_sensor_data = None self.cpu_sensor_data = None self.raid_sensor_data = None self.sensor_type = None self._epoch_time = str(int(time.time())) self._raid_drives = [] self._raid_device = "N/A" self.os_sensor_type = { "disk_space": self.disk_sensor_data, "memory_usage": self.host_sensor_data, "nw": self.if_sensor_data, "cpu_usage": self.cpu_sensor_data, "raid_data": self.raid_sensor_data } # UUID used in json msgs self._uuid = None # Dict of drives by device name from systemd 
self._drive_by_device_name = {} # Dict of drive path by-ids by serial number from systemd self._drive_byid_by_serial_number = {} self._import_products(product) self.cache_dir_path = os.path.join(DATA_PATH, self.CACHE_DIR_NAME) self.persistent_data = {'cpu': {}, 'disk': {}, 'memory': {}, 'nw': {}} # Persistent Cache for High CPU usage self.init_from_persistent_cache('cpu', 'CPU_USAGE_DATA') # Persistent Cache for High Disk Usage self.init_from_persistent_cache('disk', 'DISK_USAGE_DATA') # Persistent Cache for High Memory Usage self.init_from_persistent_cache('memory', 'MEMORY_USAGE_DATA') # Persistent Cache for Nework sensor self.init_from_persistent_cache('nw', 'NW_SENSOR_DATA') def init_from_persistent_cache(self, resource, data_path): PER_DATA_PATH = os.path.join(self.cache_dir_path, f'{data_path}_{self.node_id}') if os.path.isfile(PER_DATA_PATH): self.persistent_data[resource] = store.get(PER_DATA_PATH) if self.persistent_data[resource]: if resource == 'nw': self.prev_nw_if_status = \ self.persistent_data[resource].get('prev_nw_if_status', {}) self.prev_cable_status = \ self.persistent_data[resource].get('prev_cable_status', {}) elif self.persistent_data[resource]\ [f'high_{resource}_usage'].lower() == "true": self.high_usage[resource] = True else: self.high_usage[resource] = False else: self.persist_state_data(resource, data_path) def persist_state_data(self, resource, data_path): PER_DATA_PATH = os.path.join(self.cache_dir_path, f'{data_path}_{self.node_id}') if resource == 'nw': self.persistent_data[resource] = { 'prev_nw_if_status': self.prev_nw_if_status, 'prev_cable_status': self.prev_cable_status } else: self.persistent_data[resource] = { f'high_{resource}_usage': str(self.high_usage[resource]), f'{resource}_usage_time_map': str(self.usage_time_map[resource]), f'{resource}_fault_resolved_iterations': str(self.fault_resolved_iterations[resource]) } store.put(self.persistent_data[resource], PER_DATA_PATH) def read_persistent_data(self, data_path): """Read 
resource data from persistent cache.""" persistent_data = {} PER_DATA_PATH = os.path.join(self.cache_dir_path, f'{data_path}_{self.node_id}') if os.path.isfile(PER_DATA_PATH): persistent_data = store.get(PER_DATA_PATH) return persistent_data def _import_products(self, product): """Import classes based on which product is being used""" if product.lower() in [x.lower() for x in enabled_products]: from zope.component import queryUtility self._queryUtility = queryUtility def run(self): """Run the module periodically on its own thread.""" self._log_debug("Start accepting requests") # Do not proceed if module is suspended if self._suspended == True: self._scheduler.enter(1, self._priority, self.run, ()) return # self._set_debug(True) # self._set_debug_persist(True) try: # Query the Zope GlobalSiteManager for an object implementing INodeData if self._node_sensor is None: from sensors.INode_data import INodeData self._node_sensor = self._queryUtility(INodeData)() self._log_debug("_node_sensor name: %s" % self._node_sensor.name()) # Delay for the desired interval if it's greater than zero if self._transmit_interval > 0: logger.debug("self._transmit_interval:{}".format( self._transmit_interval)) timer = self._transmit_interval while timer > 0: # See if the message queue contains an entry and process jsonMsg, _ = self._read_my_msgQ_noWait() if jsonMsg is not None: self._process_msg(jsonMsg) time.sleep(1) timer -= 1 # Generate the JSON messages with data from the node and transmit on regular interval self._generate_host_update() self._generate_cpu_data() self._generate_if_data() self._generate_disk_space_alert() # If the timer is zero then block for incoming requests notifying to transmit data else: # Block on message queue until it contains an entry jsonMsg, _ = self._read_my_msgQ() if jsonMsg is not None: self._process_msg(jsonMsg) # Keep processing until the message queue is empty while not self._is_my_msgQ_empty(): jsonMsg, _ = self._read_my_msgQ() if jsonMsg is not None: 
self._process_msg(jsonMsg) except Exception as ae: # Log it and restart the whole process when a failure occurs logger.exception("NodeDataMsgHandler restarting: %s" % ae) self._scheduler.enter(1, self._priority, self.run, ()) self._log_debug("Finished processing successfully") def _process_msg(self, jsonMsg): """Parses the incoming message and generate the desired data message""" self._log_debug("_process_msg, jsonMsg: %s" % jsonMsg) if isinstance(jsonMsg, dict) is False: jsonMsg = json.loads(jsonMsg) # Parse out the uuid so that it can be sent back in response message self._uuid = None if jsonMsg.get("sspl_ll_msg_header") is not None and \ jsonMsg.get("sspl_ll_msg_header").get("uuid") is not None: self._uuid = jsonMsg.get("sspl_ll_msg_header").get("uuid") self._log_debug("_processMsg, uuid: %s" % self._uuid) if jsonMsg.get("sensor_request_type") is not None and \ jsonMsg.get("sensor_request_type").get("node_data") is not None and \ jsonMsg.get("sensor_request_type").get("node_data").get("sensor_type") is not None: self.sensor_type = jsonMsg.get("sensor_request_type").get( "node_data").get("sensor_type").split(":")[2] self._log_debug("_processMsg, sensor_type: %s" % self.sensor_type) if self.sensor_type == "memory_usage": self._generate_host_update() sensor_message_type = self.os_sensor_type.get( self.sensor_type, "") if sensor_message_type: self._write_internal_msgQ(EgressProcessor.name(), sensor_message_type) else: self._log_debug(f"NodeDataMsgHandler, _process_msg, \ No past data found for {self.sensor_type} sensor type") elif self.sensor_type == "cpu_usage": self._generate_cpu_data() sensor_message_type = self.os_sensor_type.get( self.sensor_type, "") if sensor_message_type: self._write_internal_msgQ(EgressProcessor.name(), sensor_message_type) else: self._log_debug(f"NodeDataMsgHandler, _process_msg, \ No past data found for {self.sensor_type} sensor type") elif self.sensor_type == "nw": self._generate_if_data() sensor_message_type = self.os_sensor_type.get( 
self.sensor_type, "") if sensor_message_type: self._write_internal_msgQ(EgressProcessor.name(), sensor_message_type) else: self._log_debug(f"NodeDataMsgHandler, _process_msg, \ No past data found for {self.sensor_type} sensor type") elif self.sensor_type == "disk_space": self._generate_disk_space_alert() sensor_message_type = self.os_sensor_type.get( self.sensor_type, "") if sensor_message_type: self._write_internal_msgQ(EgressProcessor.name(), sensor_message_type) else: self._log_debug(f"NodeDataMsgHandler, _process_msg, \ No past data found for {self.sensor_type} sensor type") elif self.sensor_type == "raid_data": self._generate_raid_data(jsonMsg) sensor_message_type = self.os_sensor_type.get( self.sensor_type, "") if sensor_message_type: self._write_internal_msgQ(EgressProcessor.name(), sensor_message_type) else: self._log_debug( "NodeDataMsgHandler, _process_msg " + f"No past data found for {self.sensor_type} sensor type" ) elif self.sensor_type == "raid_integrity": self._generate_raid_integrity_data(jsonMsg) sensor_message_type = self.os_sensor_type.get( self.sensor_type, "") if sensor_message_type: self._write_internal_msgQ(EgressProcessor.name(), sensor_message_type) else: self._log_debug( "NodeDataMsgHandler, _process_msg " + f"No past data found for {self.sensor_type} sensor type" ) # Update mapping of device names to serial numbers for global use elif jsonMsg.get("sensor_response_type") is not None: if jsonMsg.get( "sensor_response_type") == "devicename_serialnumber": self._update_devicename_sn_dict(jsonMsg) elif jsonMsg.get("sensor_request_type") is not None and \ jsonMsg.get("sensor_request_type").get("node_data") is not None and \ jsonMsg.get("sensor_request_type").get("node_data").get("info") is not None and \ jsonMsg.get("sensor_request_type").get("node_data").get("info").get("resource_type") is not None: self._generate_node_fru_data(jsonMsg) # ... 
handle other node sensor message types def _update_devicename_sn_dict(self, jsonMsg): """Update the dict of device names to serial numbers""" drive_byid = jsonMsg.get("drive_byid") device_name = jsonMsg.get("device_name") serial_number = jsonMsg.get("serial_number") self._drive_by_device_name[device_name] = serial_number self._drive_byid_by_serial_number[serial_number] = drive_byid self._log_debug( "NodeDataMsgHandler, device_name: %s, serial_number: %s, drive_byid: %s" % (device_name, serial_number, drive_byid)) def _generate_host_update(self): """Create & transmit a host update message as defined by the sensor response json schema""" current_time = Utility.get_current_time() # Notify the node sensor to update its data required for the host_update message successful = self._node_sensor.read_data("host_update", self._get_debug(), self._units) if not successful: logger.error( "NodeDataMsgHandler, _generate_host_update was NOT successful." ) self._host_memory_usage_threshold = str( self._host_memory_usage_threshold) try: if self._host_memory_usage_threshold.isdigit(): self._host_memory_usage_threshold = int( self._host_memory_usage_threshold) else: self._host_memory_usage_threshold = float( self._host_memory_usage_threshold) except ValueError: logger.warn( "Host Memory Alert, Invalid host_memory_usage_threshold value are entered in config." 
) # Assigning default value to _memory_usage_threshold self._host_memory_usage_threshold = self.DEFAULT_HOST_MEMORY_USAGE_THRESHOLD memory_persistent_data = self.read_persistent_data('MEMORY_USAGE_DATA') if memory_persistent_data.get('memory_usage_time_map') is not None: previous_check_time = int( memory_persistent_data['memory_usage_time_map']) else: previous_check_time = int(-1) if memory_persistent_data\ .get('memory_fault_resolved_iterations') is not None: fault_resolved_iters = int( memory_persistent_data['memory_fault_resolved_iterations']) else: fault_resolved_iters = 0 try: iteration_limit = int(self._high_memory_usage_wait_threshold / self._transmit_interval) except ZeroDivisionError: iteration_limit = 0 self.usage_time_map['memory'] = current_time if self._node_sensor.total_memory["percent"] >= self._host_memory_usage_threshold \ and not self.high_usage['memory']: if previous_check_time == -1: previous_check_time = current_time self.persist_state_data('memory', 'MEMORY_USAGE_DATA') if self.usage_time_map[ 'memory'] - previous_check_time >= self._high_memory_usage_wait_threshold: self.high_usage['memory'] = True self.fault_resolved_iterations['memory'] = 0 # Create the memory data message and hand it over # to the egress processor to transmit fault_event = "Host memory usage has increased to {}%,"\ "beyond the configured threshold of {}% "\ "for more than {} seconds.".format( self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold, self._high_memory_usage_wait_threshold ) logger.warn(fault_event) logged_in_users = [] # Create the host update message and hand it over to the egress processor to transmit hostUpdateMsg = HostUpdateMsg( self._node_sensor.host_id, self._epoch_time, self._node_sensor.boot_time, self._node_sensor.up_time, self._node_sensor.uname, self._units, self._node_sensor.total_memory, self._node_sensor.logged_in_users, self._node_sensor.process_count, self._node_sensor.running_process_count, self.FAULT, fault_event) # 
Add in uuid if it was present in the json request if self._uuid is not None: hostUpdateMsg.set_uuid(self._uuid) jsonMsg = hostUpdateMsg.getJson() # Transmit it to message processor self.host_sensor_data = jsonMsg self.os_sensor_type["memory_usage"] = self.host_sensor_data self._write_internal_msgQ(EgressProcessor.name(), jsonMsg) self.persist_state_data('memory', 'MEMORY_USAGE_DATA') if self._node_sensor.total_memory[ "percent"] < self._host_memory_usage_threshold: if not self.high_usage['memory']: self.persist_state_data('memory', 'MEMORY_USAGE_DATA') else: if fault_resolved_iters < iteration_limit: fault_resolved_iters += 1 self.fault_resolved_iterations[ 'memory'] = fault_resolved_iters self.persist_state_data('memory', 'MEMORY_USAGE_DATA') elif fault_resolved_iters >= iteration_limit: # Create the memory data message and hand it over # to the egress processor to transmit fault_resolved_event = "Host memory usage has decreased to {}%, "\ "lower than the configured threshold of {}%.".format( self._node_sensor.total_memory["percent"], self._host_memory_usage_threshold ) logger.info(fault_resolved_event) logged_in_users = [] # Create the host update message and hand it over to the egress processor to transmit hostUpdateMsg = HostUpdateMsg( self._node_sensor.host_id, self._epoch_time, self._node_sensor.boot_time, self._node_sensor.up_time, self._node_sensor.uname, self._units, self._node_sensor.total_memory, self._node_sensor.logged_in_users, self._node_sensor.process_count, self._node_sensor.running_process_count, self.FAULT_RESOLVED, fault_resolved_event) # Add in uuid if it was present in the json request if self._uuid is not None: hostUpdateMsg.set_uuid(self._uuid) jsonMsg = hostUpdateMsg.getJson() # Transmit it to message processor self.host_sensor_data = jsonMsg self.os_sensor_type["memory_usage"] = self.host_sensor_data self._write_internal_msgQ(EgressProcessor.name(), jsonMsg) self.high_usage['memory'] = False self.usage_time_map['memory'] = int(-1) 
self.fault_resolved_iterations['memory'] = 0 self.persist_state_data('memory', 'MEMORY_USAGE_DATA') def _generate_local_mount_data(self): """Create & transmit a local_mount_data message as defined by the sensor response json schema""" # Notify the node sensor to update its data required for the local_mount_data message successful = self._node_sensor.read_data("local_mount_data", self._get_debug(), self._units) if not successful: logger.error( "NodeDataMsgHandler, _generate_local_mount_data was NOT successful." ) # Create the local mount data message and hand it over to the egress processor to transmit localMountDataMsg = LocalMountDataMsg( self._node_sensor.host_id, self._epoch_time, self._node_sensor.free_space, self._node_sensor.free_inodes, self._node_sensor.free_swap, self._node_sensor.total_space, self._node_sensor.total_swap, self._units) # Add in uuid if it was present in the json request if self._uuid is not None: localMountDataMsg.set_uuid(self._uuid) jsonMsg = localMountDataMsg.getJson() # Transmit it to message processor self._write_internal_msgQ(EgressProcessor.name(), jsonMsg) def _generate_cpu_data(self): """Create & transmit a cpu_data message as defined by the sensor response json schema""" current_time = Utility.get_current_time() # Notify the node sensor to update its data required for the cpu_data message successful = self._node_sensor.read_data("cpu_data", self._get_debug()) if not successful: logger.error( "NodeDataMsgHandler, _generate_cpu_data was NOT successful.") self._cpu_usage_threshold = str(self._cpu_usage_threshold) try: if self._cpu_usage_threshold.isdigit(): self._cpu_usage_threshold = int(self._cpu_usage_threshold) else: self._cpu_usage_threshold = float(self._cpu_usage_threshold) except ValueError: logger.warn( "CPU Usage Alert, Invalid host_memory_usage_threshold value are entered in config." 
) # Assigning default value to _cpu_usage_threshold self._cpu_usage_threshold = self.DEFAULT_CPU_USAGE_THRESHOLD cpu_persistent_data = self.read_persistent_data('CPU_USAGE_DATA') if cpu_persistent_data.get('cpu_usage_time_map') is not None: previous_check_time = int( cpu_persistent_data['cpu_usage_time_map']) else: previous_check_time = int(-1) if cpu_persistent_data.get( 'cpu_fault_resolved_iterations') is not None: fault_resolved_iters = int( cpu_persistent_data['cpu_fault_resolved_iterations']) else: fault_resolved_iters = 0 try: iteration_limit = int(self._high_cpu_usage_wait_threshold / self._transmit_interval) except ZeroDivisionError: iteration_limit = 0 self.usage_time_map['cpu'] = current_time if self._node_sensor.cpu_usage >= self._cpu_usage_threshold \ and not self.high_usage['cpu']: if previous_check_time == -1: previous_check_time = current_time self.persist_state_data('cpu', 'CPU_USAGE_DATA') if self.usage_time_map[ 'cpu'] - previous_check_time >= self._high_cpu_usage_wait_threshold: self.high_usage['cpu'] = True self.fault_resolved_iterations['cpu'] = 0 # Create the cpu usage data message and hand it over # to the egress processor to transmit fault_event = "CPU usage has increased to {}%, "\ "beyond the configured threshold of {}% "\ "for more than {} seconds.".format( self._node_sensor.cpu_usage, self._cpu_usage_threshold, self._high_cpu_usage_wait_threshold ) logger.warn(fault_event) # Create the cpu usage update message and hand it over to the egress processor to transmit cpuDataMsg = CPUdataMsg( self._node_sensor.host_id, self._epoch_time, self._node_sensor.csps, self._node_sensor.idle_time, self._node_sensor.interrupt_time, self._node_sensor.iowait_time, self._node_sensor.nice_time, self._node_sensor.softirq_time, self._node_sensor.steal_time, self._node_sensor.system_time, self._node_sensor.user_time, self._node_sensor.cpu_core_data, self._node_sensor.cpu_usage, self.FAULT, fault_event) # Add in uuid if it was present in the json request if 
self._uuid is not None: cpuDataMsg.set_uuid(self._uuid) jsonMsg = cpuDataMsg.getJson() self.cpu_sensor_data = jsonMsg self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data # Transmit it to message processor self._write_internal_msgQ(EgressProcessor.name(), jsonMsg) # Store the state to Persistent Cache. self.persist_state_data('cpu', 'CPU_USAGE_DATA') if self._node_sensor.cpu_usage < self._cpu_usage_threshold: if not self.high_usage['cpu']: self.persist_state_data('cpu', 'CPU_USAGE_DATA') else: if fault_resolved_iters < iteration_limit: fault_resolved_iters += 1 self.fault_resolved_iterations[ 'cpu'] = fault_resolved_iters self.persist_state_data('cpu', 'CPU_USAGE_DATA') elif fault_resolved_iters >= iteration_limit: # Create the cpu usage data message and hand it over # to the egress processor to transmit fault_resolved_event = "CPU usage has decreased to {}%, "\ "lower than the configured threshold of {}%.".format( self._node_sensor.cpu_usage, self._cpu_usage_threshold ) logger.info(fault_resolved_event) # Create the cpu usage update message and hand it over to the egress processor to transmit cpuDataMsg = CPUdataMsg( self._node_sensor.host_id, self._epoch_time, self._node_sensor.csps, self._node_sensor.idle_time, self._node_sensor.interrupt_time, self._node_sensor.iowait_time, self._node_sensor.nice_time, self._node_sensor.softirq_time, self._node_sensor.steal_time, self._node_sensor.system_time, self._node_sensor.user_time, self._node_sensor.cpu_core_data, self._node_sensor.cpu_usage, self.FAULT_RESOLVED, fault_resolved_event) # Add in uuid if it was present in the json request if self._uuid is not None: cpuDataMsg.set_uuid(self._uuid) jsonMsg = cpuDataMsg.getJson() self.cpu_sensor_data = jsonMsg self.os_sensor_type["cpu_usage"] = self.cpu_sensor_data # Transmit it to message processor self._write_internal_msgQ(EgressProcessor.name(), jsonMsg) self.high_usage['cpu'] = False self.usage_time_map['cpu'] = int(-1) self.fault_resolved_iterations['cpu'] = 0 # Store 
the state to Persistent Cache. self.persist_state_data('cpu', 'CPU_USAGE_DATA') def _send_ifdata_json_msg(self, resource_id, resource_type, state, event=""): """A resuable method for transmitting IFDataMsg to RMQ and IEM logging""" ifDataMsg = IFdataMsg(self._node_sensor.host_id, self._node_sensor.local_time, self._node_sensor.if_data, resource_id, resource_type, state, self.severity_reader.map_severity(state), event) # Add in uuid if it was present in the json request if self._uuid is not None: ifDataMsg.set_uuid(self._uuid) jsonMsg = ifDataMsg.getJson() self.if_sensor_data = jsonMsg self.os_sensor_type["nw"] = self.if_sensor_data # Transmit it to message processor self._write_internal_msgQ(EgressProcessor.name(), jsonMsg) self.persist_state_data('nw', 'NW_SENSOR_DATA') def _generate_if_data(self): """Create & transmit a network interface data message as defined by the sensor response json schema""" # Notify the node sensor to update its data required for the if_data message successful = self._node_sensor.read_data("if_data", self._get_debug()) if not successful: logger.error( "NodeDataMsgHandler, _generate_if_data was NOT successful.") interfaces = self._node_sensor.if_data cable_alert_desc = "Newtork Cable for interface {} is {}." nw_alert_desc = "Network interface {} is {}." 
for interface in interfaces: if_name = interface["ifId"] nw_if_status = interface["nwStatus"].upper() # if nw_fault if nw_if_status in ["DOWN", "UNKNOWN"]: nw_cable_status = interface["nwCableConnStatus"].upper() # if nw_cable_disconneted if nw_cable_status == "DISCONNECTED": if self.prev_cable_status.get(if_name) != nw_cable_status: # raise nw_cable_fault_alert self.prev_cable_status[if_name] = nw_cable_status self._send_ifdata_json_msg( if_name, self.NW_CABLE_RESOURCE_TYPE, self.FAULT, cable_alert_desc.format(if_name, nw_cable_status)) # elif nw_cable_connected elif nw_cable_status == "CONNECTED": if self.prev_cable_status.get(if_name) == "DISCONNECTED": # raise nw_cable_fault_resolved_alert self.prev_cable_status[if_name] = nw_cable_status self._send_ifdata_json_msg( if_name, self.NW_CABLE_RESOURCE_TYPE, self.FAULT_RESOLVED, cable_alert_desc.format(if_name, nw_cable_status)) if self.prev_nw_if_status.get(if_name) != "DOWN": # raise nw_fault_alert self.prev_nw_if_status[if_name] = "DOWN" self._send_ifdata_json_msg( if_name, self.NW_RESOURCE_TYPE, self.FAULT, nw_alert_desc.format(if_name, "DOWN")) # nw_cable_state is unknown and nw_is_faulty else: if self.prev_nw_if_status.get(if_name) != "DOWN": # raise nw_fault_alert self.prev_nw_if_status[if_name] = "DOWN" self._send_ifdata_json_msg( if_name, self.NW_RESOURCE_TYPE, self.FAULT, nw_alert_desc.format(if_name, "DOWN")) else: # no nw_fault if self.prev_nw_if_status.get(if_name) == "DOWN": # raise nw_fault_resolved_alert self.prev_nw_if_status[if_name] = "UP" self._send_ifdata_json_msg( if_name, self.NW_RESOURCE_TYPE, self.FAULT_RESOLVED, nw_alert_desc.format(if_name, "UP")) if self.prev_cable_status.get(if_name) == "DISCONNECTED": # raise nw_cable_fault_resolved_alert self.prev_cable_status[if_name] = "CONNECTED" self._send_ifdata_json_msg( if_name, self.NW_CABLE_RESOURCE_TYPE, self.FAULT_RESOLVED, cable_alert_desc.format(if_name, "CONNECTED")) def _generate_disk_space_alert(self): """Create & transmit a 
        disk_space_alert message as defined by the sensor response json schema"""

        # Notify the node sensor to update its data required for the disk_space_data message
        successful = self._node_sensor.read_data("disk_space_alert",
                                                 self._get_debug(), self._units)
        if not successful:
            logger.error(
                "NodeDataMsgHandler, _generate_disk_space_alert was NOT successful."
            )
            return

        # Changing disk_usage_threshold type according to what value type entered
        # in config file: normalize to int for "80", float for "80.5", and fall
        # back to the built-in default on parse failure.
        self._disk_usage_threshold = str(self._disk_usage_threshold)
        try:
            if self._disk_usage_threshold.isdigit():
                self._disk_usage_threshold = int(self._disk_usage_threshold)
            else:
                self._disk_usage_threshold = float(self._disk_usage_threshold)
        except ValueError:
            logger.warn(
                "Disk Space Alert, Invalid disk_usage_threshold value are entered in config."
            )
            # Assigning default value to _disk_usage_threshold
            self._disk_usage_threshold = self.DEFAULT_DISK_USAGE_THRESHOLD

        # Fault: usage crossed the threshold and no disk fault is currently
        # outstanding (high_usage flag gates re-raising the same alert).
        if self._node_sensor.disk_used_percentage >= self._disk_usage_threshold \
                and not self.high_usage['disk']:
            self.high_usage['disk'] = True
            # Create the disk space data message and hand it over
            # to the egress processor to transmit
            fault_event = "Disk usage has increased to {}%, "\
                "beyond the configured threshold of {}%.".format(
                    self._node_sensor.disk_used_percentage,
                    self._disk_usage_threshold
                )
            logger.warn(fault_event)
            diskSpaceAlertMsg = DiskSpaceAlertMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.total_space, self._node_sensor.free_space,
                self._node_sensor.disk_used_percentage, self._units,
                self.FAULT, fault_event)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                diskSpaceAlertMsg.set_uuid(self._uuid)
            jsonMsg = diskSpaceAlertMsg.getJson()
            self.disk_sensor_data = jsonMsg
            self.os_sensor_type["disk_space"] = self.disk_sensor_data

            # Transmit it to message processor
            self._write_internal_msgQ(EgressProcessor.name(), jsonMsg)

            # Save the new state in Persistent Cache.
            self.persist_state_data('disk', 'DISK_USAGE_DATA')

        # Fault resolved: usage back at/below the threshold while a fault is
        # outstanding.
        # NOTE(review): at exactly the threshold value both >= (fault) and <=
        # (resolved) match on successive runs, which can flap alerts — confirm
        # whether strict '<' was intended here.
        if self._node_sensor.disk_used_percentage <= self._disk_usage_threshold \
                and self.high_usage['disk']:
            # Create the disk space data message and hand it over
            # to the egress processor to transmit
            fault_resolved_event = "Disk usage has decreased to {}%, "\
                "lower than the configured threshold of {}%.".format(
                    self._node_sensor.disk_used_percentage,
                    self._disk_usage_threshold
                )
            logger.info(fault_resolved_event)
            diskSpaceAlertMsg = DiskSpaceAlertMsg(
                self._node_sensor.host_id, self._epoch_time,
                self._node_sensor.total_space, self._node_sensor.free_space,
                self._node_sensor.disk_used_percentage, self._units,
                self.FAULT_RESOLVED, fault_resolved_event)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                diskSpaceAlertMsg.set_uuid(self._uuid)
            jsonMsg = diskSpaceAlertMsg.getJson()
            self.disk_sensor_data = jsonMsg
            self.os_sensor_type["disk_space"] = self.disk_sensor_data

            # Transmit it to message processor
            self._write_internal_msgQ(EgressProcessor.name(), jsonMsg)
            self.high_usage['disk'] = False

            # Save the new state in Persistent Cache.
            self.persist_state_data('disk', 'DISK_USAGE_DATA')

    def _generate_raid_data(self, jsonMsg):
        """Create & transmit a RAID status data message as defined
        by the sensor response json schema"""

        successful = self._node_sensor.read_data("raid", self._get_debug(),
                                                 self._units)
        if not successful:
            logger.error(
                "NodeDataMsgHandler, updating RAID information was NOT successful."
            )
            return

        # See if status is in the msg; ie it's an internal msg from the RAID sensor
        if jsonMsg.get("sensor_request_type").get("node_data").get(
                "status") is not None:
            sensor_request = jsonMsg.get("sensor_request_type").get(
                "node_data")
            host_name = sensor_request.get("host_id")
            alert_type = sensor_request.get("alert_type")
            alert_id = sensor_request.get("alert_id")
            severity = sensor_request.get("severity")
            info = sensor_request.get("info")
            specific_info = sensor_request.get("specific_info")
            self._raid_device = jsonMsg.get("sensor_request_type").get(
                "node_data").get("specific_info").get("device")
            self._raid_drives = list(
                jsonMsg.get("sensor_request_type").get("node_data").get(
                    "specific_info").get("drives"))
            raidDataMsg = RAIDdataMsg(host_name, alert_type, alert_id,
                                      severity, info, specific_info)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                raidDataMsg.set_uuid(self._uuid)
            jsonMsg = raidDataMsg.getJson()
            self.raid_sensor_data = jsonMsg
            self.os_sensor_type["raid_data"] = self.raid_sensor_data

            # Loop thru each index of drives containing only paths and fill in with s/n
            for drive in self._raid_drives:
                self._log_debug("drive: %s" % str(drive))
                if drive.get("identity") is not None:
                    path = drive.get("identity").get("path")
                    self._log_debug("path: %s" % str(path))

                    # Lookup the serial number from the path.
                    # The lookup result is stringified, so a missing entry
                    # (None) compares as the literal string "None" below.
                    serial_number = str(self._drive_by_device_name.get(path))
                    self._log_debug("serial_number: %s" % str(serial_number))
                    if serial_number != "None":
                        drive["identity"]["serialNumber"] = serial_number

                    # Change device path to path-byid
                    drive_byid = str(
                        self._drive_byid_by_serial_number.get(serial_number))
                    if drive_byid != "None":
                        drive["identity"]["path"] = drive_byid
            self._log_debug(
                "_generate_raid_data, host_id: %s, device: %s, drives: %s" %
                (self._node_sensor.host_id, self._raid_device,
                 str(self._raid_drives)))

    def _generate_raid_integrity_data(self, jsonMsg):
        """Create & transmit a Validate RAID result data message as defined by the sensor
        response json schema"""

        logger.debug("NodeDataMsgHandler, Validating RAID information")

        # See if status is in the msg; ie it's an internal msg from the RAID sensor
        if jsonMsg.get("sensor_request_type").get("node_data").get(
                "status") is not None:
            sensor_request = jsonMsg.get("sensor_request_type").get(
                "node_data")
            host_name = sensor_request.get("host_id")
            alert_type = sensor_request.get("alert_type")
            alert_id = sensor_request.get("alert_id")
            severity = sensor_request.get("severity")
            info = sensor_request.get("info")
            specific_info = sensor_request.get("specific_info")
            # Remember the integrity error text reported by the RAID sensor.
            self._alert = jsonMsg.get("sensor_request_type").get(
                "node_data").get("specific_info").get("error")
            RAIDintegrityMsg = RAIDIntegrityMsg(host_name, alert_type,
                                                alert_id, severity, info,
                                                specific_info)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                RAIDintegrityMsg.set_uuid(self._uuid)
            jsonMsg = RAIDintegrityMsg.getJson()
            self.raid_integrity_data = jsonMsg
            self.os_sensor_type["raid_integrity"] = self.raid_integrity_data

            self._log_debug("_generate_raid_integrity_data, host_id: %s" %
                            (self._node_sensor.host_id))

    def _generate_node_fru_data(self, jsonMsg):
        """Create & transmit a FRU IPMI data message as defined by the sensor
        response json schema"""

        # Lazily populate host information if it has not been read yet.
        if self._node_sensor.host_id is None:
            successful = self._node_sensor.read_data("None", self._get_debug(),
                                                     self._units)
            if not successful:
                logger.error(
                    "NodeDataMsgHandler, updating host information was NOT successful."
                )

        if jsonMsg.get("sensor_request_type").get("node_data") is not None:
            self._fru_info = jsonMsg.get("sensor_request_type").get(
                "node_data")
            node_ipmi_data_msg = NodeIPMIDataMsg(self._fru_info)

            # Add in uuid if it was present in the json request
            if self._uuid is not None:
                node_ipmi_data_msg.set_uuid(self._uuid)
            jsonMsg = node_ipmi_data_msg.getJson()
            self._write_internal_msgQ(EgressProcessor.name(), jsonMsg)

    def suspend(self):
        """Suspends the module thread. It should be non-blocking"""
        super(NodeDataMsgHandler, self).suspend()
        self._suspended = True

    def resume(self):
        """Resumes the module thread. It should be non-blocking"""
        super(NodeDataMsgHandler, self).resume()
        self._suspended = False

    def shutdown(self):
        """Clean up scheduler queue and gracefully shutdown thread"""
        super(NodeDataMsgHandler, self).shutdown()
def raise_alert(self, service, prev_state, state, prev_substate, substate, prev_pid, pid, alert_info_index): """Send the alert to ServiceMsgHandler.""" # Each alert info contains 4 fields # 1.Description | 2.Alert Type | 3.Impact | 4.Recommendation alert_info = [ [ f"{service} in {state} state.", #index 0 "fault", f"{service} service is unavailable.", "Try to restart the service" ], [ f"{service} in a {state} state for more than {self.max_wait_time} seconds.", "fault", #index 1 f"{service} service is unavailable.", "Try to restart the service" ], [ f"{service} in {state} state.", "fault_resolved", #index 2 f"{service} service is available now.", "" ], ] description = alert_info[alert_info_index][0] alert_type = alert_info[alert_info_index][1] impact = alert_info[alert_info_index][2] recommendation = alert_info[alert_info_index][3] severity = SeverityReader().map_severity(alert_type) epoch_time = str(self.current_time()) alert_id = get_alert_id(epoch_time) host_name = socket.getfqdn() self._site_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{SITE_ID}", 'DC01') self._rack_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{RACK_ID}", 'RC01') self._node_id = Conf.get(GLOBAL_CONF, f"{CLUSTER}>{SRVNODE}>{NODE_ID}", 'SN01') self._cluster_id = Conf.get(GLOBAL_CONF, f'{CLUSTER}>{CLUSTER_ID}', 'CC01') info = { "site_id": self._site_id, "cluster_id": self._cluster_id, "rack_id": self._rack_id, "node_id": self._node_id, "resource_type": self.RESOURCE_TYPE, "resource_id": service, "event_time": epoch_time, "description": description, "impact": impact, "recommendation": recommendation, } alert_msg = { "sensor_request_type": { "service_status_alert": { "host_id": host_name, "severity": severity, "alert_id": alert_id, "alert_type": alert_type, "info": info, "specific_info": { "service_name": service, "previous_state": prev_state, "state": state, "previous_substate": prev_substate, "substate": substate, "previous_pid": prev_pid, "pid": pid, } } } } 
self._write_internal_msgQ(ServiceMsgHandler.name(), alert_msg)