def __init__(self):
    """Prepare the server health provider.

    Creates the service logger, validates the configured node type,
    initializes sysfs access and registers the callables that supply
    health data for each hardware and software resource.
    """
    super().__init__()
    self.log = CustomLog(const.HEALTH_SVC_NAME)
    node_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
    Platform.validate_server_type_support(
        self.log, ResourceMapError, node_type)

    # sysfs is the source of the CPU present/online information.
    self.sysfs = ToolFactory().get_instance('sysfs')
    self.sysfs.initialize()
    self.sysfs_base_path = self.sysfs.get_sysfs_base_path()
    self.cpu_path = self.sysfs_base_path + const.CPU_PATH

    # Resource name -> callable returning that resource's health data,
    # grouped by resource type.
    self.server_resources = {
        "hw": {
            'cpu': self.get_cpu_info,
            'platform_sensor': self.get_platform_sensors_info,
            'memory': self.get_mem_info,
            'fan': self.get_fans_info,
            'nw_port': self.get_nw_ports_info,
            'sas_hba': self.get_sas_hba_info,
            'sas_port': self.get_sas_ports_info,
            'disk': self.get_disks_info,
            'psu': self.get_psu_info,
        },
        "sw": {
            'cortx_sw_services': self.get_cortx_service_info,
            'external_sw_services': self.get_external_service_info,
            'raid': self.get_raid_info,
        },
    }

    self._ipmi = IpmiFactory().get_implementor("ipmitool")
    self.platform_sensor_list = ['Temperature', 'Voltage', 'Current']
    self.service = Service()
    indexing_map = ServerResourceMap.resource_indexing_map
    self.resource_indexing_map = indexing_map["health"]
def __init__(self):
    """Prepare the server manifest provider.

    Creates the service logger, validates the configured node type and
    builds the lookup tables used to translate lshw output into manifest
    keys, plus the resource-name to provider-method maps.
    """
    super().__init__()
    self.log = CustomLog(MANIFEST_SVC_NAME)
    node_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
    Platform.validate_server_type_support(
        self.log, ResourceMapError, node_type)

    # lshw field name -> manifest field name.
    self.field_mapping = {
        'id': 'uid',
        'class': 'type',
        'description': 'description',
        'product': 'product',
        'serial': 'serial_number',
        'vendor': 'manufacturer',
        'part_number': 'part_number',
        'model_number': 'model_number',
        'physid': 'physical_id',
        'version': 'version',
        'logicalname': 'logical_name',
    }
    # lshw class -> manifest key template (resource index, field name).
    self.class_mapping = {
        'memory': 'hw>memory[%s]>%s',
        'disk': 'hw>disk[%s]>%s',
        'storage': 'hw>storage[%s]>%s',
        'system': 'hw>system[%s]>%s',
        'processor': 'hw>cpu[%s]>%s',
        'network': 'hw>nw_port[%s]>%s',
        'power': 'hw>psu[%s]>%s',
        'volume': 'hw>volume[%s]>%s',
        'bus': 'hw>bus[%s]>%s',
        'bridge': 'hw>bridge[%s]>%s',
        'display': 'hw>display[%s]>%s',
        'input': 'hw>input[%s]>%s',
        'generic': 'hw>generic[%s]>%s',
    }
    self.kv_dict = {}

    # The resource type is the part of each template between the leading
    # 'hw>' and the trailing '[%s]>%s'; every hw resource type is served
    # by the same provider method.
    prefix_len = len('hw>')
    suffix_len = len('[%s]>%s')
    hw_resources = {}
    for template in self.class_mapping.values():
        resource_type = template[prefix_len:-suffix_len]
        hw_resources[resource_type] = self.get_hw_resources_info

    self.server_resources = {
        "hw": hw_resources,
        "sw": {
            'os': self.get_os_server_info,
            'cortx_sw_services': self.get_cortx_service_info,
            'external_sw_services': self.get_external_service_info,
        },
        "fw": {'bmc': self.get_bmc_version_info},
    }
    self.service = Service()
    self.platform = Platform()
    indexing_map = ServerResourceMap.resource_indexing_map
    self.resource_indexing_map = indexing_map["manifest"]
def get_external_service_info(self):
    """Return the info of every external service that could be queried.

    Services for which `get_systemd_service_info` yields None are left
    out of the returned list.
    """
    responses = (
        self.get_systemd_service_info(svc_name)
        for svc_name in Service().get_external_service_list()
    )
    return [resp for resp in responses if resp is not None]
class ServerManifest:
    """
    Provide the manifest (inventory) view of a server node.

    Hardware entries are derived from `lshw -json` output, software
    entries from systemd service data and firmware entries from the BMC
    details. The provider callables are registered per resource type in
    `self.server_resources`.
    """

    name = "server_manifest"

    # `version` values that mark a disk as a storage enclosure (RBOD/JBOD)
    # drive; such disks are filtered out of the server local disk list.
    _ENCLOSURE_DISK_VERSIONS = ("G265", "G280")

    def __init__(self):
        """Initialize server manifest."""
        super().__init__()
        self.log = CustomLog(MANIFEST_SVC_NAME)
        server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY)
        Platform.validate_server_type_support(self.log, ResourceMapError,
                                              server_type)
        # lshw field name -> manifest field name.
        self.field_mapping = {
            'id': 'uid',
            'class': 'type',
            'description': 'description',
            'product': 'product',
            'serial': 'serial_number',
            'vendor': 'manufacturer',
            'part_number': 'part_number',
            'model_number': 'model_number',
            'physid': 'physical_id',
            'version': 'version',
            'logicalname': 'logical_name'
        }
        # lshw class -> manifest key template (resource index, field name).
        self.class_mapping = {
            'memory': 'hw>memory[%s]>%s',
            'disk': 'hw>disk[%s]>%s',
            'storage': 'hw>storage[%s]>%s',
            'system': 'hw>system[%s]>%s',
            'processor': 'hw>cpu[%s]>%s',
            'network': 'hw>nw_port[%s]>%s',
            'power': 'hw>psu[%s]>%s',
            'volume': 'hw>volume[%s]>%s',
            'bus': 'hw>bus[%s]>%s',
            'bridge': 'hw>bridge[%s]>%s',
            'display': 'hw>display[%s]>%s',
            'input': 'hw>input[%s]>%s',
            'generic': 'hw>generic[%s]>%s'
        }
        self.kv_dict = {}
        sw_resources = {
            'os': self.get_os_server_info,
            'cortx_sw_services': self.get_cortx_service_info,
            'external_sw_services': self.get_external_service_info
        }
        fw_resources = {'bmc': self.get_bmc_version_info}
        # Extracting resource type for 'self.class_mapping' dictionary values
        # and adding to hw_resources for function mapping.
        hw_resources = {
            value[len('hw>'):-len('[%s]>%s')]: self.get_hw_resources_info
            for value in self.class_mapping.values()
        }
        self.server_resources = {
            "hw": hw_resources,
            "sw": sw_resources,
            "fw": fw_resources
        }
        self.service = Service()
        self.platform = Platform()
        self.resource_indexing_map = \
            ServerResourceMap.resource_indexing_map["manifest"]

    def get_data(self, rpath):
        """Fetch manifest information for given rpath.

        Args:
            rpath: resource path such as 'node>server[0]>hw>disk'.

        Returns:
            The manifest data for the addressed node, normalized through
            `MonUtils.normalize_kv`.

        Raises:
            ResourceMapError: when no provider produced data for rpath.
        """
        logger.info(self.log.svc_log(f"Get Manifest data for rpath:{rpath}"))
        info = {}
        resource_found = False
        nodes = rpath.strip().split(">")
        leaf_node, _ = ServerResourceMap.get_node_info(nodes[-1])

        # Fetch manifest information for all sub nodes
        if leaf_node == "server":
            # Example rpath: 'node>server[0]'
            server_hw_data = self.get_server_hw_info()
            info = self.get_server_info(server_hw_data)
            resource_found = True
        elif leaf_node == "hw":
            # Example rpath: 'node>server[0]>hw'
            server_hw_data = self.get_server_hw_info()
            info = self.get_hw_resources_info(server_hw_data, "hw")["hw"]
            resource_found = True
        elif leaf_node in ["sw", "fw"]:
            # Example rpath: 'node>server[0]>fw' or sw
            for resource, method in self.server_resources[leaf_node].items():
                try:
                    info.update({resource: method()})
                    resource_found = True
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}: {err}"))
                    # Keep what the other providers returned; a failed
                    # resource is reported empty, as in get_server_info().
                    info[resource] = []
        else:
            # Example rpath: 'node>server[0]>hw>disk'
            # lshw data is collected lazily: only hw resources need it.
            server_hw_data = None
            for node in nodes:
                resource, _ = ServerResourceMap.get_node_info(node)
                for res_type in self.server_resources:
                    method = self.server_resources[res_type].get(resource)
                    if not method:
                        logger.error(
                            self.log.svc_log(
                                f"No mapping function found for {res_type}"))
                        continue
                    if res_type == "hw" and server_hw_data is None:
                        server_hw_data = self.get_server_hw_info()
                    try:
                        if res_type == "hw":
                            info = method(server_hw_data, resource)
                        else:
                            info = method()
                        resource_found = True
                    except Exception as err:
                        logger.error(
                            self.log.svc_log(
                                f"{err.__class__.__name__}: {err}"))
                        info = None
                    if resource_found:
                        break

        if not resource_found:
            msg = f"Invalid rpath or manifest provider doesn't have support for'{rpath}'."
            logger.error(self.log.svc_log(f"{msg}"))
            raise ResourceMapError(errno.EINVAL, msg)

        info = MonUtils.normalize_kv(info, HEALTH_UNDESIRED_VALS,
                                     "Not Available")
        return info

    def get_server_info(self, server_hw_data):
        """Get server manifest information.

        Runs every registered provider; a provider that raises is logged
        and its resource is reported as an empty list.

        Returns:
            A single-element list holding the per-resource-type data and a
            'last_updated' timestamp.
        """
        server = []
        info = {}
        for res_type in self.server_resources:
            info.update({res_type: {}})
            for fru, method in self.server_resources[res_type].items():
                try:
                    if res_type == "hw":
                        info[res_type].update(
                            {fru: method(server_hw_data, fru)})
                    else:
                        info[res_type].update({fru: method()})
                except Exception as err:
                    logger.error(
                        self.log.svc_log(f"{err.__class__.__name__}:{err}"))
                    info[res_type].update({fru: []})
        info["last_updated"] = int(time.time())
        server.append(info)
        return server

    def get_server_hw_info(self):
        """Get server hw information.

        Maps the lshw key/values onto manifest keys, writes them through
        the output kv store, reads the structured result back and sorts
        the resources listed in the manifest indexing map.
        """
        cls_res_cnt = {}
        # Start from an empty mapping so entries collected by an earlier
        # call are not written into this run's output.
        self.kv_dict = {}
        data, output_file = self.set_lshw_input_data()
        for kv_key in data.get_keys():
            if not kv_key.endswith('class'):
                continue
            r_spec = data.get(kv_key)
            # Per-class running index, starting at 0.
            cls_res_cnt[r_spec] = cls_res_cnt.get(r_spec, -1) + 1
            if r_spec not in self.class_mapping:
                continue
            for field, manifest_field in self.field_mapping.items():
                manifest_key = self.class_mapping[r_spec] % (
                    cls_res_cnt[r_spec], manifest_field)
                self.map_manifest_server_data(field, manifest_key, data,
                                              kv_key)
        # Adding data to kv
        output_file.set(self.kv_dict.keys(), self.kv_dict.values())
        lshw_data = self.get_manifest_output_data()
        # Nothing may have been mapped, in which case there is no "hw" key.
        hw_data = lshw_data.get("hw", {})

        # Removing out storage enclosure (RBOD/JBOD) drives from disk server
        # data as they are coming from lshw, as those need to be represented
        # separately & uniquely
        if "disk" in hw_data:
            hw_data["disk"] = self.get_local_disk(hw_data["disk"])

        # Sort list by serial_number
        for resource, sort_key_path in \
                self.resource_indexing_map["hw"].items():
            if resource not in hw_data:
                continue
            eth_ctrl = []
            if resource == "nw_port":
                # Separating out ethernet controller and ethernet interface
                # data for sorting.
                eth_ctrl = [entry for entry in hw_data[resource]
                            if entry['logical_name'] == 'NA']
                hw_data[resource] = [entry for entry in hw_data[resource]
                                     if entry['logical_name'] != 'NA']
            hw_data[resource] = MonUtils.sort_by_specific_kv(
                hw_data[resource], sort_key_path, self.log)
            if eth_ctrl:
                # Controllers go after the sorted interfaces.
                hw_data[resource] += eth_ctrl
        return lshw_data

    def get_hw_resources_info(self, server_hw_data, resource=False):
        """Get server hw resource information.

        Args:
            server_hw_data: dict as returned by get_server_hw_info().
            resource: "hw" for all hw resources, else one resource name.

        Returns:
            For "hw": {"hw": {...}} with the resources ordered as in
            self.server_resources["hw"]. Otherwise the list stored for the
            resource, or [] when it is absent.
        """
        hw_data = server_hw_data.get("hw", {})
        if resource == "hw":
            # Sorting output dictionary according to data priority.
            return {
                "hw": {
                    server_type: hw_data[server_type]
                    for server_type in self.server_resources["hw"]
                    if server_type in hw_data
                }
            }
        return hw_data.get(resource, [])

    def set_lshw_input_data(self):
        """
        KvStoreFactory can not accept a dictionary as direct input and output
        It will support only JSON, YAML, TOML, INI, PROPERTIES files. So
        here we are fetching the lshw data and adding that to a file for
        further execution.

        Returns:
            (input_file, output_file): the loaded lshw kv store and the kv
            store bound to the (initially empty) manifest output file.

        Raises:
            ResourceMapError: when lshw fails or the files cannot be
            prepared.
        """
        input_file = None
        output_file = None
        response, err, returncode = SimpleProcess("lshw -json").run()
        if returncode:
            msg = f"Failed to capture Node support data. Error:{str(err)}"
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg)
        try:
            with open(LSHW_FILE, 'w+') as fp:
                json.dump(json.loads(response.decode("utf-8")), fp, indent=4)
            with open(MANIFEST_OUTPUT_FILE, 'w+') as fp:
                json.dump({}, fp, indent=4)
            input_file = KvStoreFactory.get_instance(
                f'json://{LSHW_FILE}').load()
            output_file = KvStoreFactory.get_instance(
                f'json://{MANIFEST_OUTPUT_FILE}')
        except Exception as e:
            msg = "Error in getting {0} file: {1}".format(LSHW_FILE, e)
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg) from e
        return input_file, output_file

    def map_manifest_server_data(self, field, manifest_key, data, kv_key):
        """Mapping actual lshw output data with standard structured manifest data.

        Stores the value of `field` (a sibling of `kv_key`) under
        `manifest_key` in self.kv_dict. Missing/empty values become 'NA';
        an 'id' of a nested entry is prefixed with "<parent id>-" when the
        parent id is available.
        """
        parent_id = ""
        base_key = '>'.join(kv_key.split('>')[:-1])
        value = data.get(base_key + '>' + field if base_key else field)
        if isinstance(value, list):
            value = ','.join(str(item) for item in value)
        elif value and not isinstance(value, str):
            # Scalars other than strings cannot be cleaned/concatenated.
            value = str(value)
        value = value.replace(" (To be filled by O.E.M.)", "") \
            if value else 'NA'
        if field == 'id' and '>' in kv_key:
            parent_key = '>'.join(kv_key.split('>')[:-2])
            parent_field = '>' + field if parent_key else field
            parent_value = data.get(parent_key + parent_field)
            # A parent without an id simply contributes no prefix.
            if parent_value is not None:
                parent_id = str(parent_value) + "-"
        self.kv_dict[manifest_key] = parent_id + value

    def get_manifest_output_data(self):
        """Returns JSON data in the manifest output file.

        The temporary lshw and manifest files are removed afterwards; a
        failed removal is only logged.

        Raises:
            ResourceMapError: when the output file cannot be read/parsed.
        """
        data = {}
        try:
            with open(MANIFEST_OUTPUT_FILE) as json_file:
                data = json.load(json_file)
        except Exception as e:
            msg = "Error in getting {0} file: {1}".format(
                MANIFEST_OUTPUT_FILE, e)
            logger.error(self.log.svc_log(msg))
            raise ResourceMapError(errno.EINVAL, msg) from e
        for tmp_file in (LSHW_FILE, MANIFEST_OUTPUT_FILE):
            # Each file is cleaned up independently so one failure does not
            # leave the other file behind.
            try:
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
            except OSError as ex:
                msg = f"Failed in manifest tmp files cleanup. Error:{str(ex)}"
                logger.warning(self.log.svc_log(msg))
        return data

    def get_cortx_service_info(self):
        """Get cortx service info in required format."""
        cortx_services = self.service.get_cortx_service_list()
        cortx_service_info = self.get_service_info(cortx_services)
        sort_key_path = self.resource_indexing_map["sw"]["cortx_sw_services"]
        cortx_service_info = MonUtils.sort_by_specific_kv(
            cortx_service_info, sort_key_path, self.log)
        return cortx_service_info

    def get_external_service_info(self):
        """Get external service info in required format."""
        external_services = self.service.get_external_service_list()
        external_service_info = self.get_service_info(external_services)
        sort_key_path = self.resource_indexing_map["sw"][
            "external_sw_services"]
        external_service_info = MonUtils.sort_by_specific_kv(
            external_service_info, sort_key_path, self.log)
        return external_service_info

    def get_service_info(self, services):
        """Returns node server services info.

        Services for which no systemd information is available are
        skipped.
        """
        services_info = []
        for service in services:
            response = self.service.get_systemd_service_info(self.log, service)
            if response is None:
                continue
            uid, _, health_description, _, specifics = response
            service_info = {
                "uid": uid,
                "type": "software",
                "description": health_description,
                # Name and version are promoted to top-level fields and
                # therefore removed from the specifics entry.
                "product": specifics[0].pop("service_name"),
                "manufacturer": "Not Applicable",
                "serial_number": "Not Applicable",
                "part_number": "Not Applicable",
                "version": specifics[0].pop("version"),
                "last_updated": int(time.time()),
                "specifics": specifics
            }
            services_info.append(service_info)
        return services_info

    def get_bmc_version_info(self):
        """Returns node server bmc info (empty list when unavailable)."""
        bmc_data = []
        specifics = self.platform.get_bmc_info()
        if specifics:
            bmc = {
                "uid": 'bmc',
                "type": "firmware",
                "description": "BMC and IPMI version information",
                "product": specifics.get("product_name", "NA"),
                "manufacturer": specifics.get("manufacturer_name", "NA"),
                "serial_number": "Not Applicable",
                "part_number": "Not Applicable",
                "version": specifics.get("firmware_revision", "NA"),
                "last_updated": int(time.time()),
                "specifics": [specifics]
            }
            bmc_data.append(bmc)
        return bmc_data

    def get_os_server_info(self):
        """Returns node server os info (empty list when unavailable)."""
        os_data = []
        specifics = self.platform.get_os_info()
        if specifics:
            os_info = {
                "uid": specifics.get("id", "NA"),
                "type": "software",
                "description": "OS information",
                "product": specifics.get("pretty_name", "NA"),
                "manufacturer": specifics.get("manufacturer_name",
                                              "Not Applicable"),
                "serial_number": "Not Applicable",
                "part_number": "Not Applicable",
                "version": specifics.get("version", "NA"),
                "last_updated": int(time.time()),
                "specifics": [specifics]
            }
            os_data.append(os_info)
        return os_data

    @staticmethod
    def get_local_disk(disks):
        """Return the disks whose version is not an enclosure drive version."""
        # TODO: We need better logic to identify server local disks.
        return [
            disk for disk in disks
            if disk.get("version")
            not in ServerManifest._ENCLOSURE_DISK_VERSIONS
        ]
def get_systemd_service_info(self, service_name):
    """Collect the health details of one systemd service over dbus.

    Returns the populated health template of the service, or None when
    the unit could not be loaded through dbus.
    """
    try:
        unit = Service()._bus.get_object(
            const.SYSTEMD_BUS, Service()._manager.LoadUnit(service_name))
        props = Interface(unit, dbus_interface=PROPERTIES_IFACE)
    except DBusException as err:
        logger.error(
            self.log.svc_log(
                f"Unable to initialize {service_name} due to {err}"))
        return None

    exec_start = props.Get(const.SERVICE_IFACE, 'ExecStart')
    try:
        command_line_path = str(exec_start[0][0])
    except IndexError as err:
        logger.error(
            self.log.svc_log(
                f"Unable to find {service_name} path due to {err}"))
        command_line_path = "NA"

    # NOTE(review): a unit whose UnitFileState contains 'invalid' is
    # counted as installed here - confirm this is the intended check.
    is_installed = command_line_path != "NA" or \
        'invalid' in props.Get(const.UNIT_IFACE, 'UnitFileState')
    uid = str(props.Get(const.UNIT_IFACE, 'Id'))

    # Defaults describe a service that is not installed; the installed
    # branch below overrides them with the values read from systemd/rpm.
    details = {
        "service_name": uid,
        "description": "NA",
        "installed": str(is_installed).lower(),
        "pid": "NA",
        "state": "NA",
        "substate": "NA",
        "status": "NA",
        "license": "NA",
        "version": "NA",
        "command_line_path": "NA"
    }

    if is_installed:
        service_description = str(
            props.Get(const.UNIT_IFACE, 'Description'))
        state = str(props.Get(const.UNIT_IFACE, 'ActiveState'))
        substate = str(props.Get(const.UNIT_IFACE, 'SubState'))
        if 'disabled' in props.Get(const.UNIT_IFACE, 'UnitFileState'):
            service_status = 'disabled'
        else:
            service_status = 'enabled'
        if state == "inactive":
            pid = "NA"
        else:
            pid = str(props.Get(const.SERVICE_IFACE, 'ExecMainPID'))

        version = "NA"
        try:
            version = Service().get_service_info_from_rpm(uid, "VERSION")
        except ServiceError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to get service version due to {err}"))
        service_license = "NA"
        try:
            service_license = Service().get_service_info_from_rpm(
                uid, "LICENSE")
        except ServiceError as err:
            logger.error(
                self.log.svc_log(
                    f"Unable to get service license due to {err}"))

        details.update({
            "description": service_description,
            "pid": pid,
            "state": state,
            "substate": substate,
            "status": service_status,
            "license": service_license,
            "version": version,
            "command_line_path": command_line_path
        })

        running = (service_status, state, substate) == \
            ('enabled', 'active', 'running')
        if running:
            health_status = 'OK'
            health_description = f"{uid} is in good health"
            recommendation = "NA"
        else:
            health_status = state
            health_description = f"{uid} is not in good health"
            recommendation = const.DEFAULT_RECOMMENDATION
    else:
        health_status = "NA"
        health_description = f"Software enabling {uid} is not installed"
        recommendation = "NA"

    service_info = self.get_health_template(uid, is_fru=False)
    self.set_health_data(service_info, health_status, health_description,
                         recommendation, [details])
    return service_info
class ServerHealth(): """ ServerHealth class provides resource map and related information like health. """ name = "server_health" def __init__(self): """Initialize server.""" super().__init__() self.log = CustomLog(const.HEALTH_SVC_NAME) server_type = Conf.get(GLOBAL_CONF, NODE_TYPE_KEY) Platform.validate_server_type_support(self.log, ResourceMapError, server_type) self.sysfs = ToolFactory().get_instance('sysfs') self.sysfs.initialize() self.sysfs_base_path = self.sysfs.get_sysfs_base_path() self.cpu_path = self.sysfs_base_path + const.CPU_PATH hw_resources = { 'cpu': self.get_cpu_info, 'platform_sensor': self.get_platform_sensors_info, 'memory': self.get_mem_info, 'fan': self.get_fans_info, 'nw_port': self.get_nw_ports_info, 'sas_hba': self.get_sas_hba_info, 'sas_port': self.get_sas_ports_info, 'disk': self.get_disks_info, 'psu': self.get_psu_info } sw_resources = { 'cortx_sw_services': self.get_cortx_service_info, 'external_sw_services': self.get_external_service_info, 'raid': self.get_raid_info } self.server_resources = {"hw": hw_resources, "sw": sw_resources} self._ipmi = IpmiFactory().get_implementor("ipmitool") self.platform_sensor_list = ['Temperature', 'Voltage', 'Current'] self.service = Service() self.resource_indexing_map = ServerResourceMap.resource_indexing_map\ ["health"] def get_data(self, rpath): """Fetch health information for given rpath.""" logger.info(self.log.svc_log(f"Get Health data for rpath:{rpath}")) info = {} resource_found = False nodes = rpath.strip().split(">") leaf_node, _ = ServerResourceMap.get_node_info(nodes[-1]) # Fetch health information for all sub nodes if leaf_node == "server": info = self.get_server_health_info() resource_found = True elif leaf_node in self.server_resources: for resource, method in self.server_resources[leaf_node].items(): try: info.update({resource: method()}) resource_found = True except Exception as err: logger.error( self.log.svc_log(f"{err.__class__.__name__}: {err}")) info = None else: for node in 
nodes: resource, _ = ServerResourceMap.get_node_info(node) for res_type in self.server_resources: method = self.server_resources[res_type].get(resource) if not method: logger.error( self.log.svc_log( f"No mapping function found for {res_type}")) continue try: info = method() resource_found = True except Exception as err: logger.error( self.log.svc_log( f"{err.__class__.__name__}: {err}")) info = None if resource_found: break if not resource_found: msg = f"Invalid rpath or health provider doesn't have support for'{rpath}'." logger.error(self.log.svc_log(f"{msg}")) raise ResourceMapError(errno.EINVAL, msg) info = MonUtils.normalize_kv(info, const.HEALTH_UNDESIRED_VALS, "Not Available") return info @staticmethod def _is_any_resource_unhealthy(fru, data): """Check for any unhealthy resource at child level.""" for child in data[fru]: if isinstance(child, dict): if child.get("health") and \ child["health"]["status"].lower() != "ok": return True return False @staticmethod def get_health_template(uid, is_fru: bool): """Returns health template.""" return { "uid": uid, "fru": str(is_fru).lower(), "last_updated": "", "health": { "status": "", "description": "", "recommendation": "", "specifics": [] } } @staticmethod def set_health_data(health_data: dict, status, description=None, recommendation=None, specifics=None): """Sets health attributes for a component.""" good_state = (status == "OK") if not description or \ description in const.HEALTH_UNDESIRED_VALS: description = "%s %s in good health." 
% ( health_data.get("uid"), 'is' if good_state else 'is not') if not good_state: if not recommendation or \ recommendation in const.HEALTH_UNDESIRED_VALS: recommendation = const.DEFAULT_RECOMMENDATION else: recommendation = "None" health_data["last_updated"] = int(time.time()) health_data["health"].update({ "status": status, "description": description, "recommendation": recommendation, "specifics": specifics }) def get_server_health_info(self): """Returns overall server information.""" unhealthy_resource_found = False server_details = Platform().get_server_details() # Currently only one instance of server is considered server = [] info = {} info["make"] = server_details["Board Mfg"] info["model"] = server_details["Product Name"] try: build_instance = BuildInfo() info["product_family"] = build_instance.get_attribute("NAME") info["version"] = build_instance.get_attribute("VERSION") info["build"] = build_instance.get_attribute("BUILD") except Exception as err: logger.error( self.log.svc_log(f"Unable to get build info due to {err}")) info["resource_usage"] = {} info["resource_usage"]["cpu_usage"] = self.get_cpu_overall_usage() info["resource_usage"]["disk_usage"] = self.get_disk_overall_usage() info["resource_usage"]["memory_usage"] = self.get_memory_overall_usage( ) for res_type in self.server_resources: info.update({res_type: {}}) for fru, method in self.server_resources[res_type].items(): try: info[res_type].update({fru: method()}) unhealthy_resource_found = self._is_any_resource_unhealthy( fru, info[res_type]) except Exception as err: logger.error( self.log.svc_log(f"{err.__class__.__name__}:{err}")) info[res_type].update({fru: None}) info["uid"] = socket.getfqdn() info["last_updated"] = int(time.time()) info["health"] = {} info["health"][ "status"] = "OK" if not unhealthy_resource_found else "Degraded" health_desc = 'good' if info["health"]["status"] == 'OK' else 'bad' info["health"]["description"] = f"Server is in {health_desc} health." 
info["health"]["recommendation"] = const.DEFAULT_RECOMMENDATION \ if info["health"]["status"] != "OK" else "NA" info["health"]["specifics"] = [] server.append(info) return server @staticmethod def get_cpu_usage(index=2, percpu=False): """Get CPU usage list.""" i = 0 cpu_usage = None while i < index: cpu_usage = psutil.cpu_percent(interval=None, percpu=percpu) time.sleep(1) i = i + 1 return cpu_usage def get_cpu_list(self, mode): """Returns the CPU list as per specified mode.""" cpu_info_path = Path(self.cpu_path + mode) # Read the text from /cpu/online file cpu_info = cpu_info_path.read_text() # Drop the \n character from the end of string cpu_info = cpu_info.rstrip('\n') # Convert the string to list of indexes cpu_list = self.sysfs.convert_cpu_info_list(cpu_info) return cpu_list def get_cpu_info(self, add_overall_usage=False): """Update and return CPU information in specific format.""" per_cpu_data = [] cpu_present = self.get_cpu_list("present") cpu_online = self.get_cpu_list("online") cpu_usage = self.get_cpu_usage(percpu=True) cpu_usage_dict = dict(zip(cpu_online, cpu_usage)) overall_cpu_usage = list(psutil.getloadavg()) cpu_count = len(cpu_present) overall_usage = { "current": self.get_cpu_usage(percpu=False), "1_min_avg": overall_cpu_usage[0], "5_min_avg": overall_cpu_usage[1], "15_min_avg": overall_cpu_usage[2] } for cpu_id in range(0, cpu_count): uid = f"CPU-{cpu_id}" cpu_dict = self.get_health_template(uid, is_fru=False) online_status = "Online" if cpu_id in cpu_online else "Offline" health_status = "OK" if online_status == "Online" else "NA" usage = "NA" if health_status == "NA" \ else cpu_usage_dict[cpu_id] specifics = [{"cpu_usage": usage, "state": online_status}] self.set_health_data(cpu_dict, status=health_status, specifics=specifics) per_cpu_data.append(cpu_dict) cpu_data = [{ "overall_usage": overall_usage, "cpu_count": cpu_count, "last_updated": int(time.time()), "cpus": per_cpu_data }] if not add_overall_usage: cpu_data = per_cpu_data sort_key_path 
= self.resource_indexing_map["hw"]["cpu"] cpu_data = MonUtils.sort_by_specific_kv(cpu_data, sort_key_path, self.log) logger.debug(self.log.svc_log(f"CPU Health Data:{cpu_data}")) return cpu_data def get_cpu_overall_usage(self): """Returns CPU overall usage.""" overall_usage = None cpu_data = self.get_cpu_info(add_overall_usage=True) if cpu_data[0].get("overall_usage"): overall_usage = cpu_data[0].get("overall_usage") else: logger.error(self.log.svc_log("Failed to get overall cpu usage")) return overall_usage def get_disk_info(self, add_overall_usage=False): """Update and return Disk information in specific format.""" per_disk_data = [] overall_usage = None disk_data = [{ "overall_usage": overall_usage, "last_updated": int(time.time()), "disks": per_disk_data }] if not add_overall_usage: disk_data = per_disk_data logger.debug(self.log.svc_log(f"Disk Health Data:{disk_data}")) return disk_data def format_ipmi_platform_sensor_reading(self, reading): """ builds json response from ipmi tool response. reading arg sample: ('CPU1 Temp', '01', 'ok', '3.1', '36 degrees C'). """ uid = '_'.join(reading[0].split()) sensor_id = reading[0] sensor_props = self._ipmi.get_sensor_props(sensor_id) lower_critical = sensor_props[1].get('Lower Critical', 'NA') upper_critical = sensor_props[1].get('Upper Critical', 'NA') lower_non_recoverable = sensor_props[1].get('Lower Non-Recoverable', 'NA') upper_non_recoverable = sensor_props[1].get('Upper Non-Recoverable', 'NA') status = 'OK' if reading[2] == 'ok' else 'NA' health_desc = 'good' if status == 'OK' else 'bad' description = f"{uid} sensor is in {health_desc} health." 
recommendation = const.DEFAULT_RECOMMENDATION if status != 'OK' else 'NA' specifics = [{ "Sensor Reading": f"{reading[-1]}", "lower_critical_threshold": lower_critical, "upper_critical_threshold": upper_critical, "lower_non_recoverable": lower_non_recoverable, "upper_non_recoverable": upper_non_recoverable, }] resp = self.get_health_template(uid, is_fru=False) self.set_health_data(resp, status, description, recommendation, specifics) return resp def get_platform_sensors_info(self): """Get the sensor information based on sensor_type and instance.""" response = {sensor: [] for sensor in self.platform_sensor_list} for sensor in self.platform_sensor_list: sensor_reading = self._ipmi.get_sensor_list_by_type(sensor) if not sensor_reading: logger.debug( self.log.svc_log(f"No sensor data received for :{sensor}")) continue for reading in sensor_reading: response[sensor].append( self.format_ipmi_platform_sensor_reading(reading)) logger.debug( self.log.svc_log(f"Platform Sensor Health Data:{response}")) return response def get_mem_info(self): """Collect & return system memory info in specific format.""" default_mem_usage_threshold = int( Conf.get(SSPL_CONF, "NODEDATAMSGHANDLER>host_memory_usage_threshold", 80)) data = [] status = "OK" description = "Host memory is in good health." self.mem_info = dict(psutil.virtual_memory()._asdict()) curr_mem_usage_threshold = int(self.mem_info['percent']) if curr_mem_usage_threshold > int(default_mem_usage_threshold): status = "Overloaded" description = ( f"Current host memory usage is {curr_mem_usage_threshold}," f"beyond configured threshold of {default_mem_usage_threshold}." 
) memory_dict = self.prepare_mem_json(status, description) data.append(memory_dict) logger.debug(self.log.svc_log(f"Memory Health Data:{data}")) return data def prepare_mem_json(self, status, description): """Update and return memory information dict.""" total_memory = {} for key, value in self.mem_info.items(): if key == 'percent': total_memory[key] = str(value) + '%' else: total_memory[key] = str(value >> 20) + 'MB' uid = "main_memory" specifics = [{ "total": total_memory['total'], "available": total_memory['available'], "percent": total_memory['percent'], "used": total_memory['used'], "free": total_memory['free'], "active": total_memory['active'], "inactive": total_memory['inactive'], "buffers": total_memory['buffers'], "cached": total_memory['cached'], "shared": total_memory['shared'], "slab": total_memory['slab'] }] memory_dict = self.get_health_template(uid, is_fru=False) self.set_health_data(memory_dict, status=status, description=description, specifics=specifics) return memory_dict def get_memory_overall_usage(self): """Returns Memory overall usage.""" overall_usage = None mem_info = self.get_mem_info() if mem_info[0].get("health"): overall_usage = mem_info[0]["health"]["specifics"] else: logger.error( self.log.svc_log("Failed to get memory overall usage")) return overall_usage def get_fans_info(self): """Get the Fan sensor information using ipmitool.""" data = [] sensor_reading = self._ipmi.get_sensor_list_by_type('Fan') if sensor_reading is None: msg = "Failed to get Fan sensor reading using ipmitool" logger.error(self.log.svc_log(msg)) return for fan_reading in sensor_reading: sensor_id = fan_reading[0] fan_dict = self.get_health_template(sensor_id, is_fru=True) sensor_props = self._ipmi.get_sensor_props(sensor_id) status = 'OK' if fan_reading[2] == 'ok' else 'NA' lower_critical = sensor_props[1].get('Lower Critical', 'NA') upper_critical = sensor_props[1].get('Upper Critical', 'NA') specifics = [{ "Sensor Reading": f"{fan_reading[-1]}", 
"lower_critical_threshold": lower_critical, "upper_critical_threshold": upper_critical }] self.set_health_data(fan_dict, status=status, specifics=specifics) data.append(fan_dict) logger.debug(self.log.svc_log(f"Fan Health Data:{fan_dict}")) return data def get_sas_hba_info(self): """Return SAS-HBA current health.""" sas_hba_data = [] sas_instance = SAS() try: hosts = sas_instance.get_host_list() # ['host1'] except SASError as err: hosts = [] logger.error(self.log.svc_log(err)) except Exception as err: hosts = [] logger.exception(self.log.svc_log(err)) for host in hosts: host_id = const.SAS_RESOURCE_ID + host.replace('host', '') host_data = self.get_health_template(host_id, False) try: ports = sas_instance.get_port_list(host) # ports = ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3'] except SASError as err: ports = [] logger.error(self.log.svc_log(err)) except Exception as err: ports = [] logger.exception(self.log.svc_log(err)) health = "OK" specifics = {'num_ports': len(ports), 'ports': []} for port in ports: try: port_data = sas_instance.get_port_data(port) except SASError as err: port_data = [] logger.error(self.log.svc_log(err)) except Exception as err: port_data = [] logger.exception(self.log.svc_log(err)) specifics['ports'].append(port_data) if not port_data or port_data['state'] != 'running': health = "NA" self.set_health_data(host_data, health, specifics=[specifics]) sas_hba_data.append(host_data) return sas_hba_data def get_sas_ports_info(self): """Return SAS Ports current health.""" sas_ports_data = [] sas_instance = SAS() try: ports = sas_instance.get_port_list() # eg: ['port-1:0', 'port-1:1', 'port-1:2', 'port-1:3'] except SASError as err: ports = [] logger.error(self.log.svc_log(err)) except Exception as err: ports = [] logger.exception(self.log.svc_log(err)) for port in ports: port_id = 'sas_' + port port_data = self.get_health_template(port_id, False) try: phys = sas_instance.get_phy_list_for_port(port) # eg: [ 'phy-1:0', 'phy-1:1', 'phy-1:2', 
'phy-1:3'] except SASError as err: phys = [] logger.error(self.log.svc_log(err)) except Exception as err: phys = [] logger.exception(self.log.svc_log(err)) specifics = {'num_phys': len(phys), 'phys': []} health = "OK" for phy in phys: try: phy_data = sas_instance.get_phy_data(phy) except SASError as err: phy_data = {} logger.error(self.log.svc_log(err)) except Exception as err: phy_data = {} logger.exception(self.log.svc_log(err)) specifics['phys'].append(phy_data) if not phy_data or phy_data['state'] != 'enabled' or \ 'Gbit' not in phy_data['negotiated_linkrate']: health = "NA" self.set_health_data(port_data, health, specifics=[specifics]) sas_ports_data.append(port_data) return sas_ports_data def get_nw_ports_info(self): """Return the Network ports information.""" network_cable_data = [] loopback_interface = {} sort_key_path = None io_counters = psutil.net_io_counters(pernic=True) nw_instance = Network() for interface, addrs in psutil.net_if_addrs().items(): nic_info = self.get_health_template(interface, False) specifics = {} for addr in addrs: if addr.family == socket.AF_INET: specifics["ipV4"] = addr.address if interface in io_counters: io_info = io_counters[interface] specifics = { "networkErrors": io_info.errin + io_info.errout, "droppedPacketsIn": io_info.dropin, "droppedPacketsOut": io_info.dropout, "packetsIn": io_info.packets_recv, "packetsOut": io_info.packets_sent, "trafficIn": io_info.bytes_recv, "trafficOut": io_info.bytes_sent } # Get the interface health status. nw_status, nw_cable_conn_status = \ self.get_nw_status(nw_instance, interface) specifics["nwStatus"] = nw_status specifics["nwCableConnStatus"] = nw_cable_conn_status specifics["logical_name"] = interface # Map and set the interface health status and description. map_status = { "CONNECTED": "OK", "DISCONNECTED": "Disabled/Failed", "UNKNOWN": "NA" } health_status = map_status[nw_cable_conn_status] desc = "Network Interface '%s' is %sin good health." 
% ( interface, '' if health_status == "OK" else 'not ') self.set_health_data(nic_info, health_status, description=desc, specifics=[specifics]) # Separating out loopback interface and ethernet interface # data to make correct sorting/mapping with manifest data. if interface == 'lo': loopback_interface = nic_info else: network_cable_data.append(nic_info) sort_key_path = self.resource_indexing_map["hw"]["nw_port"] network_cable_data = MonUtils.sort_by_specific_kv( network_cable_data, sort_key_path, self.log) if loopback_interface: network_cable_data.append(loopback_interface) return network_cable_data def get_nw_status(self, nw_interface, interface): """Read & Return the latest network status from sysfs files.""" try: nw_status = nw_interface.get_operational_state(interface) except NetworkError as err: nw_status = "UNKNOWN" logger.error(self.log.svc_log(err)) except Exception as err: nw_status = "UNKNOWN" logger.exception(self.log.svc_log(err)) try: nw_cable_conn_status = nw_interface.get_link_state(interface) except NetworkError as err: nw_cable_conn_status = "UNKNOWN" logger.exception(self.log.svc_log(err)) except Exception as err: nw_cable_conn_status = "UNKNOWN" logger.exception(self.log.svc_log(err)) return nw_status, nw_cable_conn_status def get_cortx_service_info(self): """Get cortx service info in required format.""" cortx_services = self.service.get_cortx_service_list() cortx_service_info = self.get_service_info(cortx_services) sort_key_path = self.resource_indexing_map["sw"]["cortx_sw_services"] cortx_service_info = MonUtils.sort_by_specific_kv( cortx_service_info, sort_key_path, self.log) return cortx_service_info def get_external_service_info(self): """Get external service info in required format.""" external_services = self.service.get_external_service_list() external_service_info = self.get_service_info(external_services) sort_key_path = self.resource_indexing_map["sw"][ "external_sw_services"] external_service_info = MonUtils.sort_by_specific_kv( 
external_service_info, sort_key_path, self.log) return external_service_info def get_service_info(self, services): services_info = [] for service in services: response = self.service.get_systemd_service_info(self.log, service) if response is not None: uid, health_status, health_description, recommendation, \ specifics = response service_info = self.get_health_template(uid, is_fru=False) self.set_health_data(service_info, health_status, health_description, recommendation, specifics) services_info.append(service_info) return services_info def get_raid_info(self): raids_data = [] for raid in RAIDs.get_configured_raids(): raid_data = self.get_health_template(raid.id, False) health, description = raid.get_health() devices = raid.get_devices() specifics = [{ "location": raid.raid, "data_integrity_status": raid.get_data_integrity_status(), "devices": devices }] self.set_health_data(raid_data, health, specifics=specifics, description=description) raids_data.append(raid_data) return raids_data @staticmethod def get_disk_overall_usage(): units_factor_GB = 1000000000 overall_usage = { "totalSpace": f'{int(psutil.disk_usage("/")[0])//int(units_factor_GB)} GB', "usedSpace": f'{int(psutil.disk_usage("/")[1])//int(units_factor_GB)} GB', "freeSpace": f'{int(psutil.disk_usage("/")[2])//int(units_factor_GB)} GB', "diskUsedPercentage": psutil.disk_usage("/")[3], } return overall_usage def get_disks_info(self): """Update and return server drive information in specific format.""" disks = [] sort_key_path = None for disk in Disk.get_disks(): uid = disk.path if disk.path else disk.id disk_health = self.get_health_template(uid, True) health_data = disk.get_health() health = "OK" if (health_data['SMART_health'] == "PASSED") else "Fault" serial_number = disk.id.split("-")[-1] if disk.id else "NA" health_data.update({"serial_number": serial_number}) self.set_health_data(disk_health, health, specifics=[{ "SMART": health_data }]) disks.append(disk_health) # Sort disk list by serial_number 
sort_key_path = self.resource_indexing_map["hw"]["disk"] disks = MonUtils.sort_by_specific_kv(disks, sort_key_path, self.log) logger.debug(self.log.svc_log(f"Disk Health Data:{disks}")) return disks def get_psu_info(self): """Update and return PSU information in specific format.""" psus_data = [] sort_key_path = None for psu in self.get_psus(): psu = {k.lower().replace(" ", "_"): v for k, v in psu.items()} data = self.get_health_template(f'{psu["location"]}', True) health = "OK" if (psu["status"] == "Present, OK") else "Fault" self.set_health_data(data, health, specifics=psu) psus_data.append(data) # Sort disk list by serial_number sort_key_path = self.resource_indexing_map["hw"]["psu"] psus_data = MonUtils.sort_by_specific_kv(psus_data, sort_key_path, self.log) logger.debug(self.log.svc_log(f"PSU Health Data:{psus_data}")) return psus_data @staticmethod def get_psus(): response, _, _ = SimpleProcess("dmidecode -t 39").run() matches = re.findall( "System Power Supply|Power Unit Group:.*|" "Location:.*|Name:.*|Serial Number:.*|" "Max Power Capacity:.*|Status: .*|" "Plugged:.*|Hot Replaceable:.*", response.decode()) psus = [] stack = [] while matches: item = matches.pop() while item != "System Power Supply": stack.append(item) item = matches.pop() psu = {} while stack: key, value = stack.pop().strip().split(":") psu[key] = value.strip() psus.append(psu) return psus