class DecisionMonitor:
    """
    Read resource decisions back from the Decision DB and acknowledge them.

    The resource/resource-group layout is taken from the decision mapping
    file located at const.CONF_PATH / const.DECISION_MAPPING_FILE.
    """

    class ConsulCallHandler:
        """
        Thin async wrapper around DecisionDB that bounds every call
        with a timeout.
        """

        def __init__(self, resource_file):
            """
            Create the DB handle and pick the request timeout.

            :param resource_file: Loaded decision mapping. Its optional
                "request_timeout" entry overrides the 3.0 default.
            """
            self._decisiondb = DecisionDB()
            self._consul_timeout = resource_file.get("request_timeout", 3.0)

        async def get(self, **resource_key):
            """
            Fetch the events stored for a resource key, sorted by alert
            time in descending order.

            :param resource_key: Key fields forwarded to
                DecisionDB.get_event_time as keyword arguments.
            :return: Whatever DecisionDB.get_event_time returns.
            :raises asyncio.TimeoutError: If the call exceeds the timeout.
            """
            lookup = self._decisiondb.get_event_time(
                **resource_key,
                sort_by=SortBy(DecisionModel.alert_time, SortOrder.DESC))
            return await asyncio.wait_for(lookup, timeout=self._consul_timeout)

        async def delete(self, **resource_key):
            """
            Delete the events stored for a resource key.

            :param resource_key: Key fields forwarded to
                DecisionDB.delete_event as keyword arguments.
            :raises asyncio.TimeoutError: If the call exceeds the timeout.
            """
            removal = self._decisiondb.delete_event(**resource_key)
            await asyncio.wait_for(removal, timeout=self._consul_timeout)

    def __init__(self):
        """
        Load the decision mapping, grab the event loop and set up the
        DB call handler.
        """
        mapping_path = os.path.join(const.CONF_PATH,
                                    const.DECISION_MAPPING_FILE)
        self._resource_file = Json(mapping_path).load()
        self._loop = asyncio.get_event_loop()
        self._consul_call = self.ConsulCallHandler(self._resource_file)

    def get_resource_status(self, resource: AnyStr):
        """
        Get the status for a resource.

        :param resource: Name of the resource as listed under "resources"
            in the decision mapping.
        :return: The action of the first event returned by the DB, or
            Action.OK when there is no event or the lookup failed.
        """
        Log.debug(f"Received Status Request for resource {resource}")
        known_resources = self._resource_file.get("resources", {})
        resource_key = known_resources.get(resource, {})
        try:
            events = self._loop.run_until_complete(
                self._consul_call.get(**resource_key))
        except Exception as e:
            # A failed lookup is reported as OK on purpose.
            Log.error(f"{traceback.format_exc()} {e}")
            return Action.OK
        return events[0].action if events else Action.OK

    def get_resource_group_status(self, resource_group):
        """
        Fetch the combined status of a resource group.

        :param resource_group: Name of the group as listed under
            "resource_groups" in the decision mapping.
        :return: Action.FAILED as soon as one member is failed, otherwise
            Action.RESOLVED if any member is resolved, otherwise Action.OK.
        """
        Log.debug(f"Received Status Request for resource group {resource_group}")
        groups = self._resource_file.get("resource_groups", {})
        members = groups.get(resource_group, [])
        member_statuses = []
        for member in members:
            member_status = self.get_resource_status(member)
            if member_status in (Action.FAILED,):
                # One failed member decides the group; stop querying.
                return member_status
            member_statuses.append(member_status)
        if Action.RESOLVED in member_statuses:
            return Action.RESOLVED
        return Action.OK

    def acknowledge_resource(self, resource, force=False):
        """
        Acknowledge a single resource by deleting its stored events.

        :param resource: Name of the resource as listed under "resources"
            in the decision mapping.
        :param force: When True the events are deleted without checking
            the status; otherwise a resource whose status is
            Action.FAILED is left untouched.
        :return: None. Errors are logged, not raised.
        """
        Log.debug(f"Received Acknowledge Request for resource {resource}")
        known_resources = self._resource_file.get("resources", {})
        resource_key = known_resources.get(resource, {})
        try:
            if not force and self.get_resource_status(resource) == Action.FAILED:
                return
            self._loop.run_until_complete(
                self._consul_call.delete(**resource_key))
        except Exception as e:
            Log.error(f"{e}")

    def acknowledge_resource_group(self, resource_group):
        """
        Acknowledge every resource of a single resource group.

        :param resource_group: Name of the group as listed under
            "resource_groups" in the decision mapping.
        :return: None
        """
        Log.debug(f"Received Acknowledge Request for resource group {resource_group}")
        groups = self._resource_file.get("resource_groups", {})
        for member in groups.get(resource_group, []):
            self.acknowledge_resource(member)
class DecisionMaker(object):
    """
    This class is responsible for taking the HA decisions
    such as failover/failback with the help of RuleEngine
    """

    def __init__(self, decisiondb=None):
        """
        Load the rules and the HA configuration.

        :param decisiondb: Decision DB handle used to store actions.
            When omitted, a new DecisionDB is created for this instance.
        """
        self._rule_engine = RuleEngine(os.path.join(
            const.CORTX_HA_INSTALL_PATH, const.RULES_FILE_PATH))
        # The default is built here rather than in the signature: a default
        # argument is evaluated once, when the class is defined, which would
        # open the DB at import time and share one handle across instances.
        self._decision_db = DecisionDB() if decisiondb is None else decisiondb
        self._conf = Json(os.path.join(
            const.CORTX_HA_INSTALL_PATH, const.CONF_FILE_PATH)).load()

    async def _get_data_nw_interface(self, host_id):
        """
        Return the data network interfaces configured for host_id,
        or an empty list when no configuration is loaded.
        """
        if not self._conf:
            return []
        return self._conf.get(const.NETWORK).get(host_id).get(
            const.DATA_IFACE)

    async def _get_mgmt_nw_interface(self, host_id):
        """
        Return the management network interfaces configured for host_id,
        or an empty list when no configuration is loaded.
        """
        if not self._conf:
            return []
        return self._conf.get(const.NETWORK).get(host_id).get(
            const.MGMT_IFACE)

    async def _get_host_id(self, node_id):
        """
        Map node_id to its host id using the configuration, or return
        an empty string when no configuration is loaded.
        """
        if not self._conf:
            return ""
        return self._conf.get(const.NODES).get(node_id)

    async def handle_alert(self, alert):
        """
        Accepts alert in the dict format and validates the same
        alert against set of rules with the help of RuleEngine.

        When the rule engine yields an action, the action is stored in
        the decision DB. Errors are logged, never raised to the caller.

        :param alert: Alert as a dict; None is ignored.
        """
        try:
            if alert is None:
                return
            action = self._rule_engine.evaluate_alert(alert)
            if action is not None:
                await self._store_action(alert, action)
        except Exception as e:
            Log.error(f"Error occured during alert handling. {e}")

    async def _store_action(self, alert, action):
        """
        Further parses the alert to store information such as:
        component: Actual Hw component which has been affected
        component_id: FRU_ID
        entity: enclosure/node
        entity_id: resource id

        Nothing is stored when no key information could be derived
        from the alert. Errors are logged, never raised to the caller.
        """
        try:
            sensor_response = alert.get(const.MESSAGE).get(
                const.SENSOR_RES_TYPE)
            info_dict = await self._set_db_key_info(sensor_response)
            if info_dict:
                await self._decision_db.store_event(
                    info_dict[const.ENTITY],
                    info_dict[const.ENTITY_ID],
                    info_dict[const.COMPONENT],
                    info_dict[const.COMPONENT_ID],
                    info_dict[const.EVENT_TIME],
                    action)
        except Exception as e:
            Log.error(f"Error occured during storing action. {e}")

    async def _set_db_key_info(self, sensor_response):
        """
        This function derives entity, entity_id, component, component_id,
        event_time from the incoming alert.
        These fields are required to create key for storing the decision in db.
        Key format -
        HA/entity/entity_id/component/component_Id/timestamp
        Examples -
        1. HA/Enclosure/0/controller/1/timestamp
        2. HA/Enclosure/0/controller/2/timestamp
        3. HA/Enclosure/0/fan/0/timestamp
        4. HA/Node/1/raid/0/timestamp
        5. HA/Node/0/IEM/motr/timestamp
        6. HA/Node/1/IEM/s3/timestamp

        :param sensor_response: Sensor response part of the alert (dict).
        :return: Dict with the key fields, or an empty dict when the alert
            must be ignored (NIC alert for an interface that is neither
            data nor management, or a resource type from which no
            component can be derived).
        """
        info_dict = dict()
        info = sensor_response.get(const.INFO)
        resource_type = info.get(const.RESOURCE_TYPE)
        resource_id = info.get(const.RESOURCE_ID)
        node_id = info.get(const.NODE_ID)
        host_id = await self._get_host_id(node_id)

        # 1. Setting event time.
        info_dict[const.EVENT_TIME] = info.get(const.EVENT_TIME)

        # Here resource type can be in 2 forms -
        # 1. enclosure:fru:disk, node:os:disk_space, node:interface:nw:cable etc
        # 2. enclosure, iem
        # Splitting the resource type will give us the entity and component
        # fields.
        res_list = resource_type.split(':')

        # 2. Setting entity.
        # For IEM alerts we do not get Node/Enclosure in resource type,
        # so we have to hardcode it to node.
        if resource_type == const.IEM:
            component_var = sensor_response.get(const.SPECIFIC_INFO).get(
                const.SPECIFIC_INFO_COMPONENT)
            info_dict[const.ENTITY] = const.NODE
            info_dict[const.COMPONENT] = resource_type
            info_dict[const.COMPONENT_ID] = component_var
        else:
            info_dict[const.ENTITY] = res_list[0]

        # 3. Setting entity_id
        if info_dict[const.ENTITY] == const.NODE:
            info_dict[const.ENTITY_ID] = host_id
        else:
            info_dict[const.ENTITY_ID] = "0"

        # 4. Setting Component.
        # We will check if we have got the component value in resource type.
        if len(res_list) > 1:
            info_dict[const.COMPONENT] = res_list[-1]
        elif info_dict[const.ENTITY] == const.ENCLOSURE:
            # Component is not present in the resource_type field.
            # For storage connectivity we have
            # component = connectivity and component_id = node/host id.
            info_dict[const.COMPONENT] = const.CONNECTIVITY
            info_dict[const.COMPONENT_ID] = host_id

        if const.COMPONENT not in info_dict:
            # A single-segment resource type that is neither IEM nor
            # enclosure carries no component, so no DB key can be built.
            # Report that explicitly instead of failing with a KeyError.
            Log.error(
                f"Unable to derive component for resource type {resource_type}.")
            return {}

        # 5. Setting component id
        if info_dict[const.COMPONENT] == const.CONTROLLER:
            info_dict[const.COMPONENT_ID] = host_id
        elif resource_type in (const.NIC, const.NIC_CABLE):
            # If resource_type is node:interface:nw, node:interface:nw:cable
            # then we will read the values from config to know whether it is
            # data or management interface.
            # Since BMC interface is also included in NIC alert we do not have
            # to take any action against it. In case we found the interface
            # related to BMC we will ignore it.
            comp_id = await self._get_component_id_for_nic(host_id, resource_id)
            if comp_id:
                info_dict[const.COMPONENT_ID] = comp_id
            else:
                info_dict = {}
        elif resource_type not in (const.IEM, const.ENCLOSURE):
            # For IEM the component id is fetched from specific info's
            # component id field (set in step 2); enclosure connectivity got
            # its component id in step 4.
            info_dict[const.COMPONENT_ID] = resource_id

        return info_dict

    async def _get_component_id_for_nic(self, host_id, resource_id):
        """
        Classify a NIC resource as data or management interface.

        :param host_id: Host whose configured interfaces are consulted.
        :param resource_id: Interface name taken from the alert.
        :return: const.DATA, const.MGMT, or an empty string when the
            interface is in neither list.
        """
        # First checking if resource is found in data_nw.
        nw_interface = await self._get_data_nw_interface(host_id)
        if resource_id in nw_interface:
            return const.DATA
        # Since resource not found in data_nw let's search in mgmt_nw.
        nw_interface = await self._get_mgmt_nw_interface(host_id)
        if resource_id in nw_interface:
            return const.MGMT
        return ""