# Example no. 1
0
class DecisionMonitor:
    """
    Report and acknowledge resource decisions stored in the Decision DB.

    The resource mapping file loaded at construction time is read for:
    - "resources": resource name -> keyword arguments identifying the
      resource in the Decision DB.
    - "resource_groups": group name -> list of resource names.
    - "request_timeout": optional timeout for Decision DB requests.
    """

    def __init__(self):
        """
        Load the resource mapping file, then set up the event loop and the
        Decision DB call handler.
        """
        self._resource_file = Json(
            os.path.join(const.CONF_PATH, const.DECISION_MAPPING_FILE)).load()
        self._loop = self._get_event_loop()
        self._consul_call = self.ConsulCallHandler(self._resource_file)

    @staticmethod
    def _get_event_loop():
        """
        Return the current event loop, creating and installing a new one
        when the calling thread does not have any.
        :return: An asyncio event loop.
        """
        try:
            return asyncio.get_event_loop()
        except RuntimeError:
            # asyncio.get_event_loop() raises RuntimeError when no loop is
            # set for this thread (worker threads, newer Python releases).
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    class ConsulCallHandler:
        """
        Handle async call to consul, bounded by a timeout.
        """

        def __init__(self, resource_file):
            """
            Initialize consul call handler
            :param resource_file: Loaded resource mapping; its
                "request_timeout" entry is used as the request timeout in
                seconds and defaults to 3.0.
            """
            self._decisiondb = DecisionDB()
            self._consul_timeout = resource_file.get("request_timeout", 3.0)

        async def get(self, **resource_key):
            """
            Get consul data else raise error
            :param resource_key: Keyword arguments forwarded to
                DecisionDB.get_event_time.
            :return: Result of DecisionDB.get_event_time, requested sorted
                by alert time in descending order.
            :raises asyncio.TimeoutError: If the request exceeds the timeout.
            """
            request = self._decisiondb.get_event_time(
                **resource_key,
                sort_by=SortBy(DecisionModel.alert_time, SortOrder.DESC))
            return await asyncio.wait_for(request,
                                          timeout=self._consul_timeout)

        async def delete(self, **resource_key):
            """
            Delete consul data else raise error
            :param resource_key: Keyword arguments forwarded to
                DecisionDB.delete_event.
            :raises asyncio.TimeoutError: If the request exceeds the timeout.
            """
            await asyncio.wait_for(
                self._decisiondb.delete_event(**resource_key),
                timeout=self._consul_timeout)

    def get_resource_status(self, resource: AnyStr):
        """
        Get the Status for Resource
        :param resource: Name of Resource :type: str
        :return: The action of the first event returned for the resource
            (events are requested newest first), or Action.OK when there is
            no event or the lookup fails.
        """
        Log.debug(f"Received Status Request for resource {resource}")
        resource_key = self._resource_file.get("resources", {}).get(resource, {})
        try:
            resource_data = self._loop.run_until_complete(
                self._consul_call.get(**resource_key))
        except Exception as e:
            # Return OK if Failed to Fetch Resource Status.
            Log.error(f"{traceback.format_exc()} {e}")
            return Action.OK
        if resource_data:
            return resource_data[0].action
        return Action.OK

    def get_resource_group_status(self, resource_group):
        """
        Fetch Resource Group Status.
        :param resource_group: Name of Resource Group.
        :return: Action.FAILED as soon as one resource is failed, else
            Action.RESOLVED if any resource is resolved, else Action.OK.
        """
        group_status = []
        Log.debug(f"Received Status Request for resource group {resource_group}")
        # Fetch List of Resources in group
        resources = self._resource_file.get("resource_groups", {}).get(
            resource_group, [])
        for resource in resources:
            # Check the status of each resource.
            status = self.get_resource_status(resource)
            if status == Action.FAILED:
                # Return Failed if any one is Failed Status in RG.
                return status
            group_status.append(status)
        if Action.RESOLVED in group_status:
            # Return Resolved if none is Failed and any one is resolved
            # Status in RG.
            return Action.RESOLVED
        return Action.OK

    def acknowledge_resource(self, resource, force=False):
        """
        Acknowledge a Single Resource by deleting its stored decision.
        Errors are logged and not propagated.
        :param resource: Name of Resource.
        :param force: When true, delete without checking the status;
            otherwise a resource whose status is Action.FAILED is left
            untouched.
        :return:
        """
        Log.debug(f"Received Acknowledge Request for resource {resource}")
        resource_key = self._resource_file.get("resources", {}).get(resource, {})
        try:
            # The status is only looked up when the request is not forced.
            if force or self.get_resource_status(resource) != Action.FAILED:
                self._loop.run_until_complete(
                    self._consul_call.delete(**resource_key))
        except Exception as e:
            Log.error(f"{e}")

    def acknowledge_resource_group(self, resource_group):
        """
        Acknowledge every resource of a Single Resource Group.
        :param resource_group: Name of Resource Group.
        :return:
        """
        Log.debug(f"Received Acknowledge Request for resource group {resource_group}")
        resources = self._resource_file.get("resource_groups", {}).get(
            resource_group, [])
        for resource in resources:
            self.acknowledge_resource(resource)
class DecisionMaker(object):
    """
    This class is responsible for taking the HA decisions
    such as failover/failback with the help of RuleEngine
    """

    def __init__(self, decisiondb=None):
        """
        Load the rules and the HA configuration.
        :param decisiondb: Decision store used to persist actions. When
            omitted, a new DecisionDB is created for this instance.
        """
        self._rule_engine = RuleEngine(os.path.join(
            const.CORTX_HA_INSTALL_PATH, const.RULES_FILE_PATH))
        # Built here rather than in the signature: a default argument is
        # evaluated once at definition time and would be shared by every
        # instance.
        self._decision_db = DecisionDB() if decisiondb is None else decisiondb
        self._conf = Json(os.path.join(
            const.CORTX_HA_INSTALL_PATH, const.CONF_FILE_PATH)).load()

    async def _get_data_nw_interface(self, host_id):
        """
        Return the data network interfaces configured for a host.
        :param host_id: Host whose network section is read.
        :return: Value stored under const.DATA_IFACE for the host, or an
            empty list when no configuration is loaded.
        """
        interface = []
        if self._conf:
            host_network = self._conf.get(const.NETWORK).get(host_id)
            interface = host_network.get(const.DATA_IFACE)
        return interface

    async def _get_mgmt_nw_interface(self, host_id):
        """
        Return the management network interfaces configured for a host.
        :param host_id: Host whose network section is read.
        :return: Value stored under const.MGMT_IFACE for the host, or an
            empty list when no configuration is loaded.
        """
        interface = []
        if self._conf:
            host_network = self._conf.get(const.NETWORK).get(host_id)
            interface = host_network.get(const.MGMT_IFACE)
        return interface

    async def _get_host_id(self, node_id):
        """
        Map a node id to its host id using the configuration.
        :param node_id: Node id taken from the alert.
        :return: Configured host id, or "" when no configuration is loaded.
        """
        host_id = ""
        if self._conf:
            host_id = self._conf.get(const.NODES).get(node_id)
        return host_id

    async def handle_alert(self, alert):
        """
        Accepts alert in the dict format and validates the same
        alert against set of rules with the help of RuleEngine.
        When the rule engine yields an action, it is stored in the
        decision db. Errors are logged and not propagated.
        :param alert: Alert dict; None is ignored.
        """
        try:
            if alert is None:
                return
            action = self._rule_engine.evaluate_alert(alert)
            if action is not None:
                await self._store_action(alert, action)
        except Exception as e:
            Log.error(f"Error occured during alert handling. {e}")

    async def _store_action(self, alert, action):
        """
        Further parses the alert to store information such as:
        component: Actual Hw component which has been affected
        component_id: FRU_ID
        entity: enclosure/node
        entity_id: resource id
        Nothing is stored when no key information could be derived.
        Errors are logged and not propagated.
        :param alert: Alert dict holding the sensor response.
        :param action: Action to store for the alert.
        """
        try:
            sensor_response = alert.get(const.MESSAGE).get(const.SENSOR_RES_TYPE)
            info_dict = await self._set_db_key_info(sensor_response)
            if info_dict:
                await self._decision_db.store_event(
                    info_dict[const.ENTITY],
                    info_dict[const.ENTITY_ID],
                    info_dict[const.COMPONENT],
                    info_dict[const.COMPONENT_ID],
                    info_dict[const.EVENT_TIME],
                    action)
        except Exception as e:
            Log.error(f"Error occured during storing action. {e}")

    async def _set_db_key_info(self, sensor_response):
        """
        This function derives entity, entity_id, component, component_id,
        event_time from the incoming alert.
        These fields are required to create key for storing the decision in db.
        Key format -
        HA/entity/entity_id/component/component_Id/timestamp
        Examples -
        1. HA/Enclosure/0/controller/1/timestamp
        2. HA/Enclosure/0/controller/2/timestamp
        3. HA/Enclosure/0/fan/0/timestamp
        4. HA/Node/1/raid/0/timestamp
        5. HA/Node/0/IEM/motr/timestamp
        6. HA/Node/1/IEM/s3/timestamp
        :param sensor_response: Sensor response part of the alert.
        :return: Dict with the key fields, or an empty dict for a NIC alert
            whose interface is neither a data nor a management interface.
        """
        info = sensor_response.get(const.INFO)
        resource_type = info.get(const.RESOURCE_TYPE)
        resource_id = info.get(const.RESOURCE_ID)
        node_id = info.get(const.NODE_ID)
        host_id = await self._get_host_id(node_id)

        # 1. Setting event time.
        info_dict = {const.EVENT_TIME: info.get(const.EVENT_TIME)}

        # Here resource type can be in 2 forms -
        # 1. enclosure:fru:disk, node:os:disk_space, node:interface:nw:cable
        # 2. enclosure, iem
        # Splitting the resource type gives the entity and component fields.
        res_list = resource_type.split(':')

        # 2. Setting entity.
        # For IEM alerts we do not get Node/Enclosure in resource type, so
        # the entity is hardcoded to node; the component id comes from the
        # specific info's component field.
        if resource_type == const.IEM:
            component_var = sensor_response.get(const.SPECIFIC_INFO).get(
                const.SPECIFIC_INFO_COMPONENT)
            info_dict[const.ENTITY] = const.NODE
            info_dict[const.COMPONENT] = resource_type
            info_dict[const.COMPONENT_ID] = component_var
        else:
            info_dict[const.ENTITY] = res_list[0]

        # 3. Setting entity_id
        if info_dict[const.ENTITY] == const.NODE:
            info_dict[const.ENTITY_ID] = host_id
        else:
            info_dict[const.ENTITY_ID] = "0"

        # 4. Setting Component.
        # Use the last part of the resource type when it has one.
        if len(res_list) > 1:
            info_dict[const.COMPONENT] = res_list[-1]
        elif info_dict[const.ENTITY] == const.ENCLOSURE:
            # Component is not present in the resource_type field.
            # For storage connectivity:
            # 1. component = connectivity
            # 2. component_id = node/host id
            info_dict[const.COMPONENT] = const.CONNECTIVITY
            info_dict[const.COMPONENT_ID] = host_id

        # 5. Setting component id
        if info_dict[const.COMPONENT] == const.CONTROLLER:
            info_dict[const.COMPONENT_ID] = host_id
        elif resource_type in (const.NIC, const.NIC_CABLE):
            # If resource_type is node:interface:nw or
            # node:interface:nw:cable, the config tells whether it is a
            # data or management interface. BMC interfaces are also
            # reported as NIC alerts but need no action, so an interface
            # found in neither list is ignored (empty result).
            comp_id = await self._get_component_id_for_nic(host_id, resource_id)
            if comp_id:
                info_dict[const.COMPONENT_ID] = comp_id
            else:
                info_dict = {}
        elif resource_type not in (const.IEM, const.ENCLOSURE):
            # IEM and enclosure alerts already had their component id set
            # above; everything else uses the resource id.
            info_dict[const.COMPONENT_ID] = resource_id

        return info_dict

    async def _get_component_id_for_nic(self, host_id, resource_id):
        """
        Classify a network interface as data or management.
        :param host_id: Host whose interface lists are consulted.
        :param resource_id: Interface name from the alert.
        :return: const.DATA, const.MGMT, or "" when the interface is in
            neither list.
        """
        component_id = ""
        # First checking if resource is found in data_nw.
        nw_interface = await self._get_data_nw_interface(host_id)
        if resource_id in nw_interface:
            component_id = const.DATA
        else:
            # Since resource not found in data_nw, search in mgmt_nw.
            nw_interface = await self._get_mgmt_nw_interface(host_id)
            if resource_id in nw_interface:
                component_id = const.MGMT
        return component_id