Beispiel #1
0
 def publish_event(self, healthevent: HealthEvent, healthvalue: str = ""):
     """
     Publish a health event through this object's producer.

     Args:
         healthevent (HealthEvent): Event to publish. When ``healthvalue``
             is given, its ``event_type`` is overwritten before publishing.
         healthvalue (str): Optional JSON text. The ``status`` of the first
             entry under its ``events`` key becomes the event type. When
             left empty (the default) the event type is not touched.
     """
     # The default "" is not valid JSON; decode only when a value was passed.
     if healthvalue:
         healthevent.event_type = json.loads(healthvalue).get(
             "events")[0]["status"]
     # node_id is captured before publishing and re-assigned afterwards.
     # NOTE(review): presumably stringifying/publishing may alter it - confirm.
     node_id = healthevent.node_id
     self.producer.publish(str(healthevent))
     healthevent.node_id = node_id
Beispiel #2
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Convert a raw alert message into a HealthEvent.

        Args:
            msg (str): JSON text; the alert is read from its
                ALERT_ATTRIBUTES.MESSAGE entry.

        Returns:
            HealthEvent: Object built from the extracted alert fields.

        Raises:
            EventParserException: If decoding or any field lookup fails.
        """
        try:
            payload = json.loads(msg).get(ALERT_ATTRIBUTES.MESSAGE)
            response = payload[ALERT_ATTRIBUTES.SENSOR_RESPONSE_TYPE]

            # Fields that live directly on the sensor response.
            event = {
                event_attr.EVENT_ID: response[ALERT_ATTRIBUTES.ALERT_ID],
                event_attr.EVENT_TYPE: response[ALERT_ATTRIBUTES.ALERT_TYPE],
                event_attr.SEVERITY: response[ALERT_ATTRIBUTES.SEVERITY],
            }

            # Remaining fields, mostly taken from the nested "info" section.
            info = response[ALERT_ATTRIBUTES.INFO]
            event.update({
                event_attr.SITE_ID: info[ALERT_ATTRIBUTES.SITE_ID],
                event_attr.RACK_ID: info[ALERT_ATTRIBUTES.RACK_ID],
                event_attr.CLUSTER_ID: info[ALERT_ATTRIBUTES.CLUSTER_ID],
                event_attr.STORAGESET_ID: "TBD",
                event_attr.NODE_ID: info[ALERT_ATTRIBUTES.NODE_ID],
                event_attr.HOST_ID: response[ALERT_ATTRIBUTES.HOST_ID],
                event_attr.RESOURCE_TYPE: info[ALERT_ATTRIBUTES.RESOURCE_TYPE],
                event_attr.TIMESTAMP: info[ALERT_ATTRIBUTES.EVENT_TIME],
                event_attr.RESOURCE_ID: info[ALERT_ATTRIBUTES.RESOURCE_ID],
                event_attr.SPECIFIC_INFO: response[ALERT_ATTRIBUTES.SPECIFIC_INFO],
            })
            Log.debug(f"Parsed {event} schema")

            parsed = HealthEvent.dict_to_object(event)
            Log.info(f"Event {event[event_attr.EVENT_ID]} is parsed and converted to object.")
            return parsed

        except Exception as e:
            raise EventParserException(f"Failed to parse alert. Message: {msg}, Error: {e}")
Beispiel #3
0
    def process_event(self, message: bytes):
        """
        Callback function to receive and process an event.

        The message is decoded as UTF-8 JSON and converted to a HealthEvent.
        The rule manager then evaluates it; when that yields actions, the
        handler returned by ActionFactory is asked to act on them.

        Args:
            message (bytes): Encoded JSON event message (``.decode`` is
                called on it).

        Returns:
            CONSUMER_STATUS.SUCCESS when the event was processed, and also
            when it could not be parsed (the failure is logged and the
            message is acknowledged anyway). CONSUMER_STATUS.FAILED when
            evaluating the rules or running the action raises.
        """
        try:
            event = json.loads(message.decode('utf-8'))
            health_event = HealthEvent.dict_to_object(event)
        except Exception as e:
            Log.error(
                f"Invalid format for event {message}, Error: {e}. Forcefully ack."
            )
            return CONSUMER_STATUS.SUCCESS
        Log.debug(f"Captured {message} for evaluating health monitor.")
        try:
            action_list = self._rule_manager.evaluate(health_event)
            # Nothing to do when no rule matched the event.
            if action_list:
                Log.info(f"Evaluated {health_event} with action {action_list}")
                action_handler = ActionFactory.get_action_handler(
                    health_event, action_list)
                action_handler.act(health_event, action_list)
            return CONSUMER_STATUS.SUCCESS
        except Exception as e:
            Log.error(
                f"Failed to process {message} error: {e} {traceback.format_exc()}"
            )
            return CONSUMER_STATUS.FAILED
Beispiel #4
0
 def publish_event(self, healthevent: HealthEvent):
     """
     Publish the string form of a health event through the producer.

     The event's node_id is read before publishing and written back
     afterwards, so it holds the same value once this method returns.
     """
     publish = self.producer.publish
     saved_node_id = healthevent.node_id
     publish(str(healthevent))
     healthevent.node_id = saved_node_id
Beispiel #5
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Build a HealthEvent from a cluster resource alert.

        Args:
            msg (str): Alert given as the text of a Python literal; it is
                read with ast.literal_eval.

        Returns:
            HealthEvent: Object created from the assembled event dictionary.

        Raises:
            EventParserException: If evaluating the message or reading any
                of its fields fails.
        """
        try:
            # Evaluate the literal, then round-trip it through JSON.
            alert = json.loads(json.dumps(ast.literal_eval(msg)))

            # Event id: current epoch seconds followed by a random UUID hex.
            new_event_id = str(int(time.time())) + str(uuid.uuid4().hex)

            resource_name = alert["_resource_name"]
            kind = alert["_resource_type"]
            status = alert["_event_type"]
            alert_time = alert["_timestamp"]
            generation = alert["_generation_id"]

            # The resource name doubles as storageset, node, host and resource id.
            event = {
                EVENT_ATTRIBUTES.EVENT_ID: new_event_id,
                EVENT_ATTRIBUTES.EVENT_TYPE: status,
                EVENT_ATTRIBUTES.SEVERITY: StatusMapper.EVENT_TO_SEVERITY_MAPPING[status],
                # TODO: site, rack and cluster ids should be fetched from confstore
                EVENT_ATTRIBUTES.SITE_ID: self.site_id,
                EVENT_ATTRIBUTES.RACK_ID: self.rack_id,
                EVENT_ATTRIBUTES.CLUSTER_ID: self.cluster_id,
                EVENT_ATTRIBUTES.STORAGESET_ID: resource_name,
                EVENT_ATTRIBUTES.NODE_ID: resource_name,
                EVENT_ATTRIBUTES.HOST_ID: resource_name,
                EVENT_ATTRIBUTES.RESOURCE_TYPE: kind,
                EVENT_ATTRIBUTES.TIMESTAMP: alert_time,
                EVENT_ATTRIBUTES.RESOURCE_ID: resource_name,
                EVENT_ATTRIBUTES.SPECIFIC_INFO: {
                    "generation_id": generation,
                    "pod_restart": 0
                },
            }

            Log.debug(f"Parsed {event} schema")
            parsed = HealthEvent.dict_to_object(event)
            Log.debug(
                f"Event {event[EVENT_ATTRIBUTES.EVENT_ID]} is parsed and converted to object."
            )
            return parsed

        except Exception as err:
            raise EventParserException(
                f"Failed to parse cluster resource alert. Message: {msg}, Error: {err}"
            )
Beispiel #6
0
 def _update_health(self, resource_type: str, resource_id: str,
                    event_type: str):
     """
     Push a health event of the given type for a resource into system health.

     A template is requested from system health (with resource_id passed as
     the node id), converted to a HealthEvent and processed.

     Args:
         resource_type (str): Only used in the debug log line.
         resource_id (str): Passed as ``nodeid`` when requesting the template.
         event_type (str): Event type placed in the template.
     """
     system_health = self._system_health
     template = system_health.get_health_event_template(
         nodeid=resource_id, event_type=event_type)
     event_obj = HealthEvent.dict_to_object(template)
     system_health.process_event(event_obj)
     Log.debug(
         f"{resource_type}:{resource_id} health updated to: {template}"
     )
Beispiel #7
0
    def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
        """
        Stop the node with the given node id by putting it in standby.

        Args:
            node_id (str): Node ID from cluster nodes.
            timeout (int): Accepted but not used by this implementation.
            **op_kwargs: Forwarded unchanged to the parent class's stop().

        Returns:
            ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
                status: Succeeded, Failed, InProgress
                The parent's status is returned as-is when it already
                reports Succeeded (node already offline) or Failed.

        Raises:
            ClusterManagerError: If any step after the node-name lookup fails.
        """
        # Get the node_name (pvtfqdn) from nodeid and raise exception if node_id is not valid
        node_name = ConfigManager.get_node_name(node_id=node_id)
        try:
            stop_status = json.loads(super().stop(node_id, **op_kwargs))
            # A final verdict from the parent (node already offline, or node in
            # failed state) is handed back without any further action.
            if stop_status is not None and stop_status["status"] in (
                    const.STATUSES.SUCCEEDED.value,
                    const.STATUSES.FAILED.value):
                return stop_status

            # Put node in standby mode
            # NOTE(review): the "--wait" text is passed as run_cmd's second
            # positional argument rather than appended to the command string;
            # confirm against run_cmd's signature that this is intended.
            self._execute.run_cmd(
                const.PCS_NODE_STANDBY.replace("<node>", node_name),
                f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
            Log.info(f"Executed node standby for node {node_id}")
            # TODO: EOS-23859 : STOP NODE - Use PCS_STOP_NODE from const.py with timeout value
            status = f"Standby for node {node_id} is in progress"

            # Update node health
            # TODO : Health event update to be removed once fault_tolerance branch is merged
            initial_event = self._system_health.get_health_event_template(
                nodeid=node_id, event_type=HEALTH_EVENTS.FAULT.value)
            Log.debug(
                f"Node health : {initial_event} updated for node {node_id}")
            health_event = HealthEvent.dict_to_object(initial_event)
            self._system_health.process_event(health_event)
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": status,
                "error": ""
            }

        except Exception as e:
            # Chain the original exception so its traceback is kept as the cause.
            raise ClusterManagerError(
                f"Failed to stop node {node_id}, Error: {e}") from e
Beispiel #8
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Build a HealthEvent from a cluster resource alert with header/payload.

        Args:
            msg (str): Alert given as the text of a Python literal; it is
                read with ast.literal_eval.

        Returns:
            HealthEvent: Object created from the assembled event dictionary.

        Raises:
            EventParserException: If evaluating the message or reading any
                required field fails.
        """
        try:
            # Evaluate the literal, then round-trip it through JSON.
            message = json.dumps(ast.literal_eval(msg))
            cluster_resource_alert = json.loads(message)

            header = cluster_resource_alert[EventAttr.EVENT_HEADER.value]
            timestamp = header[EventAttr.TIMESTAMP.value]
            event_id = header[EventAttr.EVENT_ID.value]

            payload = cluster_resource_alert[EventAttr.EVENT_PAYLOAD.value]
            source = payload[HealthAttr.SOURCE.value]
            node_id = payload[HealthAttr.NODE_ID.value]
            resource_type = payload[HealthAttr.RESOURCE_TYPE.value]
            resource_id = payload[HealthAttr.RESOURCE_ID.value]
            event_type = payload[HealthAttr.RESOURCE_STATUS.value]
            specific_info = payload[HealthAttr.SPECIFIC_INFO.value]

            # For node alerts carrying a generation id, reduce specific_info
            # to that id plus a cleared pod_restart flag. ``.get`` keeps a
            # specific_info without "generation_id" from raising KeyError;
            # in that case it is passed through unchanged.
            if resource_type == CLUSTER_ELEMENTS.NODE.value and specific_info:
                generation_id = specific_info.get("generation_id")
                if generation_id:
                    specific_info = {"generation_id": generation_id, "pod_restart": 0}

            event = {
                event_attr.SOURCE : source,
                event_attr.EVENT_ID : event_id,
                event_attr.EVENT_TYPE : event_type,
                event_attr.SEVERITY : StatusMapper.EVENT_TO_SEVERITY_MAPPING[event_type],
                event_attr.SITE_ID : self.site_id, # TODO: Should be fetched from confstore
                event_attr.RACK_ID : self.rack_id, # TODO: Should be fetched from confstore
                event_attr.CLUSTER_ID : self.cluster_id, # TODO: Should be fetched from confstore
                event_attr.STORAGESET_ID : node_id,
                event_attr.NODE_ID : node_id,
                event_attr.HOST_ID : node_id,
                event_attr.RESOURCE_TYPE : resource_type,
                event_attr.TIMESTAMP : timestamp,
                event_attr.RESOURCE_ID : resource_id,
                event_attr.SPECIFIC_INFO : specific_info
            }

            Log.debug(f"Parsed {event} schema")
            health_event = HealthEvent.dict_to_object(event)
            Log.debug(f"Event {event[event_attr.EVENT_ID]} is parsed and converted to object.")
            return health_event

        except Exception as err:
            # Chain the original exception so its traceback is kept as the cause.
            raise EventParserException(
                f"Failed to parse cluster resource alert. Message: {msg}, Error: {err}") from err
Beispiel #9
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Convert a JSON alert into a HealthEvent.

        Args:
            msg (str): JSON text with a 'sensor_response_type' section.

        Returns:
            HealthEvent: Object built from the extracted fields.

        Raises:
            EventAnalyzerError: If decoding or any field lookup fails.
        """
        try:
            response = json.loads(msg)['sensor_response_type']

            # Fields that live directly on the sensor response.
            event = {
                "event_id": response['alert_id'],
                "event_type": response['alert_type'],
                "severity": response['severity'],
            }

            # Remaining fields, mostly taken from the nested 'info' section.
            info = response['info']
            event.update({
                "site_id": info['site_id'],
                "rack_id": info['rack_id'],
                "cluster_id": info['cluster_id'],
                "storageset_id": "TBD",
                "node_id": info['node_id'],
                "host_id": response['host_id'],
                "resource_type": info['resource_type'],
                "timestamp": info['event_time'],
                "resource_id": info['resource_id'],
                "specific_info": response['specific_info'],
            })

            return HealthEvent.dict_to_object(event)

        except Exception as e:
            raise EventAnalyzerError(
                f"Failed to parse alert. Message: {msg}, Error: {e}")
Beispiel #10
0
def main():
    """Subscribe "hare" to node "offline" events, then publish one fake offline event."""
    component, resource_type, state = "hare", "node", "offline"
    # import pudb.remote
    # pudb.remote.set_trace(term_size=(130, 40), port=9998)

    # The component has to be registered before a fake event is submitted
    # (just to make sure that the message will be sent).
    event_manager = EventManager.get_instance()
    subscription = SubscribeEvent(resource_type, [state])
    event_manager.subscribe(component, [subscription])
    handler = NodeActionHandler()

    # Positional arguments are kept exactly in the order HealthEvent is given them.
    event_args = (
        "event_id", HEALTH_STATUSES.OFFLINE.value, "severity",
        "1", "1", "e766bd52-c19c-45b6-9c91-663fd8203c2e",
        "storage-set-1", "localhost", "srvnode-1.mgmt.public",
        "node", "16215009572", "iem", "Description",
    )
    handler.publish_event(HealthEvent(*event_args))
Beispiel #11
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Convert an IEM alert message into a HealthEvent.

        The host name is extracted from the alert description and looked up
        in confstore to obtain the node id used for the event.

        Args:
            msg (str): JSON text; the alert is read from its
                ALERT_ATTRIBUTES.MESSAGE entry.

        Returns:
            HealthEvent: Object built from the extracted fields.

        Raises:
            EventParserException: If decoding, host lookup or any field
                lookup fails.
        """
        try:
            iem_alert = json.loads(msg).get(ALERT_ATTRIBUTES.MESSAGE)
            response = iem_alert[ALERT_ATTRIBUTES.SENSOR_RESPONSE_TYPE]
            info = response[ALERT_ATTRIBUTES.INFO]

            # Parse hostname and convert to node id: take the first
            # "host=..." match, cut it at the first ";", and keep what
            # follows the first "=".
            iem_description = info[ALERT_ATTRIBUTES.DESCRIPTION]
            host_matches = re.findall("host=.+", iem_description)
            host_field = re.split(";", host_matches[0])[0]
            hostname = re.split("=", host_field)[1]
            key_val = self._confstore.get(f"{PVTFQDN_TO_NODEID_KEY}/{hostname}")
            _, node_id = key_val.popitem()

            event = {
                event_attr.EVENT_ID: response[ALERT_ATTRIBUTES.ALERT_ID],
                event_attr.EVENT_TYPE: response[ALERT_ATTRIBUTES.ALERT_TYPE],
                event_attr.SEVERITY: response[ALERT_ATTRIBUTES.SEVERITY],
                event_attr.SITE_ID: info[ALERT_ATTRIBUTES.SITE_ID],
                event_attr.RACK_ID: info[ALERT_ATTRIBUTES.RACK_ID],
                event_attr.CLUSTER_ID: info[ALERT_ATTRIBUTES.CLUSTER_ID],
                event_attr.STORAGESET_ID: "TBD",
                # TODO: Temporary fix till IEM framework is available.
                event_attr.NODE_ID: node_id,
                event_attr.HOST_ID: response[ALERT_ATTRIBUTES.HOST_ID],
                event_attr.RESOURCE_TYPE: response[ALERT_ATTRIBUTES.SPECIFIC_INFO][ALERT_ATTRIBUTES.MODULE].lower(),
                event_attr.TIMESTAMP: info[ALERT_ATTRIBUTES.EVENT_TIME],
                event_attr.RESOURCE_ID: node_id,
                event_attr.SPECIFIC_INFO: response[ALERT_ATTRIBUTES.SPECIFIC_INFO],
            }

            # To be removed after HA starts populating IEM messages:
            # a node alert with warning severity is reported as a failed event.
            is_node = event.get(event_attr.RESOURCE_TYPE) == CLUSTER_ELEMENTS.NODE.value
            if is_node and event.get(event_attr.SEVERITY) == EVENT_SEVERITIES.WARNING.value:
                event[event_attr.EVENT_TYPE] = HEALTH_EVENTS.FAILED.value

            Log.debug(f"Parsed {event} schema")
            parsed = HealthEvent.dict_to_object(event)
            Log.info(f"Event {event[event_attr.EVENT_ID]} is parsed and converted to object.")
            return parsed

        except Exception as e:
            raise EventParserException(f"Failed to parse IEM. Message: {msg}, Error: {e}")
 def _get_new_event(self, event_type, resource_type, resource_id, subelement_event) -> HealthEvent:
     """
     Create a new HealthEvent derived from a sub-element's event.

     Args:
         event_type: Event type for the new event.
         resource_type: Resource type for the new event.
         resource_id: Resource id for the new event.
         subelement_event: Event whose source, ids, severity and timestamp
             are copied into the new event.

     Returns:
         HealthEvent: The new event; its specific_info is always None.
     """
     new_event =  HealthEvent(
         source=subelement_event.source,
         event_id=subelement_event.event_id,
         event_type=event_type,
         severity=subelement_event.severity,
         site_id=subelement_event.site_id,
         rack_id=subelement_event.rack_id,
         cluster_id=subelement_event.cluster_id,
         storageset_id=subelement_event.storageset_id,
         node_id=subelement_event.node_id,
         host_id=subelement_event.host_id,
         resource_type=resource_type,
         timestamp=subelement_event.timestamp,
         resource_id=resource_id,
         specific_info=None
     )
     # The status shown is the new event's type (previously resource_id was
     # printed a second time by mistake).
     Log.info(f"New event is created: {resource_type}:{resource_id}, status: {event_type}")
     return new_event
Beispiel #13
0
 def _add_health_event(self,
                       node_id: str,
                       resource_type: str,
                       resource_id: str,
                       specific_info: dict = None) -> None:
     """
     Record an initial health event for a resource (e.g. Node, CVG, disk).

     The event is created with source HA, event type UNKNOWN and
     informational severity, then handed to a SystemHealth instance
     built on this object's confstore.

     Args:
         node_id (str): node id
         resource_type (str): Resource type will be Node, CVG, disk, etc.
         resource_id (str): Resource id
         specific_info(dict): Ex. cvg_id for resource Disk
     """
     created_at = str(int(time.time()))
     # Event id: creation time in epoch seconds followed by a random UUID hex.
     unique_id = created_at + str(uuid.uuid4().hex)
     event_fields = {
         EVENT_ATTRIBUTES.SOURCE: HEALTH_EVENT_SOURCES.HA.value,
         EVENT_ATTRIBUTES.EVENT_ID: unique_id,
         EVENT_ATTRIBUTES.EVENT_TYPE: HEALTH_EVENTS.UNKNOWN.value,
         EVENT_ATTRIBUTES.SEVERITY: EVENT_SEVERITIES.INFORMATIONAL.value,
         EVENT_ATTRIBUTES.SITE_ID: self._site_id,
         EVENT_ATTRIBUTES.RACK_ID: self._rack_id,
         EVENT_ATTRIBUTES.CLUSTER_ID: self._cluster_id,
         EVENT_ATTRIBUTES.STORAGESET_ID: self._storageset_id,
         EVENT_ATTRIBUTES.NODE_ID: node_id,
         EVENT_ATTRIBUTES.HOST_ID: None,
         EVENT_ATTRIBUTES.RESOURCE_TYPE: resource_type,
         EVENT_ATTRIBUTES.TIMESTAMP: created_at,
         EVENT_ATTRIBUTES.RESOURCE_ID: resource_id,
         EVENT_ATTRIBUTES.SPECIFIC_INFO: specific_info,
     }
     Log.debug(
         f"Adding initial health {event_fields} for {resource_type} : {resource_id}"
     )
     event_obj = HealthEvent.dict_to_object(event_fields)
     SystemHealth(self._confstore).process_event(event_obj)
Beispiel #14
0
    MSG = True
    return CONSUMER_STATUS.SUCCESS_STOP


if __name__ == '__main__':
    try:
        print("********Event Publisher********")
        event_manager = EventManager.get_instance()
        component = "csm"
        resource_type = "node:fru:disk"
        state = "failed"
        message_type = event_manager.subscribe(
            'csm', [SubscribeEvent(resource_type, [state])])
        print(f"Subscribed {component}, message type is {message_type}")
        health_event = HealthEvent("csm", "1", "failed", "fault", "1", "1",
                                   "_1", "1", "1", "1", "node", "16215009572",
                                   "1", None)
        action_event = RecoveryActionEvent(health_event)
        event_manager.publish(action_event.get_event())
        print("Consuming the action event")
        message_consumer = MessageBus.get_consumer(
            consumer_id="1",
            consumer_group='test_publisher',
            message_type=message_type,
            callback=receive)
        message_consumer.start()
        while not MSG:
            time.sleep(2)
            print("waiting for msg")
        message_consumer.stop()
        unsubscribe = event_manager.unsubscribe(
    MSG = True
    return CONSUMER_STATUS.SUCCESS_STOP


if __name__ == '__main__':
    try:
        print("********Event Publisher********")
        event_manager = EventManager.get_instance()
        component = "csm"
        resource_type = "node:fru:disk"
        state = "failed"
        message_type = event_manager.subscribe(
            'csm', [SubscribeEvent(resource_type, [state])])
        print(f"Subscribed {component}, message type is {message_type}")
        health_event = HealthEvent("event_1", "failed", "fault", "site_1",
                                   "rack_1", "cluster_1", "storageset_1",
                                   "node_1", "abcd.com", "node:fru:disk",
                                   "16215009572", "disk_1", None)
        action_event = RecoveryActionEvent(health_event)
        event_manager.publish(action_event)
        print("Consuming the action event")
        message_consumer = MessageBus.get_consumer(
            consumer_id="1",
            consumer_group='test_publisher',
            message_type=message_type,
            callback=receive)
        message_consumer.start()
        while not MSG:
            time.sleep(2)
            print("waiting for msg")
        message_consumer.stop()
        unsubscribe = event_manager.unsubscribe(
Beispiel #16
0
if __name__ == '__main__':
    # TODO: Import and use config_manager.py
    Conf.init()
    Conf.load(const.HA_GLOBAL_INDEX, f"yaml://{const.SOURCE_CONFIG_FILE}")
    log_path = Conf.get(const.HA_GLOBAL_INDEX, f"LOG{_DELIM}path")
    log_level = Conf.get(const.HA_GLOBAL_INDEX, f"LOG{_DELIM}level")
    Log.init(service_name='ha_system_health', log_path=log_path, level=log_level)

    try:
        store = ConfigManager.get_confstore()
        health = SystemHealth(store)
        """
        Test case 1
        """
        event = HealthEvent("event_id", "fault", "severity", "1", "1", "e766bd52-c19c-45b6-9c91-663fd8203c2e", "storage-set-1",
                            "2", "srvnode-1.mgmt.public", "node", "16215009572", "iem", "Description")
        health.process_event(event)
        node_info = health.get_node_status(node_id="2")
        node_status = node_info['status']
        if node_status != const.NODE_STATUSES.CLUSTER_OFFLINE.value:
            Log.error("Test case 1 failed : node status must be 'offline' for event 'fault'")

        """
        Test case 2
        """
        event = HealthEvent("event_id", "fault_resolved", "severity", "1", "1", "e766bd52-c19c-45b6-9c91-663fd8203c2e", "storage-set-1",
                            "2", "srvnode-1.mgmt.public", "node", "16215009572", "iem", "Description")
        health.process_event(event)
        node_info = health.get_node_status(node_id="2")
        node_status = node_info['status']
        if node_status != const.NODE_STATUSES.ONLINE.value:
Beispiel #17
0
    def process_event(self, healthevent: HealthEvent):
        """
        Process Event method. This method could be called for updating the health status.

        The event type is mapped to a status, the currently stored health of
        the addressed component is read, and an updated health record is
        passed to _check_and_update. When the stored record carries
        specific_info, generation ids are compared first (see the inline
        comments for the individual cases).

        Args:
            healthevent (HealthEvent): Incoming event. Its event_type and
                specific_info may be overwritten while it is processed.

        Raises:
            HaSystemHealthException: If any step fails; the original error
                is logged before this is raised.
        """

        # TODO: Check the user and see if allowed to update the system health.
        try:
            status = self.statusmapper.map_event(healthevent.event_type)
            component = SystemHealthComponents.get_component(
                healthevent.resource_type)

            # Get the health update hierarchy
            self.update_hierarchy = SystemHealthHierarchy.get_hierarchy(
                component)
            # next_component is the entry following this component in the
            # hierarchy list, or None when the component is the last entry.
            if (len(self.update_hierarchy) -
                    1) > self.update_hierarchy.index(component):
                next_component = self.update_hierarchy[
                    self.update_hierarchy.index(component) + 1]
            else:
                next_component = None

            # Get the component type and id received in the event.
            # The type is the last ':'-separated part of resource_type.
            component_type = healthevent.resource_type.split(':')[-1]
            component_id = healthevent.resource_id
            Log.info(
                f"SystemHealth: Processing {component}:{component_type}:{component_id} with status {status}"
            )

            # Update the node map
            self.node_id = healthevent.node_id
            self.node_map = {
                'cluster_id': healthevent.cluster_id,
                'site_id': healthevent.site_id,
                'rack_id': healthevent.rack_id,
                'storageset_id': healthevent.storageset_id
            }

            # Read the currently stored health value
            # (node_id is passed for node_id, server_id and storage_id alike).
            current_health = self.get_status_raw(
                component,
                component_id,
                comp_type=component_type,
                cluster_id=healthevent.cluster_id,
                site_id=healthevent.site_id,
                rack_id=healthevent.rack_id,
                storageset_id=healthevent.storageset_id,
                node_id=healthevent.node_id,
                server_id=healthevent.node_id,
                storage_id=healthevent.node_id)

            current_timestamp = str(int(time.time()))
            if current_health:
                # The stored value is JSON text; only its first event is inspected.
                current_health_dict = json.loads(current_health)
                specific_info = current_health_dict["events"][0][
                    "specific_info"]
                # current_health is already known to be truthy here, so this
                # effectively tests the stored specific_info only.
                if current_health and specific_info:
                    # If health is already stored and its a node_health, check further
                    stored_genration_id = current_health_dict["events"][0][
                        "specific_info"]["generation_id"]
                    incoming_generation_id = healthevent.specific_info[
                        "generation_id"]
                    # NOTE(review): despite its name, this value is read from
                    # the stored record (current_health_dict), not from the
                    # incoming event.
                    incoming_health_status = current_health_dict["events"][0][
                        "status"]
                    pod_restart_val = current_health_dict["events"][0][
                        "specific_info"]["pod_restart"]
                    # Update the current health value itself.
                    latest_health = EntityHealth.read(current_health)
                    if stored_genration_id != incoming_generation_id:
                        if incoming_health_status == status:
                            # The generation ids differ while the stored node
                            # status equals the status mapped from the incoming
                            # event: means online event received first
                            # instead of failed event in delete scenario
                            healthevent.specific_info = {
                                "generation_id": stored_genration_id,
                                "pod_restart": 1
                            }
                            healthevent.event_type = "failed"
                            updated_health = SystemHealth.create_updated_event_object(
                                healthevent.timestamp, current_timestamp,
                                healthevent.event_type,
                                healthevent.specific_info, latest_health)
                            # Create a "failed" event and update it in system health and publish
                            self._check_and_update(current_health,
                                                   updated_health, healthevent,
                                                   next_component)
                            current_health = updated_health
                            # Now create an "online" event and update it in system health and publish
                            healthevent.specific_info = {
                                "generation_id": incoming_generation_id,
                                "pod_restart": 1
                            }
                            healthevent.event_type = "online"
                            updated_health = SystemHealth.create_updated_event_object(
                                healthevent.timestamp, current_timestamp,
                                healthevent.event_type,
                                healthevent.specific_info, latest_health)
                            self._check_and_update(current_health,
                                                   updated_health, healthevent,
                                                   next_component)
                        elif pod_restart_val is not None and pod_restart_val:
                            # Check the pod_restart value associated with Node, if its 1,
                            # means this alert is already updated. No need to send the alert again.
                            # Just need to reset the pod_restart value
                            key = self._prepare_key(component, cluster_id=self.node_map['cluster_id'], \
                                site_id=self.node_map['site_id'], rack_id=self.node_map['rack_id'], \
                                node_id=self.node_id)
                            latest_health_dict = json.loads(current_health)
                            # The stored generation id is kept; only
                            # pod_restart is cleared.
                            new_spec_info = {
                                "generation_id": stored_genration_id,
                                "pod_restart": 0
                            }
                            latest_health_dict["events"][0][
                                "specific_info"] = new_spec_info
                            updated_health = EntityHealth.write(
                                latest_health_dict)
                            # Written straight to the store; _check_and_update
                            # is not called on this path.
                            self.healthmanager.set_key(key, updated_health)
                    else:
                        # current health is there and generation id is also already present.
                        # That means its a normal failure scenario
                        updated_health = SystemHealth.create_updated_event_object(
                            healthevent.timestamp, current_timestamp, status,
                            healthevent.specific_info, latest_health)
                        self._check_and_update(current_health, updated_health,
                                               healthevent, next_component)
                else:
                    # Update hierarchical components. such as site, rack
                    latest_health = EntityHealth.read(current_health)
                    updated_health = SystemHealth.create_updated_event_object(
                        healthevent.timestamp, current_timestamp, status,
                        healthevent.specific_info, latest_health)
                    self._check_and_update(current_health, updated_health,
                                           healthevent, next_component)
            else:
                # Health value not present in the store currently, create now.
                latest_health = EntityHealth()
                updated_health = SystemHealth.create_updated_event_object(
                    healthevent.timestamp, current_timestamp, status,
                    healthevent.specific_info, latest_health)
                self._check_and_update(current_health, updated_health,
                                       healthevent, next_component)
        except Exception as err:
            # The detailed error only goes to the log; the raised exception
            # carries a fixed message.
            Log.error(
                f"Failed processing system health event with Error: {err}")
            raise HaSystemHealthException(
                "Failed processing system health event")
        state = K8S_ALERT_STATUS.STATUS_FAILED.value
        message_type = event_manager.subscribe(
            'hare', [SubscribeEvent(resource_type, [state])])
        print(f"Subscribed {component}, message type is {message_type}")
        k8s_event = K8SAlert("cortx", "node2", "cortx-data123",
                             K8S_ALERT_STATUS.STATUS_FAILED.value,
                             K8S_ALERT_RESOURCE_TYPE.RESOURCE_TYPE_POD.value,
                             "16215909572")

        timestamp = str(int(time.time()))
        event_id = timestamp + str(uuid.uuid4().hex)
        event_type = k8s_event.status
        if k8s_filter.filter_event(json.dumps(k8s_event.__dict__)):
            health_event = HealthEvent(event_id, event_type,
                                       EVENT_SEVERITIES.CRITICAL.value, "1",
                                       "1", "1", "1", "srvnode_1", "srvnode_1",
                                       "pod", "16215909572", "cortx-data-pod",
                                       {"namespace": "cortx"})
            recovery_action_event = RecoveryActionEvent(health_event)
            event_manager.publish(recovery_action_event)
        else:
            print("Event is dropped as it doesn't meet criteria")
            sys.exit(0)
        print("Consuming the action event")
        message_consumer = MessageBus.get_consumer(
            consumer_id="1",
            consumer_group='test_publisher',
            message_type=message_type,
            callback=receive)
        message_consumer.start()
        while not MSG:
Beispiel #19
0
    def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
        """
        Stop (poweroff) node with node_id.

        The parent class' stop() is consulted first. If its (JSON) result
        already carries a Succeeded or Failed status, that result is returned
        unchanged. Otherwise the node is put in standby (after stopping its
        services and storage enclosure when "storageoff" is set), a fault
        health event is processed for the node and, when "poweroff" is set,
        the node is powered off through the fencing agent.

        Args:
            node_id (str): Node ID from cluster nodes.
            timeout (int): Accepted for interface compatibility; it is not
                referenced in this method's body.
            **op_kwargs: Optional operation flags, also forwarded to the
                parent's stop():
                poweroff: power the node off after standby (default False).
                storageoff: stop services (except sspl-ll) and the storage
                    enclosure before standby (default False).
        Returns:
            ([dict]): On the full path: {"status": "", "error": "", "output": ""}
                with status set to Succeeded. On the early-return path: the
                parsed result of the parent's stop().
        Raises:
            ClusterManagerError: If any step after the node name lookup fails.
                The original exception is chained as the cause.
        """
        # A missing flag or an explicit None both mean "not requested".
        poweroff = op_kwargs.get("poweroff")
        if poweroff is None:
            poweroff = False
        storageoff = op_kwargs.get("storageoff")
        if storageoff is None:
            storageoff = False

        # Get the node_name (pvtfqdn) from node_id. This lookup sits outside the
        # try block, so anything it raises propagates without being wrapped.
        node_name = ConfigManager.get_node_name(node_id=node_id)
        try:
            stop_status = json.loads(super().stop(node_id, **op_kwargs))
            if stop_status is not None and stop_status["status"] in (
                    const.STATUSES.SUCCEEDED.value,
                    const.STATUSES.FAILED.value):
                # Succeeded: node is already in offline state.
                # Failed: node is in failed state.
                # Either way there is nothing more to do here.
                return stop_status

            if storageoff:
                # Stop services on node except sspl-ll
                self._controllers[const.SERVICE_CONTROLLER].stop(
                    node_id=node_name,
                    excludeResourceList=[RESOURCE.SSPL_LL.value])

                # Stop the storage enclosure on the node
                actuator_mgr = ActuatorManager()
                actuator_mgr.enclosure_stop(node_name)
                Log.info(f"Enclosure stopped for {node_name}")
                # TODO: Update enclosure health

            # Put node in standby mode (needed with and without storageoff).
            # NOTE(review): the " --wait=..." option is passed as a second
            # positional argument instead of being appended to the command
            # string; confirm this matches run_cmd's signature.
            self._execute.run_cmd(
                const.PCS_NODE_STANDBY.replace("<node>", node_name),
                f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
            Log.info(f"Executed node standby for node {node_id}")

            if storageoff:
                # Resources are cleared only when the storage was stopped too.
                self._controllers[const.SERVICE_CONTROLLER].clear_resources(
                    node_id=node_name)
            status = f"For node {node_id}, Standby is in progress"

            # Update node health
            # TODO : Health event update to be removed once fault_tolerance branch is merged
            initial_event = self._system_health.get_health_event_template(
                nodeid=node_id, event_type=HEALTH_EVENTS.FAULT.value)
            Log.debug(
                f"Node health : {initial_event} updated for node {node_id}")
            health_event = HealthEvent.dict_to_object(initial_event)
            self._system_health.process_event(health_event)

            # Node power off
            if poweroff:
                self._execute.run_cmd(
                    const.DISABLE_STONITH.replace("<node>", node_name))
                self.fencing_agent.power_off(node_id=node_name)
                status = f"Power off for node {node_id} is in progress"
            # This message is logged on the standby-only path as well.
            Log.info(f"Node power off successfull. status : {status}")
            # TODO : return status should be changed according to passed parameters
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "error": "",
                "output": status
            }
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ClusterManagerError(
                f"Failed to stop node {node_id}, Error: {e}") from e
Beispiel #20
0
    def process_event(self, healthevent: HealthEvent) -> None:
        """
        Process a health event and update the stored health status.

        The event is mapped to a status, the stored health of the affected
        component is read, and an updated health object is built and handed
        to _check_and_update(). For node components reported by the monitor
        source, the stored and incoming "generation_id" values are compared
        to handle pod restarts (see the inline comments).

        Side effects visible in this method: sets self.update_hierarchy,
        self.node_id, self.cvg_id and self.node_map, and may modify
        healthevent.specific_info and healthevent.event_type.

        Args:
            healthevent (HealthEvent): Event to process.
        Raises:
            HaSystemHealthException: If any exception occurs while processing.
                The original error is logged, not chained.
        """

        # TODO: Check the user and see if allowed to update the system health.
        try:
            status = self.statusmapper.map_event(healthevent)
            component = SystemHealthComponents.get_component(healthevent.resource_type)

            # Get the health update hierarchy
            self.update_hierarchy = SystemHealthHierarchy.get_hierarchy(component)
            # Pick the element that follows this component in the hierarchy, if any.
            if (len(self.update_hierarchy) - 1) > self.update_hierarchy.index(component):
                next_component = self.update_hierarchy[self.update_hierarchy.index(component) + 1]
            else:
                next_component = None

            # Disabling the hierarchical updates in HA:
            # the value computed above is deliberately overridden.
            next_component = None

            # Get the component type and id received in the event.
            # The type is the last ':'-separated segment of resource_type.
            component_type = healthevent.resource_type.split(':')[-1]
            component_id = healthevent.resource_id
            Log.info(f"SystemHealth: Processing {component}:{component_type}:{component_id} with status {status}")

            # Update the node map
            self.node_id = healthevent.node_id
            self.cvg_id = None
            if component_type == CLUSTER_ELEMENTS.DISK.value:
                # Disk events need a cvg_id; normalise specific_info to a dict first.
                if not isinstance(healthevent.specific_info, dict):
                    healthevent.specific_info = {}
                if healthevent.specific_info.get(NODE_MAP_ATTRIBUTES.CVG_ID.value):
                    # cvg_id was supplied with the event.
                    self.cvg_id = healthevent.specific_info[NODE_MAP_ATTRIBUTES.CVG_ID.value]
                else:
                    # cvg_id missing: look it up by node id and disk id.
                    match_criteria = {CLUSTER_ELEMENTS.NODE.value: self.node_id,
                                      component_type: component_id}
                    cvg_list = self._get_cvg_list(healthevent, match_criteria)
                    if len(cvg_list) > 1:
                        Log.error(f"Expected only 1 cvg_id, but received {len(cvg_list)}")
                    # Use the first match (or None) and store it back on the event.
                    self.cvg_id = cvg_list[0] if cvg_list else None
                    healthevent.specific_info[NODE_MAP_ATTRIBUTES.CVG_ID.value] = self.cvg_id

            self.node_map = {'cluster_id':healthevent.cluster_id, 'site_id':healthevent.site_id,
                    'rack_id':healthevent.rack_id, 'storageset_id':healthevent.storageset_id}

            # Read the currently stored health value
            # (node_id is passed as node_id, server_id and storage_id alike).
            current_health = self.get_status_raw(component, component_id, comp_type=component_type,
                                        cluster_id=healthevent.cluster_id, site_id=healthevent.site_id,
                                        rack_id=healthevent.rack_id, storageset_id=healthevent.storageset_id,
                                        node_id=healthevent.node_id, server_id=healthevent.node_id,
                                        storage_id=healthevent.node_id, cvg_id=self.cvg_id)

            current_timestamp = str(int(time.time()))
            if current_health:
                # Stored health is a JSON string; only its first event is inspected.
                current_health_dict = json.loads(current_health)
                specific_info = current_health_dict["events"][0]["specific_info"]
                if (component_type == CLUSTER_ELEMENTS.NODE.value) and specific_info \
                    and healthevent.source == HEALTH_EVENT_SOURCES.MONITOR.value:
                    # If health is already stored and it is a node health, check further.
                    # NOTE(review): the lookups below assume "generation_id" and
                    # "pod_restart" keys exist in both stored and incoming
                    # specific_info; a missing key ends up in the except below.
                    stored_genration_id = current_health_dict["events"][0]["specific_info"]["generation_id"]
                    stored_status = current_health_dict["events"][0]["status"]
                    incoming_generation_id = healthevent.specific_info["generation_id"]
                    incoming_health_status = healthevent.event_type
                    pod_restart_val = current_health_dict["events"][0]["specific_info"]["pod_restart"]
                    # Update the current health value itself.
                    latest_health = EntityHealth.read(current_health)
                    # TODO: Add stored_status != offline.
                    if stored_genration_id and (stored_genration_id != incoming_generation_id) and (stored_status != HEALTH_EVENTS.FAILED.value):
                        # If stored_generation_id and incoming_generation_id are not the same,
                        # the pod has been restarted. For replicaset pod down/up the
                        # stored_status is already set as failed, so there is no need to
                        # count pod_restart; that case goes to the else part.
                        if (incoming_health_status == HEALTH_EVENTS.ONLINE.value):
                            # In delete scenario, online event comes first, followed by failed event.
                            # System health is expected to update the failed event first, then online event.
                            # If incoming is online event, change the stored event type to failed.
                            # Update the failed event in system health and followed by incoming online event.
                            healthevent.specific_info = {"generation_id": stored_genration_id, "pod_restart": 1}
                            healthevent.event_type = "failed"
                            updated_health = SystemHealth.create_updated_event_object(healthevent.timestamp, current_timestamp, healthevent.event_type, healthevent.specific_info, latest_health)
                            # Create a "failed" event and update it in system health and publish
                            self._check_and_update(current_health, updated_health, healthevent, next_component)
                            # NOTE(review): current_health now holds the updated object rather
                            # than the raw stored value, and the same latest_health object is
                            # reused for the second update below; confirm this is intended.
                            current_health = updated_health
                            # Now create an "online" event and update it in system health and publish
                            healthevent.specific_info = {"generation_id": incoming_generation_id, "pod_restart": 1}
                            healthevent.event_type = "online"
                            updated_health = SystemHealth.create_updated_event_object(healthevent.timestamp, current_timestamp, healthevent.event_type, healthevent.specific_info, latest_health)
                            self._check_and_update(current_health, updated_health, healthevent, next_component)
                        elif pod_restart_val is not None and pod_restart_val:
                            # Check the pod_restart value associated with the node; if it is 1,
                            # this alert is already updated. No need to send the alert again.
                            # Just reset the pod_restart value and write it straight to the store.
                            key = self._prepare_key(component, cluster_id=self.node_map['cluster_id'], \
                                site_id=self.node_map['site_id'], rack_id=self.node_map['rack_id'], \
                                node_id=self.node_id)
                            latest_health_dict = json.loads(current_health)
                            new_spec_info = {"generation_id": stored_genration_id, "pod_restart": 0}
                            latest_health_dict["events"][0]["specific_info"] = new_spec_info
                            updated_health = EntityHealth.write(latest_health_dict)
                            self.healthmanager.set_key(key, updated_health)
                        # When neither branch above applies, nothing is updated for this event.
                    else:
                        # Current health is there and generation id is also already present.
                        # That means it is a normal failure scenario.
                        updated_health = SystemHealth.create_updated_event_object(healthevent.timestamp, current_timestamp, status, healthevent.specific_info, latest_health)
                        self._check_and_update(current_health, updated_health, healthevent, next_component)
                else:
                    # Update hierarchical components, such as site, rack.
                    # This branch is also taken for node events that have no stored
                    # specific_info or do not come from the monitor source.
                    latest_health = EntityHealth.read(current_health)
                    updated_health = SystemHealth.create_updated_event_object(healthevent.timestamp, current_timestamp, status, healthevent.specific_info, latest_health)
                    self._check_and_update(current_health, updated_health, healthevent, next_component)
            else:
                # Health value not present in the store currently, create now.
                latest_health = EntityHealth()
                updated_health = SystemHealth.create_updated_event_object(healthevent.timestamp, current_timestamp, status, healthevent.specific_info, latest_health)
                self._check_and_update(current_health, updated_health, healthevent, next_component)
        except Exception as err:
            # Any failure above is logged and re-raised as a single domain exception.
            Log.error(f"Failed processing system health event with Error: {err}")
            raise HaSystemHealthException("Failed processing system health event")