Beispiel #1
0
    def process_event(self, message: bytes) -> "CONSUMER_STATUS":
        """
        Callback function to receive and process an event.

        The payload is parsed as JSON and converted to a HealthEvent, the
        event is evaluated by the rule manager and, when the evaluation
        yields actions, they are handed to the matching action handler.

        Args:
            message (bytes): UTF-8 encoded JSON event message. An already
                decoded str is accepted as well.

        Returns:
            CONSUMER_STATUS: SUCCESS when the event was processed, or when it
                could not be parsed (such events are forcefully acked);
                FAILED when evaluation or the action handler raised.
        """
        try:
            # Only bytes-like payloads have .decode(); a str payload used to
            # raise AttributeError here and was dropped as "invalid".
            if isinstance(message, (bytes, bytearray)):
                payload = message.decode('utf-8')
            else:
                payload = message
            event = json.loads(payload)
            health_event = HealthEvent.dict_to_object(event)
        except Exception as e:
            Log.error(
                f"Invalid format for event {message}, Error: {e}. Forcefully ack."
            )
            return CONSUMER_STATUS.SUCCESS
        Log.debug(f"Captured {message} for evaluating health monitor.")
        try:
            action_list = self._rule_manager.evaluate(health_event)
            if action_list:
                Log.info(f"Evaluated {health_event} with action {action_list}")
                action_handler = ActionFactory.get_action_handler(
                    health_event, action_list)
                action_handler.act(health_event, action_list)
            return CONSUMER_STATUS.SUCCESS
        except Exception as e:
            Log.error(
                f"Failed to process {message} error: {e} {traceback.format_exc()}"
            )
            return CONSUMER_STATUS.FAILED
Beispiel #2
0
 async def receive(request):
     """
     Handle a GET request that fetches an event message for a component.

     Subscribes to the component named by the 'component' query parameter,
     receives one event message and returns it as a JSON response.

     Args:
         request: incoming web request; the 'component' query parameter is
             read from request.rel_url.query.

     Returns:
         web.Response: JSON body {'alert': ...} with status 200 on success,
             otherwise {'error_code': ..., 'exception': [...]} with the
             status code derived from the failure.
     """
     # Bound up front so the error handlers can always log it, even when the
     # query parameter lookup itself is what failed.
     component = None
     try:
         component = request.rel_url.query['component']
         Log.debug(f"Received GET request for component {component}")
         EventMessage.subscribe(component=component)
         alert = EventMessage.receive()
     except EventMessageError as e:
         status_code = e.rc
         error_message = e.desc
         Log.error(f"Unable to receive event message for component: " \
             f"{component}, status code: {status_code}," \
             f" error: {error_message}")
         response_obj = {'error_code': status_code, 'exception': \
             ['EventMessageError', {'message': error_message}]}
     except Exception as e:
         exception_key = type(e).__name__
         exception = RestServerError(exception_key).http_error()
         status_code = exception[0]
         error_message = exception[1]
         Log.error(f"Internal error while receiving event messages for " \
             f"component: {component}, status code: " \
             f"{status_code}, error: {error_message}")
         response_obj = {'error_code': status_code, 'exception': \
             [exception_key, {'message': error_message}]}
         # The former `raise ... from e` here was dead code: the `return`
         # in the `finally` clause discarded it, so the error response below
         # has always been what the client received.
     else:
         status_code = 200  # No exception, Success
         response_obj = {'alert': alert}
         # The alert may lack the 'iem'/'info' keys (or not be a dict at
         # all); that must not turn a successful receive into a failure.
         try:
             alert_info = alert['iem']['info']
         except (KeyError, TypeError):
             alert_info = None
         Log.debug(f"GET method finished with status code: {status_code} " \
             f"for component {component} and received event message " \
             f"alert info. - {alert_info}.")
     return web.Response(text=json.dumps(response_obj), \
         status=status_code)
Beispiel #3
0
    def process_resp(self, resp: str):
        """
        Inspect a received response and report whether consuming should go on.

        Args:
            resp : received response; it is decoded as UTF-8 and parsed as JSON.

        Returns:
            CONSUMER_STATUS: FAILED_STOP when the timeout was already reached,
                SUCCESS_STOP when the response passed the event filter,
                SUCCESS otherwise (including responses that fail to parse).
        """
        if self.timeout_reached == True:
            return CONSUMER_STATUS.FAILED_STOP

        try:
            resp = json.loads(resp.decode('utf-8'))
        except Exception as err:
            Log.error(f"Invalid resp {resp}, Error: {err}")
            return CONSUMER_STATUS.SUCCESS

        Log.debug(f"Received message {resp}")

        # Guard clause: anything the filter rejects is simply acknowledged.
        if not self._filter_event(json.dumps(resp)):
            return CONSUMER_STATUS.SUCCESS

        Log.info(f"Filtered Event detected: {resp}")
        # Record whether the response reports a successful enclosure shutdown.
        shutdown_ok = self._parse_response(json.dumps(resp))
        if shutdown_ok:
            self._encl_shutdown_successful = True

        # Reset the request state now that its response has arrived.
        self._uuid = None
        self._is_resp_received = True
        return CONSUMER_STATUS.SUCCESS_STOP
Beispiel #4
0
    def _register_for_resp(self):
        """
        Create and start the consumer that listens for the response to the
        request that was sent.

        The consumer group is set to the request uuid; the consumer id and
        response message type are read from the HA global configuration.
        """
        # Unique consumer_group for each actuator response
        self.consumer_group = self._uuid
        self.consumer_id = Conf.get(
            const.HA_GLOBAL_INDEX, f"ACTUATOR_MANAGER{_DELIM}consumer_id")
        self.resp_message_type = Conf.get(
            const.HA_GLOBAL_INDEX, f"ACTUATOR_MANAGER{_DELIM}resp_message_type")

        consumer_options = dict(
            consumer_id=str(self.consumer_id),
            consumer_group=self.consumer_group,
            message_type=self.resp_message_type,
            callback=self.process_resp,
            offset="latest",
            timeout=ACTUATOR_MSG_WAIT_TIME,
        )
        self.consumer = MessageBus.get_consumer(**consumer_options)

        # Start the thread to listen to response
        self.consumer.start()

        Log.debug(
            f"Waiting to get response on message_type {self.resp_message_type}"
        )
 async def send_webhook_info(request):
     """
     Handle a POST request carrying webhook information.

     The JSON body of the request is stored on
     AuditLogRequestHandler.webhook_info.

     Args:
         request: incoming web request whose JSON body is the webhook info.

     Returns:
         web.Response: JSON body {'status_code': 200, 'status': 'success'}
             on success, otherwise {'error_code': ..., 'exception': [...]}
             with the status code derived from the failure.
     """
     Log.debug("Received POST request for webhook information")
     try:
         external_server_info = await request.json()
         # write webhook info to the external server
         AuditLogRequestHandler.webhook_info = external_server_info
         # TODO store webhook_info to persistent storage
     except AuditLogError as e:
         status_code = e.rc
         error_message = e.desc
         Log.error(f"Unable to receive audit webhook information, status code: " \
             f"{status_code}, error: {error_message}")
         response_obj = {'error_code': status_code, 'exception': \
             ['AuditLogError', {'message': error_message}]}
     except Exception as e:
         exception_key = type(e).__name__
         exception = RestServerError(exception_key).http_error()
         status_code = exception[0]
         error_message = exception[1]
         Log.error(f"Internal error while receiving webhook info. " \
             f"status code: {status_code}, error: {error_message}")
         response_obj = {'error_code': status_code, 'exception': \
             [exception_key, {'message': error_message}]}
         # The former `raise ... from e` here was dead code: the `return`
         # in the `finally` clause discarded it, so the error response below
         # has always been what the client received.
     else:
         status_code = 200  # No exception, Success
         Log.debug(f"Receiving webhook info using POST method finished with status " \
             f"code: {status_code}")
         response_obj = {'status_code': status_code, 'status': 'success'}
     return web.Response(text=json.dumps(response_obj), \
         status=status_code)
def main(action: str = '') -> int:
    """
    Main function acts as switch case for IPHealthChecker resource agent.

    "meta-data" is answered before any configuration is initialised; the
    other actions are dispatched to a VipHealthMonitor instance. An unknown
    action prints the usage text and exits the process with status 0.

    Args:
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Provide output as int code provided by pacemaker;
            const.OCF_ERR_GENERIC when the action raised an exception.
    """
    try:
        if action == "meta-data":
            return VipHealthMonitor.metadata()
        ConfigManager.init("resource_agent")
        resource_agent = VipHealthMonitor()
        Log.debug(f"{resource_agent} initialized for action {action}")
        if action == "monitor":
            return resource_agent.monitor()
        elif action == "start":
            return resource_agent.start()
        elif action == "stop":
            return resource_agent.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            # sys.exit() rather than the builtin exit(): the latter is only
            # installed by the site module for interactive use. SystemExit
            # is not an Exception subclass, so the handler below lets it pass.
            sys.exit(0)
    except Exception as e:
        Log.error(
            f"vip health check failed to perform {action}. Error: {traceback.format_exc()} {e}"
        )
        return const.OCF_ERR_GENERIC
    def remove_rule(self, resource: str, event: HEALTH_STATUSES,
                    action: HEALTH_MON_ACTIONS):
        """
        Drop "action" from the rule stored for resource/event in confstore.

        When the removal leaves the rule without any action, the key itself
        is deleted; otherwise the shortened action list is written back.
        A missing key or a missing action is only logged as a warning.

        Args:
            resource(str): resource name
            event(str): event type
            action(str): action to be removed
        """
        self._validate_action(action)
        key = self._prepare_key(resource, event)
        Log.info(f"Removing rule for key: {key} ,value: {action}")

        kv = self._get_val(key)
        if not kv:
            Log.warn(f"key {key} not found")
            return

        _, actions = self._get_k_v(kv)
        if action not in actions:
            Log.warn(f"KV not found for key: {key}, value: {action}")
            return

        actions.remove(action)
        if len(actions) != 0:
            # Other actions remain for this rule: persist the shorter list.
            self._confstore.update(key, json.dumps(actions))
            Log.debug(f"KV removed for {key} , {action}")
            return

        self._confstore.delete(key)
        Log.debug(
            f"key value removed for {key} , {action}. value list empty; deleting key {key}"
        )
Beispiel #8
0
    def run_cmd(self, cmd, check_error=True):
        """
        Execute a command on the system and hand back its result.

        Args:
            cmd ([string]): Command to execute on system.
            check_error (bool): When True, a non-zero return code is treated
                as a failure.

        Raises:
            HACommandTerminated: the command could not be run, or it returned
                a non-zero code while check_error was set.

        Returns:
            tuple: (output, error output, return code) of the command.
        """
        # Initialised first so the failure message can always include it.
        _err = ""
        try:
            _output, _err, _rc = SimpleProcess(cmd).run(universal_newlines=True)
            Log.debug(f"cmd: {cmd}, output: {_output}, err: {_err}, rc: {_rc}")
            if _rc == 0 or not check_error:
                return _output, _err, _rc
            Log.error(
                f"cmd: {cmd}, output: {_output}, err: {_err}, rc: {_rc}")
            raise Exception(f"Failed to execute {cmd}")
        except Exception as failure:
            Log.error("Failed to execute  %s Error: %s %s" % (cmd, failure, _err))
            raise HACommandTerminated("Failed to execute %s Error: %s %s" %
                                      (cmd, failure, _err))
Beispiel #9
0
 def __init__(self, config: str):
     """
     Initialise the command by delegating to the parent constructor.

     Raises:
         OpenldapPROVError: when the parent constructor raises.
     """
     try:
         super(ResetCmd, self).__init__(config)
     except Exception as init_error:
         Log.debug("Initializing reset phase failed")
         raise OpenldapPROVError(f'exception: {init_error}')
Beispiel #10
0
    def cleanup_db(self, node, data_only):
        """
        Acknowledge the stored resource events of one node or of all nodes.

        Args:
            node ([string]): Node name. None selects every node ("all").
            data_only ([boolean]): Remove data only.

        Action:
            consul data:
                {'entity': 'enclosure', 'entity_id': '0',
                'component': 'controller', 'component_id': 'node1'}
            Every resource key is acknowledged when node is "all", otherwise
            only keys containing the node name. Unless data_only is True the
            failover state is reset afterwards as well.
        """
        resources = Conf.get(const.RESOURCE_GLOBAL_INDEX, "resources")
        if node is None:
            node = "all"
        Log.debug(f"Performing cleanup for {node} node")

        for key in resources.keys():
            if node == "all" or node in key:
                self._decision_monitor.acknowledge_resource(key, data_only)

        if data_only:
            return
        Log.info(f"Reseting HA decision event for {node}")
        self.reset_failover(node)
Beispiel #11
0
 def reset_failover(self, node=None, soft_cleanup=False):
     """
     Run the pacemaker cleanup command so that failback becomes possible.

     Args:
         node: node to clean up; None (or "all") cleans up without a
             --node restriction.
         soft_cleanup: when True the cleanup only runs if
             is_cleanup_required(node) agrees, and the failcount status is
             logged before and after it.
     """
     if node is None:
         node = "all"
     if node == "all":
         cmd = const.PCS_CLEANUP
     else:
         cmd = const.PCS_CLEANUP + f" --node {node}"

     if not soft_cleanup:
         self._execute.run_cmd(cmd)
     elif self.is_cleanup_required(node):
         _output, _err, _rc = self._execute.run_cmd(
             const.PCS_FAILCOUNT_STATUS)
         Log.info(
             f"Resource failcount before Failback: {_output}, Error:{_err}, RC:{_rc}"
         )
         _output, _err, _rc = self._execute.run_cmd(cmd)
         Log.info(
             f"Failback is happened, Output:{_output}, Error:{_err}, RC:{_rc}"
         )
         _output, _err, _rc = self._execute.run_cmd(
             const.PCS_FAILCOUNT_STATUS)
         Log.info(
             f"Resource failcount after Failback: {_output}, Error:{_err}, RC:{_rc}"
         )
     else:
         Log.debug(
             "cleanup is not required alerts are not yet resolved.")

     # The status command is always run so its result lands in the log.
     Log.debug(f"Status: {self._execute.run_cmd(const.PCS_STATUS)}")
    def register_message_type(self, admin_id: str, message_types: list, \
        partitions: int):
        """
        Create the given message types and wait until each one is listed.

        Parameters:
        admin_id        A String that represents Admin client ID.
        message_types   This is essentially equivalent to the list of
                        queue/topic name. For e.g. ["Alert"]
        partitions      Integer that represents number of partitions to be
                        created.

        Raises MessageBusError (ETIMEDOUT) when a message type is still not
        listed in the metadata after the configured number of retries.
        """
        Log.debug(f"Register message type {message_types} using {admin_id}" \
            f" with {partitions} partitions")
        admin = self._clients['admin'][admin_id]
        topics = [
            NewTopic(topic_name, num_partitions=partitions)
            for topic_name in message_types
        ]
        creation_result = admin.create_topics(topics)
        self._task_status(creation_result, method='register_message_type')

        for each_message_type in message_types:
            # One pass more than the retry budget: the extra pass is the one
            # that raises the timeout.
            for list_retry in range(1, self._max_list_message_type_count+2):
                listed = list(self._get_metadata(admin).keys())
                if each_message_type in listed:
                    break
                if list_retry > self._max_list_message_type_count:
                    Log.error(f"MessageBusError: Timed out after retry " \
                        f"{list_retry} while creating message_type " \
                        f"{each_message_type}")
                    raise MessageBusError(errno.ETIMEDOUT, "Timed out " +\
                        "after retry %d while creating message_type %s.", \
                        list_retry, each_message_type)
                # Wait longer after every unsuccessful look-up.
                time.sleep(list_retry*1)
 def monitor(self, state=const.STATE_RUNNING):
     """
     Evaluate the resource status and return the matching monitor result.

     Returns const.OCF_NOT_RUNNING when the file under const.HA_INIT_DIR
     does not exist and the state is not STATE_STOP; otherwise the result
     of _monitor_action for the current node statuses.
     """
     filename, path, service, node = self._get_params()
     Log.debug(f"In monitor for {filename}")

     init_path = const.HA_INIT_DIR + filename
     if not os.path.exists(init_path) and state != const.STATE_STOP:
         return const.OCF_NOT_RUNNING

     statuses = self._get_status(
         self.decision_monitor.get_resource_status, path)
     self_node, other_node, self_node_status, other_node_status = statuses
     Log.debug(f"In monitor group key: {path}, node: {self_node} "
               f"status: {self_node_status}, service: {service}")

     # Acknowledge the event of another, explicitly named node once its
     # status is resolved.
     names_other_node = node != '-' and node != self_node
     if names_other_node and other_node_status == Action.RESOLVED:
         Log.info(f"Ack IEM for {filename} with key {path} node {node}")
         self._acknowledge_event(path + '_' + node)

     action_args = dict(
         self_node=self_node,
         other_node=other_node,
         self_node_status=self_node_status,
         other_node_status=other_node_status,
         filename=filename,
         path=path,
         service=service,
     )
     return self._monitor_action(self._acknowledge_event, state,
                                 **action_args)
    def start(self) -> int:
        """
        Start the systemd service and wait for it to settle.

        Commands issued (their failures are not treated as errors):
            $ systemctl reset-failed service
            $ systemctl start service

        The service status is then polled once per second.

        Returns:
            int: Return as per service status.
                active: return const.OCF_SUCCESS.
                failed: return const.OCF_ERR_GENERIC.
                anything else: keep polling; this method has no timeout of
                its own.
        """
        service = self._get_systemd_service()
        Log.debug(f"Start: Start {service} service")
        for systemctl_cmd in (f"systemctl reset-failed {service}",
                              f"systemctl start {service}"):
            self._execute.run_cmd(systemctl_cmd, check_error=False)

        while True:
            Log.debug(f"Start: Starting {service} service")
            status: str = self._get_service_status(service).strip()
            if status == "active":
                Log.info(f"Start: Started {service} service")
                return const.OCF_SUCCESS
            if status == "failed":
                Log.info(
                    f"Start: Failed to start {service} and may cause failover or Stop."
                )
                return const.OCF_ERR_GENERIC
            # Neither active nor failed yet: check again shortly.
            time.sleep(1)
Beispiel #15
0
    def filter_event(self, msg: str) -> bool:
        """
        Decide whether an event concerns a configured cluster resource type.

        Args:
            msg (str): Msg; evaluated as a Python literal before being
                normalised through JSON.

        Returns:
            bool: True when the resource type found in the event payload is
                listed under the CLUSTER resource_type configuration.

        Raises:
            EventFilterException: on any failure while inspecting the message.
        """
        try:
            # Round-trip through JSON to end up with plain JSON types.
            message = json.loads(json.dumps(ast.literal_eval(msg)))

            Log.debug('Received alert from fault tolerance')
            payload = message.get(EventAttr.EVENT_PAYLOAD.value)
            event_resource_type = payload.get(HealthAttr.RESOURCE_TYPE.value)

            required_resource_type_list = Conf.get(
                const.HA_GLOBAL_INDEX, f"CLUSTER{_DELIM}resource_type")
            if event_resource_type not in required_resource_type_list:
                return False
            Log.info(
                f'This alert needs an attention: resource_type: {event_resource_type}'
            )
            return True
        except Exception as filter_error:
            raise EventFilterException(
                f"Failed to filter cluster resource event. Message: {msg}, Error: {filter_error}"
            )
Beispiel #16
0
    def run_cmd(self, cmd, check_error=True, secret=None):
        """
        Run command and throw error if cmd failed

        Args:
            cmd ([string]): Command to execute on system.
            check_error (bool): When True, a non-zero return code is treated
                as a failure.
            secret ([string]): Text inside cmd that is replaced by "****" in
                every log line and error message that quotes the command.
                None or an empty string disables masking.

        Raises:
            HACommandTerminated: the command could not be run, or it returned
                a non-zero code while check_error was set.

        Returns:
            tuple: (output, error output, return code) of the command.
        """
        # Built before the try block so the exception handler can always
        # rely on it. An empty secret is skipped on purpose:
        # str.replace("", "****") would insert the mask between every
        # character of the command.
        cmd_help = cmd.replace(secret, "****") if secret else cmd
        try:
            _proc = SimpleProcess(cmd)
            _output, _err, _rc = _proc.run(universal_newlines=True)
            # NOTE(review): only the command text is masked; output/err are
            # logged as-is - confirm they cannot echo the secret.
            Log.debug(
                f"cmd: {cmd_help}, output: {_output}, err: {_err}, rc: {_rc}")
            if _rc != 0 and check_error:
                Log.error(
                    f"cmd: {cmd_help}, output: {_output}, err: {_err}, rc: {_rc}"
                )
                raise Exception(f"Failed to execute {cmd_help}")
            return _output, _err, _rc
        except Exception as e:
            Log.error(f"Failed to execute  {cmd_help} Error: {e}.")
            raise HACommandTerminated(
                f"Failed to execute  {cmd_help} Error: {e}.") from e
Beispiel #17
0
    async def _exc_components_cmd(commands: List, bundle_id: str, path: str, \
            component: str, node_name: str, comment: str, config_url:str,
            services:str, binlogs:bool, coredumps:bool, stacktrace:bool,
            duration:str, size_limit:str):
        """
        Run a component's support bundle command and report its return code.

        commands:       Commands of the component :type:List
        bundle_id:      Unique Bundle ID of the generation process. :type:str
        path:           Path to create the tar by components :type:str
        component:      Name of Component to be executed :type: str
        node_name:      Name of Node where the Command is being Executed :type:str
        comment:        User Comment: type:str
        config_url, services, binlogs, coredumps, stacktrace, duration,
        size_limit:     Passed through to the command line as -c, -s,
                        --binlogs, --coredumps, --stacktrace, --duration and
                        --size_limit respectively.

        Returns a (component, return_code) tuple for the first command, or
        None when commands is empty.
        """
        for command in commands:
            # SB Framework will not parse additional filters until all the
            # components accept filters in their respective support bundle
            # scripts.
            cli_cmd = f"{command} -b {bundle_id} -t {path} -c {config_url}"\
                f" -s {services} --duration {duration} --size_limit {size_limit}"\
                f" --binlogs {binlogs} --coredumps {coredumps} --stacktrace {stacktrace}"
            Log.info(f"Executing command -> {cli_cmd}")

            output, err, return_code = SimpleProcess(cli_cmd).run()
            Log.debug(f"Command Output -> {output} {err}, {return_code}")
            # The result is logged a second time, as an error on failure.
            report = Log.error if return_code != 0 else Log.debug
            report(f"Command Output -> {output} {err}, {return_code}")
            # NOTE(review): returning inside the loop means only the first
            # command is ever executed - confirm this is intended.
            return component, return_code
Beispiel #18
0
    def send(self, producer_id: str, message_type: str, method: str,\
        messages: list, timeout=0.1):
        """
        Produce every message of the list on the given message type.

        Parameters:
        producer_id     A String that represents Producer client ID.
        message_type    This is essentially equivalent to the
                        queue/topic name. For e.g. "Alert"
        method          "sync" flushes the producer after each message; any
                        other value polls the producer instead.
        messages        A list of messages sent to Kafka Message Server
        timeout         Poll timeout used when method is not "sync".

        Raises MessageBusError when the producer entry is None.
        """
        Log.debug(f"Producer {producer_id} sending list of messages "\
            f"{messages} of message type {message_type} to kafka server"\
            f" with method {method}")
        producer = self._clients['producer'][producer_id]
        if producer is None:
            Log.error(f"MessageBusError: "\
                f"{errors.ERR_SERVICE_NOT_INITIALIZED}. Producer: "\
                f"{producer_id} is not initialized")
            raise MessageBusError(errors.ERR_SERVICE_NOT_INITIALIZED,\
                "Producer %s is not initialized", producer_id)

        for message in messages:
            encoded = bytes(message, 'utf-8')
            producer.produce(message_type, encoded,
                             callback=self.delivery_callback)
            if method != 'sync':
                producer.poll(timeout=timeout)
            else:
                producer.flush()
        Log.debug("Successfully Sent list of messages to Kafka cluster")
Beispiel #19
0
    def _validate_kafka_installation():
        """
        Validates kafka is installed and kafka user and group are present.

        The kafka rpm must be installed. When the kafka user or group is
        missing, or the user's primary group is not the kafka group, both are
        (re)created with adduser/usermod/groupadd.

        Raises:
            KafkaSetupError: the kafka rpm is missing, or creating the kafka
                user/group failed.
        """
        import errno

        # check kafka package installed
        try:
            PkgV().validate('rpms', ['kafka'])
        except Exception as e:
            Log.error(f"Kafka rpm missing: {e}")
            # Not every exception caught here carries an `rc` attribute;
            # fall back to a generic code instead of raising AttributeError.
            raise KafkaSetupError(getattr(e, 'rc', errno.EINVAL), e) from e

        # check kafka user exists
        try:
            kafka_user = get_user_by_name('kafka')
            kafka_group = get_group_by_name('kafka')
            if kafka_group.gr_gid != kafka_user.pw_gid:
                # Carry a message so the log line below is not left empty.
                raise Exception(
                    "primary group of kafka user is not the kafka group")
        except Exception as e:
            Log.error(f"Kafka user/group missing: {e}")
            # create kafka user and group
            # NOTE(review): the second command adds kafka to the wheel
            # group - confirm that this is intended.
            cmds = [
                "adduser kafka", "usermod -aG wheel kafka",
                "groupadd --force kafka",
                "usermod --append --groups kafka kafka"
            ]
            Log.info("Creating Kafka user and group")
            for cmd in cmds:
                _, err, rc = SimpleProcess(cmd).run()
                # rc 9 if kafka user already exists & 12 if kafka user created
                if rc not in (0, 9, 12):
                    Log.debug(f"Failed in running command :{cmd}")
                    Log.error(f"Failed in creating kafka user/group:{err}")
                    raise KafkaSetupError(rc,\
                        "Failed in creating kafka user and group", err)
Beispiel #20
0
    def deregister_message_type(self, admin_id: str, message_types: list):
        """
        Deletes a list of message types and waits until each one is gone.

        Parameters:
        admin_id        A String that represents Admin client ID.
        message_types   This is essentially equivalent to the list of
                        queue/topic name. For e.g. ["Alert"]

        Raises MessageBusError (ETIMEDOUT) when a message type is still
        listed in the metadata after the configured number of retries.
        """
        Log.debug(f"Deregister message type {message_types} using {admin_id}")
        admin = self._clients['admin'][admin_id]
        deleted_message_types = admin.delete_topics(message_types)
        self._task_status(deleted_message_types,\
            method='deregister_message_type')

        for each_message_type in message_types:
            # One pass more than the retry budget: the extra pass is the one
            # that raises the timeout.
            for list_retry in range(1, self._max_list_message_type_count + 2):
                if each_message_type not in \
                    list(self._get_metadata(admin).keys()):
                    break
                if list_retry > self._max_list_message_type_count:
                    Log.error(f"MessageBusError: Timed out after "\
                        f"{list_retry} retry to delete message_type "\
                        f"{each_message_type}")
                    # The two literals used to be joined without a space,
                    # producing "...message_type<name>.".
                    raise MessageBusError(errno.ETIMEDOUT,\
                        "Timed out after %d retry to delete message_type " +\
                        "%s.", list_retry, each_message_type)
                # Wait longer after every unsuccessful look-up.
                time.sleep(list_retry * 1)
    def get_children(self, element: str, element_id: str, **kwargs) -> dict:
        """
        Get children of element.

        The next component type in the health hierarchy is looked up for
        element; the ids of that type are then collected from the keys
        stored under the element's own key.

        Args:
            element (str): element whose children are requested.
            element_id (str): id of that element.
            **kwargs: passed on to ElementHealthEvaluator.prepare_key.

        Returns:
            dict: Map of children and ids: {component: {component_type: [component_ids]}},
                or {} when the element has no next component.
        """
        # TODO: update code to get com_type, currently assuming comp = com_type
        children_ids: list = []
        children: list = HealthHierarchy.get_next_components(element)
        if len(children) == 0:
            return {}
        child_type = children[0]
        key = ElementHealthEvaluator.prepare_key(element,
                                                 comp_id=element_id,
                                                 **kwargs)
        key = key.replace("/health", "").replace("/", "", 1)
        data = self.healthmanager.get_key(key, just_value=False)
        # Dedicated loop names: reusing `element`/`element_id` here used to
        # overwrite the parameters, so the debug line below logged the last
        # stored key instead of the element that was asked about.
        for stored_key in data.keys():
            key_list = stored_key.split("/")
            if child_type in key_list:
                # The id is the path segment right after the child type.
                child_id = key_list[key_list.index(child_type) + 1]
                if child_id not in children_ids:
                    children_ids.append(child_id)
        Log.debug(f"Children for {element}:{element_id} are {children_ids}")
        #{component: {component_type: [component_ids]}}}
        return {child_type: {child_type: children_ids}}
Beispiel #22
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Convert a JSON alert message into a HealthEvent.

        Args:
            msg (str): Msg; JSON text whose message entry holds the sensor
                response.

        Returns:
            HealthEvent: object built from the fields picked out of the alert.

        Raises:
            EventParserException: on any failure while parsing the message.
        """
        try:
            alert = json.loads(msg).get(ALERT_ATTRIBUTES.MESSAGE)
            sensor = alert[ALERT_ATTRIBUTES.SENSOR_RESPONSE_TYPE]

            # Fields that sit directly on the sensor response.
            event = {
                event_attr.EVENT_ID: sensor[ALERT_ATTRIBUTES.ALERT_ID],
                event_attr.EVENT_TYPE: sensor[ALERT_ATTRIBUTES.ALERT_TYPE],
                event_attr.SEVERITY: sensor[ALERT_ATTRIBUTES.SEVERITY],
            }
            # Remaining fields, mostly taken from the nested info section;
            # added in this order so the event keeps its field order.
            info = sensor[ALERT_ATTRIBUTES.INFO]
            event.update({
                event_attr.SITE_ID: info[ALERT_ATTRIBUTES.SITE_ID],
                event_attr.RACK_ID: info[ALERT_ATTRIBUTES.RACK_ID],
                event_attr.CLUSTER_ID: info[ALERT_ATTRIBUTES.CLUSTER_ID],
                event_attr.STORAGESET_ID: "TBD",
                event_attr.NODE_ID: info[ALERT_ATTRIBUTES.NODE_ID],
                event_attr.HOST_ID: sensor[ALERT_ATTRIBUTES.HOST_ID],
                event_attr.RESOURCE_TYPE: info[ALERT_ATTRIBUTES.RESOURCE_TYPE],
                event_attr.TIMESTAMP: info[ALERT_ATTRIBUTES.EVENT_TIME],
                event_attr.RESOURCE_ID: info[ALERT_ATTRIBUTES.RESOURCE_ID],
                event_attr.SPECIFIC_INFO: sensor[ALERT_ATTRIBUTES.SPECIFIC_INFO],
            })
            Log.debug(f"Parsed {event} schema")
            health_event = HealthEvent.dict_to_object(event)
            Log.info(f"Event {event[event_attr.EVENT_ID]} is parsed and converted to object.")
            return health_event

        except Exception as parse_error:
            raise EventParserException(f"Failed to parse alert. Message: {msg}, Error: {parse_error}")
Beispiel #23
0
 def _monitor_action(self, callback_ack, state, **args):
     """
     Translate the node statuses into the monitor result.

     Args:
         callback_ack: callable invoked with "<path>_<node>" to acknowledge
             an event.
         state: state the monitor is called for; only consulted when the
             current node status is RESTART.
         **args: status values looked up through the const.* keys.
     """
     Log.debug(str(args))
     current_status = args[const.CURRENT_NODE_STATUS]

     if current_status == Action.FAILED:
         # Both nodes failed counts as success; only this node failing is
         # a generic error.
         if args[const.OTHER_NODE_STATUS] == Action.FAILED:
             return const.OCF_SUCCESS
         return const.OCF_ERR_GENERIC

     if current_status == Action.OK:
         return const.OCF_SUCCESS

     if current_status == Action.RESOLVED:
         Log.info(f"Ack for {args[const.FILENAME_KEY]} with key {args[const.PATH_KEY]}"
                  f" node {args[const.CURRENT_NODE]}")
         return callback_ack(args[const.PATH_KEY]+'_'+args[const.CURRENT_NODE])

     if current_status == Action.RESTART:
         Log.info(f"Restart action taken for {args[const.FILENAME_KEY]} on {args[const.CURRENT_NODE]}")
         if state == const.STATE_RUNNING:
             return const.OCF_ERR_GENERIC
         if state == const.STATE_STOP:
             callback_ack(args[const.PATH_KEY]+'_'+args[const.CURRENT_NODE])
             Log.info(f"Ack for {args[const.FILENAME_KEY]} with key {args[const.PATH_KEY]} "
                      f" node {args[const.CURRENT_NODE]}")
             return Action.RESTART
         # STATE_START and every other state report success.
         return const.OCF_SUCCESS

     Log.error(f"Unimplemented value for status {args[const.CURRENT_NODE_STATUS]}")
     return const.OCF_ERR_UNIMPLEMENTED
Beispiel #24
0
 def _exc_components_cmd(commands: List, bundle_id: str, path: str,
                         component: str, node_name: str, comment: str):
     """
     Run every bundle generation command of a component and publish the
     outcome of each one.

     :param commands: Commands of the component :type:List
     :param bundle_id: Unique Bundle ID of the generation process. :type:str
     :param path: Path to create the tar by components :type:str
     :param component: Name of Component to be executed :type: str
     :param node_name: Name of Node where the Command is being Executed
     :type:str
     :param comment: User Comment :type:str
     :return: None
     """
     for command in commands:
         Log.info(f"Executing command -> {command} {bundle_id} {path}")
         output, err, return_code = SimpleProcess(
             f"{command} {bundle_id} {path}").run()
         Log.debug(f"Command Output -> {output} {err}, {return_code}")

         if return_code == 0:
             status_text = f"Bundle generation started for '{component}'"
             level = INFO
         else:
             Log.error(f"Command Output -> {output} {err}, {return_code}")
             status_text = f"Bundle generation failed for '{component}'"
             level = ERROR
         ComponentsBundle._publish_log(status_text, level, bundle_id,
                                       node_name, comment)
Beispiel #25
0
def main(resource, action=''):
    """
    Dispatch a resource-agent action.

    'meta-data' is answered straight from ``resource.metadata()`` before any
    configuration or logging is set up. For every other action the HA
    configuration is loaded, logging is initialised, the resource schema is
    read and ``resource`` is instantiated with a DecisionMonitor and that
    schema; 'monitor', 'start' and 'stop' are then forwarded to the instance.
    Any other action prints a usage line and exits.

    Returns the value of the invoked method, or const.OCF_ERR_GENERIC when an
    Exception is raised along the way.
    """
    try:
        if action == 'meta-data':
            return resource.metadata()

        Conf.load(const.HA_GLOBAL_INDEX, Yaml(const.HA_CONFIG_FILE))
        Log.init(
            service_name='resource_agent',
            log_path=Conf.get(const.HA_GLOBAL_INDEX, f"LOG{_DELIM}path"),
            level=Conf.get(const.HA_GLOBAL_INDEX, f"LOG{_DELIM}level"))

        with open(const.RESOURCE_SCHEMA, 'r') as schema_file:
            schema = json.load(schema_file)
        os.makedirs(const.RA_LOG_DIR, exist_ok=True)

        agent = resource(DecisionMonitor(), schema)
        Log.debug(f"{agent} initialized for action {action}")

        # The supported actions map one-to-one onto method names of the agent.
        if action in ('monitor', 'start', 'stop'):
            return getattr(agent, action)()

        print('Usage %s [monitor] [start] [stop] [meta-data]' % sys.argv[0])
        exit()
    except Exception:
        Log.error(f"{traceback.format_exc()}")
        return const.OCF_ERR_GENERIC
 def process_message(self, message: str):
     """
     Callback method for MessageConsumer.

     Decodes the message as UTF-8 and hands it to EventAnalyzer. Every
     failure is logged with its traceback and the message is still reported
     as CONSUMER_STATUS.SUCCESS, i.e. all paths return the same status.

     NOTE(review): ``message`` is annotated as str but ``.decode`` is called
     on it, so the consumer presumably delivers bytes - confirm.
     """
     Log.debug(f'Received the message from message bus: {message}')
     try:
         EventAnalyzer(message.decode('utf-8'))
     except ConsulException as err:
         Log.error(f"consule exception {err} {traceback.format_exc()} for {message}. Ack Message.")
     except ConfError as err:
         Log.error(f"config exception {err} {traceback.format_exc()} for {message}. Ack Message.")
     except EventFilterException as err:
         Log.error(f"Filter exception {err} {traceback.format_exc()} for {message}. Ack Message.")
     except EventParserException as err:
         Log.error(f"Parser exception {err} {traceback.format_exc()} for {message}.  Ack Message.")
     except SubscriberException as err:
         # NOTE(review): the log text says "retry without ack", yet this path
         # reports SUCCESS like all the others - confirm which is intended.
         Log.error(f"Subscriber exception {err} {traceback.format_exc()} for {message}, retry without ack.")
     except Exception as err:
         Log.error(f"Unknown Exception caught {err} {traceback.format_exc()}")
         Log.error(f"Forcefully ack as success. msg: {message}")
     # Success and every handled failure share the same outcome.
     return CONSUMER_STATUS.SUCCESS
def main(resource: DynamicFidServiceRA, action: str = '') -> int:
    """
    Main function acts as switch case for DynamicFidServiceRA resource agent.

    'meta-data' is answered directly from ``resource.metadata()`` before the
    configuration is initialised. 'monitor', 'start' and 'stop' are forwarded
    to the resource agent; any other action prints a usage line and exits
    with status 0.

    Args:
        resource (DynamicFidServiceRA): Resource agent (class or instance).
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Provide output as int code provided by pacemaker;
            const.OCF_ERR_GENERIC when an Exception is raised.
    """
    try:
        if action == "meta-data":
            return resource.metadata()
        ConfigManager.init("resource_agent")
        Log.debug(f"{resource} initialized for action {action}")
        # `resource_agent` was previously never bound, so every
        # monitor/start/stop call ended in NameError -> OCF_ERR_GENERIC.
        # Instantiate when a class is handed in, otherwise use the object as-is.
        resource_agent = resource() if isinstance(resource, type) else resource
        if action == "monitor":
            return resource_agent.monitor()
        elif action == "start":
            return resource_agent.start()
        elif action == "stop":
            return resource_agent.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            # sys.exit instead of the site-provided `exit` helper, which is
            # not guaranteed to exist in every interpreter configuration.
            sys.exit(0)
    except Exception as e:
        Log.error(
            f"systemd_fid_wrapper_ra failed to perform {action}. Error: {e}")
        return const.OCF_ERR_GENERIC
Beispiel #28
0
 def remove_node(self, node):
     """
     Remove a node from the pcs cluster.

     The cluster status is logged before and after the operation. When
     node_status() already reports rc 1 for the node, it is treated as
     removed and no pcs command is issued. Otherwise the node is removed
     with --force and its status is checked again.

     Raises:
         Exception: if node_status() does not report rc 1 after the removal.
     """
     # TODO: Limitation for node remove (in cluster node cannot remove it self)
     out, err, rc = self._execute.run_cmd(const.PCS_STATUS)
     Log.info(
         f"Cluster status output before remove node: {out}, {err}, {rc}"
     )

     # Check if node already removed
     rc, status = self.node_status(node)
     if rc == 1:
         Log.info(f"Node {node} already removed from cluster")
     else:
         self._execute.run_cmd(f"pcs cluster node remove {node} --force")
         rc, status = self.node_status(node)
         Log.debug(f"For node {node} status: {status}, rc: {rc}")
         if rc != 1:
             Log.error(f"Failed to remove {node}")
             raise Exception(f"Failed to remove {node}")
         Log.info(f"Node {node} removed from cluster")

     out, err, rc = self._execute.run_cmd(const.PCS_STATUS)
     Log.info(
         f"Cluster status output after remove node: {out}, {err}, {rc}"
     )
    def _load_rules(self):
        """
        Load the JSON rule file into memory.

        When a rule file is configured, its text is read and, if non-empty,
        parsed through JsonMessage; the parsed result is stored in
        ``self._rules_schema``. Nothing is done when no rule file is set.
        An OSError raised along the way is logged and not propagated.

        Returns:
            None: in every case; the rules are exposed via the attribute.
        """
        if self._rule_file is None:
            return None

        try:
            Log.debug(
                f"Loading rules json into memory. File: {self._rule_file}")
            with open(self._rule_file, 'r') as rule_fp:
                raw_rules = rule_fp.read()
            # An empty file leaves the current schema untouched.
            if raw_rules:
                self._rules_schema = JsonMessage(raw_rules).load()
        except OSError as exc:
            if exc.errno == errno.ENOENT:
                reason = f'File {self._rule_file} does not exist'
            elif exc.errno == errno.EACCES:
                reason = f'Not enough permission to read {self._rule_file} file'
            else:
                reason = f'Error while reading from file {self._rule_file}'
            Log.error(reason)
Beispiel #30
0
    def parse_event(self, msg: str) -> HealthEvent:
        """
        Parse a cluster resource alert into a HealthEvent.

        Args:
            msg (str): Alert as Python-literal text of a dict. The keys
                "_resource_name", "_resource_type", "_event_type",
                "_timestamp" and "_generation_id" are read from it.

        Returns:
            HealthEvent: Object built by HealthEvent.dict_to_object from the
                assembled event dictionary.

        Raises:
            EventParserException: On any failure while parsing or converting
                the alert; the original error is chained as the cause.
        """
        try:
            # literal_eval accepts Python-literal syntax; the JSON round trip
            # then normalises the result to plain JSON-compatible types.
            message = json.dumps(ast.literal_eval(msg))
            cluster_resource_alert = json.loads(message)

            # Event id: current epoch seconds followed by a random UUID hex.
            event_id = str(int(time.time())) + uuid.uuid4().hex

            node_id = cluster_resource_alert["_resource_name"]
            resource_type = cluster_resource_alert["_resource_type"]
            event_type = cluster_resource_alert["_event_type"]
            # The event itself carries the timestamp reported by the alert.
            alert_timestamp = cluster_resource_alert["_timestamp"]
            generation_id = cluster_resource_alert["_generation_id"]

            event = {
                EVENT_ATTRIBUTES.EVENT_ID:
                event_id,
                EVENT_ATTRIBUTES.EVENT_TYPE:
                event_type,
                EVENT_ATTRIBUTES.SEVERITY:
                StatusMapper.EVENT_TO_SEVERITY_MAPPING[event_type],
                EVENT_ATTRIBUTES.SITE_ID:
                self.site_id,  # TODO: Should be fetched from confstore
                EVENT_ATTRIBUTES.RACK_ID:
                self.rack_id,  # TODO: Should be fetched from confstore
                EVENT_ATTRIBUTES.CLUSTER_ID:
                self.cluster_id,  # TODO: Should be fetched from confstore
                EVENT_ATTRIBUTES.STORAGESET_ID:
                node_id,
                EVENT_ATTRIBUTES.NODE_ID:
                node_id,
                EVENT_ATTRIBUTES.HOST_ID:
                node_id,
                EVENT_ATTRIBUTES.RESOURCE_TYPE:
                resource_type,
                EVENT_ATTRIBUTES.TIMESTAMP:
                alert_timestamp,
                EVENT_ATTRIBUTES.RESOURCE_ID:
                node_id,
                EVENT_ATTRIBUTES.SPECIFIC_INFO: {
                    "generation_id": generation_id,
                    "pod_restart": 0
                }
            }

            Log.debug(f"Parsed {event} schema")
            health_event = HealthEvent.dict_to_object(event)
            Log.debug(
                f"Event {event[EVENT_ATTRIBUTES.EVENT_ID]} is parsed and converted to object."
            )
            return health_event

        except Exception as err:
            # Chain explicitly so the root cause stays attached to the
            # domain-level exception.
            raise EventParserException(
                f"Failed to parse cluster resource alert. Message: {msg}, Error: {err}"
            ) from err