def process_event(self, message: bytes) -> CONSUMER_STATUS:
    """
    Callback function to receive and process event.

    The raw message is decoded and converted to a HealthEvent, the rule
    manager is asked which actions apply, and any resulting actions are
    handed to the handler returned by ActionFactory.

    Args:
        message (bytes): UTF-8 encoded JSON event message.

    Returns:
        CONSUMER_STATUS: SUCCESS when the event was processed, required no
        action, or could not be parsed (an unparsable event is forcefully
        acknowledged); FAILED when evaluating or acting on the event raised.
    """
    try:
        event = json.loads(message.decode('utf-8'))
        health_event = HealthEvent.dict_to_object(event)
    except Exception as e:
        Log.error(
            f"Invalid format for event {message}, Error: {e}. Forcefully ack."
        )
        return CONSUMER_STATUS.SUCCESS
    Log.debug(f"Captured {message} for evaluating health monitor.")
    try:
        action_list = self._rule_manager.evaluate(health_event)
        if action_list:
            Log.info(f"Evaluated {health_event} with action {action_list}")
            action_handler = ActionFactory.get_action_handler(
                health_event, action_list)
            action_handler.act(health_event, action_list)
        # An event that matches no rule is still a successfully handled event
        return CONSUMER_STATUS.SUCCESS
    except Exception as e:
        Log.error(
            f"Failed to process {message} error: {e} {traceback.format_exc()}"
        )
        return CONSUMER_STATUS.FAILED
async def receive(request):
    """
    Handle a GET request that fetches an event message for a component.

    The component is read from the 'component' query parameter; the handler
    subscribes to it and receives one event message.

    Args:
        request: incoming web request; must carry a 'component' query
            parameter.

    Returns:
        web.Response: JSON body {'alert': <message>} with status 200 on
        success, otherwise {'error_code': ..., 'exception': [...]} with the
        error status code.
    """
    Log.debug(f"Received GET request for component " \
        f"{request.rel_url.query['component']}")
    try:
        component = request.rel_url.query['component']
        EventMessage.subscribe(component=component)
        alert = EventMessage.receive()
    except EventMessageError as e:
        status_code = e.rc
        error_message = e.desc
        Log.error(f"Unable to receive event message for component: " \
            f"{component}, status code: {status_code}," \
            f" error: {error_message}")
        response_obj = {'error_code': status_code, 'exception': \
            ['EventMessageError', {'message': error_message}]}
    except Exception as e:
        exception_key = type(e).__name__
        exception = RestServerError(exception_key).http_error()
        status_code = exception[0]
        error_message = exception[1]
        Log.error(f"Internal error while receiving event messages for " \
            f"component: {component}, status code: " \
            f"{status_code}, error: {error_message}")
        # The error is reported to the client through the response below.
        # (The original re-raised here, but the 'return' in its 'finally'
        # clause always discarded that exception.)
        response_obj = {'error_code': status_code, 'exception': \
            [exception_key, {'message': error_message}]}
    else:
        status_code = 200  # No exception, Success
        response_obj = {'alert': alert}
        try:
            alert_info = alert['iem']['info']
        except (KeyError, IndexError, TypeError):
            # Logging must not turn a successful receive into a failure
            alert_info = None
        Log.debug(f"GET method finished with status code: {status_code} " \
            f"for component {component} and received event message " \
            f"alert info. - {alert_info}.")
    # Returned after the try statement rather than from 'finally', so task
    # cancellation and other BaseExceptions propagate untouched.
    return web.Response(text=json.dumps(response_obj), \
        status=status_code)
def process_resp(self, resp: bytes):
    """
    Parse the response and detect success / failure

    Args:
        resp (bytes): received response, UTF-8 encoded JSON.

    Returns:
        CONSUMER_STATUS: FAILED_STOP when the wait already timed out;
        SUCCESS_STOP when the awaited (filtered) response was received;
        SUCCESS for unparsable or unrelated messages.
    """
    if self.timeout_reached:
        return CONSUMER_STATUS.FAILED_STOP
    try:
        resp = json.loads(resp.decode('utf-8'))
    except Exception as e:
        Log.error(f"Invalid resp {resp}, Error: {e}")
        return CONSUMER_STATUS.SUCCESS
    Log.debug(f"Received message {resp}")
    # Both helpers take the JSON text; serialize it once
    resp_text = json.dumps(resp)
    if self._filter_event(resp_text):
        Log.info(f"Filtered Event detected: {resp}")
        # Parse response for Enclosure shutdown Success/Failure
        if self._parse_response(resp_text):
            self._encl_shutdown_successful = True
        # cleanup
        self._uuid = None
        self._is_resp_received = True
        return CONSUMER_STATUS.SUCCESS_STOP
    return CONSUMER_STATUS.SUCCESS
def _register_for_resp(self):
    """
    Create and start the message bus consumer that listens for the response
    to the request that was sent.
    """
    # The request uuid doubles as a consumer group unique to this response
    self.consumer_group = self._uuid
    self.consumer_id = Conf.get(
        const.HA_GLOBAL_INDEX, f"ACTUATOR_MANAGER{_DELIM}consumer_id")
    self.resp_message_type = Conf.get(
        const.HA_GLOBAL_INDEX, f"ACTUATOR_MANAGER{_DELIM}resp_message_type")

    consumer_options = {
        "consumer_id": str(self.consumer_id),
        "consumer_group": self.consumer_group,
        "message_type": self.resp_message_type,
        "callback": self.process_resp,
        "offset": "latest",
        "timeout": ACTUATOR_MSG_WAIT_TIME,
    }
    self.consumer = MessageBus.get_consumer(**consumer_options)

    # Begin listening for the response
    self.consumer.start()
    Log.debug(
        f"Waiting to get response on message_type {self.resp_message_type}"
    )
async def send_webhook_info(request):
    """
    Handle a POST request carrying the audit webhook information.

    The JSON body of the request is stored on
    AuditLogRequestHandler.webhook_info.

    Args:
        request: incoming web request with a JSON body.

    Returns:
        web.Response: JSON body {'status_code': 200, 'status': 'success'}
        on success, otherwise {'error_code': ..., 'exception': [...]} with
        the error status code.
    """
    Log.debug("Received POST request for webhook information")
    try:
        external_server_info = await request.json()
        # write webhook info to the external server
        AuditLogRequestHandler.webhook_info = external_server_info
        # TODO store webhook_info to persistent storage
    except AuditLogError as e:
        status_code = e.rc
        error_message = e.desc
        Log.error(f"Unable to receive audit webhook information, status code: " \
            f"{status_code}, error: {error_message}")
        response_obj = {'error_code': status_code, 'exception': \
            ['AuditLogError', {'message': error_message}]}
    except Exception as e:
        exception_key = type(e).__name__
        exception = RestServerError(exception_key).http_error()
        status_code = exception[0]
        error_message = exception[1]
        Log.error(f"Internal error while receiving webhook info. " \
            f"status code: {status_code}, error: {error_message}")
        # The error is reported to the client through the response below.
        # (The original re-raised here, but the 'return' in its 'finally'
        # clause always discarded that exception.)
        response_obj = {'error_code': status_code, 'exception': \
            [exception_key, {'message': error_message}]}
    else:
        status_code = 200  # No exception, Success
        Log.debug(f"Receiving webhook info using POST method finished with status " \
            f"code: {status_code}")
        response_obj = {'status_code': status_code, 'status': 'success'}
    # Returned after the try statement rather than from 'finally', so task
    # cancellation and other BaseExceptions propagate untouched.
    return web.Response(text=json.dumps(response_obj), \
        status=status_code)
def main(action: str = '') -> int:
    """
    Main function acts as switch case for IPHealthChecker resource agent.

    Args:
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Provide output as int code provided by pacemaker.
            const.OCF_ERR_GENERIC is returned when the action raises.

    Raises:
        SystemExit: with code 0 after printing usage for an unknown action.
    """
    try:
        # meta-data is answered without initialising configuration
        if action == "meta-data":
            return VipHealthMonitor.metadata()
        ConfigManager.init("resource_agent")
        resource_agent = VipHealthMonitor()
        Log.debug(f"{resource_agent} initialized for action {action}")
        if action == "monitor":
            return resource_agent.monitor()
        elif action == "start":
            return resource_agent.start()
        elif action == "stop":
            return resource_agent.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            # sys.exit() instead of the interactive-only 'exit' helper that
            # the site module injects (absent under 'python -S')
            sys.exit(0)
    except Exception as e:
        Log.error(
            f"vip health check failed to perform {action}. Error: {traceback.format_exc()} {e}"
        )
        return const.OCF_ERR_GENERIC
def remove_rule(self, resource: str, event: HEALTH_STATUSES, action: HEALTH_MON_ACTIONS):
    """
    Drop "action" from the rule stored for resource/event in confstore.
    When no action is left for the rule, the rule key itself is deleted.

    Args:
        resource(str): resource name
        event(str): event type
        action(str): action to be removed
    """
    self._validate_action(action)
    key = self._prepare_key(resource, event)
    Log.info(f"Removing rule for key: {key} ,value: {action}")

    kv = self._get_val(key)
    if not kv:
        Log.warn(f"key {key} not found")
        return

    _, actions = self._get_k_v(kv)
    if action not in actions:
        Log.warn(f"KV not found for key: {key}, value: {action}")
        return

    actions.remove(action)
    if actions:
        # Other actions remain: write the shortened list back
        self._confstore.update(key, json.dumps(actions))
        Log.debug(f"KV removed for {key} , {action}")
    else:
        self._confstore.delete(key)
        Log.debug(
            f"key value removed for {key} , {action}. value list empty; deleting key {key}"
        )
def run_cmd(self, cmd, check_error=True):
    """
    Run command and throw error if cmd failed

    Args:
        cmd ([string]): Command to execute on system.
        check_error ([boolean]): When True (default) a non-zero return code
            is treated as a failure; when False it is returned to the caller.

    Raises:
        HACommandTerminated: the command could not be run, or it returned a
            non-zero code while check_error is True.

    Returns:
        tuple: (output, error, return code) of the command.
    """
    _err = ""
    try:
        _proc = SimpleProcess(cmd)
        _output, _err, _rc = _proc.run(universal_newlines=True)
        Log.debug(f"cmd: {cmd}, output: {_output}, err: {_err}, rc: {_rc}")
        if _rc != 0 and check_error:
            Log.error(
                f"cmd: {cmd}, output: {_output}, err: {_err}, rc: {_rc}")
            raise Exception(f"Failed to execute {cmd}")
        return _output, _err, _rc
    except Exception as e:
        failure = f"Failed to execute {cmd} Error: {e} {_err}"
        Log.error(failure)
        # Chain explicitly so the root cause stays attached to the wrapper
        raise HACommandTerminated(failure) from e
def __init__(self, config: str):
    """
    Constructor.

    Args:
        config (str): configuration passed through to the base class.

    Raises:
        OpenldapPROVError: when the base class initialisation fails; the
            original exception is attached as the cause.
    """
    try:
        super(ResetCmd, self).__init__(config)
    except Exception as e:
        Log.debug("Initializing reset phase failed")
        raise OpenldapPROVError(f'exception: {e}') from e
def cleanup_db(self, node, data_only):
    """
    Acknowledge the stored resource entries of a node and, unless only data
    removal was requested, reset its failover state as well.

    Args:
        node ([string]): Node name. None selects every node ("all").
        data_only ([boolean]): Remove data only.

    Action:
        consul data: {'entity': 'enclosure', 'entity_id': '0',
            'component': 'controller', 'component_id': 'node1'}
        if data_only is True then remove data else remove data and
        perform cleanup.
    """
    resources = Conf.get(const.RESOURCE_GLOBAL_INDEX, "resources")
    if node is None:
        node = "all"
    Log.debug(f"Performing cleanup for {node} node")

    for resource_key in resources.keys():
        # "all" matches every key; otherwise the node name must occur in it
        if node == "all" or node in resource_key:
            self._decision_monitor.acknowledge_resource(resource_key, data_only)

    if data_only:
        return
    Log.info(f"Reseting HA decision event for {node}")
    self.reset_failover(node)
def reset_failover(self, node=None, soft_cleanup=False):
    """
    Clean up the pacemaker failcount so that failback becomes possible.

    Args:
        node: node to clean up; None means the whole cluster ("all").
        soft_cleanup: when True the cleanup command only runs if
            is_cleanup_required() reports it is needed, and the failcount
            is logged before and after. When False the cleanup command
            always runs.
    """
    if node is None:
        node = "all"
    cmd = const.PCS_CLEANUP
    if node != "all":
        cmd = const.PCS_CLEANUP + f" --node {node}"

    if not soft_cleanup:
        self._execute.run_cmd(cmd)
    elif self.is_cleanup_required(node):
        _output, _err, _rc = self._execute.run_cmd(
            const.PCS_FAILCOUNT_STATUS)
        Log.info(
            f"Resource failcount before Failback: {_output}, Error:{_err}, RC:{_rc}"
        )
        _output, _err, _rc = self._execute.run_cmd(cmd)
        Log.info(
            f"Failback is happened, Output:{_output}, Error:{_err}, RC:{_rc}"
        )
        _output, _err, _rc = self._execute.run_cmd(
            const.PCS_FAILCOUNT_STATUS)
        Log.info(
            f"Resource failcount after Failback: {_output}, Error:{_err}, RC:{_rc}"
        )
    else:
        Log.debug(
            "cleanup is not required alerts are not yet resolved.")

    # The cluster status is logged whichever path was taken
    Log.debug(f"Status: {self._execute.run_cmd(const.PCS_STATUS)}")
def register_message_type(self, admin_id: str, message_types: list, \
    partitions: int):
    """
    Creates a list of message types and waits until each one is listed.

    Parameters:
    admin_id        A String that represents Admin client ID.
    message_types   This is essentially equivalent to the list of
                    queue/topic name. For e.g. ["Alert"]
    partitions      Integer that represents number of partitions to be
                    created.

    Raises MessageBusError(errno.ETIMEDOUT) when a message type is still
    missing from the metadata after the configured number of retries.
    """
    Log.debug(f"Register message type {message_types} using {admin_id}" \
        f" with {partitions} partitions")
    admin = self._clients['admin'][admin_id]
    topics_to_create = [
        NewTopic(topic_name, num_partitions=partitions)
        for topic_name in message_types
    ]
    created_message_types = admin.create_topics(topics_to_create)
    self._task_status(created_message_types, method='register_message_type')

    for each_message_type in message_types:
        for list_retry in range(1, self._max_list_message_type_count + 2):
            if each_message_type in self._get_metadata(admin).keys():
                break
            if list_retry > self._max_list_message_type_count:
                Log.error(f"MessageBusError: Timed out after retry " \
                    f"{list_retry} while creating message_type " \
                    f"{each_message_type}")
                raise MessageBusError(errno.ETIMEDOUT, \
                    "Timed out after retry %d while creating message_type %s.", \
                    list_retry, each_message_type)
            # The wait grows with every attempt
            time.sleep(list_retry * 1)
def monitor(self, state=const.STATE_RUNNING):
    """
    Monitor hardware and report the resulting OCF code.

    Returns const.OCF_NOT_RUNNING when the resource's init file is absent
    (unless the state is const.STATE_STOP); otherwise the result of
    _monitor_action() for the current node statuses.
    """
    filename, path, service, node = self._get_params()
    Log.debug(f"In monitor for {filename}")

    init_file = const.HA_INIT_DIR + filename
    if state != const.STATE_STOP and not os.path.exists(init_file):
        return const.OCF_NOT_RUNNING

    self_node, other_node, self_node_status, other_node_status = self._get_status(
        self.decision_monitor.get_resource_status, path)
    Log.debug(f"In monitor group key: {path}, node: {self_node} "
              f"status: {self_node_status}, service: {service}")

    # Acknowledge a resolved event that belongs to another, named node
    names_other_node = node not in ('-', self_node)
    if names_other_node and other_node_status == Action.RESOLVED:
        Log.info(f"Ack IEM for {filename} with key {path} node {node}")
        self._acknowledge_event(path + '_' + node)

    status_details = {
        "self_node": self_node,
        "other_node": other_node,
        "self_node_status": self_node_status,
        "other_node_status": other_node_status,
        "filename": filename,
        "path": path,
        "service": service,
    }
    return self._monitor_action(self._acknowledge_event, state, **status_details)
def start(self) -> int:
    """
    Start the systemd service and report the outcome.

    Command to start service:
        $ systemctl reset-failed service
        $ systemctl start service

    Returns:
        int: Return as per service status.
            active: return const.OCF_SUCCESS.
            failed: return const.OCF_ERR_GENERIC.
            anything else: poll the status again every second; there is no
                timeout inside this method.
    """
    service = self._get_systemd_service()
    Log.debug(f"Start: Start {service} service")
    for systemctl_cmd in (f"systemctl reset-failed {service}",
                          f"systemctl start {service}"):
        self._execute.run_cmd(systemctl_cmd, check_error=False)

    while True:
        Log.debug(f"Start: Starting {service} service")
        status: str = self._get_service_status(service).strip()
        if status == "failed":
            Log.info(
                f"Start: Failed to start {service} and may cause failover or Stop."
            )
            return const.OCF_ERR_GENERIC
        if status == "active":
            break
        time.sleep(1)

    Log.info(f"Start: Started {service} service")
    return const.OCF_SUCCESS
def filter_event(self, msg: str) -> bool:
    """
    Decide whether a cluster resource event needs attention.

    Args:
        msg (str): Msg, a Python-literal representation of the event.

    Returns:
        bool: True when the resource type in the event payload is one of
        the configured CLUSTER resource types, False otherwise.

    Raises:
        EventFilterException: on any failure while evaluating the message.
    """
    try:
        # Round trip through JSON after evaluating the literal
        message = json.loads(json.dumps(ast.literal_eval(msg)))
        Log.debug('Received alert from fault tolerance')

        payload = message.get(EventAttr.EVENT_PAYLOAD.value)
        event_resource_type = payload.get(HealthAttr.RESOURCE_TYPE.value)
        required_resource_type_list = Conf.get(
            const.HA_GLOBAL_INDEX, f"CLUSTER{_DELIM}resource_type")

        if event_resource_type not in required_resource_type_list:
            return False
        Log.info(
            f'This alert needs an attention: resource_type: {event_resource_type}'
        )
        return True
    except Exception as e:
        raise EventFilterException(
            f"Failed to filter cluster resource event. Message: {msg}, Error: {e}"
        )
def run_cmd(self, cmd, check_error=True, secret=None):
    """
    Run command and throw error if cmd failed

    Args:
        cmd ([string]): Command to execute on system.
        check_error ([boolean]): When True (default) a non-zero return code
            is treated as a failure; when False it is returned to the caller.
        secret ([string]): Sensitive text inside cmd; it is replaced by
            "****" in the command shown in logs and error messages.
            None or an empty string disables masking.

    Raises:
        HACommandTerminated: the command could not be run, or it returned a
            non-zero code while check_error is True.

    Returns:
        tuple: (output, error, return code) of the command.
    """
    # Built before the try block so the except handler can always use it.
    # An empty secret is skipped: replacing "" would insert the mask
    # between every character of the command.
    cmd_help = str(cmd).replace(secret, "****") if secret else cmd
    try:
        _proc = SimpleProcess(cmd)
        _output, _err, _rc = _proc.run(universal_newlines=True)
        Log.debug(
            f"cmd: {cmd_help}, output: {_output}, err: {_err}, rc: {_rc}")
        if _rc != 0 and check_error:
            Log.error(
                f"cmd: {cmd_help}, output: {_output}, err: {_err}, rc: {_rc}"
            )
            raise Exception(f"Failed to execute {cmd_help}")
        return _output, _err, _rc
    except Exception as e:
        failure = f"Failed to execute {cmd_help} Error: {e}."
        Log.error(failure)
        # Chain explicitly so the root cause stays attached to the wrapper
        raise HACommandTerminated(failure) from e
async def _exc_components_cmd(commands: List, bundle_id: str, path: str, \
        component: str, node_name: str, comment: str, config_url:str,
        services:str, binlogs:bool, coredumps:bool, stacktrace:bool,
        duration:str, size_limit:str):
    """
    Executes the Command for Bundle Generation of Every Component.

    commands:       Commands of the component               :type:List
    bundle_id:      Unique Bundle ID of the generation process. :type:str
    path:           Path to create the tar by components    :type:str
    component:      Name of Component to be executed        :type:str
    node_name:      Name of Node where the Command is being Executed
                    (not used by this function)             :type:str
    comment:        User Comment (not used by this function) :type:str
    config_url, services, duration, size_limit, binlogs, coredumps,
    stacktrace:     passed to each command as -c, -s, --duration,
                    --size_limit, --binlogs, --coredumps, --stacktrace.

    Returns a tuple (component, return_code). return_code is the first
    non-zero return code among the commands, or 0 when every command
    succeeded or no command was given.
    """
    # Stays 0 for an empty command list, so the return value is always defined
    first_failure_code = 0
    for command in commands:
        # SB Framework will not parse additional filters until all the components
        # accept filters in their respective support bundle scripts.
        cli_cmd = f"{command} -b {bundle_id} -t {path} -c {config_url}"\
            f" -s {services} --duration {duration} --size_limit {size_limit}"\
            f" --binlogs {binlogs} --coredumps {coredumps} --stacktrace {stacktrace}"
        Log.info(f"Executing command -> {cli_cmd}")
        cmd_proc = SimpleProcess(cli_cmd)
        output, err, return_code = cmd_proc.run()
        Log.debug(f"Command Output -> {output} {err}, {return_code}")
        if return_code != 0:
            Log.error(f"Command Output -> {output} {err}, {return_code}")
            # Keep the first failure so a later success cannot hide it
            if first_failure_code == 0:
                first_failure_code = return_code
    return component, first_failure_code
def send(self, producer_id: str, message_type: str, method: str,\
    messages: list, timeout=0.1):
    """
    Sends list of messages to Kafka cluster(s).

    Parameters:
    producer_id     A String that represents Producer client ID.
    message_type    This is essentially equivalent to the
                    queue/topic name. For e.g. "Alert"
    method          "sync" flushes the producer after each message; any
                    other value polls it with the given timeout.
    messages        A list of messages sent to Kafka Message Server
    timeout         Poll timeout used when method is not "sync".

    Raises MessageBusError when the producer registered under producer_id
    is None.
    """
    Log.debug(f"Producer {producer_id} sending list of messages "\
        f"{messages} of message type {message_type} to kafka server"\
        f" with method {method}")
    producer = self._clients['producer'][producer_id]
    if producer is None:
        Log.error(f"MessageBusError: "\
            f"{errors.ERR_SERVICE_NOT_INITIALIZED}. Producer: "\
            f"{producer_id} is not initialized")
        raise MessageBusError(errors.ERR_SERVICE_NOT_INITIALIZED,\
            "Producer %s is not initialized", producer_id)

    for message in messages:
        payload = bytes(message, 'utf-8')
        producer.produce(message_type, payload,\
            callback=self.delivery_callback)
        if method != 'sync':
            producer.poll(timeout=timeout)
        else:
            producer.flush()
    Log.debug("Successfully Sent list of messages to Kafka cluster")
def _validate_kafka_installation():
    """
    Validates kafka is installed and kafka user and group are present.

    The kafka rpm must be installed. If the kafka user or group is missing,
    or the user's gid differs from the group's gid, the user and group are
    created.

    Raises:
        KafkaSetupError: the kafka rpm is missing, or creating the kafka
            user/group failed.
    """
    import errno

    # check kafka package installed
    try:
        PkgV().validate('rpms', ['kafka'])
    except Exception as e:
        Log.error(f"Kafka rpm missing: {e}")
        # Not every exception carries 'rc'; fall back to a generic code
        # instead of failing with AttributeError while reporting the error
        raise KafkaSetupError(getattr(e, 'rc', errno.EINVAL), e) from e

    # check kafka user exists
    try:
        kafka_user = get_user_by_name('kafka')
        kafka_group = get_group_by_name('kafka')
        if kafka_group.gr_gid != kafka_user.pw_gid:
            raise Exception("kafka user gid does not match kafka group gid")
    except Exception as e:
        Log.error(f"Kafka user/group missing: {e}")
        # create kafka user and group
        cmds = [
            "adduser kafka",
            "usermod -aG wheel kafka",
            "groupadd --force kafka",
            "usermod --append --groups kafka kafka"
        ]
        Log.info("Creating Kafka user and group")
        for cmd in cmds:
            _, err, rc = SimpleProcess(cmd).run()
            # rc 9 if kafka user already exists & 12 if kafka user created
            if rc not in (0, 9, 12):
                Log.debug(f"Failed in running command :{cmd}")
                Log.error(f"Failed in creating kafka user/group:{err}")
                raise KafkaSetupError(rc,\
                    "Failed in creating kafka user and group", err)
def deregister_message_type(self, admin_id: str, message_types: list):
    """
    Deletes a list of message types and waits until each one is gone.

    Parameters:
    admin_id        A String that represents Admin client ID.
    message_types   This is essentially equivalent to the list of
                    queue/topic name. For e.g. ["Alert"]

    Raises MessageBusError(errno.ETIMEDOUT) when a message type is still
    listed in the metadata after the configured number of retries.
    """
    Log.debug(f"Deregister message type {message_types} using {admin_id}")
    admin = self._clients['admin'][admin_id]
    deleted_message_types = admin.delete_topics(message_types)
    self._task_status(deleted_message_types,\
        method='deregister_message_type')

    for each_message_type in message_types:
        for list_retry in range(1, self._max_list_message_type_count + 2):
            if each_message_type not in self._get_metadata(admin).keys():
                break
            if list_retry > self._max_list_message_type_count:
                Log.error(f"MessageBusError: Timed out after "\
                    f"{list_retry} retry to delete message_type "\
                    f"{each_message_type}")
                # The original concatenation produced "message_type%s."
                # with no space before the topic name
                raise MessageBusError(errno.ETIMEDOUT,\
                    "Timed out after %d retry to delete message_type %s.",\
                    list_retry, each_message_type)
            # The wait grows with every attempt
            time.sleep(list_retry * 1)
def get_children(self, element: str, element_id: str, **kwargs) -> dict:
    """
    Get children of element.

    The first child component type of the element is looked up in the
    health hierarchy, and the ids that follow that type in the stored
    health keys of the element are collected.

    Args:
        element (str): element (component) name.
        element_id (str): id of the element.
        **kwargs: forwarded to ElementHealthEvaluator.prepare_key().

    Returns:
        dict: Map of children and ids:
            {component: {component_type: [component_ids]}},
            or {} when the element has no child component.
    """
    # TODO: update code to get com_type, currently assuming comp = com_type
    children: list = HealthHierarchy.get_next_components(element)
    if not children:
        return {}
    child_type = children[0]

    key = ElementHealthEvaluator.prepare_key(element, comp_id=element_id, **kwargs)
    key = key.replace("/health", "").replace("/", "", 1)
    # Treat a missing key as "no children" instead of failing on None
    data = self.healthmanager.get_key(key, just_value=False) or {}

    children_ids: list = []
    # Distinct loop names keep the 'element' / 'element_id' arguments intact,
    # so the log below reports the element that was asked for
    for stored_key in data.keys():
        key_parts = stored_key.split("/")
        if child_type not in key_parts:
            continue
        id_index = key_parts.index(child_type) + 1
        if id_index >= len(key_parts):
            # Key ends at the component type; there is no id to collect
            continue
        child_id = key_parts[id_index]
        if child_id not in children_ids:
            children_ids.append(child_id)

    Log.debug(f"Children for {element}:{element_id} are {children_ids}")
    # {component: {component_type: [component_ids]}}
    return {child_type: {child_type: children_ids}}
def parse_event(self, msg: str) -> HealthEvent:
    """
    Convert an alert message into a HealthEvent.

    Args:
        msg (str): Msg, the JSON text of the alert.

    Returns:
        HealthEvent: event built from the sensor response of the alert.

    Raises:
        EventParserException: on any failure while parsing the alert.
    """
    try:
        alert = json.loads(msg).get(ALERT_ATTRIBUTES.MESSAGE)
        sensor_response = alert[ALERT_ATTRIBUTES.SENSOR_RESPONSE_TYPE]

        # Fields taken straight from the sensor response
        event = {
            event_attr.EVENT_ID: sensor_response[ALERT_ATTRIBUTES.ALERT_ID],
            event_attr.EVENT_TYPE: sensor_response[ALERT_ATTRIBUTES.ALERT_TYPE],
            event_attr.SEVERITY: sensor_response[ALERT_ATTRIBUTES.SEVERITY],
        }

        # Remaining fields, mostly from the nested info section
        info = sensor_response[ALERT_ATTRIBUTES.INFO]
        event.update({
            event_attr.SITE_ID: info[ALERT_ATTRIBUTES.SITE_ID],
            event_attr.RACK_ID: info[ALERT_ATTRIBUTES.RACK_ID],
            event_attr.CLUSTER_ID: info[ALERT_ATTRIBUTES.CLUSTER_ID],
            event_attr.STORAGESET_ID: "TBD",
            event_attr.NODE_ID: info[ALERT_ATTRIBUTES.NODE_ID],
            event_attr.HOST_ID: sensor_response[ALERT_ATTRIBUTES.HOST_ID],
            event_attr.RESOURCE_TYPE: info[ALERT_ATTRIBUTES.RESOURCE_TYPE],
            event_attr.TIMESTAMP: info[ALERT_ATTRIBUTES.EVENT_TIME],
            event_attr.RESOURCE_ID: info[ALERT_ATTRIBUTES.RESOURCE_ID],
            event_attr.SPECIFIC_INFO: sensor_response[ALERT_ATTRIBUTES.SPECIFIC_INFO],
        })

        Log.debug(f"Parsed {event} schema")
        health_event = HealthEvent.dict_to_object(event)
        Log.info(f"Event {event[event_attr.EVENT_ID]} is parsed and converted to object.")
        return health_event
    except Exception as e:
        raise EventParserException(f"Failed to parse alert. Message: {msg}, Error: {e}")
def _monitor_action(self, callback_ack, state, **args):
    """
    Return action on status

    Maps the status of the current node to the OCF code reported to
    pacemaker.

    Args:
        callback_ack: callable invoked with "<path>_<node>" to acknowledge
            an event.
        state: resource agent state (const.STATE_START, const.STATE_RUNNING
            or const.STATE_STOP).
        **args: node details keyed by const.CURRENT_NODE_STATUS,
            const.OTHER_NODE_STATUS, const.CURRENT_NODE, const.FILENAME_KEY
            and const.PATH_KEY.

    Returns:
        An OCF code from const. For Action.RESOLVED the return value of
        callback_ack is passed through.
    """
    Log.debug(str(args))
    current_status = args[const.CURRENT_NODE_STATUS]

    if current_status == Action.FAILED and args[const.OTHER_NODE_STATUS] == Action.FAILED:
        # Both nodes failed: report success
        return const.OCF_SUCCESS
    if current_status == Action.FAILED:
        return const.OCF_ERR_GENERIC
    if current_status == Action.OK:
        return const.OCF_SUCCESS
    if current_status == Action.RESOLVED:
        Log.info(f"Ack for {args[const.FILENAME_KEY]} with key {args[const.PATH_KEY]}"
                 f" node {args[const.CURRENT_NODE]}")
        return callback_ack(args[const.PATH_KEY]+'_'+args[const.CURRENT_NODE])
    if current_status == Action.RESTART:
        Log.info(f"Restart action taken for {args[const.FILENAME_KEY]} on {args[const.CURRENT_NODE]}")
        if state == const.STATE_START:
            return const.OCF_SUCCESS
        if state == const.STATE_RUNNING:
            return const.OCF_ERR_GENERIC
        if state == const.STATE_STOP:
            callback_ack(args[const.PATH_KEY]+'_'+args[const.CURRENT_NODE])
            Log.info(f"Ack for {args[const.FILENAME_KEY]} with key {args[const.PATH_KEY]} "
                     f" node {args[const.CURRENT_NODE]}")
        # Always answer with an OCF code. The original had a stray
        # 'return Action.RESTART' here, which handed a status value to a
        # caller that expects an OCF code.
        return const.OCF_SUCCESS

    Log.error(f"Unimplemented value for status {current_status}")
    return const.OCF_ERR_UNIMPLEMENTED
def _exc_components_cmd(commands: List, bundle_id: str, path: str,
                        component: str, node_name: str, comment: str):
    """
    Run every bundle generation command of a component and publish the result.

    :param commands: Commands of the component :type:List
    :param bundle_id: Unique Bundle ID of the generation process. :type:str
    :param path: Path to create the tar by components :type:str
    :param component: Name of Component to be executed :type: str
    :param node_name: Name of Node where the Command is being Executed :type:str
    :param comment: User Comment :type:str
    :return: None
    """
    for command in commands:
        cli_cmd = f"{command} {bundle_id} {path}"
        Log.info(f"Executing command -> {cli_cmd}")
        output, err, return_code = SimpleProcess(cli_cmd).run()
        Log.debug(f"Command Output -> {output} {err}, {return_code}")

        if return_code == 0:
            log_text = f"Bundle generation started for '{component}'"
            log_level = INFO
        else:
            Log.error(f"Command Output -> {output} {err}, {return_code}")
            log_text = f"Bundle generation failed for '{component}'"
            log_level = ERROR
        ComponentsBundle._publish_log(
            log_text, log_level, bundle_id, node_name, comment)
def main(resource, action=''):
    """
    Entry point that dispatches a pacemaker action to a resource agent.

    Args:
        resource: resource agent class; it is instantiated with a
            DecisionMonitor and the resource schema.
        action (str): action requested by pacemaker: 'monitor', 'start',
            'stop' or 'meta-data'.

    Returns:
        The result of the requested action, or const.OCF_ERR_GENERIC when
        the action raises.

    Raises:
        SystemExit: with code 0 after printing usage for an unknown action.
    """
    try:
        # meta-data is answered without loading configuration or logging
        if action == 'meta-data':
            return resource.metadata()
        Conf.load(const.HA_GLOBAL_INDEX, Yaml(const.HA_CONFIG_FILE))
        log_path = Conf.get(const.HA_GLOBAL_INDEX, f"LOG{_DELIM}path")
        log_level = Conf.get(const.HA_GLOBAL_INDEX, f"LOG{_DELIM}level")
        Log.init(service_name='resource_agent', log_path=log_path, level=log_level)
        with open(const.RESOURCE_SCHEMA, 'r') as f:
            resource_schema = json.load(f)
        os.makedirs(const.RA_LOG_DIR, exist_ok=True)
        resource_agent = resource(DecisionMonitor(), resource_schema)
        Log.debug(f"{resource_agent} initialized for action {action}")
        if action == 'monitor':
            return resource_agent.monitor()
        elif action == 'start':
            return resource_agent.start()
        elif action == 'stop':
            return resource_agent.stop()
        else:
            print('Usage %s [monitor] [start] [stop] [meta-data]' % sys.argv[0])
            # sys.exit() instead of the interactive-only 'exit' helper that
            # the site module injects (absent under 'python -S')
            sys.exit(0)
    except Exception:
        Log.error(f"{traceback.format_exc()}")
        return const.OCF_ERR_GENERIC
def process_message(self, message: bytes):
    """
    Callback method for MessageConsumer

    Hands the decoded message to EventAnalyzer and tells the consumer
    whether the message is acknowledged.

    Args:
        message (bytes): UTF-8 encoded message from the message bus.

    Returns:
        CONSUMER_STATUS: FAILED for a SubscriberException, so the message
        is not acknowledged and is retried; SUCCESS in every other case,
        including errors that are only logged.
    """
    Log.debug(f'Received the message from message bus: {message}')
    try:
        EventAnalyzer(message.decode('utf-8'))
        return CONSUMER_STATUS.SUCCESS
    except ConsulException as e:
        Log.error(f"consule exception {e} {traceback.format_exc()} for {message}. Ack Message.")
        return CONSUMER_STATUS.SUCCESS
    except ConfError as e:
        Log.error(f"config exception {e} {traceback.format_exc()} for {message}. Ack Message.")
        return CONSUMER_STATUS.SUCCESS
    except EventFilterException as e:
        Log.error(f"Filter exception {e} {traceback.format_exc()} for {message}. Ack Message.")
        return CONSUMER_STATUS.SUCCESS
    except EventParserException as e:
        Log.error(f"Parser exception {e} {traceback.format_exc()} for {message}. Ack Message.")
        return CONSUMER_STATUS.SUCCESS
    except SubscriberException as e:
        Log.error(f"Subscriber exception {e} {traceback.format_exc()} for {message}, retry without ack.")
        # The log promises a retry without ack; returning SUCCESS here
        # would acknowledge the message instead
        return CONSUMER_STATUS.FAILED
    except Exception as e:
        Log.error(f"Unknown Exception caught {e} {traceback.format_exc()}")
        Log.error(f"Forcefully ack as success. msg: {message}")
        return CONSUMER_STATUS.SUCCESS
def main(resource: DynamicFidServiceRA, action: str = '') -> int:
    """
    Main function acts as switch case for DynamicFidServiceRA resource agent.

    Args:
        resource (DynamicFidServiceRA): Resource agent (class or instance)
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Provide output as int code provided by pacemaker.
            const.OCF_ERR_GENERIC is returned when the action raises.

    Raises:
        SystemExit: with code 0 after printing usage for an unknown action.
    """
    try:
        # meta-data is answered without initialising configuration
        if action == "meta-data":
            return resource.metadata()
        ConfigManager.init("resource_agent")
        # 'resource_agent' was used below without ever being defined, so
        # every action ended in NameError. Accept both a resource agent
        # class (instantiated here) and a ready instance.
        resource_agent = resource() if isinstance(resource, type) else resource
        Log.debug(f"{resource_agent} initialized for action {action}")
        if action == "monitor":
            return resource_agent.monitor()
        elif action == "start":
            return resource_agent.start()
        elif action == "stop":
            return resource_agent.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            # sys.exit() instead of the interactive-only 'exit' helper that
            # the site module injects (absent under 'python -S')
            sys.exit(0)
    except Exception as e:
        Log.error(
            f"systemd_fid_wrapper_ra failed to perform {action}. Error: {e}")
        return const.OCF_ERR_GENERIC
def remove_node(self, node):
    """
    Remove a node from the pcs cluster and verify that it is gone.

    Nothing is removed when node_status() already reports rc 1 for the
    node. The cluster status is logged before and after.

    Raises:
        Exception: the node is still reported after the remove command.
    """
    # TODO: Limitation for node remove (in cluster node cannot remove it self)
    _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
    Log.info(
        f"Cluster status output before remove node: {_output}, {_err}, {_rc}"
    )

    # Check if node already removed
    _rc, status = self.node_status(node)
    if _rc == 1:
        Log.info(f"Node {node} already removed from cluster")
    else:
        self._execute.run_cmd(f"pcs cluster node remove {node} --force")
        _rc, status = self.node_status(node)
        Log.debug(f"For node {node} status: {status}, rc: {_rc}")
        if _rc != 1:
            Log.error(f"Failed to remove {node}")
            raise Exception(f"Failed to remove {node}")
        Log.info(f"Node {node} removed from cluster")

    _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
    Log.info(
        f"Cluster status output after remove node: {_output}, {_err}, {_rc}"
    )
def _load_rules(self):
    """
    Read the json structured rule data from the rule file and keep the
    loaded result in self._rules_schema.

    Nothing is loaded when no rule file is configured or the file is
    empty. OS errors while reading are logged, not raised. The method
    itself always returns None.
    """
    if self._rule_file is None:
        return None
    try:
        Log.debug(
            f"Loading rules json into memory. File: {self._rule_file}")
        with open(self._rule_file, 'r') as fp:
            rules_data = fp.read()
        if rules_data:
            self._rules_schema = JsonMessage(rules_data).load()
    except OSError as os_error:
        # Pick the log text that matches the errno, with a generic fallback
        known_failures = {
            errno.ENOENT: f'File {self._rule_file} does not exist',
            errno.EACCES:
                f'Not enough permission to read {self._rule_file} file',
        }
        Log.error(known_failures.get(
            os_error.errno,
            f'Error while reading from file {self._rule_file}'))
def parse_event(self, msg: str) -> HealthEvent:
    """
    Convert a cluster resource alert into a HealthEvent.

    Args:
        msg (str): Msg, a Python-literal representation of the alert.

    Returns:
        HealthEvent: event whose id is the current epoch seconds followed
        by a random uuid hex; the other fields come from the alert and
        from this parser's site, rack and cluster ids.

    Raises:
        EventParserException: on any failure while parsing the alert.
    """
    try:
        # Round trip through JSON after evaluating the literal
        cluster_resource_alert = json.loads(json.dumps(ast.literal_eval(msg)))

        event_id = str(int(time.time())) + str(uuid.uuid4().hex)
        node_id = cluster_resource_alert["_resource_name"]
        resource_type = cluster_resource_alert["_resource_type"]
        event_type = cluster_resource_alert["_event_type"]
        alert_timestamp = cluster_resource_alert["_timestamp"]
        generation_id = cluster_resource_alert["_generation_id"]

        specific_info = {
            "generation_id": generation_id,
            "pod_restart": 0
        }
        event = {
            EVENT_ATTRIBUTES.EVENT_ID: event_id,
            EVENT_ATTRIBUTES.EVENT_TYPE: event_type,
            EVENT_ATTRIBUTES.SEVERITY: StatusMapper.EVENT_TO_SEVERITY_MAPPING[event_type],
            # TODO: site, rack and cluster ids should be fetched from confstore
            EVENT_ATTRIBUTES.SITE_ID: self.site_id,
            EVENT_ATTRIBUTES.RACK_ID: self.rack_id,
            EVENT_ATTRIBUTES.CLUSTER_ID: self.cluster_id,
            # The resource name fills every node-level identifier
            EVENT_ATTRIBUTES.STORAGESET_ID: node_id,
            EVENT_ATTRIBUTES.NODE_ID: node_id,
            EVENT_ATTRIBUTES.HOST_ID: node_id,
            EVENT_ATTRIBUTES.RESOURCE_TYPE: resource_type,
            EVENT_ATTRIBUTES.TIMESTAMP: alert_timestamp,
            EVENT_ATTRIBUTES.RESOURCE_ID: node_id,
            EVENT_ATTRIBUTES.SPECIFIC_INFO: specific_info,
        }

        Log.debug(f"Parsed {event} schema")
        health_event = HealthEvent.dict_to_object(event)
        Log.debug(
            f"Event {event[EVENT_ATTRIBUTES.EVENT_ID]} is parsed and converted to object."
        )
        return health_event
    except Exception as err:
        raise EventParserException(
            f"Failed to parse cluster resource alert. Message: {msg}, Error: {err}"
        )