def remove_config_files(self):
    """ Remove files created by HA. """
    files = [const.CONFIG_DIR]
    for file in files:
        CleanupCmd.remove_file(file)
    Log.info("All the config files are removed")
def __init__(self):
    """ Init method. """
    super(ClusterResourceParser, self).__init__()
    self.cluster_id = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}cluster_id")
    self.site_id = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}site_id")
    self.rack_id = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}rack_id")
    Log.info("ClusterResource Parser is initialized ...")
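# Illustrative ha.conf shape (assumed, keys per the Conf.get calls above)
# backing the cluster_id/site_id/rack_id lookups; values are placeholders:
#
#   COMMON_CONFIG:
#     cluster_id: "<cluster-uuid>"
#     site_id: "1"
#     rack_id: "1"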
def join(self):
    """ Blocking call; joins the message bus consumer thread. """
    if self.consumer_thread is not None:
        Log.info(f"waiting for {self.name} to exit...")
        # Wait for the consumer thread to stop
        self.consumer_thread.join()
    Log.info(f"The daemon {self.name} is stopped successfully.")
def set_sigterm(self, signum, frame):
    """ Callback function to receive a signal. """
    Log.info(f"Received SIGTERM: {signum}")
    Log.debug(
        f"Stopping the Health Monitor as signal {signum} was received during execution of frame: {frame}"
    )
    self._event_consumer.stop(flush=True)
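# A minimal registration sketch (assumption: the surrounding class is the
# health monitor daemon; the name HealthMonitor is illustrative):
#
#   import signal
#   monitor = HealthMonitor()
#   signal.signal(signal.SIGTERM, monitor.set_sigterm)
#
# On `kill -TERM <pid>` the handler above runs, flushing and stopping the
# event consumer before the process exits.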
def _get_components(components):
    """Get components to generate the support bundle for."""
    if components and "all" not in components:
        Log.info(f"Generating bundle for {' '.join(components)}")
        shell_args = ' '.join(components)
    else:
        Log.info("Generating bundle for all CORTX components.")
        shell_args = "all"
    return f" -c {shell_args}"
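# Behavior sketch for _get_components (argument values are illustrative):
#
#   _get_components(["s3", "motr"])  ->  " -c s3 motr"
#   _get_components(["all"])         ->  " -c all"
#   _get_components(None)            ->  " -c all"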
def test(self):
    """ Perform configuration testing. Raises exception on error. """
    Log.info("Test starting...")
    unittest.TextTestRunner().run(
        unittest.TestLoader().loadTestsFromTestCase(self.get_test_module()))
    Log.info("Test done.")
    return 0
def delete_alert(self):
    """ Delete alert on the current node. """
    if not self.is_alert_exists():
        return
    Log.info("Deleting pacemaker alert ...")
    self._process.run_cmd(f"pcs alert remove {AlertConfig.ALERT_ID}")
    Log.info(f"Alert {AlertConfig.ALERT_ID} is deleted")
def monitor_stack(cib_xml, push=False, **kwargs):
    """Configure the monitor stack."""
    Log.info("HA Rules: ******* monitor_group *********")
    for create_resource in monitor_config:
        Log.info(f"HA Rules: Configure {str(create_resource)}")
        create_resource(cib_xml, push, **kwargs)
    process.run_cmd(f"pcs -f {cib_xml} resource clone monitor_group")
    if push:
        cib_push(cib_xml)
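# Hypothetical invocation sketch: build the monitor stack into a working
# CIB file and push it to the live cluster (file path is illustrative):
#
#   monitor_stack("/tmp/cib_cortx.xml", push=True)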
def set_sigterm(self, signum, frame):
    """ Callback for signal.signal through which the monitor is notified of SIGTERM. """
    Log.info(f"{self.name} Received signal: {signum}")
    Log.debug(
        f"{self.name} Received signal: {signum} during execution of frame: {frame}"
    )
    self._sigterm_received.set()
def __init__(self):
    try:
        self.provisioner = provisioner
        Log.info("Provisioner plugin is loaded")
        self.provisioner.auth_init(username="******", password="******", eauth="pam")
    except Exception as error:
        self.provisioner = None
        Log.error(f"Provisioner module not found: {error}")
def set_sigterm(self, signum, frame):
    """ Callback function to receive a signal. """
    Log.info(f"Received SIGTERM {signum}")
    Log.debug(
        f"Stopping the Fault Tolerance Monitor as signal {signum} was received during execution of frame: {frame}"
    )
    self.node_fault_monitor.stop(flush=True)
    self.cluster_stop_monitor.stop(flush=True)
def _configure_rsyslog():
    """ Restart the rsyslog service so the support bundle rsyslog config takes effect. """
    try:
        Log.info("Restarting rsyslog service")
        service_obj = Service("rsyslog.service")
        service_obj.restart()
    except Exception as e:
        Log.warn(f"Error in rsyslog service restart: {e}")
def init(self):
    """ Initialize EventAnalyzerService. """
    ConfigManager.init("event_analyzerd")
    Log.info("Event analyzer daemon initialization...")
    # Initialize system health
    confstore = ConfigManager.get_confstore()
    system_health = SystemHealth(confstore)
    # Initialize watchers
    self._watcher_list: dict = self._initalize_watcher(system_health)
def _destroy_cluster(self, node_name: str):
    """
    Destroy cluster on the current node.

    Args:
        node_name (str): Node name
    """
    Log.info(f"Destroying the cluster on {node_name}.")
    self._execute.run_cmd(const.PCS_CLUSTER_KILL)
    output = self._execute.run_cmd(const.PCS_CLUSTER_DESTROY)
    Log.info(f"Cluster destroyed. Output: {output}")
def _ack_resource(self, status_list):
    """
    Ack resources that are already resolved.

    Args:
        status_list (dict): Resource and its status.
    """
    for event in status_list.keys():
        if status_list[event] == Action.RESOLVED:
            Log.info(f"Ack of {event} event as this alert is resolved")
            self._decision_monitor.acknowledge_resource(event)
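# Illustrative shape of status_list (keys are hypothetical resource events):
#
#   {"io_failure_srvnode-1": Action.RESOLVED,
#    "mgmt_failure_srvnode-2": Action.FAILED}
#
# Only the RESOLVED entries are acknowledged with the decision monitor.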
def __init__(self):
    """ Init method. """
    super().__init__()
    # Get filter type and resource types list from the alert monitor rule file
    self.alert_filter_components = Conf.get(
        ALERT_EVENT_INDEX, ALERTS.PK_ALERT_EVENT_COMPONENTS)
    self.alert_filter_modules = Conf.get(
        ALERT_EVENT_INDEX, ALERTS.PK_ALERT_EVENT_COMPONENT_MODULES)
    self.alert_filter_module_operations = Conf.get(
        ALERT_EVENT_INDEX, ALERTS.PK_ALERT_EVENT_OPERATIONS)
    Log.info("AlertEventFilter initialized.")
def is_alert_exists(self) -> bool:
    """ Check if the alert already exists. """
    Log.info("Checking if pacemaker alert already exists ...")
    output, _, rc = self._process.run_cmd("pcs alert")
    if rc != 0:
        raise AlertConfigError("Failed to execute pcs alert.")
    for line in output.split("\n"):
        if AlertConfig.ALERT_ID in line:
            return True
    return False
def run(self):
    """ Run server. """
    for watcher in self._watcher_list.keys():
        Log.info(f"Starting watcher {watcher} service for event analyzer.")
        self._watcher_list[watcher].start()
    Log.info(
        f"Running the daemon for HA event analyzer with PID {os.getpid()}..."
    )
    while True:
        time.sleep(600)
def _main() -> None:
    args = _parse_arguments()
    Log.init(service_name="pre_disruptive_upgrade", log_path=RA_LOG_DIR, level="INFO")
    Log.info("Script invoked as executable with params: {}".format(vars(args)))
    check_cluster_health()
    if args.backup_consul:
        backup_consul()
    backup_configuration()
    cluster_standby_mode()
    delete_resources()
def __init__(self, wait_time=10):
    """
    Init method.

    Create monitor objects and set the callbacks to sigterm.
    """
    try:
        # Set sigterm handler
        signal.signal(signal.SIGTERM, self.set_sigterm)
        # Read I/O pod selector label from ha.conf. Will be received from provisioner confstore;
        # provisioner needs to be informed to add it in confstore (to be added there)
        ConfigManager.init("k8s_resource_monitor")
        _conf_stor_search = ConftStoreSearch()
        self.monitors = []
        # Event output in pretty format
        kwargs = {K8SClientConst.PRETTY: True}
        # Setting a timeout value, 'timeout_seconds', for the stream: the
        # timeout value for the connection to the server. If it is not set,
        # we cannot stop immediately, because the synchronous function
        # watch.stream() does not return until it catches an event it is
        # waiting on.
        kwargs[K8SClientConst.TIMEOUT_SECONDS] = K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT
        # Get MessageBus producer object for all monitor threads
        producer = self._get_producer()
        # Change to multiprocessing
        # Creating NODE monitor object
        node_monitor = ObjectMonitor(producer, K8SClientConst.NODE, **kwargs)
        self.monitors.append(node_monitor)
        _, nodes_list = _conf_stor_search.get_cluster_cardinality()
        if not nodes_list:
            Log.warn(f"No nodes in the cluster to watch for nodes_list: {nodes_list}")
        else:
            Log.info(f"Starting watch for: nodes_list: {nodes_list}")
            watcher_node_ids = ', '.join(node_id for node_id in nodes_list)
            kwargs[K8SClientConst.LABEL_SELECTOR] = \
                f'cortx.io/machine-id in ({watcher_node_ids})'
        # Creating POD monitor object
        pod_monitor = ObjectMonitor(producer, K8SClientConst.POD, **kwargs)
        self.monitors.append(pod_monitor)
    except Exception as err:
        Log.error(f'Monitor failed to start watchers: {err}')
def _get_producer(self):
    """ Get message bus producer. """
    message_type = Conf.get(const.HA_GLOBAL_INDEX, f"MONITOR{_DELIM}message_type")
    producer_id = Conf.get(const.HA_GLOBAL_INDEX, f"MONITOR{_DELIM}producer_id")
    MessageBus.init()
    Log.info(f"Getting producer {producer_id} for message type: {message_type}")
    return MessageBus.get_producer(producer_id, message_type)
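# Usage sketch (hedged): the producer returned above is shared by the
# monitor threads to publish health events, roughly
#
#   producer = self._get_producer()
#   producer.publish(event_json)   # exact publish/send method name depends
#                                  # on the MessageBus wrapper in use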
def config(self):
    """ Perform configuration. Raises exception on error. """
    try:
        # Create a backup of the elasticsearch config file, or restore the
        # config from an existing backup.
        if not os.path.exists(f'{self.elasticsearch_file_path}/elasticsearch.yml.bkp'):
            shutil.copyfile(
                self.elasticsearch_config_path,
                f'{self.elasticsearch_file_path}/elasticsearch.yml.bkp')
        else:
            shutil.copyfile(
                f'{self.elasticsearch_file_path}/elasticsearch.yml.bkp',
                self.elasticsearch_config_path)
        # Get config entries that need to be added to elasticsearch.yml
        config_to_add = self.get_config_entries()
        with open(self.elasticsearch_config_path, "a+") as f:
            for line in config_to_add:
                f.write(f'\n{line}')
        # Load the omelasticsearch module in rsyslog.
        file_contents = Elasticsearch.read_file_contents(self.rsyslog_conf)
        insert_line = 'module(load="omelasticsearch")'
        if not any(insert_line in i for i in file_contents):
            with open(self.rsyslog_conf, "w") as f:
                for line in file_contents:
                    if line == '\n':
                        continue
                    f.write(f'\n{line}')
                    if line.strip('\n') == "#### MODULES ####":
                        f.write(f'\n{insert_line}\n')
        try:
            service_obj = Service('rsyslog.service')
            service_obj.restart()
        except ServiceError as e:
            msg = f"Restarting rsyslog.service failed due to error, {e}."
            Log.error(msg)
    except (Exception, OSError) as e:
        msg = f"Failed in config stage due to error {e}."
        Log.error(msg)
        raise
    Log.info("Config done.")
    return 0
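# Illustrative effect on the rsyslog config (marker text comes from the
# code above; surrounding file content is assumed): after the
# "#### MODULES ####" line, rsyslog.conf gains
#
#   module(load="omelasticsearch")
#
# which lets rsyslog forward log records to Elasticsearch.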
def _check_for_any_resource_presence() -> None:
    '''
    Check if any resources are already present in the cluster.
    If yes, the pre-upgrade steps failed; hence exit.
    '''
    Log.info('Check for any resource presence in a cluster')
    root = _get_cib_xml()
    resource_list = [e.attrib["id"] for e in root.findall(".//lrm_resource")
                     if "id" in e.attrib]
    if resource_list:
        raise UpgradeError('Some resources are already present in the cluster. '
                           'Perform Upgrade process again')
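# Self-contained illustration of the lrm_resource id extraction above,
# using a trimmed CIB fragment (XML structure assumed for illustration):
import xml.etree.ElementTree as ET

_cib = ET.fromstring(
    '<cib><status><lrm_resources>'
    '<lrm_resource id="motr-ios-1"/>'
    '<lrm_resource id="s3server-1"/>'
    '</lrm_resources></status></cib>')
assert [e.attrib["id"] for e in _cib.findall(".//lrm_resource")
        if "id" in e.attrib] == ["motr-ios-1", "s3server-1"]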
def check_for_signals(self, k8s_watch: watch.Watch, k8s_watch_stream):
    """
    Check for any pending signal while watching on kubernetes.watch.stream synchronously.

    Note: currently only the SIGTERM signal is handled, but if required this
    function can be used to convey messages/events/signals to be handled
    while looping over the synchronous watch.stream() call.
    """
    # SIGTERM signal
    if self._sigterm_received.is_set():
        Log.info(f"{self.name} Handling SIGTERM signal.")
        self.handle_sigterm(k8s_watch, k8s_watch_stream)
        # Clear the flag once handled
        self._sigterm_received.clear()
def stop(self):
    """ Stop monitoring hardware. """
    filename, path, service, node = self._get_params()
    Log.debug(f"In stop for {filename}")
    if self.monitor(state=const.STATE_STOP) == Action.RESTART:
        Log.info(f"Restarting {filename} resource")
    if os.path.exists(const.HA_INIT_DIR + filename):
        os.remove(const.HA_INIT_DIR + filename)
    Log.debug(f"Stopping {filename} resource")
    Log.debug(f"Stopped {filename} resource, returning success")
    return const.OCF_SUCCESS
def __init__(self):
    app = web.Application()
    from cortx.utils.iem_framework import IemRequestHandler
    from cortx.utils.message_bus import MessageBusRequestHandler
    app.add_routes([
        web.post('/EventMessage/event', IemRequestHandler.send),
        web.get('/EventMessage/event', IemRequestHandler.receive),
        web.post('/MessageBus/message/{message_type}', MessageBusRequestHandler.send),
        web.get('/MessageBus/message/{message_type}', MessageBusRequestHandler.receive)])
    Log.info("Starting Message Server 127.0.0.1 on port 28300")
    web.run_app(app, host='127.0.0.1', port=28300)
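# Illustrative interaction with the endpoints registered above (payload
# shapes are assumptions; only the routes come from the code):
#
#   curl -X POST http://127.0.0.1:28300/MessageBus/message/<message_type> -d '{...}'
#   curl http://127.0.0.1:28300/MessageBus/message/<message_type>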
def on_failure(self, event: HealthEvent, publish: bool) -> None:
    """
    Handle a failure event.

    Args:
        event (HealthEvent): HealthEvent object
        publish (bool): whether to publish the event

    Returns:
        None
    """
    Log.info("Handling disk failure event.")
    if publish:
        self.publish_event(event)
def init(cls, component: str, source: str):
    """
    Set the Event Message context.

    Parameters:
        component       Component that generates the IEM, e.g. 'S3', 'SSPL'.
        source          Single character that indicates the type of component,
                        e.g. H - Hardware, S - Software, F - Firmware, O - OS.
    """
    cls._component = component
    cls._source = source
    Conf.load('config_file', 'json:///etc/cortx/cortx.conf', skip_reload=True)
    # If Log.logger is already initialized by some parent process, the same
    # file will be used to log all the messagebus related logs; else the
    # standard iem.log will be used.
    if not Log.logger:
        LOG_DIR = '/var/log'
        iem_log_dir = os.path.join(LOG_DIR, 'cortx/utils/iem')
        log_level = Conf.get('config_file', 'utils>log_level', 'INFO')
        Log.init('iem', iem_log_dir, level=log_level,
                 backup_count=5, file_size_in_mb=5)
    try:
        Conf.load('cluster', cls._conf_file, skip_reload=True)
        ids = Conf.get('cluster', 'server_node')
        cls._site_id = ids['site_id']
        cls._rack_id = ids['rack_id']
        cls._node_id = ids['node_id']
        cls._cluster_id = ids['cluster_id']
    except Exception as e:
        Log.error("Invalid config in %s." % cls._conf_file)
        raise EventMessageError(errno.EINVAL, "Invalid config in %s. %s",
                                cls._conf_file, e)
    if cls._component is None:
        Log.error("Invalid component type: %s" % cls._component)
        raise EventMessageError(errno.EINVAL, "Invalid component type: %s",
                                cls._component)
    if cls._source not in cls._SOURCE.keys():
        Log.error("Invalid source type: %s" % cls._source)
        raise EventMessageError(errno.EINVAL, "Invalid source type: %s",
                                cls._source)
    cls._producer = MessageProducer(producer_id='event_producer',
                                    message_type='IEM', method='sync')
    Log.info("IEM Producer initialized for component %s and source %s" %
             (cls._component, cls._source))
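# Usage sketch (assumption: this is the classmethod EventMessage.init from
# the cortx-utils iem_framework):
#
#   EventMessage.init(component='S3', source='S')
#
# After init, IEMs for the S3 software component are produced on the 'IEM'
# message type through cls._producer.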
async def get_active_nodes():
    """
    Read the node name to hostname mapping from the cluster config.

    :return: node_hostname_map : dict mapping node name to hostname, or a
             (Response, None) tuple when no mapping is found.
    """
    Log.info("Reading hostnames, node_list information")
    Conf.load('cortx_cluster', 'json:///etc/cortx/cluster.conf')
    node_hostname_map = Conf.get('cortx_cluster', 'cluster')
    if not node_hostname_map:
        response_msg = "Node list and hostname not found."
        return Response(output=response_msg, rc=errno.ENODATA), None
    return node_hostname_map
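# Illustrative /etc/cortx/cluster.conf content (shape assumed) under which
# node_hostname_map becomes a {node_name: hostname} dict:
#
#   {"cluster": {"srvnode-1": "host-1.example.com",
#                "srvnode-2": "host-2.example.com"}}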
def process_alert(self):
    Log.debug("Processing event for NodeAlertMonitor")
    # Environment variables are available in self.crm_env
    self.iem = IemGenerator()
    # Get online node ids from corosync.
    nodes_ids = self._get_online_nodes()
    local_node_id, local_node_name = self._get_local_node()
    # Generate and send IEM only through the highest online node in the cluster.
    if nodes_ids[-1].strip() == local_node_id.strip():
        self.iem.generate_iem(self.crm_env["CRM_alert_node"],
                              self.alert_event_module, self.alert_event_type)
        Log.info(f"Sent IEM alert from the node - name: {local_node_name}, id: {local_node_id}")
    else:
        Log.debug(
            f"This node does not have the highest id. Local node id: {local_node_id}, all nodes: {sorted(nodes_ids)}.")
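# The "highest online node" election above reduces to a simple comparison
# (values illustrative; the code assumes nodes_ids is in ascending order):
#
#   nodes_ids = ["1", "2", "3"]          # online corosync node ids
#   local_node_id = "3"
#   nodes_ids[-1].strip() == local_node_id.strip()   # True -> send the IEM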