Exemple #1
0
    def remove_config_files(self):
        """
        Delete every configuration path that HA created.

        Each path is handed to ``CleanupCmd.remove_file``; one summary line is
        logged after all paths have been processed.
        """
        for config_path in (const.CONFIG_DIR,):
            CleanupCmd.remove_file(config_path)
        Log.info("All the config files are removed")
Exemple #2
0
 def __init__(self):
     """
     Initialise the parser and cache the cluster, site and rack ids.

     The three ids are read from the COMMON_CONFIG section of the HA
     global configuration index.
     """
     super(ClusterResourceParser, self).__init__()
     for field in ("cluster_id", "site_id", "rack_id"):
         value = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}{field}")
         setattr(self, field, value)
     Log.info("ClusterResource Parser is initialized ...")
Exemple #3
0
 def join(self):
     """
     Block until the message bus consumer thread has finished.

     Returns immediately when no consumer thread is set.
     """
     if self.consumer_thread is None:
         return
     Log.info(f"waiting for {self.name} to exit...")
     # Blocks here until the consumer thread terminates.
     self.consumer_thread.join()
     Log.info(f"The daemon {self.name} is stopped successfully.")
 def set_sigterm(self, signum, frame):
     """
     Signal callback: log the signal and stop the event consumer.

     Args:
         signum: Number of the received signal.
         frame: Frame object passed to the callback; only logged.
     """
     Log.info(f"Received SIGTERM: {signum}")
     debug_msg = f"Stopping the Health Monitor as received a signal: {signum} during execution of frame: {frame}"
     Log.debug(debug_msg)
     self._event_consumer.stop(flush=True)
Exemple #5
0
 def _get_components(components):
     """Build the ``-c`` argument naming the components to bundle.

     Returns " -c all" when ``components`` is empty or contains "all";
     otherwise " -c " followed by the space-separated component names.
     """
     if not components or "all" in components:
         Log.info("Generating bundle for all CORTX components.")
         return " -c all"
     joined = ' '.join(components)
     Log.info(f"Generating bundle for  {joined}")
     return f" -c {joined}"
Exemple #6
0
    def test(self):
        """Run the unit tests of the test case given by ``get_test_module()``.

        The text runner reports the results; this method does not inspect
        them.

        Returns:
            int: always 0.
        """
        Log.info("Test starting...")
        runner = unittest.TextTestRunner()
        loader = unittest.TestLoader()
        suite = loader.loadTestsFromTestCase(self.get_test_module())
        runner.run(suite)
        Log.info("Test done.")
        return 0
Exemple #7
0
 def delete_alert(self):
     """
     Remove the pacemaker alert from the current node.

     Nothing is done when the alert is not configured.
     """
     if self.is_alert_exists():
         Log.info("Deleating pacemaker alert ...")
         remove_cmd = f"pcs alert remove {AlertConfig.ALERT_ID}"
         self._process.run_cmd(remove_cmd)
         Log.info(f"Alert {AlertConfig.ALERT_ID} is deleted")
Exemple #8
0
def monitor_stack(cib_xml, push=False, **kwargs):
    """Configure the monitor stack and clone the monitor group.

    Every callable in ``monitor_config`` is invoked with ``cib_xml``,
    ``push`` and the extra keyword arguments; afterwards ``monitor_group``
    is cloned in the CIB file and, when ``push`` is true, the file is pushed.
    """
    Log.info("HA Rules: ******* monitor_group *********")
    for configure in monitor_config:
        Log.info(f"HA Rules: Configure {configure!s}")
        configure(cib_xml, push, **kwargs)
    clone_cmd = f"pcs -f {cib_xml} resource clone monitor_group"
    process.run_cmd(clone_cmd)
    if not push:
        return
    cib_push(cib_xml)
 def set_sigterm(self, signum, frame):
     """
     Callback for signal.signal that records a received signal.

     The signal is logged and the ``_sigterm_received`` flag is set; this
     method performs no other handling itself.
     """
     Log.info(f"{self.name} Received signal: {signum}")
     debug_msg = f"{self.name} Received signal: {signum} during execution of frame: {frame}"
     Log.debug(debug_msg)
     self._sigterm_received.set()
Exemple #10
0
 def __init__(self):
     """
     Bind the provisioner module and authenticate against it.

     On any error ``self.provisioner`` is reset to None and the error is
     logged instead of being raised.
     """
     try:
         self.provisioner = provisioner
         Log.info("Provisioner plugin is loaded")
         credentials = {
             "username": "******",
             "password": "******",
             "eauth": "pam",
         }
         self.provisioner.auth_init(**credentials)
     except Exception as error:
         self.provisioner = None
         Log.error(f"Provisioner module not found : {error}")
 def set_sigterm(self, signum, frame):
     """
     Signal callback: log the signal and stop both monitors.

     The node fault monitor is stopped first, then the cluster stop
     monitor, each with ``flush=True``.
     """
     Log.info(f"Received SIGTERM {signum}")
     debug_msg = f"Stopping the Fault Tolerance Monitor received a signal: {signum} during execution of frame: {frame}"
     Log.debug(debug_msg)
     for monitor in (self.node_fault_monitor, self.cluster_stop_monitor):
         monitor.stop(flush=True)
Exemple #12
0
 def _configure_rsyslog():
     """
     Restart the rsyslog service so the support bundle rsyslog config applies.

     Any failure is logged as a warning and not propagated.
     """
     try:
         Log.info("Restarting rsyslog service")
         Service("rsyslog.service").restart()
     except Exception as restart_error:
         Log.warn(f"Error in rsyslog service restart: {restart_error}")
Exemple #13
0
 def init(self):
     """
     Initialize EventAnalyzerService.

     Initialises the configuration for "event_analyzerd", builds the system
     health object from the confstore and creates the watchers from it.
     """
     ConfigManager.init("event_analyzerd")
     Log.info("Event analyzer daemon initializations...")
     # System health is built on top of the confstore.
     health = SystemHealth(ConfigManager.get_confstore())
     # Watchers are created from the system health object.
     self._watcher_list: dict = self._initalize_watcher(health)
Exemple #14
0
    def _destroy_cluster(self, node_name: str):
        """
        Destroy Cluster on current node.

        Runs the cluster kill command followed by the cluster destroy
        command; only the destroy command's result is logged.

        Args:
            node_name (str): Node name; used in the log message only.
        """
        Log.info(f"Destroying the cluster on {node_name}.")
        # The kill result was never read (it was overwritten right away),
        # so it is no longer captured.
        self._execute.run_cmd(const.PCS_CLUSTER_KILL)
        output = self._execute.run_cmd(const.PCS_CLUSTER_DESTROY)
        Log.info(f"Cluster destroyed. Output: {output}")
Exemple #15
0
    def _ack_resource(self, status_list):
        """
        Ack resource which are already resolved

        Args:
            status_list ([dir]): Resource and its status.
        """
        for event in status_list.keys():
            if status_list[event] == Action.RESOLVED:
                Log.info(f"Ack of {event} event as this alert is resolved")
                self._decision_monitor.acknowledge_resource(event)
Exemple #16
0
    def __init__(self):
        """
        Load the alert filter settings from the alert event configuration.
        """
        super().__init__()

        # Components, modules and operations used for filtering, all read
        # from the ALERT_EVENT_INDEX configuration index.
        index = ALERT_EVENT_INDEX
        self.alert_filter_components = Conf.get(
            index, ALERTS.PK_ALERT_EVENT_COMPONENTS)
        self.alert_filter_modules = Conf.get(
            index, ALERTS.PK_ALERT_EVENT_COMPONENT_MODULES)
        self.alert_filter_module_operations = Conf.get(
            index, ALERTS.PK_ALERT_EVENT_OPERATIONS)
        Log.info("AlertEventFilter initialized.")
Exemple #17
0
 def is_alert_exists(self) -> bool:
     """
     Report whether the pacemaker alert is already configured.

     Returns:
         bool: True when any line of the ``pcs alert`` output contains
         ``AlertConfig.ALERT_ID``.

     Raises:
         AlertConfigError: when ``pcs alert`` exits with a non-zero code.
     """
     Log.info("Checking pacemaker alert if already exists ...")
     output, _, rc = self._process.run_cmd("pcs alert")
     if rc != 0:
         raise AlertConfigError("Failed to execute pcs alert.")
     return any(AlertConfig.ALERT_ID in line for line in output.split("\n"))
Exemple #18
0
 def run(self):
     """
     Start every watcher and then keep the daemon alive.

     This method never returns: after starting the watchers it sleeps in
     an endless loop.
     """
     for name, watcher in self._watcher_list.items():
         Log.info(f"Starting watcher {name} service for event analyzer.")
         watcher.start()
     pid = os.getpid()
     Log.info(f"Running the daemon for HA event analyzer with PID {pid}...")
     while True:
         time.sleep(600)
Exemple #19
0
def _main() -> None:
    """Run the pre disruptive upgrade steps in order.

    Parses the arguments, initialises logging, checks cluster health,
    optionally backs up consul, then backs up the configuration, puts the
    cluster in standby mode and deletes the resources.
    """
    args = _parse_arguments()
    Log.init(
        service_name="pre_disruptive_upgrade",
        log_path=RA_LOG_DIR,
        level="INFO",
    )
    Log.info(f"Script invoked as executable with params: {vars(args)}")
    check_cluster_health()
    if args.backup_consul:
        backup_consul()
    backup_configuration()
    cluster_standby_mode()
    delete_resources()
Exemple #20
0
    def __init__(self, wait_time=10):
        """
        Register the SIGTERM handler and create the node and pod monitors.

        Args:
            wait_time: Not referenced anywhere in this method.

        Any exception raised during setup is logged and swallowed, so the
        object may be left with fewer monitors than expected.
        """
        try:
            # Register the SIGTERM handler.
            signal.signal(signal.SIGTERM, self.set_sigterm)

            # Read the I/O pod selector label from ha.conf. It will be received from the
            # provisioner confstore; the provisioner needs to be informed to add it there.
            ConfigManager.init("k8s_resource_monitor")
            _conf_stor_search = ConftStoreSearch()

            self.monitors = []

            # Event output in pretty format.
            kwargs = {K8SClientConst.PRETTY: True}

            # Set a timeout value, 'timeout_seconds', for the stream
            # (timeout for the connection to the server).
            # Without it we cannot stop immediately, because the
            # synchronous watch.stream() call does not return
            # until it catches an event it is waiting for.
            kwargs[K8SClientConst.
                   TIMEOUT_SECONDS] = K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT

            # Get the MessageBus producer object shared by all monitor threads.
            producer = self._get_producer()

            # Change to multiprocessing
            # Create the NODE monitor object. It receives the kwargs as they are
            # at this point, i.e. without the label selector added below.
            node_monitor = ObjectMonitor(producer, K8SClientConst.NODE,
                                         **kwargs)
            self.monitors.append(node_monitor)

            _, nodes_list = _conf_stor_search.get_cluster_cardinality()
            if not nodes_list:
                Log.warn(
                    f"No nodes in the cluster to watch for nodes_list: {nodes_list}"
                )
            else:
                Log.info(f"Starting watch for: nodes_list: {nodes_list}")
            # NOTE(review): this runs even when nodes_list is empty, producing an
            # empty "in ()" selector; a None value would raise here and be caught
            # by the except below - confirm the intended behaviour.
            watcher_node_ids = ', '.join(node_id for node_id in nodes_list)
            kwargs[
                K8SClientConst.
                LABEL_SELECTOR] = f'cortx.io/machine-id in ({watcher_node_ids})'

            # Create the POD monitor object, restricted by the label selector.
            pod_monitor = ObjectMonitor(producer, K8SClientConst.POD, **kwargs)
            self.monitors.append(pod_monitor)
        except Exception as err:
            # Setup errors are only logged; nothing is re-raised to the caller.
            Log.error(f'Monitor failed to start watchers: {err}')
Exemple #21
0
 def _get_producer(self):
     """
     Return the message bus producer configured for the monitor.

     Reads the MONITOR ``message_type`` and ``producer_id`` from the HA
     global configuration, initialises the MessageBus and requests the
     producer from it.
     """
     msg_type = Conf.get(const.HA_GLOBAL_INDEX, f"MONITOR{_DELIM}message_type")
     prod_id = Conf.get(const.HA_GLOBAL_INDEX, f"MONITOR{_DELIM}producer_id")
     MessageBus.init()
     Log.info(f"Getting producer {prod_id} for message type: {msg_type}")
     return MessageBus.get_producer(prod_id, msg_type)
    def config(self):
        """
        Apply the Elasticsearch and rsyslog configuration.

        Steps:
          1. If no elasticsearch.yml.bkp exists, create it from the current
             config file; otherwise restore the config file from that backup
             (presumably so repeated runs do not append entries twice).
          2. Append the entries from ``get_config_entries()`` to the
             Elasticsearch config file.
          3. If the rsyslog config does not yet load ``omelasticsearch``,
             rewrite it (dropping blank lines) and insert the module line
             after the "#### MODULES ####" marker.
          4. Restart rsyslog; a ServiceError is logged but not propagated.

        Returns:
            int: 0 on success.

        Raises:
            Exception: any error from steps 1-3 is logged and re-raised.
        """
        try:
            backup_path = f'{self.elasticsearch_file_path}/elasticsearch.yml.bkp'
            if not os.path.exists(backup_path):
                shutil.copyfile(self.elasticsearch_config_path, backup_path)
            else:
                shutil.copyfile(backup_path, self.elasticsearch_config_path)

            # Append the config entries that need to be in elasticsearch.yml.
            # The ``with`` block closes the file; no explicit close() needed.
            config_to_add = self.get_config_entries()
            with open(self.elasticsearch_config_path, "a+") as f:
                for line in config_to_add:
                    f.write(f'\n{line}')

            # Load the omelasticsearch module in rsyslog.
            file_contents = Elasticsearch.read_file_contents(self.rsyslog_conf)
            insert_line = 'module(load="omelasticsearch")'
            if not any(insert_line in i for i in file_contents):
                with open(self.rsyslog_conf, "w") as f:
                    for line in file_contents:
                        if line == '\n':
                            continue
                        f.write(f'\n{line}')
                        if line.strip('\n') == "#### MODULES ####":
                            f.write(f'\n{insert_line}\n')

            try:
                service_obj = Service('rsyslog.service')
                service_obj.restart()
            except ServiceError as e:
                # A failed restart is deliberately non-fatal.
                msg = f"Restarting rsyslog.service failed due to error, {e}."
                Log.error(msg)

        except Exception as e:
            # OSError is a subclass of Exception, so one clause covers both.
            msg = f"Failed in config stage due to error {e}."
            Log.error(msg)
            raise

        Log.info("Config done.")
        return 0
Exemple #23
0
def _check_for_any_resource_presence() -> None:
    '''Check whether any resources are already present in the cluster.

    Collects the ``id`` of every ``lrm_resource`` element in the CIB XML.

    Raises:
        UpgradeError: if at least one resource id is found, which means the
            pre-upgrade steps failed.
    '''

    Log.info('Check for any resource presence in a cluster')

    root = _get_cib_xml()
    resource_list = [e.attrib["id"] for e in root.findall(".//lrm_resource")
                if "id" in e.attrib]

    if resource_list:
        # Adjacent literals are concatenated. A backslash continuation inside
        # the literal would embed the next line's indentation in the message.
        raise UpgradeError('Some resources are already present in the cluster. '
                           'Perform Upgrade process again')
 def check_for_signals(self, k8s_watch: watch.Watch, k8s_watch_stream):
     """
     Handle a pending SIGTERM while watching kubernetes.watch.stream synchronously.

     Note: currently only SIGTERM is handled, but this function can be
     extended to convey other messages/events/signals that must be handled
     while looping over the synchronous watch.stream call.
     """
     if not self._sigterm_received.is_set():
         return
     Log.info(f"{self.name} Handling SIGTERM signal.")
     self.handle_sigterm(k8s_watch, k8s_watch_stream)
     # Reset the flag now that the signal has been handled.
     self._sigterm_received.clear()
Exemple #25
0
 def stop(self):
     """
     Stop monitoring hardware.

     Runs the monitor in stop state, removes the resource's file under
     ``const.HA_INIT_DIR`` when it exists, and reports success.

     Returns:
         const.OCF_SUCCESS in every case.
     """
     filename, _path, _service, _node = self._get_params()
     Log.debug(f"In stop for {filename}")
     if self.monitor(state=const.STATE_STOP) == Action.RESTART:
         Log.info(f"Restarting {filename} resource")
     init_file = const.HA_INIT_DIR + filename
     if os.path.exists(init_file):
         os.remove(init_file)
         Log.debug(f"Stopping {filename} resource")
     Log.debug(f"Stopped {filename} resource return success")
     return const.OCF_SUCCESS
    def __init__(self):
        """
        Create the web application, register its routes and run it.

        Routes:
            POST/GET /EventMessage/event                 -> IemRequestHandler
            POST/GET /MessageBus/message/{message_type}  -> MessageBusRequestHandler

        The application is run on 127.0.0.1:28300 from within the constructor.
        """
        app = web.Application()
        from cortx.utils.iem_framework import IemRequestHandler
        from cortx.utils.message_bus import MessageBusRequestHandler

        event_url = '/EventMessage/event'
        message_url = '/MessageBus/message/{message_type}'
        routes = [
            web.post(event_url, IemRequestHandler.send),
            web.get(event_url, IemRequestHandler.receive),
            web.post(message_url, MessageBusRequestHandler.send),
            web.get(message_url, MessageBusRequestHandler.receive),
        ]
        app.add_routes(routes)

        Log.info("Starting Message Server 127.0.0.1 on port 28300")
        web.run_app(app, host='127.0.0.1', port=28300)
Exemple #27
0
    def on_failure(self, event: HealthEvent, publish: bool) -> None:
        """
        Handle a failure event.

        Args:
            event (HealthEvent): Event to handle.
            publish (bool): When true, the event is passed to ``publish_event``.

        Returns:
            None
        """
        Log.info("Handling disk failure event.")
        if not publish:
            return
        self.publish_event(event)
    def init(cls, component: str, source: str):
        """
        Set the Event Message context and create the IEM producer.

        Parameters:
        component       Component that generates the IEM. For e.g. 'S3', 'SSPL'
        source          Single character that indicates the type of component.
                        For e.g. H-Hardware, S-Software, F-Firmware, O-OS

        Raises:
        EventMessageError   (errno.EINVAL) when the cluster config cannot be
                            read, when component is None, or when source is
                            not a key of cls._SOURCE.
        """

        cls._component = component
        cls._source = source

        Conf.load('config_file', 'json:///etc/cortx/cortx.conf', skip_reload=True)
        # If Log.logger was already initialized by a parent process, its file
        # is reused for all message bus related logs; otherwise the standard
        # iem log is set up here.
        if not Log.logger:
            iem_log_dir = os.path.join('/var/log', 'cortx/utils/iem')
            log_level = Conf.get('config_file', 'utils>log_level', 'INFO')
            Log.init('iem', iem_log_dir, level=log_level, backup_count=5,
                file_size_in_mb=5)

        try:
            Conf.load('cluster', cls._conf_file, skip_reload=True)
            node_ids = Conf.get('cluster', 'server_node')
            # Same assignment order as the keys listed here.
            for key in ('site_id', 'rack_id', 'node_id', 'cluster_id'):
                setattr(cls, f'_{key}', node_ids[key])
        except Exception as e:
            Log.error("Invalid config in %s." % cls._conf_file)
            raise EventMessageError(
                errno.EINVAL, "Invalid config in %s. %s", cls._conf_file, e)

        if cls._component is None:
            Log.error("Invalid component type: %s" % cls._component )
            raise EventMessageError(
                errno.EINVAL, "Invalid component type: %s", cls._component)

        if cls._source not in cls._SOURCE:
            Log.error("Invalid source type: %s" % cls._source)
            raise EventMessageError(
                errno.EINVAL, "Invalid source type: %s", cls._source)

        cls._producer = MessageProducer(
            producer_id='event_producer', message_type='IEM', method='sync')
        Log.info("IEM Producer initialized for component %s and source %s" %
             (cls._component, cls._source))
Exemple #29
0
    async def get_active_nodes():
        """
        Load the cluster configuration and return its ``cluster`` section.

        :return: the value stored under the ``cluster`` key of
            /etc/cortx/cluster.conf when it is non-empty; otherwise a
            ``(Response, None)`` tuple whose Response carries the message
            "Node list and hostname not found." and rc errno.ENODATA.
        """
        Log.info("Reading hostnames, node_list information")
        Conf.load('cortx_cluster', 'json:///etc/cortx/cluster.conf')
        cluster_map = Conf.get('cortx_cluster', 'cluster')
        if cluster_map:
            return cluster_map

        not_found_msg = "Node list and hostname not found."
        return Response(output=not_found_msg, rc=errno.ENODATA), None
 def process_alert(self):
     """
     Generate and send an IEM for a node alert from one node only.

     The IEM is sent only when the local node id equals the last entry of
     the online node id list; every other node just logs a debug message.
     """
     Log.debug("Processing event for NodeAlertMonitor")
     # Environment variables are available in self.crm_env
     self.iem = IemGenerator()
     # Get online node ids from corosync.
     nodes_ids = self._get_online_nodes()
     local_node_id, local_node_name = self._get_local_node()
     # Generate and send IEM only through the highest online node in cluster.
     # NOTE(review): this relies on _get_online_nodes() returning the ids in
     # ascending order - confirm.
     if nodes_ids[-1].strip() == local_node_id.strip():
         self.iem.generate_iem(self.crm_env["CRM_alert_node"], self.alert_event_module, self.alert_event_type)
         Log.info(f"Sent IEM alert from the node - name: {local_node_name}, id: {local_node_id}")
     else:
         # list.sort() returns None, which made this message always end with
         # "all nodes: None"; sorted() yields the list that is meant to be shown.
         Log.debug(
             f"This node does not have highest id. Local node id : {local_node_id}, all nodes: {sorted(nodes_ids)}.")