def main(resource: DynamicFidServiceRA, action: str = '') -> int:
    """
    Dispatch a Pacemaker action to the DynamicFidServiceRA resource agent.

    Args:
        resource (DynamicFidServiceRA): Resource agent providing metadata(),
            monitor(), start() and stop().
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Value returned by the invoked action, or const.OCF_ERR_GENERIC
            when the action raises an exception.
    """
    try:
        # meta-data is answered before ConfigManager.init() is called.
        if action == "meta-data":
            return resource.metadata()
        ConfigManager.init("resource_agent")
        Log.debug(f"{resource} initialized for action {action}")
        # The agent is the `resource` argument. The earlier code dispatched on
        # `resource_agent`, a name this function never binds, so every action
        # ended in the generic error path below.
        if action == "monitor":
            return resource.monitor()
        elif action == "start":
            return resource.start()
        elif action == "stop":
            return resource.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            exit(0)
    except Exception as e:
        Log.error(
            f"systemd_fid_wrapper_ra failed to perform {action}. Error: {e}")
        return const.OCF_ERR_GENERIC
Beispiel #2
0
def main(argv: list):
    """
    Entry point of the ha_setup command.

    Args:
        argv (list): Command line with the program name first; argv[1] is the
            setup sub-command and argv[1:] is handed to Cmd.get_command().

    Returns:
        0 when "cleanup" is requested but the HA config file is missing,
        errno.EINVAL when the command raises an exception, otherwise None.
    """
    try:
        # Use the argv that was passed in rather than mixing it with sys.argv.
        if len(argv) < 2:
            Cmd.usage("ha_setup")
            sys.exit(1)
        if argv[1] == "cleanup":
            if not os.path.exists(const.HA_CONFIG_FILE):
                # Adjacent literals are concatenated without the indentation
                # that a backslash continuation inside the string would embed
                # in the message.
                sys.stdout.write(
                    "Cleanup can not be proceed as "
                    f"HA config file: {const.HA_CONFIG_FILE} "
                    "is missing. Either cleanup is already done or there "
                    "is some other problem\n")
                return 0
            ConfigManager.init("ha_setup")

        desc = "HA Setup command"
        command = Cmd.get_command(desc, argv[1:])
        command.process()

    except Exception as err:
        Log.error("%s\n" % traceback.format_exc())
        # Do not index argv blindly inside the handler: a second exception
        # here would hide the original failure.
        command_name = argv[1] if len(argv) > 1 else ""
        sys.stderr.write(
            f"Setup command:{command_name} failed for cortx-ha. Error: {err}\n")
        return errno.EINVAL
Beispiel #3
0
 def __init__(self):
     """
     Start without a CRM environment and load the alert event filter rules.
     """
     # Starts out as None.
     self.crm_env = None
     # Make the alert event filter rules available in the configuration.
     ConfigManager.load_alert_events_rules()
Beispiel #4
0
def main(argv: list):
    """
    Run one mini-provisioning setup command for cortx-ha.

    Args:
        argv (list): Command line; argv[1] is the setup sub-command and
            argv[1:] is handed to Cmd.get_command().

    Returns:
        errno.EINVAL when the command raises an exception, otherwise None.
    """
    try:
        # Use the argv that was passed in rather than mixing it with sys.argv.
        if argv[1] == "post_install":
            # post_install loads the source config file and initializes
            # logging directly instead of calling ConfigManager.init().
            Conf.init(delim='.')
            Conf.load(const.HA_GLOBAL_INDEX,
                      f"yaml://{const.SOURCE_CONFIG_FILE}")
            log_path = Conf.get(const.HA_GLOBAL_INDEX, "LOG.path")
            log_level = Conf.get(const.HA_GLOBAL_INDEX, "LOG.level")
            Log.init(service_name='ha_setup',
                     log_path=log_path,
                     level=log_level)
        else:
            ConfigManager.init("ha_setup")

        desc = "HA Setup command"
        command = Cmd.get_command(desc, argv[1:])
        command.process()

        sys.stdout.write(
            f"Mini Provisioning {argv[1]} configured successfully.\n")
    except Exception as err:
        Log.error("%s\n" % traceback.format_exc())
        # A missing sub-command is one of the failures handled here, so argv
        # must not be indexed blindly: a second IndexError would escape the
        # handler instead of returning errno.EINVAL.
        command_name = argv[1] if len(argv) > 1 else ""
        sys.stderr.write(
            f"Setup command:{command_name} failed for cortx-ha. Error: {err}\n")
        return errno.EINVAL
def main(action: str = '') -> int:
    """
    Route a Pacemaker action to the VipHealthMonitor resource agent.

    Args:
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Value returned by the invoked action, or const.OCF_ERR_GENERIC
            when an exception is raised.
    """
    try:
        # meta-data is served from the class, before any initialization.
        if action == "meta-data":
            return VipHealthMonitor.metadata()

        ConfigManager.init("resource_agent")
        resource_agent = VipHealthMonitor()
        Log.debug(f"{resource_agent} initialized for action {action}")

        # The three supported actions map one-to-one onto agent methods.
        if action in ("monitor", "start", "stop"):
            return getattr(resource_agent, action)()

        # Anything else: show the usage line and exit.
        print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
        exit(0)
    except Exception as e:
        Log.error(
            f"vip health check failed to perform {action}. Error: {traceback.format_exc()} {e}"
        )
        return const.OCF_ERR_GENERIC
Beispiel #6
0
 def setUp(self):
     """
     Prepare the test fixture: configuration, confstore handle and message bus.
     """
     print("Setup")
     ConfigManager.init("test_Cluster_stop_sigterm")
     # Keep the confstore handle on the test case.
     self.confstore = ConfigManager.get_confstore()
     MessageBus.init()
 def __init__(self):
     """
     Register the SIGTERM callback, initialize the configuration and
     create the node fault and cluster stop monitors.
     """
     # self.set_sigterm is invoked when SIGTERM is delivered.
     signal.signal(signal.SIGTERM, self.set_sigterm)
     ConfigManager.init("fault_tolerance")
     # Monitor objects owned by this instance.
     self.node_fault_monitor = NodeFaultMonitor()
     self.cluster_stop_monitor = ClusterStopMonitor()
Beispiel #8
0
 def init(self):
     """
     Initialize EventAnalyzerService: configuration, system health and watchers.
     """
     ConfigManager.init("event_analyzerd")
     Log.info("Event analyzer daemon initializations...")
     # System health is built on top of the confstore handle.
     health = SystemHealth(ConfigManager.get_confstore())
     # The watchers are created from the system health object.
     self._watcher_list: dict = self._initalize_watcher(health)
Beispiel #9
0
    def __init__(self, wait_time=10):
        """
        Register the SIGTERM handler and build the node and pod monitors.

        Any exception raised during this setup is logged and not propagated.
        """
        try:
            # SIGTERM is routed to self.set_sigterm.
            signal.signal(signal.SIGTERM, self.set_sigterm)

            ConfigManager.init("k8s_resource_monitor")
            conf_search = ConftStoreSearch()

            self.monitors = []

            # Pretty event output plus a watch timeout. Without the timeout
            # the synchronous watch.stream() call only returns once an event
            # arrives, so the monitor could not be stopped immediately.
            watch_args = {
                K8SClientConst.PRETTY: True,
                K8SClientConst.TIMEOUT_SECONDS:
                K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT,
            }

            # One MessageBus producer is shared by all monitor threads.
            producer = self._get_producer()

            # NODE monitor.
            self.monitors.append(
                ObjectMonitor(producer, K8SClientConst.NODE, **watch_args))

            _, nodes_list = conf_search.get_cluster_cardinality()
            if nodes_list:
                Log.info(f"Starting watch for: nodes_list: {nodes_list}")
            else:
                Log.warn(
                    f"No nodes in the cluster to watch for nodes_list: {nodes_list}"
                )

            # The POD monitor is restricted to pods labelled with one of the
            # cluster machine ids.
            watcher_node_ids = ', '.join(nodes_list)
            watch_args[K8SClientConst.LABEL_SELECTOR] = (
                f'cortx.io/machine-id in ({watcher_node_ids})')

            # POD monitor.
            self.monitors.append(
                ObjectMonitor(producer, K8SClientConst.POD, **watch_args))
        except Exception as err:
            Log.error(f'Monitor failed to start watchers: {err}')
def _main() -> None:
    """Run the pre disruptive upgrade steps when invoked as a script."""
    cli_args = _parse_arguments()
    ConfigManager.init(log_name="pre_disruptive_upgrade",
                       log_path=RA_LOG_DIR,
                       level="INFO")
    invocation_params = vars(cli_args)
    Log.info("Script invoked as executable with params: {}".format(invocation_params))

    check_cluster_health()
    # The consul backup is optional and controlled by the parsed arguments.
    if cli_args.backup_consul:
        backup_consul()
    backup_configuration()
    cluster_standby_mode()
    delete_resources()
Beispiel #11
0
    def __init__(self):
        """
        Load the alert filter rules and cache the filter settings.
        """
        # Make the alert filter rules available in the configuration.
        ConfigManager.load_filter_rules()

        # Filter type and resource type list come from the alert filter index.
        filter_index = const.ALERT_FILTER_INDEX
        self.filter_type = Conf.get(filter_index, "alert.filter_type")
        self.resource_types_list = Conf.get(filter_index,
                                            "alert.resource_type")
Beispiel #12
0
    def __init__(self):
        """
        Set up the alert monitor and run the alert event filter once.
        """
        super(AlertMonitor, self).__init__()
        ConfigManager.init("alert_monitor")

        # The environment variables are handed to the event filter.
        self.crm_env = self._get_env()
        event_filter = AlertEventFilter()
        event_filter.initialize_crm(self.crm_env)

        # filter_event() yields the event module (e.g. Node, Resource,
        # Fencing) and the event type (e.g. node became member, node lost).
        event_module, event_type = event_filter.filter_event()
        self.alert_event_module = event_module
        self.alert_event_type = event_type
Beispiel #13
0
 def setUp(self):
     """
     Build the fixture: timestamps, resource schema, hardware agent and
     a consul client.
     """
     ConfigManager.init('resource_agent')

     # Current time as an integer timestamp and as a formatted string.
     self.ts = int(time.time())
     moment = datetime.fromtimestamp(self.ts)
     self.td = moment.strftime('%Y-%m-%dT%H:%M:%S.000000+0000')

     # The resource schema is read from the JSON file named by the constant.
     with open(const.RESOURCE_SCHEMA, 'r') as schema_file:
         self.schema = json.load(schema_file)
     self.hw_agent = HardwareResourceAgent(DecisionMonitor(), self.schema)

     self.key = f"cortx{const.HA_DELIM}base{const.HA_DELIM}ha{const.HA_DELIM}obj"
     self.filename = 'io_path_health_c1'
     self.path = 'io'
     self.local = self.schema['nodes']['local']
     self.consul = consul.Consul()
Beispiel #14
0
 def __init__(self):
     """
     Initialize the parser and read cluster, site and rack ids from the
     global HA configuration.
     """
     super(ClusterResourceParser, self).__init__()
     ConfigManager.init("event_analyzer")

     # All three ids are stored under the COMMON_CONFIG section.
     global_index = const.HA_GLOBAL_INDEX
     self.cluster_id = Conf.get(global_index,
                                f"COMMON_CONFIG{_DELIM}cluster_id")
     self.site_id = Conf.get(global_index, f"COMMON_CONFIG{_DELIM}site_id")
     self.rack_id = Conf.get(global_index, f"COMMON_CONFIG{_DELIM}rack_id")

     Log.info("ClusterResource Parser is initialized ...")
Beispiel #15
0
 def __init__(self, msg=None):
     """
     Filter, parse and process the given cluster resource message.

     Raises:
         SubscriberException: If processing the parsed health event fails.
     """
     ConfigManager.init('event_analyzer')
     self._confstore = ConfigManager.get_confstore()
     system_health = SystemHealth(self._confstore)
     self._cluster_resource_filter = ClusterResourceFilter()
     self._cluster_resource_parser = ClusterResourceParser()

     # Messages rejected by the filter need no further work.
     if not self._cluster_resource_filter.filter_event(msg):
         return

     health_event = self._cluster_resource_parser.parse_event(msg)
     try:
         system_health.process_event(health_event)
     except Exception as e:
         Log.error(f"Failed to process event. Error: {e}")
         raise SubscriberException(
             f"Failed to process event {str(health_event)}. Error: {e}")
def perform_post_upgrade(ios_instances=None,
                         s3_instances=None,
                         do_unstandby=False,
                         mgmt_info=None,
                         node_count=None):
    """
    Run the post-upgrade routine.

    Args:
        ios_instances: Forwarded to _create_resources().
        s3_instances: Forwarded to _create_resources().
        do_unstandby: When truthy, _unstandby_cluster() is called at the end.
        mgmt_info: Forwarded to _create_resources().
        node_count: Forwarded to _create_resources().
    """
    ConfigManager.init(log_name="post_disruptive_upgrade",
                       log_path=RA_LOG_DIR,
                       level="INFO")

    # Checks run before anything is created.
    _check_for_any_resource_presence()
    _is_cluster_standby_on()

    _load_config()
    _create_resources(ios_instances, s3_instances, mgmt_info, node_count)

    if not do_unstandby:
        return
    _unstandby_cluster()
Beispiel #17
0
 def __init__(self):
     """
     Fetch the confstore handle and machine id and reset the state fields.
     """
     self._conf_store = ConfigManager.get_confstore()
     self._machine_id = MachineId.get_machine_id()

     # State fields start out as None / False.
     self._uuid = None
     self.timeout_reached = False
     self._is_resp_received = False
     self._encl_shutdown_successful = False
Beispiel #18
0
 def __init__(self):
     """
     Initialize PcsNodeController with a confstore handle and system health.
     """
     super(PcsNodeController, self).__init__()
     # System health is built on top of the confstore handle.
     confstore = ConfigManager.get_confstore()
     self._confstore = confstore
     self._system_health = SystemHealth(confstore)
Beispiel #19
0
 def check_cluster_feasibility(self, node_id: str) -> dict:
     """
     Check whether stopping the node with node_id would cost the cluster
     its quorum.

     Args:
         node_id (str): Node ID from cluster nodes.

     Returns:
         dict: {"status": "", "output": "", "error": ""}; status is the
             FAILED value when the stop would cause a loss of the quorum,
             the SUCCEEDED value otherwise.
     """
     # Resolve the node name (pvtfqdn) from the node id; comment in the
     # original notes that an invalid node_id raises here.
     node_name = ConfigManager.get_node_name(node_id=node_id)
     node_list = self._get_node_list()
     offline_nodes = self._get_offline_nodes()
     Log.debug(f"nodelist : {node_list} offlinenodes : {offline_nodes}")

     # Highest number of nodes allowed to be offline at the same time.
     num_nodes = len(node_list)
     if num_nodes % 2 == 1:
         max_nodes_offline = num_nodes // 2
     else:
         max_nodes_offline = (num_nodes // 2) - 1

     # Count the node that is about to be stopped as offline as well.
     offline_after_stop = len(offline_nodes) + 1
     if offline_after_stop <= max_nodes_offline:
         Log.debug(
             f"Stopping the node {node_name} will not cause a loss of the quorum"
         )
         return {
             "status": const.STATUSES.SUCCEEDED.value,
             "output": "",
             "error": ""
         }

     Log.debug(
         f"Stopping the node {node_name} will cause a loss of the quorum"
     )
     return {
         "status": const.STATUSES.FAILED.value,
         "output": "",
         "error": "Stopping the node will cause a loss of the quorum"
     }
 def __init__(self):
     """
     Set up the IPMI fencing agent with a confstore handle and a command
     executor.
     """
     super(IpmiFencingAgent, self).__init__()
     # Confstore handle used by this agent.
     self._confstore = ConfigManager.get_confstore()
     # SimpleCommand instance kept for running commands.
     self._execute = SimpleCommand()
Beispiel #21
0
 def __init__(self):
     """
     Set up the pcs controller with a command executor and a confstore
     handle.
     """
     super(PcsController, self).__init__()
     # SimpleCommand instance kept for running commands.
     self._execute = SimpleCommand()
     # Confstore handle used by this controller.
     self._confstore = ConfigManager.get_confstore()
Beispiel #22
0
    def __init__(self, version="2.0", default_log_enable=True):
        """
        Prepare the cluster manager and attach one attribute per controller.

        Args:
            version: Stored as the manager version. Defaults to "2.0".
            default_log_enable: When exactly True, ConfigManager.init() is
                called with "cluster_manager", otherwise with None.
        """
        self._version = version

        # TODO: Update Config manager if log utility changes.(reference EOS-17614)
        log_name = "cluster_manager" if default_log_enable is True else None
        ConfigManager.init(log_name)

        global_index = const.HA_GLOBAL_INDEX
        self._cluster_type = Conf.get(global_index,
                                      f"CLUSTER_MANAGER{_DELIM}cluster_type")
        self._env = Conf.get(global_index, f"CLUSTER_MANAGER{_DELIM}env")
        self._confstore = ConfigManager.get_confstore()

        # Raises if the user does not have proper permissions.
        self._validate_permissions()

        ConfigManager.load_controller_schema()
        self._controllers = ElementControllerFactory.init_controller(
            self._env, self._cluster_type)

        # Expose every controller as an instance attribute,
        # e.g. cm.cluster_controller.start().
        for name, controller in self._controllers.items():
            Log.info(f"Adding {name} property to cluster manager.")
            self.__dict__[name] = controller
Beispiel #23
0
 def __init__(self, args: dict):
     """
     Keep the parsed command input and load the configuration it points to.

     Args:
         args: Parsed command input; its `config` and `args` attributes
             are read here.
     """
     # NOTE(review): annotated as dict but accessed through attributes;
     # confirm the real type with the caller.
     self._cluster_manager = None
     self._url = args.config
     # NOTE(review): self._index is not assigned in this method; presumably
     # it is provided by the class. TODO confirm.
     Conf.load(self._index, self._url)
     self._args = args.args
     self._execute = SimpleCommand()
     self._confstore = ConfigManager._get_confstore()
 def __init__(self, default_log_enable, singleton_check: bool = False):
     """
     Private constructor: initialize the EventManager singleton.

     Args:
         default_log_enable: When truthy, ConfigManager.init() is called
             with const.EVENT_MANAGER_LOG.
         singleton_check (bool): Must not be False; the error message
             points callers to EventManager.get_instance(). Defaults to False.

     Raises:
         Exception: If singleton_check is False, or if an instance has
             already been registered.
     """
     if singleton_check is False:
         # Adjacent literals are concatenated without the indentation that
         # a backslash continuation inside the string would embed in the
         # message.
         raise Exception("Please use EventManager.get_instance() to fetch "
                         "singleton instance of class")
     if EventManager.__instance is None:
         EventManager.__instance = self
     else:
         raise Exception(
             "EventManager is singleton class, use EventManager.get_instance()."
         )
     if default_log_enable:
         ConfigManager.init(const.EVENT_MANAGER_LOG)
     self._confstore = ConfigManager.get_confstore()
     self._monitor_rule = MonitorRulesManager()
     self._default_action = HEALTH_MON_ACTIONS.PUBLISH_ACT.value
     MessageBus.init()
Beispiel #25
0
    def __init__(self, wait_time=10):
        """
        Register the SIGTERM handler and build the node and pod monitors.
        """
        # SIGTERM is routed to self.set_sigterm.
        signal.signal(signal.SIGTERM, self.set_sigterm)

        ConfigManager.init("k8s_resource_monitor")

        self.monitors = []

        # Pretty event output plus a watch timeout. Without the timeout the
        # synchronous watch.stream() call only returns once an event arrives,
        # so the monitor could not be stopped immediately.
        watch_args = {
            K8SClientConst.PRETTY: True,
            K8SClientConst.TIMEOUT_SECONDS:
            K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT,
        }

        # One MessageBus producer is shared by all monitor threads.
        producer = self._get_producer()

        # NODE monitor.
        self.monitors.append(
            ObjectMonitor(producer, K8SClientConst.NODE, **watch_args))

        # The POD monitor is restricted to the data pod labels read from the
        # global HA configuration.
        pod_labels = Conf.get(const.HA_GLOBAL_INDEX, "data_pod_label")
        pod_label_str = ', '.join(pod_labels)
        # TODO : Change 'name' field to 'app' in label_selector if required.
        watch_args[K8SClientConst.LABEL_SELECTOR] = f'name in ({pod_label_str})'

        # POD monitor.
        self.monitors.append(
            ObjectMonitor(producer, K8SClientConst.POD, **watch_args))
Beispiel #26
0
 def stop_cluster(self):
     """
     Write the cluster stop key to the confstore so that the k8s monitor
     is notified that the cluster shutdown has started.
     """
     Log.info(f'The cluster stop message on message bus ({self._message_type}) is received.')
     confstore = ConfigManager.get_confstore()
     # An existing key is updated, a missing key is created.
     if confstore.key_exists(const.CLUSTER_STOP_KEY):
         Log.info(f'Updating key {const.CLUSTER_STOP_KEY} to {const.CLUSTER_STOP_VAL_ENABLE} in confstore.')
         confstore.update(const.CLUSTER_STOP_KEY, const.CLUSTER_STOP_VAL_ENABLE)
     else:
         Log.info(f'Setting key {const.CLUSTER_STOP_KEY} to {const.CLUSTER_STOP_VAL_ENABLE} in confstore.')
         confstore.set(const.CLUSTER_STOP_KEY, const.CLUSTER_STOP_VAL_ENABLE)
def main(resource, action=''):
    """
    Run one Pacemaker action against a resource agent.

    Args:
        resource: Resource agent class; it provides metadata() and is
            instantiated with a DecisionMonitor and the resource schema.
        action: Action name. Defaults to ''.

    Returns:
        Value returned by the invoked action, or const.OCF_ERR_GENERIC when
        an exception is raised.
    """
    try:
        # meta-data is answered without any initialization.
        if action == 'meta-data':
            return resource.metadata()

        ConfigManager.init(log_name='resource_agent')
        with open(const.RESOURCE_SCHEMA, 'r') as schema_file:
            resource_schema = json.load(schema_file)
        os.makedirs(const.RA_LOG_DIR, exist_ok=True)

        resource_agent = resource(DecisionMonitor(), resource_schema)
        Log.debug(f"{resource_agent} initialized for action {action}")

        # The three supported actions map one-to-one onto agent methods.
        if action in ('monitor', 'start', 'stop'):
            return getattr(resource_agent, action)()

        # Anything else: show the usage line and exit.
        print('Usage %s [monitor] [start] [stop] [meta-data]' %
              sys.argv[0])
        exit()
    except Exception:
        Log.error(f"{traceback.format_exc()}")
        return const.OCF_ERR_GENERIC
Beispiel #28
0
 def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
     """
     Stop the node with the given node id.

     Args:
         node_id (str): Node ID from cluster nodes.
         timeout (int): Negative values are replaced by
             const.NODE_STOP_TIMEOUT. Defaults to -1.
         op_kwargs: Optional "check_cluster" flag; treated as True when
             absent or None.

     Returns:
         ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
             status: Succeeded, Failed, InProgress
         Nothing is returned explicitly when none of the early exits below
         applies.

     Raises:
         ClusterManagerError: If any step inside the try block fails.
     """
     check_cluster = op_kwargs.get("check_cluster")
     if check_cluster is None:
         check_cluster = True

     # Resolve the node name (pvtfqdn) from the node id; comment in the
     # original notes that an invalid node_id raises here.
     node_name = ConfigManager.get_node_name(node_id=node_id)
     try:
         if timeout < 0:
             timeout = const.NODE_STOP_TIMEOUT

         node_health = self._system_health.get_node_status(node_id=node_id)
         node_status = node_health.get("status")

         if node_status == HEALTH_STATUSES.OFFLINE.value:
             Log.info(
                 f"For stop node id {node_id}, Node already in offline state."
             )
             return {
                 "status": const.STATUSES.SUCCEEDED.value,
                 "output": f"Node with node id {node_id} is already in offline state.",
                 "error": ""
             }

         if node_status == HEALTH_STATUSES.FAILED.value:
             # In case VM, if node is Poweroff or Disconnected, system health
             # will be updated with status FAILED.
             return {
                 "status": const.STATUSES.FAILED.value,
                 "output": "",
                 "error": f"Node {node_id} status is {node_status}, node cannot be stopped."
             }

         # Wait after a heal_resource() call that returned a truthy value.
         if self.heal_resource(node_name):
             time.sleep(const.BASE_WAIT_TIME)

         if check_cluster:
             # Checks whether cluster is going to be offline if node with
             # node_name is stopped.
             feasibility = self.check_cluster_feasibility(node_id=node_id)
             res = json.loads(feasibility)
             if res.get("status") == const.STATUSES.FAILED.value:
                 return res
     except Exception as e:
         raise ClusterManagerError(
             f"Failed to stop node {node_id}, Error: {e}")
Beispiel #29
0
    def __init__(self, singleton_check: bool = False):
        """
        Private constructor: initialize the HealthMonitorService singleton.

        Args:
            singleton_check (bool, optional): Must not be False; the error
                message points callers to HealthMonitorService.get_instance().
                Defaults to False.

        Raises:
            Exception: If singleton_check is False, or if an instance has
                already been registered.
        """
        if singleton_check is False:
            # Adjacent literals are concatenated without the indentation that
            # a backslash continuation inside the string would embed in the
            # message.
            raise Exception(
                "Please use HealthMonitorService.get_instance() to fetch "
                "singleton instance of class")
        if HealthMonitorService.__instance is None:
            HealthMonitorService.__instance = self
        else:
            raise Exception(
                "HealthMonitorService is singleton class, use HealthMonitorService.get_instance()."
            )
        # initialize
        ConfigManager.init(const.HEALTH_MONITOR_LOG)
        # set sigterm handler
        signal.signal(signal.SIGTERM, self.set_sigterm)
        self._confstore = ConfigManager.get_confstore()
        self._rule_manager = MonitorRulesManager()
        self._event_consumer = self._get_consumer()
Beispiel #30
0
    def get_installation_type(self):
        """
        Derive the installation type from the configured h/w env.

        Returns:
            The lower-cased h/w env, or const.INSTALLATION_TYPE.SINGLE_VM
            when that value equals const.INSTALLATION_TYPE.VM and the node
            list holds exactly one node.

        Raises:
            HaConfigException: If the h/w env is not present in the config.
        """
        hw_type = ConfigManager.get_hw_env()
        if hw_type is None:
            Log.error("Error: Can not fetch h/w env from Config.")
            raise HaConfigException("h/w env not present in config.")
        install_type = hw_type.lower()

        # A VM installation with a single node is reported as SINGLE_VM.
        nodes = self.get_nodelist(fetch_from=Cmd.HA_CONFSTORE)
        node_count = len(nodes)
        if node_count == 1 and install_type == const.INSTALLATION_TYPE.VM:
            install_type = const.INSTALLATION_TYPE.SINGLE_VM

        Log.info(f"Nodes count = {node_count}, Install type = {install_type}")

        return install_type