def remove_rule(self, resource: str, event: HEALTH_STATUSES,
                    action: HEALTH_MON_ACTIONS):
        """
        For the rule resource/event, remove "action" from the confstore.
        If the action list becomes empty, delete the rule.

        Args:
            resource(str): resource name
            event(str): event type
            action(str): action to be removed
        """
        self._validate_action(action)
        key = self._prepare_key(resource, event)
        val = []
        Log.info(f"Removing rule for key: {key} ,value: {action}")
        kv = self._get_val(key)
        if kv:
            _, val = self._get_k_v(kv)
            if action not in val:
                Log.warn(f"KV not found for key: {key}, value: {action}")
            else:
                val.remove(action)
                if len(val) == 0:
                    self._confstore.delete(key)
                    Log.debug(
                        f"Value {action} removed for key {key}; action list empty, deleting key {key}"
                    )
                else:
                    val = json.dumps(val)
                    self._confstore.update(key, val)
                    Log.debug(f"KV removed for {key} , {action}")
        else:
            Log.warn(f"key {key} not found")
Example #2
    def set_cluster_cardinality(self, index):
        """
        Set the number of nodes (pods) and their machine ids in the confstore used by HA.
        """

        data_pods = ConftStoreSearch.get_data_pods(index)
        server_pods = ConftStoreSearch.get_server_pods(index)
        control_pods = ConftStoreSearch.get_control_nodes(index)

        # Combine the data_pods, server_pods, and control_pods lists and find the unique machine ids
        watch_pods = data_pods + server_pods + control_pods
        watch_pods = list(set(watch_pods))
        num_pods = len(watch_pods)

        Log.info(
            f"Cluster cardinality: number of nodes {num_pods}, machine ids for nodes {watch_pods} "
        )

        if num_pods == 0:
            Log.warn(
                f"Possible cluster cardinality issue; number of pods to be watched {num_pods}"
            )

        # Update the same to consul; if KV already present, it will be modified.
        cluster_cardinality_key = CLUSTER_CARDINALITY_KEY
        cluster_cardinality_value = {
            CLUSTER_CARDINALITY_NUM_NODES: num_pods,
            CLUSTER_CARDINALITY_LIST_NODES: watch_pods
        }
        self._confstore.update(cluster_cardinality_key,
                               json.dumps(cluster_cardinality_value))
    def add_rule(self, resource: str, event: HEALTH_STATUSES,
                 action: HEALTH_MON_ACTIONS):
        """
        Add a rule to the confstore for resource/event.
        If the rule exists, append "action" to the same rule.

        Args:
            resource(str): resource name
            event(str): event type
            action(str): action to be added
        """
        self._validate_action(action)
        key = self._prepare_key(resource, event)
        val = []
        Log.info(f"Adding rule for key: {key} ,value: {action}")
        kv = self._get_val(key)
        if kv:
            _, val = self._get_k_v(kv)
            if action not in val:
                val.append(action)
                val = json.dumps(val)
                self._confstore.update(key, val)
            else:
                Log.warn(f"key value already exists for {key} , {action}")
                return
        else:
            val.append(action)
            val = json.dumps(val)
            self._confstore.set(key, val)
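A self-contained sketch of the add/remove semantics the two methods above implement, using a plain dict in place of the confstore; the key format and action names below are illustrative assumptions, not taken from the HA code.

import json

# Minimal in-memory model of the confstore-backed rule list used above.
store = {}

def add_rule(resource, event, action):
    key = f"{resource}/{event}"
    actions = json.loads(store.get(key, "[]"))
    if action not in actions:
        actions.append(action)
        store[key] = json.dumps(actions)

def remove_rule(resource, event, action):
    key = f"{resource}/{event}"
    actions = json.loads(store.get(key, "[]"))
    if action in actions:
        actions.remove(action)
        if actions:
            store[key] = json.dumps(actions)
        else:
            del store[key]  # empty action list -> the rule itself is deleted

add_rule("node", "failed", "publish")
remove_rule("node", "failed", "publish")
assert "node/failed" not in store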
Example #4
    def _parse_response(self, msg) -> bool:
        """
        Check if the enclosure shutdown was successful or not.

        A response with one of the following severities is expected in the JSON response:
           "severity": "informational" : successful shutdown
           "severity": "warning" : failure in shutdown
        Args:
           msg : response

        Return:
            True : Enclosure shutdown was successful
            False : Enclosure shutdown failed
        """
        message = json.loads(msg).get(ACTUATOR_ATTRIBUTES.MESSAGE)
        severity = message.get(ACTUATOR_ATTRIBUTES.ACTUATOR_RESPONSE_TYPE).get(
            ACTUATOR_ATTRIBUTES.SEVERITY)
        if severity == EVENT_SEVERITIES.INFORMATIONAL.value:
            return True
        elif severity == EVENT_SEVERITIES.WARNING.value:
            return False
        else:
            Log.warn(
                f"Actuator response received with unexpected status {msg}")
            return False
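A hedged sketch of the JSON payload this parser expects; the lowercase key names below stand in for the ACTUATOR_ATTRIBUTES constants and are assumptions, not confirmed values.

import json

# Illustrative only: rough shape of the actuator response walked by _parse_response().
sample_msg = json.dumps({
    "message": {
        "actuator_response_type": {
            "severity": "informational"  # "warning" would indicate a failed shutdown
        }
    }
})
severity = json.loads(sample_msg)["message"]["actuator_response_type"]["severity"]
print(severity)  # 'informational' -> _parse_response() would return True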
Example #5
 async def get_bundle_status(command):
     """
     Initializes the process for displaying the status of the support bundle.
     :param command: Csm_cli Command Object :type: command
     :return: None
     """
     try:
         bundle_id = command.options.get(const.SB_BUNDLE_ID)
         conf = GeneralConfig(database.DATABASE)
         db = DataBaseProvider(conf)
         repo = SupportBundleRepository(db)
         all_nodes_status = await repo.retrieve_all(bundle_id)
         response = {
             "status": [
                 each_status.to_primitive()
                 for each_status in all_nodes_status
             ]
         }
         return Response(output=response, rc=OPERATION_SUCESSFUL)
     except DataAccessExternalError as e:
         Log.warn(f"Failed to connect to elasticsearch: {e}")
         return Response(
             output=("Support Bundle status is not available currently"
                     " as required services are not running."
                     " Please wait and check the /tmp/support_bundle"
                     " folder for newly generated support bundle."),
             rc=str(errno.ECONNREFUSED))
     except Exception as e:
         Log.error(f"Failed to get bundle status: {e}")
         return Response(
             output=("Support Bundle status is not available currently"
                     " as required services are not running."
                     " Failed to get status of bundle."),
             rc=str(errno.ENOENT))
Example #6
 def _get_cvg_count(index, node_id):
     cvg_count = Conf.get(
         index,
         GconfKeys.CVG_COUNT.value.format(_DELIM=_DELIM, node_id=node_id))
     if not cvg_count:
         Log.warn(f"CVGs are not available for this node {node_id}")
     return cvg_count
 def start(self):
     """
      Start listening for messages.
     """
     if self._consumer is not None:
         self._consumer.start()
     else:
         Log.warn(f"Consumer not found for message type {self._message_type}.")
 def stop(self, flush=False):
     """
      Stop listening for messages.
     """
     if self._consumer is not None:
         self._consumer.stop(flush=flush)
     else:
         Log.warn(f"Consumer not found for message type  {self._message_type}.")
Example #9
 def _configure_rsyslog():
     """Restart rsyslog service for reflecting supportbundle rsyslog config."""
     try:
         Log.info("Restarting rsyslog service")
         service_obj = Service("rsyslog.service")
         service_obj.restart()
     except Exception as e:
         Log.warn(f"Error in rsyslog service restart: {e}")
Example #10
    def __init__(self, wait_time=10):
        """
        Init method.
        Create monitor objects and set the SIGTERM callback.
        """
        try:
            # set sigterm handler
            signal.signal(signal.SIGTERM, self.set_sigterm)

            # Read the I/O pod selector label from ha.conf. It will be received from the
            # provisioner confstore; the provisioner needs to be informed to add it there (to be added).
            ConfigManager.init("k8s_resource_monitor")
            _conf_stor_search = ConftStoreSearch()

            self.monitors = []

            # event output in pretty format
            kwargs = {K8SClientConst.PRETTY: True}

            # Set a timeout value, 'timeout_seconds', for the stream:
            # the timeout for the connection to the server.
            # If it is not set, we cannot stop immediately, because the
            # synchronous watch.stream() call does not return until it
            # catches an event it is waiting for.
            kwargs[K8SClientConst.
                   TIMEOUT_SECONDS] = K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT

            # Get MessageBus producer object for all monitor threads
            producer = self._get_producer()

            # Change to multiprocessing
            # Creating NODE monitor object
            node_monitor = ObjectMonitor(producer, K8SClientConst.NODE,
                                         **kwargs)
            self.monitors.append(node_monitor)

            _, nodes_list = _conf_stor_search.get_cluster_cardinality()
            if not nodes_list:
                Log.warn(
                    f"No nodes in the cluster to watch for nodes_list: {nodes_list}"
                )
            else:
                Log.info(f"Starting watch for: nodes_list: {nodes_list}")
            watcher_node_ids = ', '.join(node_id for node_id in nodes_list)
            kwargs[
                K8SClientConst.
                LABEL_SELECTOR] = f'cortx.io/machine-id in ({watcher_node_ids})'

            # Creating POD monitor object
            pod_monitor = ObjectMonitor(producer, K8SClientConst.POD, **kwargs)
            self.monitors.append(pod_monitor)
        except Exception as err:
            Log.error(f'Monitor failed to start watchers: {err}')
Example #11
def _is_cluster_standby_on() -> None:
    '''Check if cluster is in standby mode. If not, make standby mode ON'''

    Log.info('Check cluster is in standby mode')
    value = SimpleCommand().run_cmd(CHECK_PCS_STANDBY_MODE)

    standby_value = value[0].split(' ')[3].strip('\n').split('=')

    if standby_value[1].lower() != 'on':
        Log.warn('Cluster is not in standby mode.')
        Log.info('Switching the cluster to standby mode to perform post-upgrade routines')
        _switch_cluster_mode(PCS_CLUSTER_STANDBY)
    Log.info('#### All post-upgrade prerequisites are in place ####')
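The index-based parsing above is easiest to follow with a concrete line. The sketch below assumes CHECK_PCS_STANDBY_MODE produces crm_attribute/crm_standby style output; that exact format is an assumption, not confirmed by this snippet.

# Illustrative only: how the split/strip chain dissects a line such as
# "scope=crm_config  name=standby value=off\n" (note the double space).
line = "scope=crm_config  name=standby value=off\n"
standby_value = line.split(' ')[3].strip('\n').split('=')
print(standby_value)     # ['value', 'off']
print(standby_value[1])  # 'off' -> cluster is not in standby mode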
Example #12
    def reset(self):
        """ Performs reset. Raises exception on error """

        # Check service status
        service_obj = Service('elasticsearch.service')
        service_state = service_obj.get_state()
        if service_state._state == 'active':
            Log.warn("Elasticsearch service in active state. \n"
                     "Stopping Elasticsearch service now...")
            service_obj.stop()

        # Clear log files.
        Elasticsearch.truncate_log_files(self.log_path)
        Log.info("Reset done.")
        return 0
Example #13
    def enable_stonith(self):
        """
        Enable stonith for HW
        Returns:

        """
        # enable the stonith here
        env_type = Conf.get(const.HA_GLOBAL_INDEX,
                            f"CLUSTER_MANAGER{const._DELIM}env")
        if env_type.lower() == const.INSTALLATION_TYPE.HW.value.lower():
            Log.info("Enabling the stonith.")
            self._execute.run_cmd(const.PCS_STONITH_ENABLE)
            Log.info("Stonith enabled successfully.")
        else:
            Log.warn(f"Stonith is not enabled, detected {env_type} env")
Example #14
def mgmt_vip(cib_xml, push=False, **kwargs):
    """Create mgmt Virtual IP resource."""
    if "mgmt_info" not in kwargs.keys() or len(kwargs["mgmt_info"]) == 0:
        Log.warn("Management VIP is not detected in current configuration.")
    else:
        mgmt_info = kwargs["mgmt_info"]
        output, err, rc = process.run_cmd(f"pcs -f {cib_xml} resource create mgmt-vip ocf:heartbeat:IPaddr2 \
            ip={mgmt_info['mgmt_vip']} cidr_netmask={mgmt_info['mgmt_netmask']} nic={mgmt_info['mgmt_iface']} iflabel=mgmt_vip \
            op start timeout=60s interval=0s \
            op monitor timeout=30s interval=30s \
            op stop timeout=60s interval=0s --group management_group", check_error=False)
        if rc != 0:
            raise CreateResourceError(f"Mgmt vip creation failed, mgmt info: {mgmt_info}, Err: {err}")
    if push:
        cib_push(cib_xml)
Example #15
 def init(self):
     """
      Initialize the object using the configuration params passed.
     Establish connection with Kafka broker.
     """
     self._channel = None
     retry_count = 0
     try:
         while self._channel is None and int(
                 self._retry_counter) > retry_count:
             self.connect()
             if self._channel is None:
                 Log.warn(f"message bus producer connection Failed. Retry Attempt: {retry_count+1}" \
                     f" in {2**retry_count} seconds")
                 time.sleep(2**retry_count)
                 retry_count += 1
             else:
                 Log.debug(f"message bus producer connection is Initialized."\
                 f"Attempts:{retry_count+1}")
     except Exception as ex:
         Log.error(f"message bus producer initialization failed. {ex}")
         raise ConnectionEstError(
             f"Unable to connect to message bus broker. {ex}")
Example #16
def mgmt_vip(cib_xml, push=False, **kwargs):
    """Create mgmt Virtual IP resource."""
    mgmt_vip_start = str(
        get_res_timeout(RESOURCE.MGMT_VIP.value, TIMEOUT_ACTION.START.value))
    mgmt_vip_stop = str(
        get_res_timeout(RESOURCE.MGMT_VIP.value, TIMEOUT_ACTION.STOP.value))
    vip_health_start = str(
        get_res_timeout(RESOURCE.VIP_HEALTH_MONITOR.value,
                        TIMEOUT_ACTION.START.value))
    vip_health_stop = str(
        get_res_timeout(RESOURCE.VIP_HEALTH_MONITOR.value,
                        TIMEOUT_ACTION.STOP.value))
    if "mgmt_info" not in kwargs.keys() or len(kwargs["mgmt_info"]) == 0:
        Log.warn("Management VIP is not detected in current configuration.")
    else:
        mgmt_info = kwargs["mgmt_info"]
        output, err, rc0 = process.run_cmd(
            f"pcs -f {cib_xml} resource create {RESOURCE.VIP_HEALTH_MONITOR.value} ocf:seagate:vip_health_monitor \
            vip={mgmt_info['mgmt_vip']} nic={mgmt_info['mgmt_iface']} \
            op start timeout={vip_health_start}s interval=0s \
            op monitor timeout=29s interval=30s \
            op stop timeout={vip_health_stop}s interval=0s --group management_group",
            check_error=False)
        output, err, rc1 = process.run_cmd(
            f"pcs -f {cib_xml} resource create {RESOURCE.MGMT_VIP.value} ocf:heartbeat:IPaddr2 \
            ip={mgmt_info['mgmt_vip']} cidr_netmask={mgmt_info['mgmt_netmask']} nic={mgmt_info['mgmt_iface']} iflabel=mgmt_vip \
            op start timeout={mgmt_vip_start}s interval=0s \
            op monitor timeout=29s interval=30s \
            op stop timeout={mgmt_vip_stop}s interval=0s --group management_group",
            check_error=False)
        if rc0 != 0 or rc1 != 0:
            raise CreateResourceError(
                f"Mgmt vip creation failed, mgmt info: {mgmt_info}, Err: {err}"
            )
    if push:
        cib_push(cib_xml)
Example #17
    def process(self):
        """
        Process cleanup command.
        """
        Log.info("Processing cleanup command")
        try:
            nodes = self._confstore.get(const.CLUSTER_CONFSTORE_NODES_KEY)
            node_count: int = 0 if nodes is None else len(nodes)
            node_name = self.get_node_name()
            # Standby
            # TODO: handle multiple case for standby EOS-20855
            standby_output: str = self._cluster_manager.node_controller.standby(
                node_name)
            if json.loads(standby_output).get(
                    "status") == STATUSES.FAILED.value:
                Log.warn(
                    f"Standby for {node_name} failed with output: {standby_output}."
                    "Cluster will be destroyed forcefully")
            if CleanupCmd.LOCAL_CHECK and node_count > 1:
                # TODO: Update cluster kill for --local option also
                # Remove SSH
                self._remove_node(node_name)
            else:
                # Destroy
                self._destroy_cluster(node_name)

            if self._confstore.key_exists(
                    f"{const.CLUSTER_CONFSTORE_NODES_KEY}/{node_name}"):
                self._confstore.delete(
                    f"{const.CLUSTER_CONFSTORE_NODES_KEY}/{node_name}")
            # Delete the config file
            self.remove_config_files()
        except Exception as e:
            Log.error(f"Cluster cleanup command failed. Error: {e}")
            raise HaCleanupException("Cluster cleanup failed")
        Log.info("cleanup command is successful")
Example #18
 def destroy_cluster(self, retry_index: int = 0, force=True):
     if retry_index < const.CLUSTER_RETRY_COUNT and not self._is_pcs_cluster_running(
     ):
         Log.warn('Cluster is not running, safe to destroy the cluster')
         if force:
             Log.warn('Executing cluster kill before destroy')
             self._execute.run_cmd(const.PCS_CLUSTER_KILL)
         output = self._execute.run_cmd(const.PCS_CLUSTER_DESTROY)
         Log.error(f"Cluster is destroyed. Output: {output}")
         return
     elif retry_index == 0:
         cluster_stop_response = self.stop()
         if cluster_stop_response:
             Log.warn(
                 'Successfully stopped the cluster, destroying the cluster')
             if not self._is_pcs_cluster_running():
                 output = self._execute.run_cmd(const.PCS_CLUSTER_DESTROY)
                 Log.error(f"Cluster destroyed. Output: {output}")
                 return
      Log.info('Cluster is still running, waiting for the cluster to stop')
     time.sleep(const.BASE_WAIT_TIME)
     retry_index += 1
     self.destroy_cluster(retry_index)
Example #19
    def start(self, nodeid: str) -> dict:
        """
        Start node with nodeid.
        Args:
            nodeid (str): Node ID from cluster nodes.
        Returns:
            ([dict]): Return dictionary. {"status": "", "msg":""}
                status: Succeeded, Failed, InProgress
        """
        _node_status = self.nodes_status([nodeid])[nodeid]
        if _node_status == NODE_STATUSES.ONLINE.value:
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "msg": f"Node {nodeid}, is already in Online status"
            }
        elif _node_status == NODE_STATUSES.STANDBY.value or _node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
            # make node unstandby
            if self.heal_resource(nodeid):
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                    check_error=False)
                return {
                    "status":
                    const.STATUSES.IN_PROGRESS.value,
                    "msg":
                    f"Node {nodeid} : Node was in standby mode, "
                    f"Unstandby operation started successfully"
                }
            else:
                Log.error(
                    f"Node {nodeid} is in standby mode : Resource failcount found on the node, "
                    f"cleanup not worked after 2 retries")
                return {
                    "status":
                    const.STATUSES.FAILED.value,
                    "msg":
                    f"Node {nodeid} is in standby mode: Resource "
                    f"failcount found on the node cleanup not worked after 2 retries"
                }
        elif _node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_START.replace("<node>", nodeid),
                check_error=False)
            if _rc != 0:
                raise ClusterManagerError(f"Failed to start node {nodeid}")

            Log.info(f'Node: {nodeid} started successfully. Now, waiting for \
                       cluster to stabilize and then get the node status')

            time.sleep(const.BASE_WAIT_TIME * 2)

            # Get the status of the node again
            _node_status = self.nodes_status([nodeid])[nodeid]

            # If the node is in standby mode, unstandby here
            if _node_status == NODE_STATUSES.STANDBY.value:
                Log.warn(f'Node: {nodeid} is still in standby mode')
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                    check_error=False)
                if _rc != 0:
                    raise ClusterManagerError(
                        f"Failed to unstandby the node: {nodeid}")
                return {
                    "status":
                    const.STATUSES.IN_PROGRESS.value,
                    "msg":
                    f"Node {nodeid}: Node was in offline and then switched to standby mode, "
                    f"Cluster started on node successfully"
                }

            return {
                "status":
                const.STATUSES.IN_PROGRESS.value,
                "msg":
                f"Node {nodeid} : Node was in cluster_offline mode, "
                f"Cluster started on node successfully"
            }

        elif _node_status == NODE_STATUSES.POWEROFF.value:
            # start node not in scope of VM
            Log.error("Operation not available for node type VM")
            raise ClusterManagerError(
                f"Node {nodeid} : Node was in poweroff mode, "
                "Node start : Operation not available for VM")
        else:
            Log.error(
                f"{nodeid} status is {_node_status}, node may not be started.")
            raise ClusterManagerError(
                f"Failed to start {nodeid} as found unhandled status {_node_status}"
            )
    async def _generate_bundle(command):
        """
        Initializes the process for Generating Support Bundle at shared path.
        command:    Command Object :type: command
        return:     None.
        """
        # Get Arguments From Command
        bundle_id = command.options.get(const.SB_BUNDLE_ID)
        comment = command.options.get(const.SB_COMMENT)
        duration = command.options.get(const.SB_DURATION)
        size_limit = command.options.get(const.SB_SIZE)
        config_url = command.options.get('config_url')
        binlogs = command.options.get('binlogs')
        coredumps = command.options.get('coredumps')
        stacktrace = command.options.get('stacktrace')
        components = command.options.get('components')
        config_path = config_url.split('//')[1] if '//' in config_url else ''
        path = command.options.get('target_path')
        bundle_path = os.path.join(path, bundle_id)
        try:
            os.makedirs(bundle_path)
        except FileExistsError:
            raise BundleError(
                errno.EINVAL, "Bundle ID already exists,"
                "Please use Unique Bundle ID")

        cluster_conf = MappedConf(config_url)
        # Get Node ID
        node_id = Conf.machine_id
        if node_id is None:
            raise BundleError(errno.EINVAL, "Invalid node_id: %s", node_id)
        # Update SB status in Filestore.
        # load conf for Support Bundle
        Conf.load(const.SB_INDEX,
                  'json://' + const.FILESTORE_PATH,
                  skip_reload=True)
        data = {
            'status': 'In-Progress',
            'start_time': datetime.strftime(datetime.now(),
                                            '%Y-%m-%d %H:%M:%S')
        }
        Conf.set(const.SB_INDEX, f'{node_id}>{bundle_id}', data)
        Conf.save(const.SB_INDEX)

        node_name = cluster_conf.get(f'node>{node_id}>name')
        Log.info(f'Starting SB Generation on {node_id}:{node_name}')
        # Get required SB size per component
        components_list, service_per_comp = SupportBundle._get_component_and_services(
            cluster_conf, node_id, components)

        if not components_list:
            Log.warn(f"No component specified for {node_name} in CORTX config")
            Log.warn(f"Skipping SB generation on node:{node_name}.")
            return
        num_components = len(components_list)
        size_limit_per_comp = SupportBundle.get_component_size_limit(
            size_limit, num_components)
        bundle_obj = Bundle(bundle_id=bundle_id, bundle_path=bundle_path, \
            comment=comment,node_name=node_name, components=components_list,
            services=service_per_comp)

        # Start SB Generation on Node.
        # Adding CORTX manifest data inside support Bundle.
        try:
            # Copying config file into support bundle.
            common_locations = set()
            if config_path and os.path.exists(config_path):
                Log.info(f'For manifest data collection, taking config from \
                    {config_path} location.')
                # Remove secrets from the input config.
                conf_name = config_path.split('/')[-1]
                sb_config = config_path.replace(conf_name, 'sb_cluster.conf')
                with open(sb_config, 'w+') as sb_file:
                    with open(config_path, 'r') as f:
                        content = f.read()
                        if 'secret:' in content:
                            content = re.sub(r'secret:.+', r'secret: ****',
                                             content)
                        sb_file.write(content)
                conf_target = os.path.join(bundle_path, 'common' + config_path)
                os.makedirs(conf_target.replace(f'/{conf_name}', ''),
                            exist_ok=True)
                shutil.move(sb_config, conf_target)
                common_locations.add(config_path.split('/')[1])

            # Copying "/etc/cortx/solution" directory into support bundle
            # except for "secret" folder.
            sln_target = os.path.join(bundle_path, 'common' + const\
                .CORTX_SOLUTION_DIR)
            if os.path.exists(sln_target):
                shutil.rmtree(sln_target)
            if os.path.exists(const.CORTX_SOLUTION_DIR):
                _ = shutil.copytree(const.CORTX_SOLUTION_DIR, sln_target, \
                        ignore=shutil.ignore_patterns('secret'))
                common_locations.add(const.CORTX_SOLUTION_DIR.split('/')[1])

            # Copying RELEASE.INFO file into support bundle.
            if os.path.exists(const.CORTX_RELEASE_INFO):
                rel_target = os.path.join(bundle_path, 'common' + const\
                    .CORTX_RELEASE_INFO)
                os.makedirs(rel_target.replace('/RELEASE.INFO', ''),
                            exist_ok=True)
                shutil.copyfile(const.CORTX_RELEASE_INFO, rel_target)
                common_locations.add(const.CORTX_RELEASE_INFO.split('/')[1])
            else:
                Log.warn(f'{const.CORTX_RELEASE_INFO} file not found.')

            # Adding node resources health into the support bundle.
            health_target = os.path.join(bundle_path, 'common' + '/health')
            os.makedirs(health_target, exist_ok=True)
            with open(health_target + '/node_health.json', 'w') as fp:
                info = {}
                info["resource_usage"] = {}
                info["resource_usage"]["cpu_usage"] = SupportBundle.\
                    get_cpu_overall_usage()
                info["resource_usage"]["uptime"] = SupportBundle.\
                    get_system_uptime()
                info["resource_usage"]["disk_usage"] = SupportBundle.\
                    get_disk_overall_usage()
                info["resource_usage"]["memory_usage"] = SupportBundle.\
                    get_mem_overall_usage()
                json.dump(info, fp, indent=4)
            common_locations.add('health')

            try:
                common_path = os.path.join(bundle_path, 'common')
                common_tar = os.path.join(common_path, 'common.tar.gz')
                with tarfile.open(common_tar, "w:gz") as tar:
                    if os.path.exists(common_path):
                        tar.add(common_path, arcname='common')

                # Deleting untar directories from the common folder.
                for location in common_locations:
                    untar_location = os.path.join(common_path, location)
                    if os.path.exists(untar_location):
                        shutil.rmtree(untar_location)
            except (OSError, tarfile.TarError) as err:
                Log.error(
                    "Facing issues while adding manifest data into common "
                    "directory: {0}".format(err))

        except BundleError as be:
            Log.error(
                f"Failed to add CORTX manifest data inside Support Bundle.{be}"
            )

        try:
            await ComponentsBundle.init(bundle_obj,
                                        node_id,
                                        config_url,
                                        duration=duration,
                                        size_limit=size_limit_per_comp,
                                        binlogs=binlogs,
                                        coredumps=coredumps,
                                        stacktrace=stacktrace)
        except BundleError as be:
            Log.error(f"Bundle generation failed.{be}")
        except Exception as e:
            Log.error(f"Internal error, bundle generation failed {e}")

        if command.sub_command_name == 'generate':
            display_string_len = len(bundle_obj.bundle_id) + 4
            response_msg = (
                f"Please use the below bundle id for checking the status of support bundle."
                f"\n{'-' * display_string_len}"
                f"\n| {bundle_obj.bundle_id} |"
                f"\n{'-' * display_string_len}"
                f"\nPlease Find the file on -> {bundle_obj.bundle_path} .\n")
            return Response(output=response_msg, rc=OPERATION_SUCESSFUL)
        return bundle_obj
Example #21
    def check_resource_layout(self) -> bool:
        """
        Check that all necessary resources are created.

        "Bad" resources are logged and skipped to check others.
        """
        check_res = True
        for res, desc in self.layout.resources.items():
            resource_list = []
            try:
                name_list = []
                # Counters are syntactic sugar to specify names for several
                # identical resources
                if desc[RESOURCE_ATTRIBUTES.PROVIDER][
                        RESOURCE_ATTRIBUTES.COUNTERS]:
                    for counter in desc[RESOURCE_ATTRIBUTES.PROVIDER][
                            RESOURCE_ATTRIBUTES.COUNTERS]:
                        name_list.append(f"{res}-{counter}")
                else:
                    name_list = [res]
                for res_name in name_list:
                    # Check that resource actually exists
                    if desc[RESOURCE_ATTRIBUTES.HA][
                            RESOURCE_ATTRIBUTES.
                            MODE] == RESOURCE_ATTRIBUTES.ACTIVE_ACTIVE:
                        if desc[RESOURCE_ATTRIBUTES.GROUP] != "":
                            resource = self.status.get_resource_from_cloned_group_by_name(
                                res_name)
                            resource_list.append(resource)
                        else:
                            resource = self.status.get_clone_resource_by_name(
                                res_name)
                            resource_list.extend(resource.copies)
                    else:
                        resource = self.status.get_unique_resource_by_name(
                            res_name)
                        resource_list.append(resource)

                    if not resource_list:
                        Log.info(f"Resource {res_name} not found in status")
                        check_res = False
                        continue
            except Exception:
                check_res = False
                continue

            for a_resource in resource_list:
                # Check provider and service
                expected = "{}:{}".format(
                    desc[RESOURCE_ATTRIBUTES.PROVIDER][
                        RESOURCE_ATTRIBUTES.NAME],
                    desc[RESOURCE_ATTRIBUTES.PROVIDER][
                        RESOURCE_ATTRIBUTES.SERVICE])
                actual = a_resource.resource_agent
                if expected != actual:
                    Log.info(
                        f"{res}: invalid resource agent is used {actual} instead of {expected}"
                    )
                    check_res = False

                try:
                    if desc[RESOURCE_ATTRIBUTES.GROUP] != a_resource.group:
                        Log.info(
                            f'{res}: wrong group {a_resource.group} vs expected {desc[RESOURCE_ATTRIBUTES.GROUP]}'
                        )
                        check_res = False
                except KeyError:
                    Log.warn(f"{res} : Group is not defined.")
                # TODO: Location to be checked once component files become part of provisioning

        return check_res
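A hedged sketch of the shape of one entry in self.layout.resources, as implied by the lookups in check_resource_layout(); the literal key strings stand in for the RESOURCE_ATTRIBUTES constants and the values are made up.

# Illustrative only: one resource descriptor as check_resource_layout() reads it.
example_descriptor = {
    "provider": {
        "name": "ocf:heartbeat",      # RESOURCE_ATTRIBUTES.NAME
        "service": "IPaddr2",         # RESOURCE_ATTRIBUTES.SERVICE
        "counters": [],               # non-empty -> several "<res>-<counter>" copies
    },
    "ha": {"mode": "active_active"},  # RESOURCE_ATTRIBUTES.MODE
    "group": "management_group",      # may be "" for ungrouped resources
}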
Example #22
    async def init(command: List):
        """
        Initializes the Process of Support Bundle Generation for Every Component.
        :param command: Csm_cli Command Object :type: command
        :return:
        """
        # Fetch Command Arguments.
        Log.init("support_bundle",
                 syslog_server="localhost",
                 syslog_port=514,
                 log_path=Conf.get("cortx_conf",
                                   "support>support_bundle_path"),
                 level="INFO")

        bundle_id = command.options.get(const.SB_BUNDLE_ID, "")
        node_name = command.options.get(const.SB_NODE_NAME, "")
        comment = command.options.get(const.SB_COMMENT, "")
        components = command.options.get(const.SB_COMPONENTS, [])

        Log.debug((
            f"{const.SB_BUNDLE_ID}: {bundle_id}, {const.SB_NODE_NAME}: {node_name}, "
            f" {const.SB_COMMENT}: {comment}, {const.SB_COMPONENTS}: {components},"
            f" {const.SOS_COMP}"))
        # Read the commands YAML and check whether it exists.
        cmd_setup_file = os.path.join(Conf.get("cortx_conf", "install_path"),
                                      "cortx/utils/conf/support_bundle.yaml")
        support_bundle_config = Yaml(cmd_setup_file).load()
        if not support_bundle_config:
            ComponentsBundle._publish_log(f"No such file {cmd_setup_file}",
                                          ERROR, bundle_id, node_name, comment)
            return None
        # Path Location for creating Support Bundle.
        path = os.path.join(
            Conf.get("cortx_conf", "support>support_bundle_path"))

        if os.path.isdir(path):
            try:
                shutil.rmtree(path)
            except PermissionError:
                Log.warn(f"Incorrect permissions for path:{path}")

        bundle_path = os.path.join(path, bundle_id)
        os.makedirs(bundle_path)
        # Start Execution for each Component Command.
        threads = []
        command_files_info = support_bundle_config.get("COMPONENTS")
        # OS logs are generated separately, so even when "all" is selected, OS logs are skipped here.
        if components:
            if "all" not in components:
                components_list = list(
                    set(command_files_info.keys()).intersection(
                        set(components)))
            else:
                components_list = list(command_files_info.keys())
                components_list.remove(const.SOS_COMP)
        Log.debug(
            f"Generating for {const.SB_COMPONENTS} {' '.join(components_list)}"
        )
        for each_component in components_list:
            components_commands = []
            components_files = command_files_info[each_component]
            for file_path in components_files:
                file_data = Yaml(file_path).load()
                if file_data:
                    components_commands = file_data.get(
                        const.SUPPORT_BUNDLE.lower(), [])
                if components_commands:
                    thread_obj = threading.Thread(
                        ComponentsBundle._exc_components_cmd(
                            components_commands, bundle_id,
                            f"{bundle_path}{os.sep}", each_component,
                            node_name, comment))
                    thread_obj.start()
                    Log.debug(
                        f"Started thread -> {thread_obj.ident}  Component -> {each_component}"
                    )
                    threads.append(thread_obj)
        directory_path = Conf.get("cortx_conf", "support>support_bundle_path")
        tar_file_name = os.path.join(directory_path,
                                     f"{bundle_id}_{node_name}.tar.gz")

        ComponentsBundle._create_summary_file(bundle_id, node_name, comment,
                                              bundle_path)

        symlink_path = const.SYMLINK_PATH
        if os.path.exists(symlink_path):
            try:
                shutil.rmtree(symlink_path)
            except PermissionError:
                Log.warn(const.PERMISSION_ERROR_MSG.format(path=symlink_path))
        os.makedirs(symlink_path, exist_ok=True)

        # Wait until all the threads have completed execution.
        for each_thread in threads:
            Log.debug(
                f"Waiting for thread - {each_thread.ident} to complete process"
            )
            each_thread.join(timeout=1800)
        try:
            Log.debug(
                f"Generating tar.gz file on path {tar_file_name} from {bundle_path}"
            )
            Tar(tar_file_name).dump([bundle_path])
        except Exception as e:
            ComponentsBundle._publish_log(f"Could not generate tar file {e}",
                                          ERROR, bundle_id, node_name, comment)
            return None
        try:
            Log.debug("Create soft-link for generated tar.")
            os.symlink(
                tar_file_name,
                os.path.join(symlink_path,
                             f"{const.SUPPORT_BUNDLE}.{bundle_id}"))
            ComponentsBundle._publish_log(
                f"Tar file linked at location - {symlink_path}", INFO,
                bundle_id, node_name, comment)
        except Exception as e:
            ComponentsBundle._publish_log(f"Linking failed {e}", ERROR,
                                          bundle_id, node_name, comment)
        finally:
            if os.path.isdir(bundle_path):
                shutil.rmtree(bundle_path)
        msg = "Support bundle generation completed."
        ComponentsBundle._publish_log(msg, INFO, bundle_id, node_name, comment)
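A hedged sketch of what the loaded support_bundle.yaml looks like, as implied by the COMPONENTS lookup and the per-file "support_bundle" section read in init() above; the component names and file paths are made up.

# Illustrative only: structure returned by Yaml(cmd_setup_file).load().
support_bundle_config = {
    "COMPONENTS": {
        "csm": ["/hypothetical/path/csm_commands.yaml"],
        "os": ["/hypothetical/path/os_commands.yaml"],
    }
}
# Each listed file is itself YAML; its "support_bundle" key holds the commands
# executed for that component.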
Example #23
    def stop(self) -> dict:
        """
        Stop the cluster and all services. This is a blocking call.

        Returns:
            ([dict]): Return dictionary. {"status": "", "msg":""}
                status: Succeeded, Failed, InProgress
        """
        status: str = ""
        if not self._is_pcs_cluster_running():
            raise ClusterManagerError(
                "Cluster not running on current node. "
                "To stop the cluster, it should be running on the current node.")
        node_group: list = self._get_node_group()
        local_node: str = ConfigManager.get_local_node()
        Log.info(
            f"Node group for cluster stop {node_group}, local node {local_node}"
        )
        self_group: list = list(
            filter(lambda group: (local_node in group), node_group))[0]
        node_group.remove(self_group)
        offline_nodes = self._get_filtered_nodes(
            [NODE_STATUSES.POWEROFF.value])
        # Stop cluster for other group
        for node_subgroup in node_group:
            for nodeid in node_subgroup:
                # Offline node can not be started without stonith.
                if nodeid not in offline_nodes:
                    if self.heal_resource(nodeid):
                        time.sleep(const.BASE_WAIT_TIME)
                    res = json.loads(
                        self._controllers[const.NODE_CONTROLLER].stop(nodeid))
                    Log.info(f"Stopping node {nodeid}, output {res}")
                    if NODE_STATUSES.POWEROFF.value in res.get("msg"):
                        offline_nodes.append(nodeid)
                        Log.warn(
                            f"Node {nodeid} is offline or lost from the network."
                        )
                    elif res.get("status") == const.STATUSES.FAILED.value:
                        raise ClusterManagerError(
                            f"Cluster Stop failed. Unable to stop {nodeid}")
                    else:
                        Log.info(f"Node {nodeid} stop is in progress.")
                else:
                    Log.info(
                        f"Node {nodeid} is offline or lost from the network.")
            # Wait till the resources get stopped.
            Log.info(f"Waiting for the stop of {node_subgroup} to complete.")
        # Stop self group of cluster
        try:
            Log.info(
                f"Please wait, trying to stop the self node group: {self_group}")
            timeout = const.NODE_STOP_TIMEOUT * len(self_group)
            self._execute.run_cmd(
                const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
            Log.info("Cluster stop completed.")
        except Exception as e:
            raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
        status = "Cluster stop is in progress."
        if len(offline_nodes) != 0:
            status += f" Warning, Found {offline_nodes}, may be poweroff or not in network"
        return {"status": const.STATUSES.IN_PROGRESS.value, "msg": status}
Example #24
    def start(self, node_id: str, **op_kwargs) -> dict:
        """
        Start node with the node_id.
        Args:
            node_id (str): Node ID from cluster nodes.
        Returns:
            ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
                status: Succeeded, Failed, InProgress
        """
        try:
            # Get the node_name (pvtfqdn) from node_id
            node_name = ConfigManager.get_node_name(node_id=node_id)
            self._is_node_in_cluster(node_id=node_name)
            node_status = self.nodes_status([node_name])[node_name]
            Log.debug(f"Node {node_name} cluster status is {node_status}")
            node_health = self._system_health.get_node_status(
                node_id=node_id).get("status")
            Log.debug(f"Node {node_name} health is {node_health}")
            if node_status == NODE_STATUSES.ONLINE.value and node_health == HEALTH_STATUSES.ONLINE.value:
                Log.debug(f"Node {node_name} is already online")
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": NODE_STATUSES.ONLINE.value,
                    "error": ""
                }
            elif node_status == NODE_STATUSES.STANDBY.value or node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
                # Unstandby the node
                if self.heal_resource(node_name):
                    _output, _err, _rc = self._execute.run_cmd(
                        const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                        check_error=False)
                    if _rc != 0:
                        Log.error(
                            f"Failed to start node {node_name}, Error: {_err}")
                        return {
                            "status":
                            const.STATUSES.FAILED.value,
                            "output":
                            "",
                            "error":
                            f"Failed to start node {node_id}, Error: {_err}"
                        }
                    Log.debug(
                        f"Node {node_name} was in standby mode, unstandby operation started successfully"
                    )
                else:
                    Log.error(
                        f"Node {node_name} is in standby mode : Resource failcount found on the node, cleanup did not work"
                    )
                    return {
                        "status":
                        const.STATUSES.FAILED.value,
                        "output":
                        "",
                        "error":
                        f"Node {node_id} is in standby mode, resource failcount found on the node, cleanup did not work"
                    }
            elif node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_START.replace("<node>", node_name),
                    check_error=False)
                if _rc != 0:
                    Log.error(
                        f"Failed to start node {node_name}, Error: {_err}")
                    return {
                        "status": const.STATUSES.FAILED.value,
                        "output": "",
                        "error":
                        f"Failed to start node {node_id}, Error: {_err}"
                    }
                Log.debug(
                    f"Node {node_name} started successfully. Waiting for cluster to stabalize and then get the node status"
                )
                time.sleep(const.BASE_WAIT_TIME * 2)
                # Get the status of the node again
                node_status = self.nodes_status([node_name])[node_name]
                # If the node is in standby mode, unstandby here
                if node_status == NODE_STATUSES.STANDBY.value:
                    Log.warn(f'Node {node_name} is still in standby mode')
                    _output, _err, _rc = self._execute.run_cmd(
                        const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                        check_error=False)
                    if _rc != 0:
                        Log.error(
                            f"Failed to start node {node_name}, Error: {_err}")
                        return {
                            "status":
                            const.STATUSES.FAILED.value,
                            "output":
                            "",
                            "error":
                            f"Failed to start node {node_id}, Error: {_err}"
                        }
            else:
                Log.error(
                    f"{node_name} status is {node_status}, node cannot be started."
                )
                return {
                    "status":
                    const.STATUSES.FAILED.value,
                    "output":
                    "",
                    "error":
                    f"Node {node_id} status is {node_status}, node cannot be started."
                }

            # TODO: Update the storage enclosure status in system health.
            # Update the node status in system health
            self._update_health(const.COMPONENTS.NODE.value, node_id,
                                HEALTH_EVENTS.FAULT_RESOLVED.value)
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": NODE_STATUSES.ONLINE.value,
                "error": ""
            }
        except Exception as e:
            Log.error(f"Failed to start node {node_id}")
            raise ClusterManagerError(
                f"Failed to start node {node_id}, Error {e}")
Example #25
    def stop(self, sync=False, timeout=30) -> dict:
        """
        Stop the cluster and all services.

        Args:
            sync (bool, optional): if sync is True then stop will check the status for timeout seconds.
            timeout (int, optional): timeout(in seconds) can be specified for sync=True otherwise ignored.

        Returns:
            ([dict]): Return dictionary. {"status": "", "output":"", "error":""}
                status: Succeeded, Failed, InProgress
        """
        status: str = ""
        if not self._is_pcs_cluster_running():
            raise ClusterManagerError(
                "Cluster not running on current node. "
                "To stop the cluster, it should be running on the current node.")
        node_group: list = self._get_node_group()
        local_node: str = ConfigManager.get_local_node()
        Log.info(
            f"Node group for cluster stop {node_group}, local node {local_node}"
        )
        self_group: list = list(
            filter(lambda group: (local_node in group), node_group))[0]
        node_group.remove(self_group)
        offline_nodes = self._get_filtered_nodes(
            [NODE_STATUSES.POWEROFF.value])
        # Stop cluster for other group
        for node_subgroup in node_group:
            for node_name in node_subgroup:
                # Offline node can not be started without stonith.
                if node_name not in offline_nodes:
                    if self.heal_resource(node_name):
                        time.sleep(const.BASE_WAIT_TIME)
                    node_id = ConfigManager.get_node_id(node_name)
                    res = json.loads(
                        self._controllers[const.NODE_CONTROLLER].stop(node_id))
                    Log.info(f"Stopping node {node_id}, output {res}")
                    if NODE_STATUSES.POWEROFF.value in res.get("output"):
                        offline_nodes.append(node_id)
                        Log.warn(
                            f"Node {node_id} is offline or lost from the network."
                        )
                    elif res.get("status") == const.STATUSES.FAILED.value:
                        raise ClusterManagerError(
                            f"Cluster Stop failed. Unable to stop {node_id}")
                    else:
                        Log.info(f"Node {node_id} stop is in progress.")
                else:
                    Log.info(
                        f"Node {node_name} is offline or lost from the network."
                    )
            # Wait till the resources get stopped.
            Log.info(f"Waiting for the stop of {node_subgroup} to complete.")
        # Stop self group of cluster
        try:
            Log.info(
                f"Please wait, trying to stop the self node group: {self_group}")
            timeout = const.NODE_STOP_TIMEOUT * len(self_group)
            self._execute.run_cmd(
                const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
            Log.info("Cluster stop completed.")
        except Exception as e:
            raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
        status = "Cluster stop is in progress."
        if len(offline_nodes) != 0:
            status += f" Warning, Found {offline_nodes}, may be poweroff or not in network"

        if sync:
            timeout = timeout - const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE * len(
                node_group)
            in_expected_state = self._verify_expected_cluster_status(
                const.CLUSTER_STATUS.OFFLINE, timeout)
            if in_expected_state:
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": "Cluster is offline.",
                    "error": ""
                }
            else:
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "Retry suggested.",
                    "error": "Operation timed out."
                }

        return {
            "status": const.STATUSES.IN_PROGRESS.value,
            "output": status,
            "error": ""
        }