Example #1
 def stop(self, nodeid: str, timeout: int = -1) -> dict:
     """
     Stop Cluster on node with nodeid.
     Args:
         nodeid (str): Node ID from cluster nodes.
         timeout (int): Seconds to wait for the node to stop; a negative value
             falls back to const.NODE_STOP_TIMEOUT.
     Returns:
         ([dict]): Return dictionary. {"status": "", "msg":""}
             status: Succeeded, Failed, InProgress
     """
     timeout = const.NODE_STOP_TIMEOUT if timeout < 0 else timeout
     node_status = self.nodes_status([nodeid]).get(nodeid)
     if node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
         Log.info(f"For stop {nodeid}, Node already in offline state.")
         status = f"Node {nodeid} is already in offline state."
     elif node_status == NODE_STATUSES.POWEROFF.value:
         raise ClusterManagerError(f"Failed to stop {nodeid}."
                                   f"node is in {node_status}.")
     else:
         if self.heal_resource(nodeid):
             time.sleep(const.BASE_WAIT_TIME)
         try:
             Log.info(f"Please Wait, trying to stop node: {nodeid}")
             self._execute.run_cmd(
                 const.PCS_STOP_NODE.replace("<node>", nodeid).replace(
                     "<seconds>", str(timeout)))
             Log.info(
                 f"Executed node stop for {nodeid}, Waiting to stop resource"
             )
             time.sleep(const.BASE_WAIT_TIME)
             status = f"Stop for {nodeid} is in progress, waiting to stop resource"
         except Exception as e:
             raise ClusterManagerError(
                 f"Failed to stop {nodeid}, Error: {e}")
     return {"status": const.STATUSES.IN_PROGRESS.value, "msg": status}
Example #2
    def stop(self, node_id: str, excludeResourceList: list = None) -> dict:
        """
        Stop service.

        Args:
            node_id (str): Private FQDN defined in the conf store.
            excludeResourceList (list): Resources that should not be stopped.

        Returns:
            ([dict]): Return dictionary. {"status": "", "msg": ""}
                status: Succeeded, Failed, InProgress
        """
        try:
            # Treat a missing exclude list as empty to avoid iterating over None below.
            excludeResourceList = excludeResourceList if excludeResourceList is not None else []
            resources: list = []
            output, _, _ = self._execute.run_cmd(const.LIST_PCS_RESOURCES, check_error=False)
            if "NO resources".lower() not in output.lower():
                for resource in output.split("\n"):
                    res = resource.split(":")[0]
                    if res != "" and res not in resources and res not in excludeResourceList:
                        resources.append(res)
            for resource in resources:
                self._execute.run_cmd(const.PCS_BAN_RESOURCES.replace("<resource_id>", resource).replace("<node>", node_id))
            Log.info(f"Waiting to stop resource on node {node_id}")
            time.sleep(const.BASE_WAIT_TIME)
            # TODO: Check if the resources are stopped EOS-23386
            return {"status": const.STATUSES.IN_PROGRESS.value, "msg": f"Resources stopped on node {node_id}"}
        except Exception as e:
            raise ClusterManagerError(f"Failed to stop resources on {node_id}, Error: {e}")
Example #3
    def get_system_health(
            self,
            element: CLUSTER_ELEMENTS = CLUSTER_ELEMENTS.CLUSTER.value,
            depth: int = 1,
            **kwargs) -> json:
        """
        Return health status for the requested elements.
        Args:
            element ([CLUSTER_ELEMENTS]): The element whose health status is to be returned.
            depth ([int]): Depth of elements, starting from the input "element", for which
                the health status is to be returned.
            **kwargs([dict]): Variable number of arguments that are used as filters,
                e.g. "id" of the input "element".
        Returns:
            ([dict]): Returns dictionary. {"status": "Succeeded"/"Failed"/"Partial", "output": "", "error": ""}
                status: Succeeded, Failed, Partial
                output: Dictionary with element health status
                error: Error information if the request "Failed"
        """

        try:
            # Fetch the health status
            system_health_controller = SystemHealthController(self._confstore)
            return system_health_controller.get_status(component=element,
                                                       depth=depth,
                                                       version=self._version,
                                                       **kwargs)
        except Exception as e:
            Log.error(f"Failed returning system health . Error: {e}")
            raise ClusterManagerError(
                "Failed returning system health, internal error")
Example #4
    def nodes_status(self, nodeids: list = None) -> dict:
        """
        Get pcs status of nodes.
        Args:
            nodeids (list): List of Node IDs from cluster nodes.
                By default, the status of all nodes is returned.
                If 'local', the local node status is returned.

        Returns:
            ([dict]): Return dictionary. {"node_id1": "status of node_id1",
                                          "node_id2": "status of node_id2"...}
        """
        nodeids = self._get_node_list() if nodeids is None or len(nodeids) == 0 else nodeids
        all_nodes_status = dict()
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES,
                                                   check_error=False)
        if not isinstance(nodeids, list):
            raise ClusterManagerError(
                f"Invalid nodeids type `{type(nodeids)}`, required `list`")
        for nodeid in nodeids:
            if nodeid in _output:
                for status in _output.split("\n"):
                    nodes = status.split(":")
                    if len(nodes) > 1 and nodeid.lower() in nodes[1].strip().lower():
                        node_state = nodes[0].strip().lower()
                        if node_state == NODE_STATUSES.STANDBY.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.STANDBY.value
                        elif node_state == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value
                        elif node_state == NODE_STATUSES.MAINTENANCE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.MAINTENANCE.value
                        elif node_state == NODE_STATUSES.CLUSTER_OFFLINE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.CLUSTER_OFFLINE.value
                        elif node_state == NODE_STATUSES.ONLINE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.ONLINE.value
                        break
                else:
                    all_nodes_status[nodeid] = NODE_STATUSES.UNKNOWN.value
            else:
                raise HAInvalidNode(f"Node {nodeid} is not a part of cluster")
        for node in all_nodes_status.keys():
            status = all_nodes_status[node]
            if status == NODE_STATUSES.CLUSTER_OFFLINE.value:
                _output, _err, _rc = self._execute.run_cmd(f"ping -c 1 {node}",
                                                           check_error=False)
                if _rc != 0:
                    all_nodes_status[node] = NODE_STATUSES.POWEROFF.value
        return all_nodes_status
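
The loop above keys off `pcs status nodes` lines of the form "<state>: <node> <node> ...". A self-contained sketch of the same parsing idea over illustrative output (the sample text below is made up, not captured from a real cluster):

    def parse_node_states(output: str) -> dict:
        """Map each node name to the lower-cased state label of its line."""
        states = {}
        for line in output.split("\n"):
            parts = line.split(":")
            if len(parts) > 1:
                for node in parts[1].split():
                    states[node] = parts[0].strip().lower()
        return states

    sample = "Online: srvnode-1 srvnode-2\nStandby: srvnode-3\nOffline: srvnode-4"
    print(parse_node_states(sample))
    # {'srvnode-1': 'online', 'srvnode-2': 'online', 'srvnode-3': 'standby', 'srvnode-4': 'offline'}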
Example #5
    def create_cluster(self, name: str, user: str, secret: str,
                       nodeid: str) -> dict:
        """
        Create cluster if not created.

        Args:
            name (str): Cluster name.
            user (str): Cluster User.
            secret (str): Cluster password.
            nodeid (str): Node name (node id) of the current node.

        Returns:
            dict: Return dictionary. {"status": "", "output":"", "error":""}
        """
        try:
            self._check_non_empty(name=name,
                                  user=user,
                                  secret=secret,
                                  nodeid=nodeid)
            if not self._is_pcs_cluster_running():
                self._auth_node(nodeid, user, secret)
                self._execute.run_cmd(
                    const.PCS_SETUP_CLUSTER.replace("<cluster_name>",
                                                    name).replace(
                                                        "<node>", nodeid))
                Log.info("Pacmaker cluster created, waiting to start node.")
                self._execute.run_cmd(const.PCS_CLUSTER_START_NODE)
                self._execute.run_cmd(const.PCS_CLUSTER_ENABLE)
                Log.info("Node started and enabled successfully.")
                time.sleep(const.BASE_WAIT_TIME * 2)
            if self._is_pcs_cluster_running():
                if self.wait_for_node_online(nodeid):
                    # TODO: Divide class into vm, hw when stonith is needed.
                    self._execute.run_cmd(const.PCS_STONITH_DISABLE)
                    return {
                        "status": const.STATUSES.SUCCEEDED.value,
                        "output": "Cluster created successfully.",
                        "error": ""
                    }
                else:
                    raise ClusterManagerError("Node is not online.")
            else:
                raise ClusterManagerError("Cluster is not started.")
        except Exception as e:
            raise ClusterManagerError(f"Failed to create cluster. Error: {e}")
Example #6
 def _get_cluster_size(self):
     """
     Get the cluster size (number of node entries reported by pcsd status).
     """
     try:
         _output, _err, _rc = self._execute.run_cmd(const.PCS_CLUSTER_PCSD_STATUS)
         return len(_output.split("\n"))
     except Exception as e:
         raise ClusterManagerError(f"Unable to get cluster : with reason : {e}")
Example #7
 def _get_node_group(self) -> list:
     """
     Get list of node groups, each of size const.PCS_NODE_GROUP_SIZE.
     """
     if self._is_pcs_cluster_running() is False:
         raise ClusterManagerError(
             "Cluster is not running on current node.")
     res = json.loads(self.node_list())
     if res.get("status") != const.STATUSES.SUCCEEDED.value:
         raise ClusterManagerError("Failed to get node list.")
     else:
         node_list: list = res.get("output")
         Log.info(f"Node List : {node_list}")
         if node_list is not None:
             node_group: list = [
                 node_list[i:i + const.PCS_NODE_GROUP_SIZE] for i in range(
                     0, len(node_list), const.PCS_NODE_GROUP_SIZE)
             ]
     return node_group
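
The slicing comprehension above is the standard way to split a flat list into fixed-size groups. A standalone sketch (the group size of 2 is only a stand-in for const.PCS_NODE_GROUP_SIZE):

    node_list = ["srvnode-1", "srvnode-2", "srvnode-3", "srvnode-4", "srvnode-5"]
    group_size = 2  # stand-in for const.PCS_NODE_GROUP_SIZE
    node_group = [node_list[i:i + group_size]
                  for i in range(0, len(node_list), group_size)]
    # [['srvnode-1', 'srvnode-2'], ['srvnode-3', 'srvnode-4'], ['srvnode-5']]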
Example #8
 def load_json_file(json_file):
     """
     Load a JSON file to read the node and cluster details used to auth nodes.
     :param json_file: Path to the JSON descriptor file.
     """
     try:
         with open(json_file) as f:
             return json.load(f)
     except Exception as e:
         raise ClusterManagerError(f"Error in reading desc_file, reason : {e}")
Example #9
    def _check_non_empty(self, **kwargs):
        """
        Check if params are not empty.

        Raises:
            ClusterManagerError: If any parameter is None or an empty string.
        """
        for key in kwargs.keys():
            if kwargs[key] is None or kwargs[key] == "":
                raise ClusterManagerError(f"Failed: Invalid parameter, {key} cannot be empty.")
Example #10
    def start(self) -> dict:
        """
        Start cluster and all services.

        Returns:
            ([dict]): Return dictionary. {"status": "", "msg":""}
                status: Succeeded, Failed, InProgress
        """
        if self._is_pcs_cluster_running() is False:
            res = self._pcs_cluster_start()
            time.sleep(const.BASE_WAIT_TIME)
            if res != const.STATUSES.SUCCEEDED.value:
                raise ClusterManagerError("Cluster start operation failed")
        status = ""
        failed_node_list: list = []
        try:
            node_group: list = self._get_node_group()
            for node_subgroup in node_group:
                for node_id in node_subgroup:
                    res = json.loads(self._controllers[
                        const.NODE_CONTROLLER].start(node_id))
                    Log.info(f'res: {res}')
                    if res.get("status") == const.STATUSES.FAILED.value:
                        msg = res.get("msg")
                        Log.error(f"Node {node_id} : {msg}")
                        failed_node_list.append(node_id)
                # Wait till all the resources get started in the sub group
                time.sleep(const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE)
        except Exception as e:
            raise ClusterManagerError(f"Failed to start Cluster. Error: {e}")
        status = "Cluster start is in process."
        if len(failed_node_list) != 0 and len(failed_node_list) != len(
                json.loads(self.node_list()).get("msg")):
            status += f"Warning, Some of nodes failed to start are {failed_node_list}"
        elif len(failed_node_list) != 0:
            raise ClusterManagerError(
                f"Failed to start all nodes {failed_node_list}")
        else:
            status += "All node started successfully, resource start in progress."
        return {
            "status": const.STATUSES.IN_PROGRESS.value,
            "msg": "Cluster start operation performed"
        }
Example #11
    def _create_req(self, target_node_name: str) -> str:
        """
        Create actuator request for enclosure stop

        Args:
            target_node_name : node on which enclosure is to be stopped

        Return:
            created actuator request
        """

        with open(ACTUATOR_SCHEMA, 'r') as actuator_req_schema_file:
            actuator_req = json.load(actuator_req_schema_file)

        # Get details for current node
        node_name = Conf.get(const.HA_GLOBAL_INDEX,
                             f"CLUSTER_MANAGER{_DELIM}local_node")

        key_val = self._conf_store.get(f"{PVTFQDN_TO_NODEID_KEY}/{node_name}")
        _, node_id = key_val.popitem()

        system_health = SystemHealth(self._conf_store)
        health_manager = SystemHealthManager(self._conf_store)
        key = system_health._prepare_key(const.COMPONENTS.NODE_MAP.value,
                                         node_id=node_id)

        node_map_val = health_manager.get_key(key)
        if node_map_val is None:
            raise ClusterManagerError("Failed to fetch node_map value")

        node_map_dict = ast.literal_eval(node_map_val)
        self._generate_uuid()

        # Populate the actuator request schema
        actuator_req[ACTUATOR_ATTRIBUTES.TIME] = str(int(time.time()))
        request_path = actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.REQUEST_PATH]
        request_path[ACTUATOR_ATTRIBUTES.SITE_ID] = node_map_dict[NODE_MAP_ATTRIBUTES.SITE_ID.value]
        request_path[ACTUATOR_ATTRIBUTES.RACK_ID] = node_map_dict[NODE_MAP_ATTRIBUTES.RACK_ID.value]
        request_path[ACTUATOR_ATTRIBUTES.NODE_ID] = node_id
        actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.TARGET_NODE_ID] = target_node_name
        actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.HEADER][ACTUATOR_ATTRIBUTES.UUID] = self._uuid

        # The file handle is already closed by the with-block above; just serialize the request.
        req = json.dumps(actuator_req)

        return req
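
The method above follows a load-template, fill-nested-fields, serialize pattern. A minimal self-contained sketch of that pattern (the dictionary below is an illustrative stand-in for the ACTUATOR_SCHEMA file contents, not the real schema):

    import json
    import time

    # Illustrative template; the real keys come from ACTUATOR_ATTRIBUTES.
    request = {"time": "", "message": {"request_path": {"node_id": ""}, "target_node_id": ""}}
    request["time"] = str(int(time.time()))
    request["message"]["request_path"]["node_id"] = "1"
    request["message"]["target_node_id"] = "srvnode-2"
    payload = json.dumps(request)  # JSON string handed to the message producer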
Example #12
 def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
     """
     Stop Node with nodeid.
     Args:
         node_id (str): Node ID from cluster nodes.
         timeout (int): Seconds to wait for the node to stop; a negative value
             falls back to const.NODE_STOP_TIMEOUT.
     Returns:
         ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
             status: Succeeded, Failed, InProgress
     """
     check_cluster = op_kwargs.get("check_cluster") if op_kwargs.get(
         "check_cluster") is not None else True
     # Get the node_name (pvtfqdn) from node_id and raise exception if node_id is not valid
     node_name = ConfigManager.get_node_name(node_id=node_id)
     try:
         timeout = const.NODE_STOP_TIMEOUT if timeout < 0 else timeout
         node_status = self._system_health.get_node_status(
             node_id=node_id).get("status")
         if node_status == HEALTH_STATUSES.OFFLINE.value:
             Log.info(
                 f"For stop node id {node_id}, Node already in offline state."
             )
             status = f"Node with node id {node_id} is already in offline state."
             return {
                 "status": const.STATUSES.SUCCEEDED.value,
                 "output": status,
                 "error": ""
             }
         elif node_status == HEALTH_STATUSES.FAILED.value:
             # In case VM, if node is Poweroff or Disconnected, system health will be updated with status FAILED.
             return {
                 "status":
                 const.STATUSES.FAILED.value,
                 "output":
                 "",
                 "error":
                 f"Node {node_id} status is {node_status}, node cannot be stopped."
             }
         else:
             if self.heal_resource(node_name):
                 time.sleep(const.BASE_WAIT_TIME)
             if check_cluster:
                 # Checks whether cluster is going to be offline if node with node_name is stopped.
                 res = json.loads(
                     self.check_cluster_feasibility(node_id=node_id))
                 if res.get("status") == const.STATUSES.FAILED.value:
                     return res
     except Exception as e:
         raise ClusterManagerError(
             f"Failed to stop node {node_id}, Error: {e}")
Example #13
    def _get_node_list(self) -> list:
        """
        Return list of nodes.
        """
        # TODO: This is a temporary implementation; it should be removed once the node list is available in the system health.
        nodelist = []
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES, check_error=False)

        if _rc != 0:
            raise ClusterManagerError("Failed to get nodes status")
        for status in _output.split("\n"):
            nodes = status.split(":")
            if len(nodes) > 1:
                nodelist.extend(nodes[1].split())
        return nodelist
Example #14
    def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
        """
        Stop Node with nodeid.
        Args:
            node_id (str): Node ID from cluster nodes.
        Returns:
            ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
                status: Succeeded, Failed, InProgress
        """
        # Get the node_name (pvtfqdn) from node_id and raise exception if node_id is not valid
        node_name = ConfigManager.get_node_name(node_id=node_id)
        try:
            stop_status = json.loads(super().stop(node_id, **op_kwargs))
            if stop_status is not None:
                if stop_status["status"] == const.STATUSES.SUCCEEDED.value:
                    # Node is already in offline state.
                    return stop_status
                elif stop_status["status"] == const.STATUSES.FAILED.value:
                    # Node is in failed state.
                    return stop_status

            # Put node in standby mode
            self._execute.run_cmd(
                const.PCS_NODE_STANDBY.replace("<node>", node_name),
                f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
            Log.info(f"Executed node standby for node {node_id}")
            # TODO: EOS-23859 : STOP NODE - Use PCS_STOP_NODE from const.py with timeout value
            status = f"Standby for node {node_id} is in progress"

            # Update node health
            # TODO : Health event update to be removed once fault_tolerance branch is merged
            initial_event = self._system_health.get_health_event_template(
                nodeid=node_id, event_type=HEALTH_EVENTS.FAULT.value)
            Log.debug(
                f"Node health : {initial_event} updated for node {node_id}")
            health_event = HealthEvent.dict_to_object(initial_event)
            self._system_health.process_event(health_event)
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": status,
                "error": ""
            }

        except Exception as e:
            raise ClusterManagerError(
                f"Failed to stop node {node_id}, Error: {e}")
Example #15
    def enclosure_stop(
        self,
        node_name: str,
        timeout: int = int(ACTUATOR_RESP_RETRY_COUNT * ACTUATOR_RESP_WAIT_TIME)
    ) -> bool:
        """
        Send actuator request to monitor for stopping enclosure

        Args:
            node_name : node on which enclosure is to be stopped

        Return:
            True : if enclosure stop successful; else exception is raised
        """
        req = self._create_req(node_name)
        self._register_for_resp()
        self._send_req(req)
        retry_count = int(timeout // ACTUATOR_RESP_WAIT_TIME)

        # Wait for 60 sec max. Expected max wait time = 40 sec + 20 sec buffer
        for _ in range(0, retry_count):
            time.sleep(ACTUATOR_RESP_WAIT_TIME)
            if self._is_resp_received:
                self.consumer.stop()
                break

        if self._is_resp_received and self._encl_shutdown_successful:
            Log.info(f"Enclosure shutdown successful on node {node_name}")
            self._is_resp_received = self._encl_shutdown_successful = False
            return True

        if not self._is_resp_received:
            self.timeout_reached = True
            self.consumer.stop()
            self.timeout_reached = False
            Log.error(
                f"Actuator response not received; enclosure shutdown failed on node {node_name}"
            )
        else:
            Log.error(f"Unable to shutdown enclosure on node {node_name}")

        self._is_resp_received = self._encl_shutdown_successful = False
        raise ClusterManagerError("Failed to shutdown the enclosure")
Example #16
 def _auth_node(self, node_id, cluster_user, cluster_password):
     """
     Authenticate a node so that it can be added to the cluster.
     """
     try:
         auth_cmd = const.PCS_CLUSTER_NODE_AUTH.replace(
             "<node>",
             node_id).replace("<username>",
                              cluster_user).replace("<password>",
                                                    cluster_password)
         self._execute.run_cmd(auth_cmd, secret=cluster_password)
         Log.info(
             f"Node {node_id} authenticated with {cluster_user} Successfully."
         )
     except Exception as e:
         Log.error(
             f"Failed to authenticate node : {node_id} with reason : {e}")
         raise ClusterManagerError(
             f"Failed to authenticate node : {node_id}, Please check username or password"
         )
Example #17
    def clear_resources(self, node_id: str):
        """
        Clear resources on node.

        Args:
            node_id (str): Private FQDN defined in the conf store.
        """
        try:
            resources: list = []
            output, _, _ = self._execute.run_cmd(const.LIST_PCS_RESOURCES, check_error=False)
            if "NO resources".lower() not in output.lower():
                for resource in output.split("\n"):
                    res = resource.split(":")[0]
                    if res != "" and res not in resources:
                        resources.append(res)
            for resource in resources:
                self._execute.run_cmd(const.PCS_CLEAR_RESOURCES.replace("<resource_id>", resource).replace("<node>", node_id))
            Log.info(f"Cleared resource on node {node_id}")
        except Exception as e:
            raise ClusterManagerError(f"Failed to clear resources on {node_id}, Error: {e}")
Example #18
 def start(self, node_id: str, **op_kwargs) -> dict:
     """
     Start node with the node_id.
     Args:
         node_id (str): Node ID from cluster nodes.
     Returns:
         ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
             status: Succeeded, Failed, InProgress
     """
     try:
         poweron = op_kwargs.get("poweron") if op_kwargs.get(
             "poweron") is not None else False
         node_name = ConfigManager.get_node_name(node_id=node_id)
         self._is_node_in_cluster(node_id=node_name)
         power_status = self.fencing_agent.power_status(node_id=node_name)
         if power_status == const.SERVER_POWER_STATUS.OFF.value and poweron is False:
             Log.debug(
                 f"Node {node_name} is powered-off and poweron was not set")
             return {
                 "status": const.STATUSES.FAILED.value,
                 "output": "",
                 "error":
                 f"Node {node_id} is powered-off, use poweron option"
             }
         elif power_status == const.SERVER_POWER_STATUS.OFF.value and poweron is True:
             self.fencing_agent.power_on(node_id=node_name)
             Log.debug(
                 f"Node {node_name} is powered-on, waiting for node boot")
             time.sleep(const.NODE_POWERON_DELAY)
         start_status = super().start(node_id, **op_kwargs)
         # TODO: Move this to the base class once stonith is disabled during stop for VMs as well.
         start_status_json = json.loads(start_status)
         if start_status_json["status"] == const.STATUSES.SUCCEEDED.value:
             self._execute.run_cmd(
                 const.ENABLE_STONITH.replace("<node>", node_name))
         return start_status
     except Exception as e:
         raise ClusterManagerError(f"Failed to start {node_id}, Error: {e}")
Example #19
    def start(self, sync=False, timeout=120) -> dict:
        """
        Start cluster and all services.

        Args:
            sync (bool, optional): if sync is True then start will check the status for timeout seconds.
            timeout (int, optional): timeout(in seconds) can be specified for sync=True otherwise ignored.

        Returns:
            ([dict]): Return dictionary. {"status": "", "output":"", "error":""}
                status: Succeeded, Failed, InProgress
        """
        # Current behavior of start brings up all the nodes and takes them all out of standby.
        # This deviates from cluster start as envisioned, so the documentation should change.

        if self._is_pcs_cluster_running() is False:
            res = self._pcs_cluster_start()
            time.sleep(const.BASE_WAIT_TIME)
            if res != const.STATUSES.SUCCEEDED.value:
                raise ClusterManagerError("Cluster start operation failed")
        status = ""
        failed_node_list: list = []
        try:
            node_group: list = self._get_node_group()
            for node_subgroup in node_group:
                for node_name in node_subgroup:
                    node_id = ConfigManager.get_node_id(node_name)
                    res = json.loads(self._controllers[
                        const.NODE_CONTROLLER].start(node_id))
                    Log.info(f'res: {res}')
                    if res.get("status") == const.STATUSES.FAILED.value:
                        msg = res.get("error")
                        Log.error(f"Node {node_name} : {msg}")
                        failed_node_list.append(node_name)
                # Wait till all the resources get started in the sub group
                time.sleep(const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE)
        except Exception as e:
            raise ClusterManagerError(f"Failed to start Cluster. Error: {e}")
        status = "Cluster start is in process."
        if len(failed_node_list) != 0 and len(failed_node_list) != len(
                json.loads(self.node_list()).get("output")):
            status += f"Warning, Some of nodes failed to start are {failed_node_list}"
        elif len(failed_node_list) != 0:
            raise ClusterManagerError(
                f"Failed to start all nodes {failed_node_list}")
        else:
            status += "All node started successfully, resource start in progress."
            self.enable_stonith()

        if sync:
            timeout = timeout - const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE * len(
                node_group)
            in_expected_state = self._verify_expected_cluster_status(
                const.CLUSTER_STATUS.ONLINE, timeout)
            if in_expected_state:
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": "Cluster is online.",
                    "error": ""
                }
            else:
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "Retry suggested.",
                    "error": "Operation timed out."
                }

        return {
            "status": const.STATUSES.IN_PROGRESS.value,
            "output": "Cluster start operation performed",
            "error": ""
        }
Example #20
    def start(self, nodeid: str) -> dict:
        """
        Start node with nodeid.
        Args:
            nodeid (str): Node ID from cluster nodes.
        Returns:
            ([dict]): Return dictionary. {"status": "", "msg":""}
                status: Succeeded, Failed, InProgress
        """
        _node_status = self.nodes_status([nodeid])[nodeid]
        if _node_status == NODE_STATUSES.ONLINE.value:
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "msg": f"Node {nodeid}, is already in Online status"
            }
        elif _node_status == NODE_STATUSES.STANDBY.value or _node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
            # make node unstandby
            if self.heal_resource(nodeid):
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                    check_error=False)
                return {
                    "status":
                    const.STATUSES.IN_PROGRESS.value,
                    "msg":
                    f"Node {nodeid} : Node was in standby mode, "
                    f"Unstandby operation started successfully"
                }
            else:
                Log.error(
                    f"Node {nodeid} is in standby mode : Resource failcount found on the node, "
                    f"cleanup not worked after 2 retries")
                return {
                    "status":
                    const.STATUSES.FAILED.value,
                    "msg":
                    f"Node {nodeid} is in standby mode: Resource "
                    f"failcount found on the node cleanup not worked after 2 retries"
                }
        elif _node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_START.replace("<node>", nodeid),
                check_error=False)
            if _rc != 0:
                raise ClusterManagerError(f"Failed to start node {nodeid}")

            Log.info(f"Node: {nodeid} started successfully. Now, waiting for "
                     "cluster to stabilize and then get the node status")

            time.sleep(const.BASE_WAIT_TIME * 2)

            # Get the status of the node again
            _node_status = self.nodes_status([nodeid])[nodeid]

            # If the node is in standby mode, unstandby here
            if _node_status == NODE_STATUSES.STANDBY.value:
                Log.warn(f'Node: {nodeid} is still in standby mode')
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                    check_error=False)
                if _rc != 0:
                    raise ClusterManagerError(
                        f"Failed to unstandby the node: {nodeid}")
                return {
                    "status":
                    const.STATUSES.IN_PROGRESS.value,
                    "msg":
                    f"Node {nodeid}: Node was in offline and then switched to standby mode, "
                    f"Cluster started on node successfully"
                }

            return {
                "status":
                const.STATUSES.IN_PROGRESS.value,
                "msg":
                f"Node {nodeid} : Node was in cluster_offline mode, "
                f"Cluster started on node successfully"
            }

        elif _node_status == NODE_STATUSES.POWEROFF.value:
            # start node not in scope of VM
            Log.error("Operation not available for node type VM")
            raise ClusterManagerError(
                f"Node {nodeid} : Node was in poweroff mode, "
                "Node start : Operation not available for VM")
        else:
            Log.error(
                f"{nodeid} status is {_node_status}, node may not be started.")
            raise ClusterManagerError(
                f"Failed to start {nodeid} as found unhandled status {_node_status}"
            )
Example #21
    def stop(self) -> dict:
        """
        Stop cluster and all services. This is a blocking call.

        Returns:
            ([dict]): Return dictionary. {"status": "", "msg":""}
                status: Succeeded, Failed, InProgress
        """
        status: str = ""
        if not self._is_pcs_cluster_running():
            raise ClusterManagerError(
                "Cluster not running on current node."
                "To stop cluster, It should be running on current node.")
        node_group: list = self._get_node_group()
        local_node: str = ConfigManager.get_local_node()
        Log.info(
            f"Node group for cluster start {node_group}, local node {local_node}"
        )
        self_group: list = list(
            filter(lambda group: (local_node in group), node_group))[0]
        node_group.remove(self_group)
        offline_nodes = self._get_filtered_nodes(
            [NODE_STATUSES.POWEROFF.value])
        # Stop cluster for other group
        for node_subgroup in node_group:
            for nodeid in node_subgroup:
                # Offline node can not be started without stonith.
                if nodeid not in offline_nodes:
                    if self.heal_resource(nodeid):
                        time.sleep(const.BASE_WAIT_TIME)
                    res = json.loads(
                        self._controllers[const.NODE_CONTROLLER].stop(nodeid))
                    Log.info(f"Stopping node {nodeid}, output {res}")
                    if NODE_STATUSES.POWEROFF.value in res.get("msg"):
                        offline_nodes.append(nodeid)
                        Log.warn(
                            f"Node {nodeid}, is in offline or lost from network."
                        )
                    elif res.get("status") == const.STATUSES.FAILED.value:
                        raise ClusterManagerError(
                            f"Cluster Stop failed. Unable to stop {nodeid}")
                    else:
                        Log.info(f"Node {nodeid} stop is in progress.")
                else:
                    Log.info(
                        f"Node {nodeid}, is in offline or lost from network.")
            # Wait till resources get stopped.
            Log.info(f"Waiting for {node_subgroup} stop to complete.")
        # Stop self group of cluster
        try:
            Log.info(
                f"Please Wait, trying to stop self node group: {self_group}")
            timeout = const.NODE_STOP_TIMEOUT * len(self_group)
            self._execute.run_cmd(
                const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
            Log.info("Cluster stop completed.")
        except Exception as e:
            raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
        status = "Cluster stop is in progress."
        if len(offline_nodes) != 0:
            status += f" Warning, Found {offline_nodes}, may be poweroff or not in network"
        return {"status": const.STATUSES.IN_PROGRESS.value, "msg": status}
Example #22
    def start(self, node_id: str, **op_kwargs) -> dict:
        """
        Start node with the node_id.
        Args:
            node_id (str): Node ID from cluster nodes.
        Returns:
            ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
                status: Succeeded, Failed, InProgress
        """
        try:
            # Get the node_name (pvtfqdn) from node_id
            node_name = ConfigManager.get_node_name(node_id=node_id)
            self._is_node_in_cluster(node_id=node_name)
            node_status = self.nodes_status([node_name])[node_name]
            Log.debug(f"Node {node_name} cluster status is {node_status}")
            node_health = self._system_health.get_node_status(
                node_id=node_id).get("status")
            Log.debug(f"Node {node_name} health is {node_health}")
            if node_status == NODE_STATUSES.ONLINE.value and node_health == HEALTH_STATUSES.ONLINE.value:
                Log.debug(f"Node {node_name} is already online")
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": NODE_STATUSES.ONLINE.value,
                    "error": ""
                }
            elif node_status == NODE_STATUSES.STANDBY.value or node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
                # Unstandby the node
                if self.heal_resource(node_name):
                    _output, _err, _rc = self._execute.run_cmd(
                        const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                        check_error=False)
                    if _rc != 0:
                        Log.error(
                            f"Failed to start node {node_name}, Error: {_err}")
                        return {
                            "status":
                            const.STATUSES.FAILED.value,
                            "output":
                            "",
                            "error":
                            f"Failed to start node {node_id}, Error: {_err}"
                        }
                    Log.debug(
                        f"Node {node_name} was in standby mode, unstandby operation started successfully"
                    )
                else:
                    Log.error(
                        f"Node {node_name} is in standby mode : Resource failcount found on the node, cleanup did not work"
                    )
                    return {
                        "status":
                        const.STATUSES.FAILED.value,
                        "output":
                        "",
                        "error":
                        f"Node {node_id} is in standby mode, resource failcount found on the node, cleanup did not work"
                    }
            elif node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_START.replace("<node>", node_name),
                    check_error=False)
                if _rc != 0:
                    Log.error(
                        f"Failed to start node {node_name}, Error: {_err}")
                    return {
                        "status": const.STATUSES.FAILED.value,
                        "output": "",
                        "error":
                        f"Failed to start node {node_id}, Error: {_err}"
                    }
                Log.debug(
                    f"Node {node_name} started successfully. Waiting for cluster to stabalize and then get the node status"
                )
                time.sleep(const.BASE_WAIT_TIME * 2)
                # Get the status of the node again
                node_status = self.nodes_status([node_name])[node_name]
                # If the node is in standby mode, unstandby here
                if node_status == NODE_STATUSES.STANDBY.value:
                    Log.warn(f'Node {node_name} is still in standby mode')
                    _output, _err, _rc = self._execute.run_cmd(
                        const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                        check_error=False)
                    if _rc != 0:
                        Log.error(
                            f"Failed to start node {node_name}, Error: {_err}")
                        return {
                            "status":
                            const.STATUSES.FAILED.value,
                            "output":
                            "",
                            "error":
                            f"Failed to start node {node_id}, Error: {_err}"
                        }
            else:
                Log.error(
                    f"{node_name} status is {node_status}, node cannot be started."
                )
                return {
                    "status":
                    const.STATUSES.FAILED.value,
                    "output":
                    "",
                    "error":
                    f"Node {node_id} status is {node_status}, node cannot be started."
                }

            # TODO: Update the storage enclosure status in system health.
            # Update the node status in system health
            self._update_health(const.COMPONENTS.NODE.value, node_id,
                                HEALTH_EVENTS.FAULT_RESOLVED.value)
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": NODE_STATUSES.ONLINE.value,
                "error": ""
            }
        except Exception as e:
            Log.error(f"Failed to start node {node_id}")
            raise ClusterManagerError(
                f"Failed to start node {node_id}, Error {e}")
Example #23
    def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
        """
        Stop (poweroff) node with node_id.
        Args:
            node_id (str): Node ID from cluster nodes.
        Returns:
            ([dict]): Return dictionary. {"status": "", "msg":""}
                status: Succeeded, Failed, InProgress
        """
        poweroff = op_kwargs.get("poweroff") if op_kwargs.get(
            "poweroff") is not None else False
        storageoff = op_kwargs.get("storageoff") if op_kwargs.get(
            "storageoff") is not None else False
        # Get the node_name (pvtfqdn) from node_id and raise exception if node_id is not valid
        node_name = ConfigManager.get_node_name(node_id=node_id)
        try:
            stop_status = json.loads(super().stop(node_id, **op_kwargs))
            if stop_status is not None:
                if stop_status["status"] == const.STATUSES.SUCCEEDED.value:
                    # Node is already in offline state.
                    return stop_status
                elif stop_status["status"] == const.STATUSES.FAILED.value:
                    # Node is in failed state.
                    return stop_status

            if storageoff:
                # Stop services on node except sspl-ll
                self._controllers[const.SERVICE_CONTROLLER].stop(
                    node_id=node_name,
                    excludeResourceList=[RESOURCE.SSPL_LL.value])

                # Stop the storage enclosure on the node
                actuator_mgr = ActuatorManager()
                actuator_mgr.enclosure_stop(node_name)
                Log.info(f"Enclosure stopped for {node_name}")
                # TODO: Update enclosure health

                # Put node in standby mode
                self._execute.run_cmd(
                    const.PCS_NODE_STANDBY.replace("<node>", node_name),
                    f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
                Log.info(f"Executed node standby for node {node_id}")
                self._controllers[const.SERVICE_CONTROLLER].clear_resources(
                    node_id=node_name)
            else:
                self._execute.run_cmd(
                    const.PCS_NODE_STANDBY.replace("<node>", node_name),
                    f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
                Log.info(f"Executed node standby for node {node_id}")
            status = f"For node {node_id}, Standby is in progress"

            # Update node health
            # TODO : Health event update to be removed once fault_tolerance branch is merged
            initial_event = self._system_health.get_health_event_template(
                nodeid=node_id, event_type=HEALTH_EVENTS.FAULT.value)
            Log.debug(
                f"Node health : {initial_event} updated for node {node_id}")
            health_event = HealthEvent.dict_to_object(initial_event)
            self._system_health.process_event(health_event)

            # Node power off
            if poweroff:
                self._execute.run_cmd(
                    const.DISABLE_STONITH.replace("<node>", node_name))
                self.fencing_agent.power_off(node_id=node_name)
                status = f"Power off for node {node_id} is in progress"
            Log.info(f"Node power off successfull. status : {status}")
            # TODO : return status should be changed according to passed parameters
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "error": "",
                "output": status
            }
        except Exception as e:
            raise ClusterManagerError(
                f"Failed to stop node {node_id}, Error: {e}")
Example #24
    def stop(self, sync=False, timeout=30) -> dict:
        """
        Stop cluster and all services.

        Args:
            sync (bool, optional): if sync is True then stop will check the status for timeout seconds.
            timeout (int, optional): timeout(in seconds) can be specified for sync=True otherwise ignored.

        Returns:
            ([dict]): Return dictionary. {"status": "", "output":"", "error":""}
                status: Succeeded, Failed, InProgress
        """
        status: str = ""
        if not self._is_pcs_cluster_running():
            raise ClusterManagerError(
                "Cluster not running on current node."
                "To stop cluster, It should be running on current node.")
        node_group: list = self._get_node_group()
        local_node: str = ConfigManager.get_local_node()
        Log.info(
            f"Node group for cluster start {node_group}, local node {local_node}"
        )
        self_group: list = list(
            filter(lambda group: (local_node in group), node_group))[0]
        node_group.remove(self_group)
        offline_nodes = self._get_filtered_nodes(
            [NODE_STATUSES.POWEROFF.value])
        # Stop cluster for other group
        for node_subgroup in node_group:
            for node_name in node_subgroup:
                # Offline node can not be started without stonith.
                if node_name not in offline_nodes:
                    if self.heal_resource(node_name):
                        time.sleep(const.BASE_WAIT_TIME)
                    node_id = ConfigManager.get_node_id(node_name)
                    res = json.loads(
                        self._controllers[const.NODE_CONTROLLER].stop(node_id))
                    Log.info(f"Stopping node {node_id}, output {res}")
                    if NODE_STATUSES.POWEROFF.value in res.get("output"):
                        offline_nodes.append(node_id)
                        Log.warn(
                            f"Node {node_id}, is in offline or lost from network."
                        )
                    elif res.get("status") == const.STATUSES.FAILED.value:
                        raise ClusterManagerError(
                            f"Cluster Stop failed. Unable to stop {node_id}")
                    else:
                        Log.info(f"Node {node_id} stop is in progress.")
                else:
                    Log.info(
                        f"Node {node_name}, is in offline or lost from network."
                    )
            # Wait till resources get stopped.
            Log.info(f"Waiting for {node_subgroup} stop to complete.")
        # Stop self group of cluster
        try:
            Log.info(
                f"Please Wait, trying to stop self node group: {self_group}")
            timeout = const.NODE_STOP_TIMEOUT * len(self_group)
            self._execute.run_cmd(
                const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
            Log.info("Cluster stop completed.")
        except Exception as e:
            raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
        status = "Cluster stop is in progress."
        if len(offline_nodes) != 0:
            status += f" Warning, Found {offline_nodes}, may be poweroff or not in network"

        if sync:
            timeout = timeout - const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE * len(
                node_group)
            in_expected_state = self._verify_expected_cluster_status(
                const.CLUSTER_STATUS.OFFLINE, timeout)
            if in_expected_state:
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": "Cluster is offline.",
                    "error": ""
                }
            else:
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "Retry suggested.",
                    "error": "Operation timed out."
                }

        return {
            "status": const.STATUSES.IN_PROGRESS.value,
            "output": status,
            "error": ""
        }
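
A hedged usage sketch of the sync behaviour above (the manager object name is an assumption; per the docstring, timeout only applies when sync=True):

    result = cluster_manager.stop(sync=True, timeout=60)
    if result["status"] == const.STATUSES.FAILED.value:
        # Operation timed out; the method suggests a retry.
        Log.warn(result["error"])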