def stop(self, nodeid: str, timeout: int = -1) -> dict:
    """
    Stop cluster on node with nodeid.

    Args:
        nodeid (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg": ""}
            status: Succeeded, Failed, InProgress
    """
    timeout = const.NODE_STOP_TIMEOUT if timeout < 0 else timeout
    node_status = self.nodes_status([nodeid]).get(nodeid)
    if node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
        Log.info(f"Stop requested for {nodeid}, node is already in offline state.")
        status = f"Node {nodeid} is already in offline state."
    elif node_status == NODE_STATUSES.POWEROFF.value:
        raise ClusterManagerError(f"Failed to stop {nodeid}, "
                                  f"node is in {node_status} state.")
    else:
        if self.heal_resource(nodeid):
            time.sleep(const.BASE_WAIT_TIME)
        try:
            Log.info(f"Please wait, trying to stop node: {nodeid}")
            self._execute.run_cmd(
                const.PCS_STOP_NODE.replace("<node>", nodeid).replace(
                    "<seconds>", str(timeout)))
            Log.info(f"Executed node stop for {nodeid}, waiting for resources to stop.")
            time.sleep(const.BASE_WAIT_TIME)
            status = f"Stop for {nodeid} is in progress, waiting for resources to stop."
        except Exception as e:
            raise ClusterManagerError(f"Failed to stop {nodeid}, Error: {e}")
    return {"status": const.STATUSES.IN_PROGRESS.value, "msg": status}
def stop(self, node_id: str, excludeResourceList: list = None) -> dict:
    """
    Stop service.

    Args:
        node_id (str): Private fqdn defined in conf store.
        excludeResourceList (list): Resources which should not be stopped.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg": ""}
            status: Succeeded, Failed, InProgress
    """
    try:
        resources: list = []
        if excludeResourceList is None:
            excludeResourceList = []
        output, _, _ = self._execute.run_cmd(const.LIST_PCS_RESOURCES, check_error=False)
        if "NO resources".lower() not in output.lower():
            for resource in output.split("\n"):
                res = resource.split(":")[0]
                if res != "" and res not in resources and res not in excludeResourceList:
                    resources.append(res)
        for resource in resources:
            self._execute.run_cmd(
                const.PCS_BAN_RESOURCES.replace("<resource_id>", resource).replace("<node>", node_id))
        Log.info(f"Waiting for resources to stop on node {node_id}")
        time.sleep(const.BASE_WAIT_TIME)
        # TODO: Check if the resources are stopped EOS-23386
        return {"status": const.STATUSES.IN_PROGRESS.value,
                "msg": f"Resource stop on node {node_id} is in progress"}
    except Exception as e:
        raise ClusterManagerError(f"Failed to stop resources on {node_id}, Error: {e}")
def get_system_health(
        self, element: CLUSTER_ELEMENTS = CLUSTER_ELEMENTS.CLUSTER.value,
        depth: int = 1, **kwargs) -> json:
    """
    Return health status for the requested elements.

    Args:
        element ([CLUSTER_ELEMENTS]): The element whose health status is to be returned.
        depth ([int]): The depth of elements, starting from the input "element",
            for which the health status is to be returned.
        **kwargs([dict]): Variable number of arguments that are used as filters,
            e.g. "id" of the input "element".

    Returns:
        ([dict]): Return dictionary. {"status": "Succeeded"/"Failed"/"Partial",
                                      "output": "", "error": ""}
            status: Succeeded, Failed, Partial
            output: Dictionary with element health status
            error: Error information if the request "Failed"
    """
    try:
        # Fetch the health status
        system_health_controller = SystemHealthController(self._confstore)
        return system_health_controller.get_status(
            component=element, depth=depth, version=self._version, **kwargs)
    except Exception as e:
        Log.error(f"Failed returning system health. Error: {e}")
        raise ClusterManagerError("Failed returning system health, internal error")
def nodes_status(self, nodeids: list = None) -> dict:
    """
    Get pcs status of nodes.

    Args:
        nodeids (list): List of Node IDs from cluster nodes.
            By default provides the status of all nodes.
            If 'local' then provide local node status.

    Returns:
        ([dict]): Return dictionary. {"node_id1": "status of node_id1",
                                      "node_id2": "status of node_id2"...}
    """
    nodeids = self._get_node_list() if nodeids is None or len(nodeids) == 0 else nodeids
    all_nodes_status = dict()
    _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES, check_error=False)
    if not isinstance(nodeids, list):
        raise ClusterManagerError(f"Invalid nodeids type `{type(nodeids)}`, required `list`")
    for nodeid in nodeids:
        if nodeid in _output:
            for status in _output.split("\n"):
                nodes = status.split(":")
                if len(nodes) > 1 and nodeid.lower() in nodes[1].strip().lower():
                    if nodes[0].strip().lower() == NODE_STATUSES.STANDBY.value:
                        all_nodes_status[nodeid] = NODE_STATUSES.STANDBY.value
                    elif nodes[0].strip().lower() == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
                        all_nodes_status[nodeid] = NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value
                    elif nodes[0].strip().lower() == NODE_STATUSES.MAINTENANCE.value:
                        all_nodes_status[nodeid] = NODE_STATUSES.MAINTENANCE.value
                    elif nodes[0].strip().lower() == NODE_STATUSES.CLUSTER_OFFLINE.value:
                        all_nodes_status[nodeid] = NODE_STATUSES.CLUSTER_OFFLINE.value
                    elif nodes[0].strip().lower() == NODE_STATUSES.ONLINE.value:
                        all_nodes_status[nodeid] = NODE_STATUSES.ONLINE.value
                    break
                else:
                    all_nodes_status[nodeid] = NODE_STATUSES.UNKNOWN.value
        else:
            raise HAInvalidNode(f"Node {nodeid} is not a part of cluster")
    for node in all_nodes_status.keys():
        status = all_nodes_status[node]
        if status == NODE_STATUSES.CLUSTER_OFFLINE.value:
            _output, _err, _rc = self._execute.run_cmd(f"ping -c 1 {node}", check_error=False)
            if _rc != 0:
                all_nodes_status[node] = NODE_STATUSES.POWEROFF.value
    return all_nodes_status
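# Illustration only: a minimal, standalone sketch of the parsing performed by
# nodes_status() above. It maps "pcs status nodes"-style output (lines such as
# "Online: node-1 node-2" or "Standby: node-3") onto a per-node status dict.
# The helper name and the plain lowercase status strings are assumptions for
# this sketch; the real method uses NODE_STATUSES enum values plus ping checks.
def _parse_pcs_node_status_sketch(pcs_output: str, nodeids: list) -> dict:
    statuses = {}
    for line in pcs_output.split("\n"):
        parts = line.split(":")
        if len(parts) < 2:
            continue
        # Left of ":" is the state label, right of ":" is the node list.
        state = parts[0].strip().lower()
        for nodeid in nodeids:
            if nodeid.lower() in parts[1].strip().lower():
                statuses[nodeid] = state
    return statuses

# Example: _parse_pcs_node_status_sketch("Online: node-1\nStandby: node-2",
#                                         ["node-1", "node-2"])
# returns {"node-1": "online", "node-2": "standby"}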
def create_cluster(self, name: str, user: str, secret: str, nodeid: str) -> dict:
    """
    Create cluster if not created.

    Args:
        name (str): Cluster name.
        user (str): Cluster user.
        secret (str): Cluster password.
        nodeid (str): Node name, nodeid of current node.

    Returns:
        dict: Return dictionary. {"status": "", "output": "", "error": ""}
    """
    try:
        self._check_non_empty(name=name, user=user, secret=secret, nodeid=nodeid)
        if not self._is_pcs_cluster_running():
            self._auth_node(nodeid, user, secret)
            self._execute.run_cmd(
                const.PCS_SETUP_CLUSTER.replace("<cluster_name>", name).replace("<node>", nodeid))
            Log.info("Pacemaker cluster created, waiting to start node.")
            self._execute.run_cmd(const.PCS_CLUSTER_START_NODE)
            self._execute.run_cmd(const.PCS_CLUSTER_ENABLE)
            Log.info("Node started and enabled successfully.")
            time.sleep(const.BASE_WAIT_TIME * 2)
        if self._is_pcs_cluster_running():
            if self.wait_for_node_online(nodeid):
                # TODO: Divide class into vm, hw when stonith is needed.
                self._execute.run_cmd(const.PCS_STONITH_DISABLE)
                return {
                    "status": const.STATUSES.SUCCEEDED.value,
                    "output": "Cluster created successfully.",
                    "error": ""
                }
            else:
                raise ClusterManagerError("Node is not online.")
        else:
            raise ClusterManagerError("Cluster is not started.")
    except Exception as e:
        raise ClusterManagerError(f"Failed to create cluster. Error: {e}")
def _get_cluster_size(self):
    """
    Return the number of nodes registered with pcsd.
    """
    try:
        _output, _err, _rc = self._execute.run_cmd(const.PCS_CLUSTER_PCSD_STATUS)
        return len(_output.split("\n"))
    except Exception as e:
        raise ClusterManagerError(f"Unable to get cluster size, reason: {e}")
def _get_node_group(self) -> list:
    """
    Get node_group.
    """
    if self._is_pcs_cluster_running() is False:
        raise ClusterManagerError("Cluster is not running on current node.")
    res = json.loads(self.node_list())
    if res.get("status") != const.STATUSES.SUCCEEDED.value:
        raise ClusterManagerError("Failed to get node list.")
    else:
        node_list: list = res.get("output")
        Log.info(f"Node List : {node_list}")
        if node_list is not None:
            node_group: list = [
                node_list[i:i + const.PCS_NODE_GROUP_SIZE]
                for i in range(0, len(node_list), const.PCS_NODE_GROUP_SIZE)
            ]
            return node_group
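# Illustration only: the sub-grouping above simply chunks the node list into
# consecutive batches of const.PCS_NODE_GROUP_SIZE nodes. A standalone sketch
# of the same idea, with the helper name and a group size of 2 assumed for
# this example:
def _chunk_nodes_sketch(node_list: list, group_size: int = 2) -> list:
    """Split node_list into consecutive sub-lists of at most group_size nodes."""
    return [node_list[i:i + group_size] for i in range(0, len(node_list), group_size)]

# Example: _chunk_nodes_sketch(["srvnode-1", "srvnode-2", "srvnode-3"])
# returns [["srvnode-1", "srvnode-2"], ["srvnode-3"]]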
def load_json_file(json_file):
    """
    Load json file to read node & cluster details to auth node.

    :param json_file: path of the json file to load
    """
    try:
        with open(json_file) as f:
            return json.load(f)
    except Exception as e:
        raise ClusterManagerError(f"Error in reading desc_file, reason : {e}")
def _check_non_empty(self, **kwargs):
    """
    Check if params are not empty.

    Raises:
        ClusterManagerError: If any of the parameters is None or empty.
    """
    for key in kwargs.keys():
        if kwargs[key] is None or kwargs[key] == "":
            raise ClusterManagerError(f"Failed: Invalid parameter, {key} cannot be empty.")
def start(self) -> dict:
    """
    Start cluster and all services.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg": ""}
            status: Succeeded, Failed, InProgress
    """
    if self._is_pcs_cluster_running() is False:
        res = self._pcs_cluster_start()
        time.sleep(const.BASE_WAIT_TIME)
        if res != const.STATUSES.SUCCEEDED.value:
            raise ClusterManagerError("Cluster start operation failed")
    status = ""
    failed_node_list: list = []
    try:
        node_group: list = self._get_node_group()
        for node_subgroup in node_group:
            for node_id in node_subgroup:
                res = json.loads(self._controllers[const.NODE_CONTROLLER].start(node_id))
                Log.info(f'res: {res}')
                if res.get("status") == const.STATUSES.FAILED.value:
                    msg = res.get("msg")
                    Log.error(f"Node {node_id} : {msg}")
                    failed_node_list.append(node_id)
            # Wait till all the resources get started in the sub group
            time.sleep(const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE)
    except Exception as e:
        raise ClusterManagerError(f"Failed to start Cluster. Error: {e}")
    status = "Cluster start is in progress."
    if len(failed_node_list) != 0 and len(failed_node_list) != len(
            json.loads(self.node_list()).get("msg")):
        status += f" Warning, some nodes failed to start: {failed_node_list}"
    elif len(failed_node_list) != 0:
        raise ClusterManagerError(f"Failed to start all nodes {failed_node_list}")
    else:
        status += " All nodes started successfully, resource start in progress."
    return {
        "status": const.STATUSES.IN_PROGRESS.value,
        "msg": "Cluster start operation performed"
    }
def _create_req(self, target_node_name: str) -> str:
    """
    Create actuator request for enclosure stop.

    Args:
        target_node_name : Node on which enclosure is to be stopped.

    Return:
        Created actuator request.
    """
    # The context manager closes the schema file; no explicit close() is needed.
    with open(ACTUATOR_SCHEMA, 'r') as actuator_req_schema_file:
        actuator_req = json.load(actuator_req_schema_file)
    # Get details for current node
    node_name = Conf.get(const.HA_GLOBAL_INDEX, f"CLUSTER_MANAGER{_DELIM}local_node")
    key_val = self._conf_store.get(f"{PVTFQDN_TO_NODEID_KEY}/{node_name}")
    _, node_id = key_val.popitem()
    system_health = SystemHealth(self._conf_store)
    health_manager = SystemHealthManager(self._conf_store)
    key = system_health._prepare_key(const.COMPONENTS.NODE_MAP.value, node_id=node_id)
    node_map_val = health_manager.get_key(key)
    if node_map_val is None:
        raise ClusterManagerError("Failed to fetch node_map value")
    node_map_dict = ast.literal_eval(node_map_val)
    self._generate_uuid()
    # Populate the actuator request schema
    actuator_req[ACTUATOR_ATTRIBUTES.TIME] = str(int(time.time()))
    actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.REQUEST_PATH][
        ACTUATOR_ATTRIBUTES.SITE_ID] = node_map_dict[NODE_MAP_ATTRIBUTES.SITE_ID.value]
    actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.REQUEST_PATH][
        ACTUATOR_ATTRIBUTES.RACK_ID] = node_map_dict[NODE_MAP_ATTRIBUTES.RACK_ID.value]
    actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.REQUEST_PATH][
        ACTUATOR_ATTRIBUTES.NODE_ID] = node_id
    actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][
        ACTUATOR_ATTRIBUTES.TARGET_NODE_ID] = target_node_name
    actuator_req[ACTUATOR_ATTRIBUTES.MESSAGE][ACTUATOR_ATTRIBUTES.HEADER][
        ACTUATOR_ATTRIBUTES.UUID] = self._uuid
    req = json.dumps(actuator_req)
    return req
def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
    """
    Stop node with node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    check_cluster = op_kwargs.get("check_cluster") if op_kwargs.get("check_cluster") is not None else True
    # Get the node_name (pvtfqdn) from node_id and raise exception if node_id is not valid
    node_name = ConfigManager.get_node_name(node_id=node_id)
    try:
        timeout = const.NODE_STOP_TIMEOUT if timeout < 0 else timeout
        node_status = self._system_health.get_node_status(node_id=node_id).get("status")
        if node_status == HEALTH_STATUSES.OFFLINE.value:
            Log.info(f"Stop requested for node id {node_id}, node is already in offline state.")
            status = f"Node with node id {node_id} is already in offline state."
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": status,
                "error": ""
            }
        elif node_status == HEALTH_STATUSES.FAILED.value:
            # In case of VM, if the node is powered off or disconnected, system health is updated with status FAILED.
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "",
                "error": f"Node {node_id} status is {node_status}, node cannot be stopped."
            }
        else:
            if self.heal_resource(node_name):
                time.sleep(const.BASE_WAIT_TIME)
            if check_cluster:
                # Check whether the cluster would go offline if the node with node_name is stopped.
                res = json.loads(self.check_cluster_feasibility(node_id=node_id))
                if res.get("status") == const.STATUSES.FAILED.value:
                    return res
    except Exception as e:
        raise ClusterManagerError(f"Failed to stop node {node_id}, Error: {e}")
def _get_node_list(self) -> list:
    """
    Return list of nodes.
    """
    # TODO: This is a temporary implementation and should be removed once the node list is available in the system health.
    nodelist = []
    _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES, check_error=False)
    if _rc != 0:
        raise ClusterManagerError("Failed to get nodes status")
    for status in _output.split("\n"):
        nodes = status.split(":")
        if len(nodes) > 1:
            nodelist.extend(nodes[1].split())
    return nodelist
def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
    """
    Stop node with node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    # Get the node_name (pvtfqdn) from node_id and raise exception if node_id is not valid
    node_name = ConfigManager.get_node_name(node_id=node_id)
    try:
        stop_status = json.loads(super().stop(node_id, **op_kwargs))
        if stop_status is not None:
            if stop_status["status"] == const.STATUSES.SUCCEEDED.value:
                # Node is already in offline state.
                return stop_status
            elif stop_status["status"] == const.STATUSES.FAILED.value:
                # Node is in failed state.
                return stop_status
        # Put node in standby mode
        self._execute.run_cmd(
            const.PCS_NODE_STANDBY.replace("<node>", node_name),
            f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
        Log.info(f"Executed node standby for node {node_id}")
        # TODO: EOS-23859 : STOP NODE - Use PCS_STOP_NODE from const.py with timeout value
        status = f"Standby for node {node_id} is in progress"
        # Update node health
        # TODO: Health event update to be removed once fault_tolerance branch is merged
        initial_event = self._system_health.get_health_event_template(
            nodeid=node_id, event_type=HEALTH_EVENTS.FAULT.value)
        Log.debug(f"Node health : {initial_event} updated for node {node_id}")
        health_event = HealthEvent.dict_to_object(initial_event)
        self._system_health.process_event(health_event)
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "output": status,
            "error": ""
        }
    except Exception as e:
        raise ClusterManagerError(f"Failed to stop node {node_id}, Error: {e}")
def enclosure_stop(
        self, node_name: str,
        timeout: int = int(ACTUATOR_RESP_RETRY_COUNT * ACTUATOR_RESP_WAIT_TIME)) -> bool:
    """
    Send actuator request to monitor for stopping enclosure.

    Args:
        node_name : Node on which enclosure is to be stopped.

    Return:
        True : if enclosure stop is successful; else exception is raised.
    """
    req = self._create_req(node_name)
    self._register_for_resp()
    self._send_req(req)
    retry_count = int(timeout // ACTUATOR_RESP_WAIT_TIME)
    # Wait for 60 sec max. Expected max wait time = 40 sec + 20 sec buffer
    for _ in range(0, retry_count):
        time.sleep(ACTUATOR_RESP_WAIT_TIME)
        if self._is_resp_received:
            self.consumer.stop()
            break
    if self._is_resp_received and self._encl_shutdown_successful:
        Log.info(f"Enclosure shutdown successful on node {node_name}")
        self._is_resp_received = self._encl_shutdown_successful = False
        return True
    if not self._is_resp_received:
        self.timeout_reached = True
        self.consumer.stop()
        self.timeout_reached = False
        Log.error(f"Actuator response not received; enclosure shutdown failed on node {node_name}")
    else:
        Log.error(f"Unable to shutdown enclosure on node {node_name}")
    self._is_resp_received = self._encl_shutdown_successful = False
    raise ClusterManagerError("Failed to shutdown the enclosure")
def _auth_node(self, node_id, cluster_user, cluster_password):
    """
    Auth node to add.
    """
    try:
        auth_cmd = const.PCS_CLUSTER_NODE_AUTH.replace(
            "<node>", node_id).replace("<username>", cluster_user).replace(
            "<password>", cluster_password)
        self._execute.run_cmd(auth_cmd, secret=cluster_password)
        Log.info(f"Node {node_id} authenticated with {cluster_user} successfully.")
    except Exception as e:
        Log.error(f"Failed to authenticate node : {node_id} with reason : {e}")
        raise ClusterManagerError(
            f"Failed to authenticate node : {node_id}, Please check username or password")
def clear_resources(self, node_id: str):
    """
    Clear resources on node.

    Args:
        node_id (str): Private fqdn defined in conf store.
    """
    try:
        resources: list = []
        output, _, _ = self._execute.run_cmd(const.LIST_PCS_RESOURCES, check_error=False)
        if "NO resources".lower() not in output.lower():
            for resource in output.split("\n"):
                res = resource.split(":")[0]
                if res != "" and res not in resources:
                    resources.append(res)
        for resource in resources:
            self._execute.run_cmd(
                const.PCS_CLEAR_RESOURCES.replace("<resource_id>", resource).replace("<node>", node_id))
        Log.info(f"Cleared resources on node {node_id}")
    except Exception as e:
        raise ClusterManagerError(f"Failed to clear resources on {node_id}, Error: {e}")
def start(self, node_id: str, **op_kwargs) -> dict:
    """
    Start node with the node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    try:
        poweron = op_kwargs.get("poweron") if op_kwargs.get("poweron") is not None else False
        node_name = ConfigManager.get_node_name(node_id=node_id)
        self._is_node_in_cluster(node_id=node_name)
        power_status = self.fencing_agent.power_status(node_id=node_name)
        if power_status == const.SERVER_POWER_STATUS.OFF.value and poweron is False:
            Log.debug(f"Node {node_name} is powered-off and poweron was not set")
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "",
                "error": f"Node {node_id} is powered-off, use poweron option"
            }
        elif power_status == const.SERVER_POWER_STATUS.OFF.value and poweron is True:
            self.fencing_agent.power_on(node_id=node_name)
            Log.debug(f"Node {node_name} is powered-on, waiting for node boot")
            time.sleep(const.NODE_POWERON_DELAY)
        start_status = super().start(node_id, **op_kwargs)
        # TODO: Move this to the base class once stonith is also disabled for VM during stop.
        start_status_json = json.loads(start_status)
        if start_status_json["status"] == const.STATUSES.SUCCEEDED.value:
            self._execute.run_cmd(const.ENABLE_STONITH.replace("<node>", node_name))
        return start_status
    except Exception as e:
        raise ClusterManagerError(f"Failed to start {node_id}, Error: {e}")
def start(self, sync=False, timeout=120) -> dict:
    """
    Start cluster and all services.

    Args:
        sync (bool, optional): if sync is True then start will check the status for timeout seconds.
        timeout (int, optional): timeout (in seconds) can be specified for sync=True, otherwise ignored.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    # The current behavior of start brings up all the nodes and takes them out of standby.
    # This is a deviation from cluster start as envisioned, so the documentation should change.
    if self._is_pcs_cluster_running() is False:
        res = self._pcs_cluster_start()
        time.sleep(const.BASE_WAIT_TIME)
        if res != const.STATUSES.SUCCEEDED.value:
            raise ClusterManagerError("Cluster start operation failed")
    status = ""
    failed_node_list: list = []
    try:
        node_group: list = self._get_node_group()
        for node_subgroup in node_group:
            for node_name in node_subgroup:
                node_id = ConfigManager.get_node_id(node_name)
                res = json.loads(self._controllers[const.NODE_CONTROLLER].start(node_id))
                Log.info(f'res: {res}')
                if res.get("status") == const.STATUSES.FAILED.value:
                    msg = res.get("error")
                    Log.error(f"Node {node_name} : {msg}")
                    failed_node_list.append(node_name)
            # Wait till all the resources get started in the sub group
            time.sleep(const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE)
    except Exception as e:
        raise ClusterManagerError(f"Failed to start Cluster. Error: {e}")
    status = "Cluster start is in progress."
    if len(failed_node_list) != 0 and len(failed_node_list) != len(
            json.loads(self.node_list()).get("output")):
        status += f" Warning, some nodes failed to start: {failed_node_list}"
    elif len(failed_node_list) != 0:
        raise ClusterManagerError(f"Failed to start all nodes {failed_node_list}")
    else:
        status += " All nodes started successfully, resource start in progress."
    self.enable_stonith()
    if sync:
        timeout = timeout - const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE * len(node_group)
        in_expected_state = self._verify_expected_cluster_status(const.CLUSTER_STATUS.ONLINE, timeout)
        if in_expected_state:
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": "Cluster is online.",
                "error": ""
            }
        else:
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "Retry suggested.",
                "error": "Operation timed out."
            }
    return {
        "status": const.STATUSES.IN_PROGRESS.value,
        "output": "Cluster start operation performed",
        "error": ""
    }
def start(self, nodeid: str) -> dict:
    """
    Start node with nodeid.

    Args:
        nodeid (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg": ""}
            status: Succeeded, Failed, InProgress
    """
    _node_status = self.nodes_status([nodeid])[nodeid]
    if _node_status == NODE_STATUSES.ONLINE.value:
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "msg": f"Node {nodeid} is already in Online status"
        }
    elif _node_status == NODE_STATUSES.STANDBY.value or _node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
        # Unstandby the node
        if self.heal_resource(nodeid):
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                check_error=False)
            return {
                "status": const.STATUSES.IN_PROGRESS.value,
                "msg": f"Node {nodeid} : Node was in standby mode, "
                       f"unstandby operation started successfully"
            }
        else:
            Log.error(f"Node {nodeid} is in standby mode : Resource failcount found on the node, "
                      f"cleanup did not work after 2 retries")
            return {
                "status": const.STATUSES.FAILED.value,
                "msg": f"Node {nodeid} is in standby mode: Resource "
                       f"failcount found on the node, cleanup did not work after 2 retries"
            }
    elif _node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
        _output, _err, _rc = self._execute.run_cmd(
            const.PCS_NODE_START.replace("<node>", nodeid),
            check_error=False)
        if _rc != 0:
            raise ClusterManagerError(f"Failed to start node {nodeid}")
        Log.info(f"Node: {nodeid} started successfully. Now waiting for the "
                 f"cluster to stabilize and then get the node status")
        time.sleep(const.BASE_WAIT_TIME * 2)
        # Get the status of the node again
        _node_status = self.nodes_status([nodeid])[nodeid]
        # If the node is in standby mode, unstandby here
        if _node_status == NODE_STATUSES.STANDBY.value:
            Log.warn(f'Node: {nodeid} is still in standby mode')
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                check_error=False)
            if _rc != 0:
                raise ClusterManagerError(f"Failed to unstandby the node: {nodeid}")
            return {
                "status": const.STATUSES.IN_PROGRESS.value,
                "msg": f"Node {nodeid}: Node was offline and then switched to standby mode, "
                       f"Cluster started on node successfully"
            }
        return {
            "status": const.STATUSES.IN_PROGRESS.value,
            "msg": f"Node {nodeid} : Node was in cluster_offline mode, "
                   f"Cluster started on node successfully"
        }
    elif _node_status == NODE_STATUSES.POWEROFF.value:
        # Start node is not in scope for VM
        Log.error("Operation not available for node type VM")
        raise ClusterManagerError(
            f"Node {nodeid} : Node was in poweroff mode, "
            "Node start : Operation not available for VM")
    else:
        Log.error(f"{nodeid} status is {_node_status}, node may not be started.")
        raise ClusterManagerError(
            f"Failed to start {nodeid} as found unhandled status {_node_status}")
def stop(self) -> dict:
    """
    Stop cluster and all services. This is a blocking call.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg": ""}
            status: Succeeded, Failed, InProgress
    """
    status: str = ""
    if not self._is_pcs_cluster_running():
        raise ClusterManagerError(
            "Cluster not running on current node. "
            "To stop cluster, it should be running on the current node.")
    node_group: list = self._get_node_group()
    local_node: str = ConfigManager.get_local_node()
    Log.info(f"Node group for cluster stop {node_group}, local node {local_node}")
    self_group: list = list(filter(lambda group: (local_node in group), node_group))[0]
    node_group.remove(self_group)
    offline_nodes = self._get_filtered_nodes([NODE_STATUSES.POWEROFF.value])
    # Stop cluster for other groups
    for node_subgroup in node_group:
        for nodeid in node_subgroup:
            # Offline node can not be started without stonith.
            if nodeid not in offline_nodes:
                if self.heal_resource(nodeid):
                    time.sleep(const.BASE_WAIT_TIME)
                res = json.loads(self._controllers[const.NODE_CONTROLLER].stop(nodeid))
                Log.info(f"Stopping node {nodeid}, output {res}")
                if NODE_STATUSES.POWEROFF.value in res.get("msg"):
                    offline_nodes.append(nodeid)
                    Log.warn(f"Node {nodeid} is offline or lost from network.")
                elif res.get("status") == const.STATUSES.FAILED.value:
                    raise ClusterManagerError(f"Cluster Stop failed. Unable to stop {nodeid}")
                else:
                    Log.info(f"Node {nodeid} stop is in progress.")
            else:
                Log.info(f"Node {nodeid} is offline or lost from network.")
        # Wait till the resources get stopped.
        Log.info(f"Waiting, stop of {node_subgroup} is in progress.")
    # Stop self group of cluster
    try:
        Log.info(f"Please wait, trying to stop self node group: {self_group}")
        timeout = const.NODE_STOP_TIMEOUT * len(self_group)
        self._execute.run_cmd(const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
        Log.info("Cluster stop completed.")
    except Exception as e:
        raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
    status = "Cluster stop is in progress."
    if len(offline_nodes) != 0:
        status += f" Warning, found {offline_nodes}, which may be powered off or not in network."
    return {"status": const.STATUSES.IN_PROGRESS.value, "msg": status}
def start(self, node_id: str, **op_kwargs) -> dict:
    """
    Start node with the node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    try:
        # Get the node_name (pvtfqdn) from node_id
        node_name = ConfigManager.get_node_name(node_id=node_id)
        self._is_node_in_cluster(node_id=node_name)
        node_status = self.nodes_status([node_name])[node_name]
        Log.debug(f"Node {node_name} cluster status is {node_status}")
        node_health = self._system_health.get_node_status(node_id=node_id).get("status")
        Log.debug(f"Node {node_name} health is {node_health}")
        if node_status == NODE_STATUSES.ONLINE.value and node_health == HEALTH_STATUSES.ONLINE.value:
            Log.debug(f"Node {node_name} is already online")
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": NODE_STATUSES.ONLINE.value,
                "error": ""
            }
        elif node_status == NODE_STATUSES.STANDBY.value or node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
            # Unstandby the node
            if self.heal_resource(node_name):
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                    check_error=False)
                if _rc != 0:
                    Log.error(f"Failed to start node {node_name}, Error: {_err}")
                    return {
                        "status": const.STATUSES.FAILED.value,
                        "output": "",
                        "error": f"Failed to start node {node_id}, Error: {_err}"
                    }
                Log.debug(f"Node {node_name} was in standby mode, unstandby operation started successfully")
            else:
                Log.error(f"Node {node_name} is in standby mode : Resource failcount found on the node, cleanup did not work")
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "",
                    "error": f"Node {node_id} is in standby mode, resource failcount found on the node, cleanup did not work"
                }
        elif node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_START.replace("<node>", node_name),
                check_error=False)
            if _rc != 0:
                Log.error(f"Failed to start node {node_name}, Error: {_err}")
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "",
                    "error": f"Failed to start node {node_id}, Error: {_err}"
                }
            Log.debug(f"Node {node_name} started successfully. Waiting for the cluster to stabilize and then get the node status")
            time.sleep(const.BASE_WAIT_TIME * 2)
            # Get the status of the node again
            node_status = self.nodes_status([node_name])[node_name]
            # If the node is in standby mode, unstandby here
            if node_status == NODE_STATUSES.STANDBY.value:
                Log.warn(f'Node {node_name} is still in standby mode')
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                    check_error=False)
                if _rc != 0:
                    Log.error(f"Failed to start node {node_name}, Error: {_err}")
                    return {
                        "status": const.STATUSES.FAILED.value,
                        "output": "",
                        "error": f"Failed to start node {node_id}, Error: {_err}"
                    }
        else:
            Log.error(f"{node_name} status is {node_status}, node cannot be started.")
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "",
                "error": f"Node {node_id} status is {node_status}, node cannot be started."
            }
        # TODO: Update the storage enclosure status in system health.
        # Update the node status in system health
        self._update_health(const.COMPONENTS.NODE.value, node_id, HEALTH_EVENTS.FAULT_RESOLVED.value)
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "output": NODE_STATUSES.ONLINE.value,
            "error": ""
        }
    except Exception as e:
        Log.error(f"Failed to start node {node_id}")
        raise ClusterManagerError(f"Failed to start node {node_id}, Error {e}")
def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
    """
    Stop (poweroff) node with node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    poweroff = op_kwargs.get("poweroff") if op_kwargs.get("poweroff") is not None else False
    storageoff = op_kwargs.get("storageoff") if op_kwargs.get("storageoff") is not None else False
    # Get the node_name (pvtfqdn) from node_id and raise exception if node_id is not valid
    node_name = ConfigManager.get_node_name(node_id=node_id)
    try:
        stop_status = json.loads(super().stop(node_id, **op_kwargs))
        if stop_status is not None:
            if stop_status["status"] == const.STATUSES.SUCCEEDED.value:
                # Node is already in offline state.
                return stop_status
            elif stop_status["status"] == const.STATUSES.FAILED.value:
                # Node is in failed state.
                return stop_status
        if storageoff:
            # Stop services on node except sspl-ll
            self._controllers[const.SERVICE_CONTROLLER].stop(
                node_id=node_name, excludeResourceList=[RESOURCE.SSPL_LL.value])
            # Stop the storage enclosure on the node
            actuator_mgr = ActuatorManager()
            actuator_mgr.enclosure_stop(node_name)
            Log.info(f"Enclosure stopped for {node_name}")
            # TODO: Update enclosure health
            # Put node in standby mode
            self._execute.run_cmd(
                const.PCS_NODE_STANDBY.replace("<node>", node_name),
                f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
            Log.info(f"Executed node standby for node {node_id}")
            self._controllers[const.SERVICE_CONTROLLER].clear_resources(node_id=node_name)
        else:
            self._execute.run_cmd(
                const.PCS_NODE_STANDBY.replace("<node>", node_name),
                f" --wait={const.CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}")
            Log.info(f"Executed node standby for node {node_id}")
        status = f"For node {node_id}, standby is in progress"
        # Update node health
        # TODO: Health event update to be removed once fault_tolerance branch is merged
        initial_event = self._system_health.get_health_event_template(
            nodeid=node_id, event_type=HEALTH_EVENTS.FAULT.value)
        Log.debug(f"Node health : {initial_event} updated for node {node_id}")
        health_event = HealthEvent.dict_to_object(initial_event)
        self._system_health.process_event(health_event)
        # Node power off
        if poweroff:
            self._execute.run_cmd(const.DISABLE_STONITH.replace("<node>", node_name))
            self.fencing_agent.power_off(node_id=node_name)
            status = f"Power off for node {node_id} is in progress"
            Log.info(f"Node power off successful. status : {status}")
        # TODO: Return status should be changed according to the passed parameters
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "error": "",
            "output": status
        }
    except Exception as e:
        raise ClusterManagerError(f"Failed to stop node {node_id}, Error: {e}")
def stop(self, sync=False, timeout=30) -> dict:
    """
    Stop cluster and all services.

    Args:
        sync (bool, optional): if sync is True then stop will check the status for timeout seconds.
        timeout (int, optional): timeout (in seconds) can be specified for sync=True, otherwise ignored.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    status: str = ""
    if not self._is_pcs_cluster_running():
        raise ClusterManagerError(
            "Cluster not running on current node. "
            "To stop cluster, it should be running on the current node.")
    node_group: list = self._get_node_group()
    local_node: str = ConfigManager.get_local_node()
    Log.info(f"Node group for cluster stop {node_group}, local node {local_node}")
    self_group: list = list(filter(lambda group: (local_node in group), node_group))[0]
    node_group.remove(self_group)
    offline_nodes = self._get_filtered_nodes([NODE_STATUSES.POWEROFF.value])
    # Stop cluster for other groups
    for node_subgroup in node_group:
        for node_name in node_subgroup:
            # Offline node can not be started without stonith.
            if node_name not in offline_nodes:
                if self.heal_resource(node_name):
                    time.sleep(const.BASE_WAIT_TIME)
                node_id = ConfigManager.get_node_id(node_name)
                res = json.loads(self._controllers[const.NODE_CONTROLLER].stop(node_id))
                Log.info(f"Stopping node {node_id}, output {res}")
                if NODE_STATUSES.POWEROFF.value in res.get("output"):
                    offline_nodes.append(node_id)
                    Log.warn(f"Node {node_id} is offline or lost from network.")
                elif res.get("status") == const.STATUSES.FAILED.value:
                    raise ClusterManagerError(f"Cluster Stop failed. Unable to stop {node_id}")
                else:
                    Log.info(f"Node {node_id} stop is in progress.")
            else:
                Log.info(f"Node {node_name} is offline or lost from network.")
        # Wait till the resources get stopped.
        Log.info(f"Waiting, stop of {node_subgroup} is in progress.")
    # Stop self group of cluster
    try:
        Log.info(f"Please wait, trying to stop self node group: {self_group}")
        timeout = const.NODE_STOP_TIMEOUT * len(self_group)
        self._execute.run_cmd(const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
        Log.info("Cluster stop completed.")
    except Exception as e:
        raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
    status = "Cluster stop is in progress."
    if len(offline_nodes) != 0:
        status += f" Warning, found {offline_nodes}, which may be powered off or not in network."
    if sync:
        timeout = timeout - const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE * len(node_group)
        in_expected_state = self._verify_expected_cluster_status(const.CLUSTER_STATUS.OFFLINE, timeout)
        if in_expected_state:
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": "Cluster is offline.",
                "error": ""
            }
        else:
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "Retry suggested.",
                "error": "Operation timed out."
            }
    return {
        "status": const.STATUSES.IN_PROGRESS.value,
        "output": status,
        "error": ""
    }