def quorum_unblock_cmd(argv):
    """
    Cancel corosync's "wait for all" state so the cluster can become quorate.

    argv -- command line arguments; the command takes none, so any argument
        prints the "unblock" usage and exits with status 1

    Nodes listed in corosync.conf that are not among the active corosync
    nodes are confirmed as fenced via stonith.stonith_confirm before
    corosync is told to stop waiting. Unless "--force" is present in
    utils.pcs_options, the user is asked to acknowledge the risk first.
    """
    if argv:
        usage.quorum(["unblock"])
        sys.exit(1)

    if utils.is_rhel6():
        utils.err("operation is not supported on CMAN clusters")

    status_output, status_code = utils.run(
        ["corosync-cmapctl", "-g", "runtime.votequorum.wait_for_all_status"]
    )
    if status_code != 0:
        utils.err("unable to check quorum status")
    # Only the text following the last "=" is compared; "1" is treated as
    # "the cluster is waiting for all nodes".
    wait_flag = status_output.rpartition("=")[2].strip()
    if wait_flag != "1":
        utils.err("cluster is not waiting for nodes to establish quorum")

    configured_nodes = set(utils.getNodesFromCorosyncConf())
    active_nodes = set(utils.getCorosyncActiveNodes())
    unjoined_nodes = configured_nodes - active_nodes
    if not unjoined_nodes:
        utils.err("no unjoined nodes found")

    if "--force" not in utils.pcs_options:
        question = (
            "WARNING: If node(s) {nodes} are not powered off or they do"
            " have access to shared resources, data corruption and/or"
            " cluster failure may occur. Are you sure you want to"
            " continue? [y/N] "
        )
        reply = utils.get_terminal_input(
            question.format(nodes=", ".join(unjoined_nodes))
        )
        if reply.lower() not in ("y", "yes"):
            print("Canceled")
            return

    for node_name in unjoined_nodes:
        stonith.stonith_confirm([node_name], skip_question=True)

    _, cancel_code = utils.run(
        ["corosync-cmapctl", "-s", "quorum.cancel_wait_for_all", "u8", "1"]
    )
    if cancel_code != 0:
        utils.err("unable to cancel waiting for nodes")
    print("Quorum unblocked")

    # The property is first written with the opposite value and then with
    # its original value.
    # NOTE(review): presumably this forces a CIB change so pacemaker
    # re-evaluates the cluster state - confirm.
    startup_fencing = utils.get_set_properties().get("startup-fencing", "")
    if startup_fencing.lower() != "false":
        toggled_value = "false"
    else:
        toggled_value = "true"
    utils.set_cib_property("startup-fencing", toggled_value)
    utils.set_cib_property("startup-fencing", startup_fencing)
    print("Waiting for nodes canceled")
def cluster_pcsd_status(argv, dont_exit=False):
    """
    Check the given nodes with check_nodes, or all configured nodes if none.

    argv -- node names to check; when empty, the nodes are read with
        utils.getNodesFromCorosyncConf()
    dont_exit -- when true, do not exit even if check_nodes reports a problem

    Exits with status 2 when check_nodes returns a true value and dont_exit
    is false.
    """
    nodes = argv
    if len(argv) == 0:
        nodes = utils.getNodesFromCorosyncConf()
        if len(nodes) == 0:
            # The error names the config file matching the cluster stack.
            message = (
                "no nodes found in cluster.conf" if utils.is_rhel6()
                else "no nodes found in corosync.conf"
            )
            utils.err(message)

    if check_nodes(nodes, " ") and not dont_exit:
        sys.exit(2)
def stonith_level_verify():
    """
    Validate every "fencing-level" element found in the CIB.

    For each level, every comma separated entry of its "devices" attribute
    must satisfy utils.is_stonith_resource, and its "target" must be listed
    among the corosync or pacemaker nodes. Each violation is reported with
    utils.err.
    """
    cib = utils.get_cib_dom()
    # corosync.conf may not exist; in that case only pacemaker nodes count.
    if utils.hasCorosyncConf():
        corosync_nodes = utils.getNodesFromCorosyncConf()
    else:
        corosync_nodes = []
    pacemaker_nodes = utils.getNodesFromPacemaker()

    for level_el in cib.getElementsByTagName("fencing-level"):
        target = level_el.getAttribute("target")
        device_ids = level_el.getAttribute("devices").split(",")
        for device_id in device_ids:
            if not utils.is_stonith_resource(device_id):
                utils.err("%s is not a stonith id" % device_id)
        if not (target in corosync_nodes or target in pacemaker_nodes):
            utils.err("%s is not currently a node" % target)
def stonith_level_add(level, node, devices):
    """
    Add a fencing level to the CIB fencing topology.

    level -- string made of digits only and not all zeros; leading zeros are
        stripped before it is stored as the "index" attribute
    node -- name stored as the "target" attribute
    devices -- comma separated stonith ids stored as the "devices" attribute

    Unless "--force" is in utils.pcs_options, every device must satisfy
    utils.is_stonith_resource and the node must be a known corosync or
    pacemaker node. An identical (target, index, devices) level is reported
    as an error. The modified CIB is pushed with
    utils.replace_cib_configuration.
    """
    dom = utils.get_cib_dom()

    only_digits = re.search(r'^\d+$', level)
    if not only_digits or re.search(r'^0+$', level):
        utils.err("invalid level '{0}', use a positive integer".format(level))
    level = level.lstrip('0')

    if "--force" not in utils.pcs_options:
        for device_id in devices.split(","):
            if not utils.is_stonith_resource(device_id):
                utils.err(
                    "%s is not a stonith id (use --force to override)"
                    % device_id
                )
        if utils.hasCorosyncConf():
            corosync_nodes = utils.getNodesFromCorosyncConf()
        else:
            corosync_nodes = []
        pacemaker_nodes = utils.getNodesFromPacemaker()
        if not (node in corosync_nodes or node in pacemaker_nodes):
            utils.err(
                "%s is not currently a node (use --force to override)" % node
            )

    # Reuse the existing fencing-topology element or create one under the
    # first "configuration" element.
    topology_list = dom.getElementsByTagName("fencing-topology")
    if len(topology_list) == 0:
        configuration = dom.getElementsByTagName("configuration")[0]
        topology = dom.createElement("fencing-topology")
        configuration.appendChild(topology)
    else:
        topology = topology_list[0]

    for existing in topology.getElementsByTagName("fencing-level"):
        if (
            existing.getAttribute("target") == node
            and existing.getAttribute("index") == level
            and existing.getAttribute("devices") == devices
        ):
            utils.err(
                "unable to add fencing level, fencing level for node: %s, "
                "at level: %s, with device: %s already exists"
                % (node, level, devices)
            )

    new_level = dom.createElement("fencing-level")
    topology.appendChild(new_level)
    for attr_name, attr_value in (
        ("target", node),
        ("index", level),
        ("devices", devices),
    ):
        new_level.setAttribute(attr_name, attr_value)
    # The id is generated last, once the element is already in the document.
    new_level.setAttribute(
        "id", utils.find_unique_id(dom, "fl-" + node + "-" + level)
    )

    utils.replace_cib_configuration(dom)
def pcsd_sync_certs(argv, exit_after_error=True, async_restart=False):
    """
    Send local pcsd certificates to nodes and restart pcsd where it worked.

    argv -- nodes to sync; when empty, the nodes are read with
        utils.getNodesFromCorosyncConf()
    exit_after_error -- passed to utils.err as its second argument and to
        pcsd_restart_nodes; when true and no node was synced successfully,
        the process exits with status 1
    async_restart -- forwarded to pcsd_restart_nodes as a keyword argument
    """
    nodes_sync = argv or utils.getNodesFromCorosyncConf()
    print("Synchronizing pcsd certificates on nodes {0}...".format(
        ", ".join(nodes_sync)
    ))

    output, retval = utils.run_pcsdcli(
        "send_local_certs", {"nodes": nodes_sync}
    )
    if retval != 0 or output["status"] != "ok" or not output["data"]:
        utils.err("Unable to sync pcsd certificates", exit_after_error)
        return

    nodes_restart = []
    error = False
    try:
        sync_result = output["data"]
        per_node = sync_result["node_status"]
        if per_node:
            for node, status in per_node.items():
                print("{0}: {1}".format(node, status["text"]))
                if status["status"] == "ok":
                    nodes_restart.append(node)
                else:
                    error = True
        if sync_result["status"] != "ok":
            error = True
            utils.err(sync_result["text"], False)
        # Continue to the restart as long as at least one node was synced.
        if error and not nodes_restart:
            if exit_after_error:
                sys.exit(1)
            return
    except (KeyError, AttributeError):
        # The reply did not have the expected structure.
        utils.err("Unable to communicate with pcsd", exit_after_error)
        return

    print(
        "Restarting pcsd on the nodes in order to reload the certificates..."
    )
    pcsd_restart_nodes(
        nodes_restart, exit_after_error, async_restart=async_restart
    )
def pcsd_sync_certs(argv, exit_after_error=True):
    """
    Distribute local pcsd certificates and restart pcsd on synced nodes.

    argv -- nodes to sync; when empty, the nodes are read with
        utils.getNodesFromCorosyncConf()
    exit_after_error -- passed to utils.err as its second argument and to
        pcsd_restart_nodes; when true and no node was synced successfully,
        the process exits with status 1
    """
    nodes_sync = argv or utils.getNodesFromCorosyncConf()
    synced_nodes = []
    failed = False

    print("Synchronizing pcsd certificates on nodes {0}...".format(
        ", ".join(nodes_sync)
    ))
    request = {"nodes": nodes_sync}
    output, retval = utils.run_pcsdcli("send_local_certs", request)

    call_succeeded = (
        retval == 0 and output["status"] == "ok" and output["data"]
    )
    if not call_succeeded:
        utils.err("Unable to sync pcsd certificates", exit_after_error)
        return

    try:
        sync_result = output["data"]
        # An empty or missing per-node report means there is nothing to
        # iterate over.
        for node, status in (sync_result["node_status"] or {}).items():
            print("{0}: {1}".format(node, status["text"]))
            if status["status"] == "ok":
                synced_nodes.append(node)
                continue
            failed = True
        if sync_result["status"] != "ok":
            failed = True
            utils.err(sync_result["text"], False)
        if failed and not synced_nodes:
            # Nothing was synced, so there is nothing to restart.
            if exit_after_error:
                sys.exit(1)
            return
    except (KeyError, AttributeError):
        utils.err("Unable to communicate with pcsd", exit_after_error)
        return

    print("Restarting pcsd on the nodes in order to reload the certificates...")
    pcsd_restart_nodes(synced_nodes, exit_after_error)
def stonith_level_add(level, node, devices):
    """
    Create a new "fencing-level" element in the CIB.

    level -- string of digits, not all zeros; stored without leading zeros
        in the "index" attribute
    node -- value of the "target" attribute
    devices -- comma separated stonith ids, value of the "devices" attribute

    The device and node checks are skipped when "--force" is present in
    utils.pcs_options. A level with the same target, index and devices is
    reported as already existing. The result is written back using
    utils.replace_cib_configuration.
    """
    dom = utils.get_cib_dom()

    level_ok = re.search(r'^\d+$', level) and not re.search(r'^0+$', level)
    if not level_ok:
        utils.err("invalid level '{0}', use a positive integer".format(level))
    level = level.lstrip('0')

    validate = "--force" not in utils.pcs_options
    if validate:
        for dev in devices.split(","):
            if utils.is_stonith_resource(dev):
                continue
            utils.err("%s is not a stonith id (use --force to override)" % dev)
        corosync_nodes = (
            utils.getNodesFromCorosyncConf() if utils.hasCorosyncConf()
            else []
        )
        pacemaker_nodes = utils.getNodesFromPacemaker()
        known_node = node in corosync_nodes or node in pacemaker_nodes
        if not known_node:
            utils.err("%s is not currently a node (use --force to override)" % node)

    found_topologies = dom.getElementsByTagName("fencing-topology")
    if len(found_topologies) != 0:
        ft = found_topologies[0]
    else:
        # No topology section yet: add one to the first "configuration"
        # element.
        conf = dom.getElementsByTagName("configuration")[0]
        ft = dom.createElement("fencing-topology")
        conf.appendChild(ft)

    requested = (node, level, devices)
    for fl in ft.getElementsByTagName("fencing-level"):
        present = (
            fl.getAttribute("target"),
            fl.getAttribute("index"),
            fl.getAttribute("devices"),
        )
        if present == requested:
            utils.err(
                "unable to add fencing level, fencing level for node: %s, at level: %s, with device: %s already exists"
                % requested
            )

    new_fl = dom.createElement("fencing-level")
    ft.appendChild(new_fl)
    new_fl.setAttribute("target", node)
    new_fl.setAttribute("index", level)
    new_fl.setAttribute("devices", devices)
    id_base = "fl-" + node + "-" + level
    new_fl.setAttribute("id", utils.find_unique_id(dom, id_base))

    utils.replace_cib_configuration(dom)
def config_restore_remote(infile_name, infile_obj):
    """
    Restore a configuration backup tarball on all nodes listed in it.

    infile_name -- path of the tarball; passed to tarfile.open and, when
        infile_obj is not given, opened to read the raw tarball data
    infile_obj -- optional file object with the tarball; it is rewound with
        seek(0) before its content is sent to the nodes

    The node list is taken from cluster.conf (when utils.is_rhel6() is true)
    or corosync.conf stored in the tarball. Restoring is refused while the
    cluster is reported as running on any of those nodes. All problems are
    reported through utils.err.
    """
    extracted = {
        "version.txt": "",
        "corosync.conf": "",
        "cluster.conf": "",
    }
    try:
        tarball = tarfile.open(infile_name, "r|*", infile_obj)
        # try/finally instead of "with" keeps python2.6 compatibility and
        # makes sure the archive is closed even if reading fails.
        try:
            while True:
                # next(tarball) does not work in python2.6
                tar_member_info = tarball.next()
                if tar_member_info is None:
                    break
                if tar_member_info.name not in extracted:
                    continue
                tar_member = tarball.extractfile(tar_member_info)
                if tar_member is None:
                    # extractfile returns None for members which are not
                    # regular files or links (e.g. a directory named like
                    # one of the expected files); treat it as missing.
                    continue
                try:
                    extracted[tar_member_info.name] = tar_member.read()
                finally:
                    tar_member.close()
        finally:
            tarball.close()
    except (tarfile.TarError, EnvironmentError) as e:
        utils.err("unable to read the tarball: %s" % e)

    config_backup_check_version(extracted["version.txt"])

    conf_name = "cluster.conf" if utils.is_rhel6() else "corosync.conf"
    conf_text = extracted[conf_name]
    # Data read from the tarball are bytes, but the placeholder used when
    # the file is missing is a text string, which has no decode() in
    # python3. Decode only real bytes so a missing file ends up reported as
    # "no nodes found in the tarball" instead of raising AttributeError.
    if isinstance(conf_text, bytes):
        conf_text = conf_text.decode("utf-8")
    node_list = utils.getNodesFromCorosyncConf(conf_text)
    if not node_list:
        utils.err("no nodes found in the tarball")

    err_msgs = []
    for node in node_list:
        try:
            retval, output = utils.checkStatus(node)
            if retval != 0:
                err_msgs.append(output)
                continue
            status = json.loads(output)
            if (
                status["corosync"]
                or
                status["pacemaker"]
                or
                status["cman"]
                or
                # not supported by older pcsd, do not fail if not present
                status.get("pacemaker_remote", False)
            ):
                err_msgs.append(
                    "Cluster is currently running on node %s. You need to stop "
                    "the cluster in order to restore the configuration."
                    % node
                )
                continue
        except (ValueError, NameError, LookupError):
            err_msgs.append("unable to determine status of the node %s" % node)
    if err_msgs:
        # Print all collected problems first, then exit once.
        for msg in err_msgs:
            utils.err(msg, False)
        sys.exit(1)

    # Temporarily disable config files syncing thread in pcsd so it will not
    # rewrite restored files. 10 minutes should be enough time to restore.
    # If node returns HTTP 404 it does not support config syncing at all.
    for node in node_list:
        retval, output = utils.pauseConfigSyncing(node, 10 * 60)
        if not (retval == 0 or "(HTTP error: 404)" in output):
            utils.err(output)

    if infile_obj:
        infile_obj.seek(0)
        tarball_data = infile_obj.read()
    else:
        with open(infile_name, "rb") as tarball:
            tarball_data = tarball.read()

    error_list = []
    for node in node_list:
        retval, error = utils.restoreConfig(node, tarball_data)
        if retval != 0:
            error_list.append(error)
    if error_list:
        utils.err("unable to restore all nodes\n" + "\n".join(error_list))
def nodes_status(argv):
    """
    Print cluster nodes, selected by a single optional argument.

    argv -- command line arguments; a mode is recognized only when exactly
        one argument is given:
        "pacemaker-id" / "corosync-id" -- print "id name" pairs and return
        "config" -- print configured corosync and pacemaker node names
        "corosync" -- print corosync online/offline nodes, exit with 0
        "both" -- print corosync nodes followed by pacemaker nodes
        Anything else prints pacemaker nodes and pacemaker remote nodes
        grouped by state.
    """
    mode = argv[0] if len(argv) == 1 else None

    if mode == "pacemaker-id":
        for node_id, node_name in utils.getPacemakerNodesID().items():
            print("{0} {1}".format(node_id, node_name))
        return

    if mode == "corosync-id":
        for node_id, node_name in utils.getCorosyncNodesID().items():
            print("{0} {1}".format(node_id, node_name))
        return

    if mode == "config":
        corosync_nodes = (
            utils.getNodesFromCorosyncConf() if utils.hasCorosyncConf()
            else []
        )
        try:
            cluster_state = ClusterState(utils.getClusterStateXml())
            pacemaker_nodes = sorted(
                node.attrs.name
                for node in cluster_state.node_section.nodes
                if node.attrs.type != 'remote'
            )
        except LibraryError as e:
            utils.process_library_reports(e.args)
        print("Corosync Nodes:")
        if corosync_nodes:
            print(" " + " ".join(corosync_nodes))
        print("Pacemaker Nodes:")
        if pacemaker_nodes:
            print(" " + " ".join(pacemaker_nodes))
        return

    if mode in ("corosync", "both"):
        all_nodes = utils.getNodesFromCorosyncConf()
        online_nodes = utils.getCorosyncActiveNodes()
        offline_nodes = sorted(
            node for node in all_nodes if node not in online_nodes
        )
        online_nodes.sort()
        print("Corosync Nodes:")
        print(" ".join([" Online:"] + online_nodes))
        print(" ".join([" Offline:"] + offline_nodes))
        if mode != "both":
            sys.exit(0)

    info_dom = utils.getClusterState()
    nodes = info_dom.getElementsByTagName("nodes")
    if nodes.length == 0:
        utils.err("No nodes section found")

    # Node names are collected per (is remote node, printed label).
    labels = (" Online:", " Standby:", " Maintenance:", " Offline:")
    grouped = dict(
        ((is_remote, label), [])
        for is_remote in (False, True)
        for label in labels
    )
    for node in nodes[0].getElementsByTagName("node"):
        node_name = node.getAttribute("name")
        is_remote = node.getAttribute("type") == "remote"
        # An online node is listed as standby in preference to maintenance.
        if node.getAttribute("online") != "true":
            label = " Offline:"
        elif node.getAttribute("standby") == "true":
            label = " Standby:"
        elif node.getAttribute("maintenance") == "true":
            label = " Maintenance:"
        else:
            label = " Online:"
        grouped[(is_remote, label)].append(node_name)

    sections = (
        ("Pacemaker Nodes:", False),
        ("Pacemaker Remote Nodes:", True),
    )
    for heading, is_remote in sections:
        print(heading)
        for label in labels:
            print(" ".join([label] + grouped[(is_remote, label)]))
def nodes_status(argv):
    """
    Show node lists; the single optional argument selects what is shown.

    argv -- command line arguments, interpreted only if there is exactly one:
        "pacemaker-id", "corosync-id" -- list node ids with node names
        "config" -- list node names known to corosync.conf and pacemaker
        "corosync" -- list corosync nodes by online state and exit with 0
        "both" -- list corosync nodes, then continue with pacemaker nodes
        In all other cases pacemaker nodes and pacemaker remote nodes are
        listed by their state.
    """
    selector = argv[0] if len(argv) == 1 else None

    if selector in ("pacemaker-id", "corosync-id"):
        if selector == "pacemaker-id":
            id_map = utils.getPacemakerNodesID()
        else:
            id_map = utils.getCorosyncNodesID()
        for node_id, node_name in id_map.items():
            print("{0} {1}".format(node_id, node_name))
        return

    if selector == "config":
        corosync_nodes = []
        if utils.hasCorosyncConf():
            corosync_nodes = utils.getNodesFromCorosyncConf()
        try:
            state_nodes = ClusterState(
                utils.getClusterStateXml()
            ).node_section.nodes
            pacemaker_nodes = [
                node.attrs.name for node in state_nodes
                if node.attrs.type != 'remote'
            ]
            pacemaker_nodes.sort()
        except LibraryError as e:
            utils.process_library_reports(e.args)
        for title, names in (
            ("Corosync Nodes:", corosync_nodes),
            ("Pacemaker Nodes:", pacemaker_nodes),
        ):
            print(title)
            if names:
                print(" " + " ".join(names))
        return

    if selector == "corosync" or selector == "both":
        online_nodes = None
        all_nodes = utils.getNodesFromCorosyncConf()
        online_nodes = utils.getCorosyncActiveNodes()
        offline_nodes = [
            node for node in all_nodes if node not in online_nodes
        ]
        online_nodes.sort()
        offline_nodes.sort()
        print("Corosync Nodes:")
        print(" ".join([" Online:"] + online_nodes))
        print(" ".join([" Offline:"] + offline_nodes))
        if selector != "both":
            sys.exit(0)

    info_dom = utils.getClusterState()
    nodes = info_dom.getElementsByTagName("nodes")
    if nodes.length == 0:
        utils.err("No nodes section found")

    # Index 0 holds regular nodes, index 1 holds remote nodes.
    online = ([], [])
    standby = ([], [])
    maintenance = ([], [])
    offline = ([], [])
    for node in nodes[0].getElementsByTagName("node"):
        node_name = node.getAttribute("name")
        kind = 1 if node.getAttribute("type") == "remote" else 0
        if node.getAttribute("online") == "true":
            # Standby is checked before maintenance.
            if node.getAttribute("standby") == "true":
                bucket = standby
            elif node.getAttribute("maintenance") == "true":
                bucket = maintenance
            else:
                bucket = online
        else:
            bucket = offline
        bucket[kind].append(node_name)

    for kind, heading in enumerate(
        ("Pacemaker Nodes:", "Pacemaker Remote Nodes:")
    ):
        print(heading)
        print(" ".join([" Online:"] + online[kind]))
        print(" ".join([" Standby:"] + standby[kind]))
        print(" ".join([" Maintenance:"] + maintenance[kind]))
        print(" ".join([" Offline:"] + offline[kind]))