def destroy_cluster(argv):
    if len(argv) > 0:
        # stop pacemaker and resources while cluster is still quorate
        nodes = argv
        node_errors = parallel_for_nodes(
            utils.repeat_if_timeout(utils.stopPacemaker), nodes, quiet=True
        )
        # proceed with destroy regardless of errors
        # destroy will stop any remaining cluster daemons
        node_errors = parallel_for_nodes(utils.destroyCluster, nodes, quiet=True)
        if node_errors:
            utils.err(
                "unable to destroy cluster\n" + "\n".join(node_errors.values())
            )
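
# destroy_cluster() and the functions below lean on parallel_for_nodes(),
# which is defined elsewhere in this module. A minimal sketch of the contract
# assumed here (not the real implementation): run worker(node, *args, **kwargs)
# for every node concurrently and return a dict mapping each failed node to its
# error message, so an empty dict means every node succeeded and
# "\n".join(node_errors.values()) yields one error line per failed node.
def _parallel_for_nodes_sketch(worker, nodes, *args, **kwargs):
    from concurrent.futures import ThreadPoolExecutor

    node_errors = {}
    with ThreadPoolExecutor(max_workers=max(len(nodes), 1)) as pool:
        futures = {
            pool.submit(worker, node, *args, **kwargs): node for node in nodes
        }
        for future, node in futures.items():
            try:
                future.result()
            except Exception as e:
                node_errors[node] = str(e)
    return node_errors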
def wait_for_nodes_started(node_list, timeout=None):
    timeout = 60 * 15 if timeout is None else timeout
    interval = 2
    stop_at = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
    print("Waiting for node(s) to start...")
    if not node_list:
        code, output = wait_for_local_node_started(stop_at, interval)
        if code != 0:
            utils.err(output)
        else:
            print(output)
    else:
        node_errors = parallel_for_nodes(
            wait_for_remote_node_started, node_list, stop_at, interval
        )
        if node_errors:
            utils.err("unable to verify all nodes have started")
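
# wait_for_local_node_started() and wait_for_remote_node_started() are defined
# elsewhere in this module. A minimal sketch of the poll-until-deadline pattern
# they are assumed to follow, with check_started as a hypothetical stand-in for
# the actual status probe; the (code, output) return shape matches how
# wait_for_nodes_started() consumes the result above.
def _wait_until_started_sketch(check_started, stop_at, interval):
    import time

    while True:
        if check_started():
            return 0, "Started"
        if datetime.datetime.now() > stop_at:
            return 1, "waiting timeout"
        time.sleep(interval)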
def start_cluster_nodes(nodes):
    # Large clusters take longer time to start up. So we make the timeout
    # longer for each 8 nodes:
    #  1 -  8 nodes: 1 * timeout
    #  9 - 16 nodes: 2 * timeout
    # 17 - 24 nodes: 3 * timeout
    # and so on
    # Users can override this and set their own timeout by specifying
    # the --request-timeout option (see utils.sendHTTPRequest).
    timeout = int(
        settings.default_request_timeout * math.ceil(len(nodes) / 8.0)
    )
    node_errors = parallel_for_nodes(
        utils.startCluster, nodes, quiet=True, timeout=timeout
    )
    if node_errors:
        utils.err(
            "unable to start all nodes\n" + "\n".join(node_errors.values())
        )
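
# A worked example of the timeout scaling above, using an illustrative value
# of 60 seconds for settings.default_request_timeout (the real default may
# differ): 20 nodes fall into the 17-24 bucket, so each request gets
# 3 * 60 == 180 seconds before timing out.
#
#   >>> import math
#   >>> int(60 * math.ceil(20 / 8.0))
#   180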
def stop_cluster_nodes(nodes):
    all_nodes = utils.get_corosync_conf_facade().get_nodes_names()
    unknown_nodes = set(nodes) - set(all_nodes)
    if unknown_nodes:
        utils.err(
            "nodes '%s' do not appear to exist in configuration"
            % "', '".join(unknown_nodes)
        )
    stopping_all = set(nodes) >= set(all_nodes)
    if "--force" not in utils.pcs_options and not stopping_all:
        error_list = []
        for node in nodes:
            retval, data = utils.get_remote_quorumtool_output(node)
            if retval != 0:
                error_list.append(node + ": " + data)
                continue
            try:
                quorum_status = corosync_live.QuorumStatus.from_string(data)
                if not quorum_status.is_quorate:
                    # Get quorum status from a quorate node, non-quorate nodes
                    # may provide inaccurate info. If no node is quorate, there
                    # is no quorum to be lost and therefore no error to be
                    # reported.
                    continue
                if quorum_status.stopping_nodes_cause_quorum_loss(nodes):
                    utils.err(
                        "Stopping the node(s) will cause a loss of the quorum"
                        + ", use --force to override"
                    )
                else:
                    # We have the info, no need to print errors
                    error_list = []
                    break
            except corosync_live.QuorumStatusException:
                if not utils.is_node_offline_by_quorumtool_output(data):
                    error_list.append(node + ": Unable to get quorum status")
                # else the node seems to be stopped already
        if error_list:
            utils.err(
                "Unable to determine whether stopping the nodes will cause "
                + "a loss of the quorum, use --force to override\n"
                + "\n".join(error_list)
            )

    was_error = False
    node_errors = parallel_for_nodes(
        utils.repeat_if_timeout(utils.stopPacemaker), nodes, quiet=True
    )
    accessible_nodes = [
        node for node in nodes if node not in node_errors.keys()
    ]
    if node_errors:
        utils.err(
            "unable to stop all nodes\n" + "\n".join(node_errors.values()),
            exit_after_error=not accessible_nodes
        )
        was_error = True

    for node in node_errors:
        print("{0}: Not stopping cluster - node is unreachable".format(node))

    node_errors = parallel_for_nodes(
        utils.stopCorosync, accessible_nodes, quiet=True
    )
    if node_errors:
        utils.err(
            "unable to stop all nodes\n" + "\n".join(node_errors.values())
        )
    if was_error:
        utils.err("unable to stop all nodes")
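
# A toy illustration of the quorum check performed above. The function and
# field names below are hypothetical, not the pcs API: with corosync
# votequorum, stopping nodes removes their votes, and quorum is lost once the
# remaining votes fall below the quorum threshold. Detecting that situation
# before anything is shut down is what
# QuorumStatus.stopping_nodes_cause_quorum_loss() is used for.
def _stopping_causes_quorum_loss_sketch(node_votes, quorum, nodes_to_stop):
    # node_votes: dict of node name -> votes; quorum: votes needed for quorum
    remaining = sum(
        votes for node, votes in node_votes.items() if node not in nodes_to_stop
    )
    return remaining < quorum

# Example: three one-vote nodes with quorum == 2; stopping two of them leaves
# only one vote, so quorum would be lost and --force would be required:
#
#   >>> _stopping_causes_quorum_loss_sketch(
#   ...     {"node1": 1, "node2": 1, "node3": 1}, 2, ["node1", "node2"]
#   ... )
#   True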