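# Hedged sketch of the imports this excerpt from lib/bootstrap.py relies on;
# the exact import block in the full module may differ, and the helper
# _GatherMasterVotes used below is defined elsewhere in the same module.
import logging

from ganeti import config
from ganeti import constants
from ganeti import errors
from ganeti import jstore
from ganeti import netutils
from ganeti import pathutils
from ganeti import rpc
from ganeti import ssconf
from ganeti import utils
import ganeti.utils.livelock  # binds the submodule so utils.livelock resolves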
def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  Note: The call to MasterFailover from lib/client/gnt_cluster.py checks that
  a majority of nodes are healthy and responding before calling this. If this
  function is called from somewhere else, the caller should also verify that a
  majority of nodes are healthy.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
      (dangerous)

  @returns: the pair of an exit code and warnings to display

  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_names = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
                               " where you want the new master to be."
                               " %s is already the master" %
                               old_master, errors.ECODE_INVAL)

  if new_master not in mc_list:
    mc_no_master = [name for name in mc_list if name != old_master]
    raise errors.OpPrereqError("This node is not among the nodes marked"
                               " as master candidates. Only these nodes"
                               " can become masters. Current list of"
                               " master candidates is:\n"
                               "%s" % ("\n".join(mc_no_master)),
                               errors.ECODE_STATE)

  if not no_voting:
    # sanity-check that the rest of the cluster agrees on who the master is
    vote_list = _GatherMasterVotes(node_names)
    if vote_list:
      voted_master = vote_list[0][0]
      if voted_master != old_master:
        raise errors.OpPrereqError("I have a wrong configuration, I believe"
                                   " the master is %s but the other nodes"
                                   " voted %s. Please resync the configuration"
                                   " of this node." %
                                   (old_master, voted_master),
                                   errors.ECODE_STATE)
  # end checks

  rcode = 0
  warnings = []

  logging.info("Setting master to %s, old master: %s", new_master, old_master)

  try:
    # Forcefully start WConfd so that we can access the configuration
    result = utils.RunCmd([pathutils.DAEMON_UTIL,
                           "start", constants.WCONFD, "--force-node",
                           "--no-voting", "--yes-do-it"])
    if result.failed:
      raise errors.OpPrereqError("Could not start the configuration daemon,"
                                 " command %s had exitcode %s and error %s" %
                                 (result.cmd, result.exit_code, result.output),
                                 errors.ECODE_NOENT)

    # instantiate a real config writer, as we now know we have the
    # configuration data
    livelock = utils.livelock.LiveLock("bootstrap_failover")
    cfg = config.GetConfig(None, livelock, accept_foreign=True)

    old_master_node = cfg.GetNodeInfoByName(old_master)
    if old_master_node is None:
      raise errors.OpPrereqError("Could not find old master node '%s' in"
                                 " cluster configuration." % old_master,
                                 errors.ECODE_NOENT)

    cluster_info = cfg.GetClusterInfo()
    new_master_node = cfg.GetNodeInfoByName(new_master)
    if new_master_node is None:
      raise errors.OpPrereqError("Could not find new master node '%s' in"
                                 " cluster configuration." % new_master,
                                 errors.ECODE_NOENT)

    cluster_info.master_node = new_master_node.uuid
    # this will also regenerate the ssconf files, since we updated the
    # cluster info
    cfg.Update(cluster_info, logging.error)

    # if cfg.Update worked, then it means the old master daemon won't be
    # able now to write its own config file (we rely on locking in both
    # backend.UploadFile() and ConfigWriter._Write()); hence the next
    # step is to kill the old master

    logging.info("Stopping the master daemon on node %s", old_master)

    runner = rpc.BootstrapRunner()
    master_params = cfg.GetMasterNetworkParameters()
    master_params.uuid = old_master_node.uuid
    ems = cfg.GetUseExternalMipScript()
    result = runner.call_node_deactivate_master_ip(old_master,
                                                   master_params, ems)

    msg = result.fail_msg
    if msg:
      warning = "Could not disable the master IP: %s" % (msg,)
      logging.warning("%s", warning)
      warnings.append(warning)

    result = runner.call_node_stop_master(old_master)
    msg = result.fail_msg
    if msg:
      warning = ("Could not disable the master role on the old master"
                 " %s, please disable manually: %s" % (old_master, msg))
      logging.error("%s", warning)
      warnings.append(warning)
  except errors.ConfigurationError as err:
    logging.error("Error while trying to set the new master: %s",
                  str(err))
    return 1, warnings
  finally:
    # stop WConfd again:
    result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop", constants.WCONFD])
    if result.failed:
      warning = ("Could not stop the configuration daemon,"
                 " command %s had exitcode %s and error %s" %
                 (result.cmd, result.exit_code, result.output))
      logging.error("%s", warning)
      rcode = 1

  logging.info("Checking master IP non-reachability...")

  master_ip = sstore.GetMasterIP()
  total_timeout = 30

  # Here we have a phase where no master should be running
  def _check_ip(expected):
    if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT) != expected:
      raise utils.RetryAgain()

  try:
    # retry with backoff: start at 1s between pings, factor 1.5, capped at 5s
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[False])
  except utils.RetryTimeout:
    warning = ("The master IP is still reachable after %s seconds,"
               " continuing but activating the master IP on the current"
               " node will probably fail" % total_timeout)
    logging.warning("%s", warning)
    warnings.append(warning)
    rcode = 1

  if jstore.CheckDrainFlag():
    logging.info("Undraining job queue")
    jstore.SetDrainFlag(False)

  logging.info("Starting the master daemons on the new master")

  result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
                                                                no_voting)
  msg = result.fail_msg
  if msg:
    logging.error("Could not start the master role on the new master"
                  " %s, please check: %s", new_master, msg)
    rcode = 1

  # Finally verify that the new master managed to set up the master IP
  # and warn if it didn't.
  try:
    utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[True])
  except utils.RetryTimeout:
    warning = ("The master IP did not come up within %s seconds; the"
               " cluster should still be working and reachable via %s,"
               " but not via the master IP address" %
               (total_timeout, new_master))
    logging.warning("%s", warning)
    warnings.append(warning)
    rcode = 1

  logging.info("Master failed over from %s to %s", old_master, new_master)

  return rcode, warnings
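

# A minimal usage sketch (hypothetical; everything below other than
# MasterFailover itself is an assumption, not part of this module). The real
# caller lives in lib/client/gnt_cluster.py and first verifies that a
# majority of nodes are healthy and responding, as the docstring requires:
#
#   rcode, warnings = MasterFailover(no_voting=False)
#   for warning in warnings:
#     logging.warning("%s", warning)
#   sys.exit(rcode)
#
# Passing no_voting=True skips the cluster-wide vote and is only meant for
# recovery situations where a majority of nodes cannot respond.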