def get_server_state(server, host, pingtime=3, verbose=False): """Return the state of the server. This method returns one of the following states based on the criteria shown. UP - server is connected WARN - server is not connected but can be pinged DOWN - server cannot be pinged nor is connected server[in] Server class instance host[in] host name to ping if server is not connected pingtime[in] timeout in seconds for ping operation Default = 3 seconds verbose[in] if True, show ping status messages Default = False Returns string - state """ from mysql.utilities.common.tools import ping_host if verbose: print "# Attempting to contact %s ..." % host, if server is not None and server.is_alive(): if verbose: print "Success" return "UP" elif ping_host(host, pingtime): if verbose: print "Server is reachable" return "WARN" if verbose: print "FAIL" return "DOWN"
def run_auto_failover(self, console, interval): """Run automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail console[in] instance of the failover console class interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.common.tools import ping_host from mysql.utilities.common.tools import execute_script failover_mode = self.options.get("failover_mode", "auto") pingtime = self.options.get("pingtime", 3) timeout = int(self.options.get("timeout", 300)) exec_fail = self.options.get("exec_fail", None) post_fail = self.options.get("post_fail", None) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Check privileges self._report("# Checking privileges.") errors = self.topology.check_privileges(failover_mode != 'fail') if len(errors): msg = "User %s on %s does not have sufficient privileges to " + \ "execute the %s command." for error in errors: self._report(msg % (error[0], error[1], 'failover'), logging.CRITICAL) raise UtilRplError("Not enough privileges to execute command.") # Require --master-info-repository=TABLE for all slaves if not self.topology.check_master_info_type("TABLE"): msg = "Failover requires --master-info-repository=TABLE for " + \ "all slaves." 
self._report(msg, logging.ERROR, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print "# WARNING: %s" % _HOST_IP_WARNING self._report(_HOST_IP_WARNING, logging.WARN, False) print "#\n# Failover console will start in 10 seconds." time.sleep(10) # Test failover script. If it doesn't exist, fail. no_exec_fail_msg = "Failover check script cannot be found. Please " + \ "check the path and filename for accuracy and " + \ "restart the failover console." if exec_fail is not None and not os.path.exists(exec_fail): self._report(no_exec_fail_msg, logging.CRITICAL, False) raise UtilRplError(no_exec_fail_msg) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # Use try block in case master class has gone away. try: old_host = self.master.host old_port = self.master.port except: pass old_host = "UNKNOWN" old_port = "UNKNOWN" # If a failover script is provided, check it else check master # using connectivity checks. if exec_fail is not None: # Execute failover check script if not os.path.exists(exec_fail): self._report(no_exec_fail_msg, logging.CRITICAL, False) raise UtilRplError(no_exec_fail_msg) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(exec_fail, None, [old_host, old_port], self.verbose) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report( "# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for pingtime seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." 
% \ pingtime self._report(msg, logging.INFO, False) time.sleep(pingtime) try: self.topology.master.connect() except: self._report("Cannot reconnect to master.", logging.INFO, False) # Check the master again. If no connection or lost connection, # try ping and if still not alive, failover. This performs the # timeout threshold for detecting a down master. if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % \ "Master has failed and automatic failover is not enabled. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % "An error was encountered " + \ "during failover. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." 
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script( post_fail, False, [old_host, old_port, self.master.host, self.master.port]) # discover slaves if option was specified at startup elif self.options.get("discover", None) is not None and \ (not first_pass or self.options.get("rediscover", False)): # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False return True
def run(self):
    """Run automatic failover.

    This method implements the automatic failover facility. It uses the
    existing failover() method of the RplCommands class (held in
    self.rpl) to conduct failover. When the master goes down, the method
    can perform one of three actions:

    1) failover to list of candidates first then slaves
    2) failover to list of candidates only
    3) fail

    Returns bool - True = success, raises UtilRplError on error
    """
    failover_mode = self.mode
    pingtime = self.options.get("pingtime", 3)
    exec_fail = self.options.get("exec_fail", None)
    post_fail = self.options.get("post_fail", None)
    pedantic = self.options.get("pedantic", False)

    # Only works for GTID_MODE=ON
    if not self.rpl.topology.gtid_enabled():
        msg = ("Topology must support global transaction ids and have "
               "GTID_MODE=ON.")
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)

    # Require --master-info-repository=TABLE for all slaves
    if not self.rpl.topology.check_master_info_type("TABLE"):
        msg = ("Failover requires --master-info-repository=TABLE for "
               "all slaves.")
        self._report(msg, logging.ERROR, False)
        raise UtilRplError(msg)

    # Check for mixing IP and hostnames
    if not self.rpl.check_host_references():
        print("# WARNING: {0}".format(HOST_IP_WARNING))
        self._report(HOST_IP_WARNING, logging.WARN, False)
        print("#\n# Failover daemon will start in 10 seconds.")
        time.sleep(10)

    # Test failover script. If it doesn't exist, fail.
    no_exec_fail_msg = ("Failover check script cannot be found. Please "
                        "check the path and filename for accuracy and "
                        "restart the failover daemon.")
    if exec_fail is not None and not os.path.exists(exec_fail):
        self._report(no_exec_fail_msg, logging.CRITICAL, False)
        raise UtilRplError(no_exec_fail_msg)

    # Check existence of errant transactions on slaves
    errant_tnx = self.rpl.topology.find_errant_transactions()
    if errant_tnx:
        print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
        self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
        for host, port, tnx_set in errant_tnx:
            errant_msg = (" - For slave '{0}@{1}': "
                          "{2}".format(host, port, ", ".join(tnx_set)))
            print("# {0}".format(errant_msg))
            self._report(errant_msg, logging.WARN, False)
        # Raise an exception (to stop) if pedantic mode is ON
        if pedantic:
            msg = ("{0} Note: If you want to ignore this issue, please do "
                   "not use the --pedantic option."
                   "".format(_ERRANT_TNX_ERROR))
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

    self._report("Failover daemon started.", logging.INFO, False)
    self._report("Failover mode = {0}.".format(failover_mode),
                 logging.INFO, False)

    # Main loop - loop and fire on interval.
    done = False
    first_pass = True
    failover = False
    while not done:
        # Use try block in case master class has gone away.
        try:
            old_host = self.rpl.master.host
            old_port = self.rpl.master.port
        except:
            old_host = "UNKNOWN"
            old_port = "UNKNOWN"

        # If a failover script is provided, check it else check master
        # using connectivity checks.
        if exec_fail is not None:
            # Execute failover check script
            if not os.path.exists(exec_fail):
                self._report(no_exec_fail_msg, logging.CRITICAL, False)
                raise UtilRplError(no_exec_fail_msg)
            else:
                self._report("# Spawning external script for failover "
                             "checking.")
                res = execute_script(exec_fail, None,
                                     [old_host, old_port],
                                     self.rpl.verbose)
                if res == 0:
                    self._report("# Failover check script completed "
                                 "Ok. Failover averted.")
                else:
                    self._report("# Failover check script failed. "
                                 "Failover initiated", logging.WARN)
                    failover = True
        else:
            # Check the master. If not alive, wait for pingtime seconds
            # and try again.
            if self.rpl.topology.master is not None and \
               not self.rpl.topology.master.is_alive():
                msg = ("Master may be down. Waiting for {0} seconds."
                       "".format(pingtime))
                self._report(msg, logging.INFO, False)
                time.sleep(pingtime)
                try:
                    self.rpl.topology.master.connect()
                except:
                    pass

            # Check the master again. If no connection or lost connection,
            # try ping. This performs the timeout threshold for detecting
            # a down master. If still not alive, try to reconnect and if
            # connection fails after 3 attempts, failover.
            if self.rpl.topology.master is None or \
               not ping_host(self.rpl.topology.master.host,
                             pingtime) or \
               not self.rpl.topology.master.is_alive():
                failover = True
                if self._reconnect_master(self.pingtime):
                    failover = False  # Master is now connected again
                if failover:
                    self._report("Failed to reconnect to the master after "
                                 "3 attempts.", logging.INFO)

        if failover:
            self._report("Master is confirmed to be down or "
                         "unreachable.", logging.CRITICAL, False)
            try:
                self.rpl.topology.master.disconnect()
            except:
                pass
            if failover_mode == "auto":
                # Candidates first, then slaves.
                self._report("Failover starting in 'auto' mode...")
                res = self.rpl.topology.failover(self.rpl.candidates,
                                                 False)
            elif failover_mode == "elect":
                # Candidates only.
                self._report("Failover starting in 'elect' mode...")
                res = self.rpl.topology.failover(self.rpl.candidates,
                                                 True)
            else:
                # Mode 'fail': report, run post-fail script, stop.
                msg = _FAILOVER_ERROR.format("Master has failed and "
                                             "automatic failover is "
                                             "not enabled. ")
                self._report(msg, logging.CRITICAL, False)
                # Execute post failover script
                try:
                    self.rpl.topology.run_script(post_fail, False,
                                                 [old_host, old_port])
                except Exception as err:  # pylint: disable=W0703
                    self._report("# Post fail script failed! {0}"
                                 "".format(err), level=logging.ERROR)
                raise UtilRplError(msg, _FAILOVER_ERRNO)
            if not res:
                msg = _FAILOVER_ERROR.format("An error was encountered "
                                             "during failover. ")
                self._report(msg, logging.CRITICAL, False)
                # Execute post failover script
                try:
                    self.rpl.topology.run_script(post_fail, False,
                                                 [old_host, old_port])
                except Exception as err:  # pylint: disable=W0703
                    self._report("# Post fail script failed! {0}"
                                 "".format(err), level=logging.ERROR)
                raise UtilRplError(msg)

            # Failover succeeded: adopt the new master and rebuild the
            # slave list before resuming the watch.
            self.rpl.master = self.rpl.topology.master
            self.master = self.rpl.master
            self.rpl.topology.remove_discovered_slaves()
            self.rpl.topology.discover_slaves()
            self.list_data = None
            print("\nFailover daemon will restart in 5 seconds.")
            time.sleep(5)
            failover = False
            # Execute post failover script
            try:
                self.rpl.topology.run_script(post_fail, False,
                                             [old_host, old_port,
                                              self.rpl.master.host,
                                              self.rpl.master.port])
            except Exception as err:  # pylint: disable=W0703
                self._report("# Post fail script failed! {0}"
                             "".format(err), level=logging.ERROR)

            # Unregister existing instances from slaves
            self._report("Unregistering existing instances from slaves.",
                         logging.INFO, False)
            self.unregister_slaves(self.rpl.topology)

            # Register instance on the new master
            msg = ("Registering instance on new master "
                   "{0}:{1}.").format(self.master.host, self.master.port)
            self._report(msg, logging.INFO, False)
            failover_mode = self.register_instance()

        # discover slaves if option was specified at startup
        elif (self.options.get("discover", None) is not None
              and not first_pass):
            # Force refresh of health list if new slaves found
            if self.rpl.topology.discover_slaves():
                self.list_data = None

        # Check existence of errant transactions on slaves
        errant_tnx = self.rpl.topology.find_errant_transactions()
        if errant_tnx:
            if pedantic:
                print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port,
                                               ", ".join(tnx_set)))
                    print("# {0}".format(errant_msg))
                    self._report(errant_msg, logging.WARN, False)
                # Raise an exception (to stop) if pedantic mode is ON
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the "
                                   "--pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))
            else:
                # Not pedantic: record a warning and keep running.
                if self.rpl.logging:
                    warn_msg = ("{0} Check log for more "
                                "details.".format(_ERRANT_TNX_ERROR))
                else:
                    warn_msg = _ERRANT_TNX_ERROR
                self.add_warning("errant_tnx", warn_msg)
                self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port,
                                               ", ".join(tnx_set)))
                    self._report(errant_msg, logging.WARN, False)
        else:
            self.del_warning("errant_tnx")

        if self.master and self.master.is_alive():
            # Log status
            self._print_warnings()
            self._log_master_status()
            self.list_data = []
            if "health" in self.report_values:
                (health_labels,
                 health_data) = self._format_health_data()
                if health_data:
                    self._log_data("Health Status:", health_labels,
                                   health_data)
            if "gtid" in self.report_values:
                (gtid_labels, gtid_data) = self._format_gtid_data()
                for i, v in enumerate(gtid_data):
                    if v:
                        self._log_data("GTID Status - {0}"
                                       "".format(_GTID_LISTS[i]),
                                       gtid_labels, v)
            if "uuid" in self.report_values:
                (uuid_labels, uuid_data) = self._format_uuid_data()
                if uuid_data:
                    self._log_data("UUID Status:", uuid_labels,
                                   uuid_data)

            # Disconnect the master while waiting for the interval to
            # expire
            self.master.disconnect()

            # Wait for the interval to expire
            time.sleep(self.interval)

            # Reconnect to the master
            self._reconnect_master(self.pingtime)

        first_pass = False

    return True
def run_auto_failover(self, console, failover_mode="auto"): """Run automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail console[in] instance of the failover console class. Returns bool - True = success, raises exception on error """ pingtime = self.options.get("pingtime", 3) exec_fail = self.options.get("exec_fail", None) post_fail = self.options.get("post_fail", None) pedantic = self.options.get('pedantic', False) fail_retry = self.options.get('fail_retry', None) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Require --master-info-repository=TABLE for all slaves if not self.topology.check_master_info_type("TABLE"): msg = "Failover requires --master-info-repository=TABLE for " + \ "all slaves." 
self._report(msg, logging.ERROR, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) print("#\n# Failover console will start in {0} seconds.".format( WARNING_SLEEP_TIME)) time.sleep(WARNING_SLEEP_TIME) # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON if pedantic: raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the --pedantic " "option.".format(_ERRANT_TNX_ERROR)) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # Use try block in case master class has gone away. try: old_host = self.master.host old_port = self.master.port except: old_host = "UNKNOWN" old_port = "UNKNOWN" # If a failover script is provided, check it else check master # using connectivity checks. 
if exec_fail is not None: # Execute failover check script if not os.path.isfile(exec_fail): message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format( path=exec_fail) self._report(message, logging.CRITICAL, False) raise UtilRplError(message) elif not os.access(exec_fail, os.X_OK): message = INSUFFICIENT_FILE_PERMISSIONS.format( path=exec_fail, permissions='execute') self._report(message, logging.CRITICAL, False) raise UtilRplError(message) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(exec_fail, None, [old_host, old_port], self.verbose) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report("# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for pingtime seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." % \ pingtime self._report(msg, logging.INFO, False) time.sleep(pingtime) try: self.topology.master.connect() except: pass # If user specified a master fail retry, wait for the # predetermined time and attempt to check the master again. if fail_retry is not None and \ not self.topology.master.is_alive(): msg = "Master is still not reachable. Waiting for %s " \ "seconds to retry detection." % fail_retry self._report(msg, logging.INFO, False) time.sleep(fail_retry) try: self.topology.master.connect() except: pass # Check the master again. If no connection or lost connection, # try ping. This performs the timeout threshold for detecting # a down master. If still not alive, try to reconnect and if # connection fails after 3 attempts, failover. 
if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True i = 0 while i < 3: try: self.topology.master.connect() failover = False # Master is now connected again break except: pass time.sleep(pingtime) i += 1 if failover: self._report("Failed to reconnect to the master after " "3 attemps.", logging.INFO) else: self._report("Master is Ok. Resuming watch.", logging.INFO) if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % ("Master has failed and automatic " "failover is not enabled. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % ("An error was encountered " "during failover. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." 
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port, self.master.host, self.master.port]) # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance on the new master self._report("Registering instance on master.", logging.INFO, False) failover_mode = console.register_instance() # discover slaves if option was specified at startup elif (self.options.get("discover", None) is not None and not first_pass): # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: if pedantic: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the " "--pedantic " "option.".format(_ERRANT_TNX_ERROR)) else: if self.logging: warn_msg = ("{0} Check log for more " "details.".format(_ERRANT_TNX_ERROR)) else: warn_msg = _ERRANT_TNX_ERROR console.add_warning('errant_tnx', warn_msg) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) self._report(errant_msg, logging.WARN, False) else: console.del_warning('errant_tnx') res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False 
return True
def auto_failover(self, interval): """Automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.command.failover_console import FailoverConsole from mysql.utilities.common.tools import ping_host from mysql.utilities.common.tools import execute_script failover_mode = self.options.get("failover_mode", "auto") pingtime = self.options.get("pingtime", 3) timeout = self.options.get("timeout", 3) exec_fail = self.options.get("exec_fail", None) force = self.options.get("force", False) post_fail = self.options.get("post_fail", None) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Check privileges self._report("# Checking privileges.") errors = self.topology.check_privileges(failover_mode != 'fail') if len(errors): msg = "User %s on %s does not have sufficient privileges to " + \ "execute the %s command." for error in errors: self._report(msg % (error[0], error[1], command), logging.CRITICAL) raise UtilRplError("Not enough privileges to execute command.") # Test failover script. If it doesn't exist, fail. no_exec_fail_msg = "Failover check script cannot be found. Please " + \ "check the path and filename for accuracy and " + \ "restart the failover console." 
if exec_fail is not None and not os.path.exists(fail_check): self._report(no_exec_fail_msg, logging.CRITICAL, False) raise UtilRplError(no_exec_fail_msg) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report("Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "Failover mode changed to 'FAIL'. Console will start in 5 seconds." time.sleep(5) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # If a failover script is provided, check it else check master # using connectivity checks. if exec_fail is not None: # Execute failover check script if not os.path.exists(exec_fail): self._report(no_exec_fail_msg, logging.CRITICAL, False) raise UtilRplError(no_exec_fail_msg) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(script) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report("# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for timeout seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." % \ timeout self._report(msg, logging.INFO, False) time.sleep(timeout) try: self.topology.master.connect() except: self._report("Cannot reconnect to master.", logging.INFO, False) # Check the master again. 
If no connection or lost connection, # try ping and if still not alive, failover. This performs the # timeout threshold for detecting a down master. if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % \ "Master has failed and automatic failover is not enabled. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % "An error was encountered " + \ "during failover. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." 
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False) # discover slaves if option was specified at startup elif self.options.get("discover", None) is not None \ and not first_pass: # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) return True
def run(self):
    """Run automatic failover for the failover daemon.

    This method implements the automatic failover facility. It uses the
    existing failover() method of the topology (self.rpl.topology) to
    conduct failover. Configuration (mode, ping time, check/post scripts,
    pedantic flag) is read from self.mode and self.options.

    When the master goes down, the method can perform one of three
    actions depending on the failover mode:

        1) 'auto'  - failover to list of candidates first, then slaves
        2) 'elect' - failover to list of candidates only
        3) fail    - report error and stop

    Returns bool - True = success, raises UtilRplError on error
    """
    failover_mode = self.mode
    pingtime = self.options.get("pingtime", 3)
    exec_fail = self.options.get("exec_fail", None)
    post_fail = self.options.get("post_fail", None)
    pedantic = self.options.get("pedantic", False)

    # Only works for GTID_MODE=ON
    if not self.rpl.topology.gtid_enabled():
        msg = ("Topology must support global transaction ids and have "
               "GTID_MODE=ON.")
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)

    # Require --master-info-repository=TABLE for all slaves
    if not self.rpl.topology.check_master_info_type("TABLE"):
        msg = ("Failover requires --master-info-repository=TABLE for "
               "all slaves.")
        self._report(msg, logging.ERROR, False)
        raise UtilRplError(msg)

    # Check for mixing IP and hostnames
    if not self.rpl.check_host_references():
        print("# WARNING: {0}".format(HOST_IP_WARNING))
        self._report(HOST_IP_WARNING, logging.WARN, False)
        print("#\n# Failover daemon will start in 10 seconds.")
        time.sleep(10)

    # Test failover script. If it doesn't exist, fail.
    no_exec_fail_msg = ("Failover check script cannot be found. Please "
                        "check the path and filename for accuracy and "
                        "restart the failover daemon.")
    if exec_fail is not None and not os.path.exists(exec_fail):
        self._report(no_exec_fail_msg, logging.CRITICAL, False)
        raise UtilRplError(no_exec_fail_msg)

    # Check existence of errant transactions on slaves
    errant_tnx = self.rpl.topology.find_errant_transactions()
    if errant_tnx:
        print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
        self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
        for host, port, tnx_set in errant_tnx:
            errant_msg = (" - For slave '{0}@{1}': "
                          "{2}".format(host, port, ", ".join(tnx_set)))
            print("# {0}".format(errant_msg))
            self._report(errant_msg, logging.WARN, False)
        # Raise an exception (to stop) if pedantic mode is ON
        if pedantic:
            msg = ("{0} Note: If you want to ignore this issue, please do "
                   "not use the --pedantic option."
                   "".format(_ERRANT_TNX_ERROR))
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

    self._report("Failover daemon started.", logging.INFO, False)
    self._report("Failover mode = {0}.".format(failover_mode),
                 logging.INFO, False)

    # Main loop - loop and fire on interval.
    # NOTE(review): 'done' is never set inside this loop, so the daemon
    # runs until an exception is raised — confirm this is intended.
    done = False
    first_pass = True
    failover = False
    while not done:
        # Use try block in case master class has gone away.
        try:
            old_host = self.rpl.master.host
            old_port = self.rpl.master.port
        except:
            old_host = "UNKNOWN"
            old_port = "UNKNOWN"

        # If a failover script is provided, check it else check master
        # using connectivity checks.
        if exec_fail is not None:
            # Execute failover check script
            if not os.path.exists(exec_fail):
                self._report(no_exec_fail_msg, logging.CRITICAL, False)
                raise UtilRplError(no_exec_fail_msg)
            else:
                self._report("# Spawning external script for failover "
                             "checking.")
                # Exit status 0 from the script means the master is
                # considered healthy; anything else triggers failover.
                res = execute_script(exec_fail, None, [old_host, old_port],
                                     self.rpl.verbose)
                if res == 0:
                    self._report("# Failover check script completed "
                                 "Ok. Failover averted.")
                else:
                    self._report("# Failover check script failed. "
                                 "Failover initiated", logging.WARN)
                    failover = True
        else:
            # Check the master. If not alive, wait for pingtime seconds
            # and try again.
            if self.rpl.topology.master is not None and \
               not self.rpl.topology.master.is_alive():
                msg = ("Master may be down. Waiting for {0} seconds."
                       "".format(pingtime))
                self._report(msg, logging.INFO, False)
                time.sleep(pingtime)
                try:
                    self.rpl.topology.master.connect()
                except:
                    pass

            # Check the master again. If no connection or lost connection,
            # try ping. This performs the timeout threshold for detecting
            # a down master. If still not alive, try to reconnect and if
            # connection fails after 3 attempts, failover.
            if self.rpl.topology.master is None or \
               not ping_host(self.rpl.topology.master.host, pingtime) or \
               not self.rpl.topology.master.is_alive():
                failover = True
                # NOTE(review): uses self.pingtime here rather than the
                # local 'pingtime' read from options above — confirm both
                # carry the same value.
                if self._reconnect_master(self.pingtime):
                    failover = False  # Master is now connected again
                if failover:
                    self._report("Failed to reconnect to the master after "
                                 "3 attempts.", logging.INFO)

        if failover:
            self._report("Master is confirmed to be down or "
                         "unreachable.", logging.CRITICAL, False)
            try:
                self.rpl.topology.master.disconnect()
            except:
                pass

            if failover_mode == "auto":
                self._report("Failover starting in 'auto' mode...")
                res = self.rpl.topology.failover(self.rpl.candidates, False)
            elif failover_mode == "elect":
                self._report("Failover starting in 'elect' mode...")
                res = self.rpl.topology.failover(self.rpl.candidates, True)
            else:
                # Mode is 'fail': report, run the post-failover script,
                # and stop the daemon.
                msg = _FAILOVER_ERROR.format("Master has failed and "
                                             "automatic failover is "
                                             "not enabled. ")
                self._report(msg, logging.CRITICAL, False)
                # Execute post failover script
                self.rpl.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                raise UtilRplError(msg, _FAILOVER_ERRNO)
            if not res:
                msg = _FAILOVER_ERROR.format("An error was encountered "
                                             "during failover. ")
                self._report(msg, logging.CRITICAL, False)
                # Execute post failover script
                self.rpl.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                raise UtilRplError(msg)
            # Adopt the newly elected master and refresh the slave list.
            self.rpl.master = self.rpl.topology.master
            self.master = self.rpl.master
            self.rpl.topology.remove_discovered_slaves()
            self.rpl.topology.discover_slaves()
            self.list_data = None
            print("\nFailover daemon will restart in 5 seconds.")
            time.sleep(5)
            failover = False
            # Execute post failover script
            self.rpl.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.rpl.master.host,
                                          self.rpl.master.port])
            # Unregister existing instances from slaves
            self._report("Unregistering existing instances from slaves.",
                         logging.INFO, False)
            self.unregister_slaves(self.rpl.topology)
            # Register instance on the new master
            msg = ("Registering instance on new master "
                   "{0}:{1}.").format(self.master.host, self.master.port)
            self._report(msg, logging.INFO, False)
            failover_mode = self.register_instance()

        # discover slaves if option was specified at startup
        elif (self.options.get("discover", None) is not None
              and not first_pass):
            # Force refresh of health list if new slaves found
            if self.rpl.topology.discover_slaves():
                self.list_data = None

        # Check existence of errant transactions on slaves
        errant_tnx = self.rpl.topology.find_errant_transactions()
        if errant_tnx:
            if pedantic:
                print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port,
                                               ", ".join(tnx_set)))
                    print("# {0}".format(errant_msg))
                    self._report(errant_msg, logging.WARN, False)
                # Raise an exception (to stop) if pedantic mode is ON
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the "
                                   "--pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))
            else:
                # Non-pedantic: record a warning and keep running.
                if self.rpl.logging:
                    warn_msg = ("{0} Check log for more "
                                "details.".format(_ERRANT_TNX_ERROR))
                else:
                    warn_msg = _ERRANT_TNX_ERROR
                self.add_warning("errant_tnx", warn_msg)
                self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port,
                                               ", ".join(tnx_set)))
                    self._report(errant_msg, logging.WARN, False)
        else:
            self.del_warning("errant_tnx")

        if self.master and self.master.is_alive():
            # Log status
            self._print_warnings()
            self._log_master_status()

            self.list_data = []
            if "health" in self.report_values:
                (health_labels,
                 health_data) = self._format_health_data()
                if health_data:
                    self._log_data("Health Status:", health_labels,
                                   health_data)
            if "gtid" in self.report_values:
                (gtid_labels, gtid_data) = self._format_gtid_data()
                for i, v in enumerate(gtid_data):
                    if v:
                        self._log_data("GTID Status - {0}"
                                       "".format(_GTID_LISTS[i]),
                                       gtid_labels, v)
            if "uuid" in self.report_values:
                (uuid_labels, uuid_data) = self._format_uuid_data()
                if uuid_data:
                    self._log_data("UUID Status:", uuid_labels,
                                   uuid_data)

            # Disconnect the master while waiting for the interval to
            # expire
            self.master.disconnect()

            # Wait for the interval to expire
            time.sleep(self.interval)

            # Reconnect to the master
            self._reconnect_master(self.pingtime)

        first_pass = False

    return True
def run_auto_failover(self, console, failover_mode="auto"): """Run automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail console[in] instance of the failover console class. Returns bool - True = success, raises exception on error """ pingtime = self.options.get("pingtime", 3) exec_fail = self.options.get("exec_fail", None) post_fail = self.options.get("post_fail", None) pedantic = self.options.get('pedantic', False) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Require --master-info-repository=TABLE for all slaves if not self.topology.check_master_info_type("TABLE"): msg = "Failover requires --master-info-repository=TABLE for " + \ "all slaves." 
self._report(msg, logging.ERROR, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) print("#\n# Failover console will start in {0} seconds.".format( WARNING_SLEEP_TIME)) time.sleep(WARNING_SLEEP_TIME) # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON if pedantic: raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the --pedantic " "option.".format(_ERRANT_TNX_ERROR)) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # Use try block in case master class has gone away. try: old_host = self.master.host old_port = self.master.port except: old_host = "UNKNOWN" old_port = "UNKNOWN" # If a failover script is provided, check it else check master # using connectivity checks. 
if exec_fail is not None: # Execute failover check script if not os.path.isfile(exec_fail): message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format( path=exec_fail) self._report(message, logging.CRITICAL, False) raise UtilRplError(message) elif not os.access(exec_fail, os.X_OK): message = INSUFFICIENT_FILE_PERMISSIONS.format( path=exec_fail, permissions='execute') self._report(message, logging.CRITICAL, False) raise UtilRplError(message) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(exec_fail, None, [old_host, old_port], self.verbose) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report("# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for pingtime seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." % \ pingtime self._report(msg, logging.INFO, False) time.sleep(pingtime) try: self.topology.master.connect() except: pass # Check the master again. If no connection or lost connection, # try ping. This performs the timeout threshold for detecting # a down master. If still not alive, try to reconnect and if # connection fails after 3 attempts, failover. 
if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True i = 0 while i < 3: try: self.topology.master.connect() failover = False # Master is now connected again break except: pass time.sleep(pingtime) i += 1 if failover: self._report("Failed to reconnect to the master after " "3 attemps.", logging.INFO) if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % ("Master has failed and automatic " "failover is not enabled. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % ("An error was encountered " "during failover. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." 
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port, self.master.host, self.master.port]) # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance on the new master self._report("Registering instance on master.", logging.INFO, False) failover_mode = console.register_instance() # discover slaves if option was specified at startup elif (self.options.get("discover", None) is not None and not first_pass): # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: if pedantic: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the " "--pedantic " "option.".format(_ERRANT_TNX_ERROR)) else: if self.logging: warn_msg = ("{0} Check log for more " "details.".format(_ERRANT_TNX_ERROR)) else: warn_msg = _ERRANT_TNX_ERROR console.add_warning('errant_tnx', warn_msg) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) self._report(errant_msg, logging.WARN, False) else: console.del_warning('errant_tnx') res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False 
return True