Exemple #1
0
    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)
        fail_retry = self.options.get('fail_retry', None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # If user specified a master fail retry, wait for the
                # predetermined time and attempt to check the master again.
                if fail_retry is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master is still not reachable. Waiting for %s " \
                          "seconds to retry detection." % fail_retry
                    self._report(msg, logging.INFO, False)
                    time.sleep(fail_retry)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is now connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)
                    else:
                        self._report("Master is Ok. Resuming watch.",
                                     logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and automatic "
                                             "failover is not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host, self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.", logging.INFO,
                             False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
Exemple #2
0
    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is now connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and automatic "
                                             "failover is not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host, self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.", logging.INFO,
                             False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
        print("WARNING: The --table option is only required for "
              "--format=raw_csv (option ignored).")

    # Build list of files to import
    file_list = []
    for file_name in args:
        # Test if is a file
        if not os.path.isfile(file_name):
            parser.error(FILE_DOES_NOT_EXIST.format(path=file_name))
        # Test if is readable
        try:
            with open(file_name, "r"):
                pass
        except IOError:
            parser.error(
                INSUFFICIENT_FILE_PERMISSIONS.format(permissions="read",
                                                     path=file_name))

        file_list.append(file_name)

    try:
        # record start time
        if opt.verbosity >= 3:
            start_test = time.time()

        # Import all specified files.
        import_file_tasks = []
        for file_name in file_list:
            # Check multiprocess file import.
            # Note: Multiprocessing is only applied at the file level,
            # independently from the system (posix or not).
            if options['multiprocess'] > 1:
        print("WARNING: The --table option is only required for "
              "--format=raw_csv (option ignored).")

    # Build list of files to import
    file_list = []
    for file_name in args:
        # Test if is a file
        if not os.path.isfile(file_name):
            parser.error(FILE_DOES_NOT_EXIST.format(path=file_name))
        # Test if is readable
        try:
            with open(file_name, "r"):
                pass
        except IOError:
            parser.error(
                INSUFFICIENT_FILE_PERMISSIONS.format(permissions="read",
                                                     path=file_name))

        file_list.append(file_name)

    try:
        # record start time
        if opt.verbosity >= 3:
            start_test = time.time()

        # Import all specified files.
        import_file_tasks = []
        for file_name in file_list:
            # Check multiprocess file import.
            # Note: Multiprocessing is only applied at the file level,
            # independently from the system (posix or not).
            if options['multiprocess'] > 1: