def test_init(self):
    """Test basic properties/methods in the ErrorLog.
    """
    # The constructor must keep the input parameters untouched.
    interval = get_time_delta(1)
    now = get_time()
    whens = [30, 40]
    reporters = ["reporter", "reporter"]
    log = ErrorLog(self.server, interval, now, whens, reporters)
    self.assertEqual(log.server_uuid, self.server.uuid)
    self.assertEqual(log.interval, interval)
    self.assertEqual(log.now, now)
    self.assertEqual(log.whens, whens)
    self.assertEqual(log.reporters, reporters)

    # If whens and reporters don't have the same length, an exception
    # is raised.
    interval = get_time_delta(1)
    now = get_time()
    self.assertRaises(
        AssertionError, ErrorLog, self.server, interval, now,
        [0, 0, 0, 0], []
    )
def test_node_view(self):
    """Test basic properties/methods in the MySQLHandler.

    Logs START/STOP events for two Fabric node ids and checks that the
    node-view query returns exactly one row per START event.
    """
    # Retrieve information on Fabric node. Note though that there is no
    # specific view to retrieve such information.
    node_id_1 = _uuid.uuid4()
    node_startup_1 = _utils.get_time()
    _LOGGER.debug("Fabric Node started.",
        extra={
            'subject' : str(node_id_1),
            'category' : MySQLHandler.NODE,
            'type' : MySQLHandler.START,
            'reported' : node_startup_1,
        }
    )

    node_stop_1 = _utils.get_time()
    # BUG FIX: this entry records a STOP event, so the log message must
    # not claim the node "started".
    _LOGGER.debug("Fabric Node stopped.",
        extra={
            'subject' : str(node_id_1),
            'category' : MySQLHandler.NODE,
            'type' : MySQLHandler.STOP,
            'reported' : node_stop_1,
        }
    )

    node_id_2 = _uuid.uuid4()
    node_startup_2 = _utils.get_time()
    _LOGGER.debug("Fabric Node started.",
        extra={
            'subject' : str(node_id_2),
            'category' : MySQLHandler.NODE,
            'type' : MySQLHandler.START,
            'reported' : node_startup_2,
        }
    )

    # The view filters on (category=NODE, type=START), so only the two
    # start rows must come back.
    node_view = ("SELECT subject as node_id, "
                 "TIMEDIFF(UTC_TIMESTAMP(), reported) as node_uptime, "
                 "reported as node_startup FROM log WHERE category = %s "
                 "and type = %s ORDER BY node_id, node_startup"
    )
    persister = _persistence.current_persister()
    res = persister.exec_stmt(node_view,
        {"params" : (
            MySQLHandler.idx_category(MySQLHandler.NODE),
            MySQLHandler.idx_type(MySQLHandler.START)
        )}
    )
    self.assertEqual(len(res), 2)
def _run(self):
    """Function that verifies servers' availabilities.

    Runs until ``self.__check`` is cleared: every iteration probes each
    server in the group, quarantines unreachable ones, and reports a
    failure once a server has missed ``detections`` consecutive probes.
    """
    ignored_status = [MySQLServer.FAULTY]
    quarantine = {}
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT

    _persistence.init_thread()

    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    # Skip servers that are faulty or still reachable;
                    # faulty ones additionally have their cached
                    # connections purged.
                    if server.status in ignored_status or \
                        MySQLServer.is_alive(server, detection_timeout):
                        if server.status == MySQLServer.FAULTY:
                            self.__connection_manager.purge_connections(
                                server
                            )
                        continue

                    unreachable.add(server.uuid)
                    _LOGGER.warning(
                        "Server (%s) in group (%s) is unreachable.",
                        server.uuid, self.__group_id
                    )

                    # Count consecutive failed probes for this server.
                    unstable = False
                    failed_attempts = 0
                    if server.uuid not in quarantine:
                        quarantine[server.uuid] = failed_attempts = 1
                    else:
                        failed_attempts = quarantine[server.uuid] + 1
                        quarantine[server.uuid] = failed_attempts
                    if failed_attempts >= detections:
                        unstable = True

                    can_set_faulty = group.can_set_server_faulty(
                        server, get_time()
                    )
                    if unstable and can_set_faulty:
                        self._spawn_report_failure(server)

            # BUG FIX: deleting entries while iterating the live keys()
            # view raises RuntimeError on Python 3 — snapshot the keys
            # first (on Python 2, keys() already returned a list, so the
            # behavior is unchanged).
            for uuid in list(quarantine.keys()):
                if uuid not in unreachable:
                    del quarantine[uuid]
        except (_errors.ExecutorError, _errors.DatabaseError):
            # Transient store/executor hiccups: retry on next iteration.
            pass
        except Exception as error:
            _LOGGER.exception(error)
        time.sleep(interval / detections)

    _persistence.deinit_thread()
def test_check_instability(self):
    """Test whether a server can be considered unstable or not.
    """
    # Update/Notify and refresh, they should match. The entry reported
    # at now - 11 lies outside the 10-unit interval and is ignored.
    # NOTE: the original also built input_whens/input_reporters lists
    # that were never used; that dead code has been removed.
    interval = get_time_delta(10)
    now = get_time()
    st = ErrorLog(self.server, interval, now, [], [])
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now - get_time_delta(5), "client:2000",
                 "error")
    ErrorLog.add(self.server, now - get_time_delta(11), "client:3000",
                 "error")
    st.refresh()
    self.assertEqual(
        st.is_unstable(n_notifications=1, n_reporters=1,
                       filter_reporter=None),
        True)
    self.assertEqual(
        st.is_unstable(n_notifications=2, n_reporters=2,
                       filter_reporter=None),
        True)
    self.assertEqual(
        st.is_unstable(n_notifications=3, n_reporters=2,
                       filter_reporter=None),
        False)
    self.assertEqual(
        st.is_unstable(n_notifications=2, n_reporters=3,
                       filter_reporter=None),
        False)
    self.assertEqual(
        st.is_unstable(n_notifications=1, n_reporters=1,
                       filter_reporter=["client:2000"]),
        True)
def _node_view():
    """Retrieve information on the Fabric node.
    """
    # Report the node id, how long it has been running and when it was
    # started, all rendered as strings.
    fabric = FabricNode()
    started = fabric.startup
    uptime = _utils.get_time() - started
    return [[str(fabric.uuid), str(uptime), str(started)]]
def _append_error_log(server_id, reporter, error):
    """Check whether the server exist and is not faulty and register error
    log.

    :param server_id: Identifier of the server being reported.
    :param reporter: Who reported the error.
    :param error: Error being reported.
    :return: Tuple with the report timestamp and the server object.
    """
    # NOTE(review): despite the docstring, this version performs no
    # FAULTY-status check before registering the report — confirm whether
    # that check was intentionally dropped.
    server = _retrieve_server(server_id)
    now = get_time()
    _error_log.ErrorLog.add(server, now, reporter, error)
    _LOGGER.warning("Reported issue (%s) for server (%s).", error,
                    server.uuid)
    return (now, server)
def _start(options, config): """Start Fabric server. """ # Remove temporary defaults file, which migh have left behind # by former runs of Fabric. _backup.cleanup_temp_defaults_files() #Configure TTL _setup_ttl(config) # Configure modules that are not dynamic loaded. _server.configure(config) _error_log.configure(config) _failure_detector.configure(config) # Load information on all providers. providers.find_providers() # Load all services into the service manager _services.ServiceManager().load_services(options, config) # Initilize the state store. _persistence.init_thread() # Check the maximum number of threads. _utils.check_number_threads() # Configure Fabric Node. fabric = FabricNode() reported = _utils.get_time() _LOGGER.info( "Fabric node version (%s) started. ", fabric.version, extra={ 'subject' : str(fabric.uuid), 'category' : MySQLHandler.NODE, 'type' : MySQLHandler.START, 'reported' : reported } ) fabric.startup = reported # Start the executor, failure detector and then service manager. In this # scenario, the recovery is sequentially executed after starting the # executor and before starting the service manager. _events.Handler().start() _recovery.recovery() _failure_detector.FailureDetector.register_groups() _services.ServiceManager().start()
def test_init(self):
    """Test basic properties/methods in the ErrorLog.
    """
    # Constructing an ErrorLog must simply store the given parameters.
    delta = get_time_delta(1)
    moment = get_time()
    whens = [30, 40]
    reporters = ["reporter", "reporter"]
    error_log = ErrorLog(self.server, delta, moment, whens, reporters)
    for got, expected in (
        (error_log.server_uuid, self.server.uuid),
        (error_log.interval, delta),
        (error_log.now, moment),
        (error_log.whens, whens),
        (error_log.reporters, reporters),
    ):
        self.assertEqual(got, expected)

    # whens and reporters of different lengths trigger an AssertionError.
    delta = get_time_delta(1)
    moment = get_time()
    self.assertRaises(AssertionError, ErrorLog, self.server, delta,
                      moment, [0, 0, 0, 0], [])
def _append_error_log(server_id, reporter, error):
    """Check whether the server exist and is not faulty and register error
    log.
    """
    server = _retrieve_server(server_id)

    # A server already known to be faulty cannot receive further reports.
    if server.status == _server.MySQLServer.FAULTY:
        raise _errors.ServerError("Server (%s) is already marked as faulty." % (server.uuid, ))

    _LOGGER.warning("Reported issue (%s) for server (%s).", error,
                    server.uuid)
    when = get_time()
    _error_log.ErrorLog.add(server, when, reporter, error)
    return (when, server)
def _start(options, config): """Start Fabric server. """ # Remove temporary defaults file, which migh have left behind # by former runs of Fabric. _backup.cleanup_temp_defaults_files() #Configure TTL _setup_ttl(config) # Configure modules that are not dynamic loaded. _server.configure(config) _error_log.configure(config) _failure_detector.configure(config) # Load information on all providers. providers.find_providers() # Load all services into the service manager _services.ServiceManager().load_services(options, config) # Initilize the state store. _persistence.init_thread() # Check the maximum number of threads. _utils.check_number_threads() # Configure Fabric Node. fabric = FabricNode() reported = _utils.get_time() _LOGGER.info("Fabric node version (%s) started. ", fabric.version, extra={ 'subject': str(fabric.uuid), 'category': MySQLHandler.NODE, 'type': MySQLHandler.START, 'reported': reported }) fabric.startup = reported # Start the executor, failure detector and then service manager. In this # scenario, the recovery is sequentially executed after starting the # executor and before starting the service manager. _events.Handler().start() _recovery.recovery() _failure_detector.FailureDetector.register_groups() _services.ServiceManager().start()
def _append_error_log(server_id, reporter, error):
    """Check whether the server exist and is not faulty and register error
    log.
    """
    server = _retrieve_server(server_id)
    faulty = (server.status == _server.MySQLServer.FAULTY)
    if faulty:
        # Refuse reports against a server already marked faulty.
        raise _errors.ServerError(
            "Server (%s) is already marked as faulty." % (server.uuid, )
        )
    _LOGGER.warning(
        "Reported issue (%s) for server (%s).", error, server.uuid
    )
    timestamp = get_time()
    _error_log.ErrorLog.add(server, timestamp, reporter, error)
    return (timestamp, server)
def fetch(server, interval, now=None, persister=None):
    """Return a ErrorLog object corresponding to the server.

    :param server: Server whose error has been reported.
    :param interval: Interval of interest.
    :param now: Consider from `now` until `now` - `interval`.
    :param persister: Persister to persist the object to.
    :return: ErrorLog object.
    """
    from mysql.fabric.server import MySQLServer
    assert (isinstance(server, MySQLServer))
    # Fall back to the current time when `now` is missing (or falsy).
    if not now:
        now = get_time()
    whens, reporters = ErrorLog.compute(server.uuid, interval, now)
    return ErrorLog(server, interval, now, whens, reporters)
def execute(self):
    """Statistics on the Fabric node.

    It returns information on the Fabric node, specifically a list with
    the following fields: node identification, how long it is running,
    when it was started.
    """
    fabric = FabricNode()
    node_id = fabric.uuid
    node_startup = fabric.startup
    node_uptime = _utils.get_time() - node_startup
    rset = ResultSet(names=('node_id', 'node_uptime', 'node_startup'),
                     types=(str, str, str))
    # BUG FIX: the result set declares str columns, but uuid/timedelta/
    # datetime objects were appended; convert them explicitly, matching
    # the _node_view helper.
    rset.append_row([str(node_id), str(node_uptime), str(node_startup)])
    return CommandResult(None, results=rset)
def fetch(server, interval, now=None, persister=None):
    """Return a ErrorLog object corresponding to the server.

    :param server: Server whose error has been reported.
    :param interval: Interval of interest.
    :param now: Consider from `now` until `now` - `interval`.
    :param persister: Persister to persist the object to.
    :return: ErrorLog object.
    """
    from mysql.fabric.server import MySQLServer
    assert(isinstance(server, MySQLServer))
    # Use the current time when no (truthy) reference time is supplied.
    reference = now if now else get_time()
    whens, reporters = ErrorLog.compute(server.uuid, interval, reference)
    return ErrorLog(server, interval, reference, whens, reporters)
def execute(self):
    """Statistics on the Fabric node.

    It returns information on the Fabric node, specifically a list with
    the following fields: node identification, how long it is running,
    when it was started.
    """
    fabric = FabricNode()
    node_id = fabric.uuid
    node_startup = fabric.startup
    node_uptime = _utils.get_time() - node_startup
    rset = ResultSet(
        names=('node_id', 'node_uptime', 'node_startup'),
        types=(str, str, str)
    )
    # BUG FIX: the columns are declared as str but uuid/timedelta/datetime
    # objects were appended; stringify them as the _node_view helper does.
    rset.append_row([str(node_id), str(node_uptime), str(node_startup)])
    return CommandResult(None, results=rset)
def test_check_instability(self):
    """Test whether a server can be considered unstable or not.
    """
    # Update/Notify and refresh, they should match. The report at
    # now - 11 falls outside the 10-unit interval and must be ignored.
    # NOTE: unused input_whens/input_reporters locals removed (dead code).
    interval = get_time_delta(10)
    now = get_time()
    st = ErrorLog(self.server, interval, now, [], [])
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now - get_time_delta(5), "client:2000",
                 "error")
    ErrorLog.add(self.server, now - get_time_delta(11), "client:3000",
                 "error")
    st.refresh()
    self.assertEqual(
        st.is_unstable(n_notifications=1, n_reporters=1,
                       filter_reporter=None),
        True
    )
    self.assertEqual(
        st.is_unstable(n_notifications=2, n_reporters=2,
                       filter_reporter=None),
        True
    )
    self.assertEqual(
        st.is_unstable(n_notifications=3, n_reporters=2,
                       filter_reporter=None),
        False
    )
    self.assertEqual(
        st.is_unstable(n_notifications=2, n_reporters=3,
                       filter_reporter=None),
        False
    )
    self.assertEqual(
        st.is_unstable(n_notifications=1, n_reporters=1,
                       filter_reporter=["client:2000"]),
        True
    )
def _start(options, config): """Start Fabric server. """ # Configure modules that are not dynamic loaded. _server.configure(config) _error_log.configure(config) _failure_detector.configure(config) # Load all services into the service manager _services.ServiceManager().load_services(options, config) # Initilize the state store. _persistence.init_thread() # Check the maximum number of threads. _utils.check_number_threads() # Configure Fabric Node. fabric = FabricNode() reported = _utils.get_time() _LOGGER.info( "Fabric node starting.", extra={ 'subject' : str(fabric.uuid), 'category' : MySQLHandler.NODE, 'type' : MySQLHandler.START, 'reported' : reported } ) fabric.startup = reported # Start the executor, failure detector and then service manager. In this # scenario, the recovery is sequentially executed after starting the # executor and before starting the service manager. _events.Handler().start() _recovery.recovery() _failure_detector.FailureDetector.register_groups() _services.ServiceManager().start()
def test_persistence(self):
    """Test ErrorLog.

    Exercises add/fetch/remove/refresh round-trips against the state
    store and the raw ``error_log`` SQL similar to the cleanup event's.
    """
    # Update/Notify and fetch, they should match.
    interval = get_time_delta(1)
    now = get_time()
    input_whens = [now, now]
    input_reporters = ["client:1000", "client:2000"]
    st = ErrorLog(self.server, interval, now, input_whens, input_reporters)
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now, "client:2000", "error")
    new_st = ErrorLog.fetch(self.server, interval, now)
    self.assertEqual(st.reporters, new_st.reporters)
    self.assertEqual(st.whens, new_st.whens)

    # Call remove, they should be empty and match.
    interval = get_time_delta(1)
    now = get_time()
    input_whens = []
    input_reporters = []
    ErrorLog.remove(self.server)
    st = ErrorLog(self.server, interval, now, input_whens, input_reporters)
    new_st = ErrorLog.fetch(self.server, interval, now)
    self.assertEqual(st.reporters, new_st.reporters)
    self.assertEqual(st.whens, new_st.whens)

    # Update/Notify and refresh, they should match: the entry reported
    # at now - 11 lies outside the 10-unit interval and is filtered out.
    interval = get_time_delta(10)
    now = get_time()
    input_whens = [now, now - get_time_delta(5)]
    input_reporters = ["client:1000", "client:2000"]
    st = ErrorLog(self.server, interval, now, [], [])
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now - get_time_delta(5), "client:2000",
                 "error")
    ErrorLog.add(self.server, now - get_time_delta(11), "client:3000",
                 "error")
    st.refresh()
    self.assertEqual(set(st.reporters), set(input_reporters))
    self.assertEqual(set(st.whens), set(input_whens))

    # Check whether a statement similar to the one used in the
    # event is fine.
    ErrorLog.remove(self.server)
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now, "client:2000", "error")
    persister = _persistence.current_persister()
    out = persister.exec_stmt(
        "SELECT reported, UTC_TIMESTAMP() as now, "
        "TIMEDIFF(UTC_TIMESTAMP(), reported - MAKETIME(2,0,0)) as diff "
        "FROM error_log")
    _LOGGER.debug("Output test persistence %s.", out)
    self.assertEqual(len(out), 2)
    # The two-hour offset makes the fresh rows look older than one hour,
    # so the DELETE below must remove them all.
    res = persister.exec_stmt(
        "DELETE FROM error_log WHERE "
        "TIMEDIFF(UTC_TIMESTAMP(), reported - MAKETIME(2,0,0)) > "
        "MAKETIME(1,0,0)")
    _LOGGER.debug("Output test persistence %s.", res)
    out = persister.exec_stmt(
        "SELECT reported, UTC_TIMESTAMP() as now, "
        "TIMEDIFF(UTC_TIMESTAMP(), reported - MAKETIME(2,0,0)) as diff "
        "FROM error_log")
    _LOGGER.debug("Output test persistence %s.", out)
    self.assertEqual(len(out), 0)
def _run(self):
    """Function that verifies servers' availabilities.

    Runs until ``self.__check`` is cleared. Each iteration probes every
    server in the group; unreachable servers are quarantined and, after
    enough consecutive misses, reported as faulty via the
    REPORT_FAILURE trigger. When ``slave_deep_checks`` is set, live
    slaves additionally have their replication health inspected.
    """
    from mysql.fabric.server import (
        Group,
        MySQLServer,
        ConnectionManager,
    )

    ignored_status = [MySQLServer.FAULTY]
    quarantine = {}
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT
    connection_manager = ConnectionManager()
    slave_deep_checks = FailureDetector._SLAVE_DEEP_CHECKS

    _persistence.init_thread()

    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    if server.status in ignored_status:
                        # Server is FAULTY: drop its pooled connections.
                        connection_manager.kill_connections(server)
                        continue
                    else:
                        # Server is not FAULTY.
                        if MySQLServer.is_alive(server, detection_timeout):
                            # Server is alive; the extra replication check
                            # depends on the `slave_deep_checks` parameter.
                            if slave_deep_checks:
                                is_master = (group.master == server.uuid)
                                if not is_master:
                                    # Check whether the master is alive.
                                    master_server = MySQLServer.fetch(group.master)
                                    if MySQLServer.is_alive(master_server, detection_timeout):
                                        # Master is alive: verify this
                                        # slave's replication health.
                                        server.connect()
                                        slave_issues, why_slave_issues = \
                                            _replication.check_slave_issues(server)
                                        if slave_issues:
                                            if (why_slave_issues['io_error'] and \
                                                why_slave_issues['io_errno'] == 2003):
                                                # IO errno 2003 means the slave is
                                                # reconnecting; just log it.
                                                _LOGGER.info(why_slave_issues)
                                            else:
                                                # Slave threads are not running:
                                                # demote the server to SPARE.
                                                server.status = MySQLServer.SPARE
                                        server.disconnect()
                            continue
                        else:
                            # Server is unreachable: quarantine it and
                            # count consecutive failed probes.
                            unreachable.add(server.uuid)
                            _LOGGER.warning(
                                "Server (%s) in group (%s) is unreachable.",
                                server.uuid, self.__group_id
                            )

                            unstable = False
                            failed_attempts = 0
                            if server.uuid not in quarantine:
                                quarantine[server.uuid] = failed_attempts = 1
                            else:
                                failed_attempts = quarantine[server.uuid] + 1
                                quarantine[server.uuid] = failed_attempts
                            if failed_attempts >= detections:
                                unstable = True

                            can_set_faulty = group.can_set_server_faulty(
                                server, get_time()
                            )
                            if unstable and can_set_faulty:
                                # We have to make this transactional and make the
                                # failover (i.e. report failure) robust to failures.
                                # Otherwise, a master might be set to faulty and
                                # a new one never promoted.
                                server.status = MySQLServer.FAULTY
                                connection_manager.kill_connections(server)

                                procedures = trigger("REPORT_FAILURE", None,
                                                     str(server.uuid),
                                                     threading.current_thread().name,
                                                     MySQLServer.FAULTY, False
                                )
                                executor = _executor.Executor()
                                for procedure in procedures:
                                    executor.wait_for_procedure(procedure)

            # Drop quarantine entries for servers that responded again.
            # NOTE(review): deleting while iterating the live keys() view
            # raises RuntimeError on Python 3 — confirm target version or
            # snapshot the keys with list().
            for uuid in quarantine.keys():
                if uuid not in unreachable:
                    del quarantine[uuid]
        except (_errors.ExecutorError, _errors.DatabaseError):
            # Transient store/executor errors: retry on next iteration.
            pass
        except Exception as error:
            _LOGGER.exception(error)
        time.sleep(interval)

    _persistence.deinit_thread()
def test_persistence(self):
    """Test ErrorLog.

    Exercises add/fetch/remove/refresh round-trips against the state
    store and the raw ``error_log`` SQL similar to the cleanup event's.
    """
    # Update/Notify and fetch, they should match.
    interval = get_time_delta(1)
    now = get_time()
    input_whens = [ now, now ]
    input_reporters = [ "client:1000", "client:2000" ]
    st = ErrorLog(self.server, interval, now, input_whens, input_reporters)
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now, "client:2000", "error")
    new_st = ErrorLog.fetch(self.server, interval, now)
    self.assertEqual(st.reporters, new_st.reporters)
    self.assertEqual(st.whens, new_st.whens)

    # Call remove, they should be empty and match.
    interval = get_time_delta(1)
    now = get_time()
    input_whens = [ ]
    input_reporters = [ ]
    ErrorLog.remove(self.server)
    st = ErrorLog(self.server, interval, now, input_whens, input_reporters)
    new_st = ErrorLog.fetch(self.server, interval, now)
    self.assertEqual(st.reporters, new_st.reporters)
    self.assertEqual(st.whens, new_st.whens)

    # Update/Notify and refresh, they should match: the entry reported
    # at now - 11 lies outside the 10-unit interval and is filtered out.
    interval = get_time_delta(10)
    now = get_time()
    input_whens = [ now, now - get_time_delta(5) ]
    input_reporters = [ "client:1000", "client:2000" ]
    st = ErrorLog(self.server, interval, now, [], [])
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now - get_time_delta(5), "client:2000",
                 "error")
    ErrorLog.add(self.server, now - get_time_delta(11), "client:3000",
                 "error")
    st.refresh()
    self.assertEqual(set(st.reporters), set(input_reporters))
    self.assertEqual(set(st.whens), set(input_whens))

    # Check whether a statement similar to the one used in the
    # event is fine.
    ErrorLog.remove(self.server)
    ErrorLog.add(self.server, now, "client:1000", "error")
    ErrorLog.add(self.server, now, "client:2000", "error")
    persister = _persistence.current_persister()
    out = persister.exec_stmt(
        "SELECT reported, UTC_TIMESTAMP() as now, "
        "TIMEDIFF(UTC_TIMESTAMP(), reported - MAKETIME(2,0,0)) as diff "
        "FROM error_log"
    )
    _LOGGER.debug("Output test persistence %s.", out)
    self.assertEqual(len(out), 2)
    # The two-hour offset makes the fresh rows look older than one hour,
    # so the DELETE below must remove them all.
    res = persister.exec_stmt(
        "DELETE FROM error_log WHERE "
        "TIMEDIFF(UTC_TIMESTAMP(), reported - MAKETIME(2,0,0)) > "
        "MAKETIME(1,0,0)"
    )
    _LOGGER.debug("Output test persistence %s.", res)
    out = persister.exec_stmt(
        "SELECT reported, UTC_TIMESTAMP() as now, "
        "TIMEDIFF(UTC_TIMESTAMP(), reported - MAKETIME(2,0,0)) as diff "
        "FROM error_log"
    )
    _LOGGER.debug("Output test persistence %s.", out)
    self.assertEqual(len(out), 0)
def _run(self):
    """Function that verifies servers' availabilities.

    Runs until ``self.__check`` is cleared. Each iteration probes every
    server in the group, quarantines unreachable ones, and once a server
    has missed ``detections`` consecutive probes marks it FAULTY and
    fires the REPORT_FAILURE trigger.
    """
    from mysql.fabric.server import (
        Group,
        MySQLServer,
        ConnectionManager,
    )

    ignored_status = [MySQLServer.FAULTY]
    quarantine = {}
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT
    connection_manager = ConnectionManager()

    _persistence.init_thread()

    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    # Skip servers that are faulty or still reachable;
                    # faulty ones have their pooled connections killed.
                    if server.status in ignored_status or \
                        MySQLServer.is_alive(server, detection_timeout):
                        if server.status == MySQLServer.FAULTY:
                            connection_manager.kill_connections(server)
                        continue

                    unreachable.add(server.uuid)
                    _LOGGER.warning(
                        "Server (%s) in group (%s) is unreachable.",
                        server.uuid, self.__group_id
                    )

                    # Count consecutive failed probes for this server.
                    unstable = False
                    failed_attempts = 0
                    if server.uuid not in quarantine:
                        quarantine[server.uuid] = failed_attempts = 1
                    else:
                        failed_attempts = quarantine[server.uuid] + 1
                        quarantine[server.uuid] = failed_attempts
                    if failed_attempts >= detections:
                        unstable = True

                    can_set_faulty = group.can_set_server_faulty(
                        server, get_time()
                    )
                    if unstable and can_set_faulty:
                        # We have to make this transactional and make the
                        # failover (i.e. report failure) robust to failures.
                        # Otherwise, a master might be set to faulty and
                        # a new one never promoted.
                        server.status = MySQLServer.FAULTY
                        connection_manager.kill_connections(server)

                        procedures = trigger("REPORT_FAILURE", None,
                                             str(server.uuid),
                                             threading.current_thread().name,
                                             MySQLServer.FAULTY, False
                        )
                        executor = _executor.Executor()
                        for procedure in procedures:
                            executor.wait_for_procedure(procedure)

            # Drop quarantine entries for servers that responded again.
            # NOTE(review): deleting while iterating the live keys() view
            # raises RuntimeError on Python 3 — confirm target version or
            # snapshot the keys with list().
            for uuid in quarantine.keys():
                if uuid not in unreachable:
                    del quarantine[uuid]
        except (_errors.ExecutorError, _errors.DatabaseError):
            # Transient store/executor errors: retry on next iteration.
            pass
        except Exception as error:
            _LOGGER.exception(error)
        time.sleep(interval / detections)

    _persistence.deinit_thread()