def _check_candidate_switch(group_id, slave_id):
    """Validate a switchover candidate and schedule the next step.

    The checks below run in order and the first failing one aborts the
    procedure. When all of them pass, the ``BLOCK_WRITE_SWITCH`` event is
    triggered within the current procedure with the group id, the uuid of
    the candidate's master and the candidate's uuid.

    :param group_id: Group's id.
    :param slave_id: Identifier of the candidate, resolved through
                     ``_retrieve_server``.
    :raises GroupError: If the group has no master or the candidate does
                        not replicate from the group's master.
    :raises ServerError: If the candidate is already the master, if the
                         replication checks report issues or if its
                         status is neither SECONDARY nor SPARE.
    """
    group = _server.Group.fetch(group_id)
    if not group.master:
        message = (
            "Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, )
        )
        raise _errors.GroupError(message)

    candidate = _retrieve_server(slave_id, group_id)
    candidate.connect()

    if group.master == candidate.uuid:
        message = "Candidate slave (%s) is already master." % (slave_id, )
        raise _errors.ServerError(message)

    # Anything reported here prevents the candidate from acting as master.
    master_issues = _replication.check_master_issues(candidate)
    if master_issues:
        message = (
            "Server (%s) is not a valid candidate slave "
            "due to the following reason(s): (%s)." %
            (candidate.uuid, master_issues)
        )
        raise _errors.ServerError(message)

    # Anything reported here means the candidate is not a healthy slave.
    slave_issues = _replication.check_slave_issues(candidate)
    if slave_issues:
        message = (
            "Server (%s) is not a valid candidate slave "
            "due to the following reason: (%s)." %
            (candidate.uuid, slave_issues)
        )
        raise _errors.ServerError(message)

    # The candidate must be replicating from the group's current master.
    master_uuid = _replication.slave_has_master(candidate)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        message = (
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid)
        )
        raise _errors.GroupError(message)

    permitted = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    if candidate.status not in permitted:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(
        BLOCK_WRITE_SWITCH, group_id, master_uuid, str(candidate.uuid)
    )
def _check_candidate_switch(group_id, slave_id):
    """Validate a switchover candidate and schedule the next step.

    The replication helpers used here return a pair: a flag telling
    whether there are issues and the details that are put in the error
    message. The checks run in order and the first failing one aborts the
    procedure. When all of them pass, the ``BLOCK_WRITE_SWITCH`` event is
    triggered within the current procedure.

    :param group_id: Group's id.
    :param slave_id: Identifier of the candidate, resolved through
                     ``_retrieve_server``.
    :raises GroupError: If the group has no master or the candidate does
                        not replicate from the group's master.
    :raises ServerError: If the candidate is already the master, if the
                         replication checks report issues or if its
                         status is neither SECONDARY nor SPARE.
    """
    group = _server.Group.fetch(group_id)
    if not group.master:
        message = (
            "Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, )
        )
        raise _errors.GroupError(message)

    candidate = _retrieve_server(slave_id, group_id)
    candidate.connect()

    if group.master == candidate.uuid:
        message = "Candidate slave (%s) is already master." % (slave_id, )
        raise _errors.ServerError(message)

    # Can the candidate act as a master?
    has_master_issues, master_details = \
        _replication.check_master_issues(candidate)
    if has_master_issues:
        message = (
            "Server (%s) is not a valid candidate slave "
            "due to the following reason(s): (%s)." %
            (candidate.uuid, master_details)
        )
        raise _errors.ServerError(message)

    # Is the candidate currently a healthy slave?
    has_slave_issues, slave_details = \
        _replication.check_slave_issues(candidate)
    if has_slave_issues:
        message = (
            "Server (%s) is not a valid candidate slave "
            "due to the following reason: (%s)." %
            (candidate.uuid, slave_details)
        )
        raise _errors.ServerError(message)

    # The candidate must be replicating from the group's current master.
    master_uuid = _replication.slave_has_master(candidate)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        message = (
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid)
        )
        raise _errors.GroupError(message)

    permitted = (_server.MySQLServer.SECONDARY, _server.MySQLServer.SPARE)
    if candidate.status not in permitted:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(
        BLOCK_WRITE_SWITCH, group_id, master_uuid, str(candidate.uuid)
    )
def _health(group_id):
    """Build an availability report for every server in a group.

    :param group_id: Group's id.
    :return: Dictionary keyed by the server's uuid (as a string). Each
             value holds ``is_alive``, ``status`` and ``threads``.
             ``threads`` is an empty dictionary when nothing was found,
             the value returned by the slave check when it reports
             issues, or a message when the server replicates from a
             master other than the group's.
    :raises GroupError: If the group does not exist.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    report = {}
    for server in group.servers():
        reachable = False
        is_master = (group.master == server.uuid)
        findings = {}
        status = server.status
        try:
            server.connect()
            reachable = True
            if not is_master:
                slave_issues = _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                if slave_issues:
                    # Issues found by the slave check take precedence.
                    findings = slave_issues
                elif group.master is None or \
                    str(group.master) != str_master_uuid:
                    findings = \
                        "Group has master (%s) but server is connected " \
                        "to master (%s)." % \
                        (group.master, str_master_uuid)
        except _errors.DatabaseError:
            # The server could not be contacted or queried.
            status = _server.MySQLServer.FAULTY

        report[str(server.uuid)] = {
            "is_alive" : reachable,
            "status" : status,
            "threads" : findings
        }

    return report
def execute(self, group_id):
    """Check if any server within a group has failed.

    :param group_id: Group's id.
    :return: CommandResult with two result sets: one row per server with
             its status, replication flags and executed GTIDs, and a
             list of issues found while comparing each server's master
             with the group's master.
    :raises GroupError: If the group does not exist.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    flag_names = [
        'is_not_running', 'is_not_configured', 'io_not_running',
        'sql_not_running', 'io_error', 'sql_error',
    ]
    info = ResultSet(
        names=['uuid', 'is_alive', 'status'] + flag_names +
              ['gtid_executed'],
        types=[str, bool, str] + [bool] * 4 + [str, str, str]
    )
    issues = ResultSet(names=['issue'], types=[str])

    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        status = server.status
        # These are used when server is not contactable.
        why_slave_issues = dict.fromkeys(flag_names, False)
        gtid_executed = "UNKNOWN"
        try:
            # TODO: CHECK WHETHER WE SHOULD USE IS_ALIVE OR NOT.
            # NOTE(review): is_alive is read as an attribute and not
            # called; if it is a method this test is always true - confirm.
            if server.is_alive:
                server.connect()
                alive = True
                if not is_master:
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(server)
                    str_master_uuid = _replication.slave_has_master(server)
                    wrong_master = (
                        group.master is None or
                        str(group.master) != str_master_uuid
                    )
                    if wrong_master and not slave_issues:
                        issues.append_row([
                            "Group has master (%s) but server is connected "
                            "to master (%s)." %
                            (group.master, str_master_uuid)
                        ])
                gtid_executed = server.get_gtid_status()[0].GTID_EXECUTED
            else:
                status = _server.MySQLServer.FAULTY
        except _errors.DatabaseError:
            status = _server.MySQLServer.FAULTY

        row = [server.uuid, alive, status]
        row.extend(why_slave_issues[name] for name in flag_names)
        # Present the GTID set on a single line.
        row.append(' '.join(gtid_executed.splitlines()))
        info.append_row(row)

    return CommandResult(None, results=[info, issues])
def _do_find_candidate(group_id, event):
    """Pick the server that should replace the group's current master.

    Servers that are the current master, FAULTY or SPARE are skipped. A
    remaining server is eligible when the master check reports no issues,
    it replicates from the group's master (if there is one) and, for a
    switchover, the slave check reports no issues. An eligible server
    replaces the one chosen so far when it is zero transactions behind
    it, so the server returned is the last eligible one that was not
    behind the previous choice.

    This function does not consider purged transactions and delays in
    the slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :param event: Event on whose behalf the search runs. The slave check
                  is executed only for ``FIND_CANDIDATE_SWITCH``.
    :return: Return the uuid of the best candidate to become a master in
             the group.
    :raises GroupError: If no server qualifies.
    """
    forbidden_status = (
        _server.MySQLServer.FAULTY, _server.MySQLServer.SPARE
    )
    group = _server.Group.fetch(group_id)
    master_uuid = str(group.master) if group.master else None

    best_uuid = None
    best_gtid_status = None
    for candidate in group.servers():
        if master_uuid == str(candidate.uuid):
            continue
        if candidate.status in forbidden_status:
            continue

        try:
            candidate.connect()
            gtid_status = candidate.get_gtid_status()
            master_issues, why_master_issues = \
                _replication.check_master_issues(candidate)

            slave_issues, why_slave_issues = False, {}
            if event == FIND_CANDIDATE_SWITCH:
                slave_issues, why_slave_issues = \
                    _replication.check_slave_issues(candidate)

            has_valid_master = (
                master_uuid is None or
                _replication.slave_has_master(candidate) == master_uuid
            )

            # Compare against the current choice, if there is one. An
            # invalid GTID set leaves the distance at zero.
            n_behind = 0
            if best_gtid_status:
                try:
                    n_behind = _replication.get_slave_num_gtid_behind(
                        candidate, best_gtid_status)
                except _errors.InvalidGtidError:
                    pass

            eligible = (
                not master_issues and has_valid_master and not slave_issues
            )
            if n_behind == 0 and eligible:
                best_gtid_status = gtid_status
                best_uuid = str(candidate.uuid)
            else:
                _LOGGER.warning(
                    "Candidate (%s) cannot become a master due to the "
                    "following reasons: issues to become a "
                    "master (%s), prerequistes as a slave (%s), valid "
                    "master (%s).",
                    candidate.uuid, why_master_issues, why_slave_issues,
                    has_valid_master)
        except _errors.DatabaseError as error:
            _LOGGER.warning("Error accessing candidate (%s): %s.",
                            candidate.uuid, error)

    if not best_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, ))
    return best_uuid
def _run(self):
    """Function that verifies servers' availabilities.

    Runs until the detector's check flag is cleared. In each round the
    group is fetched again and every server is handled as follows:

    * FAULTY servers only get their connections killed.
    * Servers that answer within the detection timeout are considered
      fine. If slave deep checks are enabled and the server is not the
      master, its replication is also checked while the master is alive:
      a server with slave issues is set to SPARE unless the only problem
      is an I/O error with errno 2003, which is just logged.
    * Servers that do not answer are counted in a quarantine. Once the
      number of failed rounds reaches the configured detections and the
      group allows it, the server is set to FAULTY, its connections are
      killed and ``REPORT_FAILURE`` is triggered and waited for.

    Servers that answered in the current round are removed from the
    quarantine. Executor and database errors are ignored and any other
    error is logged, so the detector keeps running.
    """
    from mysql.fabric.server import (
        Group,
        MySQLServer,
        ConnectionManager,
    )

    ignored_status = [MySQLServer.FAULTY]
    quarantine = {}
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT
    connection_manager = ConnectionManager()
    slave_deep_checks = FailureDetector._SLAVE_DEEP_CHECKS

    _persistence.init_thread()

    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    if server.status in ignored_status:
                        # Server is FAULTY.
                        connection_manager.kill_connections(server)
                        continue

                    if MySQLServer.is_alive(server, detection_timeout):
                        is_master = (group.master == server.uuid)
                        if slave_deep_checks and not is_master:
                            # NOTE(review): group.master may be unset
                            # here; confirm what fetch/is_alive do then.
                            master_server = MySQLServer.fetch(group.master)
                            # Replication is only checked while the
                            # master is alive.
                            if MySQLServer.is_alive(master_server,
                                                    detection_timeout):
                                server.connect()
                                try:
                                    slave_issues, why_slave_issues = \
                                        _replication.check_slave_issues(
                                            server)
                                    if slave_issues:
                                        if (why_slave_issues['io_error'] and
                                            why_slave_issues['io_errno'] ==
                                            2003):
                                            # Nothing to do during
                                            # reconnecting, just logging.
                                            _LOGGER.info(why_slave_issues)
                                        else:
                                            # Slave threads are not
                                            # running.
                                            server.status = MySQLServer.SPARE
                                finally:
                                    # Release the connection even if the
                                    # check fails.
                                    server.disconnect()
                        continue

                    # Server did not answer within the timeout.
                    unreachable.add(server.uuid)
                    _LOGGER.warning(
                        "Server (%s) in group (%s) is unreachable.",
                        server.uuid, self.__group_id
                    )

                    failed_attempts = quarantine.get(server.uuid, 0) + 1
                    quarantine[server.uuid] = failed_attempts
                    unstable = failed_attempts >= detections

                    can_set_faulty = group.can_set_server_faulty(
                        server, get_time()
                    )
                    if unstable and can_set_faulty:
                        # We have to make this transactional and make the
                        # failover (i.e. report failure) robust to
                        # failures. Otherwise, a master might be set to
                        # faulty and a new one never promoted.
                        server.status = MySQLServer.FAULTY
                        connection_manager.kill_connections(server)

                        procedures = trigger(
                            "REPORT_FAILURE", None, str(server.uuid),
                            threading.current_thread().name,
                            MySQLServer.FAULTY, False
                        )
                        executor = _executor.Executor()
                        for procedure in procedures:
                            executor.wait_for_procedure(procedure)

            # Iterate over a snapshot: entries are deleted inside the loop
            # and a dictionary cannot change size while being iterated.
            for uuid in list(quarantine):
                if uuid not in unreachable:
                    del quarantine[uuid]

        except (_errors.ExecutorError, _errors.DatabaseError):
            pass
        except Exception as error:
            _LOGGER.exception(error)

        time.sleep(interval)

    _persistence.deinit_thread()
def _do_find_candidate(group_id, event):
    """Pick the server that should replace the group's current master.

    Servers that are the current master, FAULTY or SPARE are skipped. A
    remaining server is eligible when the master check reports no issues,
    it replicates from the group's master (if there is one) and, for a
    switchover, the slave check reports no issues. An eligible server
    replaces the one chosen so far when it is zero transactions behind
    it, so the server returned is the last eligible one that was not
    behind the previous choice.

    This function does not consider purged transactions and delays in
    the slave while picking up a slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :param event: Event on whose behalf the search runs. The slave check
                  is executed only for ``FIND_CANDIDATE_SWITCH``.
    :return: Return the uuid of the best candidate to become a master in
             the group.
    :raises GroupError: If no server qualifies.
    """
    forbidden_status = (
        _server.MySQLServer.FAULTY, _server.MySQLServer.SPARE
    )
    group = _server.Group.fetch(group_id)
    master_uuid = str(group.master) if group.master else None

    best_uuid = None
    best_gtid_status = None
    for candidate in group.servers():
        if master_uuid == str(candidate.uuid):
            continue
        if candidate.status in forbidden_status:
            continue

        try:
            candidate.connect()
            gtid_status = candidate.get_gtid_status()
            master_issues = _replication.check_master_issues(candidate)

            slave_issues = {}
            if event == FIND_CANDIDATE_SWITCH:
                slave_issues = _replication.check_slave_issues(candidate)

            has_valid_master = (
                master_uuid is None or
                _replication.slave_has_master(candidate) == master_uuid
            )

            # Compare against the current choice, if there is one. An
            # invalid GTID set leaves the distance at zero.
            n_behind = 0
            if best_gtid_status:
                try:
                    n_behind = _replication.get_slave_num_gtid_behind(
                        candidate, best_gtid_status
                    )
                except _errors.InvalidGtidError:
                    pass

            eligible = (
                not master_issues and has_valid_master and not slave_issues
            )
            if n_behind == 0 and eligible:
                best_gtid_status = gtid_status
                best_uuid = str(candidate.uuid)
            else:
                _LOGGER.warning(
                    "Candidate (%s) cannot become a master due to the "
                    "following reasons: issues to become a "
                    "master (%s), prerequistes as a slave (%s), valid "
                    "master (%s).",
                    candidate.uuid, master_issues, slave_issues,
                    has_valid_master
                )
        except _errors.DatabaseError as error:
            _LOGGER.warning(
                "Error accessing candidate (%s).", candidate.uuid,
                exc_info=error
            )

    if not best_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, )
        )
    return best_uuid
def execute(self, group_id, timeout=None):
    """Check if any server within a group has failed.

    :param group_id: Group's id.
    :param timeout: Timeout value after which a server is considered
                    unreachable. If None, zero or a value that cannot be
                    converted to a number is provided,
                    ``DEFAULT_UNREACHABLE_TIMEOUT`` is used.
    :return: CommandResult with two result sets: one row per server with
             its status and replication flags, and a list of issues found
             while comparing each server's master with the group's
             master.
    :raises GroupError: If the group does not exist.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    info = ResultSet(
        names=[
            'uuid', 'is_alive', 'status',
            'is_not_running', 'is_not_configured', 'io_not_running',
            'sql_not_running', 'io_error', 'sql_error'
        ],
        types=[str, bool, str] + [bool] * 4 + [str, str]
    )
    issues = ResultSet(names=['issue'], types=[str])

    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        # Not provided or not a number: do not hand the raw value to the
        # server check, use the default instead.
        timeout = None
    unreachable_timeout = timeout or DEFAULT_UNREACHABLE_TIMEOUT

    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        status = server.status

        # These are used when server is not contactable.
        why_slave_issues = {
            'is_not_running': False,
            'is_not_configured': False,
            'io_not_running': False,
            'sql_not_running': False,
            'io_error': False,
            'sql_error': False,
        }

        try:
            alive = server.is_alive(unreachable_timeout)
            if alive and not is_master:
                server.connect()
                slave_issues, why_slave_issues = \
                    _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                if (group.master is None or str(group.master) !=
                    str_master_uuid) and not slave_issues:
                    issues.append_row([
                        "Group has master (%s) but server is connected "
                        "to master (%s)." %
                        (group.master, str_master_uuid)
                    ])
        except _errors.DatabaseError:
            # The server answered the first check but could not be
            # queried afterwards.
            alive = False

        info.append_row([
            server.uuid,
            alive,
            status,
            why_slave_issues['is_not_running'],
            why_slave_issues['is_not_configured'],
            why_slave_issues['io_not_running'],
            why_slave_issues['sql_not_running'],
            why_slave_issues['io_error'],
            why_slave_issues['sql_error'],
        ])

    return CommandResult(None, results=[info, issues])