def _block_write_demote(group_id, update_only):
    """Make the group's current master read-only and demote it.

    :param group_id: Identifier of the group whose master is demoted.
    :param update_only: When true, the event that waits for the slaves is
                        not triggered. The flag is also forwarded to
                        ``_set_group_master_replication``.
    :raises GroupError: If the group does not exist or has no master.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))
    if not group.master:
        raise _errors.GroupError(
            "Group (%s) does not have a master." % (group_id, )
        )

    master = _server.MySQLServer.fetch(group.master)
    # Invariant: a server registered as master is either PRIMARY or FAULTY.
    expected_status = (
        _server.MySQLServer.PRIMARY,
        _server.MySQLServer.FAULTY,
    )
    assert master.status in expected_status

    # A FAULTY master is not contacted; only the group's master reference
    # is handled below.
    if master.status == _server.MySQLServer.PRIMARY:
        master.connect()
        master.mode = _server.MySQLServer.READ_ONLY
        master.status = _server.MySQLServer.SECONDARY
        _utils.set_read_only(master, True)

        # NOTE(review): this trigger is placed inside the PRIMARY branch;
        # the original indentation was lost, confirm the nesting.
        if not update_only:
            _events.trigger_within_procedure(
                WAIT_SLAVES_DEMOTE, group_id, str(master.uuid)
            )

    # None is passed as the new master for the group.
    _set_group_master_replication(group, None, update_only)
def _retrieve_group(group_id):
    """Look up a group by its identifier.

    :param group_id: Identifier of the group.
    :return: The object returned by ``Group.fetch`` for the identifier.
    :raises GroupError: If no group is found for the identifier.
    """
    found = _server.Group.fetch(group_id)
    if found:
        return found
    raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))
def start_group_slaves(master_group_id):
    """Start replication on the slave groups registered with a master group.

    Used by events that require a group to start all the slave groups
    registered with it. Enabling a shard is one example: the group must
    start every slave that is registered with it.

    A failure to configure one slave group is logged as a warning and does
    not prevent the remaining slave groups from being configured.

    :param master_group_id: The master group ID. The ID belongs to the
                            master whose slaves need to be started.
    :raises GroupError: If the master group cannot be found.
    """
    master_group = Group.fetch(master_group_id)
    if master_group is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (master_group_id, )
        )

    # master_group.slave_group_ids holds the IDs of the groups that are
    # slaves to this master. The ID is handed over as is:
    # setup_group_replication fetches both groups itself and raises
    # GroupError for an ID that no longer resolves, so a stale ID is logged
    # below instead of failing with an AttributeError on a missing group.
    for slave_group_id in master_group.slave_group_ids:
        try:
            setup_group_replication(master_group_id, slave_group_id)
        except (_errors.GroupError, _errors.DatabaseError) as error:
            _LOGGER.warning(
                "Error while configuring group replication between "
                "(%s) and (%s): (%s).",
                master_group_id, slave_group_id, error
            )
def _retrieve_server(server_id, group_id=None):
    """Look up a server and verify its group membership.

    :param server_id: Identifier handed to ``MySQLServer.fetch`` (a UUID or
                      a HOST:PORT).
    :param group_id: If not None, the group the server must belong to.
    :return: The fetched server object.
    :raises ServerError: If the server is not found.
    :raises GroupError: If the server has no group, or belongs to a group
                        other than ``group_id``.
    """
    server = _server.MySQLServer.fetch(server_id)
    if not server:
        raise _errors.ServerError(
            "Server (%s) does not exist." % (server_id, )
        )

    if not server.group_id:
        raise _errors.GroupError(
            "Server (%s) does not belong to a group." % (server_id, )
        )

    # The membership check is skipped when the caller names no group.
    in_other_group = group_id is not None and group_id != server.group_id
    if in_other_group:
        raise _errors.GroupError(
            "Group (%s) does not contain server (%s)." % (group_id, server_id)
        )

    return server
def _check_candidate_switch(group_id, slave_id):
    """Check that a candidate slave can replace the group's current master.

    When every check passes, the BLOCK_WRITE_SWITCH event is triggered with
    the group, the current master and the candidate.

    :param group_id: Identifier of the group where the switch happens.
    :param slave_id: Identifier of the candidate slave.
    :raises GroupError: If the group does not exist, has no master, or the
                        candidate replicates from a different master.
    :raises ServerError: If the candidate is already the master, has issues
                         reported by the replication checks, or its status
                         is neither SECONDARY nor SPARE.
    """
    allowed_status = (
        _server.MySQLServer.SECONDARY,
        _server.MySQLServer.SPARE,
    )

    group = _server.Group.fetch(group_id)
    # Report an unknown group explicitly instead of failing on group.master.
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    if not group.master:
        raise _errors.GroupError(
            "Group (%s) does not contain a valid "
            "master. Please, run a promote or failover." % (group_id, )
        )

    # Also validates that the candidate is registered in this group.
    slave = _retrieve_server(slave_id, group_id)
    slave.connect()

    if group.master == slave.uuid:
        raise _errors.ServerError(
            "Candidate slave (%s) is already master." % (slave_id, )
        )

    master_issues, why_master_issues = \
        _replication.check_master_issues(slave)
    if master_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason(s): (%s)." %
            (slave.uuid, why_master_issues)
        )

    slave_issues, why_slave_issues = _replication.check_slave_issues(slave)
    if slave_issues:
        raise _errors.ServerError(
            "Server (%s) is not a valid candidate slave "
            "due to the following reason: (%s)." %
            (slave.uuid, why_slave_issues)
        )

    # The candidate must currently replicate from the group's master.
    master_uuid = _replication.slave_has_master(slave)
    if master_uuid is None or group.master != _uuid.UUID(master_uuid):
        raise _errors.GroupError(
            "The group's master (%s) is different from the candidate's "
            "master (%s)." % (group.master, master_uuid)
        )

    if slave.status not in allowed_status:
        raise _errors.ServerError("Server (%s) is faulty." % (slave_id, ))

    _events.trigger_within_procedure(
        BLOCK_WRITE_SWITCH, group_id, master_uuid, str(slave.uuid)
    )
def _check_group_dependencies(group):
    """Refuse to destroy a group that is still in use.

    The group is considered in use when a shard is associated with it, when
    it is the global group of a shard mapping, or when it still has servers.

    :param group: Group object to be checked.
    :raises GroupError: If any of the dependencies above is found.
    """
    group_id = group.group_id

    associated_shard = _sharding.Shards.lookup_shard_id(group_id)
    if associated_shard:
        raise _errors.GroupError(
            "Cannot destroy a group (%s) which is associated to a shard (%s)."
            % (group_id, associated_shard)
        )

    mapping_id = _sharding.ShardMapping.lookup_shard_mapping_id(group_id)
    if mapping_id:
        raise _errors.GroupError(
            "Cannot destroy a group (%s) which is used as a global group in a "
            "shard definition (%s)." % (group_id, mapping_id)
        )

    if group.servers():
        raise _errors.GroupError(
            "Cannot destroy a group (%s) which has associated servers."
            % (group_id, )
        )
def _destroy_group(group_id):
    """Remove a group after checking that nothing depends on it.

    The group is unregistered from the failure detector before it is
    removed.

    :param group_id: Identifier of the group to be destroyed.
    :raises GroupError: If the group does not exist, still has dependencies,
                        or its removal fails with a foreign key error.
    :raises DatabaseError: Any other database error is propagated unchanged.
    """
    group = _retrieve_group(group_id)
    _check_group_dependencies(group)
    _detector.FailureDetector.unregister_group(group_id)

    try:
        _server.Group.remove(group)
    except _errors.DatabaseError as error:
        # Only "row is referenced" failures are translated into a
        # GroupError; everything else is re-raised as is.
        if error.errno not in (ER_ROW_IS_REFERENCED, ER_ROW_IS_REFERENCED_2):
            raise
        raise _errors.GroupError(
            "Cannot destroy group (%s): %s." % (group_id, error, )
        )

    _LOGGER.debug("Destroyed group (%s).", group)
def _define_ha_operation(group_id, slave_id, update_only):
    """Choose between a fail over and a switch over and start it.

    The operation is a switch over when the group has a master whose status
    is not FAULTY, and a fail over otherwise. When no candidate is given,
    the event that finds one is triggered; otherwise the event that checks
    the given candidate is triggered.

    In update-only mode no event is triggered: a non-faulty master is
    handed to ``_do_block_write_master`` and then the candidate is looked
    up and handed to ``_change_to_candidate``.

    :param group_id: Identifier of the group.
    :param slave_id: Identifier of the candidate slave, if any.
    :param update_only: Whether only the update-only path is executed.
    :raises GroupError: If the group does not exist.
    :raises ServerError: If ``update_only`` is set without a candidate.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    if update_only and not slave_id:
        raise _errors.ServerError(
            "The new master must be specified through --slave-uuid if "
            "--update-only is set."
        )

    fail_over = True
    if group.master:
        master = _server.MySQLServer.fetch(group.master)
        if master.status != _server.MySQLServer.FAULTY:
            # The master is usable, so this is a switch over.
            fail_over = False
            if update_only:
                _do_block_write_master(
                    group_id, str(group.master), update_only
                )

    if update_only:
        # Check whether the server is registered or not.
        _retrieve_server(slave_id, group_id)
        _change_to_candidate(group_id, slave_id, update_only)
        return

    if slave_id:
        event = CHECK_CANDIDATE_FAIL if fail_over else CHECK_CANDIDATE_SWITCH
        _events.trigger_within_procedure(event, group_id, slave_id)
    else:
        event = FIND_CANDIDATE_FAIL if fail_over else FIND_CANDIDATE_SWITCH
        _events.trigger_within_procedure(event, group_id)
def execute(self, group_id):
    """Check if any server within a group has failed.

    One row is produced per server with its liveness, status, replication
    issue flags and executed GTIDs. A second result set lists servers that
    report no slave issues but do not replicate from the group's master.

    :param group_id: Group's id.
    :return: CommandResult holding the per-server information and the
             issues found.
    :raises GroupError: If the group does not exist.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    issue_columns = [
        'is_not_running', 'is_not_configured', 'io_not_running',
        'sql_not_running', 'io_error', 'sql_error',
    ]
    info = ResultSet(
        names=(
            ['uuid', 'is_alive', 'status'] + issue_columns + ['gtid_executed']
        ),
        types=[str, bool, str] + [bool] * 4 + [str, str, str]
    )
    issues = ResultSet(names=['issue'], types=[str])

    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        status = server.status
        # These are used when server is not contactable.
        why_slave_issues = dict.fromkeys(issue_columns, False)

        try:
            # TODO: CHECK WHETHER WE SHOULD USE IS_ALIVE OR NOT.
            # NOTE(review): is_alive is read as an attribute, not called;
            # confirm it is a property on this server class.
            if not server.is_alive:
                status = _server.MySQLServer.FAULTY
                gtid_executed = "UNKNOWN"
            else:
                server.connect()
                alive = True
                if not is_master:
                    slave_issues, why_slave_issues = \
                        _replication.check_slave_issues(server)
                    str_master_uuid = _replication.slave_has_master(server)
                    wrong_master = (
                        group.master is None or
                        str(group.master) != str_master_uuid
                    )
                    if wrong_master and not slave_issues:
                        issues.append_row([
                            "Group has master (%s) but server is connected "
                            "to master (%s)." %
                            (group.master, str_master_uuid)
                        ])
                gtid_executed = server.get_gtid_status()[0].GTID_EXECUTED
        except _errors.DatabaseError:
            status = _server.MySQLServer.FAULTY
            gtid_executed = "UNKNOWN"

        row = [server.uuid, alive, status]
        row.extend(why_slave_issues[column] for column in issue_columns)
        # The GTID set may span several lines; report it on a single one.
        row.append(' '.join(gtid_executed.splitlines()))
        info.append_row(row)

    return CommandResult(None, results=[info, issues])
def _do_find_candidate(group_id, event):
    """Find the best candidate in a group that may be used to replace the
    current master if there is any.

    It chooses the slave that has processed more transactions and may become
    a master, e.g. has the binary log enabled. This function does not
    consider purged transactions and delays in the slave while picking up a
    slave.

    :param group_id: Group's id from where a candidate will be chosen.
    :param event: Event on whose behalf the search runs. The slave
                  prerequisites are only checked when it is
                  FIND_CANDIDATE_SWITCH.
    :return: Return the uuid of the best candidate to become a master in the
             group.
    :raises GroupError: If the group does not exist or no valid candidate
                        is found.
    """
    forbidden_status = (
        _server.MySQLServer.FAULTY,
        _server.MySQLServer.SPARE,
    )

    group = _server.Group.fetch(group_id)
    # Report an unknown group explicitly instead of failing on group.master.
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    master_uuid = str(group.master) if group.master else None

    chosen_uuid = None
    chosen_gtid_status = None
    for candidate in group.servers():
        # Skip the current master and servers that are FAULTY or SPARE.
        if master_uuid == str(candidate.uuid) or \
                candidate.status in forbidden_status:
            continue

        try:
            candidate.connect()
            gtid_status = candidate.get_gtid_status()

            master_issues, why_master_issues = \
                _replication.check_master_issues(candidate)

            slave_issues = False
            why_slave_issues = {}
            if event == FIND_CANDIDATE_SWITCH:
                slave_issues, why_slave_issues = \
                    _replication.check_slave_issues(candidate)

            has_valid_master = (
                master_uuid is None or
                _replication.slave_has_master(candidate) == master_uuid
            )
            eligible = (
                not master_issues and has_valid_master and not slave_issues
            )

            # Once a candidate has been chosen, a later one replaces it only
            # if it is not behind the chosen GTID status. A GTID status that
            # cannot be compared counts as "not behind".
            not_behind = True
            if chosen_gtid_status:
                n_trans = 0
                try:
                    n_trans = _replication.get_slave_num_gtid_behind(
                        candidate, chosen_gtid_status
                    )
                except _errors.InvalidGtidError:
                    pass
                not_behind = (n_trans == 0)

            if eligible and not_behind:
                chosen_gtid_status = gtid_status
                chosen_uuid = str(candidate.uuid)
            else:
                _LOGGER.warning(
                    "Candidate (%s) cannot become a master due to the "
                    "following reasons: issues to become a "
                    "master (%s), prerequistes as a slave (%s), valid "
                    "master (%s).",
                    candidate.uuid, why_master_issues, why_slave_issues,
                    has_valid_master
                )
        except _errors.DatabaseError as error:
            # An unreachable candidate is skipped, not treated as fatal.
            _LOGGER.warning(
                "Error accessing candidate (%s): %s.", candidate.uuid, error
            )

    if not chosen_uuid:
        raise _errors.GroupError(
            "There is no valid candidate that can be automatically "
            "chosen in group (%s). Please, choose one manually." %
            (group_id, )
        )
    return chosen_uuid
def _check_group_exists(group_id):
    """Ensure that no group is registered under the given identifier.

    :param group_id: Identifier to be checked.
    :raises GroupError: If a group with this identifier already exists.
    """
    if _server.Group.fetch(group_id):
        raise _errors.GroupError(
            "Group (%s) already exists." % (group_id, )
        )
def setup_group_replication(group_master_id, group_slave_id):
    """Sets up replication between the masters of the two groups and updates
    the references to the groups in each other.

    :param group_master_id: The group whose master will act as the master
                            in the replication setup.
    :param group_slave_id: The group whose master will act as the slave in
                           the replication setup.
    :raises GroupError: If either group or either group's master cannot be
                        found, is not running, or cannot be connected to.
    """
    group_master = Group.fetch(group_master_id)
    group_slave = Group.fetch(group_slave_id)

    if group_master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_master_id, )
        )
    if group_slave is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_slave_id, )
        )

    if group_master.master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % ""
        )
    if group_slave.master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % ""
        )

    # Master is the master of the Global Group. We replicate from here to
    # the masters of all the shard Groups.
    master = MySQLServer.fetch(group_master.master)
    if master is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR %
            (group_master.master, )
        )

    # Get the master of the shard Group.
    slave = MySQLServer.fetch(group_slave.master)
    if slave is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR %
            (group_slave.master, )
        )

    if not server_running(master):
        # The server is already down. We cannot connect to it to setup
        # replication.
        raise _errors.GroupError(
            GROUP_MASTER_NOT_RUNNING % (group_master.group_id, )
        )
    try:
        master.connect()
    except _errors.DatabaseError as error:
        # Server is not accessible, unable to connect to the server.
        # Report the server that failed: the master group's master.
        raise _errors.GroupError(
            GROUP_REPLICATION_SERVER_ERROR % (group_master.master, error)
        )

    if not server_running(slave):
        # The server is already down. We cannot connect to it to setup
        # replication.
        raise _errors.GroupError(
            GROUP_MASTER_NOT_RUNNING % (group_slave.group_id, )
        )
    try:
        slave.connect()
    except _errors.DatabaseError as error:
        # Report the server that failed: the slave group's master.
        raise _errors.GroupError(
            GROUP_REPLICATION_SERVER_ERROR % (group_slave.master, error)
        )

    _replication.stop_slave(slave, wait=True)
    # Clear references to old masters in the slave.
    _replication.reset_slave(slave, clean=True)
    _replication.switch_master(slave, master, master.user, master.passwd)
    _replication.start_slave(slave, wait=True)

    try:
        group_master.add_slave_group_id(group_slave_id)
        group_slave.add_master_group_id(group_master_id)
    except _errors.DatabaseError:
        # An error while adding a reference to the slave group or to the
        # master group is treated as "the group was already registered"
        # and is deliberately ignored.
        pass
def execute(self, group_id, timeout=None):
    """Check if any server within a group has failed.

    One row is produced per server with its liveness, status and
    replication issue flags. A second result set lists servers that report
    no slave issues but do not replicate from the group's master.

    :param group_id: Group's id.
    :param timeout: Timeout value after which a server is considered
                    unreachable. If None is provided, or the value cannot
                    be converted to a number, DEFAULT_UNREACHABLE_TIMEOUT
                    is used.
    :return: CommandResult holding the per-server information and the
             issues found.
    :raises GroupError: If the group does not exist.
    """
    group = _server.Group.fetch(group_id)
    if not group:
        raise _errors.GroupError("Group (%s) does not exist." % (group_id, ))

    issue_columns = [
        'is_not_running', 'is_not_configured', 'io_not_running',
        'sql_not_running', 'io_error', 'sql_error',
    ]
    info = ResultSet(
        names=['uuid', 'is_alive', 'status'] + issue_columns,
        types=[str, bool, str] + [bool] * 4 + [str, str]
    )
    issues = ResultSet(names=['issue'], types=[str])

    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        # None or an unparsable value: do not hand it to is_alive as is.
        timeout = None
    # A zero timeout also selects the default.
    timeout = timeout or DEFAULT_UNREACHABLE_TIMEOUT

    for server in group.servers():
        alive = False
        is_master = (group.master == server.uuid)
        status = server.status
        # These are used when server is not contactable.
        why_slave_issues = dict.fromkeys(issue_columns, False)

        try:
            alive = server.is_alive(timeout)
            # Replication is only inspected on reachable non-master servers.
            if alive and not is_master:
                server.connect()
                slave_issues, why_slave_issues = \
                    _replication.check_slave_issues(server)
                str_master_uuid = _replication.slave_has_master(server)
                wrong_master = (
                    group.master is None or
                    str(group.master) != str_master_uuid
                )
                if wrong_master and not slave_issues:
                    issues.append_row([
                        "Group has master (%s) but server is connected "
                        "to master (%s)." %
                        (group.master, str_master_uuid)
                    ])
        except _errors.DatabaseError:
            alive = False

        row = [server.uuid, alive, status]
        row.extend(why_slave_issues[column] for column in issue_columns)
        info.append_row(row)

    return CommandResult(None, results=[info, issues])