def _setup_replication(shard_id, source_group_id, destn_group_id, split_value,
                       prune_limit, cmd):
    """Setup replication between the source and the destination groups and
    ensure that they are in sync.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs to
                           be moved.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param prune_limit: The number of DELETEs that should be done in one
                        batch.
    :param cmd: Indicates the type of re-sharding operation

    :raises ShardingError: If either group or either group's master cannot
                           be found.
    """
    source_group = Group.fetch(source_group_id)
    if source_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (source_group_id, ))

    destination_group = Group.fetch(destn_group_id)
    if destination_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destn_group_id, ))

    # The source group's master acts as replication master ...
    master = MySQLServer.fetch(source_group.master)
    if master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    master.connect()

    # ... and the destination group's master acts as the replication slave.
    slave = MySQLServer.fetch(destination_group.master)
    if slave is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    slave.connect()

    #Stop and reset any slave that might be running on the slave server.
    _utils.set_offline_mode(slave, True) ### TODO: if forced offline_mode
    _replication.stop_slave(slave, wait=True)
    _replication.reset_slave(slave, clean=True)

    #Change the master to the shard group master.
    _replication.switch_master(slave, master, master.repl_user,
                               master.repl_pass)

    #Start the slave so that syncing of the data begins
    _replication.start_slave(slave, wait=True)
    _utils.set_offline_mode(slave, False) ### TODO: if forced offline_mode

    #Setup sync between the source and the destination groups.
    # Continue asynchronously: the SETUP_SYNC step waits for the slave to
    # catch up and then tears the temporary replication down.
    _events.trigger_within_procedure(
        SETUP_SYNC, shard_id, source_group_id, destn_group_id, split_value,
        prune_limit, cmd
    )
def start_group_slaves(master_group_id):
    """Kick off replication on every slave group registered with a master
    group.

    Used by operations (e.g. enabling a shard) that need a group to bring
    all of its registered slave groups back online as replicas.

    :param master_group_id: ID of the master group whose registered slave
                            groups must start replicating.

    :raises GroupError: If the master group does not exist.
    """
    group = Group.fetch(master_group_id)
    if group is None:
        raise _errors.GroupError(GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % \
                                 (master_group_id, ))

    #group.slave_group_ids lists every group replicating from this master;
    #configure replication towards each of them in turn.
    for registered_id in group.slave_group_ids:
        registered_slave = Group.fetch(registered_id)
        try:
            setup_group_replication(master_group_id,
                                    registered_slave.group_id)
        except (_errors.GroupError, _errors.DatabaseError) as error:
            #A failure for one slave group must not prevent the others
            #from being configured; log and continue.
            _LOGGER.warning(
                "Error while configuring group replication between "
                "(%s) and (%s): (%s).",
                master_group_id, registered_slave.group_id, error)
def start_group_slaves(master_group_id):
    """Start replication on the slave groups of the given master group.

    Needed by use cases such as enabling a shard, where the shard's group
    must restart replication with all of its registered slave groups.

    :param master_group_id: The master group ID. The ID belongs to the
                            master whose slaves need to be started.

    :raises GroupError: If no group exists for master_group_id.
    """
    master_group = Group.fetch(master_group_id)
    if master_group is None:
        raise _errors.GroupError(
            GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (master_group_id, )
        )

    #Iterate over the IDs of the groups registered as slaves of this
    #master and set up replication with each one of them.
    for sid in master_group.slave_group_ids:
        slave = Group.fetch(sid)
        try:
            setup_group_replication(master_group_id, slave.group_id)
        except (_errors.GroupError, _errors.DatabaseError) as error:
            #Log the failure and keep going so the remaining slave
            #groups still get configured.
            _LOGGER.warning(
                "Error while configuring group replication between "
                "(%s) and (%s): (%s).",
                master_group_id, slave.group_id, error
            )
def _lookup(lookup_arg, key, hint):
    """Resolve the servers of the Group that holds the shard for the given
    lookup argument and key.

    :param lookup_arg: table name for "LOCAL" lookups, Shard Mapping ID
                       for "GLOBAL" lookups.
    :param key: The key value that needs to be looked up.
    :param hint: A hint indicates if the query is LOCAL or GLOBAL.
    :return: A list of [uuid, address, is_master] entries, one per server
             of the Group that contains the range in which the key belongs.
    """
    hint = hint.upper()
    if hint not in ('LOCAL', 'GLOBAL'):
        raise _errors.ShardingError(INVALID_SHARDING_HINT)

    if hint == "GLOBAL":
        #GLOBAL lookups receive the shard mapping ID directly, whereas
        #LOCAL lookups receive a table name.
        shard_mapping = ShardMapping.fetch_by_id(lookup_arg)
        if shard_mapping is None:
            raise _errors.ShardingError(
                SHARD_MAPPING_NOT_FOUND % (lookup_arg, )
            )
        #There can be only one global group, hence shard_mapping[0]
        #is safe to use.
        group = Group.fetch(shard_mapping[0].global_group)
    else:
        shard_mapping = ShardMapping.fetch(lookup_arg)
        if shard_mapping is None:
            raise _errors.ShardingError(TABLE_NAME_NOT_FOUND %
                                        (lookup_arg, ))
        sharding_specification = \
            SHARDING_SPECIFICATION_HANDLER[shard_mapping.type_name].lookup(
                key, shard_mapping.shard_mapping_id, shard_mapping.type_name
            )
        if sharding_specification is None:
            raise _errors.ShardingError(INVALID_SHARDING_KEY % (key, ))
        shard = Shards.fetch(str(sharding_specification.shard_id))
        if shard.state == "DISABLED":
            raise _errors.ShardingError(SHARD_NOT_ENABLED)
        #A foreign key on group_id means the group should always exist,
        #but raise explicitly if it does not.
        group = Group.fetch(shard.group_id)
        if group is None:
            raise _errors.ShardingError(SHARD_LOCATION_NOT_FOUND)

    #An empty list is returned when the registered group has no servers.
    return [
        [str(server.uuid), server.address, group.master == server.uuid]
        for server in group.servers()
    ]
def _setup_sync(shard_id, source_group_id, destn_group_id, split_value,
                prune_limit, cmd):
    """sync the source and the destination groups.

    Waits until the destination (slave) catches up with the source
    (master), then tears the temporary replication down and triggers the
    re-sharding switch step.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs to
                           be moved.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param prune_limit: The number of DELETEs that should be done in one
                        batch.
    :param cmd: Indicates the type of re-sharding operation

    :raises ShardingError: If either group or either group's master cannot
                           be found.
    """
    source_group = Group.fetch(source_group_id)
    if source_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (source_group_id, ))

    destination_group = Group.fetch(destn_group_id)
    if destination_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destn_group_id, ))

    master = MySQLServer.fetch(source_group.master)
    if master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    master.connect()

    slave = MySQLServer.fetch(destination_group.master)
    if slave is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    slave.connect()

    #Synchronize until the slave catches up with the master.
    _replication.synchronize_with_read_only(slave, master)

    #Reset replication once the syncing is done.
    _replication.stop_slave(slave, wait=True)
    _replication.reset_slave(slave, clean=True)

    #Trigger changing the mappings for the shard that was copied
    _events.trigger_within_procedure(
        SETUP_RESHARDING_SWITCH, shard_id, source_group_id, destn_group_id,
        split_value, prune_limit, cmd
    )
def stop_group_slave(group_master_id, group_slave_id, clear_ref):
    """Stop the slave on the slave group.

    This utility method is the complement of the setup_group_replication
    method and is used to stop the replication on the slave group. Given a
    master group ID and the slave group ID the method stops the slave on
    the slave group and updates the references on both the master and the
    slave group.

    :param group_master_id: The id of the master group.
    :param group_slave_id: The id of the slave group.
    :param clear_ref: The parameter indicates if the stop_group_slave
                      needs to clear the references to the group's slaves.
                      For example when you do a disable shard the shard
                      group still retains the references to its slaves,
                      since when enabled it needs to enable the
                      replication.

    :raises GroupError: If either group or the slave group's master cannot
                        be found.
    """
    master_group = Group.fetch(group_master_id)
    slave_group = Group.fetch(group_slave_id)

    if master_group is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_master_id, ))

    if slave_group is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_slave_id, ))

    slave_group_master = MySQLServer.fetch(slave_group.master)
    if slave_group_master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR %
            (slave_group.master, ))

    if not server_running(slave_group_master):
        #The server is already down. We cannot connect to it to stop
        #replication.
        return

    try:
        slave_group_master.connect()
    except _errors.DatabaseError:
        #Server is not accessible, unable to connect to the server.
        #Best-effort stop: silently give up rather than failing the
        #caller's larger operation.
        return

    #Stop replication on the master of the group and clear the references,
    #if clear_ref has been set.
    _replication.stop_slave(slave_group_master, wait=True)
    _replication.reset_slave(slave_group_master, clean=True)
    if clear_ref:
        slave_group.remove_master_group_id()
        master_group.remove_slave_group_id(group_slave_id)
def _backup_source_shard(shard_id, source_group_id, destn_group_id,
                         mysqldump_binary, mysqlclient_binary, split_value,
                         config_file, prune_limit, cmd, update_only):
    """Dump the contents of the group hosting the source shard and hand
    the resulting image to the restore step.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs
                           to be moved.
    :param mysqldump_binary: The fully qualified mysqldump binary.
    :param mysqlclient_binary: The fully qualified mysql client binary.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param config_file: The complete path to the fabric configuration
                        file.
    :param prune_limit: The number of DELETEs that should be done in one
                        batch.
    :param cmd: Indicates the type of re-sharding operation (move, split)
    :update_only: Only update the state store and skip provisioning.
    """
    group = Group.fetch(source_group_id)
    backup_server = _services_utils.fetch_backup_server(group)

    #Dump the data of the group currently hosting the shard.
    image = _backup.MySQLDump.backup(backup_server, config_file,
                                     mysqldump_binary)

    #Hand the image over to the restore step on the destination group.
    _events.trigger_within_procedure(
        RESTORE_SHARD_BACKUP, shard_id, source_group_id, destn_group_id,
        mysqlclient_binary, image.path, split_value, config_file,
        prune_limit, cmd
    )
def _restore_shard_backup(shard_id, source_group_id, destn_group_id,
                          backup_image, split_value, prune_limit, cmd):
    """Load a shard backup into every server of the destination group and
    chain into the replication-setup step.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs
                           to be moved.
    :param backup_image: The destination file that contains the backup
                         of the source shard.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param prune_limit: The number of DELETEs that should be done in one
                        batch.
    :param cmd: Indicates the type of re-sharding operation

    :raises ShardingError: If the destination group cannot be found.
    """
    #Credentials and client binary come from the Fabric configuration.
    user = _services_utils.read_config_value(
        _config.global_config, 'servers', 'restore_user'
    )
    passwd = _services_utils.read_config_value(
        _config.global_config, 'servers', 'restore_password'
    )
    client_binary = _services_utils.read_config_value(
        _config.global_config, 'sharding', 'mysqlclient_program'
    )

    group = Group.fetch(destn_group_id)
    if group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destn_group_id, ))

    #Wrap the raw path into a backup image for the restore API.
    image = _backup.BackupImage(backup_image)

    #Every server in the destination group receives the restored data.
    for server in group.servers():
        server.connect()
        _backup.MySQLDump.restore_fabric_server(
            server, user, passwd, image, client_binary
        )

    #Setup sync between the source and the destination groups.
    _events.trigger_within_procedure(
        SETUP_REPLICATION, shard_id, source_group_id, destn_group_id,
        split_value, prune_limit, cmd
    )
def _run(self):
    """Function that verifies servers' availabilities.

    Polls every server in the monitored group once per cycle, keeps a
    quarantine of consecutive failed liveness checks per server, and
    reports a failure once a server has been unreachable for the
    configured number of detections.
    """
    ignored_status = [MySQLServer.FAULTY]
    # Map: server uuid -> consecutive failed liveness checks.
    quarantine = {}
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT
    _persistence.init_thread()
    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    if server.status in ignored_status or \
                        MySQLServer.is_alive(server, detection_timeout):
                        # A faulty server that answers again only needs
                        # its pooled connections dropped.
                        if server.status == MySQLServer.FAULTY:
                            self.__connection_manager.purge_connections(
                                server
                            )
                        continue

                    unreachable.add(server.uuid)
                    _LOGGER.warning(
                        "Server (%s) in group (%s) is unreachable.",
                        server.uuid, self.__group_id
                    )

                    unstable = False
                    failed_attempts = 0
                    if server.uuid not in quarantine:
                        quarantine[server.uuid] = failed_attempts = 1
                    else:
                        failed_attempts = quarantine[server.uuid] + 1
                        quarantine[server.uuid] = failed_attempts
                    if failed_attempts >= detections:
                        unstable = True

                    can_set_faulty = group.can_set_server_faulty(
                        server, get_time()
                    )
                    if unstable and can_set_faulty:
                        self._spawn_report_failure(server)

                # Drop quarantine entries for servers that answered this
                # round. Iterate over a snapshot of the keys: deleting
                # from a dict while iterating its live key view raises
                # RuntimeError on Python 3.
                for uuid in list(quarantine.keys()):
                    if uuid not in unreachable:
                        del quarantine[uuid]
        except (_errors.ExecutorError, _errors.DatabaseError):
            # Transient state-store/executor hiccups: retry next cycle.
            pass
        except Exception as error:
            _LOGGER.exception(error)
        time.sleep(interval / detections)
    _persistence.deinit_thread()
def drop_shard_range_trigger(group_id, sharding_type, table_name,
                             column_name):
    """Drop the range-check triggers installed on a sharded table.

    Both the INSERT and the UPDATE trigger are dropped on the master of
    the given group.

    :param group_id: The ID of the group on which the trigger definition
                     is applied. The trigger is dropped on the master of
                     this group.
    :param sharding_type: The datatype supported by the shards. Not read
                          here; kept for signature parity with the
                          trigger-creation call.
    :param table_name: The name of the table, as "db.table"; used to
                       derive the trigger names.
    :param column_name: The name of the sharding column. Not read here;
                        kept for signature parity with the
                        trigger-creation call.
    """
    group = Group.fetch(group_id)
    master = MySQLServer.fetch(group.master)
    master.connect()

    db, table = table_name.split(".")
    #Trigger names mirror those built by the trigger-creation routine:
    #one for INSERT and one for UPDATE.
    for prefix in (_TRIGGER_PREFIX_INSERT, _TRIGGER_PREFIX_UPDATE):
        stmt = _DROP_TRIGGER_DEFN.format(
            trigger_name=db + "." + prefix + table
        )
        master.exec_stmt(stmt)
def stop_group_slaves(master_group_id):
    """Stop the group slaves for the given master group.

    This will be used for use cases that required all the slaves
    replicating from this group to be stopped. An example use case would
    be disabling a shard.

    :param master_group_id: The master group ID.

    :raises GroupError: If the master group cannot be found.
    """
    master_group = Group.fetch(master_group_id)
    if master_group is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % \
            (master_group_id, ))

    # Stop the replication on all of the registered slaves for the group.
    for slave_group_id in master_group.slave_group_ids:
        slave_group = Group.fetch(slave_group_id)

        # Fetch the Slave Group and the master of the Slave Group
        slave_group_master = MySQLServer.fetch(slave_group.master)
        if slave_group_master is None:
            # Pass the argument lazily instead of %-formatting it here:
            # the logging framework formats only if the record is
            # emitted, matching the style used elsewhere in this module.
            _LOGGER.warning(
                GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR,
                slave_group.master
            )
            continue

        if not server_running(slave_group_master):
            # The server is already down. we cannot connect to it to stop
            # replication.
            continue

        try:
            slave_group_master.connect()
            _replication.stop_slave(slave_group_master, wait=True)
            # Reset the slave to remove the reference of the master so
            # that when the server is used as a slave next it does not
            # complain about having a different master.
            _replication.reset_slave(slave_group_master, clean=True)
        except _errors.DatabaseError as error:
            # Server is not accessible, unable to connect to the server.
            _LOGGER.warning(
                "Error while unconfiguring group replication between "
                "(%s) and (%s): (%s).",
                master_group_id, slave_group.group_id, error
            )
            continue
def stop_group_slaves(master_group_id):
    """Stop the group slaves for the given master group.

    This will be used for use cases that required all the slaves
    replicating from this group to be stopped. An example use case would
    be disabling a shard.

    :param master_group_id: The master group ID.

    :raises GroupError: If the master group cannot be found.
    """
    master_group = Group.fetch(master_group_id)
    if master_group is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % \
            (master_group_id, ))

    # Stop the replication on all of the registered slaves for the group.
    for slave_group_id in master_group.slave_group_ids:
        slave_group = Group.fetch(slave_group_id)

        # Fetch the Slave Group and the master of the Slave Group
        slave_group_master = MySQLServer.fetch(slave_group.master)
        if slave_group_master is None:
            # A missing master is logged (with lazy formatting) and
            # skipped so the remaining slave groups are still processed.
            _LOGGER.warning(GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR,
                            slave_group.master)
            continue

        if not server_running(slave_group_master):
            # The server is already down. we cannot connect to it to stop
            # replication.
            continue

        try:
            slave_group_master.connect()
            _replication.stop_slave(slave_group_master, wait=True)
            # Reset the slave to remove the reference of the master so
            # that when the server is used as a slave next it does not
            # complain about having a different master.
            _replication.reset_slave(slave_group_master, clean=True)
        except _errors.DatabaseError as error:
            # Server is not accessible, unable to connect to the server.
            _LOGGER.warning(
                "Error while unconfiguring group replication between "
                "(%s) and (%s): (%s).",
                master_group_id, slave_group.group_id, error)
            continue
def _fetch_master_of_group(group_id):
    """Return the connected master server of the given group.

    :param group_id: ID of the group whose master needs to be fetched.
    :return: MySQLServer object referring to the group master, already
             connected.
    """
    group = Group.fetch(group_id)
    master = MySQLServer.fetch(group.master)
    master.connect()
    return master
def _backup_source_shard(shard_id, source_group_id, destn_group_id,
                         split_value, prune_limit, cmd, update_only):
    """Dump the group hosting the source shard and chain into the restore
    step on the destination group.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs
                           to be moved.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param prune_limit: The number of DELETEs that should be done in one
                        batch.
    :param cmd: Indicates the type of re-sharding operation (move, split)
    :update_only: Only update the state store and skip provisioning.
    """
    #Backup credentials and the dump binary come from the Fabric
    #configuration.
    config = _config.global_config
    user = _services_utils.read_config_value(
        config, 'servers', 'backup_user'
    )
    passwd = _services_utils.read_config_value(
        config, 'servers', 'backup_password'
    )
    dump_binary = _services_utils.read_config_value(
        config, 'sharding', 'mysqldump_program'
    )

    group = Group.fetch(source_group_id)
    backup_server = _services_utils.fetch_backup_server(group)

    #Dump the data hosted by the source group.
    image = _backup.MySQLDump.backup(
        backup_server, user, passwd, dump_binary
    )

    #Continue with restoring the image on the destination group.
    _events.trigger_within_procedure(
        RESTORE_SHARD_BACKUP, shard_id, source_group_id, destn_group_id,
        image.path, split_value, prune_limit, cmd
    )
def test_properties(self):
    """Exercise creation, fetching, mutation, equality and hashing of
    Group properties.
    """
    grp_a = Group("mysql.com")
    Group.add(grp_a)
    stored_a = Group.fetch(grp_a.group_id)
    self.assertEqual(grp_a.group_id, "mysql.com")
    self.assertEqual(stored_a.group_id, "mysql.com")
    self.assertEqual(stored_a.master_defined, None)

    grp_b = Group("oracle.com", "First description.")
    Group.add(grp_b)
    stored_b = Group.fetch(grp_b.group_id)
    self.assertEqual(grp_b.group_id, "oracle.com")
    self.assertEqual(stored_b.group_id, "oracle.com")

    #Setting the description is persisted and visible on re-fetch.
    grp_a.description = "New description."
    stored_a = Group.fetch(grp_a.group_id)
    self.assertEqual(grp_a.description, "New description.")
    self.assertEqual(stored_a.description, "New description.")

    #Clearing the description is persisted as well.
    grp_a.description = None
    stored_a = Group.fetch(grp_a.group_id)
    self.assertEqual(grp_a.description, None)
    self.assertEqual(stored_a.description, None)

    #Status updates round-trip through the state store.
    grp_a.status = Group.INACTIVE
    stored_a = Group.fetch(grp_a.group_id)
    self.assertEqual(grp_a.status, Group.INACTIVE)
    self.assertEqual(stored_a.status, Group.INACTIVE)

    #Fetched copies compare equal to the originals.
    self.assertEqual(grp_a, stored_a)
    self.assertEqual(grp_b, stored_b)
    self.assertNotEqual(grp_a, grp_b)

    #Hashing matches equality: four objects collapse to two set entries.
    groups = set()
    groups.add(grp_a)
    groups.add(grp_b)
    groups.add(stored_a)
    groups.add(stored_b)
    self.assertEqual(len(groups), 2)
def _backup_source_shard(shard_id, source_group_id, destn_group_id,
                         mysqldump_binary, mysqlclient_binary, split_value,
                         config_file, prune_limit, cmd, update_only):
    """Backup the source shard.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs
                           to be moved.
    :param mysqldump_binary: The fully qualified mysqldump binary.
    :param mysqlclient_binary: The fully qualified mysql client binary.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param config_file: The complete path to the fabric configuration
                        file.
    :param prune_limit: The number of DELETEs that should be done in one
                        batch.
    :param cmd: Indicates the type of re-sharding operation (move, split)
    :update_only: Only update the state store and skip provisioning.
    """
    source_group = Group.fetch(source_group_id)
    # Pick a server of the source group suitable for taking the dump.
    move_source_server = _services_utils.fetch_backup_server(source_group)

    #Do the backup of the group hosting the source shard.
    backup_image = _backup.MySQLDump.backup(
        move_source_server, config_file, mysqldump_binary
    )

    #Change the master for the server that is master of the group which
    #hosts the destination shard.
    _events.trigger_within_procedure(
        RESTORE_SHARD_BACKUP, shard_id, source_group_id, destn_group_id,
        mysqlclient_binary, backup_image.path, split_value, config_file,
        prune_limit, cmd
    )
def _restore_shard_backup(shard_id, source_group_id, destn_group_id,
                          mysqlclient_binary, backup_image, split_value,
                          config_file, cmd):
    """Load the shard backup into all servers of the destination group
    and chain into the sync step.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard needs
                           to be moved.
    :param mysqlclient_binary: The fully qualified mysqlclient binary.
    :param backup_image: The destination file that contains the backup
                         of the source shard.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param config_file: The complete path to the fabric configuration
                        file.
    :param cmd: Indicates the type of re-sharding operation

    :raises ShardingError: If the destination group cannot be found.
    """
    group = Group.fetch(destn_group_id)
    if group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destn_group_id, ))

    #Wrap the file path into a backup image for the restore API.
    image = _backup.BackupImage(backup_image)

    #Restore the data on every server of the destination group.
    for server in group.servers():
        server.connect()
        _backup.MySQLDump.restore_fabric_server(
            server, image, config_file, mysqlclient_binary
        )

    #Setup sync between the source and the destination groups.
    _events.trigger_within_procedure(
        SETUP_MOVE_SYNC, shard_id, source_group_id, destn_group_id,
        split_value, cmd
    )
def add_shard_range_trigger(group_id, sharding_type, table_name,
                            column_name):
    """Install triggers on the shard table that ensure inserted and
    updated values fall within the valid shard ranges.

    One trigger is created for INSERT and one for UPDATE, both on the
    master of the given group.

    :param group_id: The ID of the group on which the trigger definition
                     is applied. The trigger is created on the master of
                     this group.
    :param sharding_type: The datatype supported by the shards. Selects
                          the trigger template to use.
    :param table_name: The name of the table, as "db.table"; used to
                       derive the trigger names.
    :param column_name: The name of the column in the table being
                        sharded; checked by the trigger body.
    """
    group = Group.fetch(group_id)
    master = MySQLServer.fetch(group.master)
    master.connect()

    db, table = table_name.split(".")
    #The same template produces both triggers; only the operation and
    #the trigger-name prefix differ.
    template = _TRIGGER_DEFN[sharding_type]
    for operation, prefix in (("INSERT", _TRIGGER_PREFIX_INSERT),
                              ("UPDATE", _TRIGGER_PREFIX_UPDATE)):
        stmt = template.format(
            trigger_name=db + "." + prefix + table,
            operation=operation,
            table_name=table_name,
            column_name="NEW" + "." + column_name
        )
        master.exec_stmt(stmt)
def _setup_shard_switch_move(shard_id, source_group_id, destination_group_id, update_only): """Setup the moved shard to map to the new group. :param shard_id: The shard ID of the shard that needs to be moved. :param source_group_id: The group_id of the source shard. :param destination_group_id: The ID of the group to which the shard needs to be moved. :update_only: Only update the state store and skip provisioning. """ #Fetch the Range sharding specification. When we start implementing #heterogenous sharding schemes, we need to find out the type of #sharding scheme and we should use that to find out the sharding #implementation. _, source_shard, _, shard_mapping_defn = \ _services_sharding._verify_and_fetch_shard(shard_id) #Setup replication between the shard group and the global group. _group_replication.setup_group_replication \ (shard_mapping_defn[2], destination_group_id) #set the shard to point to the new group. source_shard.group_id = destination_group_id #Stop the replication between the global server and the original #group associated with the shard. _group_replication.stop_group_slave\ (shard_mapping_defn[2], source_group_id, True) #Reset the read only flag on the source server. source_group = Group.fetch(source_group_id) if source_group is None: raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND % (source_group_id, )) master = MySQLServer.fetch(source_group.master) if master is None: raise _errors.ShardingError( _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND) if not update_only: master.connect() master.read_only = False
def test_switchover_with_no_master(self):
    """Ensure that a switchover/failover happens when masters in the
    shard and global groups are dead.
    """
    # Check that a shard group has its master pointing to the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    shard_group = Group.fetch("GROUPID2")
    other_shard_group = Group.fetch("GROUPID3")
    global_master = fetch_test_server(global_group.master)
    global_master.connect()
    shard_master = fetch_test_server(shard_group.master)
    shard_master.connect()
    other_shard_master = fetch_test_server(other_shard_group.master)
    other_shard_master.connect()
    self.assertEqual(_replication.slave_has_master(shard_master),
                     str(global_group.master))
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))

    # Demote the master in the global group and check that a
    # shard group points to None.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     None)

    # Demote the master in a shard group and promote the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID2")
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    # The demoted shard group stays detached while the other shard
    # group re-attaches to the restored global master.
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))

    # Promote the master in the previous shard group and check that
    # everything is back to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID2", str(shard_master.uuid))
    self.assertEqual(_replication.slave_has_master(shard_master),
                     str(global_group.master))
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)

    # Demote the master in the global group, check that a shard group
    # points to None, promote it again and check that everything is
    # back to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master),
                     str(global_group.master))
    self.assertEqual(_replication.slave_has_master(other_shard_master),
                     str(global_group.master))
def setup_group_replication(group_master_id, group_slave_id):
    """Sets up replication between the masters of the two groups and
    updates the references to the groups in each other.

    The master of the group *group_master_id* becomes the replication
    master; the master of the group *group_slave_id* becomes its slave.
    Any replication previously configured on the slave is stopped and
    reset before the new master is configured.

    :param group_master_id: The group whose master will act as the master
                            in the replication setup.
    :param group_slave_id: The group whose master will act as the slave in
                           the replication setup.
    :raises: GroupError if either group or either group's master is
             missing, or if either master server is down or unreachable.
    """
    group_master = Group.fetch(group_master_id)
    group_slave = Group.fetch(group_slave_id)

    if group_master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_master_id, ))

    if group_slave is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_slave_id, ))

    if group_master.master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % "")

    if group_slave.master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % "")

    #Master is the master of the Global Group. We replicate from here to
    #the masters of all the shard Groups.
    master = MySQLServer.fetch(group_master.master)
    if master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % \
            (group_master.master, ))

    #Get the master of the shard Group.
    slave = MySQLServer.fetch(group_slave.master)
    if slave is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % \
            (group_slave.master, ))

    if not server_running(master):
        #The server is already down. We cannot connect to it to setup
        #replication.
        raise _errors.GroupError \
            (GROUP_MASTER_NOT_RUNNING % (group_master.group_id, ))

    try:
        master.connect()
    except _errors.DatabaseError as error:
        #Server is not accessible, unable to connect to the server.
        #BUG FIX: report the server we actually failed to connect to
        #(previously formatted group_slave.master, naming the wrong server).
        raise _errors.GroupError(
            GROUP_REPLICATION_SERVER_ERROR % (group_master.master, error)
        )

    if not server_running(slave):
        #The server is already down. We cannot connect to it to setup
        #replication.
        raise _errors.GroupError \
            (GROUP_MASTER_NOT_RUNNING % (group_slave.group_id, ))

    try:
        slave.connect()
    except _errors.DatabaseError as error:
        #BUG FIX: report the slave we failed to connect to
        #(previously formatted group_master.master).
        raise _errors.GroupError(
            GROUP_REPLICATION_SERVER_ERROR % (group_slave.master, error)
        )

    #Stop any slave threads that may already be running before repointing.
    _replication.stop_slave(slave, wait=True)

    #clear references to old masters in the slave
    _replication.reset_slave(slave, clean=True)

    #Point the slave at the new master and start replicating.
    _replication.switch_master(slave, master, master.user, master.passwd)
    _replication.start_slave(slave, wait=True)

    try:
        #Record the master/slave relationship between the two groups.
        group_master.add_slave_group_id(group_slave_id)
        group_slave.add_master_group_id(group_master_id)
    except _errors.DatabaseError:
        #If there is an error while adding a reference to
        #the slave group or a master group, it means that
        #the slave group was already added and the error
        #is happening because the group was already registered.
        #Ignore this error.
        pass
def setup_group_replication(group_master_id, group_slave_id):
    """Sets up replication between the masters of the two groups and
    updates the references to the groups in each other.

    The master of the group *group_master_id* becomes the replication
    master; the master of the group *group_slave_id* becomes its slave.
    Any replication previously configured on the slave is stopped and
    reset before the new master is configured.

    :param group_master_id: The group whose master will act as the master
                            in the replication setup.
    :param group_slave_id: The group whose master will act as the slave in
                           the replication setup.
    :raises: GroupError if either group or either group's master is
             missing, or if either master server is down or unreachable.
    """
    group_master = Group.fetch(group_master_id)
    group_slave = Group.fetch(group_slave_id)

    if group_master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_master_id, ))

    if group_slave is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_NOT_FOUND_ERROR % (group_slave_id, ))

    if group_master.master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % "")

    if group_slave.master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % "")

    #Master is the master of the Global Group. We replicate from here to
    #the masters of all the shard Groups.
    master = MySQLServer.fetch(group_master.master)
    if master is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % \
            (group_master.master, ))

    #Get the master of the shard Group.
    slave = MySQLServer.fetch(group_slave.master)
    if slave is None:
        raise _errors.GroupError \
            (GROUP_REPLICATION_GROUP_MASTER_NOT_FOUND_ERROR % \
            (group_slave.master, ))

    if not server_running(master):
        #The server is already down. We cannot connect to it to setup
        #replication.
        raise _errors.GroupError \
            (GROUP_MASTER_NOT_RUNNING % (group_master.group_id, ))

    try:
        master.connect()
    except _errors.DatabaseError as error:
        #Server is not accessible, unable to connect to the server.
        #BUG FIX: report the server we actually failed to connect to
        #(previously formatted group_slave.master, naming the wrong server).
        raise _errors.GroupError(GROUP_REPLICATION_SERVER_ERROR %
                                 (group_master.master, error))

    if not server_running(slave):
        #The server is already down. We cannot connect to it to setup
        #replication.
        raise _errors.GroupError \
            (GROUP_MASTER_NOT_RUNNING % (group_slave.group_id, ))

    try:
        slave.connect()
    except _errors.DatabaseError as error:
        #BUG FIX: report the slave we failed to connect to
        #(previously formatted group_master.master).
        raise _errors.GroupError(GROUP_REPLICATION_SERVER_ERROR %
                                 (group_slave.master, error))

    #Stop any slave threads that may already be running before repointing.
    _replication.stop_slave(slave, wait=True)

    #clear references to old masters in the slave
    _replication.reset_slave(slave, clean=True)

    #Point the slave at the new master and start replicating.
    _replication.switch_master(slave, master, master.user, master.passwd)
    _replication.start_slave(slave, wait=True)

    try:
        #Record the master/slave relationship between the two groups.
        group_master.add_slave_group_id(group_slave_id)
        group_slave.add_master_group_id(group_master_id)
    except _errors.DatabaseError:
        #If there is an error while adding a reference to
        #the slave group or a master group, it means that
        #the slave group was already added and the error
        #is happening because the group was already registered.
        #Ignore this error.
        pass
def _run(self):
    """Function that verifies servers' availabilities.

    Runs the failure-detection loop for the group this detector watches:
    every ``_DETECTION_INTERVAL`` seconds each server in the group is
    probed; servers that stay unreachable for ``_DETECTIONS`` consecutive
    rounds are marked FAULTY and a REPORT_FAILURE procedure is triggered.
    When ``_SLAVE_DEEP_CHECKS`` is enabled, live slaves whose replication
    threads have stopped (other than a transient io_errno 2003 reconnect)
    are demoted to SPARE.
    """
    from mysql.fabric.server import (
        Group,
        MySQLServer,
        ConnectionManager,
    )
    ignored_status = [MySQLServer.FAULTY]
    quarantine = {}  # uuid -> consecutive failed probe count
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT
    connection_manager = ConnectionManager()
    slave_deep_checks = FailureDetector._SLAVE_DEEP_CHECKS
    _persistence.init_thread()
    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    if server.status in ignored_status:
                        ### Server is FAULTY
                        connection_manager.kill_connections(server)
                        continue
                    else:
                        ### Server is Not FAULTY
                        if MySQLServer.is_alive(server, detection_timeout):
                            ### Server is alive
                            ### check depends on `slave_deep_checks` parameter
                            if slave_deep_checks:
                                ### When server is alive and status != FAULTY
                                is_master = (group.master == server.uuid)
                                if not is_master:
                                    ### Checking master is dead or alive.
                                    master_server = MySQLServer.fetch(group.master)
                                    if MySQLServer.is_alive(master_server,
                                                            detection_timeout):
                                        ### Checking is replication valid or not
                                        ### if master is alive.
                                        server.connect()
                                        slave_issues, why_slave_issues = \
                                            _replication.check_slave_issues(server)
                                        if slave_issues:
                                            if (why_slave_issues['io_error'] and \
                                                why_slave_issues['io_errno'] == 2003):
                                                ### Nothing to do during
                                                ### reconnecting, just logging
                                                _LOGGER.info(why_slave_issues)
                                            else:
                                                ### If slave threads are not
                                                ### running, set status to SPARE
                                                server.status = MySQLServer.SPARE
                                        ### Done slave_issues.
                                        server.disconnect()
                            continue
                        else:
                            unreachable.add(server.uuid)
                            _LOGGER.warning(
                                "Server (%s) in group (%s) is unreachable.",
                                server.uuid, self.__group_id
                            )
                            unstable = False
                            failed_attempts = 0
                            if server.uuid not in quarantine:
                                quarantine[server.uuid] = failed_attempts = 1
                            else:
                                failed_attempts = quarantine[server.uuid] + 1
                                quarantine[server.uuid] = failed_attempts
                            if failed_attempts >= detections:
                                unstable = True

                            can_set_faulty = group.can_set_server_faulty(
                                server, get_time()
                            )
                            if unstable and can_set_faulty:
                                # We have to make this transactional and make the
                                # failover (i.e. report failure) robust to failures.
                                # Otherwise, a master might be set to faulty and
                                # a new one never promoted.
                                server.status = MySQLServer.FAULTY
                                connection_manager.kill_connections(server)

                                procedures = trigger("REPORT_FAILURE", None,
                                                     str(server.uuid),
                                                     threading.current_thread().name,
                                                     MySQLServer.FAULTY, False
                                )
                                executor = _executor.Executor()
                                for procedure in procedures:
                                    executor.wait_for_procedure(procedure)

            #Drop quarantine entries for servers that answered this round.
            #BUG FIX: iterate over a snapshot of the keys; deleting from the
            #dict while iterating its live key view raises RuntimeError on
            #Python 3 (and is fragile in general).
            for uuid in list(quarantine.keys()):
                if uuid not in unreachable:
                    del quarantine[uuid]
        except (_errors.ExecutorError, _errors.DatabaseError):
            pass
        except Exception as error:
            _LOGGER.exception(error)
        time.sleep(interval)
    _persistence.deinit_thread()
def _setup_shard_switch_move(shard_id, source_group_id, destination_group_id,
                             update_only):
    """Setup the moved shard to map to the new group.

    Fences the destination master read-only, replicates the global group
    into it, repoints the shard to the destination group, tears down the
    old global->source replication, and finally re-opens both masters for
    writes. The statement order is significant: writes must stay blocked
    until connectors have refreshed their caches.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destination_group_id: The ID of the group to which the shard
                                 needs to be moved.
    :param update_only: Only update the state store and skip provisioning.
    """
    #Fetch the Range sharding specification. When we start implementing
    #heterogenous sharding schemes, we need to find out the type of
    #sharding scheme and we should use that to find out the sharding
    #implementation.
    _, source_shard, _, shard_mapping_defn = \
        _services_sharding.verify_and_fetch_shard(shard_id)

    destination_group = Group.fetch(destination_group_id)
    if destination_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destination_group_id, ))
    destn_group_master = MySQLServer.fetch(destination_group.master)
    if destn_group_master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    destn_group_master.connect()

    #Set the destination group master to read_only
    destn_group_master.read_only = True

    #Setup replication between the shard group and the global group.
    #shard_mapping_defn[2] holds the global group's ID (see
    #verify_and_fetch_shard) -- TODO confirm against its definition.
    _group_replication.setup_group_replication \
        (shard_mapping_defn[2], destination_group_id)

    #set the shard to point to the new group.
    source_shard.group_id = destination_group_id

    #Stop the replication between the global server and the original
    #group associated with the shard.
    _group_replication.stop_group_slave\
        (shard_mapping_defn[2], source_group_id, True)

    #The sleep ensures that the connector have refreshed their caches with the
    #new shards that have been added as a result of the split.
    time.sleep(_utils.TTL)

    #Reset the read only flag on the source server.
    source_group = Group.fetch(source_group_id)
    if source_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (source_group_id, ))

    master = MySQLServer.fetch(source_group.master)
    if master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)

    #In update-only mode we only touch the state store, so the source
    #master is left untouched.
    if not update_only:
        master.connect()
        master.read_only = False

    #Kill all the existing connections on the servers
    source_group.kill_connections_on_servers()

    #allow updates in the destination group master
    destn_group_master.read_only = False
def _setup_shard_switch_split(shard_id, source_group_id, destination_group_id,
                              split_value, prune_limit, cmd, update_only):
    """Setup the moved shard to map to the new group.

    Replaces the shard being split with two new (initially DISABLED)
    shards -- one staying on the source group, one on the destination
    group -- inserts the corresponding HASH or RANGE specifications,
    waits for connectors to refresh their caches, re-opens both masters
    for writes, enables the new shards and finally triggers pruning of
    the rows that no longer belong to each half.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destination_group_id: The ID of the group to which the shard
                                 needs to be moved.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param prune_limit: The number of DELETEs that should be done in
                        one batch.
    :param cmd: Indicates the type of re-sharding operation.
    :param update_only: Only update the state store and skip provisioning.
    """
    #Fetch the Range sharding specification.
    range_sharding_spec, source_shard, shard_mappings, shard_mapping_defn = \
        _services_sharding.verify_and_fetch_shard(shard_id)

    #Disable the old shard
    source_shard.disable()

    #Remove the old shard.
    range_sharding_spec.remove()
    source_shard.remove()

    destination_group = Group.fetch(destination_group_id)
    if destination_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destination_group_id, ))
    destn_group_master = MySQLServer.fetch(destination_group.master)
    if destn_group_master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    destn_group_master.connect()

    #Make the destination group as read only to disable updates until the
    #connectors update their caches, thus avoiding inconsistency.
    destn_group_master.read_only = True

    #Add the new shards. Generate new shard IDs for the shard being
    #split and also for the shard that is created as a result of the split.
    new_shard_1 = Shards.add(source_shard.group_id, "DISABLED")
    new_shard_2 = Shards.add(destination_group_id, "DISABLED")

    #Both of the shard mappings associated with this shard_id should
    #be of the same sharding type. Hence it is safe to use one of the
    #shard mappings.
    if shard_mappings[0].type_name == "HASH":
        #In the case of a split involving a HASH sharding scheme,
        #the shard that is split gets a new shard_id, while the split
        #gets the new computed lower_bound and also a new shard id.
        #NOTE: How the shard that is split retains its lower_bound.
        HashShardingSpecification.add_hash_split(
            range_sharding_spec.shard_mapping_id,
            new_shard_1.shard_id,
            range_sharding_spec.lower_bound
        )
        HashShardingSpecification.add_hash_split(
            range_sharding_spec.shard_mapping_id,
            new_shard_2.shard_id,
            split_value
        )
    else:
        #Add the new ranges. Note that the shard being split retains
        #its lower_bound, while the new shard gets the computed,
        #lower_bound.
        RangeShardingSpecification.add(
            range_sharding_spec.shard_mapping_id,
            range_sharding_spec.lower_bound,
            new_shard_1.shard_id
        )
        RangeShardingSpecification.add(
            range_sharding_spec.shard_mapping_id,
            split_value,
            new_shard_2.shard_id
        )

    #The sleep ensures that the connector have refreshed their caches with the
    #new shards that have been added as a result of the split.
    time.sleep(_utils.TTL)

    #The source shard group master would have been marked as read only
    #during the sync. Remove the read_only flag.
    source_group = Group.fetch(source_group_id)
    if source_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (source_group_id, ))

    source_group_master = MySQLServer.fetch(source_group.master)
    if source_group_master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    source_group_master.connect()

    #Kill all the existing connections on the servers
    source_group.kill_connections_on_servers()

    #Allow connections on the source group master
    source_group_master.read_only = False

    #Allow connections on the destination group master
    destn_group_master.read_only = False

    #Setup replication for the new group from the global server
    #shard_mapping_defn[2] holds the global group's ID -- TODO confirm.
    _group_replication.setup_group_replication \
        (shard_mapping_defn[2], destination_group_id)

    #Enable the split shards
    new_shard_1.enable()
    new_shard_2.enable()

    #Trigger changing the mappings for the shard that was copied
    if not update_only:
        _events.trigger_within_procedure(
            PRUNE_SHARDS, new_shard_1.shard_id, new_shard_2.shard_id,
            prune_limit
        )
def _setup_move_sync(shard_id, source_group_id, destn_group_id, split_value,
                     cmd):
    """Setup replication between the source and the destination groups and
    ensure that they are in sync.

    The destination group's master is temporarily made a replication slave
    of the source group's master, synchronized until it catches up, and
    then detached again. On completion the SETUP_RESHARDING_SWITCH event
    is triggered to carry on with the re-sharding operation.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destn_group_id: The ID of the group to which the shard
                           needs to be moved.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param cmd: Indicates the type of re-sharding operation
    :raises: ShardingError if either group or either group's master
             cannot be found.
    """
    source_group = Group.fetch(source_group_id)
    if source_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (source_group_id, ))

    destination_group = Group.fetch(destn_group_id)
    if destination_group is None:
        #BUG FIX: the message was formatted with the undefined name
        #destination_group_id, which raised NameError instead of the
        #intended ShardingError; the parameter is destn_group_id.
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destn_group_id, ))

    master = MySQLServer.fetch(source_group.master)
    if master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    master.connect()

    slave = MySQLServer.fetch(destination_group.master)
    if slave is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    slave.connect()

    #Stop and reset any slave that might be running on the slave server.
    _replication.stop_slave(slave, wait=True)
    _replication.reset_slave(slave, clean=True)

    #Change the master to the shard group master.
    _replication.switch_master(slave, master, master.user, master.passwd)

    #Start the slave so that syncing of the data begins
    _replication.start_slave(slave, wait=True)

    #Synchronize until the slave catches up with the master.
    _replication.synchronize_with_read_only(slave, master)

    #Reset replication once the syncing is done.
    _replication.stop_slave(slave, wait=True)
    _replication.reset_slave(slave, clean=True)

    #Trigger changing the mappings for the shard that was copied
    _events.trigger_within_procedure(
        SETUP_RESHARDING_SWITCH, shard_id, source_group_id, destn_group_id,
        split_value, cmd
    )
def _check_shard_information(shard_id, destn_group_id, mysqldump_binary,
                             mysqlclient_binary, split_value, config_file,
                             prune_limit, cmd, update_only):
    """Verify the sharding information before starting a re-sharding
    operation.

    Validates the dump/restore binaries, computes or validates the split
    point (SPLIT only), checks that the destination group is empty, and
    then triggers the next step of the procedure: BACKUP_SOURCE_SHARD
    for a real re-shard, or SETUP_RESHARDING_SWITCH when only the state
    store should be updated.

    :param shard_id: The destination shard ID.
    :param destn_group_id: The Destination group ID.
    :param mysqldump_binary: The path to the mysqldump binary.
    :param mysqlclient_binary: The path to the mysqlclient binary.
    :param split_value: The point at which the sharding definition
                        should be split.
    :param config_file: The complete path to the fabric configuration
                        file.
    :param prune_limit: The number of DELETEs that should be done in
                        one batch.
    :param cmd: Indicates if it is a split or a move being executed.
    :param update_only: If the operation is a update only operation.
    """
    if not _services_utils.is_valid_binary(mysqldump_binary):
        raise _errors.ShardingError(
            _services_sharding.MYSQLDUMP_NOT_FOUND % mysqldump_binary)

    if not _services_utils.is_valid_binary(mysqlclient_binary):
        raise _errors.ShardingError(
            _services_sharding.MYSQLCLIENT_NOT_FOUND % mysqlclient_binary)

    #Split-point validation only applies to SPLIT; for MOVE the
    #split_value is passed through untouched.
    if cmd == "SPLIT":
        range_sharding_spec, _, shard_mappings, _ = \
            _services_sharding.verify_and_fetch_shard(shard_id)
        upper_bound = \
            SHARDING_SPECIFICATION_HANDLER[shard_mappings[0].type_name].\
            get_upper_bound(
                range_sharding_spec.lower_bound,
                range_sharding_spec.shard_mapping_id,
                shard_mappings[0].type_name
            )
        #If the underlying sharding scheme is a HASH. When a shard is split,
        #all the tables that are part of the shard, have the same sharding
        #scheme. All the shard mappings associated with this shard_id will be
        #of the same sharding type. Hence it is safe to use one of the shard
        #mappings.
        if shard_mappings[0].type_name == "HASH":
            #HASH splits always compute their own split point; a
            #user-supplied one is rejected.
            if split_value is not None:
                raise _errors.ShardingError(
                    _services_sharding.NO_LOWER_BOUND_FOR_HASH_SHARDING
                )
            if upper_bound is None:
                #While splitting a range, retrieve the next upper bound and
                #find the mid-point, in the case where the next upper_bound
                #is unavailable pick the maximum value in the set of values in
                #the shard.
                upper_bound = HashShardingSpecification.fetch_max_key(shard_id)

            #Calculate the split value.
            split_value = \
                SHARDING_DATATYPE_HANDLER[shard_mappings[0].type_name].\
                split_value(
                    range_sharding_spec.lower_bound,
                    upper_bound
                )
        elif split_value is not None:
            #RANGE split with an explicit split point: it must lie within
            #the shard's bounds.
            if not (SHARDING_DATATYPE_HANDLER[shard_mappings[0].type_name].\
                    is_valid_split_value(
                        split_value, range_sharding_spec.lower_bound,
                        upper_bound
                    )
                ):
                raise _errors.ShardingError(
                    _services_sharding.INVALID_LOWER_BOUND_VALUE %
                    (split_value, )
                )
        elif split_value is None:
            #RANGE split without a split point is an error.
            raise _errors.ShardingError(
                _services_sharding.SPLIT_VALUE_NOT_DEFINED
            )

    #Ensure that the group does not already contain a shard.
    if Shards.lookup_shard_id(destn_group_id) is not None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_MOVE_DESTINATION_NOT_EMPTY %
            (destn_group_id, )
        )

    #Fetch the group information for the source shard that
    #needs to be moved.
    source_shard = Shards.fetch(shard_id)
    if source_shard is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_NOT_FOUND % (shard_id, ))

    #Fetch the group_id and the group that hosts the source shard.
    source_group_id = source_shard.group_id

    destn_group = Group.fetch(destn_group_id)
    if destn_group is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_NOT_FOUND % (destn_group_id, ))

    if not update_only:
        _events.trigger_within_procedure(
            BACKUP_SOURCE_SHARD, shard_id, source_group_id, destn_group_id,
            mysqldump_binary, mysqlclient_binary, split_value, config_file,
            prune_limit, cmd, update_only
        )
    else:
        #State-store-only update: skip backup/restore entirely.
        _events.trigger_within_procedure(
            SETUP_RESHARDING_SWITCH, shard_id, source_group_id,
            destn_group_id, split_value, prune_limit, cmd, update_only
        )
def _run(self):
    """Function that verifies servers' availabilities.

    Runs the failure-detection loop for the group this detector watches.
    Each round, every server in the group is probed; servers that stay
    unreachable for ``_DETECTIONS`` consecutive rounds (and that the
    group allows to be failed) are marked FAULTY, their connections are
    killed, and a REPORT_FAILURE procedure is triggered and awaited.
    """
    from mysql.fabric.server import (
        Group,
        MySQLServer,
        ConnectionManager,
    )
    ignored_status = [MySQLServer.FAULTY]
    quarantine = {}  # uuid -> consecutive failed probe count
    interval = FailureDetector._DETECTION_INTERVAL
    detections = FailureDetector._DETECTIONS
    detection_timeout = FailureDetector._DETECTION_TIMEOUT
    connection_manager = ConnectionManager()
    _persistence.init_thread()
    while self.__check:
        try:
            unreachable = set()
            group = Group.fetch(self.__group_id)
            if group is not None:
                for server in group.servers():
                    if server.status in ignored_status or \
                        MySQLServer.is_alive(server, detection_timeout):
                        #Alive or already FAULTY: nothing to quarantine.
                        if server.status == MySQLServer.FAULTY:
                            connection_manager.kill_connections(server)
                        continue
                    unreachable.add(server.uuid)

                    _LOGGER.warning(
                        "Server (%s) in group (%s) is unreachable.",
                        server.uuid, self.__group_id
                    )

                    unstable = False
                    failed_attempts = 0
                    if server.uuid not in quarantine:
                        quarantine[server.uuid] = failed_attempts = 1
                    else:
                        failed_attempts = quarantine[server.uuid] + 1
                        quarantine[server.uuid] = failed_attempts
                    if failed_attempts >= detections:
                        unstable = True

                    can_set_faulty = group.can_set_server_faulty(
                        server, get_time()
                    )
                    if unstable and can_set_faulty:
                        # We have to make this transactional and make the
                        # failover (i.e. report failure) robust to failures.
                        # Otherwise, a master might be set to faulty and
                        # a new one never promoted.
                        server.status = MySQLServer.FAULTY
                        connection_manager.kill_connections(server)

                        procedures = trigger("REPORT_FAILURE", None,
                                             str(server.uuid),
                                             threading.current_thread().name,
                                             MySQLServer.FAULTY, False
                        )
                        executor = _executor.Executor()
                        for procedure in procedures:
                            executor.wait_for_procedure(procedure)

            #Drop quarantine entries for servers that answered this round.
            #BUG FIX: iterate over a snapshot of the keys; deleting from the
            #dict while iterating its live key view raises RuntimeError on
            #Python 3 (and is fragile in general).
            for uuid in list(quarantine.keys()):
                if uuid not in unreachable:
                    del quarantine[uuid]
        except (_errors.ExecutorError, _errors.DatabaseError):
            pass
        except Exception as error:
            _LOGGER.exception(error)
        #NOTE(review): interval / detections is integer division here on
        #Python 2 and sleeps 0 seconds when interval < detections -- TODO
        #confirm both constants are ints with interval >= detections.
        time.sleep(interval / detections)
    _persistence.deinit_thread()
def _setup_shard_switch_split(shard_id, source_group_id, destination_group_id,
                              split_value, prune_limit, cmd, update_only):
    """Setup the moved shard to map to the new group.

    Replaces the shard being split with two new (initially DISABLED)
    shards -- one staying on the source group, one on the destination
    group -- inserts the corresponding HASH or RANGE specifications,
    waits for connectors to refresh their caches, re-opens both masters
    for writes, enables the new shards and finally triggers pruning of
    the rows that no longer belong to each half.

    :param shard_id: The shard ID of the shard that needs to be moved.
    :param source_group_id: The group_id of the source shard.
    :param destination_group_id: The ID of the group to which the shard
                                 needs to be moved.
    :param split_value: Indicates the value at which the range for the
                        particular shard will be split. Will be set only
                        for shard split operations.
    :param prune_limit: The number of DELETEs that should be done in
                        one batch.
    :param cmd: Indicates the type of re-sharding operation.
    :param update_only: Only update the state store and skip provisioning.
    """
    #Fetch the Range sharding specification.
    range_sharding_spec, source_shard, shard_mappings, shard_mapping_defn = \
        _services_sharding.verify_and_fetch_shard(shard_id)

    #Disable the old shard
    source_shard.disable()

    #Remove the old shard.
    range_sharding_spec.remove()
    source_shard.remove()

    destination_group = Group.fetch(destination_group_id)
    if destination_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destination_group_id, ))
    destn_group_master = MySQLServer.fetch(destination_group.master)
    if destn_group_master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    destn_group_master.connect()

    #Make the destination group as read only to disable updates until the
    #connectors update their caches, thus avoiding inconsistency.
    destn_group_master.read_only = True

    #Add the new shards. Generate new shard IDs for the shard being
    #split and also for the shard that is created as a result of the split.
    new_shard_1 = Shards.add(source_shard.group_id, "DISABLED")
    new_shard_2 = Shards.add(destination_group_id, "DISABLED")

    #Both of the shard mappings associated with this shard_id should
    #be of the same sharding type. Hence it is safe to use one of the
    #shard mappings.
    if shard_mappings[0].type_name == "HASH":
        #In the case of a split involving a HASH sharding scheme,
        #the shard that is split gets a new shard_id, while the split
        #gets the new computed lower_bound and also a new shard id.
        #NOTE: How the shard that is split retains its lower_bound.
        HashShardingSpecification.add_hash_split(
            range_sharding_spec.shard_mapping_id,
            new_shard_1.shard_id,
            range_sharding_spec.lower_bound)
        HashShardingSpecification.add_hash_split(
            range_sharding_spec.shard_mapping_id,
            new_shard_2.shard_id,
            split_value)
    else:
        #Add the new ranges. Note that the shard being split retains
        #its lower_bound, while the new shard gets the computed,
        #lower_bound.
        RangeShardingSpecification.add(range_sharding_spec.shard_mapping_id,
                                       range_sharding_spec.lower_bound,
                                       new_shard_1.shard_id)
        RangeShardingSpecification.add(range_sharding_spec.shard_mapping_id,
                                       split_value,
                                       new_shard_2.shard_id)

    #The sleep ensures that the connector have refreshed their caches with the
    #new shards that have been added as a result of the split.
    time.sleep(_utils.TTL)

    #The source shard group master would have been marked as read only
    #during the sync. Remove the read_only flag.
    source_group = Group.fetch(source_group_id)
    if source_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (source_group_id, ))

    source_group_master = MySQLServer.fetch(source_group.master)
    if source_group_master is None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_GROUP_MASTER_NOT_FOUND)
    source_group_master.connect()

    #Kill all the existing connections on the servers
    source_group.kill_connections_on_servers()

    #Allow connections on the source group master
    source_group_master.read_only = False

    #Allow connections on the destination group master
    destn_group_master.read_only = False

    #Setup replication for the new group from the global server
    #shard_mapping_defn[2] holds the global group's ID -- TODO confirm.
    _group_replication.setup_group_replication \
        (shard_mapping_defn[2], destination_group_id)

    #Enable the split shards
    new_shard_1.enable()
    new_shard_2.enable()

    #Trigger changing the mappings for the shard that was copied
    if not update_only:
        _events.trigger_within_procedure(PRUNE_SHARDS,
                                         new_shard_1.shard_id,
                                         new_shard_2.shard_id,
                                         prune_limit)
def test_switchover_with_no_master(self):
    """Ensure that a switchover/failover happens when masters in the
    shard and global groups are dead.

    Exercises demote/promote cycles on the global group (GROUPID1) and a
    shard group (GROUPID2) and verifies after each step that the shard
    masters' replication source (slave_has_master) tracks the global
    group's master: it is cleared while the global master is demoted and
    restored once it is promoted again.
    """
    # Check that a shard group has it master pointing to a the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    shard_group = Group.fetch("GROUPID2")
    other_shard_group = Group.fetch("GROUPID3")
    global_master = MySQLServer.fetch(global_group.master)
    global_master.connect()
    shard_master = MySQLServer.fetch(shard_group.master)
    shard_master.connect()
    other_shard_master = MySQLServer.fetch(other_shard_group.master)
    other_shard_master.connect()
    self.assertEqual(
        _replication.slave_has_master(shard_master),
        str(global_group.master)
    )
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )

    # Demote the master in the global group and check that a
    # shard group points to None.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(
        _replication.slave_has_master(other_shard_master), None
    )

    # Demote the master in a shard group and promote the master
    # in the global group.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID2")
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    # GROUPID2 has no master, so only the other shard group is re-pointed
    # at the promoted global master.
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )

    # Promote the master in the previous shard group and check that
    # everything is back to normal.
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, None)
    self.proxy.group.promote("GROUPID2", str(shard_master.uuid))
    self.assertEqual(
        _replication.slave_has_master(shard_master),
        str(global_group.master)
    )
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)

    # Demote the master in the global group, check that a shard group
    # points to None, promot it again and check that everything is back
    # to normal
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    shard_group = Group.fetch("GROUPID2")
    self.assertEqual(shard_group.master, shard_master.uuid)
    self.proxy.group.demote("GROUPID1")
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, None)
    self.assertEqual(_replication.slave_has_master(shard_master), None)
    self.proxy.group.promote("GROUPID1", str(global_master.uuid))
    global_group = Group.fetch("GROUPID1")
    self.assertEqual(global_group.master, global_master.uuid)
    self.assertEqual(
        _replication.slave_has_master(shard_master),
        str(global_group.master)
    )
    self.assertEqual(
        _replication.slave_has_master(other_shard_master),
        str(global_group.master)
    )
def _check_shard_information(shard_id, destn_group_id, mysqldump_binary,
                             mysqlclient_binary, split_value, config_file,
                             prune_limit, cmd, update_only):
    """Verify the sharding information before starting a re-sharding
    operation, then trigger the next step of the procedure.

    Validation performed, in order:

    1. Both client binaries exist and are executable.
    2. For a SPLIT, a usable split point exists: computed automatically
       for HASH sharding, validated (or rejected as missing) for other
       sharding types.
    3. The destination group does not already host a shard.
    4. The source shard and the destination group both exist.

    On success, chains into either the backup step (normal flow) or the
    re-sharding switch step (``update_only`` flow) via
    :meth:`_events.trigger_within_procedure`.

    :param shard_id: The destination shard ID.
    :param destn_group_id: The Destination group ID.
    :param mysqldump_binary: The path to the mysqldump binary.
    :param mysqlclient_binary: The path to the mysqlclient binary.
    :param split_value: The point at which the sharding definition
                        should be split. Must be None for HASH sharding
                        (the split point is derived automatically).
    :param config_file: The complete path to the fabric configuration
                        file.
    :param prune_limit: The number of DELETEs that should be done in
                        one batch.
    :param cmd: Indicates if it is a split or a move being executed
                ("SPLIT" selects the split-specific validation below).
    :param update_only: If the operation is a update only operation
                        (skip the backup/restore and only update the
                        state store / routing).

    :raises _errors.ShardingError: if any of the checks above fails.
    """
    # Fail fast if either external binary is unusable -- these are
    # required later by the backup/restore steps.
    if not _services_utils.is_valid_binary(mysqldump_binary):
        raise _errors.ShardingError(_services_sharding.MYSQLDUMP_NOT_FOUND %
                                    mysqldump_binary)

    if not _services_utils.is_valid_binary(mysqlclient_binary):
        raise _errors.ShardingError(_services_sharding.MYSQLCLIENT_NOT_FOUND %
                                    mysqlclient_binary)

    if cmd == "SPLIT":
        range_sharding_spec, _, shard_mappings, _ = \
            _services_sharding.verify_and_fetch_shard(shard_id)
        upper_bound = \
            SHARDING_SPECIFICATION_HANDLER[shard_mappings[0].type_name].\
            get_upper_bound(
                range_sharding_spec.lower_bound,
                range_sharding_spec.shard_mapping_id,
                shard_mappings[0].type_name
            )
        # If the underlying sharding scheme is a HASH. When a shard is
        # split, all the tables that are part of the shard, have the same
        # sharding scheme. All the shard mappings associated with this
        # shard_id will be of the same sharding type. Hence it is safe to
        # use one of the shard mappings.
        if shard_mappings[0].type_name == "HASH":
            # HASH split points are computed, never user-supplied; a
            # caller-provided value is therefore an error.
            if split_value is not None:
                raise _errors.ShardingError(
                    _services_sharding.NO_LOWER_BOUND_FOR_HASH_SHARDING)
            if upper_bound is None:
                # While splitting a range, retrieve the next upper bound
                # and find the mid-point, in the case where the next
                # upper_bound is unavailable pick the maximum value in
                # the set of values in the shard.
                upper_bound = HashShardingSpecification.fetch_max_key(shard_id)
            # Calculate the split value (midpoint between the shard's
            # lower bound and the upper bound chosen above).
            split_value = \
                SHARDING_DATATYPE_HANDLER[shard_mappings[0].type_name].\
                split_value(
                    range_sharding_spec.lower_bound,
                    upper_bound
                )
        elif split_value is not None:
            # Non-HASH sharding: the caller supplied a split point;
            # verify it falls inside this shard's range.
            # NOTE(review): for the last shard of a RANGE scheme
            # upper_bound may still be None here -- presumably
            # is_valid_split_value accepts an open upper bound; confirm.
            if not (SHARDING_DATATYPE_HANDLER[shard_mappings[0].type_name].\
                    is_valid_split_value(
                        split_value, range_sharding_spec.lower_bound,
                        upper_bound
                    )
                ):
                raise _errors.ShardingError(
                    _services_sharding.INVALID_LOWER_BOUND_VALUE %
                    (split_value, ))
        elif split_value is None:
            # Non-HASH sharding with no split point given: the split
            # cannot proceed.
            raise _errors.ShardingError(
                _services_sharding.SPLIT_VALUE_NOT_DEFINED)

    # Ensure that the group does not already contain a shard.
    if Shards.lookup_shard_id(destn_group_id) is not None:
        raise _errors.ShardingError(
            _services_sharding.SHARD_MOVE_DESTINATION_NOT_EMPTY %
            (destn_group_id, ))

    # Fetch the group information for the source shard that
    # needs to be moved.
    source_shard = Shards.fetch(shard_id)
    if source_shard is None:
        raise _errors.ShardingError(_services_sharding.SHARD_NOT_FOUND %
                                    (shard_id, ))

    # Fetch the group_id and the group that hosts the source shard.
    source_group_id = source_shard.group_id

    destn_group = Group.fetch(destn_group_id)
    if destn_group is None:
        raise _errors.ShardingError(_services_sharding.SHARD_GROUP_NOT_FOUND %
                                    (destn_group_id, ))

    if not update_only:
        # Normal flow: back up the source shard before re-sharding.
        _events.trigger_within_procedure(BACKUP_SOURCE_SHARD, shard_id,
                                         source_group_id, destn_group_id,
                                         mysqldump_binary, mysqlclient_binary,
                                         split_value, config_file, prune_limit,
                                         cmd, update_only)
    else:
        # Update-only flow: skip backup/restore and go straight to the
        # switch step.
        _events.trigger_within_procedure(SETUP_RESHARDING_SWITCH, shard_id,
                                         source_group_id, destn_group_id,
                                         split_value, prune_limit, cmd,
                                         update_only)