def get_random_tablet(self, keyspace=None, shard_name=None, cell=None,
                      tablet_type=None, task_number=None):
  """Get a random tablet name.

  Args:
    keyspace: name of the keyspace to get information on the master.
    shard_name: shard to select tablet from (None for random) (string).
    cell: cell to select tablet from (None for random) (string).
    tablet_type: type of tablet to select (None for random) (string).
    task_number: a specific task number (None for random) (int).

  Returns:
    random tablet name (cell-uid) (string).
  """
  keyspace = keyspace or random.choice(self.keyspaces)
  # randrange picks from [0, count), avoiding the off-by-one of
  # randint(0, count), which could select an out-of-range shard index
  # (assumes self.shards holds per-keyspace shard counts -- TODO confirm).
  shard_name = shard_name or (
      sharding_utils.get_shard_name(
          random.randrange(self.shards[self.keyspaces.index(keyspace)])))
  cell = cell or random.choice(self.cells)
  tablets = [t.split(' ') for t in self.vtctl_helper.execute_vtctl_command(
      ['ListShardTablets', '%s/%s' % (keyspace, shard_name)]).split('\n')]
  cell_tablets = [t for t in tablets if self.get_tablet_cell(t[0]) == cell]
  # Explicit None check so task number 0 can be requested (a bare truthiness
  # test would treat task 0 the same as "pick one at random").
  if task_number is not None:
    return cell_tablets[task_number][0]
  if tablet_type:
    return random.choice([t[0] for t in cell_tablets if t[3] == tablet_type])
  return random.choice(cell_tablets)[0]
def implicit_reparent(self, keyspace, shard, num_shards,
                      perform_emergency_reparent=False):
  """Performs an implicit reparent.

  This function will restart the current master task and verify that
  a new task was selected to be the master.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
    perform_emergency_reparent: Do an emergency reparent as well (bool)
  """
  shard_name = sharding_utils.get_shard_name(shard, num_shards)
  original_master_name = (
      self.env.get_current_master_name(keyspace, shard_name))
  logging.info('Restarting %s/%s, current master: %s',
               keyspace, shard_name, original_master_name)
  ret_val = self.env.restart_mysql_task(original_master_name, 'mysql', True)
  # assertEqual replaces the deprecated assertEquals alias.
  self.assertEqual(
      ret_val, 0, msg='restart failed (returned %d)' % ret_val)
  if perform_emergency_reparent:
    next_master = self.env.get_next_master(keyspace, shard_name)[2]
    logging.info('Emergency reparenting %s/%s to %s',
                 keyspace, shard_name, next_master)
    self.env.internal_reparent(
        keyspace, shard_name, next_master, emergency=True)
  # Pre-initialize so the final assertion reports a clean test failure
  # (instead of a NameError) if the polling loop body never executes.
  new_master_name = original_master_name
  start_time = time.time()
  while time.time() - start_time < self.reparent_timeout_threshold:
    new_master_name = self.env.get_current_master_name(keyspace, shard_name)
    if new_master_name != original_master_name:
      break
    time.sleep(1)
  self.assertNotEqual(
      new_master_name, original_master_name,
      msg='Expected master tablet to change, but it remained as %s' % (
          new_master_name))
  logging.info('restart on %s/%s resulted in new master: %s',
               keyspace, shard_name, new_master_name)
def implicit_reparent(self, keyspace, shard, num_shards):
  """Performs an implicit reparent.

  This function will call borg restart on the current master task and
  verify that a new task was selected to be the master.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
  """
  shard_name = sharding_utils.get_shard_name(shard, num_shards)
  original_master_name = (
      self.env.get_current_master_name(keyspace, shard_name))
  original_master_cell = self.env.get_tablet_cell(original_master_name)
  master_task_num = self.env.get_tablet_task_number(original_master_name)
  logging.info('Restarting %s/%s, current master: %s, task: %d',
               keyspace, shard_name, original_master_name, master_task_num)
  ret_val = self.env.restart_mysql_task(
      original_master_cell, keyspace, shard, master_task_num, 'replica',
      'mysql-alloc', True)
  # assertEqual replaces the deprecated assertEquals alias.
  self.assertEqual(
      ret_val, 0, msg='restartalloc failed (returned %d)' % ret_val)
  # Pre-initialize so the final assertion and log statement report cleanly
  # (instead of raising NameError) if the polling loop body never executes.
  new_master_name = original_master_name
  new_master_task_num = master_task_num
  start_time = time.time()
  while time.time() - start_time < self.reparent_timeout_threshold:
    new_master_name = self.env.get_current_master_name(keyspace, shard_name)
    new_master_task_num = self.env.get_tablet_task_number(new_master_name)
    if new_master_name != original_master_name:
      break
    time.sleep(1)
  self.assertNotEqual(
      new_master_name, original_master_name,
      msg='Expected master tablet to change, but it remained as %s' % (
          new_master_name))
  logging.info('restartalloc on %s/%s resulted in new master: %s, task: %d',
               keyspace, shard_name, new_master_name, new_master_task_num)
def get_current_master_cell(self, keyspace):
  """Obtains current master cell.

  This gets the master cell for the first shard in the keyspace, and
  assumes that all shards share the same master.

  Args:
    keyspace: name of the keyspace to get the master cell for (string).

  Returns:
    master cell name (string).
  """
  keyspace_index = self.keyspaces.index(keyspace)
  shard_zero_name = sharding_utils.get_shard_name(
      0, self.num_shards[keyspace_index])
  master_tablet = self.get_current_master_name(keyspace, shard_zero_name)
  return self.get_tablet_cell(master_tablet)
def implicit_reparent(
    self, keyspace, shard, num_shards, perform_emergency_reparent=False):
  """Performs an implicit reparent.

  This function will restart the current master task and verify that
  a new task was selected to be the master.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
    perform_emergency_reparent: Do an emergency reparent as well (bool)
  """
  shard_name = sharding_utils.get_shard_name(shard, num_shards)
  original_master_name = (
      self.env.get_current_master_name(keyspace, shard_name))
  logging.info('Restarting %s/%s, current master: %s',
               keyspace, shard_name, original_master_name)
  ret_val = self.env.restart_mysql_task(original_master_name, 'mysql', True)
  # assertEqual replaces the deprecated assertEquals alias.
  self.assertEqual(
      ret_val, 0, msg='restart failed (returned %d)' % ret_val)
  if perform_emergency_reparent:
    next_master = self.env.get_next_master(keyspace, shard_name)[2]
    logging.info('Emergency reparenting %s/%s to %s',
                 keyspace, shard_name, next_master)
    self.env.internal_reparent(
        keyspace, shard_name, next_master, emergency=True)
  # Pre-initialize so the final assertion reports a clean test failure
  # (instead of a NameError) if the polling loop body never executes.
  new_master_name = original_master_name
  start_time = time.time()
  while time.time() - start_time < self.reparent_timeout_threshold:
    new_master_name = self.env.get_current_master_name(keyspace, shard_name)
    if new_master_name != original_master_name:
      break
    time.sleep(1)
  self.assertNotEqual(
      new_master_name, original_master_name,
      msg='Expected master tablet to change, but it remained as %s' % (
          new_master_name))
  logging.info('restart on %s/%s resulted in new master: %s',
               keyspace, shard_name, new_master_name)
def test_backup(self):
  """Runs backup/restore cycles over every keyspace and shard.

  For each of self.num_backups iterations, picks one random replica tablet
  per shard, backs them all up, then restores them.
  """
  logging.info('Performing %s backup cycles', self.num_backups)
  for attempt in xrange(self.num_backups):
    logging.info('Backup iteration %d of %d', attempt + 1, self.num_backups)
    for keyspace, num_shards in zip(self.env.keyspaces, self.env.num_shards):
      backup_tablets = []
      for shard in xrange(num_shards):
        # Pick a random replica tablet in each shard.
        tablets = self.env.get_tablet_types_for_shard(
            keyspace, sharding_utils.get_shard_name(shard, num_shards))
        available_tablets = [x for x in tablets if x[1] == 'replica']
        # Assert on the list itself rather than len() -- same truthiness,
        # clearer intent and failure message.
        self.assertTrue(
            available_tablets, 'No available tablets found to backup!')
        tablet_to_backup_name = random.choice(available_tablets)[0]
        backup_tablets.append(tablet_to_backup_name)
      self.perform_backup(backup_tablets)
      self.perform_restore(backup_tablets, num_shards)
def implicit_reparent(self, keyspace, shard, num_shards):
  """Performs an implicit reparent.

  This function will call borg restart on the current master task and
  verify that a new task was selected to be the master.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    shard: Numeric ID of the shard to reparent (zero based int)
    num_shards: Total number of shards (int)
  """
  shard_name = sharding_utils.get_shard_name(shard, num_shards)
  original_master_name = (
      self.env.get_current_master_name(keyspace, shard_name))
  original_master_cell = self.env.get_tablet_cell(original_master_name)
  master_task_num = self.env.get_tablet_task_number(original_master_name)
  logging.info('Restarting %s/%s, current master: %s, task: %d',
               keyspace, shard_name, original_master_name, master_task_num)
  ret_val = self.env.restart_mysql_task(
      original_master_cell, keyspace, shard, master_task_num, 'replica',
      'mysql-alloc', True)
  # assertEqual replaces the deprecated assertEquals alias.
  self.assertEqual(
      ret_val, 0, msg='restartalloc failed (returned %d)' % ret_val)
  # Pre-initialize so the final assertion and log statement report cleanly
  # (instead of raising NameError) if the polling loop body never executes.
  new_master_name = original_master_name
  new_master_task_num = master_task_num
  start_time = time.time()
  while time.time() - start_time < self.reparent_timeout_threshold:
    new_master_name = self.env.get_current_master_name(keyspace, shard_name)
    new_master_task_num = self.env.get_tablet_task_number(new_master_name)
    if new_master_name != original_master_name:
      break
    time.sleep(1)
  self.assertNotEqual(
      new_master_name, original_master_name,
      msg='Expected master tablet to change, but it remained as %s' % (
          new_master_name))
  logging.info('restartalloc on %s/%s resulted in new master: %s, task: %d',
               keyspace, shard_name, new_master_name, new_master_task_num)
def setUpClass(cls):
  """Reads reparent test parameters and takes an initial backup per shard."""
  super(ReparentTest, cls).setUpClass()
  # Number of reparent iterations.
  cls.num_reparents = int(cls.test_params.get('num_reparents', '1'))
  # Max allowable median master downtime in seconds.
  cls.master_downtime_threshold = int(
      cls.test_params.get('master_downtime_threshold', '20'))
  # Seconds to wait for a reparent to result in a new master.
  cls.reparent_timeout_threshold = int(
      cls.test_params.get('reparent_timeout_threshold', '30'))
  for keyspace, num_shards in zip(cls.env.keyspaces, cls.env.num_shards):
    for shard in xrange(num_shards):
      shard_name = sharding_utils.get_shard_name(shard, num_shards)
      tablet_uid = cls.env.get_random_tablet(
          keyspace, shard_name, tablet_type='replica')
      logging.info('Taking a backup on tablet %s for %s/%s',
                   tablet_uid, keyspace, shard_name)
      cls.env.vtctl_helper.execute_vtctl_command(['Backup', tablet_uid])
def setUpClass(cls):
  """Reads test parameters, then backs up one replica tablet per shard."""
  super(ReparentTest, cls).setUpClass()
  params = cls.test_params
  # Number of reparent iterations.
  cls.num_reparents = int(params.get('num_reparents', '1'))
  # Max allowable median master downtime in seconds.
  cls.master_downtime_threshold = int(
      params.get('master_downtime_threshold', '20'))
  # Seconds to wait for a reparent to result in a new master.
  cls.reparent_timeout_threshold = int(
      params.get('reparent_timeout_threshold', '30'))
  for keyspace, num_shards in zip(cls.env.keyspaces, cls.env.num_shards):
    for shard_index in xrange(num_shards):
      shard_name = sharding_utils.get_shard_name(shard_index, num_shards)
      replica_uid = cls.env.get_random_tablet(
          keyspace, shard_name, tablet_type='replica')
      logging.info('Taking a backup on tablet %s for %s/%s',
                   replica_uid, keyspace, shard_name)
      cls.env.vtctl_helper.execute_vtctl_command(['Backup', replica_uid])
def explicit_reparent(self, keyspace, num_shards, external=False,
                      cross_cell=False):
  """Performs an explicit reparent.

  This function will explicitly select a new master and verify that the
  topology is updated.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    num_shards: Total number of shards (int)
    external: Whether the reparent should be external or through vtctl (bool)
    cross_cell: Whether to reparent to a different cell (bool)

  Returns:
    How long we waited for the reparent.  The time begins just before
    calling an explicit reparent.  This is a list of floats, one for each
    shard.  For cross-cell reparents, it returns [].
  """
  durations = []
  for shard in xrange(num_shards):
    shard_name = sharding_utils.get_shard_name(shard, num_shards)
    original_master = self.env.get_current_master_name(keyspace, shard_name)
    next_master = self.env.get_next_master(keyspace, shard_name, cross_cell)
    self.env.wait_for_good_failover_status(keyspace, shard_name)

    # Call Reparent in a separate thread.
    def reparent_shard(shard, shard_name, original_master, next_master):
      logging.info('Reparenting %s/%s from %s to %s', keyspace, shard_name,
                   original_master, next_master[2])
      if external:
        return_code, return_output = self.env.external_reparent(
            keyspace, next_master[0], shard, new_task_num=next_master[1])
      else:
        return_code, return_output = self.env.internal_reparent(
            keyspace, shard_name, next_master[2])
      logging.info('Reparent returned %d for %s/%s: %s',
                   return_code, keyspace, shard_name, return_output)

    thread = threading.Thread(
        target=reparent_shard,
        args=[shard, shard_name, original_master, next_master])
    start_time = time.time()
    thread.start()

    # Wait for the reparent: poll the candidate's stream health until it
    # reports itself as MASTER, or give up after the timeout threshold.
    while time.time() - start_time < self.reparent_timeout_threshold:
      try:
        tablet_health = json.loads(
            self.env.vtctl_helper.execute_vtctl_command(
                ['VtTabletStreamHealth', next_master[2]]))
        if tablet_health['target']['tablet_type'] == topodata_pb2.MASTER:
          duration = time.time() - start_time
          durations.append(duration)
          logging.info('Reparent took %f seconds', duration)
          break
      except (IndexError, KeyError, vtctl_helper.VtctlClientError) as e:
        # Log instead of silently swallowing, so transient poll failures
        # remain visible while waiting for the new master to report in.
        logging.info(
            'While waiting for reparent, got the following error: %s', e)
    else:
      self.fail('Timed out waiting for reparent on %s/%s' % (
          keyspace, shard_name))
    thread.join()
  return durations
def explicit_reparent(self, keyspace, num_shards, external=False,
                      cross_cell=False):
  """Performs an explicit reparent.

  This function will explicitly select a new master and verify that the
  topology is updated.

  Args:
    keyspace: Name of the keyspace to reparent (string)
    num_shards: Total number of shards (int)
    external: Whether the reparent should be external or through vtctl (bool)
    cross_cell: Whether to reparent to a different cell (bool)

  Returns:
    How long we waited for the reparent.  The time begins just before
    calling an explicit reparent.  This is a list of floats, one for each
    shard.  For cross-cell reparents, it returns [].
  """
  durations = []
  for shard in xrange(num_shards):
    shard_name = sharding_utils.get_shard_name(shard, num_shards)
    original_master = self.env.get_current_master_name(keyspace, shard_name)
    next_master = self.env.get_next_master(keyspace, shard_name, cross_cell)
    self.env.wait_for_good_failover_status(keyspace, shard_name)

    # Call Reparent in a separate thread.
    def reparent_shard(shard_name, original_master, next_master):
      logging.info('Reparenting %s/%s from %s to %s', keyspace, shard_name,
                   original_master, next_master[2])
      reparent_fn = self.env.external_reparent if external else (
          self.env.internal_reparent)
      return_code, return_output = reparent_fn(
          keyspace, shard_name, next_master[2])
      logging.info('Reparent returned %d for %s/%s: %s',
                   return_code, keyspace, shard_name, return_output)

    thread = threading.Thread(
        target=reparent_shard,
        args=[shard_name, original_master, next_master])
    start_time = time.time()
    thread.start()

    # Wait for the reparent: poll the candidate's stream health until it
    # reports itself as MASTER, or give up after the timeout threshold.
    while time.time() - start_time < self.reparent_timeout_threshold:
      try:
        tablet_health = json.loads(
            self.env.vtctl_helper.execute_vtctl_command(
                ['VtTabletStreamHealth', next_master[2]]))
        if tablet_health['target']['tablet_type'] == topodata_pb2.MASTER:
          duration = time.time() - start_time
          durations.append(duration)
          logging.info('Reparent took %f seconds', duration)
          break
      except (IndexError, KeyError, vtctl_helper.VtctlClientError) as e:
        logging.info(
            'While waiting for reparent, got the following error: %s', e)
    else:
      self.fail('Timed out waiting for reparent on %s/%s' % (keyspace,
                                                             shard_name))
    thread.join()
  return durations