def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets): """Runs a vtworker SplitDiff on the given keyspace/shard, and then sets all former rdonly slaves back to rdonly. Args: keyspace_shard - keyspace/shard to run SplitDiff on (string) source_tablets - ShardTablets instance for the source shard destination_tablets - ShardTablets instance for the destination shard """ logging.debug("Running vtworker SplitDiff for %s" % keyspace_shard) stdout, stderr = utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', keyspace_shard], auto_log=True)
def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets): """Runs a vtworker SplitDiff on the given keyspace/shard. Sets all former rdonly slaves back to rdonly. Args: keyspace_shard: keyspace/shard to run SplitDiff on (string) source_tablets: ShardTablets instance for the source shard destination_tablets: ShardTablets instance for the destination shard """ logging.debug('Running vtworker SplitDiff for %s', keyspace_shard) stdout, stderr = utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', keyspace_shard], auto_log=True)
def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets): """Runs a vtworker SplitDiff on the given keyspace/shard. Sets all former rdonly slaves back to rdonly. Args: keyspace_shard: keyspace/shard to run SplitDiff on (string) source_tablets: ShardTablets instance for the source shard destination_tablets: ShardTablets instance for the destination shard """ _ = source_tablets, destination_tablets logging.debug("Running vtworker SplitDiff for %s", keyspace_shard) _, _ = utils.run_vtworker(["-cell", "test_nj", "SplitDiff", keyspace_shard], auto_log=True)
def split_clone_fails_not_enough_health_rdonly_tablets(self): """Verify vtworker errors if there aren't enough healthy RDONLY tablets.""" _, stderr = utils.run_vtworker([ '-cell', 'test_nj', '--wait_for_healthy_rdonly_tablets_timeout', '1s', 'SplitClone', '--min_healthy_rdonly_tablets', '2', 'test_keyspace/0' ], auto_log=True, expect_fail=True) self.assertIn( 'findTargets() failed: FindWorkerTablet() failed for' ' test_nj/test_keyspace/0: not enough healthy RDONLY' ' tablets to choose from in (test_nj,test_keyspace/0),' ' have 1 healthy ones, need at least 2', stderr)
def split_clone_fails_not_enough_health_rdonly_tablets(self): """Verify vtworker errors if there aren't enough healthy RDONLY tablets.""" stdout, _ = utils.run_vtworker( ['-cell', 'test_nj', '--wait_for_healthy_rdonly_endpoints_timeout', '1s', 'SplitClone', '--min_healthy_rdonly_endpoints', '2', 'test_keyspace/0'], auto_log=True, expect_fail=True) self.assertIn(stdout, 'findTargets() failed: FindWorkerTablet() failed for' ' test_nj/test_keyspace/0: not enough healthy rdonly' ' endpoints to choose from in (test_nj,test_keyspace/0),' ' have 1 healthy ones, need at least 2')
def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets): """Runs a vtworker SplitDiff on the given keyspace/shard. Sets all former rdonly slaves back to rdonly. Args: keyspace_shard: keyspace/shard to run SplitDiff on (string) source_tablets: ShardTablets instance for the source shard destination_tablets: ShardTablets instance for the destination shard """ _ = source_tablets, destination_tablets logging.debug('Running vtworker SplitDiff for %s', keyspace_shard) _, _ = utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_endpoints', '1', keyspace_shard], auto_log=True)
def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets): """Runs a vtworker SplitDiff on the given keyspace/shard. Sets all former rdonly slaves back to rdonly. Args: keyspace_shard: keyspace/shard to run SplitDiff on (string) source_tablets: ShardTablets instance for the source shard destination_tablets: ShardTablets instance for the destination shard """ _ = source_tablets, destination_tablets logging.debug('Running vtworker SplitDiff for %s', keyspace_shard) _, _ = utils.run_vtworker( ['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', keyspace_shard], auto_log=True)
def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets): """Runs a vtworker SplitDiff on the given keyspace/shard, and then sets all former rdonly slaves back to rdonly. Args: keyspace_shard - keyspace/shard to run SplitDiff on (string) source_tablets - ShardTablets instance for the source shard destination_tablets - ShardTablets instance for the destination shard """ logging.debug("Running vtworker SplitDiff for %s" % keyspace_shard) stdout, stderr = utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', keyspace_shard], auto_log=True) for shard_tablets in (source_tablets, destination_tablets): for tablet in shard_tablets.rdonlys: utils.run_vtctl( ['ChangeSlaveType', tablet.tablet_alias, 'rdonly'], auto_log=True)
def test_resharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', keyspace_id_type]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving somewhat normally for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, the slaves won't be # healthy) shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('NOT_SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # check the shards shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias], auto_log=True) # check the shards shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'LegacySplitClone', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl(['SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-'], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl( ['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1])
def setUpModule(): try: environment.topo_server().setup() setup_procs = [t.init_mysql() for t in all_tablets] utils.Vtctld().start() utils.wait_procs(setup_procs) # Set up binlog stream from shard 0 to shard 1. # Modeled after initial_sharding.py. utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyrange_constants.KIT_UINT64]) src_master.init_tablet('master', 'test_keyspace', '0') src_replica.init_tablet('replica', 'test_keyspace', '0') src_rdonly.init_tablet('rdonly', 'test_keyspace', '0') utils.validate_topology() for t in [src_master, src_replica, src_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) src_master.wait_for_vttablet_state('SERVING') for t in [src_replica, src_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/0', src_master.tablet_alias], auto_log=True) # Create schema logging.debug('Creating schema...') create_table = '''create table test_table( id bigint auto_increment, keyspace_id bigint(20) unsigned, msg varchar(64), primary key (id), index by_msg (msg) ) Engine=InnoDB''' utils.run_vtctl(['ApplySchema', '-sql=' + create_table, 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery # (for binlog players) and on the source rdonlys (for workers) utils.run_vtctl(['RunHealthCheck', src_replica.tablet_alias]) utils.run_vtctl(['RunHealthCheck', src_rdonly.tablet_alias]) # Create destination shard (won't be serving as there is no DB) dst_master.init_tablet('master', 'test_keyspace', '-') dst_replica.init_tablet('replica', 'test_keyspace', '-') dst_rdonly.init_tablet('rdonly', 'test_keyspace', '-') dst_master.start_vttablet(wait_for_state='NOT_SERVING') dst_replica.start_vttablet(wait_for_state='NOT_SERVING') dst_rdonly.start_vttablet(wait_for_state='NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/-', dst_master.tablet_alias], auto_log=True) # copy the schema utils.run_vtctl(['CopySchemaShard', src_replica.tablet_alias, 'test_keyspace/-'], auto_log=True) # run the clone worker (this is a degenerate case, source and destination # both have the full keyrange. Happens to work correctly). logging.debug('Running the clone worker to start binlog stream...') utils.run_vtworker(['--cell', 'test_nj', 'SplitClone', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], auto_log=True) dst_master.wait_for_binlog_player_count(1) # Wait for dst_replica to be ready. dst_replica.wait_for_binlog_server_state('Enabled') except: tearDownModule() raise
def test_vertical_split(self): # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtctl(['CopySchemaShard', '--tables', 'moving.*,view1', source_rdonly1.tablet_alias, 'destination_keyspace/0'], auto_log=True) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'VerticalSplitClone', '--tables', 'moving.*,view1', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'destination_keyspace/0'], auto_log=True) # One of the two source rdonly tablets went spare after the clone. # Force a healthcheck on both to get them back to "rdonly". for t in [source_rdonly1, source_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', self.moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', self.moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', self.moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) _ = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use vtworker to compare the data for t in [destination_rdonly1, destination_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) logging.debug('Running vtworker VerticalSplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0'], auto_log=True) # One of each source and dest rdonly tablet went spare after the diff. # Force a healthcheck on all four to get them back to "rdonly". for t in [source_rdonly1, source_rdonly2, destination_rdonly1, destination_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn('Binlog player state: Running', destination_master_status) self.assertIn('moving.*', destination_master_status) self.assertIn( '<td><b>All</b>: 1000<br><b>Query</b>: 700<br>' '<b>Transaction</b>: 300<br></td>', destination_master_status) self.assertIn('</html>', destination_master_status) # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING') # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl(['MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(rdonly): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertEqual(ksf['cells'], ['test_nj']) self.assertTrue(found) utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace', '-remove', '-cells=test_nj', 'destination_keyspace', 'rdonly'], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertFalse(found) utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace', 'destination_keyspace', 'rdonly'], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertNotIn('cells', ksf) self.assertTrue(found) # now serve rdonly from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'destination_keyspace', ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'destination_keyspace', ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl(['MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'destination_keyspace', ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # check the stats are correct self._check_stats() self._verify_vtctl_set_shard_tablet_control()
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet( 'master', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet( 'replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet( 'rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') shard_master.start_vttablet(wait_for_state=None) shard_replica.start_vttablet(wait_for_state=None) shard_rdonly1.start_vttablet(wait_for_state=None) shard_master.wait_for_vttablet_state('SERVING') for t in [shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet( 'master', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet( 'replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet( 'rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet( 'master', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet( 'replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet( 'rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_rdonly1, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_1_master]: t.wait_for_vttablet_state('SERVING') for t in [shard_0_replica, shard_0_rdonly1, shard_1_replica, shard_1_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 3, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_0_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Delete row 2 (provokes an insert). shard_1_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_1_master.mquery('vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). self._insert_value(shard_1_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 1, 1) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. # we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl( ['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1])
def test_resharding(self): # create the keyspace with just one shard shard_master.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='0') shard_replica.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='0') shard_rdonly1.start_vttablet( wait_for_state=None, target_tablet_type='rdonly', init_keyspace='test_keyspace', init_shard='0') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start(cache_ttl='0') # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', keyspace_id_type]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias, 'replica']) # create the split shards shard_0_master.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='-80') shard_0_replica.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='-80') shard_0_rdonly1.start_vttablet( wait_for_state=None, target_tablet_type='rdonly', init_keyspace='test_keyspace', init_shard='-80') shard_1_master.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='80-') shard_1_replica.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='80-') shard_1_rdonly1.start_vttablet( wait_for_state=None, target_tablet_type='rdonly', init_keyspace='test_keyspace', init_shard='80-') # start vttablet on the split shards (no db created, # so they're all not serving) sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1] for t in sharded_tablets: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.VtGate().start(cache_ttl='0') # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias, 'rdonly']) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. # we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl( ['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1])
def setUpModule(): try: environment.topo_server().setup() setup_procs = [ src_master.init_mysql(), src_replica.init_mysql(), src_rdonly1.init_mysql(), src_rdonly2.init_mysql(), dst_master.init_mysql(), dst_replica.init_mysql(), ] utils.Vtctld().start() utils.wait_procs(setup_procs) # Set up binlog stream from shard 0 to shard 1. # Modeled after initial_sharding.py. utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyrange_constants.KIT_UINT64]) src_master.init_tablet('master', 'test_keyspace', '0') src_replica.init_tablet('replica', 'test_keyspace', '0') src_rdonly1.init_tablet('rdonly', 'test_keyspace', '0') src_rdonly2.init_tablet('rdonly', 'test_keyspace', '0') utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0']) utils.validate_topology() utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) for t in [src_master, src_replica, src_rdonly1, src_rdonly2]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [src_master, src_replica, src_rdonly1, src_rdonly2]: t.wait_for_vttablet_state('SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/0', src_master.tablet_alias], auto_log=True) # Create schema logging.debug("Creating schema...") create_table = '''create table test_table( id bigint auto_increment, keyspace_id bigint(20) unsigned, msg varchar(64), primary key (id), index by_msg (msg) ) Engine=InnoDB''' utils.run_vtctl(['ApplySchema', '-sql=' + create_table, 'test_keyspace'], auto_log=True) # Create destination shard. dst_master.init_tablet('master', 'test_keyspace', '-') dst_replica.init_tablet('replica', 'test_keyspace', '-') dst_master.start_vttablet(wait_for_state='NOT_SERVING') dst_replica.start_vttablet(wait_for_state='NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/-', dst_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # copy the schema utils.run_vtctl(['CopySchemaShard', src_replica.tablet_alias, 'test_keyspace/-'], auto_log=True) # run the clone worked (this is a degenerate case, source and destination # both have the full keyrange. Happens to work correctly). logging.debug("Running the clone worker to start binlog stream...") utils.run_vtworker(['--cell', 'test_nj', 'SplitClone', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0'], auto_log=True) dst_master.wait_for_binlog_player_count(1) # Wait for dst_replica to be ready. dst_replica.wait_for_binlog_server_state("Enabled") except: tearDownModule() raise
def verify_successful_worker_copy_with_reparent(self, mysql_down=False): """Verifies that vtworker can successfully copy data for a SplitClone. Order of operations: 1. Run a background vtworker 2. Wait until the worker successfully resolves the destination masters. 3. Reparent the destination tablets 4. Wait until the vtworker copy is finished 5. Verify that the worker was forced to reresolve topology and retry writes due to the reparent. 6. Verify that the data was copied successfully to both new shards Args: mysql_down: boolean. If True, we take down the MySQL instances on the destination masters at first, then bring them back and reparent away. Raises: AssertionError if things didn't go as expected. """ if mysql_down: logging.debug('Shutting down mysqld on destination masters.') utils.wait_procs( [shard_0_master.shutdown_mysql(), shard_1_master.shutdown_mysql()]) worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--use_v3_resharding_mode=false'], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # --chunk_count is 2 because rows are currently ordered by primary key such # that all rows of the first shard come first and then the second shard. # TODO(mberlin): Remove --offline=false once vtworker ensures that the # destination shards are not behind the master's replication # position. args = ['SplitClone', '--offline=false', '--destination_writer_count', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999'] if not mysql_down: # Make the clone as slow as necessary such that there is enough time to # run PlannedReparent in the meantime. # TODO(mberlin): Once insert_values is fixed to uniformly distribute the # rows across shards when sorted by primary key, remove # --chunk_count 2, --min_rows_per_chunk 1 and set # --source_reader_count back to 1. args.extend(['--source_reader_count', '2', '--chunk_count', '2', '--min_rows_per_chunk', '1', '--write_query_max_rows', '1']) args.append('test_keyspace/0') workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port) if mysql_down: # If MySQL is down, we wait until vtworker retried at least once to make # sure it reached the point where a write failed due to MySQL being down. # There should be two retries at least, one for each destination shard. utils.poll_for_vars( 'vtworker', worker_port, 'WorkerRetryCount >= 2', condition_fn=lambda v: v.get('WorkerRetryCount') >= 2) logging.debug('Worker has retried at least twice, starting reparent now') # vtworker is blocked at this point. This is a good time to test that its # throttler server is reacting to RPCs. self.check_throttler_service('localhost:%d' % worker_rpc_port, ['test_keyspace/-80', 'test_keyspace/80-'], 9999) # Bring back masters. Since we test with semi-sync now, we need at least # one replica for the new master. This test is already quite expensive, # so we bring back the old master as a replica rather than having a third # replica up the whole time. logging.debug('Restarting mysqld on destination masters') utils.wait_procs( [shard_0_master.start_mysql(), shard_1_master.start_mysql()]) # Reparent away from the old masters. utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80', '-new_master', shard_0_replica.tablet_alias], auto_log=True) utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-', '-new_master', shard_1_replica.tablet_alias], auto_log=True) else: # NOTE: There is a race condition around this: # It's possible that the SplitClone vtworker command finishes before the # PlannedReparentShard vtctl command, which we start below, succeeds. # Then the test would fail because vtworker did not have to retry. # # To workaround this, the test takes a parameter to increase the number of # rows that the worker has to copy (with the idea being to slow the worker # down). # You should choose a value for num_insert_rows, such that this test # passes for your environment (trial-and-error...) # Make sure that vtworker got past the point where it picked a master # for each destination shard ("finding targets" state). utils.poll_for_vars( 'vtworker', worker_port, 'WorkerState == cloning the data (online)', condition_fn=lambda v: v.get('WorkerState') == 'cloning the' ' data (online)') logging.debug('Worker is in copy state, starting reparent now') utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80', '-new_master', shard_0_replica.tablet_alias], auto_log=True) utils.run_vtctl( ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-', '-new_master', shard_1_replica.tablet_alias], auto_log=True) utils.wait_procs([workerclient_proc]) # Verify that we were forced to re-resolve and retry. worker_vars = utils.get_vars(worker_port) self.assertGreater(worker_vars['WorkerRetryCount'], 1, "expected vtworker to retry each of the two reparented" " destination masters at least once, but it didn't") self.assertNotEqual(worker_vars['WorkerRetryCount'], {}, "expected vtworker to retry, but it didn't") utils.kill_sub_process(worker_proc, soft=True) # Wait for the destination RDONLYs to catch up or the following offline # clone will try to insert rows which already exist. # TODO(mberlin): Remove this once SplitClone supports it natively. utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1) utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1) # Run final offline clone to enable filtered replication. _, _ = utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitClone', '--online=false', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], auto_log=True) # Make sure that everything is caught up to the same replication point self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets) self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets) self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica) self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
def test_vertical_split(self): utils.run_vtctl(["CreateKeyspace", "source_keyspace"]) utils.run_vtctl( [ "CreateKeyspace", "--served_from", "master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace", "destination_keyspace", ] ) source_master.init_tablet("master", "source_keyspace", "0") source_replica.init_tablet("replica", "source_keyspace", "0") source_rdonly1.init_tablet("rdonly", "source_keyspace", "0") source_rdonly2.init_tablet("rdonly", "source_keyspace", "0") # rebuild destination keyspace to make sure there is a serving # graph entry, even though there is no tablet yet. utils.run_vtctl(["RebuildKeyspaceGraph", "source_keyspace"], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "destination_keyspace"], auto_log=True) self._check_srv_keyspace( "ServedFrom(master): source_keyspace\n" + "ServedFrom(rdonly): source_keyspace\n" + "ServedFrom(replica): source_keyspace\n" ) destination_master.init_tablet("master", "destination_keyspace", "0") destination_replica.init_tablet("replica", "destination_keyspace", "0") destination_rdonly1.init_tablet("rdonly", "destination_keyspace", "0") destination_rdonly2.init_tablet("rdonly", "destination_keyspace", "0") utils.run_vtctl(["RebuildKeyspaceGraph", "source_keyspace"], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "destination_keyspace"], auto_log=True) self._check_srv_keyspace( "ServedFrom(master): source_keyspace\n" + "ServedFrom(rdonly): source_keyspace\n" + "ServedFrom(replica): source_keyspace\n" ) # create databases so vttablet can start behaving normally for t in [source_master, source_replica, source_rdonly1, source_rdonly2]: t.create_db("vt_source_keyspace") t.start_vttablet(wait_for_state=None) destination_master.start_vttablet(wait_for_state=None, target_tablet_type="replica") for t in [destination_replica, destination_rdonly1, destination_rdonly2]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [source_master, source_replica, source_rdonly1, source_rdonly2]: t.wait_for_vttablet_state("SERVING") for t in [destination_master, destination_replica, destination_rdonly1, destination_rdonly2]: t.wait_for_vttablet_state("NOT_SERVING") # reparent to make the tablets work utils.run_vtctl(["InitShardMaster", "source_keyspace/0", source_master.tablet_alias], auto_log=True) utils.run_vtctl(["InitShardMaster", "destination_keyspace/0", destination_master.tablet_alias], auto_log=True) # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values("moving1", 100) moving2_first = self._insert_values("moving2", 100) staying1_first = self._insert_values("staying1", 100) staying2_first = self._insert_values("staying2", 100) self._check_values(source_master, "vt_source_keyspace", "moving1", moving1_first, 100) self._check_values(source_master, "vt_source_keyspace", "moving2", moving2_first, 100) self._check_values(source_master, "vt_source_keyspace", "staying1", staying1_first, 100) self._check_values(source_master, "vt_source_keyspace", "staying2", staying2_first, 100) self._check_values(source_master, "vt_source_keyspace", "view1", moving1_first, 100) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtctl( ["CopySchemaShard", "--tables", "moving.*,view1", source_rdonly1.tablet_alias, "destination_keyspace/0"], auto_log=True, ) utils.run_vtworker( [ "--cell", "test_nj", "--command_display_interval", "10ms", "VerticalSplitClone", "--tables", "moving.*,view1", "--strategy=-populate_blp_checkpoint", "--source_reader_count", "10", "--min_table_size_for_split", "1", "destination_keyspace/0", ], auto_log=True, ) utils.run_vtctl(["ChangeSlaveType", source_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", source_rdonly2.tablet_alias, "rdonly"], auto_log=True) # check values are present self._check_values(destination_master, "vt_destination_keyspace", "moving1", moving1_first, 100) self._check_values(destination_master, "vt_destination_keyspace", "moving2", moving2_first, 100) self._check_values(destination_master, "vt_destination_keyspace", "view1", moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values("moving1", 100) staying1_first_add1 = self._insert_values("staying1", 100) moving2_first_add1 = self._insert_values("moving2", 100) self._check_values_timeout(destination_master, "vt_destination_keyspace", "moving1", moving1_first_add1, 100) self._check_values_timeout(destination_master, "vt_destination_keyspace", "moving2", moving2_first_add1, 100) # use vtworker to compare the data logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker(["-cell", "test_nj", "VerticalSplitDiff", "destination_keyspace/0"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", source_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", source_rdonly2.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", destination_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", destination_rdonly2.tablet_alias, "rdonly"], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn("Binlog player state: Running", destination_master_status) self.assertIn("moving.*", destination_master_status) self.assertIn( "<td><b>All</b>: 1000<br><b>Query</b>: 700<br><b>Transaction</b>: 300<br></td>", destination_master_status ) self.assertIn("</html>", destination_master_status) # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars["TabletStateName"], "NOT_SERVING") # check we can't migrate the master just yet utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "master"], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl(["MigrateServedFrom", "--cells=test_ny", "destination_keyspace/0", "rdonly"], auto_log=True) self._check_srv_keyspace( "ServedFrom(master): source_keyspace\n" + "ServedFrom(rdonly): source_keyspace\n" + "ServedFrom(replica): source_keyspace\n" ) self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json(["GetKeyspace", "destination_keyspace"]) found = False for ksf in keyspace_json["served_froms"]: if ksf["tablet_type"] == 4: found = True self.assertEqual(ksf["cells"], ["test_nj"]) self.assertTrue(found) utils.run_vtctl( [ "SetKeyspaceServedFrom", "-source=source_keyspace", "-remove", "-cells=test_nj", "destination_keyspace", "rdonly", ], auto_log=True, ) keyspace_json = utils.run_vtctl_json(["GetKeyspace", "destination_keyspace"]) found = False for ksf in keyspace_json["served_froms"]: if ksf["tablet_type"] == 4: found = True self.assertFalse(found) utils.run_vtctl( ["SetKeyspaceServedFrom", "-source=source_keyspace", "destination_keyspace", "rdonly"], auto_log=True ) keyspace_json = utils.run_vtctl_json(["GetKeyspace", "destination_keyspace"]) found = False for ksf in keyspace_json["served_froms"]: if ksf["tablet_type"] == 4: found = True self.assertNotIn("cells", ksf) self.assertTrue(found) # now serve rdonly from the destination shards utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "rdonly"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n" + "ServedFrom(replica): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection( "source_keyspace", "destination_keyspace", ["rdonly"], ["master", "replica"], ["moving1", "moving2"] ) # then serve replica from the destination shards utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "replica"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection( "source_keyspace", "destination_keyspace", ["replica", "rdonly"], ["master"], ["moving1", "moving2"] ) # move replica back and forth utils.run_vtctl(["MigrateServedFrom", "-reverse", "destination_keyspace/0", "replica"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n" + "ServedFrom(replica): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "replica"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection( "source_keyspace", "destination_keyspace", ["replica", "rdonly"], ["master"], ["moving1", "moving2"] ) # then serve master from the destination shards utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "master"], auto_log=True) self._check_srv_keyspace("") self._check_blacklisted_tables(source_master, ["moving.*", "view1"]) self._check_blacklisted_tables(source_replica, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection( "source_keyspace", "destination_keyspace", ["replica", "rdonly", "master"], [], ["moving1", "moving2"] ) # check 'vtctl SetShardTabletControl' command works as expected: # clear the rdonly entry, re-add it, and then clear all entries. utils.run_vtctl(["SetShardTabletControl", "--remove", "source_keyspace/0", "rdonly"], auto_log=True) shard_json = utils.run_vtctl_json(["GetShard", "source_keyspace/0"]) self.assertEqual(len(shard_json["tablet_controls"]), 2) for tc in shard_json["tablet_controls"]: self.assertIn( tc["tablet_type"], [tablet.Tablet.tablet_type_value["MASTER"], tablet.Tablet.tablet_type_value["REPLICA"]], ) utils.run_vtctl( ["SetShardTabletControl", "--tables=moving.*,view1", "source_keyspace/0", "rdonly"], auto_log=True ) shard_json = utils.run_vtctl_json(["GetShard", "source_keyspace/0"]) for tc in shard_json["tablet_controls"]: if tc["tablet_type"] == 4: break self.assertEqual(["moving.*", "view1"], tc["blacklisted_tables"]) utils.run_vtctl(["SetShardTabletControl", "--remove", "source_keyspace/0", "rdonly"], auto_log=True) utils.run_vtctl(["SetShardTabletControl", "--remove", "source_keyspace/0", "replica"], auto_log=True) utils.run_vtctl(["SetShardTabletControl", "--remove", "source_keyspace/0", "master"], auto_log=True) shard_json = utils.run_vtctl_json(["GetShard", "source_keyspace/0"]) self.assertNotIn("tablet_controls", shard_json) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # check the stats are correct self._check_stats() # kill everything tablet.kill_tablets( [ source_master, source_replica, source_rdonly1, source_rdonly2, destination_master, destination_replica, destination_rdonly1, destination_rdonly2, ] )
def setUpModule(): try: environment.topo_server().setup() setup_procs = [t.init_mysql() for t in all_tablets] utils.Vtctld().start() utils.wait_procs(setup_procs) # Set up binlog stream from shard 0 to shard 1. # Modeled after initial_sharding.py. utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyrange_constants.KIT_UINT64 ]) src_master.init_tablet('replica', 'test_keyspace', '0') src_replica.init_tablet('replica', 'test_keyspace', '0') src_rdonly.init_tablet('rdonly', 'test_keyspace', '0') for t in [src_master, src_replica, src_rdonly]: t.start_vttablet(wait_for_state=None) for t in [src_master, src_replica, src_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', src_master.tablet_alias ], auto_log=True) # Create schema logging.debug('Creating schema...') create_table = '''create table test_table( id bigint auto_increment, keyspace_id bigint(20) unsigned, msg varchar(64), primary key (id), index by_msg (msg) ) Engine=InnoDB''' utils.run_vtctl( ['ApplySchema', '-sql=' + create_table, 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery # (for binlog players) and on the source rdonlys (for workers) utils.run_vtctl(['RunHealthCheck', src_replica.tablet_alias]) utils.run_vtctl(['RunHealthCheck', src_rdonly.tablet_alias]) # Create destination shard (won't be serving as there is no DB) dst_master.init_tablet('replica', 'test_keyspace', '-') dst_replica.init_tablet('replica', 'test_keyspace', '-') dst_rdonly.init_tablet('rdonly', 'test_keyspace', '-') dst_master.start_vttablet(wait_for_state='NOT_SERVING') dst_replica.start_vttablet(wait_for_state='NOT_SERVING') dst_rdonly.start_vttablet(wait_for_state='NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-', dst_master.tablet_alias ], auto_log=True) # copy the schema utils.run_vtctl( ['CopySchemaShard', src_replica.tablet_alias, 'test_keyspace/-'], auto_log=True) # run the clone worker (this is a degenerate case, source and destination # both have the full keyrange. Happens to work correctly). logging.debug('Running the clone worker to start binlog stream...') utils.run_vtworker([ '--cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitClone', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) dst_master.wait_for_binlog_player_count(1) # Wait for dst_replica to be ready. dst_replica.wait_for_binlog_server_state('Enabled') except: tearDownModule() raise
def test_resharding(self): # create the keyspace with just one shard utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_master.init_tablet('master', 'test_keyspace', '0') shard_replica.init_tablet('replica', 'test_keyspace', '0') shard_rdonly1.init_tablet('rdonly', 'test_keyspace', '0') shard_rdonly2.init_tablet('rdonly', 'test_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # create databases so vttablet can start behaving normally for t in [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # wait for the tablets shard_master.wait_for_vttablet_state('SERVING') shard_replica.wait_for_vttablet_state('SERVING') shard_rdonly1.wait_for_vttablet_state('SERVING') shard_rdonly2.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) # create the tables and add startup values self._create_schema() self._insert_startup_values() # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # create the split shards shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_rdonly1.init_tablet('rdonly', 'test_keyspace', '-80') shard_0_rdonly2.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_replica.init_tablet('replica', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly2.init_tablet('rdonly', 'test_keyspace', '80-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_0_rdonly2, shard_1_master, shard_1_replica, shard_1_rdonly1, shard_1_rdonly2 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_0_rdonly2, shard_1_master, shard_1_replica, shard_1_rdonly1, shard_1_rdonly2 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly2.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug("Waiting for binlog players to start on new masters...") shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff for -80") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly2.tablet_alias, 'rdonly'], auto_log=True) logging.debug("Running vtworker SplitDiff for 80-") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # scrap the original tablets in the original shard for t in [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets( [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]) for t in [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_0_rdonly2, shard_1_master, shard_1_replica, shard_1_rdonly1, shard_1_rdonly2 ])
def test_merge_sharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'custom_sharding_key', '--sharding_column_type', keyspace_id_type, '--split_shard_count', '4', 'test_keyspace']) shard_0_master.init_tablet('master', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('master', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('master', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-40', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/40-80', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_2_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the merge shards shard_dest_master.init_tablet('master', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None) # Start masters with enabled healthcheck (necessary for resolving the # destination master). shard_dest_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_dest_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # copy the schema utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80'], auto_log=True) # copy the data (will also start filtered replication), reset source utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_endpoints', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_endpoints', '1', '--source_uid', '0', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_endpoints', '1', '--source_uid', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]) for t in [shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly])
def setUpModule(): try: environment.topo_server().setup() setup_procs = [ src_master.init_mysql(), src_replica.init_mysql(), src_rdonly1.init_mysql(), src_rdonly2.init_mysql(), dst_master.init_mysql(), dst_replica.init_mysql(), ] utils.Vtctld().start() utils.wait_procs(setup_procs) # Set up binlog stream from shard 0 to shard 1. # Modeled after initial_sharding.py. utils.run_vtctl(["CreateKeyspace", "test_keyspace"]) utils.run_vtctl( ["SetKeyspaceShardingInfo", "-force", "test_keyspace", "keyspace_id", keyrange_constants.KIT_UINT64] ) src_master.init_tablet("master", "test_keyspace", "0") src_replica.init_tablet("replica", "test_keyspace", "0") src_rdonly1.init_tablet("rdonly", "test_keyspace", "0") src_rdonly2.init_tablet("rdonly", "test_keyspace", "0") utils.run_vtctl(["RebuildShardGraph", "test_keyspace/0"]) utils.validate_topology() utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) for t in [src_master, src_replica, src_rdonly1, src_rdonly2]: t.create_db("vt_test_keyspace") t.start_vttablet(wait_for_state=None) for t in [src_master, src_replica, src_rdonly1, src_rdonly2]: t.wait_for_vttablet_state("SERVING") utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/0", src_master.tablet_alias], auto_log=True) # Create schema logging.debug("Creating schema...") create_table = """create table test_table( id bigint auto_increment, keyspace_id bigint(20) unsigned, msg varchar(64), primary key (id), index by_msg (msg) ) Engine=InnoDB""" utils.run_vtctl(["ApplySchemaKeyspace", "-simple", "-sql=" + create_table, "test_keyspace"], auto_log=True) # Create destination shard. dst_master.init_tablet("master", "test_keyspace", "1") dst_replica.init_tablet("replica", "test_keyspace", "1") dst_master.start_vttablet(wait_for_state="NOT_SERVING") dst_replica.start_vttablet(wait_for_state="NOT_SERVING") utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/1", dst_master.tablet_alias], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) # copy the schema utils.run_vtctl(["CopySchemaShard", src_replica.tablet_alias, "test_keyspace/1"], auto_log=True) # run the clone worked (this is a degenerate case, source and destination # both have the full keyrange. Happens to work correctly). logging.debug("Running the clone worker to start binlog stream...") utils.run_vtworker( [ "--cell", "test_nj", "SplitClone", "--strategy=-populate_blp_checkpoint", "--source_reader_count", "10", "--min_table_size_for_split", "1", "test_keyspace/0", ], auto_log=True, ) dst_master.wait_for_binlog_player_count(1) # Wait for dst_replica to be ready. dst_replica.wait_for_binlog_server_state("Enabled") except: tearDownModule() raise
def test_vertical_split(self): utils.run_vtctl(['CreateKeyspace', 'source_keyspace']) utils.run_vtctl([ 'CreateKeyspace', '--served_from', 'master:source_keyspace,replica:source_keyspace,rdonly:' 'source_keyspace', 'destination_keyspace' ]) source_master.init_tablet('master', 'source_keyspace', '0') source_replica.init_tablet('replica', 'source_keyspace', '0') source_rdonly1.init_tablet('rdonly', 'source_keyspace', '0') source_rdonly2.init_tablet('rdonly', 'source_keyspace', '0') # rebuild destination keyspace to make sure there is a serving # graph entry, even though there is no tablet yet. utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(rdonly): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') destination_master.init_tablet('master', 'destination_keyspace', '0') destination_replica.init_tablet('replica', 'destination_keyspace', '0') destination_rdonly1.init_tablet('rdonly', 'destination_keyspace', '0') destination_rdonly2.init_tablet('rdonly', 'destination_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(rdonly): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') # create databases so vttablet can start behaving normally for t in [ source_master, source_replica, source_rdonly1, source_rdonly2 ]: t.create_db('vt_source_keyspace') t.start_vttablet(wait_for_state=None) destination_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ destination_replica, destination_rdonly1, destination_rdonly2 ]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [ source_master, source_replica, source_rdonly1, source_rdonly2 ]: t.wait_for_vttablet_state('SERVING') for t in [ destination_master, destination_replica, destination_rdonly1, destination_rdonly2 ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'source_keyspace/0', source_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'destination_keyspace/0', destination_master.tablet_alias ], auto_log=True) # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values('moving1', 100) moving2_first = self._insert_values('moving2', 100) staying1_first = self._insert_values('staying1', 100) staying2_first = self._insert_values('staying2', 100) self._check_values(source_master, 'vt_source_keyspace', 'moving1', moving1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'moving2', moving2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying1', staying1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying2', staying2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'view1', moving1_first, 100) # run a health check on source replica so it responds to discovery # (for binlog players) and on the source rdonlys (for workers) utils.run_vtctl( ['RunHealthCheck', source_replica.tablet_alias, 'replica']) for t in [source_rdonly1, source_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtctl([ 'CopySchemaShard', '--tables', 'moving.*,view1', source_rdonly1.tablet_alias, 'destination_keyspace/0' ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'VerticalSplitClone', '--tables', 'moving.*,view1', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'destination_keyspace/0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', source_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', source_rdonly2.tablet_alias, 'rdonly'], auto_log=True) # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) _ = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use vtworker to compare the data for t in [destination_rdonly1, destination_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) logging.debug('Running vtworker VerticalSplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', source_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', source_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', destination_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', destination_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn('Binlog player state: Running', destination_master_status) self.assertIn('moving.*', destination_master_status) self.assertIn( '<td><b>All</b>: 1000<br><b>Query</b>: 700<br>' '<b>Transaction</b>: 300<br></td>', destination_master_status) self.assertIn('</html>', destination_master_status) # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING') # check we can't migrate the master just yet utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl([ 'MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0', 'rdonly' ], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(rdonly): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertEqual(ksf['cells'], ['test_nj']) self.assertTrue(found) utils.run_vtctl([ 'SetKeyspaceServedFrom', '-source=source_keyspace', '-remove', '-cells=test_nj', 'destination_keyspace', 'rdonly' ], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertFalse(found) utils.run_vtctl([ 'SetKeyspaceServedFrom', '-source=source_keyspace', 'destination_keyspace', 'rdonly' ], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertNotIn('cells', ksf) self.assertTrue(found) # now serve rdonly from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl([ 'MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica' ], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', [], ['moving1', 'moving2']) # check 'vtctl SetShardTabletControl' command works as expected: # clear the rdonly entry, re-add it, and then clear all entries. utils.run_vtctl([ 'SetShardTabletControl', '--remove', 'source_keyspace/0', 'rdonly' ], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertEqual(len(shard_json['tablet_controls']), 2) for tc in shard_json['tablet_controls']: self.assertIn(tc['tablet_type'], [topodata_pb2.MASTER, topodata_pb2.REPLICA]) utils.run_vtctl([ 'SetShardTabletControl', '--tables=moving.*,view1', 'source_keyspace/0', 'rdonly' ], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) for tc in shard_json['tablet_controls']: if tc['tablet_type'] == topodata_pb2.RDONLY: break self.assertEqual(['moving.*', 'view1'], tc['blacklisted_tables']) utils.run_vtctl([ 'SetShardTabletControl', '--remove', 'source_keyspace/0', 'rdonly' ], auto_log=True) utils.run_vtctl([ 'SetShardTabletControl', '--remove', 'source_keyspace/0', 'replica' ], auto_log=True) utils.run_vtctl([ 'SetShardTabletControl', '--remove', 'source_keyspace/0', 'master' ], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertNotIn('tablet_controls', shard_json) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # check the stats are correct self._check_stats() # kill everything tablet.kill_tablets([ source_master, source_replica, source_rdonly1, source_rdonly2, destination_master, destination_replica, destination_rdonly1, destination_rdonly2 ])
def test_vertical_split(self): utils.run_vtctl(['CreateKeyspace', 'source_keyspace']) utils.run_vtctl(['CreateKeyspace', '--served-from', 'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace', 'destination_keyspace']) source_master.init_tablet('master', 'source_keyspace', '0') source_replica.init_tablet('replica', 'source_keyspace', '0') source_rdonly.init_tablet('rdonly', 'source_keyspace', '0') destination_master.init_tablet('master', 'destination_keyspace', '0') destination_replica.init_tablet('replica', 'destination_keyspace', '0') destination_rdonly.init_tablet('rdonly', 'destination_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') # create databases so vttablet can start behaving normally for t in [source_master, source_replica, source_rdonly]: t.create_db('vt_source_keyspace') t.start_vttablet(wait_for_state=None) for t in [destination_master, destination_replica, destination_rdonly]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [source_master, source_replica, source_rdonly]: t.wait_for_vttablet_state('SERVING') for t in [destination_master, destination_replica, destination_rdonly]: t.wait_for_vttablet_state('CONNECTING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'source_keyspace/0', source_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'destination_keyspace/0', destination_master.tablet_alias], auto_log=True) # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values('moving1', 100) moving2_first = self._insert_values('moving2', 100) staying1_first = self._insert_values('staying1', 100) staying2_first = self._insert_values('staying2', 100) self._check_values(source_master, 'vt_source_keyspace', 'moving1', moving1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'moving2', moving2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying1', staying1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying2', staying2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'view1', moving1_first, 100) # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--tables', 'moving1,moving2,view1', source_rdonly.tablet_alias], auto_log=True) # perform the restore. utils.run_vtctl(['ShardMultiRestore', '--strategy' ,'populateBlpCheckpoint', '--tables', 'moving1,moving2', 'destination_keyspace/0', source_rdonly.tablet_alias], auto_log=True) # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) staying1_first_add1 = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use the vtworker checker to compare the data logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', destination_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving1', 'moving2']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['rdonly'], ['master', 'replica']) # then serve replica from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving1', 'moving2']) self._check_blacklisted_tables(source_rdonly, ['moving1', 'moving2']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master']) # move replica back and forth utils.run_vtctl(['MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving1', 'moving2']) utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving1', 'moving2']) self._check_blacklisted_tables(source_rdonly, ['moving1', 'moving2']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master']) # then serve master from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving1', 'moving2']) self._check_blacklisted_tables(source_replica, ['moving1', 'moving2']) self._check_blacklisted_tables(source_rdonly, ['moving1', 'moving2']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly', 'master'], []) # check 'vtctl SetBlacklistedTables' command works as expected utils.run_vtctl(['SetBlacklistedTables', source_master.tablet_alias, 'moving1,moving2,view1'], auto_log=True) self._check_blacklisted_tables(source_master, ['moving1', 'moving2', 'view1']) utils.run_vtctl(['SetBlacklistedTables', source_master.tablet_alias], auto_log=True) self._check_blacklisted_tables(source_master, None) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # kill everything tablet.kill_tablets([source_master, source_replica, source_rdonly, destination_master, destination_replica, destination_rdonly])
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet('master', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') shard_master.start_vttablet(wait_for_state=None) shard_replica.start_vttablet(wait_for_state=None) shard_rdonly1.start_vttablet(wait_for_state=None) shard_master.wait_for_vttablet_state('SERVING') for t in [shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start( cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [ shard_0_master, shard_0_replica, shard_1_master, shard_1_replica ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_rdonly1, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_1_master]: t.wait_for_vttablet_state('SERVING') for t in [ shard_0_replica, shard_0_rdonly1, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. # we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
def verify_successful_worker_copy_with_reparent(self, mysql_down=False): """Verifies that vtworker can successfully copy data for a SplitClone. Order of operations: 1. Run a background vtworker 2. Wait until the worker successfully resolves the destination masters. 3. Reparent the destination tablets 4. Wait until the vtworker copy is finished 5. Verify that the worker was forced to reresolve topology and retry writes due to the reparent. 6. Verify that the data was copied successfully to both new shards Args: mysql_down: boolean. If True, we take down the MySQL instances on the destination masters at first, then bring them back and reparent away. Raises: AssertionError if things didn't go as expected. """ if mysql_down: logging.debug('Shutting down mysqld on destination masters.') utils.wait_procs([ shard_0_master.shutdown_mysql(), shard_1_master.shutdown_mysql() ]) worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj'], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # --chunk_count is 2 because rows are currently ordered by primary key such # that all rows of the first shard come first and then the second shard. # TODO(mberlin): Remove --offline=false once vtworker ensures that the # destination shards are not behind the master's replication # position. args = [ 'SplitClone', '--offline=false', '--destination_writer_count', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999' ] if not mysql_down: # Make the clone as slow as necessary such that there is enough time to # run PlannedReparent in the meantime. # TOOD(mberlin): Once insert_values is fixed to uniformly distribute the # rows across shards when sorted by primary key, remove # --chunk_count 2, --min_rows_per_chunk 1 and set # --source_reader_count back to 1. args.extend([ '--source_reader_count', '2', '--chunk_count', '2', '--min_rows_per_chunk', '1', '--write_query_max_rows', '1' ]) args.append('test_keyspace/0') workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port) if mysql_down: # If MySQL is down, we wait until vtworker retried at least once to make # sure it reached the point where a write failed due to MySQL being down. # There should be two retries at least, one for each destination shard. utils.poll_for_vars( 'vtworker', worker_port, 'WorkerRetryCount >= 2', condition_fn=lambda v: v.get('WorkerRetryCount') >= 2) logging.debug( 'Worker has retried at least twice, starting reparent now') # vtworker is blocked at this point. This is a good time to test that its # throttler server is reacting to RPCs. self.check_throttler_service( 'localhost:%d' % worker_rpc_port, ['test_keyspace/-80', 'test_keyspace/80-'], 9999) # Bring back masters. Since we test with semi-sync now, we need at least # one replica for the new master. This test is already quite expensive, # so we bring back the old master as a replica rather than having a third # replica up the whole time. logging.debug('Restarting mysqld on destination masters') utils.wait_procs( [shard_0_master.start_mysql(), shard_1_master.start_mysql()]) # Reparent away from the old masters. utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/-80', shard_0_replica.tablet_alias ], auto_log=True) utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-', shard_1_replica.tablet_alias ], auto_log=True) else: # NOTE: There is a race condition around this: # It's possible that the SplitClone vtworker command finishes before the # PlannedReparentShard vtctl command, which we start below, succeeds. # Then the test would fail because vtworker did not have to retry. # # To workaround this, the test takes a parameter to increase the number of # rows that the worker has to copy (with the idea being to slow the worker # down). # You should choose a value for num_insert_rows, such that this test # passes for your environment (trial-and-error...) # Make sure that vtworker got past the point where it picked a master # for each destination shard ("finding targets" state). utils.poll_for_vars( 'vtworker', worker_port, 'WorkerState == cloning the data (online)', condition_fn=lambda v: v.get('WorkerState') == 'cloning the' ' data (online)') logging.debug('Worker is in copy state, starting reparent now') utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/-80', shard_0_replica.tablet_alias ], auto_log=True) utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-', shard_1_replica.tablet_alias ], auto_log=True) utils.wait_procs([workerclient_proc]) # Verify that we were forced to re-resolve and retry. worker_vars = utils.get_vars(worker_port) self.assertGreater( worker_vars['WorkerRetryCount'], 1, "expected vtworker to retry each of the two reparented" " destination masters at least once, but it didn't") self.assertNotEqual(worker_vars['WorkerRetryCount'], {}, "expected vtworker to retry, but it didn't") utils.kill_sub_process(worker_proc, soft=True) # Wait for the destination RDONLYs to catch up or the following offline # clone will try to insert rows which already exist. # TODO(mberlin): Remove this once SplitClone supports it natively. utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1) utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1) # Run final offline clone to enable filtered replication. _, _ = utils.run_vtworker([ '-cell', 'test_nj', 'SplitClone', '--online=false', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) # Make sure that everything is caught up to the same replication point self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets) self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets) self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica) self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
def test_vertical_split(self): utils.run_vtctl([ 'CopySchemaShard', '--tables', 'moving.*,view1', source_rdonly1.tablet_alias, 'destination_keyspace/0' ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'VerticalSplitClone', '--tables', 'moving.*,view1', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'destination_keyspace/0' ], auto_log=True) # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', self.moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', self.moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', self.moving1_first, 100) # check the binlog player is running and exporting vars self.check_destination_master(destination_master, ['source_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(source_replica, horizontal=False) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) _ = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) self.check_binlog_player_vars(destination_master, ['source_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(source_replica, horizontal=False, min_statements=100, min_transactions=100) # use vtworker to compare the data logging.debug('Running vtworker VerticalSplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'VerticalSplitDiff', '--min_healthy_rdonly_tablets', '1', 'destination_keyspace/0' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablet, make sure we have it all self.check_running_binlog_player(destination_master, 700, 300, extra_text='moving.*') # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING') # check we can't migrate the master just yet utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl([ 'MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0', 'rdonly' ], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(rdonly): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertEqual(ksf['cells'], ['test_nj']) self.assertTrue(found) utils.run_vtctl([ 'SetKeyspaceServedFrom', '-source=source_keyspace', '-remove', '-cells=test_nj', 'destination_keyspace', 'rdonly' ], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertFalse(found) utils.run_vtctl([ 'SetKeyspaceServedFrom', '-source=source_keyspace', 'destination_keyspace', 'rdonly' ], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertNotIn('cells', ksf) self.assertTrue(found) # now serve rdonly from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl([ 'MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica' ], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('destination_keyspace', ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) # check the binlog player is gone now self.check_no_binlog_player(destination_master) # check the stats are correct self._check_stats() # now remove the tables on the source shard. The blacklisted tables # in the source shard won't match any table, make sure that works. utils.run_vtctl( ['ApplySchema', '-sql=drop view view1', 'source_keyspace'], auto_log=True) for t in ['moving1', 'moving2']: utils.run_vtctl( ['ApplySchema', '-sql=drop table %s' % (t), 'source_keyspace'], auto_log=True) for t in [ source_master, source_replica, source_rdonly1, source_rdonly2 ]: utils.run_vtctl(['ReloadSchema', t.tablet_alias]) qr = source_master.execute('select count(1) from staying1') self.assertEqual(len(qr['rows']), 1, 'cannot read staying1: got %s' % str(qr)) # test SetShardTabletControl self._verify_vtctl_set_shard_tablet_control()
def test_resharding(self): # create the keyspace with just one shard utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_master.init_tablet( 'master', 'test_keyspace', '0') shard_replica.init_tablet('replica', 'test_keyspace', '0') shard_rdonly.init_tablet( 'rdonly', 'test_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # create databases so vttablet can start behaving normally for t in [shard_master, shard_replica, shard_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # wait for the tablets shard_master.wait_for_vttablet_state('SERVING') shard_replica.wait_for_vttablet_state('SERVING') shard_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) # create the tables and add startup values self._create_schema() self._insert_startup_values() # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # create the split shards shard_0_master.init_tablet( 'master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_rdonly.init_tablet( 'rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet( 'master', 'test_keyspace', '80-') shard_1_replica.init_tablet('replica', 'test_keyspace', '80-') shard_1_rdonly.init_tablet( 'rdonly', 'test_keyspace', '80-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]: t.wait_for_vttablet_state('CONNECTING') utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--spec=-80-', shard_replica.tablet_alias], auto_log=True) # wait for tablet's binlog server service to be enabled after snapshot shard_replica.wait_for_binlog_server_state("Enabled") # perform the restore. utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/-80', shard_replica.tablet_alias], auto_log=True) utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/80-', shard_replica.tablet_alias], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff for -80") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug("Running vtworker SplitDiff for 80-") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # scrap the original tablets in the original shard for t in [shard_master, shard_replica, shard_rdonly]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_master, shard_replica, shard_rdonly]) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly])
def test_merge_sharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'custom_sharding_key', '--sharding_column_type', keyspace_id_type, 'test_keyspace' ]) shard_0_master.init_tablet('master', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('master', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('master', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') # rebuild and check SrvKeyspace utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_sharding_key') # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # masters will be serving for t in [shard_0_master, shard_1_master, shard_2_master]: t.wait_for_vttablet_state('SERVING') # slaves won't be, no replication state for t in [ shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly, shard_2_replica, shard_2_rdonly ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-40', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/40-80', shard_1_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_2_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the merge shards shard_dest_master.init_tablet('master', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the destination shard (no db created, # so they're all not serving) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_dest_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # copy the schema utils.run_vtctl([ 'CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80' ], auto_log=True) # copy the data (will also start filtered replication), reset source utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master( shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars( shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias]) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '0', 'test_keyspace/-80' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '1', 'test_keyspace/-80' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json([ 'VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias ]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly ]) for t in [ shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly ])
def test_vertical_split(self): utils.run_vtctl(['CreateKeyspace', 'source_keyspace']) utils.run_vtctl(['CreateKeyspace', '--served_from', 'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace', 'destination_keyspace']) source_master.init_tablet('master', 'source_keyspace', '0') source_replica.init_tablet('replica', 'source_keyspace', '0') source_rdonly1.init_tablet('rdonly', 'source_keyspace', '0') source_rdonly2.init_tablet('rdonly', 'source_keyspace', '0') # rebuild destination keyspace to make sure there is a serving # graph entry, even though there is no tablet yet. utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') destination_master.init_tablet('master', 'destination_keyspace', '0') destination_replica.init_tablet('replica', 'destination_keyspace', '0') destination_rdonly1.init_tablet('rdonly', 'destination_keyspace', '0') destination_rdonly2.init_tablet('rdonly', 'destination_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') # create databases so vttablet can start behaving normally for t in [source_master, source_replica, source_rdonly1, source_rdonly2]: t.create_db('vt_source_keyspace') t.start_vttablet(wait_for_state=None) destination_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [destination_replica, destination_rdonly1, destination_rdonly2]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [source_master, source_replica, source_rdonly1, source_rdonly2]: t.wait_for_vttablet_state('SERVING') for t in [destination_master, destination_replica, destination_rdonly1, destination_rdonly2]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'source_keyspace/0', source_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'destination_keyspace/0', destination_master.tablet_alias], auto_log=True) # read all the keyspaces, this will populate the topology cache. self._populate_topo_cache() # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values('moving1', 100) moving2_first = self._insert_values('moving2', 100) staying1_first = self._insert_values('staying1', 100) staying2_first = self._insert_values('staying2', 100) self._check_values(source_master, 'vt_source_keyspace', 'moving1', moving1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'moving2', moving2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying1', staying1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying2', staying2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'view1', moving1_first, 100) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtctl(['CopySchemaShard', '--tables', 'moving.*,view1', source_rdonly1.tablet_alias, 'destination_keyspace/0'], auto_log=True) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'VerticalSplitClone', '--tables', 'moving.*,view1', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'destination_keyspace/0'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly2.tablet_alias, 'rdonly'], auto_log=True) topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace') # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) staying1_first_add1 = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use the vtworker checker to compare the data logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', destination_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', destination_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn('Binlog player state: Running', destination_master_status) self.assertIn('moving.*', destination_master_status) self.assertIn('<td><b>All</b>: 1000<br><b>Query</b>: 700<br><b>Transaction</b>: 300<br></td>', destination_master_status) self.assertIn('</html>', destination_master_status) # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING') # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl(['MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json(['GetKeyspace', 'destination_keyspace']) self.assertEqual(keyspace_json['ServedFromMap']['rdonly']['Cells'], ['test_nj']) utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace', '-remove', '-cells=test_nj', 'destination_keyspace', 'rdonly'], auto_log=True) keyspace_json = utils.run_vtctl_json(['GetKeyspace', 'destination_keyspace']) self.assertFalse('rdonly' in keyspace_json['ServedFromMap']) utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace', 'destination_keyspace', 'rdonly'], auto_log=True) keyspace_json = utils.run_vtctl_json(['GetKeyspace', 'destination_keyspace']) self.assertEqual(keyspace_json['ServedFromMap']['rdonly']['Cells'], None) # now serve rdonly from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'source_keyspace', 'destination_keyspace', ['rdonly'], ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl(['MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'source_keyspace', 'destination_keyspace', ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2']) # check 'vtctl SetShardTabletControl' command works as expected: # clear the rdonly entry, re-add it, and then clear all entries. utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0', 'rdonly'], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertNotIn('rdonly', shard_json['TabletControlMap']) self.assertIn('replica', shard_json['TabletControlMap']) self.assertIn('master', shard_json['TabletControlMap']) utils.run_vtctl(['SetShardTabletControl', '--tables=moving.*,view1', 'source_keyspace/0', 'rdonly'], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertEqual(['moving.*', 'view1'], shard_json['TabletControlMap']['rdonly']['BlacklistedTables']) utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0', 'rdonly'], auto_log=True) utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0', 'replica'], auto_log=True) utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0', 'master'], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertEqual(None, shard_json['TabletControlMap']) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # optional method to check the stats are correct self._check_stats() # kill everything tablet.kill_tablets([source_master, source_replica, source_rdonly1, source_rdonly2, destination_master, destination_replica, destination_rdonly1, destination_rdonly2])
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>' '<b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low') monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high') # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias, 'replica']) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias, 'replica'], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug('Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d', monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d', monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl(['SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-'], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # delete the original tablets in the original shard tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl( ['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1])
def test_vertical_split(self): utils.run_vtctl(['CreateKeyspace', 'source_keyspace']) utils.run_vtctl(['CreateKeyspace', '--served-from', 'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace', 'destination_keyspace']) source_master.init_tablet('master', 'source_keyspace', '0') source_replica.init_tablet('replica', 'source_keyspace', '0') source_rdonly.init_tablet('rdonly', 'source_keyspace', '0') # rebuild destination keyspace to make sure there is a serving # graph entry, even though there is no tablet yet. utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') destination_master.init_tablet('master', 'destination_keyspace', '0') destination_replica.init_tablet('replica', 'destination_keyspace', '0') destination_rdonly.init_tablet('rdonly', 'destination_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') # create databases so vttablet can start behaving normally for t in [source_master, source_replica, source_rdonly]: t.create_db('vt_source_keyspace') t.start_vttablet(wait_for_state=None) for t in [destination_master, destination_replica, destination_rdonly]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [source_master, source_replica, source_rdonly]: t.wait_for_vttablet_state('SERVING') for t in [destination_master, destination_replica, destination_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'source_keyspace/0', source_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'destination_keyspace/0', destination_master.tablet_alias], auto_log=True) # read all the keyspaces, this will populate the topology cache. self._populate_topo_cache() # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values('moving1', 100) moving2_first = self._insert_values('moving2', 100) staying1_first = self._insert_values('staying1', 100) staying2_first = self._insert_values('staying2', 100) self._check_values(source_master, 'vt_source_keyspace', 'moving1', moving1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'moving2', moving2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying1', staying1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying2', staying2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'view1', moving1_first, 100) utils.pause("Before multisnapshot") # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--tables', 'moving.*,view1', source_rdonly.tablet_alias], auto_log=True) # perform the restore. utils.run_vtctl(['ShardMultiRestore', '--strategy' ,'populateBlpCheckpoint', '--tables', 'moving.*', 'destination_keyspace/0', source_rdonly.tablet_alias], auto_log=True) topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace') # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) staying1_first_add1 = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use the vtworker checker to compare the data logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', destination_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn('Binlog player state: Running', destination_master_status) self.assertIn('moving.*', destination_master_status) self.assertIn('<td><b>All</b>: 1000<br><b>Query</b>: 700<br><b>Transaction</b>: 300<br></td>', destination_master_status) self.assertIn('</html>', destination_master_status) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['rdonly'], ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*']) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl(['MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving.*']) utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*']) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*']) self._check_blacklisted_tables(source_replica, ['moving.*']) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2']) # check 'vtctl SetBlacklistedTables' command works as expected utils.run_vtctl(['SetBlacklistedTables', source_master.tablet_alias, 'moving.*,view1'], auto_log=True) self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) utils.run_vtctl(['SetBlacklistedTables', source_master.tablet_alias], auto_log=True) self._check_blacklisted_tables(source_master, None) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # optional method to check the stats are correct self._check_stats() # kill everything tablet.kill_tablets([source_master, source_replica, source_rdonly, destination_master, destination_replica, destination_rdonly])
def test_merge_sharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'custom_ksid_col', '--sharding_column_type', base_sharding.keyspace_id_type, 'test_keyspace']) shard_0_master.init_tablet('replica', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('replica', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('replica', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') # rebuild and check SrvKeyspace utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) # won't be serving, no replication state for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_2_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the merge shards shard_dest_master.init_tablet('replica', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the destination shard (no db created, # so they're all not serving) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_dest_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # copy the schema utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80'], auto_log=True) # copy the data (will also start filtered replication), reset source # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false'], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_dest_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Update row 2 (provokes an update). shard_dest_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-2' where id=2", write=True) # Insert row 0 (provokes a delete). self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0', 0x5000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablets, which were taken offline, back to rdonly. utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 1, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 2) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias]) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '2', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]) for t in [shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly])
def test_resharding(self): # create the keyspace with just one shard utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_master.init_tablet( 'master', 'test_keyspace', '0') shard_replica.init_tablet('replica', 'test_keyspace', '0') shard_rdonly1.init_tablet( 'rdonly', 'test_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # create databases so vttablet can start behaving normally for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # wait for the tablets shard_master.wait_for_vttablet_state('SERVING') shard_replica.wait_for_vttablet_state('SERVING') shard_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) # create the tables and add startup values self._create_schema() self._insert_startup_values() # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # create the split shards shard_0_master.init_tablet( 'master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_rdonly1.init_tablet( 'rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet( 'master', 'test_keyspace', '80-') shard_1_replica.init_tablet('replica', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet( 'rdonly', 'test_keyspace', '80-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -\n' + 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables' ,'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug("Waiting for binlog players to start on new masters...") shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff for -80") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly1.tablet_alias, 'rdonly'], auto_log=True) logging.debug("Running vtworker SplitDiff for 80-") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # scrap the original tablets in the original shard for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1])
def test_resharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_0_master.init_tablet( 'master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_1_master.init_tablet( 'master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # create the split shards shard_2_master.init_tablet( 'master', 'test_keyspace', '80-C0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0') shard_3_master.init_tablet( 'master', 'test_keyspace', 'C0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-') shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly]: t.start_vttablet(wait_for_state=None) shard_2_master.wait_for_vttablet_state('CONNECTING') shard_2_replica1.wait_for_vttablet_state('NOT_SERVING') shard_2_replica2.wait_for_vttablet_state('NOT_SERVING') shard_3_master.wait_for_vttablet_state('CONNECTING') shard_3_replica.wait_for_vttablet_state('NOT_SERVING') shard_3_rdonly.wait_for_vttablet_state('CONNECTING') utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-', shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', '-use-served-types', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica') # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-', shard_1_slave1.tablet_alias], auto_log=True) # wait for tablet's binlog server service to be enabled after snapshot, # and check all the others while we're at it shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restore. utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/80-C0', shard_1_slave1.tablet_alias], auto_log=True) utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/C0-', shard_1_slave1.tablet_alias], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica') # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica') # move replica back and forth utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica') # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0', shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-C0 C0-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica') # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly])
def test_resharding(self): utils.run_vtctl( [ "CreateKeyspace", "--sharding_column_name", "bad_column", "--sharding_column_type", "bytes", "--split_shard_count", "2", "test_keyspace", ] ) utils.run_vtctl(["SetKeyspaceShardingInfo", "test_keyspace", "keyspace_id", "uint64"], expect_fail=True) utils.run_vtctl( [ "SetKeyspaceShardingInfo", "-force", "-split_shard_count", "4", "test_keyspace", "keyspace_id", keyspace_id_type, ] ) shard_0_master.init_tablet("master", "test_keyspace", "-80") shard_0_replica.init_tablet("replica", "test_keyspace", "-80") shard_0_ny_rdonly.init_tablet("rdonly", "test_keyspace", "-80") shard_1_master.init_tablet("master", "test_keyspace", "80-") shard_1_slave1.init_tablet("replica", "test_keyspace", "80-") shard_1_slave2.init_tablet("spare", "test_keyspace", "80-") shard_1_ny_rdonly.init_tablet("rdonly", "test_keyspace", "80-") shard_1_rdonly1.init_tablet("rdonly", "test_keyspace", "80-") utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) ks = utils.run_vtctl_json(["GetSrvKeyspace", "test_nj", "test_keyspace"]) self.assertEqual(ks["split_shard_count"], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1, ]: t.create_db("vt_test_keyspace") t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state("SERVING") shard_0_replica.wait_for_vttablet_state("SERVING") shard_0_ny_rdonly.wait_for_vttablet_state("SERVING") shard_1_master.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("SERVING") shard_1_slave2.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_ny_rdonly.wait_for_vttablet_state("SERVING") shard_1_rdonly1.wait_for_vttablet_state("SERVING") # reparent to make the tablets work utils.run_vtctl(["InitShardMaster", "test_keyspace/-80", shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(["InitShardMaster", "test_keyspace/80-", shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet("master", "test_keyspace", "80-c0") shard_2_replica1.init_tablet("spare", "test_keyspace", "80-c0") shard_2_replica2.init_tablet("spare", "test_keyspace", "80-c0") shard_3_master.init_tablet("master", "test_keyspace", "c0-") shard_3_replica.init_tablet("spare", "test_keyspace", "c0-") shard_3_rdonly1.init_tablet("rdonly", "test_keyspace", "c0-") # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type="replica") for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state("NOT_SERVING") utils.run_vtctl(["InitShardMaster", "test_keyspace/80-c0", shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(["InitShardMaster", "test_keyspace/c0-", shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-\n" + "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ("test_keyspace/80-c0", "test_keyspace/c0-"): utils.run_vtctl( ["CopySchemaShard", "--exclude_tables", "unrelated", shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True, ) utils.run_vtworker( [ "--cell", "test_nj", "--command_display_interval", "10ms", "SplitClone", "--exclude_tables", "unrelated", "--strategy=-populate_blp_checkpoint", "--source_reader_count", "10", "--min_table_size_for_split", "1", "test_keyspace/80-", ], auto_log=True, ) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly1.tablet_alias, "rdonly"], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(["ValidateSchemaKeyspace", "--exclude_tables=unrelated", "test_keyspace"], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ["-cell", "test_nj", "SplitDiff", "--exclude_tables", "unrelated", "test_keyspace/c0-"], auto_log=True ) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn("Binlog player state: Running", shard_2_master_status) self.assertIn( "<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>", shard_2_master_status ) self.assertIn("</html>", shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(["ChangeSlaveType", shard_1_slave2.tablet_alias, "replica"]) utils.run_vtctl(["ChangeSlaveType", shard_1_slave1.tablet_alias, "spare"]) shard_1_slave2.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("NOT_SERVING") # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(["RunHealthCheck", shard_3_master.tablet_alias, "replica"], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(["VtTabletStreamHealth", "-count", "1", master.tablet_alias]) logging.debug("Got health: %s", str(stream_health)) self.assertIn("realtime_stats", stream_health) self.assertNotIn("serving", stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(["MigrateServedTypes", "--cells=test_nj", "test_keyspace/80-", "rdonly"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_srv_keyspace( "test_ny", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "rdonly"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_srv_keyspace( "test_ny", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards source_tablet = shard_1_slave2 destination_shards = ["test_keyspace/80-c0", "test_keyspace/c0-"] utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-c0 c0-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl(["MigrateServedTypes", "-reverse", "test_keyspace/80-", "replica"], auto_log=True) # After a backwards migration, queryservice should be enabled on source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other reasons than the migration, # so check the shard record instead of the tablets directly utils.check_shard_query_services(self, destination_shards, tablet.Tablet.tablet_type_value["REPLICA"], False) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, tablet.Tablet.tablet_type_value["REPLICA"], True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-c0 c0-\n", keyspace_id_type=keyspace_id_type, ) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(["PlannedReparentShard", "test_keyspace/80-c0", shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ["-cell", "test_nj", "SplitDiff", "--exclude_tables", "unrelated", "test_keyspace/c0-"], auto_log=True ) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly1.tablet_alias, "rdonly"], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( "DELAY 1: %s max_lag=%d avg_lag=%d", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count, ) logging.debug( "DELAY 2: %s max_lag=%d avg_lag=%d", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count, ) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(["SourceShardDelete", "test_keyspace/c0-", "0"], auto_log=True) utils.run_vtctl( ["SourceShardAdd", "--key_range=80-", "test_keyspace/c0-", "0", "test_keyspace/80-"], auto_log=True ) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-c0 c0-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-c0 c0-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn("No binlog player is running", shard_2_master_status) self.assertIn("</html>", shard_2_master_status) # scrap the original tablets in the original shard for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(["ScrapTablet", t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(["DeleteTablet", t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) # test RemoveShardCell utils.run_vtctl(["RemoveShardCell", "test_keyspace/-80", "test_nj"], auto_log=True, expect_fail=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_nj"], auto_log=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_ny"], auto_log=True) shard = utils.run_vtctl_json(["GetShard", "test_keyspace/80-"]) self.assertNotIn("cells", shard) # delete the original shard utils.run_vtctl(["DeleteShard", "test_keyspace/80-"], auto_log=True) # kill everything tablet.kill_tablets( [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1, ] )
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-C0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0') shard_3_master.init_tablet('master', 'test_keyspace', 'C0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-') shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-C0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/C0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # take the snapshot for the split utils.run_vtctl( ['MultiSnapshot', '--spec=80-C0-', shard_1_slave1.tablet_alias], auto_log=True) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink( os.path.join( environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-C0"))) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. utils.run_vtctl([ 'ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/80-C0', shard_1_slave1.tablet_alias ], auto_log=True) # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl([ 'ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/C0-', shard_1_slave1.tablet_alias ], auto_log=True) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'ReparentShard', 'test_keyspace/80-C0', shard_2_replica1.tablet_alias ]) logging.debug( "Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-C0 C0-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # scrap the original tablets in the original shard for t in [ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly ]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets( [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly]) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) if shard['Cells']: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ])
def test_resharding(self): utils.run_vtctl( [ "CreateKeyspace", "--sharding_column_name", "bad_column", "--sharding_column_type", "bytes", "--split_shard_count", "2", "test_keyspace", ] ) utils.run_vtctl(["SetKeyspaceShardingInfo", "test_keyspace", "keyspace_id", "uint64"], expect_fail=True) utils.run_vtctl( [ "SetKeyspaceShardingInfo", "-force", "-split_shard_count", "4", "test_keyspace", "keyspace_id", keyspace_id_type, ] ) shard_0_master.init_tablet("master", "test_keyspace", "-80") shard_0_replica.init_tablet("replica", "test_keyspace", "-80") shard_0_ny_slave.init_tablet("spare", "test_keyspace", "-80") shard_1_master.init_tablet("master", "test_keyspace", "80-") shard_1_slave1.init_tablet("replica", "test_keyspace", "80-") shard_1_slave2.init_tablet("spare", "test_keyspace", "80-") shard_1_ny_slave.init_tablet("spare", "test_keyspace", "80-") shard_1_rdonly.init_tablet("rdonly", "test_keyspace", "80-") utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) ks = utils.run_vtctl_json(["GetSrvKeyspace", "test_nj", "test_keyspace"]) self.assertEqual(ks["SplitShardCount"], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_slave, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly, ]: t.create_db("vt_test_keyspace") t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state("SERVING") shard_0_replica.wait_for_vttablet_state("SERVING") shard_0_ny_slave.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_master.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("SERVING") shard_1_slave2.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_ny_slave.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_rdonly.wait_for_vttablet_state("SERVING") # reparent to make the tablets work utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/-80", shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/80-", shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet("master", "test_keyspace", "80-c0") shard_2_replica1.init_tablet("spare", "test_keyspace", "80-c0") shard_2_replica2.init_tablet("spare", "test_keyspace", "80-c0") shard_3_master.init_tablet("master", "test_keyspace", "c0-") shard_3_replica.init_tablet("spare", "test_keyspace", "c0-") shard_3_rdonly.init_tablet("rdonly", "test_keyspace", "c0-") # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type="replica") for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly]: t.wait_for_vttablet_state("NOT_SERVING") utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/80-c0", shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/c0-", shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-\n" + "Partitions(replica): -80 80-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) if use_clone_worker: # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtworker( [ "--cell", "test_nj", "--command_display_interval", "10ms", "SplitClone", "--exclude_tables", "unrelated", "--strategy", "populateBlpCheckpoint", "--source_reader_count", "10", "--min_table_size_for_split", "1", "test_keyspace/80-c0", ], auto_log=True, ) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option else: # take the snapshot for the split utils.run_vtctl( ["MultiSnapshot", "--spec=80-c0-", "--exclude_tables=unrelated", shard_1_slave1.tablet_alias], auto_log=True, ) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink( os.path.join( environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-c0") ) ) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl( [ "ShardMultiRestore", "-strategy=populateBlpCheckpoint,dontStartBinlogPlayer", "test_keyspace/80-c0", shard_1_slave1.tablet_alias, ], auto_log=True, ) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if not "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step("shard 2 master has not failed starting yet", timeout) continue logging.debug("shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json( [ "ExecuteFetch", shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0', ] ) self.assertEqual(qr["RowsAffected"], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step("shard 2 master has not started replication yet", timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl( [ "ShardMultiRestore", "-strategy=populateBlpCheckpoint", "test_keyspace/c0-", shard_1_slave1.tablet_alias, ], auto_log=True, ) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(["ValidateSchemaKeyspace", "--exclude_tables=unrelated", "test_keyspace"], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker(["-cell", "test_nj", "SplitDiff", "test_keyspace/c0-"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly.tablet_alias, "rdonly"], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn("Binlog player state: Running", shard_2_master_status) self.assertIn( "<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>", shard_2_master_status ) self.assertIn("</html>", shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(["ChangeSlaveType", shard_1_slave2.tablet_alias, "replica"]) utils.run_vtctl(["ChangeSlaveType", shard_1_slave1.tablet_alias, "spare"]) shard_1_slave2.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("NOT_SERVING") # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. shard_2_master_vars = utils.get_vars(shard_2_master.port) self.assertEqual(shard_2_master_vars["TabletStateName"], "NOT_SERVING") shard_3_master_vars = utils.get_vars(shard_3_master.port) self.assertEqual(shard_3_master_vars["TabletStateName"], "NOT_SERVING") # now serve rdonly from the split shards utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "rdonly"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # then serve replica from the split shards utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-c0 c0-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # move replica back and forth utils.run_vtctl(["MigrateServedTypes", "-reverse", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-c0 c0-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(["ReparentShard", "test_keyspace/80-c0", shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker(["-cell", "test_nj", "SplitDiff", "test_keyspace/c0-"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly.tablet_alias, "rdonly"], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( "DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count, ) logging.debug( "DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count, ) # then serve master from the split shards utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-c0 c0-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-c0 c0-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn("No binlog player is running", shard_2_master_status) self.assertIn("</html>", shard_2_master_status) # scrap the original tablets in the original shard for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(["ScrapTablet", t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]) for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(["DeleteTablet", t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) # test RemoveShardCell utils.run_vtctl(["RemoveShardCell", "test_keyspace/-80", "test_nj"], auto_log=True, expect_fail=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_nj"], auto_log=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_ny"], auto_log=True) shard = utils.run_vtctl_json(["GetShard", "test_keyspace/80-"]) if shard["Cells"]: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(["DeleteShard", "test_keyspace/80-"], auto_log=True) # kill everything tablet.kill_tablets( [ shard_0_master, shard_0_replica, shard_0_ny_slave, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly, ] )
def test_vertical_split(self): # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtctl(['CopySchemaShard', '--tables', 'moving.*,view1', source_rdonly1.tablet_alias, 'destination_keyspace/0'], auto_log=True) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'VerticalSplitClone', '--tables', 'moving.*,view1', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_endpoints', '1', 'destination_keyspace/0'], auto_log=True) # One of the two source rdonly tablets went spare after the clone. # Force a healthcheck on both to get them back to "rdonly". for t in [source_rdonly1, source_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', self.moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', self.moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', self.moving1_first, 100) # check the binlog player is running and exporting vars self.check_destination_master(destination_master, ['source_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(source_replica, horizontal=False) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) _ = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) self.check_binlog_player_vars(destination_master, ['source_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(source_replica, horizontal=False, min_statements=100, min_transactions=100) # use vtworker to compare the data for t in [destination_rdonly1, destination_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) logging.debug('Running vtworker VerticalSplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff', '--min_healthy_rdonly_endpoints', '1', 'destination_keyspace/0'], auto_log=True) # One of each source and dest rdonly tablet went spare after the diff. # Force a healthcheck on all four to get them back to "rdonly". for t in [source_rdonly1, source_rdonly2, destination_rdonly1, destination_rdonly2]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablet, make sure we have it all self.check_running_binlog_player(destination_master, 700, 300, extra_text='moving.*') # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING') # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl(['MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(rdonly): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertEqual(ksf['cells'], ['test_nj']) self.assertTrue(found) utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace', '-remove', '-cells=test_nj', 'destination_keyspace', 'rdonly'], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertFalse(found) utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace', 'destination_keyspace', 'rdonly'], auto_log=True) keyspace_json = utils.run_vtctl_json( ['GetKeyspace', 'destination_keyspace']) found = False for ksf in keyspace_json['served_froms']: if ksf['tablet_type'] == topodata_pb2.RDONLY: found = True self.assertNotIn('cells', ksf) self.assertTrue(found) # now serve rdonly from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'destination_keyspace', ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'destination_keyspace', ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl(['MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) self._check_client_conn_redirection( 'destination_keyspace', ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1']) # check the binlog player is gone now self.check_no_binlog_player(destination_master) # check the stats are correct self._check_stats() # now remove the tables on the source shard. The blacklisted tables # in the source shard won't match any table, make sure that works. utils.run_vtctl(['ApplySchema', '-sql=drop view view1', 'source_keyspace'], auto_log=True) for t in ['moving1', 'moving2']: utils.run_vtctl(['ApplySchema', '-sql=drop table %s' % (t), 'source_keyspace'], auto_log=True) for t in [source_master, source_replica, source_rdonly1, source_rdonly2]: utils.run_vtctl(['ReloadSchema', t.tablet_alias]) qr = source_master.execute('select count(1) from staying1') self.assertEqual(len(qr['rows']), 1, 'cannot read staying1: got %s' % str(qr)) # test SetShardTabletControl self._verify_vtctl_set_shard_tablet_control()
def test_vertical_split(self): utils.run_vtctl(['CreateKeyspace', 'source_keyspace']) utils.run_vtctl([ 'CreateKeyspace', '--served_from', 'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace', 'destination_keyspace' ]) source_master.init_tablet('master', 'source_keyspace', '0') source_replica.init_tablet('replica', 'source_keyspace', '0') source_rdonly.init_tablet('rdonly', 'source_keyspace', '0') # rebuild destination keyspace to make sure there is a serving # graph entry, even though there is no tablet yet. utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') destination_master.init_tablet('master', 'destination_keyspace', '0') destination_replica.init_tablet('replica', 'destination_keyspace', '0') destination_rdonly.init_tablet('rdonly', 'destination_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') # create databases so vttablet can start behaving normally for t in [source_master, source_replica, source_rdonly]: t.create_db('vt_source_keyspace') t.start_vttablet(wait_for_state=None) destination_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [destination_replica, destination_rdonly]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [source_master, source_replica, source_rdonly]: t.wait_for_vttablet_state('SERVING') for t in [destination_master, destination_replica, destination_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'source_keyspace/0', source_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'destination_keyspace/0', destination_master.tablet_alias ], auto_log=True) # read all the keyspaces, this will populate the topology cache. self._populate_topo_cache() # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values('moving1', 100) moving2_first = self._insert_values('moving2', 100) staying1_first = self._insert_values('staying1', 100) staying2_first = self._insert_values('staying2', 100) self._check_values(source_master, 'vt_source_keyspace', 'moving1', moving1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'moving2', moving2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying1', staying1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying2', staying2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'view1', moving1_first, 100) if use_clone_worker: # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'VerticalSplitClone', '--tables', 'moving.*,view1', '--strategy', 'populateBlpCheckpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'destination_keyspace/0' ], auto_log=True) else: # take the snapshot for the split utils.run_vtctl([ 'MultiSnapshot', '--tables', 'moving.*,view1', source_rdonly.tablet_alias ], auto_log=True) # perform the restore. utils.run_vtctl([ 'ShardMultiRestore', '--strategy', 'populateBlpCheckpoint', '--tables', 'moving.*,view1', 'destination_keyspace/0', source_rdonly.tablet_alias ], auto_log=True) topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace') # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) staying1_first_add1 = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use the vtworker checker to compare the data logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker([ '-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', source_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', destination_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn('Binlog player state: Running', destination_master_status) self.assertIn('moving.*', destination_master_status) self.assertIn( '<td><b>All</b>: 1000<br><b>Query</b>: 700<br><b>Transaction</b>: 300<br></td>', destination_master_status) self.assertIn('</html>', destination_master_status) # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING') # check we can't migrate the master just yet utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['rdonly'], ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl([ 'MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica' ], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1']) utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl( ['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) self._check_blacklisted_tables(source_replica, ['moving.*', 'view1']) self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2']) # check 'vtctl SetShardBlacklistedTables' command works as expected: # clear the rdonly entry, re-add it, and then clear all entries. utils.run_vtctl( ['SetShardBlacklistedTables', 'source_keyspace/0', 'rdonly'], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertNotIn('rdonly', shard_json['BlacklistedTablesMap']) self.assertIn('replica', shard_json['BlacklistedTablesMap']) self.assertIn('master', shard_json['BlacklistedTablesMap']) utils.run_vtctl([ 'SetShardBlacklistedTables', 'source_keyspace/0', 'rdonly', 'moving.*,view1' ], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertEqual(['moving.*', 'view1'], shard_json['BlacklistedTablesMap']['rdonly']) utils.run_vtctl( ['SetShardBlacklistedTables', 'source_keyspace/0', 'rdonly'], auto_log=True) utils.run_vtctl( ['SetShardBlacklistedTables', 'source_keyspace/0', 'replica'], auto_log=True) utils.run_vtctl( ['SetShardBlacklistedTables', 'source_keyspace/0', 'master'], auto_log=True) shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0']) self.assertEqual(None, shard_json['BlacklistedTablesMap']) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # optional method to check the stats are correct self._check_stats() # kill everything tablet.kill_tablets([ source_master, source_replica, source_rdonly, destination_master, destination_replica, destination_rdonly ])
def test_vertical_split(self): utils.run_vtctl(['CreateKeyspace', 'source_keyspace']) utils.run_vtctl(['CreateKeyspace', '--served-from', 'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace', 'destination_keyspace']) source_master.init_tablet('master', 'source_keyspace', '0') source_replica.init_tablet('replica', 'source_keyspace', '0') source_rdonly.init_tablet('rdonly', 'source_keyspace', '0') # rebuild destination keyspace to make sure there is a serving # graph entry, even though there is no tablet yet. utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') destination_master.init_tablet('master', 'destination_keyspace', '0') destination_replica.init_tablet('replica', 'destination_keyspace', '0') destination_rdonly.init_tablet('rdonly', 'destination_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(rdonly): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') # create databases so vttablet can start behaving normally for t in [source_master, source_replica, source_rdonly]: t.create_db('vt_source_keyspace') t.start_vttablet(wait_for_state=None) for t in [destination_master, destination_replica, destination_rdonly]: t.start_vttablet(wait_for_state=None) # wait for the tablets for t in [source_master, source_replica, source_rdonly]: t.wait_for_vttablet_state('SERVING') for t in [destination_master, destination_replica, destination_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'source_keyspace/0', source_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'destination_keyspace/0', destination_master.tablet_alias], auto_log=True) # read all the keyspaces, this will populate the topology cache. self._populate_topo_cache() # create the schema on the source keyspace, add some values self._create_source_schema() moving1_first = self._insert_values('moving1', 100) moving2_first = self._insert_values('moving2', 100) staying1_first = self._insert_values('staying1', 100) staying2_first = self._insert_values('staying2', 100) self._check_values(source_master, 'vt_source_keyspace', 'moving1', moving1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'moving2', moving2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying1', staying1_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'staying2', staying2_first, 100) self._check_values(source_master, 'vt_source_keyspace', 'view1', moving1_first, 100) # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--tables', 'moving.*,view1', source_rdonly.tablet_alias], auto_log=True) # perform the restore. utils.run_vtctl(['ShardMultiRestore', '--strategy' ,'populateBlpCheckpoint', '--tables', 'moving.*', 'destination_keyspace/0', source_rdonly.tablet_alias], auto_log=True) topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace') # check values are present self._check_values(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first, 100) self._check_values(destination_master, 'vt_destination_keyspace', 'view1', moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values('moving1', 100) staying1_first_add1 = self._insert_values('staying1', 100) moving2_first_add1 = self._insert_values('moving2', 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving1', moving1_first_add1, 100) self._check_values_timeout(destination_master, 'vt_destination_keyspace', 'moving2', moving2_first_add1, 100) # use the vtworker checker to compare the data logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', source_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', destination_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn('Binlog player state: Running', destination_master_status) self.assertIn('moving.*', destination_master_status) self.assertIn('<td><b>All</b>: 1000<br><b>Query</b>: 700<br><b>Transaction</b>: 300<br></td>', destination_master_status) self.assertIn('</html>', destination_master_status) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['rdonly'], ['master', 'replica'], ['moving1', 'moving2']) # then serve replica from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*']) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # move replica back and forth utils.run_vtctl(['MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' + 'ServedFrom(replica): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly, ['moving.*']) utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'], auto_log=True) self._check_srv_keyspace('ServedFrom(master): source_keyspace\n') self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ['moving.*']) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly'], ['master'], ['moving1', 'moving2']) # then serve master from the destination shards utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'], auto_log=True) self._check_srv_keyspace('') self._check_blacklisted_tables(source_master, ['moving.*']) self._check_blacklisted_tables(source_replica, ['moving.*']) self._check_blacklisted_tables(source_rdonly, ['moving.*']) self._check_client_conn_redirection('source_keyspace', 'destination_keyspace', ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2']) # check 'vtctl SetBlacklistedTables' command works as expected utils.run_vtctl(['SetBlacklistedTables', source_master.tablet_alias, 'moving.*,view1'], auto_log=True) self._check_blacklisted_tables(source_master, ['moving.*', 'view1']) utils.run_vtctl(['SetBlacklistedTables', source_master.tablet_alias], auto_log=True) self._check_blacklisted_tables(source_master, None) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # kill everything tablet.kill_tablets([source_master, source_replica, source_rdonly, destination_master, destination_replica, destination_rdonly])
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['SplitShardCount'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) if use_clone_worker: # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint -write_masters_only', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-c0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option else: # take the snapshot for the split utils.run_vtctl([ 'MultiSnapshot', '--spec=80-c0-', '--exclude_tables=unrelated', shard_1_slave1.tablet_alias ], auto_log=True) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink( os.path.join( environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-c0"))) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl([ 'ShardMultiRestore', '-strategy=-populate_blp_checkpoint -dont_start_binlog_player', 'test_keyspace/80-c0', shard_1_slave1.tablet_alias ], auto_log=True) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if not "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step( 'shard 2 master has not failed starting yet', timeout) continue logging.debug( "shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json([ 'ExecuteFetch', shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0' ]) self.assertEqual(qr['RowsAffected'], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step( 'shard 2 master has not started replication yet', timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl([ 'ShardMultiRestore', '-strategy=-populate_blp_checkpoint', 'test_keyspace/c0-', shard_1_slave1.tablet_alias ], auto_log=True) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. self._check_query_service(shard_2_master, False, False) self._check_query_service(shard_3_master, False, False) # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(rdonly): -80 80-\n' + 'TabletTypes: rdonly', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_0_ny_rdonly, True, False) self._check_query_service(shard_1_ny_rdonly, True, False) self._check_query_service(shard_1_rdonly, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(rdonly): -80 80-c0 c0-\n' + 'TabletTypes: rdonly', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_0_ny_rdonly, True, False) self._check_query_service(shard_1_ny_rdonly, False, True) self._check_query_service(shard_1_rdonly, False, True) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-c0 c0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_slave2, True, False) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-c0 c0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_slave2, False, True) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'ReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) logging.debug( "Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-c0 c0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # scrap the original tablets in the original shard for t in [ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]) for t in [ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) if shard['Cells']: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ])
def test_resharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_0_master.init_tablet( 'master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_slave.init_tablet('spare', 'test_keyspace', '-80') shard_1_master.init_tablet( 'master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_slave.init_tablet('spare', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_ny_slave, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_slave.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_slave.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet( 'master', 'test_keyspace', '80-C0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0') shard_3_master.init_tablet( 'master', 'test_keyspace', 'C0-') shard_3_replica.init_tablet( 'spare', 'test_keyspace', 'C0-') shard_3_rdonly.init_tablet( 'rdonly', 'test_keyspace', 'C0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-', shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-', '--exclude_tables=unrelated', shard_1_slave1.tablet_alias], auto_log=True) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink(os.path.join(environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-C0"))) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint,dontStartBinlogPlayer', 'test_keyspace/80-C0', shard_1_slave1.tablet_alias], auto_log=True) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if not "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step('shard 2 master has not failed starting yet', timeout) continue logging.debug("shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json(['ExecuteFetch', shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0']) self.assertEqual(qr['RowsAffected'], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step('shard 2 master has not started replication yet', timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/C0-', shard_1_slave1.tablet_alias], auto_log=True) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn('<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. shard_2_master_vars = utils.get_vars(shard_2_master.port) self.assertEqual(shard_2_master_vars['TabletStateName'], 'NOT_SERVING') shard_3_master_vars = utils.get_vars(shard_3_master.port) self.assertEqual(shard_3_master_vars['TabletStateName'], 'NOT_SERVING') # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0', shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-C0 C0-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # scrap the original tablets in the original shard for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]) for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) if shard['Cells']: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_slave, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly])
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', keyspace_id_type ]) shard_0_master.init_tablet('replica', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, they won't be healthy) for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'LegacySplitClone', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') # replica is not started, InitShardMaster should timeout shard_master.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) shard_rdonly1.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [shard_master, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work - expect fail # because replica tablet is not up _, stderr = utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True, expect_fail=True) self.assertIn('tablet test_nj-0000062345 ResetReplication failed', stderr) # start replica shard_replica.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) shard_replica.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # We must start vtgate after tablets are up, or else wait until 1min refresh # (that is the tablet_refresh_interval parameter for discovery gateway) # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start( cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type ]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.vtgate = None utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]) var = None # Wait for the endpoints, either local or remote. utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1, var=var) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( [ '--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false' ], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 3, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_0_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Delete row 2 (provokes an insert). shard_1_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_1_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). self._insert_value(shard_1_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 1, 1, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 3) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) if base_sharding.use_multi_split_diff: logging.debug('Running vtworker MultiSplitDiff for 0') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'MultiSplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) else: logging.debug('Running vtworker SplitDiff for -80') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. # we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' timeout = 10.0 while True: try: s = utils.vtgate.split_query(sql, 'test_keyspace', 2) break except Exception: # pylint: disable=broad-except timeout = utils.wait_step( 'vtgate executes split_query properly', timeout) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = ( base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES) # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, the slaves won't be # healthy) shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('NOT_SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Copy the data from the source to the destination shards. # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Test the correct handling of keyspace_id changes which happen after # the first clone. # Let row 2 go to shard 3 instead of shard 2. shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set' ' custom_ksid_col=0xD000000000000000 WHERE id=2', write=True) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Row 2 will be deleted from shard 2 and inserted to shard 3. self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 0, 1, 1) self._check_value(shard_2_master, 'resharding1', 2, 'msg2', 0xD000000000000000, should_be_here=False) self._check_value(shard_3_master, 'resharding1', 2, 'msg2', 0xD000000000000000) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id again. shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set' ' custom_ksid_col=0x9000000000000000 WHERE id=2', write=True) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Row 2 will be deleted from shard 3 and inserted to shard 2. self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 0, 1, 1) self._check_value(shard_2_master, 'resharding1', 2, 'msg2', 0x9000000000000000) self._check_value(shard_3_master, 'resharding1', 2, 'msg2', 0x9000000000000000, should_be_here=False) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 2 (provokes an insert). shard_2_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_3_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 and 5 (provokes a delete). self._insert_value(shard_3_master, 'resharding1', 4, 'msg4', 0xD000000000000000) self._insert_value(shard_3_master, 'resharding1', 5, 'msg5', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablet, which was taken offline, back to rdonly. utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 2, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 2) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1) monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2) # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace( 'test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace( 'test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug( 'Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( 'DELAY 1: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms, monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count) logging.debug( 'DELAY 2: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms, monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
def test_merge_sharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'custom_ksid_col', '--sharding_column_type', base_sharding.keyspace_id_type, 'test_keyspace']) shard_0_master.init_tablet('replica', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('replica', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('replica', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') # rebuild and check SrvKeyspace utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) # won't be serving, no replication state for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_2_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the merge shards shard_dest_master.init_tablet('replica', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the destination shard (no db created, # so they're all not serving) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_dest_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # copy the schema utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80'], auto_log=True) # copy the data (will also start filtered replication), reset source # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false'], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_dest_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Update row 2 (provokes an update). shard_dest_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-2' where id=2", write=True) # Insert row 0 (provokes a delete). self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0', 0x5000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablets, which were taken offline, back to rdonly. utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 1, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 2) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias]) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '0', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker(['-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--source_uid', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]) for t in [shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly])
def test_vertical_split(self): # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtctl( ["CopySchemaShard", "--tables", "moving.*,view1", source_rdonly1.tablet_alias, "destination_keyspace/0"], auto_log=True, ) utils.run_vtworker( [ "--cell", "test_nj", "--command_display_interval", "10ms", "VerticalSplitClone", "--tables", "moving.*,view1", "--source_reader_count", "10", "--min_table_size_for_split", "1", "destination_keyspace/0", ], auto_log=True, ) # One of the two source rdonly tablets went spare after the clone. # Force a healthcheck on both to get them back to "rdonly". for t in [source_rdonly1, source_rdonly2]: utils.run_vtctl(["RunHealthCheck", t.tablet_alias, "rdonly"]) # check values are present self._check_values(destination_master, "vt_destination_keyspace", "moving1", self.moving1_first, 100) self._check_values(destination_master, "vt_destination_keyspace", "moving2", self.moving2_first, 100) self._check_values(destination_master, "vt_destination_keyspace", "view1", self.moving1_first, 100) # check the binlog players is running destination_master.wait_for_binlog_player_count(1) # add values to source, make sure they're replicated moving1_first_add1 = self._insert_values("moving1", 100) _ = self._insert_values("staying1", 100) moving2_first_add1 = self._insert_values("moving2", 100) self._check_values_timeout(destination_master, "vt_destination_keyspace", "moving1", moving1_first_add1, 100) self._check_values_timeout(destination_master, "vt_destination_keyspace", "moving2", moving2_first_add1, 100) # use vtworker to compare the data for t in [destination_rdonly1, destination_rdonly2]: utils.run_vtctl(["RunHealthCheck", t.tablet_alias, "rdonly"]) logging.debug("Running vtworker VerticalSplitDiff") utils.run_vtworker(["-cell", "test_nj", "VerticalSplitDiff", "destination_keyspace/0"], auto_log=True) # One of each source and dest rdonly tablet went spare after the diff. # Force a healthcheck on all four to get them back to "rdonly". for t in [source_rdonly1, source_rdonly2, destination_rdonly1, destination_rdonly2]: utils.run_vtctl(["RunHealthCheck", t.tablet_alias, "rdonly"]) utils.pause("Good time to test vtworker for diffs") # get status for destination master tablet, make sure we have it all destination_master_status = destination_master.get_status() self.assertIn("Binlog player state: Running", destination_master_status) self.assertIn("moving.*", destination_master_status) self.assertIn( "<td><b>All</b>: 1000<br><b>Query</b>: 700<br>" "<b>Transaction</b>: 300<br></td>", destination_master_status, ) self.assertIn("</html>", destination_master_status) # check query service is off on destination master, as filtered # replication is enabled. Even health check should not interfere. destination_master_vars = utils.get_vars(destination_master.port) self.assertEqual(destination_master_vars["TabletStateName"], "NOT_SERVING") # check we can't migrate the master just yet utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "master"], expect_fail=True) # migrate rdonly only in test_ny cell, make sure nothing is migrated # in test_nj utils.run_vtctl(["MigrateServedFrom", "--cells=test_ny", "destination_keyspace/0", "rdonly"], auto_log=True) self._check_srv_keyspace( "ServedFrom(master): source_keyspace\n" "ServedFrom(rdonly): source_keyspace\n" "ServedFrom(replica): source_keyspace\n" ) self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, None) self._check_blacklisted_tables(source_rdonly2, None) # migrate test_nj only, using command line manual fix command, # and restore it back. keyspace_json = utils.run_vtctl_json(["GetKeyspace", "destination_keyspace"]) found = False for ksf in keyspace_json["served_froms"]: if ksf["tablet_type"] == topodata_pb2.RDONLY: found = True self.assertEqual(ksf["cells"], ["test_nj"]) self.assertTrue(found) utils.run_vtctl( [ "SetKeyspaceServedFrom", "-source=source_keyspace", "-remove", "-cells=test_nj", "destination_keyspace", "rdonly", ], auto_log=True, ) keyspace_json = utils.run_vtctl_json(["GetKeyspace", "destination_keyspace"]) found = False for ksf in keyspace_json["served_froms"]: if ksf["tablet_type"] == topodata_pb2.RDONLY: found = True self.assertFalse(found) utils.run_vtctl( ["SetKeyspaceServedFrom", "-source=source_keyspace", "destination_keyspace", "rdonly"], auto_log=True ) keyspace_json = utils.run_vtctl_json(["GetKeyspace", "destination_keyspace"]) found = False for ksf in keyspace_json["served_froms"]: if ksf["tablet_type"] == topodata_pb2.RDONLY: found = True self.assertNotIn("cells", ksf) self.assertTrue(found) # now serve rdonly from the destination shards utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "rdonly"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n" "ServedFrom(replica): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection("destination_keyspace", ["master", "replica"], ["moving1", "moving2"]) # then serve replica from the destination shards utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "replica"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection("destination_keyspace", ["master"], ["moving1", "moving2"]) # move replica back and forth utils.run_vtctl(["MigrateServedFrom", "-reverse", "destination_keyspace/0", "replica"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n" "ServedFrom(replica): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, None) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "replica"], auto_log=True) self._check_srv_keyspace("ServedFrom(master): source_keyspace\n") self._check_blacklisted_tables(source_master, None) self._check_blacklisted_tables(source_replica, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) self._check_client_conn_redirection("destination_keyspace", ["master"], ["moving1", "moving2"]) # then serve master from the destination shards utils.run_vtctl(["MigrateServedFrom", "destination_keyspace/0", "master"], auto_log=True) self._check_srv_keyspace("") self._check_blacklisted_tables(source_master, ["moving.*", "view1"]) self._check_blacklisted_tables(source_replica, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly1, ["moving.*", "view1"]) self._check_blacklisted_tables(source_rdonly2, ["moving.*", "view1"]) # check the binlog player is gone now destination_master.wait_for_binlog_player_count(0) # check the stats are correct self._check_stats() self._verify_vtctl_set_shard_tablet_control()
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl( ['RunHealthCheck', shard_3_rdonly1.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>' '<b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low') monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high') # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl( ['RunHealthCheck', shard_1_slave2.tablet_alias, 'replica']) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl( ['RunHealthCheck', shard_3_master.tablet_alias, 'replica'], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug( 'Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d', monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d', monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = (base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES) # create databases so vttablet can start behaving somewhat normally for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, the slaves won't be # healthy) shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('NOT_SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # check the shards shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias], auto_log=True) # check the shards shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Copy the data from the source to the destination shards. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 2 (provokes an insert). shard_2_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_3_master.mquery('vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). self._insert_value(shard_3_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablet, which was taken offline, back to rdonly. utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 1) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_binlog_throttler(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_binlog_throttler(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1) monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2) # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug('Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms, monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms, monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl(['SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-'], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl( ['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1])