def setup_sharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', SHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', SHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=SHARDED_KEYSPACE, shard='-80')
  shard_0_replica.init_tablet('replica', keyspace=SHARDED_KEYSPACE, shard='-80')
  shard_0_rdonly.init_tablet('rdonly', keyspace=SHARDED_KEYSPACE, shard='-80')
  shard_1_master.init_tablet('master', keyspace=SHARDED_KEYSPACE, shard='80-')
  shard_1_replica.init_tablet('replica', keyspace=SHARDED_KEYSPACE, shard='80-')
  shard_1_rdonly.init_tablet('rdonly', keyspace=SHARDED_KEYSPACE, shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.create_db('vt_test_keyspace_sharded')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', '%s/-80' % SHARDED_KEYSPACE,
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', '%s/80-' % SHARDED_KEYSPACE,
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace('test_nj', SHARDED_KEYSPACE,
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica')
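# Note: the variant above uses the legacy 'ReparentShard -force' vtctl command
# and asserts a trailing 'TabletTypes:' line in the SrvKeyspace dump. Later
# variants below use 'InitShardMaster' (which, to our knowledge, replaced
# ReparentShard for electing the initial shard master) and drop the
# TabletTypes line from the expected check_srv_keyspace output.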
def setup_unsharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', UNSHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', UNSHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  unsharded_master.init_tablet('master', keyspace=UNSHARDED_KEYSPACE, shard='0')
  unsharded_replica.init_tablet('replica', keyspace=UNSHARDED_KEYSPACE,
                                shard='0')
  unsharded_rdonly.init_tablet('rdonly', keyspace=UNSHARDED_KEYSPACE, shard='0')
  utils.run_vtctl(['RebuildKeyspaceGraph', UNSHARDED_KEYSPACE], auto_log=True)

  for t in [unsharded_master, unsharded_replica, unsharded_rdonly]:
    t.create_db('vt_test_keyspace_unsharded')
    t.mquery(unsharded_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [unsharded_master, unsharded_replica, unsharded_rdonly]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', '%s/0' % UNSHARDED_KEYSPACE,
                   unsharded_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', UNSHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace('test_nj', UNSHARDED_KEYSPACE,
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica')
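# The expected-value strings passed to utils.check_srv_keyspace above follow a
# fixed format: one 'Partitions(<tablet_type>)' line per served tablet type,
# listing the shard ranges that cover the keyspace ('-' is the full keyrange
# of an unsharded keyspace; '-80 80-' is a keyspace split at keyspace_id 0x80).
# A minimal sketch of a helper that builds such strings; the helper itself is
# hypothetical and not part of the test suite:
def _expected_partitions(shards, tablet_types=('master', 'rdonly', 'replica')):
  # e.g. _expected_partitions(['-80', '80-']) returns
  # 'Partitions(master): -80 80-\n...' for all three tablet types.
  return ''.join('Partitions(%s): %s\n' % (t, ' '.join(shards))
                 for t in tablet_types)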
def setup_sharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', SHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', SHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  shard_0_master.start_vttablet(
      wait_for_state=None, target_tablet_type='replica',
      init_keyspace=SHARDED_KEYSPACE, init_shard='-80')
  shard_0_replica.start_vttablet(
      wait_for_state=None, target_tablet_type='replica',
      init_keyspace=SHARDED_KEYSPACE, init_shard='-80')
  shard_1_master.start_vttablet(
      wait_for_state=None, target_tablet_type='replica',
      init_keyspace=SHARDED_KEYSPACE, init_shard='80-')
  shard_1_replica.start_vttablet(
      wait_for_state=None, target_tablet_type='replica',
      init_keyspace=SHARDED_KEYSPACE, init_shard='80-')

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', '%s/-80' % SHARDED_KEYSPACE,
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', '%s/80-' % SHARDED_KEYSPACE,
                   shard_1_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['ApplySchema', '-sql', create_vt_insert_test,
                   SHARDED_KEYSPACE])

  utils.check_srv_keyspace('test_nj', SHARDED_KEYSPACE,
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')
def setup_sharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', SHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', SHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=SHARDED_KEYSPACE, shard='-80')
  shard_0_replica.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                              shard='-80')
  shard_1_master.init_tablet('master', keyspace=SHARDED_KEYSPACE, shard='80-')
  shard_1_replica.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                              shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace_sharded')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['InitShardMaster', '%s/-80' % SHARDED_KEYSPACE,
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '%s/80-' % SHARDED_KEYSPACE,
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace('test_nj', SHARDED_KEYSPACE,
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')
def setup_unsharded_keyspace():
  utils.run_vtctl(["CreateKeyspace", UNSHARDED_KEYSPACE])
  utils.run_vtctl(["SetKeyspaceShardingInfo", "-force", UNSHARDED_KEYSPACE,
                   "keyspace_id", "uint64"])
  unsharded_master.init_tablet("master", keyspace=UNSHARDED_KEYSPACE,
                               shard="0")
  unsharded_replica.init_tablet("replica", keyspace=UNSHARDED_KEYSPACE,
                                shard="0")
  utils.run_vtctl(["RebuildKeyspaceGraph", UNSHARDED_KEYSPACE], auto_log=True)

  for t in [unsharded_master, unsharded_replica]:
    t.create_db("vt_test_keyspace_unsharded")
    t.mquery(unsharded_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [unsharded_master, unsharded_replica]:
    t.wait_for_vttablet_state("SERVING")

  utils.run_vtctl(["InitShardMaster", "%s/0" % UNSHARDED_KEYSPACE,
                   unsharded_master.tablet_alias], auto_log=True)
  utils.run_vtctl(["RebuildKeyspaceGraph", UNSHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace(
      "test_nj", UNSHARDED_KEYSPACE,
      "Partitions(master): -\n" +
      "Partitions(rdonly): -\n" +
      "Partitions(replica): -\n",
  )
def setup_sharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', SHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', SHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                             shard='-80', tablet_index=0)
  shard_0_replica.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                              shard='-80', tablet_index=1)
  shard_1_master.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                             shard='80-', tablet_index=0)
  shard_1_replica.init_tablet('replica', keyspace=SHARDED_KEYSPACE,
                              shard='80-', tablet_index=1)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace_sharded')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', '%s/-80' % SHARDED_KEYSPACE,
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', '%s/80-' % SHARDED_KEYSPACE,
                   shard_1_master.tablet_alias], auto_log=True)

  for t in [shard_0_replica, shard_1_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  # rebuild to be sure we have the latest data
  utils.run_vtctl(['RebuildKeyspaceGraph', SHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace(
      'test_nj', SHARDED_KEYSPACE,
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n')
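# In the variant above, the eventual masters are initialized as type 'replica'
# (with tablet_index=0) and come up NOT_SERVING; 'InitShardMaster -force' then
# elects them, after which the remaining tablets are expected to reach type
# 'replica' and state SERVING. That is why the test waits for NOT_SERVING
# before the reparent and for SERVING (via wait_for_tablet_type) after it.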
def start_tablets():
  global __tablets
  # start tablets
  for t in __tablets:
    t.start_vttablet(wait_for_state=None)
  # wait for them to come in serving state
  for t in __tablets:
    t.wait_for_vttablet_state('SERVING')
  # InitShardMaster for master tablets
  for t in __tablets:
    if t.tablet_type == 'master':
      utils.run_vtctl(['InitShardMaster', t.keyspace + '/' + t.shard,
                       t.tablet_alias], auto_log=True)

  for ks in topo_schema.keyspaces:
    ks_name = ks[0]
    ks_type = ks[1]
    utils.run_vtctl(['RebuildKeyspaceGraph', ks_name], auto_log=True)
    if ks_type == shard_constants.RANGE_SHARDED:
      utils.check_srv_keyspace('test_nj', ks_name,
                               'Partitions(master): -80 80-\n'
                               'Partitions(rdonly): -80 80-\n'
                               'Partitions(replica): -80 80-\n')
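# start_tablets() iterates over a module-level topology description. A minimal
# sketch of what topo_schema.keyspaces presumably looks like, inferred from
# the ks[0]/ks[1] accesses above; the keyspace names and the UNSHARDED
# constant are illustrative assumptions, not taken from the source:
#
#   keyspaces = [
#       ('test_keyspace_sharded', shard_constants.RANGE_SHARDED),
#       ('test_keyspace_unsharded', shard_constants.UNSHARDED),
#   ]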
def verify(self):
  self.assert_shard_data_equal(0, worker.shard_master,
                               worker.shard_0_tablets.replica)
  self.assert_shard_data_equal(1, worker.shard_master,
                               worker.shard_1_tablets.replica)

  # Verify effect of MigrateServedTypes. Dest shards are serving now.
  utils.check_srv_keyspace('test_nj', self.KEYSPACE,
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')

  # source shard: query service must be disabled after MigrateServedTypes.
  source_shards = [worker.shard_rdonly1,
                   worker.shard_replica,
                   worker.shard_master]
  for shard in source_shards:
    utils.check_tablet_query_service(
        self, shard, serving=False, tablet_control_disabled=True)

  # dest shard -80, 80-: query service must be enabled
  # after MigrateServedTypes.
  dest_shards = [worker.shard_0_rdonly1,
                 worker.shard_0_replica,
                 worker.shard_0_master,
                 worker.shard_1_rdonly1,
                 worker.shard_1_replica,
                 worker.shard_1_master]
  for shard in dest_shards:
    utils.check_tablet_query_service(
        self, shard, serving=True, tablet_control_disabled=False)
def verify(self):
  self.assert_shard_data_equal(0, worker.shard_master,
                               worker.shard_0_tablets.replica)
  self.assert_shard_data_equal(1, worker.shard_master,
                               worker.shard_1_tablets.replica)

  # Verify effect of MigrateServedTypes. Dest shards are serving now.
  utils.check_srv_keyspace(
      'test_nj', self.KEYSPACE,
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n')

  # source shard: query service must be disabled after MigrateServedTypes.
  source_shards = [worker.shard_rdonly1,
                   worker.shard_replica,
                   worker.shard_master]
  for shard in source_shards:
    utils.check_tablet_query_service(self, shard, serving=False,
                                     tablet_control_disabled=True)

  # dest shard -80, 80-: query service must be enabled
  # after MigrateServedTypes.
  dest_shards = [worker.shard_0_rdonly1,
                 worker.shard_0_replica,
                 worker.shard_0_master,
                 worker.shard_1_rdonly1,
                 worker.shard_1_replica,
                 worker.shard_1_master]
  for shard in dest_shards:
    utils.check_tablet_query_service(self, shard, serving=True,
                                     tablet_control_disabled=False)
def start_tablets():
  global __tablets
  # start tablets
  for t in __tablets:
    t.start_vttablet(wait_for_state=None)
  # wait for them to come in serving state
  for t in __tablets:
    t.wait_for_vttablet_state('SERVING')
  # ReparentShard for master tablets
  for t in __tablets:
    if t.tablet_type == 'master':
      utils.run_vtctl(['ReparentShard', '-force',
                       t.keyspace + '/' + t.shard, t.tablet_alias],
                      auto_log=True)

  for ks in topo_schema.keyspaces:
    ks_name = ks[0]
    ks_type = ks[1]
    utils.run_vtctl(['RebuildKeyspaceGraph', ks_name], auto_log=True)
    if ks_type == shard_constants.RANGE_SHARDED:
      utils.check_srv_keyspace('test_nj', ks_name,
                               'Partitions(master): -80 80-\n' +
                               'Partitions(replica): -80 80-\n' +
                               'TabletTypes: master,replica')
def setup_unsharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', UNSHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', UNSHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  unsharded_master.init_tablet(
      'master', keyspace=UNSHARDED_KEYSPACE, shard='0')
  unsharded_replica.init_tablet(
      'replica', keyspace=UNSHARDED_KEYSPACE, shard='0')
  utils.run_vtctl(['RebuildKeyspaceGraph', UNSHARDED_KEYSPACE], auto_log=True)

  for t in [unsharded_master, unsharded_replica]:
    t.create_db('vt_test_keyspace_unsharded')
    t.mquery(unsharded_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [unsharded_master, unsharded_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['InitShardMaster', '%s/0' % UNSHARDED_KEYSPACE,
                   unsharded_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', UNSHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace('test_nj', UNSHARDED_KEYSPACE,
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n')
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80')
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80')
  shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-')
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.mquery(shard_0_master.dbname, create_vt_a)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  vtgate_server, vtgate_port = utils.vtgate_start()
  vtgate_client = zkocc.ZkOccConnection("localhost:%u" % vtgate_port,
                                        "test_nj", 30.0)
  topology.read_topology(vtgate_client)
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='-80')
  shard_0_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='-80')
  shard_1_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='80-')
  shard_1_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/80-',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace('test_nj', KEYSPACE_NAME,
                           'Partitions(master): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  vtgate_server, vtgate_port = utils.vtgate_start()
  # FIXME(shrutip): this should be removed once vtgate_cursor's
  # dependency on topology goes away.
  vtgate_client = zkocc.ZkOccConnection("localhost:%u" % vtgate_port,
                                        "test_nj", 30.0)
  topology.read_topology(vtgate_client)
def verify(self):
  self.assert_shard_data_equal(0, worker.shard_master,
                               worker.shard_0_tablets.replica)
  self.assert_shard_data_equal(1, worker.shard_master,
                               worker.shard_1_tablets.replica)

  # Verify effect of MigrateServedTypes. Dest shards are serving now.
  utils.check_srv_keyspace(
      "test_nj", self.KEYSPACE,
      "Partitions(master): -80 80-\n"
      "Partitions(rdonly): -80 80-\n"
      "Partitions(replica): -80 80-\n",
  )

  # Check that query service is disabled (source shard) or enabled (dest).
  # The 'rdonly' tablet requires an explicit healthcheck first because
  # the following sequence of events is happening in this test:
  # - SplitDiff returns the 'rdonly' tablet as 'spare' (NOT_SERVING)
  # - MigrateServedTypes runs and does not refresh the 'spare' tablet
  #   (still NOT_SERVING); Shard_TabletControl.DisableQueryService=true
  #   will be set in the topology
  # - explicit or periodic healthcheck runs:
  #   a) tablet seen as caught up, change type from 'spare' to 'rdonly'
  #      (change to SERVING)
  #   b) post-action callback agent.refreshTablet() reads the topology
  #      and finds out that DisableQueryService=true is set
  #      (change to NOT_SERVING)
  #
  # We must run an explicit healthcheck or we can see one of the two states:
  # - NOT_SERVING, DisableQueryService=false, tablet type 'spare'
  #   (immediately after SplitDiff returned)
  # - SERVING, DisableQueryService=false, tablet type 'rdonly'
  #   (during healthcheck before post-action callback is called)
  utils.run_vtctl(["RunHealthCheck", worker.shard_rdonly1.tablet_alias],
                  auto_log=True)

  # source shard: query service must be disabled after MigrateServedTypes.
  utils.check_tablet_query_service(self, worker.shard_rdonly1,
                                   serving=False,
                                   tablet_control_disabled=True)
  utils.check_tablet_query_service(self, worker.shard_replica,
                                   serving=False,
                                   tablet_control_disabled=True)
  utils.check_tablet_query_service(self, worker.shard_master,
                                   serving=False,
                                   tablet_control_disabled=True)

  # dest shard -80: query service must be enabled after MigrateServedTypes.
  # Run explicit healthcheck because 'rdonly' tablet may still be 'spare'.
  utils.run_vtctl(["RunHealthCheck", worker.shard_0_rdonly1.tablet_alias],
                  auto_log=True)
  utils.check_tablet_query_service(self, worker.shard_0_rdonly1,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_0_replica,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_0_master,
                                   serving=True,
                                   tablet_control_disabled=False)

  # dest shard 80-: query service must be enabled after MigrateServedTypes.
  # Run explicit healthcheck because 'rdonly' tablet may still be 'spare'.
  utils.run_vtctl(["RunHealthCheck", worker.shard_1_rdonly1.tablet_alias],
                  auto_log=True)
  utils.check_tablet_query_service(self, worker.shard_1_rdonly1,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_1_replica,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_1_master,
                                   serving=True,
                                   tablet_control_disabled=False)
def setup_tablets():
  # Start up a master mysql and vttablet
  logging.debug('Setting up tablets')
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80')
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80')
  shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-')
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.mquery(shard_0_master.dbname, create_vt_a)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(
      ['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias],
      auto_log=True)
  utils.run_vtctl(
      ['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias],
      auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n')

  utils.VtGate().start()
  vtgate_client = zkocc.ZkOccConnection(utils.vtgate.addr(), 'test_nj', 30.0)
  topology.read_topology(vtgate_client)
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='-80')
  shard_0_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='-80')
  shard_1_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='80-')
  shard_1_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/80-',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace('test_nj', KEYSPACE_NAME,
                           'Partitions(master): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  vtgate_server, vtgate_port = utils.vtgate_start()
def setup_tablets():
  # Start up a master mysql and vttablet
  logging.debug('Setting up tablets')
  utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet(
      'master', keyspace=KEYSPACE_NAME, shard='0', tablet_index=0)
  shard_0_replica1.init_tablet(
      'replica', keyspace=KEYSPACE_NAME, shard='0', tablet_index=1)
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica1]:
    t.create_db('vt_test_keyspace')
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None, target_tablet_type='replica')

  for t in [shard_0_master]:
    t.wait_for_vttablet_state('SERVING')
  for t in [shard_0_replica1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', KEYSPACE_NAME + '/0',
                   shard_0_master.tablet_alias], auto_log=True)

  for t in [shard_0_replica1]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_master, shard_0_replica1]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace(
      'test_nj', KEYSPACE_NAME,
      'Partitions(master): -\n'
      'Partitions(rdonly): -\n'
      'Partitions(replica): -\n')
def setup_tablets():
  # Start up a master mysql and vttablet
  logging.debug('Setting up tablets')
  utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=KEYSPACE_NAME,
                             shard='0', tablet_index=0)
  shard_0_replica1.init_tablet('replica', keyspace=KEYSPACE_NAME,
                               shard='0', tablet_index=1)
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica1]:
    t.create_db('vt_test_keyspace')
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None, target_tablet_type='replica')

  for t in [shard_0_master]:
    t.wait_for_vttablet_state('SERVING')
  for t in [shard_0_replica1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(
      ['InitShardMaster', KEYSPACE_NAME + '/0', shard_0_master.tablet_alias],
      auto_log=True)

  for t in [shard_0_replica1]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_master, shard_0_replica1]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace(
      'test_nj', KEYSPACE_NAME,
      'Partitions(master): -\n'
      'Partitions(rdonly): -\n'
      'Partitions(replica): -\n')
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='-80')
  shard_0_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='-80')
  shard_1_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='80-')
  shard_1_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/80-',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace('test_nj', KEYSPACE_NAME,
                           'Partitions(master): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  vtgate_server, vtgate_port = utils.vtgate_start()
  # FIXME(shrutip): this should be removed once vtgate_cursor's
  # dependency on topology goes away.
  vtgate_client = zkocc.ZkOccConnection("localhost:%u" % vtgate_port,
                                        "test_nj", 30.0)
  topology.read_topology(vtgate_client)
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80')
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80')
  shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-')
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.mquery(shard_0_master.dbname, create_vt_a)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  vtgate_server, vtgate_port = utils.vtgate_start()
  vtgate_client = zkocc.ZkOccConnection("localhost:%u" % vtgate_port,
                                        "test_nj", 30.0)
  topology.read_topology(vtgate_client)
def setup_tablets():
  # Start up a master mysql and vttablet
  logging.debug('Setting up tablets')
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80')
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80')
  shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-')
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.mquery(shard_0_master.dbname, create_vt_a)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(
      ['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias],
      auto_log=True)
  utils.run_vtctl(
      ['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias],
      auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')

  utils.VtGate().start()
  vtgate_client = zkocc.ZkOccConnection(utils.vtgate.addr(), 'test_nj', 30.0)
  topology.read_topology(vtgate_client)
def setup_unsharded_keyspace():
  utils.run_vtctl(['CreateKeyspace', UNSHARDED_KEYSPACE])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', UNSHARDED_KEYSPACE,
                   'keyspace_id', 'uint64'])
  unsharded_master.init_tablet(
      'master', keyspace=UNSHARDED_KEYSPACE, shard='0', tablet_index=0)
  unsharded_replica.init_tablet(
      'replica', keyspace=UNSHARDED_KEYSPACE, shard='0', tablet_index=1)

  for t in [unsharded_master, unsharded_replica]:
    t.create_db('vt_test_keyspace_unsharded')
    t.mquery(unsharded_master.dbname, create_vt_insert_test)
    t.start_vttablet(wait_for_state=None)

  for t in [unsharded_master]:
    t.wait_for_vttablet_state('SERVING')
  for t in [unsharded_replica]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', '%s/0' % UNSHARDED_KEYSPACE,
                   unsharded_master.tablet_alias], auto_log=True)

  for t in [unsharded_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [unsharded_master, unsharded_replica]:
    t.wait_for_vttablet_state('SERVING')

  # rebuild to be sure we have the right version
  utils.run_vtctl(['RebuildKeyspaceGraph', UNSHARDED_KEYSPACE], auto_log=True)

  utils.check_srv_keyspace('test_nj', UNSHARDED_KEYSPACE,
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n')
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(['CreateKeyspace', KEYSPACE_NAME])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', KEYSPACE_NAME,
                   'keyspace_id', 'uint64'])
  shard_0_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='-80')
  shard_0_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='-80')
  shard_1_master.init_tablet('master', keyspace=KEYSPACE_NAME, shard='80-')
  shard_1_replica.init_tablet('replica', keyspace=KEYSPACE_NAME, shard='80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db('vt_test_keyspace')
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', KEYSPACE_NAME + '/80-',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace('test_nj', KEYSPACE_NAME,
                           'Partitions(master): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,replica')

  vtgate_server, vtgate_port = utils.vtgate_start()
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(["CreateKeyspace", "test_keyspace"])
  utils.run_vtctl(["SetKeyspaceShardingInfo", "-force", "test_keyspace",
                   "keyspace_id", "uint64"])
  shard_0_master.init_tablet("master", keyspace="test_keyspace", shard="-80")
  shard_0_replica.init_tablet("replica", keyspace="test_keyspace",
                              shard="-80")
  shard_1_master.init_tablet("master", keyspace="test_keyspace", shard="80-")
  shard_1_replica.init_tablet("replica", keyspace="test_keyspace",
                              shard="80-")
  utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db("vt_test_keyspace")
    t.mquery(shard_0_master.dbname, create_vt_insert_test)
    t.mquery(shard_0_master.dbname, create_vt_a)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state("SERVING")

  utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/-80",
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/80-",
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True)

  utils.check_srv_keyspace(
      "test_nj", "test_keyspace",
      "Partitions(master): -80 80-\n" +
      "Partitions(replica): -80 80-\n" +
      "TabletTypes: master,replica",
  )

  vtgate_server, vtgate_port = utils.vtgate_start()
  vtgate_client = zkocc.ZkOccConnection("localhost:%u" % vtgate_port,
                                        "test_nj", 30.0)
  topology.read_topology(vtgate_client)
def setup_tablets():
  global vtgate_server
  global vtgate_port

  # Start up a master mysql and vttablet
  logging.debug("Setting up tablets")
  utils.run_vtctl(["CreateKeyspace", KEYSPACE_NAME])
  utils.run_vtctl(["SetKeyspaceShardingInfo", "-force", KEYSPACE_NAME,
                   "keyspace_id", "uint64"])
  shard_0_master.init_tablet("master", keyspace=KEYSPACE_NAME, shard="-80")
  shard_0_replica.init_tablet("replica", keyspace=KEYSPACE_NAME, shard="-80")
  shard_1_master.init_tablet("master", keyspace=KEYSPACE_NAME, shard="80-")
  shard_1_replica.init_tablet("replica", keyspace=KEYSPACE_NAME, shard="80-")
  utils.run_vtctl(["RebuildKeyspaceGraph", KEYSPACE_NAME], auto_log=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.create_db("vt_test_keyspace")
    for create_table in create_tables:
      t.mquery(shard_0_master.dbname, create_table)
    t.start_vttablet(wait_for_state=None)

  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
    t.wait_for_vttablet_state("SERVING")

  utils.run_vtctl(["ReparentShard", "-force", KEYSPACE_NAME + "/-80",
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(["ReparentShard", "-force", KEYSPACE_NAME + "/80-",
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(["RebuildKeyspaceGraph", KEYSPACE_NAME], auto_log=True)

  utils.check_srv_keyspace(
      "test_nj", KEYSPACE_NAME,
      "Partitions(master): -80 80-\n" +
      "Partitions(replica): -80 80-\n" +
      "TabletTypes: master,replica",
  )

  vtgate_server, vtgate_port = utils.vtgate_start()
def test_merge_sharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'custom_sharding_key',
                   '--sharding_column_type', keyspace_id_type,
                   'test_keyspace'])

  shard_0_master.init_tablet('master', 'test_keyspace', '-40')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
  shard_1_master.init_tablet('master', 'test_keyspace', '40-80')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
  shard_2_master.init_tablet('master', 'test_keyspace', '80-')
  shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # rebuild and check SrvKeyspace
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_sharding_key')

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # masters will be serving
  for t in [shard_0_master, shard_1_master, shard_2_master]:
    t.wait_for_vttablet_state('SERVING')

  # slaves won't be, no replication state
  for t in [shard_0_replica, shard_0_rdonly,
            shard_1_replica, shard_1_rdonly,
            shard_2_replica, shard_2_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-40',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/40-80',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                   shard_2_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_replica]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_rdonly, shard_1_rdonly]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the merge shards
  shard_dest_master.init_tablet('master', 'test_keyspace', '-80')
  shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

  # start vttablet on the destination shard (no db created,
  # so they're all not serving)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                   shard_dest_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -40 40-80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=keyspace_id_type,
      sharding_column_name='custom_sharding_key')

  # copy the schema
  utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                   'test_keyspace/-80'], auto_log=True)

  # copy the data (will also start filtered replication), reset source
  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'SplitClone',
                      '--source_reader_count', '10',
                      '--min_table_size_for_split', '1',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check binlog player variables
  self.check_destination_master(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_0_replica, horizontal=True)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 0 and 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shards')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 10)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 30)
  self.check_binlog_player_vars(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
  logging.debug('Running vtworker SplitDiff on first half')
  utils.run_vtworker(['-cell', 'test_nj',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '0',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  logging.debug('Running vtworker SplitDiff on second half')
  utils.run_vtworker(['-cell', 'test_nj',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '1',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_dest_master, 3000, 1000)

  # check destination master query service is not running
  utils.check_tablet_query_service(self, shard_dest_master, False, False)
  stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                        '-count', '1',
                                        shard_dest_master.tablet_alias])
  logging.debug('Got health: %s', str(stream_health))
  self.assertIn('realtime_stats', stream_health)
  self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_dest_master.get_healthz()

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -40 40-80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # now serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # now serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_0_master, False, True)
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_dest_master)

  # kill the original tablets in the original shards
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
  for t in [shard_0_replica, shard_0_rdonly,
            shard_1_replica, shard_1_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  for t in [shard_0_master, shard_1_master]:
    utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                    auto_log=True)

  # delete the original shards
  utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
  utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                       shard_dest_master, shard_dest_replica,
                       shard_dest_rdonly])
def test_merge_sharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'custom_ksid_col',
                   '--sharding_column_type', base_sharding.keyspace_id_type,
                   'test_keyspace'])

  shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
  shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
  shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # rebuild and check SrvKeyspace
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)

  # won't be serving, no replication state
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_2_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_replica]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_rdonly, shard_1_rdonly]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the merge shards
  shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

  # start vttablet on the destination shard (no db created,
  # so they're all not serving)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_dest_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -40 40-80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # copy the schema
  utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                   'test_keyspace/-80'], auto_log=True)

  # copy the data (will also start filtered replication), reset source
  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms',
       '--use_v3_resharding_mode=false'],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_dest_master.mquery('vt_test_keyspace',
                           'delete from resharding1 where id=1', write=True)
  # Update row 2 (provokes an update).
  shard_dest_master.mquery(
      'vt_test_keyspace',
      "update resharding1 set msg='msg-not-2' where id=2", write=True)
  # Insert row 0 (provokes a delete).
  self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                     0x5000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Change tablets, which were taken offline, back to rdonly.
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 1, 1, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 2)

  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check binlog player variables
  self.check_destination_master(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_0_replica, horizontal=True)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 0 and 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shards')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 10)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 30)
  self.check_binlog_player_vars(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
  logging.debug('Running vtworker SplitDiff on first half')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '0',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  logging.debug('Running vtworker SplitDiff on second half')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '1',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_dest_master, 3000, 1000)

  # check destination master query service is not running
  utils.check_tablet_query_service(self, shard_dest_master, False, False)
  stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                        '-count', '1',
                                        shard_dest_master.tablet_alias])
  logging.debug('Got health: %s', str(stream_health))
  self.assertIn('realtime_stats', stream_health)
  self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_dest_master.get_healthz()

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -40 40-80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # now serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # now serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_master, False, True)
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_dest_master)

  # kill the original tablets in the original shards
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
  for t in [shard_0_replica, shard_0_rdonly,
            shard_1_replica, shard_1_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  for t in [shard_0_master, shard_1_master]:
    utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                    auto_log=True)

  # delete the original shards
  utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
  utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                       shard_dest_master, shard_dest_replica,
                       shard_dest_rdonly])
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, 
shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as the connection pool is not big enough for 20. # min_table_size_for_split is set to 1 so as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway.
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl( ['RunHealthCheck', shard_3_rdonly1.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>' '<b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low') monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high') # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl( ['RunHealthCheck', shard_1_slave2.tablet_alias, 'replica']) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl( ['RunHealthCheck', shard_3_master.tablet_alias, 'replica'], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug( 'Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d', monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d', monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mess with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status)
# delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
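# Editorial sketch (not part of the original test): why the insert threads
# above use keyspace_ids 0x9000000000000000 and 0xD000000000000000 to target
# the new shards. With a uint64 keyspace_id, routing compares the big-endian
# hex form of the id against each shard's bounds; an empty bound means
# -infinity / +infinity. keyspace_id_to_shard is a hypothetical helper.
def keyspace_id_to_shard(kid, shards):
  """Return the shard whose [start, end) keyrange contains the uint64 kid."""
  key = '%016x' % kid
  for shard in shards:
    start, end = shard.split('-')
    if ((not start or key >= start.ljust(16, '0')) and
        (not end or key < end.ljust(16, '0'))):
      return shard
  raise ValueError('no shard found for keyspace_id %s' % key)


assert keyspace_id_to_shard(0x9000000000000000, ['-80', '80-c0', 'c0-']) == '80-c0'
assert keyspace_id_to_shard(0xD000000000000000, ['-80', '80-c0', 'c0-']) == 'c0-'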
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') # replica is not started, InitShardMaster should timeout shard_master.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) shard_rdonly1.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [shard_master, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work - expect fail # because replica tablet is not up _, stderr = utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True, expect_fail=True) self.assertIn('tablet test_nj-0000062345 ResetReplication failed', stderr) # start replica shard_replica.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) shard_replica.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # We must start vtgate after tablets are up, or else wait until 1min refresh # (that is the tablet_refresh_interval parameter for discovery gateway) # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start( cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type ]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, binlog_use_v3_resharding_mode=False) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.vtgate = None utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]) var = None # Wait for the endpoints, either local or remote. utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1, var=var) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1, var=var) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). 
# again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( [ '--cell', 'test_nj', '--command_display_interval', '10ms', '--use_v3_resharding_mode=false' ], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 3, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_0_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Delete row 2 (provokes an insert). shard_1_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_1_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). self._insert_value(shard_1_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 1, 1, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 3) # Terminate worker daemon because it is no longer needed. 
utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) if base_sharding.use_multi_split_diff: logging.debug('Running vtworker MultiSplitDiff for 0') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'MultiSplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) else: logging.debug('Running vtworker SplitDiff for -80') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker([ '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. 
# we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' timeout = 10.0 while True: try: s = utils.vtgate.split_query(sql, 'test_keyspace', 2) break except Exception: # pylint: disable=broad-except timeout = utils.wait_step( 'vtgate executes split_query properly', timeout) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
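# Editorial sketch (not part of the original test): what the
# verify_reconciliation_counters calls above count. SplitClone compares each
# source row with the destination: a missing destination row provokes an
# insert, a changed row an update, an extraneous row a delete, and identical
# rows count as equal. A minimal sketch of that bookkeeping over {id: msg}
# dicts; reconcile is a hypothetical stand-in, not the worker's actual code.
def reconcile(source, destination):
  """Return (inserts, updates, deletes, equal) to make destination == source."""
  inserts = updates = deletes = equal = 0
  for row_id, msg in source.items():
    if row_id not in destination:
      inserts += 1
    elif destination[row_id] != msg:
      updates += 1
    else:
      equal += 1
  deletes = len([r for r in destination if r not in source])
  return inserts, updates, deletes, equal


# Mirrors the 'Online' counters (2, 1, 1, 0) checked after the second clone
# above: rows 1 and 2 were deleted, row 3 modified, row 4 inserted spuriously.
assert reconcile({1: 'msg1', 2: 'msg2', 3: 'msg3'},
                 {3: 'msg-not-3', 4: 'msg4'}) == (2, 1, 1, 0)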
def test_merge_sharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'custom_sharding_key', '--sharding_column_type', keyspace_id_type, '--split_shard_count', '4', 'test_keyspace']) shard_0_master.init_tablet('master', 'test_keyspace', '-40') shard_0_replica.init_tablet('replica', 'test_keyspace', '-40') shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40') shard_1_master.init_tablet('master', 'test_keyspace', '40-80') shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80') shard_2_master.init_tablet('master', 'test_keyspace', '80-') shard_2_replica.init_tablet('replica', 'test_keyspace', '80-') shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica, shard_2_rdonly]: t.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-40', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/40-80', shard_1_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_2_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_replica]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_rdonly, shard_1_rdonly]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the merge shards shard_dest_master.init_tablet('master', 'test_keyspace', '-80') shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80') shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') # start vttablet on the destination shard (no db created, # so they're all not serving) for t in [shard_dest_replica, shard_dest_rdonly]: t.start_vttablet(wait_for_state=None) # Start the master with healthcheck enabled (necessary for resolving the # destination master).
shard_dest_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_dest_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -40 40-80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # copy the schema utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80'], auto_log=True) # copy the data (will also start filtered replication), reset source utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_endpoints', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check binlog player variables self.check_destination_master(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_0_replica, horizontal=True) self.check_binlog_server_vars(shard_1_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 0 and 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shards') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 10) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. 
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 30) self.check_binlog_player_vars(shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_0_replica, horizontal=True, min_statements=1000, min_transactions=1000) self.check_binlog_server_vars(shard_1_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff on first half') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_endpoints', '1', '--source_uid', '0', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) logging.debug('Running vtworker SplitDiff on second half') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_endpoints', '1', '--source_uid', '1', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'], auto_log=True) # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_dest_master, 3000, 1000) # check destination master query service is not running utils.check_tablet_query_service(self, shard_dest_master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', shard_dest_master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_dest_master.get_healthz() # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -40 40-80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -40 40-80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # now serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') utils.check_tablet_query_service(self, shard_0_master, False, True) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_dest_master) # kill the original tablets in the original shards 
tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master, shard_1_replica, shard_1_rdonly]) for t in [shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) for t in [shard_0_master, shard_1_master]: utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias], auto_log=True) # delete the original shards utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True) utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # kill everything else tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master, shard_dest_replica, shard_dest_rdonly])
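# Editorial sketch (not part of the original test): the sharding tests above
# all migrate served types in the same fixed order, which is why the early
# 'MigrateServedTypes ... master' calls are expected to fail: rdonly and
# replica must be moved before master. A sketch of driving that order with the
# utils module this file already imports; migrate_all_served_types is a
# hypothetical helper.
MIGRATION_ORDER = ['rdonly', 'replica', 'master']


def migrate_all_served_types(keyspace_shard):
  for served_type in MIGRATION_ORDER:
    utils.run_vtctl(['MigrateServedTypes', keyspace_shard, served_type],
                    auto_log=True)

# The tests also exercise '-reverse' for replica, which undoes a migration:
#   utils.run_vtctl(['MigrateServedTypes', '-reverse', keyspace_shard,
#                    'replica'], auto_log=True)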
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet('master', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') shard_master.start_vttablet(wait_for_state=None) shard_replica.start_vttablet(wait_for_state=None) shard_rdonly1.start_vttablet(wait_for_state=None) shard_master.wait_for_vttablet_state('SERVING') for t in [shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start( cache_ttl='0', tablets=[shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet('master', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet('replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet('master', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet('replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [ shard_0_master, shard_0_replica, shard_1_master, shard_1_replica ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_rdonly1, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_1_master]: t.wait_for_vttablet_state('SERVING') for t in [ shard_0_replica, shard_0_rdonly1, shard_1_replica, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0' ], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80' ], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-' ], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. 
# we ask for 2 splits to only have one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1 ])
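# Editorial sketch (not part of the original test): the _check_lots_timeout
# calls above follow a generic poll-until-threshold pattern: keep sampling how
# much of the inserted data has been replicated, return as soon as the target
# percentage is reached, and fail on timeout. The return value lets callers
# skip the follow-up 100% check when everything already arrived, exactly as
# the 'if v != 100' branches above do. check_percent_timeout is hypothetical.
import time


def check_percent_timeout(sample_percent, threshold, timeout_s):
  """Poll sample_percent() until it reaches threshold or timeout_s elapses."""
  deadline = time.time() + timeout_s
  percent = 0
  while time.time() < deadline:
    percent = sample_percent()
    if percent >= threshold:
      return percent
    time.sleep(0.1)
  raise AssertionError('only %d%% of rows replicated after %d seconds'
                       % (percent, timeout_s))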
def test_merge_sharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'custom_ksid_col',
                   '--sharding_column_type', base_sharding.keyspace_id_type,
                   'test_keyspace'])

  shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
  shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
  shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # rebuild and check SrvKeyspace
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)

  # won't be serving, no replication state
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_2_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_replica]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_rdonly, shard_1_rdonly]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the merge shards
  shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

  # start vttablet on the destination shard (no db created,
  # so they're all not serving)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_dest_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -40 40-80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # copy the schema
  utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                   'test_keyspace/-80'], auto_log=True)

  # copy the data (will also start filtered replication), reset source
  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms',
       '--use_v3_resharding_mode=false'],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_dest_master.mquery('vt_test_keyspace',
                           'delete from resharding1 where id=1', write=True)
  # Update row 2 (provokes an update).
  shard_dest_master.mquery(
      'vt_test_keyspace',
      "update resharding1 set msg='msg-not-2' where id=2", write=True)
  # Insert row 0 (provokes a delete).
  self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                     0x5000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Change tablets, which were taken offline, back to rdonly.
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 1, 1, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 2)

  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check binlog player variables
  self.check_destination_master(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_0_replica, horizontal=True)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 0 and 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shards')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 10)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 30)
  self.check_binlog_player_vars(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
  logging.debug('Running vtworker SplitDiff on first half')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '1',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  logging.debug('Running vtworker SplitDiff on second half')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '2',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_dest_master, 3000, 1000)

  # check destination master query service is not running
  utils.check_tablet_query_service(self, shard_dest_master, False, False)
  stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                        shard_dest_master.tablet_alias])
  logging.debug('Got health: %s', str(stream_health))
  self.assertIn('realtime_stats', stream_health)
  self.assertNotIn('serving', stream_health)

  # check the destination master is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_dest_master.get_healthz()

  # now serve rdonly from the destination shard
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -40 40-80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # now serve replica from the destination shard
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # now serve master from the destination shard
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_master, False, True)
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_dest_master)

  # kill the original tablets in the original shards
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
  for t in [shard_0_replica, shard_0_rdonly,
            shard_1_replica, shard_1_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  for t in [shard_0_master, shard_1_master]:
    utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                    auto_log=True)

  # delete the original shards
  utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
  utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                       shard_dest_master, shard_dest_replica,
                       shard_dest_rdonly])
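# The verify_reconciliation_counters assertions above encode the core of
# what SplitClone's second pass does: diff the source rows against the
# destination rows and repair the differences. A minimal sketch of that
# reconciliation logic, assuming rows are dicts keyed by primary key
# (illustrative only, not vtworker's actual implementation):


def reconcile(source_rows, dest_rows):
  """Return (inserts, updates, deletes) needed to make dest match source."""
  inserts = updates = deletes = 0
  for pk, row in source_rows.items():
    if pk not in dest_rows:
      inserts += 1  # row missing on the destination -> insert it
    elif dest_rows[pk] != row:
      updates += 1  # row differs on the destination -> update it
  for pk in dest_rows:
    if pk not in source_rows:
      deletes += 1  # stray row on the destination -> delete it
  return inserts, updates, deletes

# e.g. deleting row 1, updating row 2 and inserting a stray row 0 on the
# destination (as done above) yields (1, 1, 1), matching the 'Online'
# counters checked for resharding1.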
def test_resharding(self):
  # create the keyspace with just one shard
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_master.init_tablet('master', 'test_keyspace', '0')
  shard_replica.init_tablet('replica', 'test_keyspace', '0')
  shard_rdonly1.init_tablet('rdonly', 'test_keyspace', '0')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_master.wait_for_vttablet_state('SERVING')
  shard_replica.wait_for_vttablet_state('SERVING')
  shard_rdonly1.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                   shard_master.tablet_alias], auto_log=True)

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # create the split shards
  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_rdonly1.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly1,
            shard_1_master, shard_1_replica, shard_1_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly1,
            shard_1_master, shard_1_replica, shard_1_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -\n' +
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)

  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'SplitClone',
                      '--exclude_tables', 'unrelated',
                      '--strategy=-populate_blp_checkpoint',
                      '--source_reader_count', '10',
                      '--min_table_size_for_split', '1',
                      'test_keyspace/0'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug("Waiting for binlog players to start on new masters...")
  shard_0_master.wait_for_binlog_player_count(1)
  shard_1_master.wait_for_binlog_player_count(1)

  # testing filtered replication: insert a bunch of data on the source shard,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff for -80")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  logging.debug("Running vtworker SplitDiff for 80-")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  source_tablet = shard_replica
  destination_tablets = [shard_0_replica, shard_1_replica]
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/0',
                   'replica'], auto_log=True)
  # After a backwards migration, queryservice should be enabled on source
  # and disabled on destinations
  utils.check_tablet_query_service(self, source_tablet, True, False)
  utils.check_tablet_query_services(self, destination_tablets, False, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on source
  # and enabled on destinations
  utils.check_tablet_query_service(self, source_tablet, False, True)
  utils.check_tablet_query_services(self, destination_tablets, True, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_0_master.wait_for_binlog_player_count(0)
  shard_1_master.wait_for_binlog_player_count(0)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # scrap the original tablets in the original shard
  for t in [shard_master, shard_replica, shard_rdonly1]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
  for t in [shard_master, shard_replica, shard_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1,
                       shard_1_master, shard_1_replica, shard_1_rdonly1])
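# The hex values passed to _insert_value throughout these tests are
# keyspace_ids, and the shard that owns a row is the one whose key range
# contains the big-endian byte form of that id. A minimal sketch of the
# routing rule, assuming uint64 keyspace_ids (illustrative only, not the
# actual Vitess key range code):

import struct


def keyspace_id_to_shard(keyspace_id, shards=('-80', '80-')):
  """Pick the shard whose [start, end) key range contains keyspace_id."""
  kid = struct.pack('>Q', keyspace_id)  # big-endian uint64 bytes
  for shard in shards:
    start_hex, end_hex = shard.split('-')
    start = bytes.fromhex(start_hex)  # b'' means "from the beginning"
    end = bytes.fromhex(end_hex) if end_hex else b'\xff' * 9  # open-ended
    if start <= kid < end:
      return shard
  raise ValueError('no shard found for keyspace_id %x' % keyspace_id)

# e.g. keyspace_id_to_shard(0x9000000000000000) returns '80-', which is why
# rows stamped with that id end up on the high split shard.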
def test_resharding(self):
  # create the keyspace with just one shard
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_master.init_tablet('master', 'test_keyspace', '0')
  shard_replica.init_tablet('replica', 'test_keyspace', '0')
  shard_rdonly.init_tablet('rdonly', 'test_keyspace', '0')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_master, shard_replica, shard_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_master.wait_for_vttablet_state('SERVING')
  shard_replica.wait_for_vttablet_state('SERVING')
  shard_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                   shard_master.tablet_alias], auto_log=True)

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # create the split shards
  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.wait_for_vttablet_state('CONNECTING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # take the snapshot for the split
  utils.run_vtctl(['MultiSnapshot', '--spec=-80-',
                   shard_replica.tablet_alias], auto_log=True)

  # wait for tablet's binlog server service to be enabled after snapshot
  shard_replica.wait_for_binlog_server_state("Enabled")

  # perform the restore.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/-80', shard_replica.tablet_alias],
                  auto_log=True)
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-', shard_replica.tablet_alias],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  shard_0_master.wait_for_binlog_player_count(1)
  shard_1_master.wait_for_binlog_player_count(1)

  # testing filtered replication: insert a bunch of data on the source shard,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff for -80")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  logging.debug("Running vtworker SplitDiff for 80-")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/0',
                   'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_0_master.wait_for_binlog_player_count(0)
  shard_1_master.wait_for_binlog_player_count(0)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # scrap the original tablets in the original shard
  for t in [shard_master, shard_replica, shard_rdonly]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly])

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
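# Calls like wait_for_binlog_player_count and wait_for_binlog_server_state
# above follow the same pattern: poll the tablet's exported debug vars until
# a value matches or a deadline passes. A minimal sketch of that pattern,
# assuming a /debug/vars endpoint that includes a BinlogPlayerMapSize var
# (the var name and URL shape are assumptions for illustration):

import json
import time
import urllib.request


def wait_for_binlog_player_count(vars_url, expected, timeout=30.0):
  """Poll debug vars until BinlogPlayerMapSize equals expected."""
  deadline = time.time() + timeout
  while True:
    with urllib.request.urlopen(vars_url) as resp:
      debug_vars = json.load(resp)
    if debug_vars.get('BinlogPlayerMapSize', 0) == expected:
      return debug_vars
    if time.time() > deadline:
      raise RuntimeError('timed out waiting for %d binlog player(s)'
                         % expected)
    time.sleep(0.1)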
def test_resharding(self):
  # we're going to reparent and swap these two
  global shard_2_master, shard_2_replica1

  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_ksid_col', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'custom_ksid_col', base_sharding.keyspace_id_type])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = (base_sharding.keyspace_id_type ==
                     keyrange_constants.KIT_BYTES)

  # create databases so vttablet can start behaving somewhat normally
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

  # wait for the tablets (replication is not set up, the slaves won't be
  # healthy)
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('NOT_SERVING')
  shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
  shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
  shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
  self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

  # create the tables
  self._create_schema()
  self._insert_startup_values()
  self._test_keyrange_constraints()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_slave1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
  shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
  shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  shard_2_master.start_vttablet(wait_for_state=None)
  shard_3_master.start_vttablet(wait_for_state=None)
  for t in [shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
            shard_3_replica, shard_3_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_2_rdonly1, shard_3_master, shard_3_replica,
            shard_3_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-',
                   shard_3_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  for s in ['-80', '80-', '80-c0', 'c0-']:
    self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # disable shard_1_slave2, so we're sure filtered replication will go
  # from shard_1_slave1
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_1_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)

  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms'],
      auto_log=True)

  # Copy the data from the source to the destination shards.
  # --max_tps is only specified to enable the throttler and ensure that the
  # code is executed. But the intent here is not to throttle the test, hence
  # the rate limit is set very high.
  #
  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Test the correct handling of keyspace_id changes which happen after
  # the first clone.
  # Let row 2 go to shard 3 instead of shard 2.
  shard_1_master.mquery('vt_test_keyspace',
                        'update resharding1 set'
                        ' custom_ksid_col=0xD000000000000000 WHERE id=2',
                        write=True)
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Row 2 will be deleted from shard 2 and inserted to shard 3.
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 0, 1, 1)
  self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
                    0xD000000000000000, should_be_here=False)
  self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
                    0xD000000000000000)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id
  # again.
  shard_1_master.mquery('vt_test_keyspace',
                        'update resharding1 set'
                        ' custom_ksid_col=0x9000000000000000 WHERE id=2',
                        write=True)
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Row 2 will be deleted from shard 3 and inserted to shard 2.
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 0, 1, 1)
  self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
                    0x9000000000000000)
  self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
                    0x9000000000000000, should_be_here=False)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 2 (provokes an insert).
  shard_2_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=2', write=True)
  # Update row 3 (provokes an update).
  shard_3_master.mquery(
      'vt_test_keyspace',
      "update resharding1 set msg='msg-not-3' where id=3", write=True)
  # Insert rows 4 and 5 (each provokes a delete).
  self._insert_value(shard_3_master, 'resharding1', 4, 'msg4',
                     0xD000000000000000)
  self._insert_value(shard_3_master, 'resharding1', 5, 'msg5',
                     0xD000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Change tablet, which was taken offline, back to rdonly.
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 1, 2, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 2)

  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated',
                   'test_keyspace'], auto_log=True)

  # check the binlog players are running and exporting vars
  self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
  self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

  # Check that the throttler was enabled.
  self.check_throttler_service(shard_2_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)
  self.check_throttler_service(shard_3_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablets, make sure we have it all
  self.check_running_binlog_player(shard_2_master, 4000, 2000)
  self.check_running_binlog_player(shard_3_master, 4000, 2000)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1)
  monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2)

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

  # test data goes through again
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000, base=1000)
  logging.debug('Checking 80 percent of data was sent quickly')
  self._check_lots_timeout(1000, 80, 5, base=1000)
  self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
                                min_statements=800, min_transactions=800)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even health check that is enabled on
  # master 3 should not interfere (we run it to be sure).
  utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                  auto_log=True)
  for master in [shard_2_master, shard_3_master]:
    utils.check_tablet_query_service(self, master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count',
                                          '1', master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_3_master.get_healthz()

  # now serve rdonly from the split shards, in test_nj only
  utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                   'test_keyspace/80-', 'rdonly'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace(
      'test_ny', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # now serve rdonly from the split shards, everywhere
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace(
      'test_ny', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # then serve replica from the split shards
  destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-c0 c0-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
      auto_log=True)
  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, True, False)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, False)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-c0 c0-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0',
                   shard_2_replica1.tablet_alias])

  # update our test variables to point at the new master
  shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

  logging.debug('Inserting lots of data on source shard after reparenting')
  self._insert_lots(3000, base=2000)
  logging.debug('Checking 80 percent of data was sent fairly quickly')
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use vtworker to compare the data again
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug('DELAY 1: %s max_lag=%d ms avg_lag=%d ms',
                monitor_thread_1.thread_name,
                monitor_thread_1.max_lag_ms,
                monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count)
  logging.debug('DELAY 2: %s max_lag=%d ms avg_lag=%d ms',
                monitor_thread_2.thread_name,
                monitor_thread_2.max_lag_ms,
                monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count)

  # mock with the SourceShard records to test 'vtctl SourceShardDelete'
  # and 'vtctl SourceShardAdd'
  utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                  auto_log=True)
  utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                   'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                  auto_log=True)

  # then serve master from the split shards, make sure the source master's
  # query service is now turned off
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-c0 c0-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-c0 c0-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_2_master)
  self.check_no_binlog_player(shard_3_master)

  # delete the original tablets in the original shard
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_rdonly, shard_1_rdonly1])
  for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_1_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  self.assertNotIn('cells', shard)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_2_rdonly1, shard_3_master, shard_3_replica,
                       shard_3_rdonly1])
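# The InsertThread/MonitorLagThread pair used above measures end-to-end
# filtered replication delay: one thread keeps inserting rows stamped with
# the current time on the source master, the other polls a destination
# replica for the latest stamp and records the lag. A minimal sketch of the
# monitoring side, assuming a read_latest_ts callable that returns the
# newest insert timestamp visible on the watched tablet (names here are
# illustrative, not the test's actual classes):

import threading
import time


class LagMonitor(threading.Thread):
  """Track max and average replication lag, in milliseconds."""

  def __init__(self, read_latest_ts, tablet):
    super(LagMonitor, self).__init__()
    self.daemon = True
    self.read_latest_ts = read_latest_ts
    self.tablet = tablet
    self.done = False
    self.max_lag_ms = 0
    self.lag_sum_ms = 0
    self.sample_count = 0

  def run(self):
    while not self.done:
      ts = self.read_latest_ts(self.tablet)
      if ts is not None:
        lag_ms = int((time.time() - ts) * 1000)
        self.max_lag_ms = max(self.max_lag_ms, lag_ms)
        self.lag_sum_ms += lag_ms
        self.sample_count += 1
      time.sleep(0.1)

# The DELAY log lines above then report max_lag_ms and
# lag_sum_ms / sample_count once the done flag is set.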
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = (base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES) # create databases so vttablet can start behaving somewhat normally for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, the slaves won't be # healthy) shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('NOT_SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # check the shards shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 
'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias], auto_log=True) # check the shards shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Copy the data from the source to the destination shards. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 2 (provokes an insert). shard_2_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_3_master.mquery('vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). 
self._insert_value(shard_3_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablet, which was taken offline, back to rdonly. utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 1) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_binlog_throttler(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_binlog_throttler(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. 
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1) monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2) # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug('Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms, monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms, monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl(['SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-'], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in 
the original shard tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl( ['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1])
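# A minimal sketch, mine rather than the original test's, of the
# served-type migration order the test above walks through: rdonly
# first (optionally restricted to one cell), then replica, then
# master. run_vtctl is a parameter so the sketch stays self-contained;
# it stands in for utils.run_vtctl.
def migrate_served_types(run_vtctl, source_shard, cells=None):
  # rdonly and replica moves can be undone with -reverse; migrating
  # master is the point of no return, so it always goes last
  for served_type in ('rdonly', 'replica', 'master'):
    cmd = ['MigrateServedTypes']
    if cells and served_type == 'rdonly':
      # the tests only scope the first rdonly migration to one cell
      cmd.append('--cells=%s' % ','.join(cells))
    cmd.extend([source_shard, served_type])
    run_vtctl(cmd, auto_log=True)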
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['SplitShardCount'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 
'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) if use_clone_worker: # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as the connection pool is not big enough for 20. # min_table_size_for_split is set to 1 so as to force a split even on the # small table we have. utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint -write_masters_only', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-c0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option else: # take the snapshot for the split utils.run_vtctl([ 'MultiSnapshot', '--spec=80-c0-', '--exclude_tables=unrelated', shard_1_slave1.tablet_alias ], auto_log=True) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink( os.path.join( environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-c0"))) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl([ 'ShardMultiRestore', '-strategy=-populate_blp_checkpoint -dont_start_binlog_player', 'test_keyspace/80-c0', shard_1_slave1.tablet_alias ], auto_log=True) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" not in shard_2_master_status: timeout = utils.wait_step( 'shard 2 master has not failed starting yet', timeout) continue logging.debug( "shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json([ 'ExecuteFetch', shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0' ]) self.assertEqual(qr['RowsAffected'], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step( 'shard 2 master has not started replication yet', timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl([ 'ShardMultiRestore', '-strategy=-populate_blp_checkpoint', 'test_keyspace/c0-', shard_1_slave1.tablet_alias ], auto_log=True) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars
self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. 
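# (while filtered replication is still applying the backlog, the
# destination masters hold partially-copied data, so keeping their
# query service off is presumably what protects clients from reading
# it; the checks below assert exactly that state)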
self._check_query_service(shard_2_master, False, False) self._check_query_service(shard_3_master, False, False) # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(rdonly): -80 80-\n' + 'TabletTypes: rdonly', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_0_ny_rdonly, True, False) self._check_query_service(shard_1_ny_rdonly, True, False) self._check_query_service(shard_1_rdonly, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(rdonly): -80 80-c0 c0-\n' + 'TabletTypes: rdonly', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_0_ny_rdonly, True, False) self._check_query_service(shard_1_ny_rdonly, False, True) self._check_query_service(shard_1_rdonly, False, True) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-c0 c0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_slave2, True, False) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-c0 c0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_slave2, False, True) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'ReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias ]) logging.debug( "Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) # going to 
migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' + 'Partitions(rdonly): -80 80-c0 c0-\n' + 'Partitions(replica): -80 80-c0 c0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) self._check_query_service(shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # scrap the original tablets in the original shard for t in [ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]) for t in [ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) if shard['Cells']: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ])
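# A generic sketch, not from the original file, of the DontStart
# polling pattern used above: poll a status page until a predicate
# holds, letting the wait function count the timeout down. Both
# callables are parameters, so nothing here depends on the test's
# globals; wait_step is assumed to behave like utils.wait_step
# (sleep briefly, return the remaining budget, raise when exhausted).
def wait_for_status(get_status, predicate, message, wait_step, timeout=10):
  while True:
    status = get_status()
    if predicate(status):
      return status
    timeout = wait_step(message, timeout)

# e.g. wait_for_status(shard_2_master.get_status,
#                      lambda s: "flag 'DontStart' is set" not in s,
#                      'shard 2 master has not started replication yet',
#                      utils.wait_step)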
def test_resharding(self): utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_0_master.init_tablet( 'master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_slave.init_tablet('spare', 'test_keyspace', '-80') shard_1_master.init_tablet( 'master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_slave.init_tablet('spare', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_ny_slave, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_slave.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_slave.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet( 'master', 'test_keyspace', '80-C0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0') shard_3_master.init_tablet( 'master', 'test_keyspace', 'C0-') shard_3_replica.init_tablet( 'spare', 'test_keyspace', 'C0-') shard_3_rdonly.init_tablet( 'rdonly', 'test_keyspace', 'C0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-', shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', 
keyspace_id_type=keyspace_id_type) # take the snapshot for the split utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-', '--exclude_tables=unrelated', shard_1_slave1.tablet_alias], auto_log=True) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink(os.path.join(environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-C0"))) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint,dontStartBinlogPlayer', 'test_keyspace/80-C0', shard_1_slave1.tablet_alias], auto_log=True) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if not "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step('shard 2 master has not failed starting yet', timeout) continue logging.debug("shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json(['ExecuteFetch', shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0']) self.assertEqual(qr['RowsAffected'], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step('shard 2 master has not started replication yet', timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/C0-', shard_1_slave1.tablet_alias], auto_log=True) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
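# (the two-step check that follows tolerates replication lag: about
# 80 percent of the rows should show up within a few seconds, and the
# remainder once the binlog player works through the rest of the
# backlog inside a longer window)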
logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn('<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. 
shard_2_master_vars = utils.get_vars(shard_2_master.port) self.assertEqual(shard_2_master_vars['TabletStateName'], 'NOT_SERVING') shard_3_master_vars = utils.get_vars(shard_3_master.port) self.assertEqual(shard_3_master_vars['TabletStateName'], 'NOT_SERVING') # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0', shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-C0 C0-\n' + 'Partitions(rdonly): -80 80-C0 C0-\n' + 'Partitions(replica): -80 80-C0 C0-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good 
shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # scrap the original tablets in the original shard for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]) for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) if shard['Cells']: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_slave, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly])
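# A small sketch of my own showing how the 64-bit keyspace_id values
# used above map onto the split shards; the boundaries 80 and C0 are
# hex prefixes of the id (the KIT_UINT64 case).
def shard_for_keyspace_id(kid):
  # kid is an unsigned 64-bit keyspace_id
  if kid < 0x8000000000000000:
    return '-80'
  if kid < 0xC000000000000000:
    return '80-C0'
  return 'C0-'

# matches the monitoring setup above: insert_low (0x9000000000000000)
# lands on 80-C0 and insert_high (0xD000000000000000) on C0-
assert shard_for_keyspace_id(0x9000000000000000) == '80-C0'
assert shard_for_keyspace_id(0xD000000000000000) == 'C0-'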
def test_resharding(self): utils.run_vtctl( [ "CreateKeyspace", "--sharding_column_name", "bad_column", "--sharding_column_type", "bytes", "--split_shard_count", "2", "test_keyspace", ] ) utils.run_vtctl(["SetKeyspaceShardingInfo", "test_keyspace", "keyspace_id", "uint64"], expect_fail=True) utils.run_vtctl( [ "SetKeyspaceShardingInfo", "-force", "-split_shard_count", "4", "test_keyspace", "keyspace_id", keyspace_id_type, ] ) shard_0_master.init_tablet("master", "test_keyspace", "-80") shard_0_replica.init_tablet("replica", "test_keyspace", "-80") shard_0_ny_slave.init_tablet("spare", "test_keyspace", "-80") shard_1_master.init_tablet("master", "test_keyspace", "80-") shard_1_slave1.init_tablet("replica", "test_keyspace", "80-") shard_1_slave2.init_tablet("spare", "test_keyspace", "80-") shard_1_ny_slave.init_tablet("spare", "test_keyspace", "80-") shard_1_rdonly.init_tablet("rdonly", "test_keyspace", "80-") utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) ks = utils.run_vtctl_json(["GetSrvKeyspace", "test_nj", "test_keyspace"]) self.assertEqual(ks["SplitShardCount"], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_slave, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly, ]: t.create_db("vt_test_keyspace") t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state("SERVING") shard_0_replica.wait_for_vttablet_state("SERVING") shard_0_ny_slave.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_master.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("SERVING") shard_1_slave2.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_ny_slave.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_rdonly.wait_for_vttablet_state("SERVING") # reparent to make the tablets work utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/-80", shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/80-", shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet("master", "test_keyspace", "80-c0") shard_2_replica1.init_tablet("spare", "test_keyspace", "80-c0") shard_2_replica2.init_tablet("spare", "test_keyspace", "80-c0") shard_3_master.init_tablet("master", "test_keyspace", "c0-") shard_3_replica.init_tablet("spare", "test_keyspace", "c0-") shard_3_rdonly.init_tablet("rdonly", "test_keyspace", "c0-") # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type="replica") for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly]: t.wait_for_vttablet_state("NOT_SERVING") utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/80-c0", shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(["ReparentShard", "-force", "test_keyspace/c0-", shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) utils.check_srv_keyspace( 
"test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-\n" + "Partitions(replica): -80 80-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) if use_clone_worker: # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. utils.run_vtworker( [ "--cell", "test_nj", "--command_display_interval", "10ms", "SplitClone", "--exclude_tables", "unrelated", "--strategy", "populateBlpCheckpoint", "--source_reader_count", "10", "--min_table_size_for_split", "1", "test_keyspace/80-c0", ], auto_log=True, ) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option else: # take the snapshot for the split utils.run_vtctl( ["MultiSnapshot", "--spec=80-c0-", "--exclude_tables=unrelated", shard_1_slave1.tablet_alias], auto_log=True, ) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink( os.path.join( environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-c0") ) ) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl( [ "ShardMultiRestore", "-strategy=populateBlpCheckpoint,dontStartBinlogPlayer", "test_keyspace/80-c0", shard_1_slave1.tablet_alias, ], auto_log=True, ) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if not "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step("shard 2 master has not failed starting yet", timeout) continue logging.debug("shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json( [ "ExecuteFetch", shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0', ] ) self.assertEqual(qr["RowsAffected"], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step("shard 2 master has not started replication yet", timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl( [ "ShardMultiRestore", "-strategy=populateBlpCheckpoint", "test_keyspace/c0-", shard_1_slave1.tablet_alias, ], auto_log=True, ) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(["ValidateSchemaKeyspace", "--exclude_tables=unrelated", "test_keyspace"], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on 
shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker(["-cell", "test_nj", "SplitDiff", "test_keyspace/c0-"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly.tablet_alias, "rdonly"], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn("Binlog player state: Running", shard_2_master_status) self.assertIn( "<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>", shard_2_master_status ) self.assertIn("</html>", shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(["ChangeSlaveType", shard_1_slave2.tablet_alias, "replica"]) utils.run_vtctl(["ChangeSlaveType", shard_1_slave1.tablet_alias, "spare"]) shard_1_slave2.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("NOT_SERVING") # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. 
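# (this copy of the test reads the exported TabletStateName variable,
# presumably from each master's /debug/vars page, instead of probing
# the query service directly; either way the expected state below is
# NOT_SERVING)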
shard_2_master_vars = utils.get_vars(shard_2_master.port) self.assertEqual(shard_2_master_vars["TabletStateName"], "NOT_SERVING") shard_3_master_vars = utils.get_vars(shard_3_master.port) self.assertEqual(shard_3_master_vars["TabletStateName"], "NOT_SERVING") # now serve rdonly from the split shards utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "rdonly"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # then serve replica from the split shards utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-c0 c0-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # move replica back and forth utils.run_vtctl(["MigrateServedTypes", "-reverse", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-c0 c0-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(["ReparentShard", "test_keyspace/80-c0", shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use the vtworker checker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker(["-cell", "test_nj", "SplitDiff", "test_keyspace/c0-"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly.tablet_alias, "rdonly"], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( "DELAY 1: %s max_lag=%u avg_lag=%u", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count, ) logging.debug( "DELAY 2: %s max_lag=%u avg_lag=%u", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count, ) # then serve master from the split shards utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-c0 c0-\n" + "Partitions(rdonly): -80 80-c0 c0-\n" + "Partitions(replica): -80 80-c0 c0-\n" + "TabletTypes: master,rdonly,replica", keyspace_id_type=keyspace_id_type, ) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's 
good shard_2_master_status = shard_2_master.get_status() self.assertIn("No binlog player is running", shard_2_master_status) self.assertIn("</html>", shard_2_master_status) # scrap the original tablets in the original shard for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(["ScrapTablet", t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]) for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly]: utils.run_vtctl(["DeleteTablet", t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) # test RemoveShardCell utils.run_vtctl(["RemoveShardCell", "test_keyspace/-80", "test_nj"], auto_log=True, expect_fail=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_nj"], auto_log=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_ny"], auto_log=True) shard = utils.run_vtctl_json(["GetShard", "test_keyspace/80-"]) if shard["Cells"]: self.fail("Non-empty Cells record for shard: %s" % str(shard)) # delete the original shard utils.run_vtctl(["DeleteShard", "test_keyspace/80-"], auto_log=True) # kill everything tablet.kill_tablets( [ shard_0_master, shard_0_replica, shard_0_ny_slave, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly, ] )
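# A tiny sketch, not part of the original, for building the expected
# partition strings that check_srv_keyspace compares against; the
# older copies of this test append a TabletTypes line and the newer
# ones do not, hence the optional argument.
def expected_partitions(master, rdonly, replica, tablet_types=None):
  s = ('Partitions(master): %s\n'
       'Partitions(rdonly): %s\n'
       'Partitions(replica): %s\n') % (master, rdonly, replica)
  if tablet_types:
    s += 'TabletTypes: %s' % ','.join(tablet_types)
  return s

# e.g. expected_partitions('-80 80-c0 c0-', '-80 80-c0 c0-',
#                          '-80 80-c0 c0-',
#                          ['master', 'rdonly', 'replica'])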
def test_resharding(self): # create the keyspace with just one shard shard_master.init_tablet( 'master', keyspace='test_keyspace', shard='0', tablet_index=0) shard_replica.init_tablet( 'replica', keyspace='test_keyspace', shard='0', tablet_index=1) shard_rdonly1.init_tablet( 'rdonly', keyspace='test_keyspace', shard='0', tablet_index=2) for t in [shard_master, shard_replica, shard_rdonly1]: t.create_db('vt_test_keyspace') shard_master.start_vttablet(wait_for_state=None) shard_replica.start_vttablet(wait_for_state=None) shard_rdonly1.start_vttablet(wait_for_state=None) shard_master.wait_for_vttablet_state('SERVING') for t in [shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica') utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. 
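# (with only 3 rows, split_query apparently cannot produce 4 useful
# splits, so it collapses to a single query; and since the keyspace is
# still unsharded, that query is addressed by shard name, which the
# shard_part assertion below checks)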
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias]) # create the split shards shard_0_master.init_tablet( 'master', keyspace='test_keyspace', shard='-80', tablet_index=0) shard_0_replica.init_tablet( 'replica', keyspace='test_keyspace', shard='-80', tablet_index=1) shard_0_rdonly1.init_tablet( 'rdonly', keyspace='test_keyspace', shard='-80', tablet_index=2) shard_1_master.init_tablet( 'master', keyspace='test_keyspace', shard='80-', tablet_index=0) shard_1_replica.init_tablet( 'replica', keyspace='test_keyspace', shard='80-', tablet_index=1) shard_1_rdonly1.init_tablet( 'rdonly', keyspace='test_keyspace', shard='80-', tablet_index=2) for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_rdonly1, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) for t in [shard_0_master, shard_1_master]: t.wait_for_vttablet_state('SERVING') for t in [shard_0_replica, shard_0_rdonly1, shard_1_replica, shard_1_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) for t in [shard_0_replica, shard_1_replica]: utils.wait_for_tablet_type(t.tablet_alias, 'replica') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.wait_for_tablet_type(t.tablet_alias, 'rdonly') sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1] for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.vtgate.kill() utils.VtGate().start(cache_ttl='0', tablets=[ shard_master, shard_replica, shard_rdonly1, shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1]) utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. 
sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias]) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Initial clone (online). workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 3, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 1 (provokes an insert). shard_0_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=1', write=True) # Delete row 2 (provokes an insert). shard_1_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_1_master.mquery('vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 (provokes a delete). self._insert_value(shard_1_master, 'resharding1', 4, 'msg4', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg( ['SplitClone', '--exclude_tables', 'unrelated', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 1, 1) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') self.check_destination_master(shard_0_master, ['test_keyspace/0']) self.check_destination_master(shard_1_master, ['test_keyspace/0']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_replica, horizontal=True) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
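# (unlike the earlier copies of this test, this one keeps the result
# of the quick 80 percent check and only waits out the slower 100
# percent pass when the data has not fully arrived yet)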
logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_replica, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for the destination master tablet, make sure we have it all self.check_running_binlog_player(shard_0_master, 2000, 2000) self.check_running_binlog_player(shard_1_master, 6000, 2000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # make sure rdonly tablets are back to serving before hitting vtgate. for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1) utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1) # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. 
# we ask for 2 splits so there is only one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl( ['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # check the binlog players are gone now self.check_no_binlog_player(shard_0_master) self.check_no_binlog_player(shard_1_master) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1])
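# Illustrative sketch, not part of the test above: the keyspace_id
# constants used in these tests (e.g. 0xD000000000000000) decide which
# side of the -80/80- split a row lands on. Vitess compares the
# big-endian bytes of the keyspace_id against the hex range boundary,
# so any id whose leading byte is below 0x80 belongs to shard '-80'.
import struct

def shard_for_keyspace_id(keyspace_id):
    # '-80' covers [0x00..0x80), '80-' covers [0x80..): compare the
    # packed big-endian bytes against the one-byte boundary.
    return '-80' if struct.pack('>Q', keyspace_id) < b'\x80' else '80-'

assert shard_for_keyspace_id(0x1000000000000000) == '-80'
assert shard_for_keyspace_id(0xD000000000000000) == '80-'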
def test_resharding(self): # create the keyspace with just one shard utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_master.init_tablet('master', 'test_keyspace', '0') shard_replica.init_tablet('replica', 'test_keyspace', '0') shard_rdonly1.init_tablet('rdonly', 'test_keyspace', '0') shard_rdonly2.init_tablet('rdonly', 'test_keyspace', '0') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # create databases so vttablet can start behaving normally for t in [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None) # wait for the tablets shard_master.wait_for_vttablet_state('SERVING') shard_replica.wait_for_vttablet_state('SERVING') shard_rdonly1.wait_for_vttablet_state('SERVING') shard_rdonly2.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/0', shard_master.tablet_alias ], auto_log=True) # create the tables and add startup values self._create_schema() self._insert_startup_values() # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # create the split shards shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_rdonly1.init_tablet('rdonly', 'test_keyspace', '-80') shard_0_rdonly2.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_replica.init_tablet('replica', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly2.init_tablet('rdonly', 'test_keyspace', '80-') # start vttablet on the split shards (no db created, # so they're all not serving) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_0_rdonly2, shard_1_master, shard_1_replica, shard_1_rdonly1, shard_1_rdonly2 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_0_rdonly2, shard_1_master, shard_1_replica, shard_1_rdonly1, shard_1_rdonly2 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--strategy=-populate_blp_checkpoint', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly2.tablet_alias, 'rdonly'], auto_log=True)
# check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug("Waiting for binlog players to start on new masters...") shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff for -80") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_0_rdonly2.tablet_alias, 'rdonly'], auto_log=True) logging.debug("Running vtworker SplitDiff for 80-") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly2.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # then serve replica from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type)
# then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: master,rdonly,replica', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # scrap the original tablets in the original shard for t in [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]: utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True) tablet.kill_tablets( [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]) for t in [shard_master, shard_replica, shard_rdonly1, shard_rdonly2]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_rdonly1, shard_0_rdonly2, shard_1_master, shard_1_replica, shard_1_rdonly1, shard_1_rdonly2 ])
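# Illustrative sketch, not part of the test above: in this variant every
# vtworker SplitDiff is immediately followed by ChangeSlaveType calls,
# apparently because the worker takes the rdonly tablets it diffs out of
# rotation and the test has to put them back. A hypothetical helper
# capturing that pairing (cell name and flags taken from the calls above):
def split_diff_and_restore(keyspace_shard, rdonly_tablets):
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', keyspace_shard],
                       auto_log=True)
    for t in rdonly_tablets:
        # restore the tablet type the worker changed during the diff
        utils.run_vtctl(['ChangeSlaveType', t.tablet_alias, 'rdonly'],
                        auto_log=True)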
def test_resharding(self): utils.run_vtctl( [ "CreateKeyspace", "--sharding_column_name", "bad_column", "--sharding_column_type", "bytes", "--split_shard_count", "2", "test_keyspace", ] ) utils.run_vtctl(["SetKeyspaceShardingInfo", "test_keyspace", "keyspace_id", "uint64"], expect_fail=True) utils.run_vtctl( [ "SetKeyspaceShardingInfo", "-force", "-split_shard_count", "4", "test_keyspace", "keyspace_id", keyspace_id_type, ] ) shard_0_master.init_tablet("master", "test_keyspace", "-80") shard_0_replica.init_tablet("replica", "test_keyspace", "-80") shard_0_ny_rdonly.init_tablet("rdonly", "test_keyspace", "-80") shard_1_master.init_tablet("master", "test_keyspace", "80-") shard_1_slave1.init_tablet("replica", "test_keyspace", "80-") shard_1_slave2.init_tablet("spare", "test_keyspace", "80-") shard_1_ny_rdonly.init_tablet("rdonly", "test_keyspace", "80-") shard_1_rdonly1.init_tablet("rdonly", "test_keyspace", "80-") utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) ks = utils.run_vtctl_json(["GetSrvKeyspace", "test_nj", "test_keyspace"]) self.assertEqual(ks["split_shard_count"], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1, ]: t.create_db("vt_test_keyspace") t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state("SERVING") shard_0_replica.wait_for_vttablet_state("SERVING") shard_0_ny_rdonly.wait_for_vttablet_state("SERVING") shard_1_master.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("SERVING") shard_1_slave2.wait_for_vttablet_state("NOT_SERVING") # spare shard_1_ny_rdonly.wait_for_vttablet_state("SERVING") shard_1_rdonly1.wait_for_vttablet_state("SERVING") # reparent to make the tablets work utils.run_vtctl(["InitShardMaster", "test_keyspace/-80", shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(["InitShardMaster", "test_keyspace/80-", shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet("master", "test_keyspace", "80-c0") shard_2_replica1.init_tablet("spare", "test_keyspace", "80-c0") shard_2_replica2.init_tablet("spare", "test_keyspace", "80-c0") shard_3_master.init_tablet("master", "test_keyspace", "c0-") shard_3_replica.init_tablet("spare", "test_keyspace", "c0-") shard_3_rdonly1.init_tablet("rdonly", "test_keyspace", "c0-") # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type="replica") for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state("NOT_SERVING") utils.run_vtctl(["InitShardMaster", "test_keyspace/80-c0", shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(["InitShardMaster", "test_keyspace/c0-", shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" + "Partitions(rdonly): -80 80-\n" + "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, )
# the worker will do everything. We test with source_reader_count=10 # (down from default=20) as the connection pool is not big enough for 20. # min_table_size_for_split is set to 1 so as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ("test_keyspace/80-c0", "test_keyspace/c0-"): utils.run_vtctl( ["CopySchemaShard", "--exclude_tables", "unrelated", shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True, ) utils.run_vtworker( [ "--cell", "test_nj", "--command_display_interval", "10ms", "SplitClone", "--exclude_tables", "unrelated", "--strategy=-populate_blp_checkpoint", "--source_reader_count", "10", "--min_table_size_for_split", "1", "test_keyspace/80-", ], auto_log=True, ) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly1.tablet_alias, "rdonly"], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(["ValidateSchemaKeyspace", "--exclude_tables=unrelated", "test_keyspace"], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it.
logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ["-cell", "test_nj", "SplitDiff", "--exclude_tables", "unrelated", "test_keyspace/c0-"], auto_log=True ) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn("Binlog player state: Running", shard_2_master_status) self.assertIn( "<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>", shard_2_master_status ) self.assertIn("</html>", shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl(["ChangeSlaveType", shard_1_slave2.tablet_alias, "replica"]) utils.run_vtctl(["ChangeSlaveType", shard_1_slave1.tablet_alias, "spare"]) shard_1_slave2.wait_for_vttablet_state("SERVING") shard_1_slave1.wait_for_vttablet_state("NOT_SERVING") # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl(["RunHealthCheck", shard_3_master.tablet_alias, "replica"], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(["VtTabletStreamHealth", "-count", "1", master.tablet_alias]) logging.debug("Got health: %s", str(stream_health)) self.assertIn("realtime_stats", stream_health) self.assertNotIn("serving", stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(["MigrateServedTypes", "--cells=test_nj", "test_keyspace/80-", "rdonly"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_srv_keyspace( "test_ny", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "rdonly"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_srv_keyspace( "test_ny", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards source_tablet = shard_1_slave2 destination_shards = ["test_keyspace/80-c0", "test_keyspace/c0-"] utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-c0 c0-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl(["MigrateServedTypes", "-reverse", "test_keyspace/80-", "replica"], auto_log=True) # After a backwards migration, queryservice should be enabled on source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other reasons than the migration, # so check the shard record instead of the tablets directly utils.check_shard_query_services(self, destination_shards, tablet.Tablet.tablet_type_value["REPLICA"], False) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-\n", keyspace_id_type=keyspace_id_type, ) utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "replica"], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations 
utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, tablet.Tablet.tablet_type_value["REPLICA"], True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-c0 c0-\n", keyspace_id_type=keyspace_id_type, ) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(["PlannedReparentShard", "test_keyspace/80-c0", shard_2_replica1.tablet_alias]) logging.debug("Inserting lots of data on source shard after reparenting") self._insert_lots(3000, base=2000) logging.debug("Checking 80 percent of data was sent fairly quickly") self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ["-cell", "test_nj", "SplitDiff", "--exclude_tables", "unrelated", "test_keyspace/c0-"], auto_log=True ) utils.run_vtctl(["ChangeSlaveType", shard_1_rdonly1.tablet_alias, "rdonly"], auto_log=True) utils.run_vtctl(["ChangeSlaveType", shard_3_rdonly1.tablet_alias, "rdonly"], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( "DELAY 1: %s max_lag=%d avg_lag=%d", monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count, ) logging.debug( "DELAY 2: %s max_lag=%d avg_lag=%d", monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count, ) # mess with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(["SourceShardDelete", "test_keyspace/c0-", "0"], auto_log=True) utils.run_vtctl( ["SourceShardAdd", "--key_range=80-", "test_keyspace/c0-", "0", "test_keyspace/80-"], auto_log=True ) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(["MigrateServedTypes", "test_keyspace/80-", "master"], auto_log=True) utils.check_srv_keyspace( "test_nj", "test_keyspace", "Partitions(master): -80 80-c0 c0-\n" "Partitions(rdonly): -80 80-c0 c0-\n" "Partitions(replica): -80 80-c0 c0-\n", keyspace_id_type=keyspace_id_type, ) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn("No binlog player is running", shard_2_master_status) self.assertIn("</html>", shard_2_master_status) # scrap the original tablets in the original shard for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(["ScrapTablet", t.tablet_alias], auto_log=True) tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(["DeleteTablet", t.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone
utils.run_vtctl(["RebuildKeyspaceGraph", "test_keyspace"], auto_log=True) # test RemoveShardCell utils.run_vtctl(["RemoveShardCell", "test_keyspace/-80", "test_nj"], auto_log=True, expect_fail=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_nj"], auto_log=True) utils.run_vtctl(["RemoveShardCell", "test_keyspace/80-", "test_ny"], auto_log=True) shard = utils.run_vtctl_json(["GetShard", "test_keyspace/80-"]) self.assertNotIn("cells", shard) # delete the original shard utils.run_vtctl(["DeleteShard", "test_keyspace/80-"], auto_log=True) # kill everything tablet.kill_tablets( [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1, ] )
def test_resharding(self): # create the keyspace with just one shard shard_master.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='0') shard_replica.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='0') shard_rdonly1.start_vttablet( wait_for_state=None, target_tablet_type='rdonly', init_keyspace='test_keyspace', init_shard='0') for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0', shard_master.tablet_alias], auto_log=True) for t in [shard_master, shard_replica, shard_rdonly1]: t.wait_for_vttablet_state('SERVING') # create the tables and add startup values self._create_schema() self._insert_startup_values() # reload schema on all tablets so we can query them for t in [shard_master, shard_replica, shard_rdonly1]: utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True) # must start vtgate after tablets are up, or else wait until the 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query. utils.VtGate().start(cache_ttl='0') # check the Map Reduce API works correctly, should use ExecuteShards, # as we're not sharded yet. # we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['shard_part']['shards'][0], '0') # change the schema, backfill keyspace_id, and change schema again self._add_sharding_key_to_schema() self._backfill_keyspace_id(shard_master) self._mark_sharding_key_not_null() # now we can be a sharded keyspace (and propagate to SrvKeyspace) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', keyspace_id_type]) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # run a health check on source replica so it responds to discovery utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias, 'replica']) # create the split shards shard_0_master.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='-80') shard_0_replica.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='-80') shard_0_rdonly1.start_vttablet( wait_for_state=None, target_tablet_type='rdonly', init_keyspace='test_keyspace', init_shard='-80') shard_1_master.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='80-') shard_1_replica.start_vttablet( wait_for_state=None, target_tablet_type='replica', init_keyspace='test_keyspace', init_shard='80-') shard_1_rdonly1.start_vttablet( wait_for_state=None, target_tablet_type='rdonly', init_keyspace='test_keyspace', init_shard='80-') # start vttablet on the split shards (no db created, # so they're all not serving) sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1] for t in sharded_tablets: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) for t in sharded_tablets: t.wait_for_vttablet_state('SERVING') # must restart vtgate after tablets are up, or else wait until the 1min refresh # we want cache_ttl at zero so we re-read the topology for every test query.
utils.vtgate.kill() utils.VtGate().start(cache_ttl='0') # check the Map Reduce API works correctly, should use ExecuteKeyRanges now, # as we are sharded (with just one shard). # again, we have 3 values in the database, asking for 4 splits will get us # a single query. sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 4) self.assertEqual(len(s), 1) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') # There must be one empty KeyRange which represents the full keyspace. self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {}) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias, 'rdonly']) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/0'], auto_log=True) # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True) # check the binlog players are running logging.debug('Waiting for binlog players to start on new masters...') shard_0_master.wait_for_binlog_player_count(1) shard_1_master.wait_for_binlog_player_count(1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) # use vtworker to compare the data logging.debug('Running vtworker SplitDiff for -80') for t in [shard_0_rdonly1, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'], auto_log=True) logging.debug('Running vtworker SplitDiff for 80-') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], expect_fail=True) # now serve rdonly from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) # make sure rdonly tablets are back to serving before hitting vtgate.
for t in [shard_0_rdonly1, shard_1_rdonly1]: t.wait_for_vttablet_state('SERVING') # check the Map Reduce API works correctly, should use ExecuteKeyRanges # on both destination shards now. # we ask for 2 splits so there is only one per shard sql = 'select id, msg from resharding1' s = utils.vtgate.split_query(sql, 'test_keyspace', 2) self.assertEqual(len(s), 2) self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace') self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1) self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1) # then serve replica from the split shards source_tablet = shard_replica destination_tablets = [shard_0_replica, shard_1_replica] utils.run_vtctl( ['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, source_tablet, True, False) utils.check_tablet_query_services(self, destination_tablets, False, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, source_tablet, False, True) utils.check_tablet_query_services(self, destination_tablets, True, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # then serve master from the split shards utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # check the binlog players are gone now shard_0_master.wait_for_binlog_player_count(0) shard_1_master.wait_for_binlog_player_count(0) # make sure we can't delete a shard with tablets utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True) # remove the original tablets in the original shard tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1]) for t in [shard_replica, shard_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True) # kill everything else tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master, shard_1_replica, shard_1_rdonly1])
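# Illustrative sketch, not part of the test above: _check_lots_timeout
# evidently polls the destination and returns the percentage of rows
# seen, which the caller compares to 100 to skip the longer follow-up
# wait. A minimal sketch of that polling pattern; all names here are
# hypothetical, not the test's real implementation:
import time

def check_percent_within(count_visible, total, percent, timeout_s):
    # Poll until `percent` percent of `total` rows are visible or the
    # timeout expires; return the percentage actually observed.
    deadline = time.time() + timeout_s
    while True:
        seen = 100.0 * count_visible() / total
        if seen >= percent or time.time() >= deadline:
            return seen
        time.sleep(0.5)  # hypothetical poll interval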
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl(['CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', '--split_shard_count', '2', 'test_keyspace']) utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'], expect_fail=True) utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', '-split_shard_count', '4', 'test_keyspace', 'keyspace_id', keyspace_id_type]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['split_shard_count'], 4) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_rdonly.wait_for_vttablet_state('SERVING') shard_1_rdonly1.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica']) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly']) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1]: t.start_vttablet(wait_for_state=None) for t in [shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1]: t.wait_for_vttablet_state('NOT_SERVING')
utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias], auto_log=True) utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as the connection pool is not big enough for 20. # min_table_size_for_split is set to 1 so as to force a split even on the # small table we have. # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard], auto_log=True) utils.run_vtworker(['--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', 'test_keyspace/80-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) self._check_stream_health_equals_binlog_player_vars(shard_2_master) self._check_stream_health_equals_binlog_player_vars(shard_3_master) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway.
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias, 'rdonly']) logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>' '<b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low') monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high') # tests a failover switching serving to a different replica utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias, 'replica']) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias, 'replica'], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type) utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug('Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d', monitor_thread_1.object_name, monitor_thread_1.max_lag, monitor_thread_1.lag_sum / monitor_thread_1.sample_count) logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d', monitor_thread_2.object_name, monitor_thread_2.max_lag, monitor_thread_2.lag_sum / monitor_thread_2.sample_count) # mess with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl(['SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-'], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type) utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now shard_2_master.wait_for_binlog_player_count(0) shard_3_master.wait_for_binlog_player_count(0) # get status for a destination master tablet, make sure it's good shard_2_master_status = shard_2_master.get_status() self.assertIn('No binlog player is running', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # delete the original tablets in the original shard
tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]) for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl(['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards should be gone utils.run_vtctl( ['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl( ['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1])
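# Illustrative sketch, not part of the test above: the shard names in
# that test come from halving the keyrange '80-' at 0xc0, giving the
# children '80-c0' and 'c0-'. A sketch of that midpoint computation for
# the single-byte boundaries used here ('' denotes an open end of the
# keyspace):
def split_keyrange(start_hex, end_hex):
    def fmt(v):
        return '' if v in (0x00, 0x100) else '%02x' % v
    lo = int(start_hex, 16) if start_hex else 0x00
    hi = int(end_hex, 16) if end_hex else 0x100
    mid = (lo + hi) // 2
    return [(fmt(lo), fmt(mid)), (fmt(mid), fmt(hi))]

assert split_keyrange('80', '') == [('80', 'c0'), ('c0', '')]
assert split_keyrange('', '80') == [('', '40'), ('40', '80')]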
def test_cells_aliases(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'custom_ksid_col',
                   '--sharding_column_type', base_sharding.keyspace_id_type,
                   'test_keyspace'])

  shard_0_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = (base_sharding.keyspace_id_type ==
                     keyrange_constants.KIT_BYTES)

  # create databases so vttablet can start behaving somewhat normally
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args,
                     binlog_use_v3_resharding_mode=False)

  # wait for the tablets (replication is not setup, they won't be healthy)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
  self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # make sure the srv keyspace graph looks as expected
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace(
      'test_ny', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # Bootstrap vtgate
  utils.apply_vschema(vschema)

  # Add an alias so vtgate can route to replica/rdonly tablets that are in
  # a different cell, as long as that cell belongs to the same alias.
  if use_alias:
    utils.run_vtctl(['AddCellsAlias', '-cells', 'test_nj,test_ny',
                     'region_east_coast'], auto_log=True)
    tablet_types_to_wait = 'MASTER,REPLICA'
  else:
    tablet_types_to_wait = 'MASTER'

  utils.VtGate().start(
      tablets=[shard_0_master, shard_1_master],
      tablet_types_to_wait=tablet_types_to_wait,
      cells_to_watch='test_nj,test_ny',
  )
  utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)

  vtgate_conn = self._get_connection()
  result = self._execute_on_tablet_type(
      vtgate_conn, 'master', 'select count(*) from test_table', {})
  self.assertEqual(
      result, ([(3,)], 1, 0, [('count(*)', self.int_type)]))

  if use_alias:
    vtgate_conn = self._get_connection()
    result = self._execute_on_tablet_type(
        vtgate_conn, 'master', 'select count(*) from test_table', {})
    self.assertEqual(
        result, ([(3,)], 1, 0, [('count(*)', self.int_type)]))

    vtgate_conn = self._get_connection()
    result = self._execute_on_tablet_type(
        vtgate_conn, 'replica', 'select count(*) from test_table', {})
    self.assertEqual(
        result, ([(3,)], 1, 0, [('count(*)', self.int_type)]))

    vtgate_conn = self._get_connection()
    result = self._execute_on_tablet_type(
        vtgate_conn, 'rdonly', 'select count(*) from test_table', {})
    self.assertEqual(
        result, ([(3,)], 1, 0, [('count(*)', self.int_type)]))
  else:
    vtgate_conn = self._get_connection()
    try:
      self._execute_on_tablet_type(
          vtgate_conn, 'replica', 'select count(*) from test_table', {})
      self.fail('Expected execute to fail, did not get error')
    except Exception as e:
      s = str(e)
      self.assertIn('80.replica, no valid tablet: node', s)

    vtgate_conn = self._get_connection()
    try:
      self._execute_on_tablet_type(
          vtgate_conn, 'rdonly', 'select count(*) from test_table', {})
      self.fail('Expected execute to fail, did not get error')
    except Exception as e:
      s = str(e)
      self.assertIn('80.rdonly, no valid tablet: node', s)
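# The assertions above encode the routing contract of cells aliases: once
# test_nj and test_ny are grouped under one alias, vtgate may route replica
# and rdonly reads to tablets in the other cell; without the alias those
# reads fail with a "no valid tablet" error. A minimal sketch of that
# contract as a helper, assuming the same _get_connection and
# _execute_on_tablet_type helpers the test uses (assert_cross_cell_routing
# itself is hypothetical):

def assert_cross_cell_routing(self, expect_routable):
  for tablet_type in ('replica', 'rdonly'):
    conn = self._get_connection()
    if expect_routable:
      # with the alias in place, the read succeeds against the other cell
      result = self._execute_on_tablet_type(
          conn, tablet_type, 'select count(*) from test_table', {})
      self.assertEqual(result[0], [(3,)])
    else:
      # without the alias, vtgate has no local tablet of this type to use
      with self.assertRaises(Exception) as cm:
        self._execute_on_tablet_type(
            conn, tablet_type, 'select count(*) from test_table', {})
      self.assertIn('no valid tablet', str(cm.exception))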
def test_resharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_ksid_col', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'custom_ksid_col', keyspace_id_type])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

  # create databases so vttablet can start behaving somewhat normally
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

  # wait for the tablets (replication is not setup, the slaves won't be
  # healthy)
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('NOT_SERVING')
  shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
  shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
  shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
  self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_slave1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
  shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
  shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  shard_2_master.start_vttablet(wait_for_state=None)
  shard_3_master.start_vttablet(wait_for_state=None)
  for t in [shard_2_replica1, shard_2_replica2,
            shard_3_replica, shard_3_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-',
                   shard_3_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  for s in ['-80', '80-', '80-c0', 'c0-']:
    self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # disable shard_1_slave2, so we're sure filtered replication will go
  # from shard_1_slave1
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_1_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)

  # --max_tps is only specified to enable the throttler and ensure that the
  # code is executed. But the intent here is not to throttle the test, hence
  # the rate limit is set very high.
  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'LegacySplitClone',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--max_tps', '9999',
                      'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated',
                   'test_keyspace'], auto_log=True)

  # check the binlog players are running and exporting vars
  self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
  self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

  # Check that the throttler was enabled.
  self.check_throttler_service(shard_2_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)
  self.check_throttler_service(shard_3_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablets, make sure we have it all
  self.check_running_binlog_player(shard_2_master, 4000, 2000)
  self.check_running_binlog_player(shard_3_master, 4000, 2000)

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias,
                   'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

  # test data goes through again
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000, base=1000)
  logging.debug('Checking 80 percent of data was sent quickly')
  self._check_lots_timeout(1000, 80, 5, base=1000)
  self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
                                min_statements=800, min_transactions=800)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even health check that is enabled on
  # master 3 should not interfere (we run it to be sure).
  utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                  auto_log=True)
  for master in [shard_2_master, shard_3_master]:
    utils.check_tablet_query_service(self, master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_3_master.get_healthz()

  # now serve rdonly from the split shards, in test_nj only
  utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                   'test_keyspace/80-', 'rdonly'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # now serve rdonly from the split shards, everywhere
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # then serve replica from the split shards
  destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'], auto_log=True)

  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, True, False)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # use vtworker to compare the data again
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  # mock with the SourceShard records to test 'vtctl SourceShardDelete'
  # and 'vtctl SourceShardAdd'
  utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                  auto_log=True)
  utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                   'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                  auto_log=True)

  # then serve master from the split shards, make sure the source master's
  # query service is now turned off
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-c0 c0-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_2_master)
  self.check_no_binlog_player(shard_3_master)

  # delete the original tablets in the original shard
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_rdonly, shard_1_rdonly1])
  for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_1_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  self.assertNotIn('cells', shard)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly1])
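# Throughout this test, rows are routed by the top bits of custom_ksid_col:
# the source shard 80- owns keyspace_ids in [0x80..., 0x100...), and the
# split produces 80-c0 and c0-. A minimal sketch of that mapping for the
# uint64 case (the KIT_BYTES case compares the raw big-endian bytes
# instead); destination_shard is an illustrative helper, not part of the
# test module:

def destination_shard(keyspace_id):
  """Return the split shard that owns a uint64 keyspace_id from shard 80-."""
  assert keyspace_id >= 0x8000000000000000, 'not owned by source shard 80-'
  if keyspace_id < 0xc000000000000000:
    return '80-c0'
  return 'c0-'

# e.g. destination_shard(0x9000000000000000) == '80-c0' and
# destination_shard(0xd000000000000000) == 'c0-', which is why the later
# versions of this test monitor the "insert_low" row on shard 2 and the
# "insert_high" row on shard 3.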
def test_resharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_ksid_col', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'custom_ksid_col', keyspace_id_type])

  shard_0_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

  # create databases so vttablet can start behaving somewhat normally
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

  # wait for the tablets (replication is not setup, they won't be healthy)
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
  self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_slave1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the split shards
  shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  shard_2_master.start_vttablet(wait_for_state=None)
  shard_3_master.start_vttablet(wait_for_state=None)
  for t in [shard_2_replica1, shard_2_replica2,
            shard_3_replica, shard_3_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-c0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/c0-',
                   shard_3_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  for s in ['-80', '80-', '80-c0', 'c0-']:
    self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # disable shard_1_slave2, so we're sure filtered replication will go
  # from shard_1_slave1
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_1_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)

  # --max_tps is only specified to enable the throttler and ensure that the
  # code is executed. But the intent here is not to throttle the test, hence
  # the rate limit is set very high.
  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'LegacySplitClone',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--max_tps', '9999',
                      'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated',
                   'test_keyspace'], auto_log=True)

  # check the binlog players are running and exporting vars
  self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
  self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

  # Check that the throttler was enabled.
  self.check_throttler_service(shard_2_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)
  self.check_throttler_service(shard_3_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablets, make sure we have it all
  self.check_running_binlog_player(shard_2_master, 4000, 2000)
  self.check_running_binlog_player(shard_3_master, 4000, 2000)

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias,
                   'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

  # test data goes through again
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000, base=1000)
  logging.debug('Checking 80 percent of data was sent quickly')
  self._check_lots_timeout(1000, 80, 5, base=1000)
  self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
                                min_statements=800, min_transactions=800)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even health check that is enabled on
  # master 3 should not interfere (we run it to be sure).
  utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                  auto_log=True)
  for master in [shard_2_master, shard_3_master]:
    utils.check_tablet_query_service(self, master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_3_master.get_healthz()

  # now serve rdonly from the split shards, in test_nj only
  utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                   'test_keyspace/80-', 'rdonly'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # now serve rdonly from the split shards, everywhere
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # then serve replica from the split shards
  destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'], auto_log=True)

  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, True, False)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # use vtworker to compare the data again
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  # mock with the SourceShard records to test 'vtctl SourceShardDelete'
  # and 'vtctl SourceShardAdd'
  utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                  auto_log=True)
  utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                   'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                  auto_log=True)

  # then serve master from the split shards, make sure the source master's
  # query service is now turned off
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-c0 c0-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_2_master)
  self.check_no_binlog_player(shard_3_master)

  # delete the original tablets in the original shard
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_rdonly, shard_1_rdonly1])
  for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_1_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  self.assertNotIn('cells', shard)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly1])
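# The served-type migration above always walks the same ladder: rdonly
# first (optionally canaried in a single cell), then replica, then master,
# and every step before master can be undone with -reverse. A minimal
# sketch of that ladder; migrate_reads_then_writes is a hypothetical
# wrapper over the same vtctl calls the test makes:

def migrate_reads_then_writes(source_shard, canary_cell=None):
  for served_type in ('rdonly', 'replica'):
    if canary_cell:
      # migrate one cell first and verify before going global
      utils.run_vtctl(['MigrateServedTypes', '--cells=%s' % canary_cell,
                       source_shard, served_type], auto_log=True)
    utils.run_vtctl(['MigrateServedTypes', source_shard, served_type],
                    auto_log=True)
  # master goes last: it cannot be canaried per cell and is the point of
  # no return for writes
  utils.run_vtctl(['MigrateServedTypes', source_shard, 'master'],
                  auto_log=True)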
def test_resharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'keyspace_id', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_1_master,
            shard_1_slave1, shard_1_slave2, shard_1_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

  # wait for the tablets
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
  shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
  shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')
  shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-',
                   shard_3_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # take the snapshot for the split
  utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-',
                   shard_1_slave1.tablet_alias], auto_log=True)

  # the snapshot_copy hook will copy the snapshot files to
  # VTDATAROOT/tmp/... as a test. We want to use these for one half,
  # but not for the other, so we test both scenarios.
  os.unlink(os.path.join(environment.tmproot,
                         "snapshot-from-%s-for-%s.tar" %
                         (shard_1_slave1.tablet_alias, "80-C0")))

  # wait for tablet's binlog server service to be enabled after snapshot
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # perform the restores: first one from source tablet. We removed the
  # storage backup, so it's coming from the tablet itself.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-C0', shard_1_slave1.tablet_alias],
                  auto_log=True)

  # second restore from storage: to be sure, we stop vttablet, and restart
  # it afterwards
  shard_1_slave1.kill_vttablet()
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/C0-', shard_1_slave1.tablet_alias],
                  auto_log=True)
  shard_1_slave1.start_vttablet(wait_for_state=None)
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                  auto_log=True)

  # check the binlog players are running
  shard_2_master.wait_for_binlog_player_count(1)
  shard_3_master.wait_for_binlog_player_count(1)

  # check that binlog server exported the stats vars
  self._check_binlog_server_vars(shard_1_slave1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      'test_keyspace/C0-'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # get status for a destination master tablet, make sure we have it all
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('Binlog player state: Running', shard_2_master_status)
  self.assertIn('<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>'
                '<b>Transaction</b>: 2000<br></td>', shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
  monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias,
                   'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

  # test data goes through again
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000, base=1000)
  logging.debug("Checking 80 percent of data was sent quickly")
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0',
                   shard_2_replica1.tablet_alias])
  logging.debug("Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use the vtworker checker to compare the data again
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      'test_keyspace/C0-'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name,
                monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name,
                monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-C0 C0-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # get status for a destination master tablet, make sure it's good
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('No binlog player is running', shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # scrap the original tablets in the original shard
  for t in [shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_rdonly]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_rdonly])

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  if shard['Cells']:
    self.fail("Non-empty Cells record for shard: %s" % str(shard))

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly])
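# InsertThread and MonitorLagThread are defined elsewhere in this test
# module; the idea is to keep writing timestamped rows to the source master
# while measuring how long they take to appear on a destination replica
# through filtered replication. A minimal sketch of the monitor side,
# assuming (hypothetically) a 'timestamps' table with an id column and a
# time_created_ns column written by the insert thread:

import threading
import time

class LagMonitor(threading.Thread):
  """Polls a destination tablet and records replication lag in seconds."""

  def __init__(self, tablet_obj, object_name, row_id):
    super(LagMonitor, self).__init__()
    self.tablet = tablet_obj
    self.object_name = object_name
    self.row_id = row_id
    self.done = False
    self.max_lag = 0
    self.daemon = True
    self.start()

  def run(self):
    while not self.done:
      # read the freshest copy of the row the insert thread keeps updating
      rows = self.tablet.mquery(
          'vt_test_keyspace',
          'select time_created_ns from timestamps where id=%d' % self.row_id)
      if rows:
        lag = time.time() - rows[0][0] / 1e9
        self.max_lag = max(self.max_lag, lag)
      time.sleep(1)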
def test_resharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'keyspace_id', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_1_master,
            shard_1_slave1, shard_1_slave2, shard_1_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
  shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
  shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')
  shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly]:
    t.start_vttablet(wait_for_state=None)
  shard_2_master.wait_for_vttablet_state('CONNECTING')
  shard_2_replica1.wait_for_vttablet_state('NOT_SERVING')
  shard_2_replica2.wait_for_vttablet_state('NOT_SERVING')
  shard_3_master.wait_for_vttablet_state('CONNECTING')
  shard_3_replica.wait_for_vttablet_state('NOT_SERVING')
  shard_3_rdonly.wait_for_vttablet_state('CONNECTING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-',
                   shard_3_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', '-use-served-types',
                   'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica')

  # take the snapshot for the split
  utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-',
                   shard_1_slave1.tablet_alias], auto_log=True)

  # wait for tablet's binlog server service to be enabled after snapshot,
  # and check all the others while we're at it
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # perform the restore.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-C0', shard_1_slave1.tablet_alias],
                  auto_log=True)
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/C0-', shard_1_slave1.tablet_alias],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                  auto_log=True)

  # check the binlog players are running
  shard_2_master.wait_for_binlog_player_count(1)
  shard_3_master.wait_for_binlog_player_count(1)

  # check that binlog server exported the stats vars
  self._check_binlog_server_vars(shard_1_slave1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      'test_keyspace/C0-'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
  monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias,
                   'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

  # test data goes through again
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000, base=1000)
  logging.debug("Checking 80 percent of data was sent quickly")
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica')

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica')

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica')
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica')

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0',
                   shard_2_replica1.tablet_alias])
  logging.debug("Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use the vtworker checker to compare the data again
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      'test_keyspace/C0-'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name,
                monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name,
                monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-C0 C0-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica')

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_rdonly,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly])
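# _insert_lots and _check_lots_timeout are defined elsewhere in this test
# module. The timeout variant polls until at least the requested percentage
# of the inserted rows is visible on the destination shards, failing the
# test if the deadline passes first. A minimal sketch of that polling loop,
# assuming a _check_lots(count, base=0) helper that returns the percentage
# of rows seen so far:

import time

def _check_lots_timeout(self, count, threshold, timeout, base=0):
  deadline = time.time() + timeout
  while True:
    v = self._check_lots(count, base=base)
    if v >= threshold:
      return v
    if time.time() > deadline:
      self.fail('only %f%% of the data made it after %ds' % (v, timeout))
    time.sleep(1)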
def verify(self):
  self.assert_shard_data_equal(0, worker.shard_master,
                               worker.shard_0_tablets.replica)
  self.assert_shard_data_equal(1, worker.shard_master,
                               worker.shard_1_tablets.replica)

  # Verify effect of MigrateServedTypes. Dest shards are serving now.
  utils.check_srv_keyspace('test_nj', self.KEYSPACE,
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n')

  # Check that query service is disabled (source shard) or enabled (dest).
  # The 'rdonly' tablet requires an explicit healthcheck first because
  # the following sequence of events is happening in this test:
  # - SplitDiff returns 'rdonly' as 'spare' tablet (NOT_SERVING)
  # - MigrateServedTypes runs and does not refresh the 'spare' tablet
  #   (still NOT_SERVING). Shard_TabletControl.DisableQueryService=true
  #   will be set in the topology.
  # - explicit or periodic healthcheck runs:
  #   a) tablet seen as caught up, change type from 'spare' to 'rdonly'
  #      (change to SERVING)
  #   b) post-action callback agent.refreshTablet() reads the topology
  #      and finds out that DisableQueryService=true is set
  #      (change to NOT_SERVING)
  #
  # We must run an explicit healthcheck or we can see one of the two states:
  # - NOT_SERVING, DisableQueryService=false, tablet type 'spare'
  #   (immediately after SplitDiff returned)
  # - SERVING, DisableQueryService=false, tablet type 'rdonly'
  #   (during healthcheck before post-action callback is called)
  utils.run_vtctl(['RunHealthCheck', worker.shard_rdonly1.tablet_alias],
                  auto_log=True)

  # source shard: query service must be disabled after MigrateServedTypes.
  utils.check_tablet_query_service(self, worker.shard_rdonly1,
                                   serving=False,
                                   tablet_control_disabled=True)
  utils.check_tablet_query_service(self, worker.shard_replica,
                                   serving=False,
                                   tablet_control_disabled=True)
  utils.check_tablet_query_service(self, worker.shard_master,
                                   serving=False,
                                   tablet_control_disabled=True)

  # dest shard -80: query service must be enabled after MigrateServedTypes.
  # Run an explicit healthcheck because the 'rdonly' tablet may still be
  # 'spare'.
  utils.run_vtctl(['RunHealthCheck', worker.shard_0_rdonly1.tablet_alias],
                  auto_log=True)
  utils.check_tablet_query_service(self, worker.shard_0_rdonly1,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_0_replica,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_0_master,
                                   serving=True,
                                   tablet_control_disabled=False)

  # dest shard 80-: query service must be enabled after MigrateServedTypes.
  # Run an explicit healthcheck because the 'rdonly' tablet is still
  # 'spare'.
  utils.run_vtctl(['RunHealthCheck', worker.shard_1_rdonly1.tablet_alias],
                  auto_log=True)
  utils.check_tablet_query_service(self, worker.shard_1_rdonly1,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_1_replica,
                                   serving=True,
                                   tablet_control_disabled=False)
  utils.check_tablet_query_service(self, worker.shard_1_master,
                                   serving=True,
                                   tablet_control_disabled=False)
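# The long comment in verify() describes a race between the healthcheck
# changing the tablet type and the topology callback applying
# DisableQueryService. The deterministic pattern is always the same: force
# a healthcheck first, then assert the serving state and the tablet-control
# flag together. A minimal sketch of that pattern; check_and_settle is a
# hypothetical helper, not part of the test module:

def check_and_settle(test, tablet_obj, is_destination):
  # settle the tablet type and the DisableQueryService callback first
  utils.run_vtctl(['RunHealthCheck', tablet_obj.tablet_alias],
                  auto_log=True)
  # destinations serve after MigrateServedTypes; sources are locked out
  utils.check_tablet_query_service(
      test, tablet_obj,
      serving=is_destination,
      tablet_control_disabled=not is_destination)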