def test_scrap_and_reinit(self):
    """Scraps and re-inits a replica, checking the replication graph stays sane.

    Also verifies that a manually-added bogus replication link is removed
    by ShardReplicationFix.
    """
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')

    # one master one replica
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('replica', 'test_keyspace', '0')

    # make sure the replica is in the replication graph
    before_scrap = utils.run_vtctl_json(['GetShardReplication', 'test_nj',
                                         'test_keyspace/0'])
    self.assertEqual(1, len(before_scrap['ReplicationLinks']),
                     'wrong replication links before: %s' % str(before_scrap))

    # scrap and re-init
    utils.run_vtctl(['ScrapTablet', '-force', tablet_62044.tablet_alias])
    tablet_62044.init_tablet('replica', 'test_keyspace', '0')

    after_scrap = utils.run_vtctl_json(['GetShardReplication', 'test_nj',
                                        'test_keyspace/0'])
    self.assertEqual(1, len(after_scrap['ReplicationLinks']),
                     'wrong replication links after: %s' % str(after_scrap))

    # manually add a bogus entry to the replication graph, and check
    # it is removed by ShardReplicationFix
    utils.run_vtctl(['ShardReplicationAdd', 'test_keyspace/0',
                     'test_nj-0000066666', 'test_nj-0000062344'],
                    auto_log=True)
    with_bogus = utils.run_vtctl_json(['GetShardReplication', 'test_nj',
                                       'test_keyspace/0'])
    self.assertEqual(2, len(with_bogus['ReplicationLinks']),
                     'wrong replication links with bogus: %s' % str(with_bogus))
    utils.run_vtctl(['ShardReplicationFix', 'test_nj', 'test_keyspace/0'],
                    auto_log=True)
    after_fix = utils.run_vtctl_json(['GetShardReplication', 'test_nj',
                                      'test_keyspace/0'])
    # BUG FIX: the original asserted on len(after_scrap) here, so the
    # post-ShardReplicationFix state was never actually checked.
    self.assertEqual(1, len(after_fix['ReplicationLinks']),
                     'wrong replication links after fix: %s' % str(after_fix))
def _test_reparent_from_outside_check(self, brutal, base_time):
    """Checks topology, status page and health stream after external reparent."""
    # make sure the shard replication graph is fine
    shard_replication = utils.run_vtctl_json(
        ["GetShardReplication", "test_nj", "test_keyspace/0"])
    hashed_nodes = {}
    for node in shard_replication["nodes"]:
        alias = node["tablet_alias"]
        hashed_nodes[alias["cell"] + "-" + str(alias["uid"])] = True
    logging.debug("Got shard replication nodes: %s", str(hashed_nodes))
    expected_nodes = {"test_nj-41983": True, "test_nj-62044": True}
    if not brutal:
        expected_nodes["test_nj-62344"] = True
    self.assertEqual(
        expected_nodes, hashed_nodes,
        "Got unexpected nodes: %s != %s"
        % (str(expected_nodes), str(hashed_nodes)))

    # make sure the master status page says it's the master
    tablet_62044_master_status = tablet_62044.get_status()
    self.assertIn("Serving graph: test_keyspace 0 master",
                  tablet_62044_master_status)

    # make sure the master health stream says it's the master too
    # (health check is disabled on these servers, force it first)
    utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"])
    health = utils.run_vtctl_json(
        ["VtTabletStreamHealth", "-count", "1", tablet_62044.tablet_alias])
    self.assertEqual(health["target"]["tablet_type"], topodata_pb2.MASTER)
    # have to compare the int version, or the rounding errors can break
    self.assertTrue(
        health["tablet_externally_reparented_timestamp"] >= int(base_time))
def test_shard_replication_fix(self):
    """Verifies ShardReplicationFix removes a bogus replication graph entry."""
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')

    # one master one replica
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('replica', 'test_keyspace', '0')

    # make sure the replica is in the replication graph
    before_bogus = utils.run_vtctl_json(
        ['GetShardReplication', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(
        2, len(before_bogus['nodes']),
        'wrong shard replication nodes before: %s' % str(before_bogus))

    # manually add a bogus entry to the replication graph, and check
    # it is removed by ShardReplicationFix
    utils.run_vtctl(
        ['ShardReplicationAdd', 'test_keyspace/0', 'test_nj-0000066666'],
        auto_log=True)
    with_bogus = utils.run_vtctl_json(
        ['GetShardReplication', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(
        3, len(with_bogus['nodes']),
        'wrong shard replication nodes with bogus: %s' % str(with_bogus))
    utils.run_vtctl(
        ['ShardReplicationFix', 'test_nj', 'test_keyspace/0'], auto_log=True)
    after_fix = utils.run_vtctl_json(
        ['GetShardReplication', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(
        2, len(after_fix['nodes']),
        'wrong shard replication nodes after fix: %s' % str(after_fix))
def _test_sanity(self):
    """Smoke-tests a master tablet: queries, pings and basic vtctl actions."""
    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', '-force', 'test_keyspace'])
    utils.run_vtctl(['createshard', '-force', 'test_keyspace/0'])
    tablet_62344.init_tablet('master', 'test_keyspace', '0', parent=False)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'])
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                      'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')

    # if these statements don't run before the tablet it will wedge
    # waiting for the db to become accessible. this is more a bug than
    # a feature.
    tablet_62344.populate('vt_test_keyspace', self._create_vt_select_test,
                          self._populate_vt_select_test)

    tablet_62344.start_vttablet()

    # make sure the query service is started right away
    result, _ = utils.run_vtctl(['Query', 'test_nj', 'test_keyspace',
                                 'select * from vt_select_test'],
                                mode=utils.VTCTL_VTCTL, trap_output=True)
    rows = result.splitlines()
    self.assertEqual(len(rows), 5,
                     "expected 5 rows in vt_select_test: %s %s"
                     % (str(rows), result))

    # make sure direct dba queries work
    query_result = utils.run_vtctl_json(
        ['ExecuteFetch', '-want_fields', tablet_62344.tablet_alias,
         'select * from vt_test_keyspace.vt_select_test'])
    self.assertEqual(len(query_result['Rows']), 4,
                     "expected 4 rows in vt_select_test: %s"
                     % str(query_result))
    self.assertEqual(len(query_result['Fields']), 2,
                     "expected 2 fields in vt_select_test: %s"
                     % str(query_result))

    # check Pings
    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])
    utils.run_vtctl(['RpcPing', tablet_62344.tablet_alias])

    # Quickly check basic actions.
    utils.run_vtctl(['SetReadOnly', tablet_62344.tablet_alias])
    utils.wait_db_read_only(62344)

    utils.run_vtctl(['SetReadWrite', tablet_62344.tablet_alias])
    utils.check_db_read_write(62344)

    utils.run_vtctl(['DemoteMaster', tablet_62344.tablet_alias])
    utils.wait_db_read_only(62344)

    utils.validate_topology()
    utils.run_vtctl(['ValidateKeyspace', 'test_keyspace'])
    # not pinging tablets, as it enables replication checks, and they
    # break because we only have a single master, no slaves
    utils.run_vtctl(['ValidateShard', '-ping-tablets=false',
                     'test_keyspace/0'])
    srv_shard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                      'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')

    tablet_62344.kill_vttablet()

    tablet_62344.init_tablet('idle')
    tablet_62344.scrap(force=True)
def _test_reparent_from_outside_check(self, brutal, base_time):
    """Validates topology, status page and health stream after a reparent."""
    # make sure the shard replication graph is fine
    shard_replication = utils.run_vtctl_json(['GetShardReplication',
                                              'test_nj', 'test_keyspace/0'])
    hashed_nodes = {}
    for node in shard_replication['nodes']:
        alias = node['tablet_alias']
        hashed_nodes[alias['cell'] + '-' + str(alias['uid'])] = True
    logging.debug('Got shard replication nodes: %s', str(hashed_nodes))
    expected_nodes = {
        'test_nj-41983': True,
        'test_nj-62044': True,
    }
    if not brutal:
        expected_nodes['test_nj-62344'] = True
    self.assertEqual(expected_nodes, hashed_nodes,
                     'Got unexpected nodes: %s != %s'
                     % (str(expected_nodes), str(hashed_nodes)))

    # make sure the master status page says it's the master
    tablet_62044_master_status = tablet_62044.get_status()
    self.assertIn('Serving graph: test_keyspace 0 master',
                  tablet_62044_master_status)

    # make sure the master health stream says it's the master too
    # (health check is disabled on these servers, force it first)
    utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias, 'replica'])
    health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                   tablet_62044.tablet_alias])
    self.assertEqual(health['target']['tablet_type'], topodata_pb2.MASTER)
    # have to compare the int version, or the rounding errors can break
    self.assertTrue(
        health['tablet_externally_reparented_timestamp'] >= int(base_time))
def test_vtctl_copyschemashard_different_dbs_should_fail(self):
    """CopySchemaShard must fail when source and dest db charsets differ."""
    # Apply initial schema to the whole keyspace before creating shard 2.
    self._apply_initial_schema()

    _setup_shard_2()
    try:
        # InitShardMaster creates the db, but there shouldn't be any tables yet.
        self._check_tables(shard_2_master, 0)
        self._check_tables(shard_2_replica1, 0)

        # Change the db charset on the destination shard from utf8 to latin1.
        # This will make CopySchemaShard fail during its final diff.
        # (The different charset won't be corrected on the destination shard
        # because we use "CREATE DATABASE IF NOT EXISTS" and this doesn't fail if
        # there are differences in the options e.g. the character set.)
        shard_2_schema = self._get_schema(shard_2_master.tablet_alias)
        self.assertIn('utf8', shard_2_schema['database_schema'])
        utils.run_vtctl_json(
            ['ExecuteFetchAsDba', '-json', shard_2_master.tablet_alias,
             'ALTER DATABASE vt_test_keyspace CHARACTER SET latin1'])

        _, stderr = utils.run_vtctl(
            ['CopySchemaShard', 'test_keyspace/0', 'test_keyspace/2'],
            expect_fail=True, auto_log=True)
        self.assertIn(
            'source and dest don\'t agree on database creation command',
            stderr)

        # shard_2_master should have the same number of tables. Only the db
        # character set is different.
        self._check_tables(shard_2_master, 4)
    finally:
        _teardown_shard_2()
def test_master_restart_sets_ter_timestamp(self):
    """Test that TER timestamp is set when we restart the MASTER vttablet.

    TER = TabletExternallyReparented.
    See StreamHealthResponse.tablet_externally_reparented_timestamp for
    details.
    """
    master, replica = tablet_62344, tablet_62044
    tablets = [master, replica]

    # Start vttablets. Our future master is initially a REPLICA.
    for t in tablets:
        t.create_db('vt_test_keyspace')
    for t in tablets:
        t.start_vttablet(wait_for_state='NOT_SERVING',
                         init_tablet_type='replica',
                         init_keyspace='test_keyspace',
                         init_shard='0')

    # Initialize tablet as MASTER.
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     master.tablet_alias])
    master.wait_for_vttablet_state('SERVING')

    # Capture the current TER.
    health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                   master.tablet_alias])
    self.assertEqual(topodata_pb2.MASTER, health['target']['tablet_type'])
    self.assertIn('tablet_externally_reparented_timestamp', health)
    self.assertGreater(health['tablet_externally_reparented_timestamp'], 0,
                       'TER on MASTER must be set after InitShardMaster')

    # Restart the MASTER vttablet.
    master.kill_vttablet()
    master.start_vttablet(wait_for_state='SERVING',
                          init_tablet_type='replica',
                          init_keyspace='test_keyspace',
                          init_shard='0')

    # Make sure that the TER increased i.e. it was set to the current time.
    health_after_restart = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', master.tablet_alias])
    self.assertEqual(topodata_pb2.MASTER,
                     health_after_restart['target']['tablet_type'])
    self.assertIn('tablet_externally_reparented_timestamp',
                  health_after_restart)
    self.assertGreater(
        health_after_restart['tablet_externally_reparented_timestamp'],
        health['tablet_externally_reparented_timestamp'],
        'When the MASTER vttablet was restarted, the TER timestamp must be set'
        ' to the current time.')

    # Shutdown.
    for t in tablets:
        t.kill_vttablet()
def test_reparent_cross_cell(self, shard_id="0"):
    """Reparents gracefully to a master in another cell and checks topology."""
    utils.run_vtctl(["CreateKeyspace", "test_keyspace"])

    # create the database so vttablets start, as they are serving
    for t in (tablet_62344, tablet_62044, tablet_41983, tablet_31981):
        t.create_db("vt_test_keyspace")

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet("master", "test_keyspace", shard_id, start=True,
                             wait_for_start=False)
    shard = utils.run_vtctl_json(["GetShard", "test_keyspace/" + shard_id])
    self.assertEqual(shard["cells"], ["test_nj"],
                     "wrong list of cell in Shard: %s" % str(shard["cells"]))

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet("replica", "test_keyspace", shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet("replica", "test_keyspace", shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet("replica", "test_keyspace", shard_id, start=True,
                             wait_for_start=False)
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
        t.wait_for_vttablet_state("SERVING")
    shard = utils.run_vtctl_json(["GetShard", "test_keyspace/" + shard_id])
    self.assertEqual(shard["cells"], ["test_nj", "test_ny"],
                     "wrong list of cell in Shard: %s" % str(shard["cells"]))

    # Recompute the shard layout node - until you do that, it might not be
    # valid.
    utils.run_vtctl(["RebuildShardGraph", "test_keyspace/" + shard_id])
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are
    # identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
        t.reset_replication()
    utils.run_vtctl(["InitShardMaster", "test_keyspace/" + shard_id,
                     tablet_62344.tablet_alias], auto_log=True)
    utils.validate_topology(ping_tablets=True)

    self._check_db_addr(shard_id, "master", tablet_62344.port)

    # Verify MasterCell is properly set
    self._check_master_cell("test_nj", shard_id, "test_nj")
    self._check_master_cell("test_ny", shard_id, "test_nj")

    # Perform a graceful reparent operation to another cell.
    utils.pause("test_reparent_cross_cell PlannedReparentShard")
    utils.run_vtctl(["PlannedReparentShard", "test_keyspace/" + shard_id,
                     tablet_31981.tablet_alias], auto_log=True)
    utils.validate_topology()

    self._check_db_addr(shard_id, "master", tablet_31981.port,
                        cell="test_ny")

    # Verify MasterCell is set to new cell.
    self._check_master_cell("test_nj", shard_id, "test_ny")
    self._check_master_cell("test_ny", shard_id, "test_ny")

    tablet.kill_tablets(
        [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
def test_reparent_cross_cell(self, shard_id='0'):
    """Reparents gracefully to a master in another cell and checks topology."""
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    for t in (tablet_62344, tablet_62044, tablet_41983, tablet_31981):
        t.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(shard['cells'], ['test_nj'],
                     'wrong list of cell in Shard: %s' % str(shard['cells']))

    # Create a few slaves for testing reparenting. Won't be healthy
    # as replication is not running.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_62344.wait_for_vttablet_state('SERVING')
    for t in [tablet_62044, tablet_41983, tablet_31981]:
        t.wait_for_vttablet_state('NOT_SERVING')
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(shard['cells'], ['test_nj', 'test_ny'],
                     'wrong list of cell in Shard: %s' % str(shard['cells']))

    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are
    # identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
        t.reset_replication()
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/' + shard_id,
                     tablet_62344.tablet_alias], auto_log=True)
    utils.validate_topology(ping_tablets=True)

    self._check_master_tablet(tablet_62344)

    # Perform a graceful reparent operation to another cell.
    utils.pause('test_reparent_cross_cell PlannedReparentShard')
    utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/' + shard_id,
                     tablet_31981.tablet_alias], auto_log=True)
    utils.validate_topology()

    self._check_master_tablet(tablet_31981)

    tablet.kill_tablets(
        [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
def _check_master_tablet(self, t, port=None):
    """Makes sure the tablet type is master, and its health check agrees."""
    tablet_info = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
    self.assertEqual(tablet_info['type'], topodata_pb2.MASTER)
    if port:
        self.assertEqual(tablet_info['port_map']['vt'], port)

    # make sure the health stream is updated
    stream_health = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', t.tablet_alias])
    self.assertIn('serving', stream_health)
    self.assertEqual(stream_health['target']['tablet_type'],
                     topodata_pb2.MASTER)
def test_health_check(self):
    """Checks a spare promotes to replica and replication lag is reported."""
    utils.run_vtctl('CreateKeyspace test_keyspace')

    # one master, one replica that starts in spare
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('spare', 'test_keyspace', '0')

    for t in (tablet_62344, tablet_62044):
        t.create_db('vt_test_keyspace')

    tablet_62344.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
    tablet_62044.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')

    tablet_62344.wait_for_vttablet_state('SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias])

    # make sure the 'spare' slave goes to 'replica'
    timeout = 10
    while True:
        ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
        if ti['Type'] == "replica":
            logging.info("Slave tablet went to replica, good")
            break
        timeout = utils.wait_step('slave tablet going to replica', timeout)

    # make sure the master is still master
    ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
    self.assertEqual(ti['Type'], 'master',
                     "unexpected master type: %s" % ti['Type'])

    # stop replication on the slave, see it trigger the slave going
    # slightly unhealthy
    tablet_62044.mquery('', 'stop slave')
    timeout = 10
    while True:
        ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
        health = ti.get('Health') or {}
        if health.get('replication_lag') == 'high':
            logging.info("Slave tablet replication_lag went to high, good")
            break
        timeout = utils.wait_step('slave has high replication lag', timeout)

    # make sure the serving graph was updated
    ep = utils.run_vtctl_json(['GetEndPoints', 'test_nj', 'test_keyspace/0',
                               'replica'])
    if not ep['entries'][0]['health']:
        self.fail('Replication lag parameter not propagated to serving graph: %s'
                  % str(ep))
    self.assertEqual(
        ep['entries'][0]['health']['replication_lag'], 'high',
        'Replication lag parameter not propagated to serving graph: %s'
        % str(ep))

    tablet.kill_tablets([tablet_62344, tablet_62044])
def test_scrap(self):
    """Scraps a replica and checks the serving graph stays consistent."""
    # Start up a master mysql and vttablet
    utils.run_vtctl(["CreateKeyspace", "test_keyspace"])

    tablet_62344.init_tablet("master", "test_keyspace", "0")
    tablet_62044.init_tablet("replica", "test_keyspace", "0")
    utils.run_vtctl(["RebuildShardGraph", "test_keyspace/*"])
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(["GetSrvShard", "test_nj",
                                      "test_keyspace/0"])
    self.assertEqual(srv_shard["MasterCell"], "test_nj")

    tablet_62044.scrap(force=True)
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(["GetSrvShard", "test_nj",
                                      "test_keyspace/0"])
    self.assertEqual(srv_shard["MasterCell"], "test_nj")
def test_scrap(self):
    """Scraps a replica and checks the serving graph stays consistent."""
    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('replica', 'test_keyspace', '0')
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/*'])
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                      'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')

    tablet_62044.scrap(force=True)
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                      'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')
def test_health_check_uid_collision(self):
    """Checks an old tablet can't clobber a record claimed by a newer one.

    If two tablets are running with the same UID, we should prevent the
    healthcheck on the older one from modifying the tablet record after the
    record has been claimed by a newer instance.
    """
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    for t in tablet_62344, tablet_62044:
        t.create_db('vt_test_keyspace')

    # Before starting tablets, simulate another tablet
    # owning the replica's record.
    utils.run_vtctl(['InitTablet', '-allow_update', '-hostname', 'localhost',
                     '-keyspace', 'test_keyspace', '-shard', '0', '-port',
                     '0', '-parent', tablet_62044.tablet_alias, 'replica'])

    # Set up tablets.
    tablet_62344.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
    tablet_62044.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica',
                                init_keyspace='test_keyspace',
                                init_shard='0')
    tablet_62344.wait_for_vttablet_state('SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                     tablet_62344.tablet_alias])
    tablet_62044.wait_for_vttablet_state('SERVING')

    # Check that the tablet owns the record.
    tablet_record = utils.run_vtctl_json(['GetTablet',
                                          tablet_62044.tablet_alias])
    # FIX: use assertEqual; assertEquals is a deprecated unittest alias.
    self.assertEqual(tablet_record['port_map']['vt'], tablet_62044.port,
                     "tablet didn't take over the record")

    # Take away ownership again.
    utils.run_vtctl(['InitTablet', '-allow_update', '-hostname', 'localhost',
                     '-keyspace', 'test_keyspace', '-shard', '0', '-port',
                     '0', '-parent', tablet_62044.tablet_alias, 'replica'])

    # Tell the tablets to shutdown gracefully,
    # which normally includes going SPARE.
    tablet.kill_tablets([tablet_62344, tablet_62044])

    # Make sure the tablet record hasn't been touched.
    tablet_record = utils.run_vtctl_json(['GetTablet',
                                          tablet_62044.tablet_alias])
    self.assertEqual(tablet_record['type'],
                     tablet_62044.tablet_type_value['REPLICA'],
                     'tablet changed record without owning it')
def test_charset(self):
    """Checks latin1 data survives filtered replication without corruption."""
    start_position = mysql_flavor().master_position(dst_replica)
    logging.debug('test_charset: starting @ %s', start_position)

    # Insert something that will replicate incorrectly if the charset is not
    # propagated through binlog streamer to the destination.
    #
    # Vitess tablets default to using utf8, so we insert something crazy and
    # pretend it's latin1. If the binlog player doesn't also pretend it's
    # latin1, it will be inserted as utf8, which will change its value.
    src_master.mquery(
        'vt_test_keyspace',
        "INSERT INTO test_table (id, keyspace_id, msg) "
        "VALUES (41523, 1, 'Šṛ́rỏé') /* vtgate:: keyspace_id:00000001 */",
        conn_params={'charset': 'latin1'}, write=True)

    # Wait for it to replicate.
    event = utils.run_vtctl_json(['VtTabletUpdateStream',
                                  '-position', start_position,
                                  '-count', '1',
                                  dst_replica.tablet_alias])
    self.assertIn('event_token', event)
    self.assertIn('timestamp', event['event_token'])

    # Check the value.
    data = dst_master.mquery(
        'vt_test_keyspace',
        'SELECT id, keyspace_id, msg FROM test_table WHERE id=41523 LIMIT 1')
    self.assertEqual(len(data), 1, 'No data replicated.')
    self.assertEqual(len(data[0]), 3, 'Wrong number of columns.')
    self.assertEqual(data[0][2], 'Šṛ́rỏé',
                     'Data corrupted due to wrong charset.')
def check_stream_health_equals_binlog_player_vars(self, tablet_obj, count):
    """Checks the variables exported by streaming health check match vars.

    Args:
      tablet_obj: the tablet to check.
      count: number of binlog players to expect.
    """
    blp_stats = utils.get_vars(tablet_obj.port)
    self.assertEqual(blp_stats['BinlogPlayerMapSize'], count)

    # Enforce health check because it's not running by default as
    # tablets may not be started with it, or may not run it in time.
    utils.run_vtctl(['RunHealthCheck', tablet_obj.tablet_alias])
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count',
                                          '1', tablet_obj.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertNotIn('serving', stream_health)
    self.assertIn('realtime_stats', stream_health)
    realtime_stats = stream_health['realtime_stats']
    self.assertNotIn('health_error', realtime_stats)
    self.assertIn('binlog_players_count', realtime_stats)
    self.assertEqual(blp_stats['BinlogPlayerMapSize'],
                     realtime_stats['binlog_players_count'])
    self.assertEqual(
        blp_stats['BinlogPlayerSecondsBehindMaster'],
        realtime_stats.get('seconds_behind_master_filtered_replication', 0))
def _test_reparent_from_outside_check(self, brutal):
    """Checks replication graph and status page (zookeeper topology only)."""
    if environment.topo_server().flavor() != 'zookeeper':
        return

    # make sure the shard replication graph is fine
    shard_replication = utils.run_vtctl_json(['GetShardReplication',
                                              'test_nj', 'test_keyspace/0'])
    hashed_nodes = {}
    for node in shard_replication['nodes']:
        alias = node['tablet_alias']
        hashed_nodes[alias['cell'] + '-' + str(alias['uid'])] = True
    logging.debug('Got shard replication nodes: %s', str(hashed_nodes))
    expected_nodes = {
        'test_nj-41983': True,
        'test_nj-62044': True,
    }
    if not brutal:
        expected_nodes['test_nj-62344'] = True
    self.assertEqual(expected_nodes, hashed_nodes,
                     'Got unexpected nodes: %s != %s'
                     % (str(expected_nodes), str(hashed_nodes)))

    tablet_62044_master_status = tablet_62044.get_status()
    self.assertIn('Serving graph: test_keyspace 0 master',
                  tablet_62044_master_status)
def execute(self, sql, bindvars=None, transaction_id=None,
            execute_options=None, auto_log=True):
    """execute uses 'vtctl VtTabletExecute' to execute a command.

    Args:
      sql: the command to execute.
      bindvars: a dict of bind variables.
      transaction_id: the id of the transaction to use if necessary.
      execute_options: proto-encoded ExecuteOptions object.
      auto_log: passed to run_vtctl.

    Returns:
      the result of running vtctl command.
    """
    args = ['VtTabletExecute', '-json']
    if bindvars:
        args += ['-bind_variables', json.dumps(bindvars)]
    if transaction_id:
        args += ['-transaction_id', str(transaction_id)]
    if execute_options:
        args += ['-options', execute_options]
    args += [self.tablet_alias, sql]
    return utils.run_vtctl_json(args, auto_log=auto_log)
def _verify_vtctl_set_shard_tablet_control(self):
    """Test that manually editing the blacklisted tables works correctly.

    TODO(mberlin): This is more an integration test and should be moved to the
    Go codebase eventually.
    """
    # check 'vtctl SetShardTabletControl' command works as expected:
    # clear the rdonly entry:
    utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                     'rdonly'], auto_log=True)
    self._assert_tablet_controls([topodata_pb2.MASTER, topodata_pb2.REPLICA])

    # re-add rdonly:
    utils.run_vtctl(['SetShardTabletControl', '--tables=moving.*,view1',
                     'source_keyspace/0', 'rdonly'], auto_log=True)
    self._assert_tablet_controls([topodata_pb2.MASTER, topodata_pb2.REPLICA,
                                  topodata_pb2.RDONLY])

    # and then clear all entries:
    for tablet_type in ('rdonly', 'replica', 'master'):
        utils.run_vtctl(['SetShardTabletControl', '--remove',
                         'source_keyspace/0', tablet_type], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertNotIn('tablet_controls', shard_json)
def _check_srv_keyspace(self, expected):
    """Compares the ServedFrom entries of the SrvKeyspace to `expected`."""
    cell = 'test_nj'
    keyspace = 'destination_keyspace'
    ks = utils.run_vtctl_json(['GetSrvKeyspace', cell, keyspace])
    result = ''
    if 'served_from' in ks and ks['served_from']:
        lines = []
        for served_from in sorted(ks['served_from']):
            tt = topodata_pb2.TabletType.Name(
                served_from['tablet_type']).lower()
            if tt == 'batch':
                tt = 'rdonly'
            lines.append('ServedFrom(%s): %s\n'
                         % (tt, served_from['keyspace']))
        result = ''.join(sorted(lines))
    logging.debug('Cell %s keyspace %s has data:\n%s', cell, keyspace, result)
    self.assertEqual(
        expected, result,
        'Mismatch in srv keyspace for cell %s keyspace %s, expected:\n'
        '%s\ngot:\n%s' % (cell, keyspace, expected, result))
    self.assertNotIn('sharding_column_name', ks,
                     'Got a sharding_column_name in SrvKeyspace: %s' % str(ks))
    self.assertNotIn('sharding_column_type', ks,
                     'Got a sharding_column_type in SrvKeyspace: %s' % str(ks))
def _check_blacklisted_tables(self, tablet, expected):
    """Asserts the tablet record's BlacklistedTables match `expected`.

    Args:
      tablet: the tablet fixture whose record to inspect.
      expected: the expected BlacklistedTables value.
    """
    ti = utils.run_vtctl_json(['GetTablet', tablet.tablet_alias])
    # FIX: corrected the "balcklisted" typo in the debug log message.
    logging.debug("Tablet %s has blacklisted tables: %s", tablet.tablet_alias,
                  ti['BlacklistedTables'])
    self.assertEqual(ti['BlacklistedTables'], expected,
                     "Got unexpected BlacklistedTables: %s (expecting %s)" % (
                         ti['BlacklistedTables'], expected))
def test_no_mysql_healthcheck(self):
    """This test starts a vttablet with no mysql port, while mysql is down.

    It makes sure vttablet will start properly and be unhealthy.
    Then we start mysql, and make sure vttablet becomes healthy.
    """
    # we need replication to be enabled, so the slave tablet can be healthy.
    for t in (tablet_62344, tablet_62044):
        t.create_db("vt_test_keyspace")
    pos = mysql_flavor().master_position(tablet_62344)
    change_master_cmds = mysql_flavor().change_master_commands(
        utils.hostname, tablet_62344.mysql_port, pos)
    tablet_62044.mquery("",
                        ["RESET MASTER", "RESET SLAVE"] +
                        change_master_cmds +
                        ["START SLAVE"])

    # now shutdown all mysqld
    shutdown_procs = [tablet_62344.shutdown_mysql(),
                      tablet_62044.shutdown_mysql()]
    utils.wait_procs(shutdown_procs)

    # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
    tablet_62344.init_tablet("master", "test_keyspace", "0")
    tablet_62044.init_tablet("spare", "test_keyspace", "0",
                             include_mysql_port=False)
    for t in (tablet_62344, tablet_62044):
        t.start_vttablet(wait_for_state=None,
                         target_tablet_type="replica",
                         full_mycnf_args=True,
                         include_mysql_port=False)
    for t in (tablet_62344, tablet_62044):
        t.wait_for_vttablet_state("NOT_SERVING")
        self.check_healthz(t, False)

    # restart mysqld
    start_procs = [tablet_62344.start_mysql(), tablet_62044.start_mysql()]
    utils.wait_procs(start_procs)

    # the master should still be healthy
    utils.run_vtctl(["RunHealthCheck", tablet_62344.tablet_alias, "replica"],
                    auto_log=True)
    self.check_healthz(tablet_62344, True)

    # the slave won't be healthy at first, as replication is not running
    utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"],
                    auto_log=True)
    self.check_healthz(tablet_62044, False)
    tablet_62044.wait_for_vttablet_state("NOT_SERVING")

    # restart replication
    tablet_62044.mquery("", ["START SLAVE"])

    # wait for the tablet to become healthy and fix its mysql port
    utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"],
                    auto_log=True)
    tablet_62044.wait_for_vttablet_state("SERVING")
    self.check_healthz(tablet_62044, True)

    for t in (tablet_62344, tablet_62044):
        # wait for mysql port to show up
        timeout = 10
        while True:
            ti = utils.run_vtctl_json(["GetTablet", t.tablet_alias])
            if "mysql" in ti["Portmap"]:
                break
            timeout = utils.wait_step("mysql port in tablet record", timeout)
        self.assertEqual(ti["Portmap"]["mysql"], t.mysql_port)

    # all done
    tablet.kill_tablets([tablet_62344, tablet_62044])
def test_vtaction_dies_hard(self):
    """SIGKILLs a running vtaction and checks the next action still works."""
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as it is serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True)

    # start a 'vtctl Sleep' command, don't wait for it
    action_path, _ = utils.run_vtctl(['-no-wait', 'Sleep',
                                      tablet_62344.tablet_alias, '60s'],
                                     trap_output=True)
    action_path = action_path.strip()

    # wait for the action to be 'Running', capture its pid
    timeout = 10
    while True:
        an = utils.run_vtctl_json(['ReadTabletAction', action_path])
        if an.get('State', None) == 'Running':
            pid = an['Pid']
            logging.info("Action is running with pid %u, good", pid)
            break
        timeout = utils.wait_step('sleep action to run', timeout)

    # let's kill it hard, wait until it's gone for good
    os.kill(pid, signal.SIGKILL)
    try:
        os.waitpid(pid, 0)
    except OSError:
        # this means the process doesn't exist any more, we're good
        pass

    # Then let's make sure the next action cleans up properly and can execute.
    # If that doesn't work, this will time out and the test will fail.
    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

    tablet_62344.kill_vttablet()
def test_sigterm(self):
    """SIGTERMs a running vtaction and checks the remote error is reported."""
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as it is serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True)

    # start a 'vtctl Sleep' command, don't wait for it
    action_path, _ = utils.run_vtctl(['-no-wait', 'Sleep',
                                      tablet_62344.tablet_alias, '60s'],
                                     trap_output=True)
    action_path = action_path.strip()

    # wait for the action to be 'Running', capture its pid
    timeout = 10
    while True:
        an = utils.run_vtctl_json(['ReadTabletAction', action_path])
        if an.get('State', None) == 'Running':
            pid = an['Pid']
            logging.info("Action is running with pid %u, good", pid)
            break
        timeout = utils.wait_step('sleep action to run', timeout)

    # let's kill the vtaction process with a regular SIGTERM
    os.kill(pid, signal.SIGTERM)

    # check the vtctl command got the right remote error back
    out, err = utils.run_vtctl(['WaitForAction', action_path],
                               trap_output=True, raise_on_error=False)
    if "vtaction interrupted by signal" not in err:
        self.fail("cannot find expected output in error: " + err)
    logging.debug("vtaction was interrupted correctly:\n" + err)

    tablet_62344.kill_vttablet()
def test_actions_and_timeouts(self):
  """Run a long 'Sleep' action and verify a concurrent RpcPing times out.

  When running against the zookeeper topo server, also spot-checks the
  exported vars: zk connection states, time spent connected, TabletType.

  Bug fixed: self.fail() takes a single message argument; the original
  passed the value as a second positional argument, which raises
  TypeError instead of failing with a useful message.
  """
  # Start up a master mysql and vttablet
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
  utils.validate_topology()
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                   'test_keyspace/0'])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62344.start_vttablet()
  utils.run_vtctl(['RpcPing', tablet_62344.tablet_alias])

  # schedule long action in the background, sleep a little bit to make sure
  # it started to run
  args = (environment.binary_args('vtctl') +
          environment.topo_server_flags() +
          environment.tablet_manager_protocol_flags() +
          environment.tabletconn_protocol_flags() +
          ['-log_dir', environment.vtlogroot,
           'Sleep', tablet_62344.tablet_alias, '10s'])
  bg = utils.run_bg(args)
  time.sleep(3)

  # try a frontend RpcPing that should timeout as the tablet is busy
  # running the other one
  stdout, stderr = utils.run_vtctl(['-wait-time', '3s', 'RpcPing',
                                    tablet_62344.tablet_alias],
                                   expect_fail=True)
  if 'Timeout waiting for' not in stderr:
    self.fail("didn't find the right error strings in failed RpcPing: " +
              stderr)

  # wait for the background vtctl
  bg.wait()

  if environment.topo_server_implementation == 'zookeeper':
    # extra small test: we ran for a while, get the states we were in,
    # make sure they're accounted for properly
    # first the query engine States
    v = utils.get_vars(tablet_62344.port)
    # lazy %-args: only formatted if debug logging is enabled
    logging.debug("vars: %s", str(v))

    # then the Zookeeper connections
    if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
      self.fail('invalid zk test_nj state: %s' %
                v['ZkMetaConn']['test_nj']['Current'])
    if v['ZkMetaConn']['global']['Current'] != 'Connected':
      self.fail('invalid zk global state: %s' %
                v['ZkMetaConn']['global']['Current'])
    if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
      # bug fix: format the message with %, don't pass a second arg to fail()
      self.fail('not enough time in Connected state: %u' %
                v['ZkMetaConn']['test_nj']['DurationConnected'])
    if v['TabletType'] != 'master':
      self.fail('TabletType not exported correctly')

  tablet_62344.kill_vttablet()
def test_restart_during_action(self):
  """Restart the tablet agent while a vtaction is still running.

  Kills and restarts vttablet with a 'Sleep' action in flight, then
  verifies a short WaitForAction fails and a longer one eventually
  succeeds (so the zombie action cannot clobber later commands).

  Bug fixed: self.fail() takes a single message argument; the original
  passed the value as a second positional argument, which raises
  TypeError instead of failing with a useful message.
  """
  # Start up a master mysql and vttablet
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
  utils.validate_topology()
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                   'test_keyspace/0'])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62344.start_vttablet()
  utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

  # schedule long action
  utils.run_vtctl(['-no-wait', 'Sleep', tablet_62344.tablet_alias, '15s'],
                  stdout=utils.devnull)
  # ping blocks until the sleep finishes unless we have a schedule race
  action_path, _ = utils.run_vtctl(['-no-wait', 'Ping',
                                    tablet_62344.tablet_alias],
                                   trap_output=True)
  action_path = action_path.strip()

  # kill agent leaving vtaction running
  tablet_62344.kill_vttablet()

  # restart agent
  tablet_62344.start_vttablet()

  # we expect this action with a short wait time to fail. this isn't the best
  # and has some potential for flakiness.
  utils.run_vtctl(['-wait-time', '2s', 'WaitForAction', action_path],
                  expect_fail=True)

  # wait until the background sleep action is done, otherwise there will be
  # a leftover vtaction whose result may overwrite running actions
  # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
  # the zombie Sleep clobbers the Clone command in the following tests
  utils.run_vtctl(['-wait-time', '20s', 'WaitForAction', action_path],
                  auto_log=True)

  if environment.topo_server_implementation == 'zookeeper':
    # extra small test: we ran for a while, get the states we were in,
    # make sure they're accounted for properly
    # first the query engine States
    v = utils.get_vars(tablet_62344.port)
    # lazy %-args: only formatted if debug logging is enabled
    logging.debug("vars: %s", str(v))

    # then the Zookeeper connections
    if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
      self.fail('invalid zk test_nj state: %s' %
                v['ZkMetaConn']['test_nj']['Current'])
    if v['ZkMetaConn']['global']['Current'] != 'Connected':
      self.fail('invalid zk global state: %s' %
                v['ZkMetaConn']['global']['Current'])
    if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
      # bug fix: format the message with %, don't pass a second arg to fail()
      self.fail('not enough time in Connected state: %u' %
                v['ZkMetaConn']['test_nj']['DurationConnected'])
    if v['TabletType'] != 'master':
      self.fail('TabletType not exported correctly')

  tablet_62344.kill_vttablet()
def _check_db_addr(self, shard, db_type, expected_port, cell="test_nj"):
  """Check the serving graph has exactly one endpoint for this shard/type.

  The endpoint must expose the expected '_vtocc' port and live on a host
  whose name starts with this machine's hostname.
  """
  end_points = utils.run_vtctl_json(
      ["GetEndPoints", cell, "test_keyspace/" + shard, db_type])
  self.assertEqual(len(end_points["entries"]), 1,
                   "Wrong number of entries: %s" % str(end_points))
  entry = end_points["entries"][0]
  port = entry["named_port_map"]["_vtocc"]
  self.assertEqual(port, expected_port,
                   "Unexpected port: %u != %u from %s" %
                   (port, expected_port, str(end_points)))
  host = entry["host"]
  if not host.startswith(utils.hostname):
    self.fail("Invalid hostname %s was expecting something starting with %s" %
              (host, utils.hostname))
def wait_for_tablet_type_change(self, tablet_alias, expected_type):
  """Poll GetTablet until the tablet reports the expected type.

  Args:
    tablet_alias: alias string of the tablet to watch.
    expected_type: tablet type string to wait for (e.g. 'replica').

  utils.wait_step fails the test if the type never changes in time.
  """
  timeout = 10
  while True:
    ti = utils.run_vtctl_json(['GetTablet', tablet_alias])
    if ti['Type'] == expected_type:
      # fix: lazy %-args instead of eager string interpolation, so the
      # message is only formatted when debug logging is enabled
      logging.debug("Slave tablet went to %s, good", expected_type)
      break
    timeout = utils.wait_step('slave becomes ' + expected_type, timeout)
def _check_db_addr(self, shard, db_type, expected_port):
  """Assert the test_nj serving graph exposes a single endpoint for the
  given shard and tablet type, on the expected port and local host."""
  endpoints = utils.run_vtctl_json(
      ['GetEndPoints', 'test_nj', 'test_keyspace/' + shard, db_type])
  entries = endpoints['entries']
  self.assertEqual(len(entries), 1,
                   'Wrong number of entries: %s' % str(endpoints))
  actual_port = entries[0]['named_port_map']['_vtocc']
  self.assertEqual(actual_port, expected_port,
                   'Unexpected port: %u != %u from %s' %
                   (actual_port, expected_port, str(endpoints)))
  endpoint_host = entries[0]['host']
  if not endpoint_host.startswith(utils.hostname):
    self.fail('Invalid hostname %s was expecting something starting with %s' %
              (endpoint_host, utils.hostname))
def _test_sanity(self):
  """Basic single-master sanity pass.

  Brings up one master tablet, checks the query service answers right
  away, runs direct DBA queries, then exercises Ping/RefreshState,
  read-only/read-write toggles, DemoteMaster, and validation commands.
  """
  # Start up a master mysql and vttablet
  utils.run_vtctl(['CreateKeyspace', '-force', 'test_keyspace'])
  utils.run_vtctl(['createshard', '-force', 'test_keyspace/0'])
  tablet_62344.init_tablet('master', 'test_keyspace', '0', parent=False)
  utils.run_vtctl(
      ['RebuildKeyspaceGraph', '-rebuild_srv_shards', 'test_keyspace'])
  utils.validate_topology()
  self._check_srv_shard()

  # if these statements don't run before the tablet it will wedge
  # waiting for the db to become accessible. this is more a bug than
  # a feature.
  tablet_62344.populate('vt_test_keyspace', self._create_vt_select_test,
                        self._populate_vt_select_test)

  tablet_62344.start_vttablet()

  # make sure the query service is started right away
  qr = tablet_62344.execute('select * from vt_select_test')
  self.assertEqual(len(qr['rows']), 4,
                   'expected 4 rows in vt_select_test: %s' % str(qr))

  # make sure direct dba queries work
  query_result = utils.run_vtctl_json(
      ['ExecuteFetchAsDba', '-json', tablet_62344.tablet_alias,
       'select * from vt_test_keyspace.vt_select_test'])
  self.assertEqual(
      len(query_result['rows']), 4,
      'expected 4 rows in vt_select_test: %s' % str(query_result))
  self.assertEqual(
      len(query_result['fields']), 2,
      'expected 2 fields in vt_select_test: %s' % str(query_result))

  # check Ping / RefreshState
  utils.run_vtctl(['Ping', tablet_62344.tablet_alias])
  utils.run_vtctl(['RefreshState', tablet_62344.tablet_alias])

  # Quickly check basic actions.
  utils.run_vtctl(['SetReadOnly', tablet_62344.tablet_alias])
  utils.wait_db_read_only(62344)

  utils.run_vtctl(['SetReadWrite', tablet_62344.tablet_alias])
  utils.check_db_read_write(62344)

  utils.run_vtctl(['DemoteMaster', tablet_62344.tablet_alias])
  utils.wait_db_read_only(62344)

  utils.validate_topology()
  utils.run_vtctl(['ValidateKeyspace', 'test_keyspace'])
  # not pinging tablets, as it enables replication checks, and they
  # break because we only have a single master, no slaves
  utils.run_vtctl(['ValidateShard', '-ping-tablets=false',
                   'test_keyspace/0'])
  self._check_srv_shard()

  tablet_62344.kill_vttablet()
def test_resharding(self):
  """End-to-end horizontal resharding of shard 80- into 80-c0 and c0-.

  Phases: set up source shards, snapshot/restore into the split shards,
  run filtered replication and verify data flow, migrate serving types
  (rdonly, replica, then master), and finally scrap/delete the original
  shard.

  Idiom fix: 'not X in Y' rewritten as 'X not in Y'; no behavior change.
  """
  utils.run_vtctl([
      'CreateKeyspace', '--sharding_column_name', 'bad_column',
      '--sharding_column_type', 'bytes', 'test_keyspace'
  ])
  utils.run_vtctl([
      'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'
  ], expect_fail=True)
  utils.run_vtctl([
      'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id',
      keyspace_id_type
  ])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_ny_slave.init_tablet('spare', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
  shard_1_ny_slave.init_tablet('spare', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

  # create databases so vttablet can start behaving normally
  for t in [
      shard_0_master, shard_0_replica, shard_0_ny_slave, shard_1_master,
      shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly
  ]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

  # wait for the tablets
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('SERVING')
  shard_0_ny_slave.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_ny_slave.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl([
      'ReparentShard', '-force', 'test_keyspace/-80',
      shard_0_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'ReparentShard', '-force', 'test_keyspace/80-',
      shard_1_master.tablet_alias
  ], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()
  self._test_keyrange_constraints()

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
  shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0')
  shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
  shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-')
  shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'c0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  shard_3_master.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
  for t in [
      shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica,
      shard_3_rdonly
  ]:
    t.start_vttablet(wait_for_state=None)
  for t in [
      shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master,
      shard_3_replica, shard_3_rdonly
  ]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl([
      'ReparentShard', '-force', 'test_keyspace/80-c0',
      shard_2_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'ReparentShard', '-force', 'test_keyspace/c0-',
      shard_3_master.tablet_alias
  ], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # take the snapshot for the split
  utils.run_vtctl([
      'MultiSnapshot', '--spec=80-c0-', '--exclude_tables=unrelated',
      shard_1_slave1.tablet_alias
  ], auto_log=True)

  # the snapshot_copy hook will copy the snapshot files to
  # VTDATAROOT/tmp/... as a test. We want to use these for one half,
  # but not for the other, so we test both scenarios.
  os.unlink(
      os.path.join(
          environment.tmproot, "snapshot-from-%s-for-%s.tar" %
          (shard_1_slave1.tablet_alias, "80-c0")))

  # wait for tablet's binlog server service to be enabled after snapshot
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # perform the restores: first one from source tablet. We removed the
  # storage backup, so it's coming from the tablet itself.
  # we also delay starting the binlog player, then enable it.
  utils.run_vtctl([
      'ShardMultiRestore',
      '-strategy=populateBlpCheckpoint,dontStartBinlogPlayer',
      'test_keyspace/80-c0', shard_1_slave1.tablet_alias
  ], auto_log=True)

  timeout = 10
  while True:
    shard_2_master_status = shard_2_master.get_status()
    # idiom fix: 'X not in Y' instead of 'not X in Y'
    if ("not starting because flag 'DontStart' is set"
        not in shard_2_master_status):
      timeout = utils.wait_step(
          'shard 2 master has not failed starting yet', timeout)
      continue
    logging.debug("shard 2 master is waiting on flag removal, good")
    break

  qr = utils.run_vtctl_json([
      'ExecuteFetch', shard_2_master.tablet_alias,
      'update _vt.blp_checkpoint set flags="" where source_shard_uid=0'
  ])
  self.assertEqual(qr['RowsAffected'], 1)

  timeout = 10
  while True:
    shard_2_master_status = shard_2_master.get_status()
    if "not starting because flag 'DontStart' is set" in shard_2_master_status:
      timeout = utils.wait_step(
          'shard 2 master has not started replication yet', timeout)
      continue
    logging.debug("shard 2 master has started replication, good")
    break

  # second restore from storage: to be sure, we stop vttablet, and restart
  # it afterwards
  shard_1_slave1.kill_vttablet()
  utils.run_vtctl([
      'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
      'test_keyspace/c0-', shard_1_slave1.tablet_alias
  ], auto_log=True)
  shard_1_slave1.start_vttablet(wait_for_state=None)
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl([
      'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace'
  ], auto_log=True)

  # check the binlog players are running and exporting vars
  shard_2_master.wait_for_binlog_player_count(1)
  shard_3_master.wait_for_binlog_player_count(1)
  self._check_binlog_player_vars(shard_2_master)
  self._check_binlog_player_vars(shard_3_master)

  # check that binlog server exported the stats vars
  self._check_binlog_server_vars(shard_1_slave1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)
  self._check_binlog_player_vars(shard_2_master,
                                 seconds_behind_master_max=30)
  self._check_binlog_player_vars(shard_3_master,
                                 seconds_behind_master_max=30)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(
      ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # get status for a destination master tablet, make sure we have it all
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('Binlog player state: Running', shard_2_master_status)
  self.assertIn(
      '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>',
      shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
  monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

  # tests a failover switching serving to a different replica
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

  # test data goes through again
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000, base=1000)
  logging.debug("Checking 80 percent of data was sent quickly")
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even health check that is enabled on
  # master 3 should not interfere.
  shard_2_master_vars = utils.get_vars(shard_2_master.port)
  self.assertEqual(shard_2_master_vars['TabletStateName'], 'NOT_SERVING')
  shard_3_master_vars = utils.get_vars(shard_3_master.port)
  self.assertEqual(shard_3_master_vars['TabletStateName'], 'NOT_SERVING')

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-c0 c0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
      auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-c0 c0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl([
      'ReparentShard', 'test_keyspace/80-c0', shard_2_replica1.tablet_alias
  ])
  logging.debug(
      "Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use the vtworker checker to compare the data again
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(
      ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name, monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name, monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-c0 c0-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-c0 c0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # get status for a destination master tablet, make sure it's good
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('No binlog player is running', shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # scrap the original tablets in the original shard
  for t in [
      shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave,
      shard_1_rdonly
  ]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([
      shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave,
      shard_1_rdonly
  ])
  for t in [
      shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave,
      shard_1_rdonly
  ]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards shoud be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  if shard['Cells']:
    self.fail("Non-empty Cells record for shard: %s" % str(shard))

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([
      shard_0_master, shard_0_replica, shard_0_ny_slave, shard_2_master,
      shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica,
      shard_3_rdonly
  ])
def _test_reparent_graceful(self, shard_id):
  """Graceful reparent flow for one shard.

  Sets up a master with three replicas, forces an initial reparent to
  the master, gracefully reparents to a replica, verifies serving-graph
  addresses and MasterCell after each step, then checks that a restarted
  tablet re-registers its (new) address.
  """
  utils.run_vtctl('CreateKeyspace test_keyspace')

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True)
  if environment.topo_server_implementation == 'zookeeper':
    shard = utils.run_vtctl_json(
        ['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(
        shard['Cells'], ['test_nj'],
        'wrong list of cell in Shard: %s' % str(shard['Cells']))

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id,
                           start=True, wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id,
                           start=True, wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id,
                           start=True, wait_for_start=False)
  for t in [tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state("SERVING")
  if environment.topo_server_implementation == 'zookeeper':
    shard = utils.run_vtctl_json(
        ['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(
        shard['Cells'], ['test_nj', 'test_ny'],
        'wrong list of cell in Shard: %s' % str(shard['Cells']))

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl('RebuildShardGraph test_keyspace/' + shard_id)
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.pause("force ReparentShard?")
  utils.run_vtctl('ReparentShard -force test_keyspace/%s %s' %
                  (shard_id, tablet_62344.tablet_alias))
  utils.validate_topology(ping_tablets=True)

  self._check_db_addr(shard_id, 'master', tablet_62344.port)

  # Verify MasterCell is set to new cell.
  srvShard = utils.run_vtctl_json(
      ['GetSrvShard', 'test_nj', 'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  srvShard = utils.run_vtctl_json(
      ['GetSrvShard', 'test_ny', 'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')

  # Convert two replica to spare. That should leave only one node serving
  # traffic, but still needs to appear in the replication graph.
  utils.run_vtctl(
      ['ChangeSlaveType', tablet_41983.tablet_alias, 'spare'])
  utils.run_vtctl(
      ['ChangeSlaveType', tablet_31981.tablet_alias, 'spare'])
  utils.validate_topology()
  self._check_db_addr(shard_id, 'replica', tablet_62044.port)

  # Run this to make sure it succeeds.
  utils.run_vtctl('ShardReplicationPositions test_keyspace/%s' % shard_id,
                  stdout=utils.devnull)

  # Perform a graceful reparent operation.
  utils.pause("graceful ReparentShard?")
  utils.run_vtctl('ReparentShard test_keyspace/%s %s' %
                  (shard_id, tablet_62044.tablet_alias), auto_log=True)
  utils.validate_topology()

  self._check_db_addr(shard_id, 'master', tablet_62044.port)

  # Verify MasterCell is set to new cell.
  srvShard = utils.run_vtctl_json(
      ['GetSrvShard', 'test_nj', 'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  srvShard = utils.run_vtctl_json(
      ['GetSrvShard', 'test_ny', 'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')

  tablet.kill_tablets(
      [tablet_62344, tablet_62044, tablet_41983, tablet_31981])

  # Test address correction.
  new_port = environment.reserve_ports(1)
  tablet_62044.start_vttablet(port=new_port)

  # Wait a moment for address to reregister.
  time.sleep(1.0)

  self._check_db_addr(shard_id, 'master', new_port)
  tablet_62044.kill_vttablet()
def test_get_keyspace(self):
  """GetKeyspace on the unsharded keyspace reports the expected
  sharding column name and type."""
  keyspace_info = utils.run_vtctl_json(
      ['GetKeyspace', UNSHARDED_KEYSPACE])
  column_name = keyspace_info['ShardingColumnName']
  column_type = keyspace_info['ShardingColumnType']
  self.assertEqual('keyspace_id', column_name)
  self.assertEqual('uint64', column_type)
def test_health_check(self):
  """Health-check driven type transitions: replica <-> spare.

  Starts a master and a health-checked replica, stops/starts replication
  on the replica, and verifies tablet type, healthz, serving graph,
  status page, exported vars and the VtTabletStreamHealth stream track
  the health state. Finally checks lameduck resets the type to spare.

  Bug fixed: the serving-graph poll used a bare 'except:', which also
  swallows KeyboardInterrupt/SystemExit; narrowed to 'except Exception'.
  """
  # one master, one replica that starts in spare
  # (for the replica, we let vttablet do the InitTablet)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')

  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  tablet_62344.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')
  tablet_62044.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica',
                              lameduck_period='5s',
                              init_keyspace='test_keyspace',
                              init_shard='0')

  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(tablet_62044, False)

  utils.run_vtctl(
      ['InitShardMaster', 'test_keyspace/0', tablet_62344.tablet_alias])

  # make sure the 'spare' slave goes to 'replica'
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'replica')
  self.check_healthz(tablet_62044, True)

  # make sure the master is still master
  ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
  self.assertEqual(ti['type'], tablet.Tablet.tablet_type_value['MASTER'],
                   'unexpected master type: %s' % ti['type'])

  # stop replication, make sure we go unhealthy.
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'spare')
  self.check_healthz(tablet_62044, False)

  # make sure the serving graph was updated
  timeout = 10
  while True:
    try:
      utils.run_vtctl_json(
          ['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'])
    except Exception:
      # bug fix: was a bare 'except:'. GetEndPoints failing means the
      # endpoint was removed from the serving graph, which is what we want.
      logging.debug('Tablet is gone from serving graph, good')
      break
    timeout = utils.wait_step(
        'Stopped replication didn\'t trigger removal from serving graph',
        timeout)

  # make sure status web page is unhappy
  self.assertIn(
      '>unhealthy: replication_reporter: '
      'Replication is not running</span></div>', tablet_62044.get_status())

  # make sure the health stream is updated
  health = utils.run_vtctl_json(
      ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
  self.assertIn('replication_reporter: Replication is not running',
                health['realtime_stats']['health_error'])

  # then restart replication, and write data, make sure we go back to healthy
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'replica')

  # make sure status web page is healthy
  self.assertIn('>healthy</span></div>', tablet_62044.get_status())

  # make sure the vars is updated
  v = utils.get_vars(tablet_62044.port)
  self.assertEqual(v['LastHealthMapCount'], 0)

  # now test VtTabletStreamHealth returns the right thing
  stdout, stderr = utils.run_vtctl(
      ['VtTabletStreamHealth', '-count', '2', tablet_62044.tablet_alias],
      trap_output=True, auto_log=True)
  lines = stdout.splitlines()
  self.assertEqual(len(lines), 2)
  for line in lines:
    logging.debug('Got health: %s', line)
    data = json.loads(line)
    self.assertIn('realtime_stats', data)
    self.assertNotIn('health_error', data['realtime_stats'])
    self.assertNotIn('tablet_externally_reparented_timestamp', data)
    self.assertEqual('test_keyspace', data['target']['keyspace'])
    self.assertEqual('0', data['target']['shard'])
    self.assertEqual(3, data['target']['tablet_type'])

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])

  # the replica was in lameduck for 5 seconds, should have been enough
  # to reset its state to spare
  ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
  self.assertEqual(
      ti['type'], tablet.Tablet.tablet_type_value['SPARE'],
      "tablet didn't go to spare while in lameduck mode: %s" % str(ti))
def test_resharding(self):
  """End-to-end horizontal resharding: split shard 80- into 80-C0 and C0-.

  Walks the full legacy workflow: keyspace setup, snapshot/restore based
  clone of the split shards, filtered replication catch-up, SplitDiff
  validation, serving-type migration (rdonly, then replica, then master),
  and finally teardown of the original source shard.
  """
  # Create the keyspace with a bad sharding column on purpose; the next
  # call (without -force) must fail, and the -force call fixes it.
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'keyspace_id', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  # source side: one shard -80, one shard 80- (the one we will split)
  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_slave1,
            shard_1_slave2, shard_1_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
  shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
  shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')
  shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly]:
    t.start_vttablet(wait_for_state=None)
  shard_2_master.wait_for_vttablet_state('CONNECTING')
  shard_2_replica1.wait_for_vttablet_state('NOT_SERVING')
  shard_2_replica2.wait_for_vttablet_state('NOT_SERVING')
  shard_3_master.wait_for_vttablet_state('CONNECTING')
  shard_3_replica.wait_for_vttablet_state('NOT_SERVING')
  shard_3_rdonly.wait_for_vttablet_state('CONNECTING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-',
                   shard_3_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  # the serving graph must still only list the original shards
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # take the snapshot for the split
  utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-',
                   shard_1_slave1.tablet_alias], auto_log=True)

  # wait for tablet's binlog server service to be enabled after snapshot,
  # and check all the others while we're at it
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # perform the restore.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-C0', shard_1_slave1.tablet_alias],
                  auto_log=True)
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/C0-', shard_1_slave1.tablet_alias],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  shard_2_master.wait_for_binlog_player_count(1)
  shard_3_master.wait_for_binlog_player_count(1)

  # check that binlog server exported the stats vars
  self._check_binlog_server_vars(shard_1_slave1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'],
                     auto_log=True)
  # restore the rdonly type on the tablets used by the diff
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
  monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

  # test data goes through again
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000, base=1000)
  logging.debug("Checking 80 percent of data was sent quickly")
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/80-',
                   'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0',
                   shard_2_replica1.tablet_alias])
  logging.debug("Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use the vtworker checker to compare the data again
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name,
                monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name,
                monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-C0 C0-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # scrap the original tablets in the original shard
  for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_rdonly])

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell (-80 must fail: it still has tablets)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  if shard['Cells']:
    self.fail("Non-empty Cells record for shard: %s" % str(shard))

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_2_master,
                       shard_2_replica1, shard_2_replica2, shard_3_master,
                       shard_3_replica, shard_3_rdonly])
def _check_master_cell(self, cell, shard_id, master_cell):
  """Assert that the SrvShard record seen from `cell` names `master_cell`.

  Fetches the serving-graph shard record for test_keyspace/<shard_id> in
  the given cell and verifies its 'master_cell' field.
  """
  shard_path = 'test_keyspace/%s' % shard_id
  srv_shard = utils.run_vtctl_json(['GetSrvShard', cell, shard_path])
  self.assertEqual(srv_shard['master_cell'], master_cell)
def _check_srv_shard(self):
  """Assert that shard test_keyspace/0 in test_nj has master_cell test_nj."""
  srv_shard = utils.run_vtctl_json(
      ['GetSrvShard', 'test_nj', 'test_keyspace/0'])
  self.assertEqual(srv_shard['master_cell'], 'test_nj')
def test_no_mysql_healthcheck(self):
  """This test starts a vttablet with no mysql port, while mysql is down.

  It makes sure vttablet will start properly and be unhealthy.
  Then we start mysql, and make sure vttablet becomes healthy.
  """
  # we need replication to be enabled, so the slave tablet can be healthy.
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
  pos = mysql_flavor().master_position(tablet_62344)
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', tablet_62344.mysql_port, pos)
  tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                      change_master_cmds + ['START SLAVE'])

  # now shutdown all mysqld
  shutdown_procs = [
      tablet_62344.shutdown_mysql(),
      tablet_62044.shutdown_mysql(),
  ]
  utils.wait_procs(shutdown_procs)

  # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('replica', 'test_keyspace', '0',
                           include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    # Since MySQL is down at this point and we want the tablet to start up
    # successfully, we have to use supports_backups=False.
    t.start_vttablet(wait_for_state=None, supports_backups=False,
                     full_mycnf_args=True, include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(t, False)

  # Tell slave to not try to repair replication in healthcheck.
  # The StopSlave will ultimately fail because mysqld is not running,
  # But vttablet should remember that it's not supposed to fix replication.
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias], expect_fail=True)

  # The above notice to not fix replication should survive tablet restart.
  tablet_62044.kill_vttablet()
  tablet_62044.start_vttablet(wait_for_state='NOT_SERVING',
                              full_mycnf_args=True, include_mysql_port=False,
                              supports_backups=False)

  # restart mysqld
  start_procs = [
      tablet_62344.start_mysql(),
      tablet_62044.start_mysql(),
  ]
  utils.wait_procs(start_procs)

  # the master should still be healthy
  utils.run_vtctl(['RunHealthCheck', tablet_62344.tablet_alias],
                  auto_log=True)
  self.check_healthz(tablet_62344, True)

  # the slave will now be healthy, but report a very high replication
  # lag, because it can't figure out what it exactly is.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias],
                  auto_log=True)
  tablet_62044.wait_for_vttablet_state('SERVING')
  self.check_healthz(tablet_62044, True)
  health = utils.run_vtctl_json(
      ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
  self.assertTrue('seconds_behind_master' in health['realtime_stats'])
  # 7200 is the sentinel lag reported when the real lag is unknown
  # (presumably a server-side constant -- confirm in vttablet sources).
  self.assertEqual(health['realtime_stats']['seconds_behind_master'], 7200)
  self.assertIn('serving', health)

  # restart replication, wait until health check goes small
  # (a value of zero is default and won't be in structure)
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  timeout = 10
  while True:
    utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias],
                    auto_log=True)
    health = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
    if 'serving' in health and (
        ('seconds_behind_master' not in health['realtime_stats']) or
        (health['realtime_stats']['seconds_behind_master'] < 30)):
      break
    timeout = utils.wait_step('health delay goes back down', timeout)

  # wait for the tablet to fix its mysql port
  for t in tablet_62344, tablet_62044:
    # wait for mysql port to show up
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
      if 'mysql' in ti['port_map']:
        break
      timeout = utils.wait_step('mysql port in tablet record', timeout)
    self.assertEqual(ti['port_map']['mysql'], t.mysql_port)

  # all done
  tablet.kill_tablets([tablet_62344, tablet_62044])
def test_reparent_cross_cell(self, shard_id='0'): utils.run_vtctl(['CreateKeyspace', 'test_keyspace']) # create the database so vttablets start, as they are serving tablet_62344.create_db('vt_test_keyspace') tablet_62044.create_db('vt_test_keyspace') tablet_41983.create_db('vt_test_keyspace') tablet_31981.create_db('vt_test_keyspace') # Start up a master mysql and vttablet tablet_62344.init_tablet('replica', 'test_keyspace', shard_id, start=True, wait_for_start=False) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id]) self.assertEqual( shard['cells'], ['test_nj'], 'wrong list of cell in Shard: %s' % str(shard['cells'])) # Create a few slaves for testing reparenting. Won't be healthy # as replication is not running. tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True, wait_for_start=False) tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True, wait_for_start=False) tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True, wait_for_start=False) for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]: t.wait_for_vttablet_state('NOT_SERVING') shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id]) self.assertEqual( shard['cells'], ['test_nj', 'test_ny'], 'wrong list of cell in Shard: %s' % str(shard['cells'])) # Force the slaves to reparent assuming that all the datasets are # identical. utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/' + shard_id, tablet_62344.tablet_alias ], auto_log=True) utils.validate_topology(ping_tablets=True) self._check_master_tablet(tablet_62344) # Perform a graceful reparent operation to another cell. utils.run_vtctl([ 'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/' + shard_id, '-new_master', tablet_31981.tablet_alias ], auto_log=True) utils.validate_topology() self._check_master_tablet(tablet_31981) tablet.kill_tablets( [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
def test_vertical_split(self):
  """Vertical split: move the 'moving' tables and view1 to a new keyspace.

  Clones the tables with VerticalSplitClone, verifies filtered replication
  and VerticalSplitDiff, exercises SetKeyspaceServedFrom manual overrides,
  then migrates served types (rdonly, replica, master) to the destination
  keyspace and checks blacklists/redirection at each step.
  """
  # copy the schema, then clone the data with the worker
  utils.run_vtctl(['CopySchemaShard', '--tables', '/moving/,view1',
                   source_rdonly1.tablet_alias,
                   'destination_keyspace/0'], auto_log=True)
  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'VerticalSplitClone',
                      '--tables', '/moving/,view1',
                      '--chunk_count', '10',
                      '--min_rows_per_chunk', '1',
                      '--min_healthy_rdonly_tablets', '1',
                      'destination_keyspace/0'],
                     auto_log=True)

  # check values are present
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'moving1', self.moving1_first, 100)
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'moving2', self.moving2_first, 100)
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'view1', self.moving1_first, 100)

  # check the binlog player is running and exporting vars
  self.check_destination_master(destination_master, ['source_keyspace/0'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(source_replica, horizontal=False)

  # add values to source, make sure they're replicated
  moving1_first_add1 = self._insert_values('moving1', 100)
  # staying1 rows must NOT be replicated; insert them only to create noise
  _ = self._insert_values('staying1', 100)
  moving2_first_add1 = self._insert_values('moving2', 100)
  self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                             'moving1', moving1_first_add1, 100)
  self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                             'moving2', moving2_first_add1, 100)
  self.check_binlog_player_vars(destination_master, ['source_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(source_replica, horizontal=False,
                                min_statements=100, min_transactions=100)

  # use vtworker to compare the data
  logging.debug('Running vtworker VerticalSplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff',
                      '--min_healthy_rdonly_tablets', '1',
                      'destination_keyspace/0'], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablet, make sure we have it all
  self.check_running_binlog_player(destination_master, 700, 300,
                                   extra_text='moving')

  # check query service is off on destination master, as filtered
  # replication is enabled. Even health check should not interfere.
  destination_master_vars = utils.get_vars(destination_master.port)
  self.assertEqual(destination_master_vars['TabletStateName'],
                   'NOT_SERVING')

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                  expect_fail=True)

  # migrate rdonly only in test_ny cell, make sure nothing is migrated
  # in test_nj
  utils.run_vtctl(['MigrateServedFrom', '--cells=test_ny',
                   'destination_keyspace/0', 'rdonly'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                           'ServedFrom(rdonly): source_keyspace\n'
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, None)
  self._check_blacklisted_tables(source_rdonly2, None)

  # migrate test_nj only, using command line manual fix command,
  # and restore it back.
  keyspace_json = utils.run_vtctl_json(
      ['GetKeyspace', 'destination_keyspace'])
  found = False
  for ksf in keyspace_json['served_froms']:
    if ksf['tablet_type'] == topodata_pb2.RDONLY:
      found = True
      # only test_nj should still be served from the source keyspace
      self.assertEqual(ksf['cells'], ['test_nj'])
  self.assertTrue(found)
  utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace',
                   '-remove', '-cells=test_nj', 'destination_keyspace',
                   'rdonly'], auto_log=True)
  keyspace_json = utils.run_vtctl_json(
      ['GetKeyspace', 'destination_keyspace'])
  found = False
  for ksf in keyspace_json['served_froms']:
    if ksf['tablet_type'] == topodata_pb2.RDONLY:
      found = True
  # the rdonly served-from entry must be completely gone now
  self.assertFalse(found)
  utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace',
                   'destination_keyspace', 'rdonly'],
                  auto_log=True)
  keyspace_json = utils.run_vtctl_json(
      ['GetKeyspace', 'destination_keyspace'])
  found = False
  for ksf in keyspace_json['served_froms']:
    if ksf['tablet_type'] == topodata_pb2.RDONLY:
      found = True
      # restored without a cell restriction
      self.assertNotIn('cells', ksf)
  self.assertTrue(found)

  # now serve rdonly from the destination shards
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
  self._check_client_conn_redirection(
      'destination_keyspace', ['master', 'replica'], ['moving1', 'moving2'])

  # then serve replica from the destination shards
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
  self._check_client_conn_redirection(
      'destination_keyspace', ['master'], ['moving1', 'moving2'])

  # move replica back and forth
  utils.run_vtctl(['MigrateServedFrom', '-reverse',
                   'destination_keyspace/0', 'replica'], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])
  self._check_client_conn_redirection(
      'destination_keyspace', ['master'], ['moving1', 'moving2'])

  # then serve master from the destination shards
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                  auto_log=True)
  self._check_srv_keyspace('')
  self._check_blacklisted_tables(source_master, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_replica, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['/moving/', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['/moving/', 'view1'])

  # check the binlog player is gone now
  self.check_no_binlog_player(destination_master)

  # check the stats are correct
  self._check_stats()

  # now remove the tables on the source shard. The blacklisted tables
  # in the source shard won't match any table, make sure that works.
  utils.run_vtctl(['ApplySchema', '-sql=drop view view1',
                   'source_keyspace'], auto_log=True)
  for t in ['moving1', 'moving2']:
    utils.run_vtctl(['ApplySchema', '-sql=drop table %s' % (t),
                     'source_keyspace'], auto_log=True)
  for t in [source_master, source_replica, source_rdonly1, source_rdonly2]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias])
  qr = source_master.execute('select count(1) from staying1')
  self.assertEqual(len(qr['rows']), 1,
                   'cannot read staying1: got %s' % str(qr))

  # test SetShardTabletControl
  self._verify_vtctl_set_shard_tablet_control()
def test_health_check(self):
  """Health-check lifecycle of a replica tablet.

  Starts a master and an uninitialized replica, verifies the replica goes
  from unhealthy to healthy once the shard has a master, checks that the
  replication reporter repairs a stopped slave, and validates the
  VtTabletStreamHealth output (including a QPS > 0 sample).
  """
  # one master, one replica that starts not initialized
  # (for the replica, we let vttablet do the InitTablet)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')

  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  tablet_62344.start_vttablet(wait_for_state=None)
  tablet_62044.start_vttablet(wait_for_state=None,
                              lameduck_period='5s',
                              init_tablet_type='replica',
                              init_keyspace='test_keyspace',
                              init_shard='0')

  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(tablet_62044, False)

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # make sure the unhealthy slave goes to healthy
  tablet_62044.wait_for_vttablet_state('SERVING')
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  self.check_healthz(tablet_62044, True)

  # make sure the master is still master
  ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
  self.assertEqual(ti['type'], topodata_pb2.MASTER,
                   'unexpected master type: %s' % ti['type'])

  # stop replication at the mysql level.
  tablet_62044.mquery('', 'stop slave')
  # vttablet replication_reporter should restart it.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  # insert something on the master and wait for it on the slave.
  tablet_62344.mquery('vt_test_keyspace', [
      'create table repl_test_table (id int)',
      'insert into repl_test_table values (123)'], write=True)
  timeout = 10.0
  while True:
    try:
      result = tablet_62044.mquery('vt_test_keyspace',
                                   'select * from repl_test_table')
      if result:
        self.assertEqual(result[0][0], 123L)
        break
    except MySQLdb.ProgrammingError:
      # Maybe the create table hasn't gone through yet, we wait more
      logging.exception('got this exception waiting for data, ignoring it')
    timeout = utils.wait_step(
        'slave replication repaired by replication_reporter', timeout)

  # stop replication, make sure we don't go unhealthy.
  # (we have a baseline as well, so the time should be good).
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  self.check_healthz(tablet_62044, True)

  # make sure status web page is healthy
  self.assertIn('>healthy</span></div>', tablet_62044.get_status())

  # make sure the health stream is updated
  health = utils.run_vtctl_json(
      ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
  self.assertTrue(
      ('seconds_behind_master' not in health['realtime_stats']) or
      (health['realtime_stats']['seconds_behind_master'] < 30),
      'got unexpected health: %s' % str(health))
  self.assertIn('serving', health)

  # then restart replication, make sure we stay healthy
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])

  # make sure status web page is healthy
  self.assertIn('>healthy</span></div>', tablet_62044.get_status())

  # now test VtTabletStreamHealth returns the right thing
  stdout, _ = utils.run_vtctl(['VtTabletStreamHealth',
                               '-count', '2',
                               tablet_62044.tablet_alias],
                              trap_output=True, auto_log=True)
  lines = stdout.splitlines()
  self.assertEqual(len(lines), 2)
  for line in lines:
    logging.debug('Got health: %s', line)
    data = json.loads(line)
    self.assertIn('realtime_stats', data)
    self.assertIn('serving', data)
    self.assertTrue(data['serving'])
    self.assertNotIn('health_error', data['realtime_stats'])
    self.assertNotIn('tablet_externally_reparented_timestamp', data)
    self.assertEqual('test_keyspace', data['target']['keyspace'])
    self.assertEqual('0', data['target']['shard'])
    self.assertEqual(topodata_pb2.REPLICA, data['target']['tablet_type'])

  # Test that VtTabletStreamHealth reports a QPS >0.0.
  # Therefore, issue several reads first.
  # NOTE: This may be potentially flaky because we'll observe a QPS >0.0
  #       exactly "once" for the duration of one sampling interval (5s) and
  #       after that we'll see 0.0 QPS rates again. If this becomes actually
  #       flaky, we need to read continuously in a separate thread.
  for _ in range(10):
    tablet_62044.execute('select 1 from dual')

  # This may take up to 5 seconds to become true because we sample the query
  # counts for the rates only every 5 seconds (see query_service_stats.go).
  timeout = 10
  while True:
    health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                   tablet_62044.tablet_alias])
    if health['realtime_stats'].get('qps', 0.0) > 0.0:
      break
    timeout = utils.wait_step('QPS >0.0 seen', timeout)

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])
def test_vertical_split(self):
    """End-to-end vertical split: migrates the 'moving' tables (and view1)
    from source_keyspace to destination_keyspace.

    Covers: initial copy (worker or snapshot/restore), filtered replication
    catch-up, VerticalSplitDiff verification, MigrateServedFrom for
    rdonly/replica/master (including a reverse migration), blacklisted-table
    bookkeeping, and SetShardBlacklistedTables behavior.
    """
    utils.run_vtctl(['CreateKeyspace', 'source_keyspace'])
    # destination serves everything from the source until migration happens
    utils.run_vtctl(['CreateKeyspace', '--served-from',
                     'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace',
                     'destination_keyspace'])
    source_master.init_tablet('master', 'source_keyspace', '0')
    source_replica.init_tablet('replica', 'source_keyspace', '0')
    source_rdonly.init_tablet('rdonly', 'source_keyspace', '0')

    # rebuild destination keyspace to make sure there is a serving
    # graph entry, even though there is no tablet yet.
    utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(rdonly): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')

    destination_master.init_tablet('master', 'destination_keyspace', '0')
    destination_replica.init_tablet('replica', 'destination_keyspace', '0')
    destination_rdonly.init_tablet('rdonly', 'destination_keyspace', '0')

    # rebuild again now that destination tablets exist; ServedFrom must hold
    utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(rdonly): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')

    # create databases so vttablet can start behaving normally
    for t in [source_master, source_replica, source_rdonly]:
      t.create_db('vt_source_keyspace')
      t.start_vttablet(wait_for_state=None)
    destination_master.start_vttablet(wait_for_state=None,
                                      target_tablet_type='replica')
    for t in [destination_replica, destination_rdonly]:
      t.start_vttablet(wait_for_state=None)

    # wait for the tablets (destination side has no data yet, so NOT_SERVING)
    for t in [source_master, source_replica, source_rdonly]:
      t.wait_for_vttablet_state('SERVING')
    for t in [destination_master, destination_replica, destination_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['ReparentShard', '-force', 'source_keyspace/0',
                     source_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['ReparentShard', '-force', 'destination_keyspace/0',
                     destination_master.tablet_alias], auto_log=True)

    # read all the keyspaces, this will populate the topology cache.
    self._populate_topo_cache()

    # create the schema on the source keyspace, add some values
    self._create_source_schema()
    moving1_first = self._insert_values('moving1', 100)
    moving2_first = self._insert_values('moving2', 100)
    staying1_first = self._insert_values('staying1', 100)
    staying2_first = self._insert_values('staying2', 100)
    self._check_values(source_master, 'vt_source_keyspace', 'moving1',
                       moving1_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'moving2',
                       moving2_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'staying1',
                       staying1_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'staying2',
                       staying2_first, 100)
    # view1 mirrors moving1's data
    self._check_values(source_master, 'vt_source_keyspace', 'view1',
                       moving1_first, 100)

    # NOTE(review): use_clone_worker appears to be a module-level flag
    # selecting the copy mechanism — confirm where it is set.
    if use_clone_worker:
      # the worker will do everything. We test with source_reader_count=10
      # (down from default=20) as connection pool is not big enough for 20.
      # min_table_size_for_split is set to 1 as to force a split even on the
      # small table we have.
      utils.run_vtworker(['--cell', 'test_nj',
                          '--command_display_interval', '10ms',
                          'VerticalSplitClone',
                          '--tables', 'moving.*,view1',
                          '--strategy', 'populateBlpCheckpoint',
                          '--source_reader_count', '10',
                          '--min_table_size_for_split', '1',
                          'destination_keyspace/0'],
                         auto_log=True)
    else:
      # take the snapshot for the split
      utils.run_vtctl(['MultiSnapshot', '--tables', 'moving.*,view1',
                       source_rdonly.tablet_alias], auto_log=True)
      # perform the restore.
      utils.run_vtctl(['ShardMultiRestore',
                       '--strategy', 'populateBlpCheckpoint',
                       '--tables', 'moving.*,view1',
                       'destination_keyspace/0', source_rdonly.tablet_alias],
                      auto_log=True)

    topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace')

    # check values are present on the destination side
    self._check_values(destination_master, 'vt_destination_keyspace',
                       'moving1', moving1_first, 100)
    self._check_values(destination_master, 'vt_destination_keyspace',
                       'moving2', moving2_first, 100)
    self._check_values(destination_master, 'vt_destination_keyspace',
                       'view1', moving1_first, 100)

    # check the binlog player is running
    destination_master.wait_for_binlog_player_count(1)

    # add values to source, make sure they're replicated
    moving1_first_add1 = self._insert_values('moving1', 100)
    staying1_first_add1 = self._insert_values('staying1', 100)
    moving2_first_add1 = self._insert_values('moving2', 100)
    self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                               'moving1', moving1_first_add1, 100)
    self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                               'moving2', moving2_first_add1, 100)

    # use the vtworker checker to compare the data
    logging.debug("Running vtworker VerticalSplitDiff")
    utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff',
                        'destination_keyspace/0'], auto_log=True)
    # the diff demoted the rdonly tablets; restore their type
    utils.run_vtctl(['ChangeSlaveType', source_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', destination_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)

    utils.pause("Good time to test vtworker for diffs")

    # get status for destination master tablet, make sure we have it all
    destination_master_status = destination_master.get_status()
    self.assertIn('Binlog player state: Running', destination_master_status)
    self.assertIn('moving.*', destination_master_status)
    self.assertIn('<td><b>All</b>: 1000<br><b>Query</b>: 700<br><b>Transaction</b>: 300<br></td>', destination_master_status)
    self.assertIn('</html>', destination_master_status)

    # check query service is off on destination master, as filtered
    # replication is enabled. Even health check should not interfere.
    destination_master_vars = utils.get_vars(destination_master.port)
    self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING')

    # check we can't migrate the master just yet
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                    expect_fail=True)

    # now serve rdonly from the destination shards
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace', ['rdonly'],
        ['master', 'replica'], ['moving1', 'moving2'])

    # then serve replica from the destination shards
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1'])
    self._check_client_conn_redirection('source_keyspace',
                                        'destination_keyspace',
                                        ['replica', 'rdonly'], ['master'],
                                        ['moving1', 'moving2'])

    # move replica back and forth
    utils.run_vtctl(['MigrateServedFrom', '-reverse',
                     'destination_keyspace/0', 'replica'], auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1'])
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace', ['replica', 'rdonly'],
        ['master'], ['moving1', 'moving2'])

    # then serve master from the destination shards
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                    auto_log=True)
    # nothing is served from the source anymore
    self._check_srv_keyspace('')
    self._check_blacklisted_tables(source_master, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace',
        ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2'])

    # check 'vtctl SetShardBlacklistedTables' command works as expected:
    # clear the rdonly entry, re-add it, and then clear all entries.
    utils.run_vtctl(['SetShardBlacklistedTables', 'source_keyspace/0',
                     'rdonly'], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertNotIn('rdonly', shard_json['BlacklistedTablesMap'])
    self.assertIn('replica', shard_json['BlacklistedTablesMap'])
    self.assertIn('master', shard_json['BlacklistedTablesMap'])
    utils.run_vtctl(['SetShardBlacklistedTables', 'source_keyspace/0',
                     'rdonly', 'moving.*,view1'], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertEqual(['moving.*', 'view1'],
                     shard_json['BlacklistedTablesMap']['rdonly'])
    utils.run_vtctl(['SetShardBlacklistedTables', 'source_keyspace/0',
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['SetShardBlacklistedTables', 'source_keyspace/0',
                     'replica'], auto_log=True)
    utils.run_vtctl(['SetShardBlacklistedTables', 'source_keyspace/0',
                     'master'], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertEqual(None, shard_json['BlacklistedTablesMap'])

    # check the binlog player is gone now
    destination_master.wait_for_binlog_player_count(0)

    # optional method to check the stats are correct
    self._check_stats()

    # kill everything
    tablet.kill_tablets([source_master, source_replica, source_rdonly,
                         destination_master, destination_replica,
                         destination_rdonly])
def _test_reparent_graceful(self, shard_id):
    """Covers a full planned (graceful) reparent cycle on one shard.

    Brings up a master and three replicas, elects the initial master via
    InitShardMaster, demotes two replicas to spare, runs
    PlannedReparentShard onto tablet_62044, verifies data flows through the
    new master, and finally checks that a restarted tablet re-registers a
    new port in the serving graph.

    Args:
      shard_id: shard name within test_keyspace (e.g. '0').
    """
    # The vttablets are serving, so their databases must exist before start.
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True)
    if environment.topo_server().flavor() == 'zookeeper':
      shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
      self.assertEqual(shard['cells'], ['test_nj'],
                       'wrong list of cell in Shard: %s' % str(shard['cells']))

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    for t in [tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('SERVING')
    if environment.topo_server().flavor() == 'zookeeper':
      # the test_ny tablet must have extended the shard's cell list
      shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
      self.assertEqual(shard['cells'], ['test_nj', 'test_ny'],
                       'wrong list of cell in Shard: %s' % str(shard['cells']))

    # Recompute the shard layout node - until you do that, it might not be
    # valid.
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are
    # identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/' + shard_id,
                     tablet_62344.tablet_alias])
    utils.validate_topology(ping_tablets=True)

    tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)
    self._check_db_addr(shard_id, 'master', tablet_62344.port)

    # Verify MasterCell is set to new cell.
    self._check_master_cell('test_nj', shard_id, 'test_nj')
    self._check_master_cell('test_ny', shard_id, 'test_nj')

    # Convert two replica to spare. That should leave only one node
    # serving traffic, but still needs to appear in the replication
    # graph.
    utils.run_vtctl(['ChangeSlaveType', tablet_41983.tablet_alias, 'spare'])
    utils.run_vtctl(['ChangeSlaveType', tablet_31981.tablet_alias, 'spare'])
    utils.validate_topology()
    self._check_db_addr(shard_id, 'replica', tablet_62044.port)

    # Run this to make sure it succeeds.
    utils.run_vtctl(['ShardReplicationPositions', 'test_keyspace/' + shard_id],
                    stdout=utils.devnull)

    # Perform a graceful reparent operation.
    utils.pause('_test_reparent_graceful PlannedReparentShard')
    utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/' + shard_id,
                     tablet_62044.tablet_alias], auto_log=True)
    utils.validate_topology()
    self._check_db_addr(shard_id, 'master', tablet_62044.port)

    # insert data into the new master, check the connected slaves work
    self._populate_vt_insert_test(tablet_62044, 1)
    self._check_vt_insert_test(tablet_41983, 1)
    self._check_vt_insert_test(tablet_62344, 1)

    # Verify MasterCell is set to new cell.
    self._check_master_cell('test_nj', shard_id, 'test_nj')
    self._check_master_cell('test_ny', shard_id, 'test_nj')

    tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                         tablet_31981])

    # Test address correction.
    new_port = environment.reserve_ports(1)
    tablet_62044.start_vttablet(port=new_port)

    # Wait until the new address registers.
    timeout = 30.0
    while True:
      try:
        self._check_db_addr(shard_id, 'master', new_port)
        break
      except Exception:
        # BUGFIX: was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt and would make the loop impossible to interrupt.
        timeout = utils.wait_step('waiting for new port to register', timeout,
                                  sleep_time=0.1)

    tablet_62044.kill_vttablet()
def test_reparent_cross_cell(self, shard_id='0'):
    """Reparents the shard onto a master in a different cell (test_ny) and
    verifies the shard record, serving addresses, and MasterCell follow.
    """
    shard_path = 'test_keyspace/' + shard_id
    all_tablets = [tablet_62344, tablet_62044, tablet_41983, tablet_31981]

    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # The vttablets come up serving, so their databases must exist first.
    for tab in all_tablets:
      tab.create_db('vt_test_keyspace')

    # Bring up the initial master; startup is awaited in bulk below.
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id,
                             start=True, wait_for_start=False)
    shard_rec = utils.run_vtctl_json(['GetShard', shard_path])
    self.assertEqual(shard_rec['cells'], ['test_nj'],
                     'wrong list of cell in Shard: %s' %
                     str(shard_rec['cells']))

    # Bring up three replicas to reparent between.
    for tab in (tablet_62044, tablet_41983, tablet_31981):
      tab.init_tablet('replica', 'test_keyspace', shard_id,
                      start=True, wait_for_start=False)
    for tab in all_tablets:
      tab.wait_for_vttablet_state('SERVING')

    # Registering the test_ny tablet must have extended the cell list.
    shard_rec = utils.run_vtctl_json(['GetShard', shard_path])
    self.assertEqual(shard_rec['cells'], ['test_nj', 'test_ny'],
                     'wrong list of cell in Shard: %s' %
                     str(shard_rec['cells']))

    # The shard layout node may be stale until explicitly rebuilt.
    utils.run_vtctl(['RebuildShardGraph', shard_path])
    utils.validate_topology()

    # All datasets are identical, so force-elect the initial master.
    for tab in all_tablets:
      tab.reset_replication()
    utils.run_vtctl(['InitShardMaster', shard_path, tablet_62344.tablet_alias],
                    auto_log=True)
    utils.validate_topology(ping_tablets=True)

    self._check_db_addr(shard_id, 'master', tablet_62344.port)

    # MasterCell should read test_nj from both cells.
    self._check_master_cell('test_nj', shard_id, 'test_nj')
    self._check_master_cell('test_ny', shard_id, 'test_nj')

    # Gracefully reparent onto the tablet living in test_ny.
    utils.pause('test_reparent_cross_cell PlannedReparentShard')
    utils.run_vtctl(['PlannedReparentShard', shard_path,
                     tablet_31981.tablet_alias], auto_log=True)
    utils.validate_topology()

    self._check_db_addr(shard_id, 'master', tablet_31981.port, cell='test_ny')

    # MasterCell should now read test_ny from both cells.
    self._check_master_cell('test_nj', shard_id, 'test_ny')
    self._check_master_cell('test_ny', shard_id, 'test_ny')

    tablet.kill_tablets(all_tablets)
def _test_reparent_graceful(self, shard_id):
    """Covers a planned (graceful) reparent cycle using the flagged
    PlannedReparentShard syntax (-keyspace_shard / -new_master), then checks
    that a restarted tablet re-registers its new port.

    NOTE(review): a method with this same name also appears earlier in the
    file; the later definition wins at class-creation time — confirm which
    one is intended to be kept.

    Args:
      shard_id: shard name within test_keyspace (e.g. '0').
    """
    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    # (initialized as 'replica'; InitShardMaster promotes it below)
    tablet_62344.init_tablet('replica', 'test_keyspace', shard_id, start=True)
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(
        shard['cells'], ['test_nj'],
        'wrong list of cell in Shard: %s' % str(shard['cells']))

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    for t in [tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('NOT_SERVING')
    # the test_ny tablet must have extended the shard's cell list
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(
        shard['cells'], ['test_nj', 'test_ny'],
        'wrong list of cell in Shard: %s' % str(shard['cells']))

    # Force the slaves to reparent assuming that all the datasets are
    # identical.
    utils.run_vtctl([
        'InitShardMaster', '-force', 'test_keyspace/' + shard_id,
        tablet_62344.tablet_alias
    ])
    utils.validate_topology(ping_tablets=True)

    tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)
    self._check_master_tablet(tablet_62344)

    utils.validate_topology()

    # Run this to make sure it succeeds.
    stdout, _ = utils.run_vtctl(
        ['ShardReplicationPositions', 'test_keyspace/' + shard_id],
        trap_output=True)
    lines = stdout.splitlines()
    self.assertEqual(len(lines), 4)  # one master, three slaves
    self.assertIn('master', lines[0])  # master first

    # Perform a graceful reparent operation.
    utils.run_vtctl([
        'PlannedReparentShard', '-keyspace_shard',
        'test_keyspace/' + shard_id, '-new_master',
        tablet_62044.tablet_alias
    ], auto_log=True)
    utils.validate_topology()

    self._check_master_tablet(tablet_62044)

    # insert data into the new master, check the connected slaves work
    self._populate_vt_insert_test(tablet_62044, 1)
    self._check_vt_insert_test(tablet_41983, 1)
    self._check_vt_insert_test(tablet_62344, 1)

    tablet.kill_tablets(
        [tablet_62344, tablet_62044, tablet_41983, tablet_31981])

    # Test address correction.
    new_port = environment.reserve_ports(1)
    tablet_62044.start_vttablet(port=new_port)

    # Wait until the new address registers.
    timeout = 30.0
    while True:
      try:
        self._check_master_tablet(tablet_62044, port=new_port)
        break
      except protocols_flavor().client_error_exception_type():
        timeout = utils.wait_step('waiting for new port to register',
                                  timeout, sleep_time=0.1)

    tablet_62044.kill_vttablet()
def test_get_keyspace(self):
    """Checks GetKeyspace reports the expected sharding column info."""
    keyspace_info = utils.run_vtctl_json(['GetKeyspace', UNSHARDED_KEYSPACE])
    self.assertEqual(keyspace_info['sharding_column_name'], 'keyspace_id')
    self.assertEqual(keyspace_info['sharding_column_type'], 1)
def test_health_check(self):
    """Exercises the tablet health-check pipeline.

    Starts one master and one spare-that-becomes-replica, stops replication
    on the slave to drive replication_lag to 'high', verifies the tablet
    record / serving graph / status page / vars all reflect it, restarts
    replication, verifies recovery, and checks the lameduck period resets
    the slave to 'spare' on shutdown.
    """
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # one master, one replica that starts in spare
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('spare', 'test_keyspace', '0')

    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')

    tablet_62344.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
    # lameduck_period of 5s is relied on by the final 'spare' check below
    tablet_62044.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica',
                                lameduck_period='5s')
    tablet_62344.wait_for_vttablet_state('SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl([
        'ReparentShard', '-force', 'test_keyspace/0',
        tablet_62344.tablet_alias
    ])

    # make sure the 'spare' slave goes to 'replica'
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
      if ti['Type'] == "replica":
        logging.info("Slave tablet went to replica, good")
        break
      timeout = utils.wait_step('slave tablet going to replica', timeout)

    # make sure the master is still master
    ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
    self.assertEqual(ti['Type'], 'master',
                     "unexpected master type: %s" % ti['Type'])

    # stop replication on the slave, see it trigger the slave going
    # slightly unhealthy
    tablet_62044.mquery('', 'stop slave')
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
      if 'Health' in ti and ti['Health']:
        if 'replication_lag' in ti['Health']:
          if ti['Health']['replication_lag'] == 'high':
            logging.info(
                "Slave tablet replication_lag went to high, good")
            break
      timeout = utils.wait_step('slave has high replication lag', timeout)

    # make sure the serving graph was updated
    ep = utils.run_vtctl_json(
        ['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'])
    if not ep['entries'][0]['health']:
      self.fail(
          'Replication lag parameter not propagated to serving graph: %s'
          % str(ep))
    self.assertEqual(
        ep['entries'][0]['health']['replication_lag'], 'high',
        'Replication lag parameter not '
        'propagated to serving graph: %s' % str(ep))

    # make sure status web page is unhappy
    self.assertIn('>unhappy</span></div>', tablet_62044.get_status())

    # make sure the vars is updated
    v = utils.get_vars(tablet_62044.port)
    self.assertEqual(v['LastHealthMapCount'], 1)

    # then restart replication, make sure we go back to healthy
    tablet_62044.mquery('', 'start slave')
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
      if 'Health' in ti and ti['Health']:
        if 'replication_lag' in ti['Health']:
          if ti['Health']['replication_lag'] == 'high':
            # still lagging: keep waiting
            timeout = utils.wait_step(
                'slave has no replication lag', timeout)
            continue
      logging.info("Slave tablet replication_lag is gone, good")
      break

    # make sure status web page is healthy
    self.assertIn('>healthy</span></div>', tablet_62044.get_status())

    # make sure the vars is updated
    v = utils.get_vars(tablet_62044.port)
    self.assertEqual(v['LastHealthMapCount'], 0)

    # kill the tablets
    tablet.kill_tablets([tablet_62344, tablet_62044])

    # the replica was in lameduck for 5 seconds, should have been enough
    # to reset its state to spare
    ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
    self.assertEqual(
        ti['Type'], 'spare',
        "tablet didn't go to spare while in lameduck mode: %s" % str(ti))
def _test_sanity(self):
    """Brings up a single master tablet and exercises basic vtctl actions:
    queries, dba fetches, pings, read-only toggling, validation, scrapping.
    """
    # Build the keyspace/shard and declare the master tablet.
    utils.run_vtctl(['CreateKeyspace', '-force', 'test_keyspace'])
    utils.run_vtctl(['createshard', '-force', 'test_keyspace/0'])
    tablet_62344.init_tablet('master', 'test_keyspace', '0', parent=False)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'])
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(
        ['GetSrvShard', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')

    # if these statements don't run before the tablet it will wedge waiting for the
    # db to become accessible. this is more a bug than a feature.
    tablet_62344.populate('vt_test_keyspace', self._create_vt_select_test,
                          self._populate_vt_select_test)

    tablet_62344.start_vttablet()

    # The query service must be reachable right after startup.
    query_output, _ = utils.run_vtctl(
        ['Query', 'test_nj', 'test_keyspace', 'select * from vt_select_test'],
        trap_output=True)
    output_lines = query_output.splitlines()
    self.assertEqual(
        len(output_lines), 5,
        "expected 5 rows in vt_select_test: %s %s" % (str(output_lines),
                                                      query_output))

    # Direct dba fetches must work as well.
    fetch_result = utils.run_vtctl_json([
        'ExecuteFetch', '-want_fields', tablet_62344.tablet_alias,
        'select * from vt_test_keyspace.vt_select_test'
    ])
    self.assertEqual(
        len(fetch_result['Rows']), 4,
        "expected 4 rows in vt_select_test: %s" % str(fetch_result))
    self.assertEqual(
        len(fetch_result['Fields']), 2,
        "expected 2 fields in vt_select_test: %s" % str(fetch_result))

    # Both ping variants must succeed.
    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])
    utils.run_vtctl(['RpcPing', tablet_62344.tablet_alias])

    # Exercise read-only / read-write toggles and master demotion.
    utils.run_vtctl(['SetReadOnly', tablet_62344.tablet_alias])
    utils.wait_db_read_only(62344)
    utils.run_vtctl(['SetReadWrite', tablet_62344.tablet_alias])
    utils.check_db_read_write(62344)
    utils.run_vtctl(['DemoteMaster', tablet_62344.tablet_alias])
    utils.wait_db_read_only(62344)

    utils.validate_topology()
    utils.run_vtctl('ValidateKeyspace test_keyspace')
    # not pinging tablets, as it enables replication checks, and they
    # break because we only have a single master, no slaves
    utils.run_vtctl('ValidateShard -ping-tablets=false test_keyspace/0')
    srv_shard = utils.run_vtctl_json(
        ['GetSrvShard', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')

    # Tear down: stop the tablet, then recycle it through idle/scrap.
    tablet_62344.kill_vttablet()
    tablet_62344.init_tablet('idle')
    tablet_62344.scrap(force=True)
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', keyspace_id_type ]) shard_0_master.init_tablet('replica', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, they won't be healthy) for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 
'-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for 
s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. 
utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'LegacySplitClone', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. 
logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all self.check_running_binlog_player(shard_2_master, 4000, 2000) self.check_running_binlog_player(shard_3_master, 4000, 2000) # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, 
min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 
'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace('test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) 
utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
def test_restart_during_action(self):
    """Checks that a scheduled vtaction survives a vttablet restart.

    Schedules a long-running Sleep action plus a queued Ping, kills and
    restarts the vttablet while they are pending, then verifies
    WaitForAction semantics.  On zookeeper, also sanity-checks the
    exported connection-state vars.
    """
    # Start up a master mysql and vttablet.
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
    utils.validate_topology()
    srv_shard = utils.run_vtctl_json(
        ['GetSrvShard', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(srv_shard['MasterCell'], 'test_nj')
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.start_vttablet()

    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

    # Schedule a long action: the Ping below blocks behind the Sleep
    # unless we hit a scheduling race.
    utils.run_vtctl(['-no-wait', 'Sleep', tablet_62344.tablet_alias, '15s'],
                    stdout=utils.devnull)
    action_path, _ = utils.run_vtctl(
        ['-no-wait', 'Ping', tablet_62344.tablet_alias], trap_output=True)
    action_path = action_path.strip()

    # Kill the agent, leaving the vtaction running, then restart the agent.
    tablet_62344.kill_vttablet()
    tablet_62344.start_vttablet()

    # We expect this action with a short wait time to fail. This isn't the
    # best check and has some potential for flakiness.
    utils.run_vtctl(['-wait-time', '2s', 'WaitForAction', action_path],
                    expect_fail=True)

    # Wait until the background sleep action is done, otherwise there will
    # be a leftover vtaction whose result may overwrite running actions.
    # NOTE(alainjobart): Yes, I've seen it happen, it's a pain to debug:
    # the zombie Sleep clobbers the Clone command in the following tests.
    utils.run_vtctl(['-wait-time', '20s', 'WaitForAction', action_path],
                    auto_log=True)

    if environment.topo_server_implementation == 'zookeeper':
        # Extra small test: we ran for a while, get the states we were in,
        # make sure they're accounted for properly.
        # First the query engine states.
        v = utils.get_vars(tablet_62344.port)
        # Lazy %-args: formatting only happens if debug logging is enabled.
        logging.debug('vars: %s', v)

        # Then the Zookeeper connections.
        if v['ZkMetaConn']['test_nj']['Current'] != 'Connected':
            self.fail('invalid zk test_nj state: %s' %
                      v['ZkMetaConn']['test_nj']['Current'])
        if v['ZkMetaConn']['global']['Current'] != 'Connected':
            self.fail('invalid zk global state: %s' %
                      v['ZkMetaConn']['global']['Current'])
        if v['ZkMetaConn']['test_nj']['DurationConnected'] < 10e9:
            # Bug fix: TestCase.fail() takes a single message argument; the
            # original passed the duration as a second positional argument,
            # which raises TypeError instead of reporting the failure.
            self.fail('not enough time in Connected state: %u' %
                      v['ZkMetaConn']['test_nj']['DurationConnected'])
        if v['TabletType'] != 'master':
            self.fail('TabletType not exported correctly')

    tablet_62344.kill_vttablet()
def test_srv_vschema(self):
    """Verifies the served SrvVSchema contains exactly the expected keyspaces."""
    srv_vschema = utils.run_vtctl_json(['GetSrvVSchema', 'test_nj'])
    keyspaces = srv_vschema['keyspaces']
    self.assertEqual(len(keyspaces), 2,
                     'wrong vschema: %s' % str(srv_vschema))
    for expected_keyspace in ('user', 'lookup'):
        self.assertIn(expected_keyspace, keyspaces)
def test_vtgate(self):
    """End-to-end smoke test of the vtgate python client against one shard.

    Exercises _execute, _execute_batch, _stream_execute, transactions
    (rollback and commit), DML error handling, and two interleaved
    connections (one of them driven from a background thread).
    """
    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
    utils.run_vtctl(['CreateShard', 'test_keyspace/0'])
    tablet_62344.init_tablet('master', 'test_keyspace', '0', parent=False)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'])
    utils.validate_topology()
    srvShard = utils.run_vtctl_json(
        ['GetSrvShard', 'test_nj', 'test_keyspace/0'])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')

    # if these statements don't run before the tablet it will wedge waiting for the
    # db to become accessible. this is more a bug than a feature.
    tablet_62344.mquery("", ["set global read_only = off"])
    tablet_62344.populate('vt_test_keyspace', self._create_vt_select_test,
                          self._populate_vt_select_test)

    tablet_62344.start_vttablet()
    gate_proc, gate_port = utils.vtgate_start()

    conn = vtgate.connect("localhost:%s" % (gate_port), "master",
                          "test_keyspace", "0", 2.0)

    # _execute: the fixture populates 4 rows with 2 columns each.
    (result, count, lastrow, fields) = conn._execute(
        "select * from vt_select_test", {})
    self.assertEqual(count, 4, "want 4, got %d" % (count))
    self.assertEqual(len(fields), 2, "want 2, got %d" % (len(fields)))

    # _execute_batch: two bound queries in one round trip, one rowset each.
    queries = [
        "select * from vt_select_test where id = :id",
        "select * from vt_select_test where id = :id",
    ]
    bindvars = [
        {"id": 1},
        {"id": 2},
    ]
    rowsets = conn._execute_batch(queries, bindvars)
    self.assertEqual(rowsets[0][0][0][0], 1)
    self.assertEqual(rowsets[1][0][0][0], 2)

    # _stream_execute: drain the stream until _stream_next returns falsy.
    (result, count, lastrow, fields) = conn._stream_execute(
        "select * from vt_select_test", {})
    self.assertEqual(len(fields), 2, "want 2, got %d" % (len(fields)))
    count = 0
    while 1:
        r = conn._stream_next()
        if not r:
            break
        count += 1
    self.assertEqual(count, 4, "want 4, got %d" % (count))

    # begin-rollback: the insert must not be visible afterwards.
    conn.begin()
    conn._execute("insert into vt_select_test values(:id, :msg)",
                  {"id": 5, "msg": "test4"})
    conn.rollback()
    (result, count, lastrow, fields) = conn._execute(
        "select * from vt_select_test", {})
    self.assertEqual(count, 4, "want 4, got %d" % (count))

    # begin-commit: the same insert must now stick.
    conn.begin()
    conn._execute("insert into vt_select_test values(:id, :msg)",
                  {"id": 5, "msg": "test4"})
    conn.commit()
    (result, count, lastrow, fields) = conn._execute(
        "select * from vt_select_test", {})
    self.assertEqual(count, 5, "want 5, got %d" % (count))

    # error on dml. We still need to get a transaction id
    conn.begin()
    with self.assertRaises(dbexceptions.IntegrityError):
        conn._execute("insert into vt_select_test values(:id, :msg)",
                      {"id": 5, "msg": "test4"})
    self.assertTrue(conn.session["ShardSessions"][0]["TransactionId"] != 0)
    conn.commit()

    # interleaving: a second connection queries from a background thread
    # while this one mixes plain and streaming reads.
    conn2 = vtgate.connect("localhost:%s" % (gate_port), "master",
                           "test_keyspace", "0", 2.0)
    thd = threading.Thread(target=self._query_lots, args=(conn2, ))
    thd.start()
    for i in xrange(250):
        (result, count, lastrow, fields) = conn._execute(
            "select id from vt_select_test where id = 2", {})
        self.assertEqual(result, [(2, )])
        if i % 10 == 0:
            # Every 10th iteration, interleave a fully-drained stream.
            conn._stream_execute(
                "select id from vt_select_test where id = 3", {})
            while 1:
                result = conn._stream_next()
                if not result:
                    break
                self.assertEqual(result, (3, ))
    thd.join()

    # close
    conn.close()

    utils.vtgate_kill(gate_proc)
    tablet_62344.kill_vttablet()
def test_no_mysql_healthcheck(self):
    """Starts a vttablet with no mysql port while mysql is down.

    Verifies the vttablet starts properly but reports unhealthy; then
    brings mysql back (restarting replication on the slave) and verifies
    the tablets become healthy and re-discover their mysql port.
    """
    all_tablets = (tablet_62344, tablet_62044)

    # Replication must be configured, so the slave tablet can get healthy.
    for t in all_tablets:
        t.create_db('vt_test_keyspace')
    master_pos = mysql_flavor().master_position(tablet_62344)
    # Use 'localhost' as hostname because Travis CI worker hostnames
    # are too long for MySQL replication.
    change_master_cmds = mysql_flavor().change_master_commands(
        'localhost', tablet_62344.mysql_port, master_pos)
    tablet_62044.mquery(
        '',
        ['RESET MASTER', 'RESET SLAVE'] + change_master_cmds +
        ['START SLAVE'])

    # Take down every mysqld.
    utils.wait_procs([t.shutdown_mysql() for t in all_tablets])

    # Start the tablets; with mysqld gone they must end up NOT_SERVING.
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('spare', 'test_keyspace', '0',
                             include_mysql_port=False)
    for t in all_tablets:
        t.start_vttablet(wait_for_state=None,
                         target_tablet_type='replica',
                         full_mycnf_args=True,
                         include_mysql_port=False)
    for t in all_tablets:
        t.wait_for_vttablet_state('NOT_SERVING')
        self.check_healthz(t, False)

    # Bring mysqld back.
    utils.wait_procs([t.start_mysql() for t in all_tablets])

    # The master should still be healthy.
    utils.run_vtctl(
        ['RunHealthCheck', tablet_62344.tablet_alias, 'replica'],
        auto_log=True)
    self.check_healthz(tablet_62344, True)

    # The slave won't be healthy at first, as replication is not running.
    utils.run_vtctl(
        ['RunHealthCheck', tablet_62044.tablet_alias, 'replica'],
        auto_log=True)
    self.check_healthz(tablet_62044, False)
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')

    # Restart replication; the tablet should then become healthy and fix
    # its mysql port.
    tablet_62044.mquery('', ['START SLAVE'])
    utils.run_vtctl(
        ['RunHealthCheck', tablet_62044.tablet_alias, 'replica'],
        auto_log=True)
    tablet_62044.wait_for_vttablet_state('SERVING')
    self.check_healthz(tablet_62044, True)

    for t in all_tablets:
        # Wait for the mysql port to show up in the tablet record.
        remaining = 10
        while True:
            tablet_info = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
            if 'mysql' in tablet_info['port_map']:
                break
            remaining = utils.wait_step('mysql port in tablet record',
                                        remaining)
        self.assertEqual(tablet_info['port_map']['mysql'], t.mysql_port)

    # All done.
    tablet.kill_tablets([tablet_62344, tablet_62044])
def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_ksid_col', base_sharding.keyspace_id_type ]) shard_0_master.init_tablet('replica', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = ( base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES) # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, they won't be healthy) for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 
'-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0') shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) 
utils.run_vtctl([ 'InitShardMaster', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # Run vtworker as daemon for the following SplitClone commands. worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg( ['--cell', 'test_nj', '--command_display_interval', '10ms'], auto_log=True) # Copy the data from the source to the destination shards. # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. # # Initial clone (online). 
workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 2, 0, 0, 0) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Test the correct handling of keyspace_id changes which happen after # the first clone. # Let row 2 go to shard 3 instead of shard 2. shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set' ' custom_ksid_col=0xD000000000000000 WHERE id=2', write=True) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Row 2 will be deleted from shard 2 and inserted to shard 3. self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 0, 1, 1) self._check_value(shard_2_master, 'resharding1', 2, 'msg2', 0xD000000000000000, should_be_here=False) self._check_value(shard_3_master, 'resharding1', 2, 'msg2', 0xD000000000000000) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id again. 
shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set' ' custom_ksid_col=0x9000000000000000 WHERE id=2', write=True) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--offline=false', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Row 2 will be deleted from shard 3 and inserted to shard 2. self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 0, 1, 1) self._check_value(shard_2_master, 'resharding1', 2, 'msg2', 0x9000000000000000) self._check_value(shard_3_master, 'resharding1', 2, 'msg2', 0x9000000000000000, should_be_here=False) # Reset vtworker such that we can run the next command. workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Modify the destination shard. SplitClone will revert the changes. # Delete row 2 (provokes an insert). shard_2_master.mquery('vt_test_keyspace', 'delete from resharding1 where id=2', write=True) # Update row 3 (provokes an update). shard_3_master.mquery( 'vt_test_keyspace', "update resharding1 set msg='msg-not-3' where id=3", write=True) # Insert row 4 and 5 (provokes a delete). self._insert_value(shard_3_master, 'resharding1', 4, 'msg4', 0xD000000000000000) self._insert_value(shard_3_master, 'resharding1', 5, 'msg5', 0xD000000000000000) workerclient_proc = utils.run_vtworker_client_bg([ 'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count', '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], worker_rpc_port) utils.wait_procs([workerclient_proc]) # Change tablet, which was taken offline, back to rdonly. 
utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1', 1, 1, 2, 0) self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1', 0, 0, 0, 2) # Terminate worker daemon because it is no longer needed. utils.kill_sub_process(worker_proc, soft=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # When the binlog players/filtered replication is turned on, the query # service must be turned off on the destination masters. # The tested behavior is a safeguard to prevent that somebody can # accidentally modify data on the destination masters while they are not # migrated yet and the source shards are still the source of truth. shard_2_master.wait_for_vttablet_state('NOT_SERVING') shard_3_master.wait_for_vttablet_state('NOT_SERVING') # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_throttler_service(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_throttler_service(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
logging.debug('Inserting lots of data on source shard') self._insert_lots(1000) logging.debug('Executing MultiValue Insert Queries') self._exec_multi_shard_dmls() logging.debug('Checking 80 percent of data is sent quickly') v = self._check_lots_timeout(1000, 80, 5) if v != 100: # small optimization: only do this check if we don't have all the data # already anyway. logging.debug('Checking all data goes through eventually') self._check_lots_timeout(1000, 100, 20) logging.debug('Checking no data was sent the wrong way') self._check_lots_not_present(1000) logging.debug('Checking MultiValue Insert Queries') self._check_multi_shard_values() self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'], seconds_behind_master_max=30) self.check_binlog_server_vars(shard_1_slave1, horizontal=True, min_statements=1000, min_transactions=1000) # use vtworker to compare the data (after health-checking the destination # rdonly tablets so discovery works) utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias]) logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) utils.pause('Good time to test vtworker for diffs') # get status for destination master tablets, make sure we have it all if base_sharding.use_rbr: # We submitted non-annotated DMLs, that are properly routed # with RBR, but not with SBR. So the first shard counts # are smaller. In the second shard, we submitted statements # that affect more than one keyspace id. These will result # in two queries with RBR. So the count there is higher. 
self.check_running_binlog_player(shard_2_master, 4018, 2008) self.check_running_binlog_player(shard_3_master, 4028, 2008) else: self.check_running_binlog_player(shard_2_master, 4022, 2008) self.check_running_binlog_player(shard_3_master, 4024, 2008) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1) monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2) # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias]) # test data goes through again logging.debug('Inserting lots of data on source shard') self._insert_lots(1000, base=1000) logging.debug('Checking 80 percent of data was sent quickly') self._check_lots_timeout(1000, 80, 5, base=1000) self.check_binlog_server_vars(shard_1_slave2, horizontal=True, min_statements=800, min_transactions=800) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere (we run it to be sure). 
utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias], auto_log=True) for master in [shard_2_master, shard_3_master]: utils.check_tablet_query_service(self, master, False, False) stream_health = utils.run_vtctl_json( ['VtTabletStreamHealth', '-count', '1', master.tablet_alias]) logging.debug('Got health: %s', str(stream_health)) self.assertIn('realtime_stats', stream_health) self.assertNotIn('serving', stream_health) # check the destination master 3 is healthy, even though its query # service is not running (if not healthy this would exception out) shard_3_master.get_healthz() # now serve rdonly from the split shards, in test_nj only utils.run_vtctl([ 'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-', 'rdonly' ], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace( 'test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # now serve rdonly from the split shards, everywhere utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_srv_keyspace( 'test_ny', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 
80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False) utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True) utils.check_tablet_query_service(self, shard_1_rdonly1, False, True) # then serve replica from the split shards destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-'] utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_slave2, False, True) # move replica back and forth utils.run_vtctl( ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'], auto_log=True) # After a backwards migration, queryservice should be enabled on # source and disabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, True, False) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly. 
utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, False) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'], auto_log=True) # After a forwards migration, queryservice should be disabled on # source and enabled on destinations utils.check_tablet_query_service(self, shard_1_slave2, False, True) # Destination tablets would have query service disabled for other # reasons than the migration, so check the shard record instead of # the tablets directly utils.check_shard_query_services(self, destination_shards, topodata_pb2.REPLICA, True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') # reparent shard_2 to shard_2_replica1, then insert more data and # see it flow through still utils.run_vtctl([ 'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-c0', '-new_master', shard_2_replica1.tablet_alias ]) # update our test variables to point at the new master shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master logging.debug( 'Inserting lots of data on source shard after reparenting') self._insert_lots(3000, base=2000) logging.debug('Checking 80 percent of data was sent fairly quickly') self._check_lots_timeout(3000, 80, 10, base=2000) # use vtworker to compare the data again logging.debug('Running vtworker SplitDiff') utils.run_vtworker([ '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated', '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], 
auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # going to migrate the master now, check the delays monitor_thread_1.done = True monitor_thread_2.done = True insert_thread_1.done = True insert_thread_2.done = True logging.debug( 'DELAY 1: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms, monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count) logging.debug( 'DELAY 2: %s max_lag=%d ms avg_lag=%d ms', monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms, monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count) # mock with the SourceShard records to test 'vtctl SourceShardDelete' # and 'vtctl SourceShardAdd' utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'], auto_log=True) utils.run_vtctl([ 'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0', 'test_keyspace/80-' ], auto_log=True) # then serve master from the split shards, make sure the source master's # query service is now turned off utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], auto_log=True) utils.check_srv_keyspace( 'test_nj', 'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n' 'Partitions(rdonly): -80 80-c0 c0-\n' 'Partitions(replica): -80 80-c0 c0-\n', keyspace_id_type=base_sharding.keyspace_id_type, sharding_column_name='custom_ksid_col') utils.check_tablet_query_service(self, shard_1_master, False, True) # check the binlog players are gone now self.check_no_binlog_player(shard_2_master) self.check_no_binlog_player(shard_3_master) # delete the original tablets in the original shard tablet.kill_tablets([ shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]) for t in [ shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True) utils.run_vtctl( ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias], auto_log=True) # rebuild the serving 
graph, all mentions of the old shards shoud be gone utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # test RemoveShardCell utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True, expect_fail=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True) utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True) shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-']) self.assertNotIn('cells', shard) # delete the original shard utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True) # make sure we can't delete the destination shard now that it's serving _, stderr = utils.run_vtctl(['DeleteShard', 'test_keyspace/80-c0'], expect_fail=True) self.assertIn('is still serving, cannot delete it', stderr) # kill everything tablet.kill_tablets([ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master, shard_2_replica1, shard_2_replica2, shard_2_rdonly1, shard_3_master, shard_3_replica, shard_3_rdonly1 ])
def test_vertical_split(self):
  """End-to-end vertical split driven entirely by vtworker.

  Copies the schema for tables 'moving.*' and 'view1' to the destination
  keyspace, clones the data with VerticalSplitClone, verifies filtered
  replication keeps the destination in sync, then migrates served types
  (rdonly, replica, master) over — including a backward/forward replica
  migration — checking blacklisted tables and client redirection at each
  step.
  """
  # the worker will do everything. We test with source_reader_count=10
  # (down from default=20) as connection pool is not big enough for 20.
  # min_table_size_for_split is set to 1 as to force a split even on the
  # small table we have.
  utils.run_vtctl([
      'CopySchemaShard', '--tables', 'moving.*,view1',
      source_rdonly1.tablet_alias, 'destination_keyspace/0'
  ], auto_log=True)
  utils.run_vtworker([
      '--cell', 'test_nj', '--command_display_interval', '10ms',
      'VerticalSplitClone', '--tables', 'moving.*,view1',
      '--source_reader_count', '10', '--min_table_size_for_split', '1',
      'destination_keyspace/0'
  ], auto_log=True)
  # One of the two source rdonly tablets went spare after the clone.
  # Force a healthcheck on both to get them back to "rdonly".
  for t in [source_rdonly1, source_rdonly2]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly'])

  # check values are present
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'moving1', self.moving1_first, 100)
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'moving2', self.moving2_first, 100)
  # view1 mirrors moving1, so it shares moving1's first id
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'view1', self.moving1_first, 100)

  # check the binlog player is running
  destination_master.wait_for_binlog_player_count(1)

  # add values to source, make sure they're replicated
  # ('staying1' is not in the split table list, so it is inserted but
  # never checked on the destination)
  moving1_first_add1 = self._insert_values('moving1', 100)
  _ = self._insert_values('staying1', 100)
  moving2_first_add1 = self._insert_values('moving2', 100)
  self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                             'moving1', moving1_first_add1, 100)
  self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                             'moving2', moving2_first_add1, 100)

  # use vtworker to compare the data
  for t in [destination_rdonly1, destination_rdonly2]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly'])
  logging.debug('Running vtworker VerticalSplitDiff')
  utils.run_vtworker([
      '-cell', 'test_nj', 'VerticalSplitDiff', 'destination_keyspace/0'
  ], auto_log=True)
  # One of each source and dest rdonly tablet went spare after the diff.
  # Force a healthcheck on all four to get them back to "rdonly".
  for t in [
      source_rdonly1, source_rdonly2, destination_rdonly1,
      destination_rdonly2
  ]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly'])

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablet, make sure we have it all
  destination_master_status = destination_master.get_status()
  self.assertIn('Binlog player state: Running', destination_master_status)
  self.assertIn('moving.*', destination_master_status)
  self.assertIn(
      '<td><b>All</b>: 1000<br><b>Query</b>: 700<br>'
      '<b>Transaction</b>: 300<br></td>', destination_master_status)
  self.assertIn('</html>', destination_master_status)

  # check query service is off on destination master, as filtered
  # replication is enabled. Even health check should not interfere.
  destination_master_vars = utils.get_vars(destination_master.port)
  self.assertEqual(destination_master_vars['TabletStateName'],
                   'NOT_SERVING')

  # check we can't migrate the master just yet
  utils.run_vtctl(
      ['MigrateServedFrom', 'destination_keyspace/0', 'master'],
      expect_fail=True)

  # migrate rdonly only in test_ny cell, make sure nothing is migrated
  # in test_nj
  utils.run_vtctl([
      'MigrateServedFrom', '--cells=test_ny', 'destination_keyspace/0',
      'rdonly'
  ], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                           'ServedFrom(rdonly): source_keyspace\n'
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, None)
  self._check_blacklisted_tables(source_rdonly2, None)

  # migrate test_nj only, using command line manual fix command,
  # and restore it back.
  keyspace_json = utils.run_vtctl_json(
      ['GetKeyspace', 'destination_keyspace'])
  found = False
  for ksf in keyspace_json['served_froms']:
    if ksf['tablet_type'] == topodata_pb2.RDONLY:
      found = True
      # after the partial migration, only test_nj should still be served
      # from the source keyspace
      self.assertEqual(ksf['cells'], ['test_nj'])
  self.assertTrue(found)
  utils.run_vtctl([
      'SetKeyspaceServedFrom', '-source=source_keyspace', '-remove',
      '-cells=test_nj', 'destination_keyspace', 'rdonly'
  ], auto_log=True)
  keyspace_json = utils.run_vtctl_json(
      ['GetKeyspace', 'destination_keyspace'])
  found = False
  for ksf in keyspace_json['served_froms']:
    if ksf['tablet_type'] == topodata_pb2.RDONLY:
      found = True
  # the rdonly ServedFrom entry must be gone after the -remove
  self.assertFalse(found)
  utils.run_vtctl([
      'SetKeyspaceServedFrom', '-source=source_keyspace',
      'destination_keyspace', 'rdonly'
  ], auto_log=True)
  keyspace_json = utils.run_vtctl_json(
      ['GetKeyspace', 'destination_keyspace'])
  found = False
  for ksf in keyspace_json['served_froms']:
    if ksf['tablet_type'] == topodata_pb2.RDONLY:
      found = True
      # restored entry applies to all cells, so no 'cells' field
      self.assertNotIn('cells', ksf)
  self.assertTrue(found)

  # now serve rdonly from the destination shards
  utils.run_vtctl(
      ['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'],
      auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection('destination_keyspace',
                                      ['master', 'replica'],
                                      ['moving1', 'moving2'])

  # then serve replica from the destination shards
  utils.run_vtctl(
      ['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
      auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection('destination_keyspace', ['master'],
                                      ['moving1', 'moving2'])

  # move replica back and forth
  utils.run_vtctl([
      'MigrateServedFrom', '-reverse', 'destination_keyspace/0', 'replica'
  ], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n'
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  utils.run_vtctl(
      ['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
      auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection('destination_keyspace', ['master'],
                                      ['moving1', 'moving2'])

  # then serve master from the destination shards
  utils.run_vtctl(
      ['MigrateServedFrom', 'destination_keyspace/0', 'master'],
      auto_log=True)
  self._check_srv_keyspace('')
  self._check_blacklisted_tables(source_master, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])

  # check the binlog player is gone now
  destination_master.wait_for_binlog_player_count(0)

  # check the stats are correct
  self._check_stats()

  self._verify_vtctl_set_shard_tablet_control()
def test_custom_end_to_end(self):
  """Runs through the common operations of a custom sharded keyspace.

  Tests creation with one shard, schema change, reading / writing
  data, adding one more shard, reading / writing data from both
  shards, applying schema changes again, and reading / writing data
  from both shards again. Finally exercises the SplitQuery API over
  both shards and verifies the aggregated results.
  """
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # start the first shard only for now
  # (tablets start as 'replica'; InitShardMaster promotes one below)
  shard_0_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='0', tablet_index=0)
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='0', tablet_index=1)
  shard_0_rdonly.init_tablet('rdonly', keyspace='test_keyspace',
                             shard='0', tablet_index=2)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/0',
      shard_0_master.tablet_alias
  ], auto_log=True)
  utils.wait_for_tablet_type(shard_0_replica.tablet_alias, 'replica')
  utils.wait_for_tablet_type(shard_0_rdonly.tablet_alias, 'rdonly')
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly]:
    t.wait_for_vttablet_state('SERVING')

  self._check_shards_count_in_srv_keyspace(1)
  s = utils.run_vtctl_json(['GetShard', 'test_keyspace/0'])
  self.assertEqual(s['is_master_serving'], True)

  # create a table on shard 0
  sql = '''create table data(
id bigint auto_increment,
name varchar(64),
primary key (id)
) Engine=InnoDB'''
  utils.run_vtctl(['ApplySchema', '-sql=' + sql, 'test_keyspace'],
                  auto_log=True)

  # reload schema everywhere so the QueryService knows about the tables
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

  # create shard 1
  shard_1_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='1', tablet_index=0)
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='1', tablet_index=1)
  shard_1_rdonly.init_tablet('rdonly', keyspace='test_keyspace',
                             shard='1', tablet_index=2)
  for t in [shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)
  for t in [shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')
  s = utils.run_vtctl_json(['GetShard', 'test_keyspace/1'])
  self.assertEqual(s['is_master_serving'], True)

  utils.run_vtctl([
      'InitShardMaster', '-force', 'test_keyspace/1',
      shard_1_master.tablet_alias
  ], auto_log=True)
  utils.wait_for_tablet_type(shard_1_replica.tablet_alias, 'replica')
  utils.wait_for_tablet_type(shard_1_rdonly.tablet_alias, 'rdonly')
  for t in [shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.wait_for_vttablet_state('SERVING')
  utils.run_vtctl([
      'CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/1'
  ], auto_log=True)

  # we need to rebuild SrvKeyspace here to account for the new shards.
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)
  self._check_shards_count_in_srv_keyspace(2)

  # must start vtgate after tablets are up, or else wait until 1min refresh
  utils.VtGate().start(tablets=[
      shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master,
      shard_1_replica, shard_1_rdonly
  ])
  utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.1.master', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.1.replica', 1)
  utils.vtgate.wait_for_endpoints('test_keyspace.1.rdonly', 1)

  # insert and check data on shard 0
  self._insert_data('0', 100, 10)
  self._check_data('0', 100, 10)

  # insert and check data on shard 1
  self._insert_data('1', 200, 10)
  self._check_data('1', 200, 10)

  # create a second table on all shards
  sql = '''create table data2(
id bigint auto_increment,
name varchar(64),
primary key (id)
) Engine=InnoDB'''
  utils.run_vtctl(['ApplySchema', '-sql=' + sql, 'test_keyspace'],
                  auto_log=True)

  # reload schema everywhere so the QueryService knows about the tables
  for t in all_tablets:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

  # insert and read data on all shards
  self._insert_data('0', 300, 10, table='data2')
  self._insert_data('1', 400, 10, table='data2')
  self._check_data('0', 300, 10, table='data2')
  self._check_data('1', 400, 10, table='data2')

  # Now test SplitQuery API works (used in MapReduce usually, but bringing
  # up a full MR-capable cluster is too much for this test environment)
  sql = 'select id, name from data'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 4)
  shard0count = 0
  shard1count = 0
  for q in s:
    if q['shard_part']['shards'][0] == '0':
      shard0count += 1
    if q['shard_part']['shards'][0] == '1':
      shard1count += 1
  # 4 splits over 2 shards: expect an even 2/2 distribution
  self.assertEqual(shard0count, 2)
  self.assertEqual(shard1count, 2)

  # run the queries, aggregate the results, make sure we have all rows
  rows = {}
  for q in s:
    bindvars = {}
    for name, value in q['query']['bind_variables'].iteritems():
      # vtctl encodes bytes as base64.
      bindvars[name] = int(base64.standard_b64decode(value['value']))
    qr = utils.vtgate.execute_shards(q['query']['sql'],
                                     'test_keyspace',
                                     ','.join(q['shard_part']['shards']),
                                     tablet_type='master',
                                     bindvars=bindvars)
    for r in qr['rows']:
      rows[int(r[0])] = r[1]
  self.assertEqual(len(rows), 20)
  expected = {}
  for i in xrange(10):
    expected[100 + i] = 'row %d' % (100 + i)
    expected[200 + i] = 'row %d' % (200 + i)
  self.assertEqual(rows, expected)
def test_merge_sharding(self):
  """Merges two source shards (-40, 40-80) into a single shard (-80).

  Brings up three source shards plus a destination shard, clones the
  data with the SplitClone vtworker (which starts filtered replication
  from both sources), verifies the data with SplitDiff runs against
  each source, then migrates served types (rdonly, replica, master) to
  the merged shard and tears down the original shards.
  """
  utils.run_vtctl([
      'CreateKeyspace', '--sharding_column_name', 'custom_sharding_key',
      '--sharding_column_type', keyspace_id_type, '--split_shard_count',
      '4', 'test_keyspace'
  ])
  shard_0_master.init_tablet('master', 'test_keyspace', '-40')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
  shard_1_master.init_tablet('master', 'test_keyspace', '40-80')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
  shard_2_master.init_tablet('master', 'test_keyspace', '80-')
  shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)
  ks = utils.run_vtctl_json(
      ['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['split_shard_count'], 4)

  # create databases so vttablet can start behaving normally
  for t in [
      shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master,
      shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica,
      shard_2_rdonly
  ]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)
  for t in [
      shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master,
      shard_1_replica, shard_1_rdonly, shard_2_master, shard_2_replica,
      shard_2_rdonly
  ]:
    t.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl([
      'InitShardMaster', 'test_keyspace/-40', shard_0_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'InitShardMaster', 'test_keyspace/40-80',
      shard_1_master.tablet_alias
  ], auto_log=True)
  utils.run_vtctl([
      'InitShardMaster', 'test_keyspace/80-', shard_2_master.tablet_alias
  ], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_replica]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'])
  for t in [shard_0_rdonly, shard_1_rdonly]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly'])

  # create the merge shards
  shard_dest_master.init_tablet('master', 'test_keyspace', '-80')
  shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl([
      'InitShardMaster', 'test_keyspace/-80',
      shard_dest_master.tablet_alias
  ], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -40 40-80 80-\n'
                           'Partitions(replica): -40 40-80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # copy the schema
  utils.run_vtctl([
      'CopySchemaShard', shard_0_rdonly.tablet_alias, 'test_keyspace/-80'
  ], auto_log=True)

  # copy the data (will also start filtered replication), reset source
  utils.run_vtworker([
      '--cell', 'test_nj', '--command_display_interval', '10ms',
      'SplitClone', '--source_reader_count', '10',
      '--min_table_size_for_split', '1',
      '--min_healthy_rdonly_endpoints', '1', 'test_keyspace/-80'
  ], auto_log=True)
  # the worker turned the source rdonlys into 'spare'; restore them
  utils.run_vtctl(
      ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                  auto_log=True)

  # check binlog player variables
  self.check_destination_master(
      shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_0_replica, horizontal=True)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 0 and 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shards')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 10)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 30)
  self.check_binlog_player_vars(
      shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'],
      seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                min_statements=1000,
                                min_transactions=1000)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                min_statements=1000,
                                min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(
      ['RunHealthCheck', shard_dest_rdonly.tablet_alias, 'rdonly'])
  # with two sources, SplitDiff must be run once per source_uid
  logging.debug('Running vtworker SplitDiff on first half')
  utils.run_vtworker([
      '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
      '--min_healthy_rdonly_endpoints', '1', '--source_uid', '0',
      'test_keyspace/-80'
  ], auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  logging.debug('Running vtworker SplitDiff on second half')
  utils.run_vtworker([
      '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
      '--min_healthy_rdonly_endpoints', '1', '--source_uid', '1',
      'test_keyspace/-80'
  ], auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_dest_master, 3000, 1000)

  # check destination master query service is not running
  utils.check_tablet_query_service(self, shard_dest_master, False, False)
  stream_health = utils.run_vtctl_json([
      'VtTabletStreamHealth', '-count', '1',
      shard_dest_master.tablet_alias
  ])
  logging.debug('Got health: %s', str(stream_health))
  self.assertIn('realtime_stats', stream_health)
  self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_dest_master.get_healthz()

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -40 40-80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # now serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -40 40-80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # now serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_0_master, False, True)
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_dest_master)

  # kill the original tablets in the original shards
  tablet.kill_tablets([
      shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master,
      shard_1_replica, shard_1_rdonly
  ])
  for t in [
      shard_0_replica, shard_0_rdonly, shard_1_replica, shard_1_rdonly
  ]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  for t in [shard_0_master, shard_1_master]:
    utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                    auto_log=True)

  # delete the original shards
  utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
  utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)

  # kill everything else
  tablet.kill_tablets([
      shard_2_master, shard_2_replica, shard_2_rdonly, shard_dest_master,
      shard_dest_replica, shard_dest_rdonly
  ])
def _test_sanity(self):
  """Brings up a single master tablet and exercises basic vtctl commands.

  Covers tablet startup, direct queries (with and without field names),
  ExecuteFetchAsDba, Ping/RefreshState variants, read-only / read-write
  toggling, DemoteMaster, and keyspace/shard validation.
  """
  # Start up a master mysql and vttablet
  utils.run_vtctl(['CreateKeyspace', '-force', 'test_keyspace'])
  utils.run_vtctl(['createshard', '-force', 'test_keyspace/0'])
  tablet_62344.init_tablet('master', 'test_keyspace', '0', parent=False)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'])
  utils.validate_topology()

  # if these statements don't run before the tablet it will wedge
  # waiting for the db to become accessible. this is more a bug than
  # a feature.
  tablet_62344.populate('vt_test_keyspace', self._create_vt_select_test,
                        self._populate_vt_select_test)

  tablet_62344.start_vttablet()

  # make sure the query service is started right away.
  qr = tablet_62344.execute('select id, msg from vt_select_test')
  self.assertEqual(len(qr['rows']), 4,
                   'expected 4 rows in vt_select_test: %s' % str(qr))
  self.assertEqual(qr['fields'][0]['name'], 'id')
  self.assertEqual(qr['fields'][1]['name'], 'msg')

  # test exclude_field_names to vttablet works as expected.
  qr = tablet_62344.execute('select id, msg from vt_select_test',
                            execute_options='exclude_field_names:true ')
  self.assertEqual(len(qr['rows']), 4,
                   'expected 4 rows in vt_select_test: %s' % str(qr))
  # with field names excluded, the 'name' key must be absent
  self.assertNotIn('name', qr['fields'][0])
  self.assertNotIn('name', qr['fields'][1])

  # make sure direct dba queries work
  query_result = utils.run_vtctl_json([
      'ExecuteFetchAsDba', '-json', tablet_62344.tablet_alias,
      'select * from vt_test_keyspace.vt_select_test'
  ])
  self.assertEqual(
      len(query_result['rows']), 4,
      'expected 4 rows in vt_select_test: %s' % str(query_result))
  self.assertEqual(
      len(query_result['fields']), 2,
      'expected 2 fields in vt_select_test: %s' % str(query_result))

  # check Ping / RefreshState / RefreshStateByShard
  utils.run_vtctl(['Ping', tablet_62344.tablet_alias])
  utils.run_vtctl(['RefreshState', tablet_62344.tablet_alias])
  utils.run_vtctl(['RefreshStateByShard', 'test_keyspace/0'])
  utils.run_vtctl(
      ['RefreshStateByShard', '--cells=test_nj', 'test_keyspace/0'])

  # Quickly check basic actions.
  utils.run_vtctl(['SetReadOnly', tablet_62344.tablet_alias])
  utils.wait_db_read_only(62344)

  utils.run_vtctl(['SetReadWrite', tablet_62344.tablet_alias])
  utils.check_db_read_write(62344)

  # DemoteMaster leaves the db read-only again
  utils.run_vtctl(['DemoteMaster', tablet_62344.tablet_alias])
  utils.wait_db_read_only(62344)

  utils.validate_topology()
  utils.run_vtctl(['ValidateKeyspace', 'test_keyspace'])
  # not pinging tablets, as it enables replication checks, and they
  # break because we only have a single master, no slaves
  utils.run_vtctl(
      ['ValidateShard', '-ping-tablets=false', 'test_keyspace/0'])

  tablet_62344.kill_vttablet()
def test_merge_sharding(self):
  """End-to-end merge of shards -40 and 40-80 into one shard -80.

  Workflow: bring up three source shards (-40, 40-80, 80-), seed data,
  create the destination shard -80, clone with vtworker SplitClone
  (online then offline, verifying reconciliation counters), validate
  filtered replication catches up, diff with SplitDiff against both
  sources, then migrate rdonly/replica/master serving types to the
  merged shard and delete the originals.
  """
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'custom_ksid_col',
                   '--sharding_column_type', base_sharding.keyspace_id_type,
                   'test_keyspace'])

  # All tablets start as replica/rdonly; masters are elected below with
  # InitShardMaster -force.
  shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
  shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
  shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # rebuild and check SrvKeyspace
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(
      ['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)

  # won't be serving, no replication state
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_2_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_replica]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_rdonly, shard_1_rdonly]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the merge shards
  shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

  # start vttablet on the destination shard (no db created,
  # so they're all not serving)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_dest_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -40 40-80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # copy the schema
  utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                   'test_keyspace/-80'], auto_log=True)

  # copy the data (will also start filtered replication), reset source
  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms',
       '--use_v3_resharding_mode=false'],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  # Online clone: 2 inserts, no updates/deletes/equal rows yet.
  self.verify_reconciliation_counters(worker_port, 'Online',
                                      'resharding1', 2, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_dest_master.mquery('vt_test_keyspace',
                           'delete from resharding1 where id=1',
                           write=True)
  # Update row 2 (provokes an update).
  shard_dest_master.mquery(
      'vt_test_keyspace',
      "update resharding1 set msg='msg-not-2' where id=2",
      write=True)
  # Insert row 0 (provokes a delete).
  self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                     0x5000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Change tablets, which were taken offline, back to rdonly.
  utils.run_vtctl(
      ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  # Second clone reconciled the three manual edits above (insert,
  # update, delete); offline pass saw 2 rows already equal.
  self.verify_reconciliation_counters(worker_port, 'Online',
                                      'resharding1', 1, 1, 1, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline',
                                      'resharding1', 0, 0, 0, 2)

  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                  auto_log=True)

  # check binlog player variables
  self.check_destination_master(
      shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_0_replica, horizontal=True)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 0 and 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shards')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 10)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 30)
  self.check_binlog_player_vars(
      shard_dest_master, ['test_keyspace/-40', 'test_keyspace/40-80'],
      seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                min_statements=1000,
                                min_transactions=1000)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                min_statements=1000,
                                min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
  # Diff against each source shard separately, identified by source_uid.
  logging.debug('Running vtworker SplitDiff on first half')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '0',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  logging.debug('Running vtworker SplitDiff on second half')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '1',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_dest_rdonly.tablet_alias, 'rdonly'],
      auto_log=True)

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_dest_master, 3000, 1000)

  # check destination master query service is not running
  utils.check_tablet_query_service(self, shard_dest_master, False, False)
  stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                        '-count', '1',
                                        shard_dest_master.tablet_alias])
  logging.debug('Got health: %s', str(stream_health))
  self.assertIn('realtime_stats', stream_health)
  self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_dest_master.get_healthz()

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # now serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # now serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  # Source masters must have stopped serving queries after the master
  # migration.
  utils.check_tablet_query_service(self, shard_0_master, False, True)
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_dest_master)

  # kill the original tablets in the original shards
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
  for t in [shard_0_replica, shard_0_rdonly,
            shard_1_replica, shard_1_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  for t in [shard_0_master, shard_1_master]:
    utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                    auto_log=True)

  # delete the original shards
  utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
  utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                       shard_dest_master, shard_dest_replica,
                       shard_dest_rdonly])
def test_resharding(self):
  """End-to-end split of shard 80- into 80-c0 and c0-.

  Workflow: create keyspace with a bad sharding column and fix it via
  SetKeyspaceShardingInfo, bring up source shards -80 and 80-, seed
  data, create the split shards, clone with vtworker SplitClone,
  verify filtered replication, run SplitDiff, exercise a replica
  failover and a PlannedReparentShard of a destination shard while
  replication is flowing, then migrate rdonly (per-cell, then
  everywhere), replica (forward, backward, forward again) and master
  serving types, and finally delete the source shard.
  """
  # we're going to reparent and swap these two
  global shard_2_master, shard_2_replica1

  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   '--split_shard_count', '2',
                   'test_keyspace'])
  # Changing sharding info without -force must fail on a keyspace that
  # already has it set.
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_sharding_key', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force',
                   '-split_shard_count', '4',
                   'test_keyspace', 'custom_sharding_key',
                   keyspace_id_type])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)
  ks = utils.run_vtctl_json(
      ['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['split_shard_count'], 4)

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     full_mycnf_args=full_mycnf_args)

  # wait for the tablets
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('SERVING')
  shard_0_ny_rdonly.wait_for_vttablet_state('SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('SERVING')
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_ny_rdonly.wait_for_vttablet_state('SERVING')
  shard_1_rdonly1.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()
  self._test_keyrange_constraints()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_slave1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'])
  for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'rdonly'])

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
  shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
  shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  # shard_3_master is started with health check enabled
  # (target_tablet_type='replica'); see the query-service check below.
  shard_3_master.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_replica, shard_3_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-',
                   shard_3_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # the worker will do everything. We test with source_reader_count=10
  # (down from default=20) as connection pool is not big enough for 20.
  # min_table_size_for_split is set to 1 as to force a split even on the
  # small table we have.
  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
    utils.run_vtctl(['CopySchemaShard',
                     '--exclude_tables', 'unrelated',
                     shard_1_rdonly1.tablet_alias,
                     keyspace_shard],
                    auto_log=True)

  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'SplitClone',
                      '--exclude_tables', 'unrelated',
                      '--source_reader_count', '10',
                      '--min_table_size_for_split', '1',
                      'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
      auto_log=True)

  # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace',
                   '--exclude_tables=unrelated',
                   'test_keyspace'],
                  auto_log=True)

  # check the binlog players are running and exporting vars
  shard_2_master.wait_for_binlog_player_count(1)
  shard_3_master.wait_for_binlog_player_count(1)
  self._check_binlog_player_vars(shard_2_master)
  self._check_binlog_player_vars(shard_3_master)

  # check that binlog server exported the stats vars
  self._check_binlog_server_vars(shard_1_slave1)

  self._check_stream_health_equals_binlog_player_vars(shard_2_master)
  self._check_stream_health_equals_binlog_player_vars(shard_3_master)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self._check_binlog_player_vars(shard_2_master,
                                 seconds_behind_master_max=30)
  self._check_binlog_player_vars(shard_3_master,
                                 seconds_behind_master_max=30)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(
      ['RunHealthCheck', shard_3_rdonly1.tablet_alias, 'rdonly'])
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
      auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for a destination master tablet, make sure we have it all
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('Binlog player state: Running', shard_2_master_status)
  self.assertIn('<td><b>All</b>: 6000<br><b>Query</b>: 4000<br>'
                '<b>Transaction</b>: 2000<br></td>',
                shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low')
  monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high')

  # tests a failover switching serving to a different replica
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(
      ['RunHealthCheck', shard_1_slave2.tablet_alias, 'replica'])

  # test data goes through again
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000, base=1000)
  logging.debug('Checking 80 percent of data was sent quickly')
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even health check that is enabled on
  # master 3 should not interfere (we run it to be sure).
  utils.run_vtctl(
      ['RunHealthCheck', shard_3_master.tablet_alias, 'replica'],
      auto_log=True)
  for master in [shard_2_master, shard_3_master]:
    utils.check_tablet_query_service(self, master, False, False)
    stream_health = utils.run_vtctl_json(
        ['VtTabletStreamHealth', '-count', '1', master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_3_master.get_healthz()

  # now serve rdonly from the split shards, in test_nj only
  utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                   'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  # test_ny cell was not migrated, so it still serves rdonly from the
  # source shard.
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # now serve rdonly from the split shards, everywhere
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # then serve replica from the split shards
  destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
      auto_log=True)
  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, True, False)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0',
                   shard_2_replica1.tablet_alias])

  # update our test variables to point at the new master
  shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

  logging.debug(
      'Inserting lots of data on source shard after reparenting')
  self._insert_lots(3000, base=2000)
  logging.debug('Checking 80 percent of data was sent fairly quickly')
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use vtworker to compare the data again
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
      auto_log=True)
  utils.run_vtctl(
      ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
      auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d',
                monitor_thread_1.object_name,
                monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d',
                monitor_thread_2.object_name,
                monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # mock with the SourceShard records to test 'vtctl SourceShardDelete'
  # and 'vtctl SourceShardAdd'
  utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                  auto_log=True)
  utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                   'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                  auto_log=True)

  # then serve master from the split shards, make sure the source master's
  # query service is now turned off
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-c0 c0-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # get status for a destination master tablet, make sure it's good
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('No binlog player is running', shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # delete the original tablets in the original shard
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_rdonly, shard_1_rdonly1])
  for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(
      ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias],
      auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                  auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  self.assertNotIn('cells', shard)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_0_ny_rdonly, shard_2_master,
                       shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly1])