def tearDownModule():
  global vtgate_server
  logging.debug("in tearDownModule")
  if utils.options.skip_teardown:
    return
  logging.debug("Tearing down the servers and setup")
  utils.vtgate_kill(vtgate_server)
  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_1_master, shard_1_replica])
  teardown_procs = [
      shard_0_master.teardown_mysql(),
      shard_0_replica.teardown_mysql(),
      shard_1_master.teardown_mysql(),
      shard_1_replica.teardown_mysql(),
  ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server_teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  shard_0_master.remove_tree()
  shard_0_replica.remove_tree()
  shard_1_master.remove_tree()
  shard_1_replica.remove_tree()

def test_reparent_cross_cell(self, shard_id="0"): utils.run_vtctl("CreateKeyspace test_keyspace") # create the database so vttablets start, as they are serving tablet_62344.create_db("vt_test_keyspace") tablet_62044.create_db("vt_test_keyspace") tablet_41983.create_db("vt_test_keyspace") tablet_31981.create_db("vt_test_keyspace") # Start up a master mysql and vttablet tablet_62344.init_tablet("master", "test_keyspace", shard_id, start=True) if environment.topo_server_implementation == "zookeeper": shard = utils.run_vtctl_json(["GetShard", "test_keyspace/" + shard_id]) self.assertEqual(shard["Cells"], ["test_nj"], "wrong list of cell in Shard: %s" % str(shard["Cells"])) # Create a few slaves for testing reparenting. tablet_62044.init_tablet("replica", "test_keyspace", shard_id, start=True, wait_for_start=False) tablet_41983.init_tablet("replica", "test_keyspace", shard_id, start=True, wait_for_start=False) tablet_31981.init_tablet("replica", "test_keyspace", shard_id, start=True, wait_for_start=False) for t in [tablet_62044, tablet_41983, tablet_31981]: t.wait_for_vttablet_state("SERVING") if environment.topo_server_implementation == "zookeeper": shard = utils.run_vtctl_json(["GetShard", "test_keyspace/" + shard_id]) self.assertEqual( shard["Cells"], ["test_nj", "test_ny"], "wrong list of cell in Shard: %s" % str(shard["Cells"]) ) # Recompute the shard layout node - until you do that, it might not be valid. utils.run_vtctl("RebuildShardGraph test_keyspace/" + shard_id) utils.validate_topology() # Force the slaves to reparent assuming that all the datasets are identical. for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]: t.reset_replication() utils.pause("force ReparentShard?") utils.run_vtctl("ReparentShard -force test_keyspace/%s %s" % (shard_id, tablet_62344.tablet_alias)) utils.validate_topology(ping_tablets=True) self._check_db_addr(shard_id, "master", tablet_62344.port) # Verify MasterCell is properly set srvShard = utils.run_vtctl_json(["GetSrvShard", "test_nj", "test_keyspace/%s" % (shard_id)]) self.assertEqual(srvShard["MasterCell"], "test_nj") srvShard = utils.run_vtctl_json(["GetSrvShard", "test_ny", "test_keyspace/%s" % (shard_id)]) self.assertEqual(srvShard["MasterCell"], "test_nj") # Perform a graceful reparent operation to another cell. utils.pause("graceful ReparentShard?") utils.run_vtctl("ReparentShard test_keyspace/%s %s" % (shard_id, tablet_31981.tablet_alias), auto_log=True) utils.validate_topology() self._check_db_addr(shard_id, "master", tablet_31981.port, cell="test_ny") # Verify MasterCell is set to new cell. srvShard = utils.run_vtctl_json(["GetSrvShard", "test_nj", "test_keyspace/%s" % (shard_id)]) self.assertEqual(srvShard["MasterCell"], "test_ny") srvShard = utils.run_vtctl_json(["GetSrvShard", "test_ny", "test_keyspace/%s" % (shard_id)]) self.assertEqual(srvShard["MasterCell"], "test_ny") tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983, tablet_31981])
def teardown(self):
  all_tablets = self.tablet_map.values()
  tablet.kill_tablets(all_tablets)
  teardown_procs = [t.teardown_mysql() for t in all_tablets]
  utils.wait_procs(teardown_procs, raise_on_error=False)
  for t in all_tablets:
    t.remove_tree()

def _teardown_shard_2(): tablet.kill_tablets(shard_2_tablets) utils.run_vtctl(["DeleteShard", "-recursive", "test_keyspace/2"], auto_log=True) for t in shard_2_tablets: t.clean_dbs()
def tearDown(self):
  # kill everything
  tablet.kill_tablets([source_master, source_replica,
                       source_rdonly1, source_rdonly2,
                       destination_master, destination_replica,
                       destination_rdonly1, destination_rdonly2])
  utils.vtgate.kill()

def test_no_mysql_healthcheck(self):
  """This test starts a vttablet with no mysql port, while mysql is down.

  It makes sure vttablet will start properly and be unhealthy.
  Then we start mysql, and make sure vttablet becomes healthy.
  """
  # we need replication to be enabled, so the slave tablet can be healthy.
  for t in tablet_62344, tablet_62044:
    t.create_db("vt_test_keyspace")
  pos = mysql_flavor().master_position(tablet_62344)
  changeMasterCmds = mysql_flavor().change_master_commands(
      utils.hostname, tablet_62344.mysql_port, pos)
  tablet_62044.mquery("", ["RESET MASTER", "RESET SLAVE"] +
                      changeMasterCmds + ["START SLAVE"])

  # now shutdown all mysqld
  shutdown_procs = [tablet_62344.shutdown_mysql(),
                    tablet_62044.shutdown_mysql()]
  utils.wait_procs(shutdown_procs)

  # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
  tablet_62344.init_tablet("master", "test_keyspace", "0")
  tablet_62044.init_tablet("spare", "test_keyspace", "0",
                           include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.start_vttablet(wait_for_state=None, target_tablet_type="replica",
                     full_mycnf_args=True, include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state("NOT_SERVING")
    self.check_healthz(t, False)

  # restart mysqld
  start_procs = [tablet_62344.start_mysql(), tablet_62044.start_mysql()]
  utils.wait_procs(start_procs)

  # the master should still be healthy
  utils.run_vtctl(["RunHealthCheck", tablet_62344.tablet_alias, "replica"],
                  auto_log=True)
  self.check_healthz(tablet_62344, True)

  # the slave won't be healthy at first, as replication is not running
  utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"],
                  auto_log=True)
  self.check_healthz(tablet_62044, False)
  tablet_62044.wait_for_vttablet_state("NOT_SERVING")

  # restart replication
  tablet_62044.mquery("", ["START SLAVE"])

  # wait for the tablet to become healthy and fix its mysql port
  utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"],
                  auto_log=True)
  tablet_62044.wait_for_vttablet_state("SERVING")
  self.check_healthz(tablet_62044, True)

  for t in tablet_62344, tablet_62044:
    # wait for mysql port to show up
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(["GetTablet", t.tablet_alias])
      if "mysql" in ti["Portmap"]:
        break
      timeout = utils.wait_step("mysql port in tablet record", timeout)
    self.assertEqual(ti["Portmap"]["mysql"], t.mysql_port)

  # all done
  tablet.kill_tablets([tablet_62344, tablet_62044])

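# The port-polling loop at the end of the test above is the polling idiom
# used throughout these tests: utils.wait_step() sleeps briefly and returns
# the remaining timeout, raising once the budget is exhausted. A minimal
# sketch of that idiom factored into a helper; the helper name is
# illustrative only, not an existing test utility:
def wait_for_mysql_port(t, timeout=10):
  """Polls the tablet record until its mysql port is published."""
  while True:
    ti = utils.run_vtctl_json(["GetTablet", t.tablet_alias])
    if "mysql" in ti["Portmap"]:
      return ti
    timeout = utils.wait_step("mysql port in tablet record", timeout)
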
def tearDownModule():
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_1_master, shard_1_replica])
  teardown_procs = [
      shard_0_master.teardown_mysql(),
      shard_0_replica.teardown_mysql(),
      shard_1_master.teardown_mysql(),
      shard_1_replica.teardown_mysql(),
      unsharded_master.teardown_mysql(),
      unsharded_replica.teardown_mysql(),
  ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  shard_0_master.remove_tree()
  shard_0_replica.remove_tree()
  shard_1_master.remove_tree()
  shard_1_replica.remove_tree()
  unsharded_master.remove_tree()
  unsharded_replica.remove_tree()

def tearDownModule():
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets([src_master, src_replica, src_rdonly1, src_rdonly2,
                       dst_master, dst_replica])
  teardown_procs = [
      src_master.teardown_mysql(),
      src_replica.teardown_mysql(),
      src_rdonly1.teardown_mysql(),
      src_rdonly2.teardown_mysql(),
      dst_master.teardown_mysql(),
      dst_replica.teardown_mysql(),
  ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  src_master.remove_tree()
  src_replica.remove_tree()
  src_rdonly1.remove_tree()
  src_rdonly2.remove_tree()
  dst_master.remove_tree()
  dst_replica.remove_tree()

def test_primecache(self):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  master.init_tablet('master', 'test_keyspace', '0')
  replica.init_tablet('idle')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  master.create_db('vt_test_keyspace')
  master.start_vttablet(wait_for_state=None)
  replica.start_vttablet(wait_for_state=None)

  master.wait_for_vttablet_state('SERVING')
  replica.wait_for_vttablet_state('NOT_SERVING')  # DB doesn't exist

  self._create_data()

  # we use clone to not prime the mysql cache on the slave db
  utils.run_vtctl(['Clone', '-force', '-server-mode',
                   master.tablet_alias, replica.tablet_alias],
                  auto_log=True)

  # sync the buffer cache, and clear it. This will prompt for user's password
  utils.run(['sync'])
  utils.run(['sudo', 'bash', '-c', 'echo 1 > /proc/sys/vm/drop_caches'])

  # we can now change data on the master for 30s, while slave is stopped.
  # master's binlog will be in OS buffer cache now.
  replica.mquery('', 'slave stop')
  self._change_random_data()

  use_primecache = True  # easy to test without
  if use_primecache:
    # starting vtprimecache, sleeping for a couple seconds
    args = environment.binary_args('vtprimecache') + [
        '-db-config-dba-uname', 'vt_dba',
        '-db-config-dba-charset', 'utf8',
        '-db-config-dba-dbname', 'vt_test_keyspace',
        '-db-config-app-uname', 'vt_app',
        '-db-config-app-charset', 'utf8',
        '-db-config-app-dbname', 'vt_test_keyspace',
        '-relay_logs_path', replica.tablet_dir + '/relay-logs',
        '-mysql_socket_file', replica.tablet_dir + '/mysql.sock',
        '-log_dir', environment.vtlogroot,
        '-worker_count', '4',
        '-alsologtostderr',
    ]
    vtprimecache = utils.run_bg(args)
    time.sleep(2)

  # start slave, see how long it takes to catch up on replication
  replica.mquery('', 'slave start')
  self.catch_up()

  if use_primecache:
    # TODO(alainjobart): read and check stats
    utils.kill_sub_process(vtprimecache)

  tablet.kill_tablets([master, replica])

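# The measurement of interest above is how fast the slave catches up with a
# cold OS page cache. A hypothetical helper (not part of this test class)
# that times the catch_up() call, so runs with and without vtprimecache can
# be compared; time and logging are already imported by this module:
def _timed_catch_up(self):
  """Times catch_up() and logs the elapsed wall-clock duration."""
  start = time.time()
  self.catch_up()
  logging.info('replication caught up in %.1f seconds', time.time() - start)
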
def test_health_check_worker_state_does_not_shutdown_query_service(self):
  # This test is similar to test_health_check, but has the following
  # differences:
  # - the second tablet is an 'rdonly' and not a 'replica'
  # - the second tablet will be set to 'worker' and we expect that
  #   the query service won't be shutdown

  # Setup master and rdonly tablets.
  tablet_62344.init_tablet("master", "test_keyspace", "0")
  for t in tablet_62344, tablet_62044:
    t.create_db("vt_test_keyspace")
  tablet_62344.start_vttablet(wait_for_state=None,
                              target_tablet_type="replica")
  tablet_62044.start_vttablet(wait_for_state=None,
                              target_tablet_type="rdonly",
                              init_keyspace="test_keyspace",
                              init_shard="0")
  tablet_62344.wait_for_vttablet_state("SERVING")
  tablet_62044.wait_for_vttablet_state("NOT_SERVING")
  self.check_healthz(tablet_62044, False)

  # Enable replication.
  utils.run_vtctl(["InitShardMaster", "test_keyspace/0",
                   tablet_62344.tablet_alias])
  # Trigger healthcheck to save time waiting for the next interval.
  utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "rdonly"])
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, "rdonly")
  self.check_healthz(tablet_62044, True)
  tablet_62044.wait_for_vttablet_state("SERVING")

  # Change from rdonly to worker and stop replication. (These
  # actions are similar to the SplitClone vtworker command
  # implementation.) The tablet will become unhealthy, but the
  # query service is still running.
  utils.run_vtctl(["ChangeSlaveType", tablet_62044.tablet_alias, "worker"])
  utils.run_vtctl(["StopSlave", tablet_62044.tablet_alias])
  # Trigger healthcheck explicitly to avoid waiting for the next interval.
  utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "rdonly"])
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, "worker")
  self.check_healthz(tablet_62044, False)
  # Make sure that replication got disabled.
  self.assertIn(
      ">unhealthy: replication_reporter: "
      "Replication is not running</span></div>",
      tablet_62044.get_status())
  # Query service is still running.
  tablet_62044.wait_for_vttablet_state("SERVING")

  # Restart replication. Tablet will become healthy again.
  utils.run_vtctl(["ChangeSlaveType", tablet_62044.tablet_alias, "spare"])
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, "spare")
  utils.run_vtctl(["StartSlave", tablet_62044.tablet_alias])
  utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "rdonly"])
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, "rdonly")
  self.check_healthz(tablet_62044, True)
  tablet_62044.wait_for_vttablet_state("SERVING")

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])

def shutdown(self):
  tablet.kill_tablets(self.tablets)
  teardown_procs = [t.teardown_mysql() for t in self.tablets]
  utils.wait_procs(teardown_procs, raise_on_error=False)
  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()
  for t in self.tablets:
    t.remove_tree()

def test_no_mysql_healthcheck(self):
  """This test starts a vttablet with no mysql port, while mysql is down.

  It makes sure vttablet will start properly and be unhealthy.
  Then we start mysql, and make sure vttablet becomes healthy.
  """
  # we need replication to be enabled, so the slave tablet can be healthy.
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
  pos = mysql_flavor().master_position(tablet_62344)
  changeMasterCmds = mysql_flavor().change_master_commands(
      utils.hostname, tablet_62344.mysql_port, pos)
  tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                      changeMasterCmds + ['START SLAVE'])

  # now shutdown all mysqld
  shutdown_procs = [
      tablet_62344.shutdown_mysql(),
      tablet_62044.shutdown_mysql(),
  ]
  utils.wait_procs(shutdown_procs)

  # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('spare', 'test_keyspace', '0',
                           include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.start_vttablet(wait_for_state=None, target_tablet_type='replica',
                     full_mycnf_args=True, include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')

  # restart mysqld
  start_procs = [
      tablet_62344.start_mysql(),
      tablet_62044.start_mysql(),
  ]
  utils.wait_procs(start_procs)

  # wait for the tablets to become healthy and fix their mysql port
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('SERVING')
  for t in tablet_62344, tablet_62044:
    # wait for mysql port to show up
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
      if 'mysql' in ti['Portmap']:
        break
      timeout = utils.wait_step('mysql port in tablet record', timeout)
    self.assertEqual(ti['Portmap']['mysql'], t.mysql_port)

  # all done
  tablet.kill_tablets([tablet_62344, tablet_62044])

def test_reparent_lag_slave(self, shard_id='0'):
  utils.run_vtctl('CreateKeyspace test_keyspace')

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('lag', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')
  tablet_41983.wait_for_vttablet_state('NOT_SERVING')

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl('RebuildShardGraph test_keyspace/' + shard_id)
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl('ReparentShard -force test_keyspace/%s %s' %
                  (shard_id, tablet_62344.tablet_alias))
  utils.validate_topology(ping_tablets=True)

  tablet_62344.create_db('vt_test_keyspace')
  tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

  tablet_41983.mquery('', 'stop slave')
  for q in self._populate_vt_insert_test:
    tablet_62344.mquery('vt_test_keyspace', q, write=True)

  # Perform a graceful reparent operation.
  utils.run_vtctl('ReparentShard test_keyspace/%s %s' %
                  (shard_id, tablet_62044.tablet_alias))

  tablet_41983.mquery('', 'start slave')
  time.sleep(1)

  utils.pause('check orphan')

  utils.run_vtctl('ReparentTablet %s' % tablet_41983.tablet_alias)

  result = tablet_41983.mquery(
      'vt_test_keyspace', 'select msg from vt_insert_test where id=1')
  if len(result) != 1:
    self.fail('expected 1 row from vt_insert_test: %s' % str(result))

  utils.pause('check lag reparent')

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])

def test_health_check_drained_state_does_not_shutdown_query_service(self):
  # This test is similar to test_health_check, but has the following
  # differences:
  # - the second tablet is an 'rdonly' and not a 'replica'
  # - the second tablet will be set to 'drained' and we expect that
  #   the query service won't be shutdown

  # Setup master and rdonly tablets.
  tablet_62344.init_tablet('replica', 'test_keyspace', '0')
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  # Note we only have a master and a rdonly. So we can't enable
  # semi-sync in this case, as the rdonly slaves don't semi-sync ack.
  tablet_62344.start_vttablet(wait_for_state=None, enable_semi_sync=False)
  tablet_62044.start_vttablet(wait_for_state=None,
                              init_tablet_type='rdonly',
                              init_keyspace='test_keyspace',
                              init_shard='0',
                              enable_semi_sync=False)
  tablet_62344.wait_for_vttablet_state('NOT_SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(tablet_62044, False)

  # Enable replication.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])
  # Trigger healthcheck to save time waiting for the next interval.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  tablet_62044.wait_for_vttablet_state('SERVING')
  self.check_healthz(tablet_62044, True)

  # Change from rdonly to drained and stop replication. (These
  # actions are similar to the SplitClone vtworker command
  # implementation.) The tablet will stay healthy, and the
  # query service is still running.
  utils.run_vtctl(['ChangeSlaveType', tablet_62044.tablet_alias, 'drained'])
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
  # Trigger healthcheck explicitly to avoid waiting for the next interval.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'drained')
  self.check_healthz(tablet_62044, True)
  # Query service is still running.
  tablet_62044.wait_for_vttablet_state('SERVING')

  # Restart replication. Tablet will become healthy again.
  utils.run_vtctl(['ChangeSlaveType', tablet_62044.tablet_alias, 'rdonly'])
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  self.check_healthz(tablet_62044, True)

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])

def _teardown_shard_2():
  tablet.kill_tablets(shard_2_tablets)

  utils.run_vtctl(
      ['DeleteShard', '-recursive', 'test_keyspace/2'], auto_log=True)

  for t in shard_2_tablets:
    t.reset_replication()
    t.set_semi_sync_enabled(master=False)
    t.clean_dbs()

def test_reparent_cross_cell(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(shard['cells'], ['test_nj'],
                   'wrong list of cell in Shard: %s' % str(shard['cells']))

  # Create a few slaves for testing reparenting. Won't be healthy
  # as replication is not running.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_62344.wait_for_vttablet_state('SERVING')
  for t in [tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(
      shard['cells'], ['test_nj', 'test_ny'],
      'wrong list of cell in Shard: %s' % str(shard['cells']))

  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/' + shard_id,
                   tablet_62344.tablet_alias], auto_log=True)
  utils.validate_topology(ping_tablets=True)

  self._check_master_tablet(tablet_62344)

  # Perform a graceful reparent operation to another cell.
  utils.pause('test_reparent_cross_cell PlannedReparentShard')
  utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/' + shard_id,
                   tablet_31981.tablet_alias], auto_log=True)
  utils.validate_topology()

  self._check_master_tablet(tablet_31981)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])

def test_repeated_init_shard_master(self):
  """Test that using InitShardMaster can go back and forth between 2 hosts."""
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     lameduck_period='5s',
                     init_tablet_type='replica',
                     init_keyspace='test_keyspace',
                     init_shard='0')

  # Tablets are not replicating, so they won't be healthy.
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(t, False)

  # Pick one master out of the two.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # Run health check on both, make sure they are both healthy.
  # Also make sure the types are correct.
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
    self.check_healthz(t, True)
  utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'master', timeout=0)
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica', timeout=0)

  # Pick the other one as master, make sure they are still healthy.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62044.tablet_alias])

  # Run health check on both, make sure they are both healthy.
  # Also make sure the types are correct.
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
    self.check_healthz(t, True)
  utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'replica', timeout=0)
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'master', timeout=0)

  # Come back to the original guy.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # Run health check on both, make sure they are both healthy.
  # Also make sure the types are correct.
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
    self.check_healthz(t, True)
  utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'master', timeout=0)
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica', timeout=0)

  # And done.
  tablet.kill_tablets([tablet_62344, tablet_62044])

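# The three InitShardMaster rounds above can be expressed as a loop over
# (master, replica) pairs. A compact equivalent sketch using the same
# utilities; the method name is illustrative, and the code would live on the
# test class since it uses self:
def _swap_masters_repeatedly(self):
  """Cycles mastership 62344 -> 62044 -> 62344, checking health each time."""
  for master, replica in [(tablet_62344, tablet_62044),
                          (tablet_62044, tablet_62344),
                          (tablet_62344, tablet_62044)]:
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     master.tablet_alias])
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
      self.check_healthz(t, True)
    utils.wait_for_tablet_type(master.tablet_alias, 'master', timeout=0)
    utils.wait_for_tablet_type(replica.tablet_alias, 'replica', timeout=0)
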
def shutdown(self):
  # Explicitly kill vtgate first because StreamingServerShutdownIT.java
  # expects an EOF from the vtgate client and not an error that vttablet
  # killed the query (which is seen when vtgate is killed last).
  utils.vtgate.kill()
  tablet.kill_tablets(self.tablets)
  teardown_procs = [t.teardown_mysql() for t in self.tablets]
  utils.wait_procs(teardown_procs, raise_on_error=False)
  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()
  for t in self.tablets:
    t.remove_tree()

def tearDown(self):
  # kill all tablets
  tablet.kill_tablets(initial_tablets)

  for t in initial_tablets:
    t.reset_replication()
    t.set_semi_sync_enabled(master=False)
    t.clean_dbs()

  utils.run_vtctl(['DeleteShard', '-recursive', '-even_if_serving',
                   test_keyspace + '/0'], auto_log=True)
  utils.run_vtctl(['DeleteShard', '-recursive', '-even_if_serving',
                   test_keyspace + '/1'], auto_log=True)

def test_health_check(self):
  utils.run_vtctl('CreateKeyspace test_keyspace')

  # one master, one replica that starts in spare
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('spare', 'test_keyspace', '0')

  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  tablet_62344.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')
  tablet_62044.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')

  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # make sure the 'spare' slave goes to 'replica'
  timeout = 10
  while True:
    ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
    if ti['Type'] == 'replica':
      logging.info('Slave tablet went to replica, good')
      break
    timeout = utils.wait_step('slave tablet going to replica', timeout)

  # make sure the master is still master
  ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
  self.assertEqual(ti['Type'], 'master',
                   'unexpected master type: %s' % ti['Type'])

  # stop replication on the slave, see it trigger the slave going
  # slightly unhealthy
  tablet_62044.mquery('', 'stop slave')
  timeout = 10
  while True:
    ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
    if 'Health' in ti and ti['Health']:
      if 'replication_lag' in ti['Health']:
        if ti['Health']['replication_lag'] == 'high':
          logging.info('Slave tablet replication_lag went to high, good')
          break
    timeout = utils.wait_step('slave has high replication lag', timeout)

  # make sure the serving graph was updated
  ep = utils.run_vtctl_json(['GetEndPoints', 'test_nj', 'test_keyspace/0',
                             'replica'])
  if not ep['entries'][0]['health']:
    self.fail(
        'Replication lag parameter not propagated to serving graph: %s' %
        str(ep))
  self.assertEqual(
      ep['entries'][0]['health']['replication_lag'], 'high',
      'Replication lag parameter not propagated to serving graph: %s' %
      str(ep))

  tablet.kill_tablets([tablet_62344, tablet_62044])

def _test_reparent_slave_offline(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  tablet_62344.wait_for_vttablet_state('SERVING')
  for t in [tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl(['InitShardMaster', '-force',
                   'test_keyspace/' + shard_id, tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)

  self._check_db_addr(shard_id, 'master', tablet_62344.port)

  # Kill one tablet so we seem offline
  tablet_31981.kill_vttablet()

  # Perform a graceful reparent operation.
  utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/' + shard_id,
                   tablet_62044.tablet_alias])
  self._check_db_addr(shard_id, 'master', tablet_62044.port)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])

def tearDownModule(): if utils.options.skip_teardown: return logging.debug("Tearing down the servers and setup") tablet.Tablet.tablets_running = 2 tablet.kill_tablets([master_tablet, replica_tablet]) teardown_procs = [master_tablet.teardown_mysql(), replica_tablet.teardown_mysql()] utils.wait_procs(teardown_procs, raise_on_error=False) environment.topo_server().teardown() utils.kill_sub_processes() utils.remove_tmp_files() master_tablet.remove_tree() replica_tablet.remove_tree()
def tearDownModule():
  utils.required_teardown()
  if utils.options.skip_teardown:
    return
  logging.debug('Tearing down the servers and setup')

  tablet.kill_tablets(all_tablets)
  utils.wait_procs([t.teardown_mysql() for t in all_tablets],
                   raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  for t in all_tablets:
    t.remove_tree()

def _test_vtctl_clone(self, server_mode):
  if server_mode:
    clone_flags = ['-server-mode']
  else:
    clone_flags = []

  # Start up a master mysql and vttablet
  utils.run_vtctl(['CreateKeyspace', 'snapshot_test'])

  tablet_62344.init_tablet('master', 'snapshot_test', '0')
  utils.run_vtctl(['RebuildShardGraph', 'snapshot_test/0'])
  utils.validate_topology()

  tablet_62344.populate('vt_snapshot_test', self._create_vt_insert_test,
                        self._populate_vt_insert_test)
  tablet_62344.start_vttablet()

  tablet_31981.create_db('vt_snapshot_test')
  tablet_31981.init_tablet('idle', start=True)

  # small test to make sure the directory validation works
  snapshot_dir = os.path.join(environment.vtdataroot, 'snapshot')
  utils.run("rm -rf %s" % snapshot_dir)
  utils.run("mkdir -p %s" % snapshot_dir)
  utils.run("chmod -w %s" % snapshot_dir)
  out, err = utils.run_vtctl(['Clone', '-force'] + clone_flags +
                             [tablet_62344.tablet_alias,
                              tablet_31981.tablet_alias],
                             log_level='INFO', expect_fail=True)
  if "Cannot validate snapshot directory" not in err:
    self.fail("expected validation error: %s" % err)
  if "Un-reserved test_ny-0000031981" not in err:
    self.fail("expected Un-reserved: %s" % err)
  logging.debug("Failed Clone output: " + err)
  utils.run("chmod +w %s" % snapshot_dir)

  call(["touch", "/tmp/vtSimulateFetchFailures"])
  utils.run_vtctl(['Clone', '-force'] + clone_flags +
                  [tablet_62344.tablet_alias, tablet_31981.tablet_alias],
                  auto_log=True)

  self._check_shard()

  utils.pause("look at logs!")
  tablet_31981.assert_table_count('vt_snapshot_test', 'vt_insert_test', 4)
  tablet_62344.assert_table_count('vt_snapshot_test', 'vt_insert_test', 4)

  utils.validate_topology()

  tablet.kill_tablets([tablet_62344, tablet_31981])

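# The chmod -w / chmod +w bracket above is easy to leave unbalanced if an
# assertion fires in between. A hypothetical context-manager sketch that
# guarantees the permission is restored; contextlib is from the standard
# library, and read_only_dir is not an existing test utility:
import contextlib

@contextlib.contextmanager
def read_only_dir(path):
  """Makes path read-only for the duration of the with-block."""
  utils.run("chmod -w %s" % path)
  try:
    yield path
  finally:
    utils.run("chmod +w %s" % path)
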
def test_health_check_uid_collision(self):
  # If two tablets are running with the same UID, we should prevent the
  # healthcheck on the older one from modifying the tablet record after the
  # record has been claimed by a newer instance.
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  # Before starting tablets, simulate another tablet
  # owning the replica's record.
  utils.run_vtctl(['InitTablet', '-allow_update', '-hostname', 'localhost',
                   '-keyspace', 'test_keyspace', '-shard', '0', '-port', '0',
                   '-parent', tablet_62044.tablet_alias, 'replica'])

  # Set up tablets.
  tablet_62344.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')
  tablet_62044.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica',
                              init_keyspace='test_keyspace',
                              init_shard='0')
  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                   tablet_62344.tablet_alias])
  tablet_62044.wait_for_vttablet_state('SERVING')

  # Check that the tablet owns the record.
  tablet_record = utils.run_vtctl_json(['GetTablet',
                                        tablet_62044.tablet_alias])
  self.assertEquals(tablet_record['port_map']['vt'], tablet_62044.port,
                    "tablet didn't take over the record")

  # Take away ownership again.
  utils.run_vtctl(['InitTablet', '-allow_update', '-hostname', 'localhost',
                   '-keyspace', 'test_keyspace', '-shard', '0', '-port', '0',
                   '-parent', tablet_62044.tablet_alias, 'replica'])

  # Tell the tablets to shutdown gracefully,
  # which normally includes going SPARE.
  tablet.kill_tablets([tablet_62344, tablet_62044])

  # Make sure the tablet record hasn't been touched.
  tablet_record = utils.run_vtctl_json(['GetTablet',
                                        tablet_62044.tablet_alias])
  self.assertEquals(tablet_record['type'],
                    tablet_62044.tablet_type_value['REPLICA'],
                    'tablet changed record without owning it')

def _test_vtctl_clone(self, server_mode): if server_mode: clone_flags = ["-server-mode"] else: clone_flags = [] # Start up a master mysql and vttablet utils.run_vtctl(["CreateKeyspace", "snapshot_test"]) tablet_62344.init_tablet("master", "snapshot_test", "0") utils.run_vtctl(["RebuildShardGraph", "snapshot_test/0"]) utils.validate_topology() tablet_62344.populate("vt_snapshot_test", self._create_vt_insert_test, self._populate_vt_insert_test) tablet_62344.start_vttablet() tablet_62044.create_db("vt_snapshot_test") tablet_62044.init_tablet("idle", start=True) # small test to make sure the directory validation works snapshot_dir = os.path.join(environment.vtdataroot, "snapshot") utils.run("rm -rf %s" % snapshot_dir) utils.run("mkdir -p %s" % snapshot_dir) utils.run("chmod -w %s" % snapshot_dir) out, err = utils.run_vtctl( ["Clone", "-force"] + clone_flags + [tablet_62344.tablet_alias, tablet_62044.tablet_alias], log_level="INFO", expect_fail=True, ) if "Cannot validate snapshot directory" not in err: self.fail("expected validation error: %s" % err) if "Un-reserved test_nj-0000062044" not in err: self.fail("expected Un-reserved: %s" % err) logging.debug("Failed Clone output: " + err) utils.run("chmod +w %s" % snapshot_dir) call(["touch", "/tmp/vtSimulateFetchFailures"]) utils.run_vtctl( ["Clone", "-force"] + clone_flags + [tablet_62344.tablet_alias, tablet_62044.tablet_alias], auto_log=True ) utils.pause("look at logs!") tablet_62044.assert_table_count("vt_snapshot_test", "vt_insert_test", 4) tablet_62344.assert_table_count("vt_snapshot_test", "vt_insert_test", 4) utils.validate_topology() tablet.kill_tablets([tablet_62344, tablet_62044])
def tearDownModule():
  utils.required_teardown()
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets(all_tablets)
  teardown_procs = [t.teardown_mysql() for t in all_tablets]
  utils.wait_procs(teardown_procs, raise_on_error=False)
  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()
  for t in all_tablets:
    t.remove_tree()

def test_repeated_init_shard_master(self):
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     target_tablet_type='replica',
                     lameduck_period='5s',
                     init_keyspace='test_keyspace',
                     init_shard='0')

  # tablets are not replicating, so they won't be healthy
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(t, False)

  # pick one master out of the two
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # run health check on both, make sure they are both healthy
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'],
                    auto_log=True)
    self.check_healthz(t, True)

  # pick the other one as master, make sure they are still healthy
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62044.tablet_alias])

  # run health check on both, make sure they are both healthy
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'],
                    auto_log=True)
    self.check_healthz(t, True)

  # and come back to the original guy
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # run health check on both, make sure they are both healthy
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'],
                    auto_log=True)
    self.check_healthz(t, True)

  # and done
  tablet.kill_tablets([tablet_62344, tablet_62044])

def tearDownModule():
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets([master_tablet, replica_tablet])

  teardown_procs = [
      master_tablet.teardown_mysql(),
      replica_tablet.teardown_mysql(),
  ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server_teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  master_tablet.remove_tree()
  replica_tablet.remove_tree()

def tearDownModule():
  global __tablets
  if utils.options.skip_teardown:
    return

  if __tablets is not None:
    tablet.kill_tablets(__tablets)
    teardown_procs = []
    for t in __tablets:
      teardown_procs.append(t.teardown_mysql())
    utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  if __tablets is not None:
    for t in __tablets:
      t.remove_tree()

def _test_reparent_graceful(self, shard_id):
  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', shard_id, start=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(
      shard['cells'], ['test_nj'],
      'wrong list of cell in Shard: %s' % str(shard['cells']))

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  for t in [tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(
      shard['cells'], ['test_nj', 'test_ny'],
      'wrong list of cell in Shard: %s' % str(shard['cells']))

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force',
                   'test_keyspace/' + shard_id, tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)
  tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

  self._check_master_tablet(tablet_62344)

  utils.validate_topology()

  # Run this to make sure it succeeds.
  stdout, _ = utils.run_vtctl(
      ['ShardReplicationPositions', 'test_keyspace/' + shard_id],
      trap_output=True)
  lines = stdout.splitlines()
  self.assertEqual(len(lines), 4)  # one master, three slaves
  self.assertIn('master', lines[0])  # master first

  # Perform a graceful reparent operation.
  utils.run_vtctl(['PlannedReparentShard',
                   '-keyspace_shard', 'test_keyspace/' + shard_id,
                   '-new_master', tablet_62044.tablet_alias], auto_log=True)
  utils.validate_topology()

  self._check_master_tablet(tablet_62044)

  # insert data into the new master, check the connected slaves work
  self._populate_vt_insert_test(tablet_62044, 1)
  self._check_vt_insert_test(tablet_41983, 1)
  self._check_vt_insert_test(tablet_62344, 1)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])

  # Test address correction.
  new_port = environment.reserve_ports(1)
  tablet_62044.start_vttablet(port=new_port)

  # Wait until the new address registers.
  timeout = 30.0
  while True:
    try:
      self._check_master_tablet(tablet_62044, port=new_port)
      break
    except protocols_flavor().client_error_exception_type():
      timeout = utils.wait_step('waiting for new port to register',
                                timeout, sleep_time=0.1)

  tablet_62044.kill_vttablet()

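# The retry loop at the end of the test above is the client-side variant of
# the polling idiom: rather than checking a predicate, it retries through
# client errors until the new address is visible. A generic sketch of that
# idiom; the helper name is illustrative only:
def wait_until_check_passes(check, message, timeout=30.0):
  """Retries check() through client errors until it succeeds or times out."""
  while True:
    try:
      check()
      return
    except protocols_flavor().client_error_exception_type():
      timeout = utils.wait_step(message, timeout, sleep_time=0.1)

# Example usage, matching the loop above:
#   wait_until_check_passes(
#       lambda: self._check_master_tablet(tablet_62044, port=new_port),
#       'waiting for new port to register')
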
def test_sharding(self):
  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # run checks now before we start the tablets
  utils.validate_topology()

  # create databases, start the tablets, wait for them to start
  for t in [shard_0_master, shard_0_replica, shard_1_master,
            shard_1_replica]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_master, shard_0_replica, shard_1_master,
            shard_1_replica]:
    t.wait_for_vttablet_state('SERVING')

  # apply the schema on the first shard through vtctl, so all tablets
  # are the same.
  shard_0_master.mquery('vt_test_keyspace',
                        create_vt_select_test.replace('\n', ''), write=True)
  shard_0_replica.mquery('vt_test_keyspace',
                         create_vt_select_test.replace('\n', ''),
                         write=True)

  # apply the schema on the second shard.
  shard_1_master.mquery(
      'vt_test_keyspace',
      create_vt_select_test_reverse.replace('\n', ''), write=True)
  shard_1_replica.mquery(
      'vt_test_keyspace',
      create_vt_select_test_reverse.replace('\n', ''), write=True)

  for t in [shard_0_master, shard_0_replica, shard_1_master,
            shard_1_replica]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias])

  # start vtgate, we'll use it later
  utils.VtGate().start()

  for t in [shard_0_master, shard_0_replica, shard_1_master,
            shard_1_replica]:
    t.reset_replication()
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # insert some values directly (db is RO after minority reparent)
  # FIXME(alainjobart) these values don't match the shard map
  utils.run_vtctl(['SetReadWrite', shard_0_master.tablet_alias])
  utils.run_vtctl(['SetReadWrite', shard_1_master.tablet_alias])
  shard_0_master.mquery(
      'vt_test_keyspace',
      "insert into vt_select_test (id, msg) values (1, 'test 1')",
      write=True)
  shard_1_master.mquery(
      'vt_test_keyspace',
      "insert into vt_select_test (id, msg) values (10, 'test 10')",
      write=True)

  utils.validate_topology(ping_tablets=True)

  utils.pause('Before the sql scatter query')

  # make sure the '1' value was written on first shard
  rows = shard_0_master.mquery(
      'vt_test_keyspace', 'select id, msg from vt_select_test order by id')
  self.assertEqual(rows, ((1, 'test 1'),),
                   'wrong mysql_query output: %s' % str(rows))

  utils.pause('After db writes')

  # throw in some schema validation step
  # we created the schema differently, so it should show
  utils.run_vtctl(['ValidateSchemaShard', 'test_keyspace/-80'])
  utils.run_vtctl(['ValidateSchemaShard', 'test_keyspace/80-'])
  out, err = utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                             trap_output=True, raise_on_error=False)
  if ('test_nj-0000062344 and test_nj-0000062346 disagree on schema '
      'for table vt_select_test:\nCREATE TABLE' not in err or
      'test_nj-0000062344 and test_nj-0000062347 disagree on schema '
      'for table vt_select_test:\nCREATE TABLE' not in err):
    self.fail('wrong ValidateSchemaKeyspace output: ' + err)

  # validate versions
  utils.run_vtctl(['ValidateVersionShard', 'test_keyspace/-80'],
                  auto_log=True)
  utils.run_vtctl(['ValidateVersionKeyspace', 'test_keyspace'],
                  auto_log=True)

  # show and validate permissions
  utils.run_vtctl(['GetPermissions', 'test_nj-0000062344'], auto_log=True)
  utils.run_vtctl(['ValidatePermissionsShard', 'test_keyspace/-80'],
                  auto_log=True)
  utils.run_vtctl(['ValidatePermissionsKeyspace', 'test_keyspace'],
                  auto_log=True)

  if environment.topo_server().flavor() == 'zookeeper':
    # and create zkns on this complex keyspace, make sure a few
    # files are created
    utils.run_vtctl(['ExportZknsForKeyspace', 'test_keyspace'])
    out, err = utils.run(
        environment.binary_argstr('zk') +
        ' ls -R /zk/test_nj/zk?s/vt/test_keysp*',
        trap_output=True)
    lines = out.splitlines()
    for base in ['-80', '80-']:
      for db_type in ['master', 'replica']:
        for sub_path in ['', '.vdns', '/0', '/vt.vdns']:
          expected = ('/zk/test_nj/zkns/vt/test_keyspace/' + base + '/' +
                      db_type + sub_path)
          if expected not in lines:
            self.fail('missing zkns part:\n%s\nin:%s' % (expected, out))

  # connect to the tablets directly, make sure they know / validate
  # their own shard
  sql = 'select id, msg from vt_select_test order by id'
  qr = shard_0_master.execute(sql)
  self.assertEqual(qr['Rows'], [['1', 'test 1']])
  qr = shard_1_master.execute(sql)
  self.assertEqual(qr['Rows'], [['10', 'test 10']])

  _, stderr = utils.run_vtctl(['VtTabletExecute', '-keyspace',
                               'test_keyspace', '-shard', '-90',
                               shard_0_master.tablet_alias, sql],
                              expect_fail=True)
  self.assertIn('fatal: Shard mismatch, expecting -80, received -90', stderr)

  utils.vtgate.kill()
  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_1_master, shard_1_replica])

def test_resharding(self): # we're going to reparent and swap these two global shard_2_master, shard_2_replica1 utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_sharding_key', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'custom_sharding_key', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-') shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) ks = utils.run_vtctl_json( ['GetSrvKeyspace', 'test_nj', 'test_keyspace']) self.assertEqual(ks['sharding_column_name'], 'custom_sharding_key') # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving somewhat normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly, shard_1_rdonly1 ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets (replication is not setup, the slaves won't be # healthy) shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('NOT_SERVING') shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING') shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING') # reparent to make the tablets work utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards)) self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards)) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # run a health check on source replicas so they respond to discovery # (for binlog players) and on the source rdonlys (for workers) for t in [shard_0_replica, shard_1_slave1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]: utils.run_vtctl(['RunHealthCheck', t.tablet_alias]) # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-') shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 
'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_2_master.start_vttablet(wait_for_state=None) shard_3_master.start_vttablet(wait_for_state=None) for t in [ shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly1 ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly1 ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) # check the shards shards = utils.run_vtctl_json( ['FindAllShardsInKeyspace', 'test_keyspace']) for s in ['-80', '80-', '80-c0', 'c0-']: self.assertIn(s, shards, 'unexpected shards: %s' % str(shards)) self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards)) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' 'Partitions(rdonly): -80 80-\n' 'Partitions(replica): -80 80-\n', keyspace_id_type=keyspace_id_type, sharding_column_name='custom_sharding_key') # disable shard_1_slave2, so we're sure filtered replication will go # from shard_1_slave1 utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # we need to create the schema, and the worker will do data copying for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'): utils.run_vtctl([ 'CopySchemaShard', '--exclude_tables', 'unrelated', shard_1_rdonly1.tablet_alias, keyspace_shard ], auto_log=True) # the worker will do everything. We test with source_reader_count=10 # (down from default=20) as connection pool is not big enough for 20. # min_table_size_for_split is set to 1 as to force a split even on the # small table we have. # --max_tps is only specified to enable the throttler and ensure that the # code is executed. But the intent here is not to throttle the test, hence # the rate limit is set very high. utils.run_vtworker([ '--cell', 'test_nj', '--command_display_interval', '10ms', 'SplitClone', '--exclude_tables', 'unrelated', '--source_reader_count', '10', '--min_table_size_for_split', '1', '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999', 'test_keyspace/80-' ], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'], auto_log=True) # TODO(alainjobart): experiment with the dontStartBinlogPlayer option # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars self.check_destination_master(shard_2_master, ['test_keyspace/80-']) self.check_destination_master(shard_3_master, ['test_keyspace/80-']) # check that binlog server exported the stats vars self.check_binlog_server_vars(shard_1_slave1, horizontal=True) # Check that the throttler was enabled. self.check_binlog_throttler(shard_2_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) self.check_binlog_throttler(shard_3_master.rpc_endpoint(), ['BinlogPlayer/0'], 9999) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablets, make sure we have it all
  self.check_running_binlog_player(shard_2_master, 4000, 2000)
  self.check_running_binlog_player(shard_3_master, 4000, 2000)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low')
  monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high')

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

  # test data goes through again
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000, base=1000)
  logging.debug('Checking 80 percent of data was sent quickly')
  self._check_lots_timeout(1000, 80, 5, base=1000)
  self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
                                min_statements=800, min_transactions=800)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even the health check that is enabled on
  # master 3 should not interfere (we run it to be sure).
  utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                  auto_log=True)
  for master in [shard_2_master, shard_3_master]:
    utils.check_tablet_query_service(self, master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_3_master.get_healthz()

  # now serve rdonly from the split shards, in test_nj only
  utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                   'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # now serve rdonly from the split shards, everywhere
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_srv_keyspace('test_ny', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # then serve replica from the split shards
  destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'],
                  auto_log=True)

  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, True, False)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0',
                   shard_2_replica1.tablet_alias])

  # update our test variables to point at the new master
  shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

  logging.debug('Inserting lots of data on source shard after reparenting')
  self._insert_lots(3000, base=2000)
  logging.debug('Checking 80 percent of data was sent fairly quickly')
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use vtworker to compare the data again
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d',
                monitor_thread_1.object_name,
                monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d',
                monitor_thread_2.object_name,
                monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # mock with the SourceShard records to test 'vtctl SourceShardDelete'
  # and 'vtctl SourceShardAdd'
  utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                  auto_log=True)
  utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                   'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                  auto_log=True)

  # then serve master from the split shards, make sure the source master's
  # query service is now turned off
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-c0 c0-\n'
                           'Partitions(rdonly): -80 80-c0 c0-\n'
                           'Partitions(replica): -80 80-c0 c0-\n',
                           keyspace_id_type=keyspace_id_type,
                           sharding_column_name='custom_sharding_key')
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_2_master)
  self.check_no_binlog_player(shard_3_master)

  # delete the original tablets in the original shard
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_rdonly, shard_1_rdonly1])
  for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_1_master.tablet_alias],
                  auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  self.assertNotIn('cells', shard)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly1])
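

# The _insert_lots/_check_lots_timeout helpers used above come from the
# resharding test library. Below is a minimal sketch of the polling pattern
# _check_lots_timeout relies on; the _sketch_ names and the stubbed row
# counter are illustrative assumptions, not the real implementation.
import time


def _sketch_check_lots(count, base=0):
  # Placeholder: the real helper counts how many of the expected rows are
  # visible on the destination masters and returns a percentage.
  return 100


def _sketch_check_lots_timeout(count, threshold, timeout, base=0):
  # Poll until at least `threshold` percent of the `count` rows inserted on
  # the source shard have shown up on the destination shards, or until
  # `timeout` seconds have elapsed. Returns the last observed percentage.
  start = time.time()
  while True:
    percent = _sketch_check_lots(count, base=base)
    if percent >= threshold or time.time() - start > timeout:
      return percent
    time.sleep(1)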


def _test_reparent_slave_offline(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/' + shard_id,
                   tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)

  self._check_db_addr(shard_id, 'master', tablet_62344.port)

  # Kill one tablet so we seem offline
  tablet_31981.kill_vttablet()

  # Perform a graceful reparent operation.
  utils.run_vtctl(['ReparentShard', 'test_keyspace/' + shard_id,
                   tablet_62044.tablet_alias])

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])


def _test_reparent_from_outside(self, brutal=False, rpc=False):
  """This test will start a master and 3 slaves.

  Then:
  - one slave will be the new master
  - one slave will be reparented to that new master
  - one slave will be busted and dead in the water
  and we'll call ShardExternallyReparented.

  Args:
    brutal: scraps the old master first
    rpc: sends an RPC to the new master instead of doing the work.
  """
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')

  # Reparent as a starting point
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias], auto_log=True)

  # now manually reparent 1 out of 2 tablets
  # 62044 will be the new master
  # 31981 won't be re-parented, so it will be busted
  tablet_62044.mquery('', mysql_flavor().promote_slave_commands())
  new_pos = mysql_flavor().master_position(tablet_62044)
  logging.debug('New master position: %s', str(new_pos))
  change_master_cmds = mysql_flavor().change_master_commands(
      utils.hostname, tablet_62044.mysql_port, new_pos)

  # 62344 will now be a slave of 62044
  tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                      change_master_cmds + ['START SLAVE'])

  # 41983 will be a slave of 62044
  tablet_41983.mquery('', ['STOP SLAVE'] + change_master_cmds +
                      ['START SLAVE'])

  # in brutal mode, we scrap the old master first
  if brutal:
    tablet_62344.scrap(force=True)
    # we have some automated tools that do this too, so it's good to simulate
    if environment.topo_server().flavor() == 'zookeeper':
      utils.run(environment.binary_args('zk') +
                ['rm', '-rf', tablet_62344.zk_tablet_path])

  # update zk with the new graph
  extra_args = []
  if rpc:
    extra_args = ['-use_rpc']
  utils.run_vtctl(['ShardExternallyReparented'] + extra_args +
                  ['test_keyspace/0', tablet_62044.tablet_alias],
                  mode=utils.VTCTL_VTCTL, auto_log=True)

  self._test_reparent_from_outside_check(brutal)

  utils.run_vtctl(['RebuildReplicationGraph', 'test_nj', 'test_keyspace'])

  self._test_reparent_from_outside_check(brutal)

  tablet.kill_tablets([tablet_31981, tablet_62344, tablet_62044,
                       tablet_41983])


def test_health_check(self):
  # one master, one replica that starts in spare
  # (for the replica, we let vttablet do the InitTablet)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')

  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  tablet_62344.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')
  tablet_62044.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica',
                              lameduck_period='5s',
                              init_keyspace='test_keyspace',
                              init_shard='0')

  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(tablet_62044, False)

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # make sure the 'spare' slave goes to 'replica'
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'replica')
  self.check_healthz(tablet_62044, True)

  # make sure the master is still master
  ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
  self.assertEqual(ti['type'], tablet.Tablet.tablet_type_value['MASTER'],
                   'unexpected master type: %s' % ti['type'])

  # stop replication, make sure we go unhealthy.
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'spare')
  self.check_healthz(tablet_62044, False)

  # make sure the serving graph was updated
  timeout = 10
  while True:
    try:
      utils.run_vtctl_json(
          ['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'])
    except Exception:
      logging.debug('Tablet is gone from serving graph, good')
      break
    timeout = utils.wait_step(
        "Stopped replication didn't trigger removal from serving graph",
        timeout)

  # make sure the status web page is unhappy
  self.assertIn(
      '>unhealthy: replication_reporter: '
      'Replication is not running</span></div>',
      tablet_62044.get_status())

  # make sure the health stream is updated
  health = utils.run_vtctl_json(
      ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
  self.assertIn('replication_reporter: Replication is not running',
                health['realtime_stats']['health_error'])

  # then restart replication, and write data, make sure we go back to healthy
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'replica')

  # make sure the status web page is healthy
  self.assertIn('>healthy</span></div>', tablet_62044.get_status())

  # make sure the vars are updated
  v = utils.get_vars(tablet_62044.port)
  self.assertEqual(v['LastHealthMapCount'], 0)

  # now test VtTabletStreamHealth returns the right thing
  stdout, stderr = utils.run_vtctl(['VtTabletStreamHealth', '-count', '2',
                                    tablet_62044.tablet_alias],
                                   trap_output=True, auto_log=True)
  lines = stdout.splitlines()
  self.assertEqual(len(lines), 2)
  for line in lines:
    logging.debug('Got health: %s', line)
    data = json.loads(line)
    self.assertIn('realtime_stats', data)
    self.assertNotIn('health_error', data['realtime_stats'])
    self.assertNotIn('tablet_externally_reparented_timestamp', data)
    self.assertEqual('test_keyspace', data['target']['keyspace'])
    self.assertEqual('0', data['target']['shard'])
    self.assertEqual(3, data['target']['tablet_type'])

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])

  # the replica was in lameduck for 5 seconds, should have been enough
  # to reset its state to spare
  ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
  self.assertEqual(
      ti['type'], tablet.Tablet.tablet_type_value['SPARE'],
      "tablet didn't go to spare while in lameduck mode: %s" % str(ti))
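

# wait_for_tablet_type_change is provided by the test base class. A minimal
# sketch of the polling it most likely performs is below; the _sketch_ name
# and the body are assumptions for illustration, built only from calls that
# appear elsewhere in this file (GetTablet, utils.wait_step).
def _sketch_wait_for_tablet_type_change(tablet_alias, expected_type,
                                        timeout=10):
  # Poll the topology until the tablet record reports the expected type,
  # letting utils.wait_step enforce the overall deadline.
  while True:
    ti = utils.run_vtctl_json(['GetTablet', tablet_alias])
    if ti['type'] == tablet.Tablet.tablet_type_value[expected_type.upper()]:
      return
    timeout = utils.wait_step(
        'tablet %s becomes %s' % (tablet_alias, expected_type), timeout)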


def test_no_mysql_healthcheck(self):
  """This test starts a vttablet with no mysql port, while mysql is down.

  It makes sure vttablet will start properly and be unhealthy.
  Then we start mysql, and make sure vttablet becomes healthy.
  """
  # we need replication to be enabled, so the slave tablet can be healthy.
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
  pos = mysql_flavor().master_position(tablet_62344)
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', tablet_62344.mysql_port, pos)
  tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                      change_master_cmds + ['START SLAVE'])

  # now shutdown all mysqld
  shutdown_procs = [
      tablet_62344.shutdown_mysql(),
      tablet_62044.shutdown_mysql(),
  ]
  utils.wait_procs(shutdown_procs)

  # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('spare', 'test_keyspace', '0',
                           include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.start_vttablet(wait_for_state=None,
                     target_tablet_type='replica',
                     full_mycnf_args=True, include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(t, False)

  # restart mysqld
  start_procs = [
      tablet_62344.start_mysql(),
      tablet_62044.start_mysql(),
  ]
  utils.wait_procs(start_procs)

  # the master should still be healthy
  utils.run_vtctl(['RunHealthCheck', tablet_62344.tablet_alias, 'replica'],
                  auto_log=True)
  self.check_healthz(tablet_62344, True)

  # the slave won't be healthy at first, as replication is not running
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias, 'replica'],
                  auto_log=True)
  self.check_healthz(tablet_62044, False)
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')

  # restart replication
  tablet_62044.mquery('', ['START SLAVE'])

  # wait for the tablet to become healthy and fix its mysql port
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias, 'replica'],
                  auto_log=True)
  tablet_62044.wait_for_vttablet_state('SERVING')
  self.check_healthz(tablet_62044, True)

  for t in tablet_62344, tablet_62044:
    # wait for mysql port to show up
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
      if 'mysql' in ti['port_map']:
        break
      timeout = utils.wait_step('mysql port in tablet record', timeout)
    self.assertEqual(ti['port_map']['mysql'], t.mysql_port)

  # all done
  tablet.kill_tablets([tablet_62344, tablet_62044])
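

# check_healthz comes from the test base class and asserts on the tablet's
# /healthz page. A rough sketch of such a check is below, assuming a plain
# HTTP endpoint that returns 'ok' when healthy; the URL layout and response
# format are assumptions, not the real base-class implementation.
import urllib2


def _sketch_check_healthz(t, expect_healthy):
  # Fetch the healthz page and compare its verdict with the expectation.
  body = urllib2.urlopen('http://localhost:%d/healthz' % t.port).read()
  healthy = body.strip().endswith('ok')
  assert healthy == expect_healthy, 'unexpected healthz page: %s' % body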


def _test_reparent_graceful(self, shard_id):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True)
  if environment.topo_server().flavor() == 'zookeeper':
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(shard['Cells'], ['test_nj'],
                     'wrong list of cell in Shard: %s' % str(shard['Cells']))

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  for t in [tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')
  if environment.topo_server().flavor() == 'zookeeper':
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(shard['Cells'], ['test_nj', 'test_ny'],
                     'wrong list of cell in Shard: %s' % str(shard['Cells']))

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.pause('force ReparentShard?')
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/' + shard_id,
                   tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)

  self._check_db_addr(shard_id, 'master', tablet_62344.port)

  # Verify MasterCell is properly set
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')

  # Convert two replicas to spare. That should leave only one node serving
  # traffic, but it still needs to appear in the replication graph.
  utils.run_vtctl(['ChangeSlaveType', tablet_41983.tablet_alias, 'spare'])
  utils.run_vtctl(['ChangeSlaveType', tablet_31981.tablet_alias, 'spare'])
  utils.validate_topology()
  self._check_db_addr(shard_id, 'replica', tablet_62044.port)

  # Run this to make sure it succeeds.
  utils.run_vtctl(['ShardReplicationPositions', 'test_keyspace/' + shard_id],
                  stdout=utils.devnull)

  # Perform a graceful reparent operation.
  utils.pause('graceful ReparentShard?')
  utils.run_vtctl(['ReparentShard', 'test_keyspace/' + shard_id,
                   tablet_62044.tablet_alias], auto_log=True)
  utils.validate_topology()

  self._check_db_addr(shard_id, 'master', tablet_62044.port)

  # Verify MasterCell is set to new cell.
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])

  # Test address correction.
  new_port = environment.reserve_ports(1)
  tablet_62044.start_vttablet(port=new_port)

  # Wait until the new address registers.
  timeout = 30.0
  while True:
    try:
      self._check_db_addr(shard_id, 'master', new_port)
      break
    except Exception:
      timeout = utils.wait_step('waiting for new port to register',
                                timeout, sleep_time=0.1)

  tablet_62044.kill_vttablet()
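

# The address-registration loop above (and the serving-graph loop in
# test_health_check) hand-roll the same poll-until-success pattern. A generic
# helper like the sketch below would express it more compactly; the name and
# signature are hypothetical, not part of utils.
def _sketch_wait_until(check, description, timeout=30.0, sleep_time=0.1):
  # Call `check` until it stops raising; utils.wait_step decrements the
  # timeout budget and raises once it is exhausted.
  while True:
    try:
      return check()
    except Exception:
      timeout = utils.wait_step(description, timeout, sleep_time=sleep_time)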


def test_reparent_cross_cell(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(shard['Cells'], ['test_nj'],
                   'wrong list of cell in Shard: %s' % str(shard['Cells']))

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(shard['Cells'], ['test_nj', 'test_ny'],
                   'wrong list of cell in Shard: %s' % str(shard['Cells']))

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.pause('force ReparentShard?')
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/' + shard_id,
                   tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)

  self._check_db_addr(shard_id, 'master', tablet_62344.port)

  # Verify MasterCell is properly set
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_nj')

  # Perform a graceful reparent operation to another cell.
  utils.pause('graceful ReparentShard?')
  utils.run_vtctl(['ReparentShard', 'test_keyspace/' + shard_id,
                   tablet_31981.tablet_alias], auto_log=True)
  utils.validate_topology()

  self._check_db_addr(shard_id, 'master', tablet_31981.port, cell='test_ny')

  # Verify MasterCell is set to new cell.
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_ny')
  srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                   'test_keyspace/%s' % (shard_id)])
  self.assertEqual(srvShard['MasterCell'], 'test_ny')

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])


def test_reparent_down_master(self):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias], auto_log=True)
  utils.validate_topology()

  # Make the master agent and database unavailable.
  tablet_62344.kill_vttablet()
  tablet_62344.shutdown_mysql().wait()

  self._check_db_addr('0', 'master', tablet_62344.port)

  # Perform a reparent operation - the Validate part will try to ping
  # the master and fail somewhat quickly
  stdout, stderr = utils.run_vtctl(['-wait-time', '5s', 'ReparentShard',
                                    'test_keyspace/0',
                                    tablet_62044.tablet_alias],
                                   expect_fail=True)
  logging.debug('Failed ReparentShard output:\n' + stderr)
  if 'ValidateShard verification failed' not in stderr:
    self.fail(
        "didn't find the right error strings in failed ReparentShard: " +
        stderr)

  # ScrapTablet will fail to connect to the dead tablet, and fail
  stdout, stderr = utils.run_vtctl(['-wait-time', '10s', 'ScrapTablet',
                                    tablet_62344.tablet_alias],
                                   expect_fail=True)
  logging.debug('Failed ScrapTablet output:\n' + stderr)
  if ('connection refused' not in stderr and
      protocols_flavor().rpc_timeout_message() not in stderr):
    self.fail("didn't find the right error strings in failed ScrapTablet: " +
              stderr)

  # Force the scrap action in zk even though tablet is not accessible.
  tablet_62344.scrap(force=True)

  # Re-run reparent operation, this should now proceed unimpeded.
  utils.run_vtctl(['ReparentShard', 'test_keyspace/0',
                   tablet_62044.tablet_alias], auto_log=True)

  utils.validate_topology()
  self._check_db_addr('0', 'master', tablet_62044.port)

  utils.run_vtctl(['ChangeSlaveType', '-force', tablet_62344.tablet_alias,
                   'idle'])

  idle_tablets, _ = utils.run_vtctl(['ListAllTablets', 'test_nj'],
                                    trap_output=True)
  if '0000062344 <null> <null> idle' not in idle_tablets:
    self.fail('idle tablet not found: %s' % idle_tablets)

  tablet.kill_tablets([tablet_62044, tablet_41983, tablet_31981])

  # so the other tests don't have any surprise
  tablet_62344.start_mysql().wait()


def test_resharding(self):
  # we're going to reparent and swap these two
  global shard_2_master, shard_2_replica1

  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_ksid_col', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'custom_ksid_col', base_sharding.keyspace_id_type])

  shard_0_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # we set full_mycnf_args to True as a test in the KIT_BYTES case
  full_mycnf_args = (
      base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES)

  # create databases so vttablet can start behaving somewhat normally
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

  # wait for the tablets (replication is not setup, they won't be healthy)
  for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
            shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_rdonly, shard_1_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
  self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_slave1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the split shards
  shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
  shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0')
  shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
  shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  shard_2_master.start_vttablet(wait_for_state=None)
  shard_3_master.start_vttablet(wait_for_state=None)
  for t in [shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
            shard_3_replica, shard_3_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_2_rdonly1, shard_3_master, shard_3_replica,
            shard_3_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-c0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/c0-',
                   shard_3_master.tablet_alias], auto_log=True)

  # check the shards
  shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
  for s in ['-80', '80-', '80-c0', 'c0-']:
    self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
  self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # disable shard_1_slave2, so we're sure filtered replication will go
  # from shard_1_slave1
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_1_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)

  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms'],
      auto_log=True)

  # Copy the data from the source to the destination shards.
  # --max_tps is only specified to enable the throttler and ensure that the
  # code is executed. But the intent here is not to throttle the test, hence
  # the rate limit is set very high.
  #
  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Test the correct handling of keyspace_id changes which happen after
  # the first clone.
  # Let row 2 go to shard 3 instead of shard 2.
  shard_1_master.mquery('vt_test_keyspace',
                        'update resharding1 set'
                        ' custom_ksid_col=0xD000000000000000 WHERE id=2',
                        write=True)
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Row 2 will be deleted from shard 2 and inserted to shard 3.
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 0, 1, 1)
  self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
                    0xD000000000000000, should_be_here=False)
  self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
                    0xD000000000000000)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id
  # again.
  shard_1_master.mquery('vt_test_keyspace',
                        'update resharding1 set'
                        ' custom_ksid_col=0x9000000000000000 WHERE id=2',
                        write=True)
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Row 2 will be deleted from shard 3 and inserted to shard 2.
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 0, 1, 1)
  self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
                    0x9000000000000000)
  self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
                    0x9000000000000000, should_be_here=False)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 2 (provokes an insert).
  shard_2_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=2', write=True)
  # Update row 3 (provokes an update).
  shard_3_master.mquery('vt_test_keyspace',
                        "update resharding1 set msg='msg-not-3' where id=3",
                        write=True)
  # Insert rows 4 and 5 (provokes a delete).
  self._insert_value(shard_3_master, 'resharding1', 4, 'msg4',
                     0xD000000000000000)
  self._insert_value(shard_3_master, 'resharding1', 5, 'msg5',
                     0xD000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       '--max_tps', '9999',
       'test_keyspace/80-'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Change tablet, which was taken offline, back to rdonly.
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 1, 2, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 2)

  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated',
                   'test_keyspace'], auto_log=True)

  # check the binlog players are running and exporting vars
  self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
  self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

  # When the binlog players/filtered replication is turned on, the query
  # service must be turned off on the destination masters.
  # The tested behavior is a safeguard to prevent somebody from accidentally
  # modifying data on the destination masters while they are not migrated
  # yet and the source shards are still the source of truth.
  shard_2_master.wait_for_vttablet_state('NOT_SERVING')
  shard_3_master.wait_for_vttablet_state('NOT_SERVING')

  # check that the binlog server exported the stats vars
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

  # Check that the throttler was enabled.
  self.check_throttler_service(shard_2_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)
  self.check_throttler_service(shard_3_master.rpc_endpoint(),
                               ['BinlogPlayer/0'], 9999)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Executing MultiValue Insert Queries')
  self._exec_multi_shard_dmls()
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  logging.debug('Checking MultiValue Insert Queries')
  self._check_multi_shard_values()
  self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for destination master tablets, make sure we have it all
  if base_sharding.use_rbr:
    # We submitted non-annotated DMLs, which are properly routed
    # with RBR, but not with SBR. So the first shard counts
    # are smaller. In the second shard, we submitted statements
    # that affect more than one keyspace id. These will result
    # in two queries with RBR. So the count there is higher.
    self.check_running_binlog_player(shard_2_master, 4018, 2008)
    self.check_running_binlog_player(shard_3_master, 4028, 2008)
  else:
    self.check_running_binlog_player(shard_2_master, 4022, 2008)
    self.check_running_binlog_player(shard_3_master, 4024, 2008)

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1)
  monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2)

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias,
                   'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
  utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

  # test data goes through again
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000, base=1000)
  logging.debug('Checking 80 percent of data was sent quickly')
  self._check_lots_timeout(1000, 80, 5, base=1000)
  self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
                                min_statements=800, min_transactions=800)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # check query service is off on master 2 and master 3, as filtered
  # replication is enabled. Even the health check that is enabled on
  # master 3 should not interfere (we run it to be sure).
  utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                  auto_log=True)
  for master in [shard_2_master, shard_3_master]:
    utils.check_tablet_query_service(self, master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_3_master.get_healthz()

  # now serve rdonly from the split shards, in test_nj only
  utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                   'test_keyspace/80-', 'rdonly'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace(
      'test_ny', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # now serve rdonly from the split shards, everywhere
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_srv_keyspace(
      'test_ny', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
  utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
  utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

  # then serve replica from the split shards
  destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-c0 c0-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'], auto_log=True)

  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, True, False)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, False)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, shard_1_slave2, False, True)
  # Destination tablets would have query service disabled for other
  # reasons than the migration, so check the shard record instead of
  # the tablets directly.
  utils.check_shard_query_services(self, destination_shards,
                                   topodata_pb2.REPLICA, True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-c0 c0-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['PlannedReparentShard',
                   '-keyspace_shard', 'test_keyspace/80-c0',
                   '-new_master', shard_2_replica1.tablet_alias])

  # update our test variables to point at the new master
  shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

  logging.debug('Inserting lots of data on source shard after reparenting')
  self._insert_lots(3000, base=2000)
  logging.debug('Checking 80 percent of data was sent fairly quickly')
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use vtworker to compare the data again
  logging.debug('Running vtworker SplitDiff')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug('DELAY 1: %s max_lag=%d ms avg_lag=%d ms',
                monitor_thread_1.thread_name,
                monitor_thread_1.max_lag_ms,
                monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count)
  logging.debug('DELAY 2: %s max_lag=%d ms avg_lag=%d ms',
                monitor_thread_2.thread_name,
                monitor_thread_2.max_lag_ms,
                monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count)

  # mock with the SourceShard records to test 'vtctl SourceShardDelete'
  # and 'vtctl SourceShardAdd'
  utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                  auto_log=True)
  utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                   'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                  auto_log=True)

  # then serve master from the split shards, make sure the source master's
  # query service is now turned off
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-c0 c0-\n'
      'Partitions(rdonly): -80 80-c0 c0-\n'
      'Partitions(replica): -80 80-c0 c0-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_2_master)
  self.check_no_binlog_player(shard_3_master)

  # delete the original tablets in the original shard
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_rdonly, shard_1_rdonly1])
  for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_1_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  self.assertNotIn('cells', shard)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # make sure we can't delete the destination shard now that it's serving
  _, stderr = utils.run_vtctl(['DeleteShard', 'test_keyspace/80-c0'],
                              expect_fail=True)
  self.assertIn('is still serving, cannot delete it', stderr)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_2_rdonly1, shard_3_master, shard_3_replica,
                       shard_3_rdonly1])
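

# InsertThread and MonitorLagThread come from the resharding test library;
# the tests above only set their .done flag and read a few fields. Below is
# a minimal sketch of the monitor side; the _Sketch name, the constructor
# arguments mirroring the calls above, and the stubbed lag sample are
# assumptions for illustration, not the real classes.
import threading
import time


class _SketchMonitorLagThread(threading.Thread):

  def __init__(self, tablet_obj, thread_name, thread_id):
    super(_SketchMonitorLagThread, self).__init__()
    self.tablet_obj = tablet_obj
    self.thread_name = thread_name
    self.thread_id = thread_id
    self.done = False
    self.max_lag_ms = 0
    self.lag_sum_ms = 0
    self.sample_count = 0
    self.start()

  def _sample_lag_ms(self):
    # Placeholder: the real class reads the timestamp column of the row
    # last written by the matching InsertThread and compares it to now.
    return 0

  def run(self):
    # Sample replication lag on the destination replica until told to stop,
    # keeping the aggregates that the test logs at the end.
    while not self.done:
      lag_ms = self._sample_lag_ms()
      self.max_lag_ms = max(self.max_lag_ms, lag_ms)
      self.lag_sum_ms += lag_ms
      self.sample_count += 1
      time.sleep(1)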


def test_no_mysql_healthcheck(self):
  """This test starts a vttablet with no mysql port, while mysql is down.

  It makes sure vttablet will start properly and be unhealthy.
  Then we start mysql, and make sure vttablet becomes healthy.
  """
  # we need replication to be enabled, so the slave tablet can be healthy.
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
  pos = mysql_flavor().master_position(tablet_62344)
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', tablet_62344.mysql_port, pos)
  tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                      change_master_cmds + ['START SLAVE'])

  # now shutdown all mysqld
  shutdown_procs = [
      tablet_62344.shutdown_mysql(),
      tablet_62044.shutdown_mysql(),
  ]
  utils.wait_procs(shutdown_procs)

  # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  tablet_62044.init_tablet('replica', 'test_keyspace', '0',
                           include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    # Since MySQL is down at this point and we want the tablet to start up
    # successfully, we have to use supports_backups=False.
    t.start_vttablet(wait_for_state=None, supports_backups=False,
                     full_mycnf_args=True, include_mysql_port=False)
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(t, False)

  # Tell the slave to not try to repair replication in the healthcheck.
  # The StopSlave will ultimately fail because mysqld is not running,
  # but vttablet should remember that it's not supposed to fix replication.
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias],
                  expect_fail=True)

  # The above notice to not fix replication should survive tablet restart.
  tablet_62044.kill_vttablet()
  tablet_62044.start_vttablet(wait_for_state='NOT_SERVING',
                              full_mycnf_args=True,
                              include_mysql_port=False,
                              supports_backups=False)

  # restart mysqld
  start_procs = [
      tablet_62344.start_mysql(),
      tablet_62044.start_mysql(),
  ]
  utils.wait_procs(start_procs)

  # the master should still be healthy
  utils.run_vtctl(['RunHealthCheck', tablet_62344.tablet_alias],
                  auto_log=True)
  self.check_healthz(tablet_62344, True)

  # the slave will now be healthy, but report a very high replication
  # lag, because it can't figure out exactly what the lag is.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias],
                  auto_log=True)
  tablet_62044.wait_for_vttablet_state('SERVING')
  self.check_healthz(tablet_62044, True)

  health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                 tablet_62044.tablet_alias])
  self.assertIn('seconds_behind_master', health['realtime_stats'])
  self.assertEqual(health['realtime_stats']['seconds_behind_master'], 7200)
  self.assertIn('serving', health)

  # restart replication, wait until health check goes small
  # (a value of zero is default and won't be in structure)
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  timeout = 10
  while True:
    utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias],
                    auto_log=True)
    health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                   tablet_62044.tablet_alias])
    if 'serving' in health and (
        ('seconds_behind_master' not in health['realtime_stats']) or
        (health['realtime_stats']['seconds_behind_master'] < 30)):
      break
    timeout = utils.wait_step('health delay goes back down', timeout)

  # wait for the tablet to fix its mysql port
  for t in tablet_62344, tablet_62044:
    # wait for mysql port to show up
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
      if 'mysql' in ti['port_map']:
        break
      timeout = utils.wait_step('mysql port in tablet record', timeout)
    self.assertEqual(ti['port_map']['mysql'], t.mysql_port)

  # all done
  tablet.kill_tablets([tablet_62344, tablet_62044])


def test_change_type_semi_sync(self):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # Create new names for tablets, so this test is less confusing.
  master = tablet_62344
  replica = tablet_62044
  rdonly1 = tablet_41983
  rdonly2 = tablet_31981

  # create the database so vttablets start, as they are serving
  for t in [master, replica, rdonly1, rdonly2]:
    t.create_db('vt_test_keyspace')

  # Start up a soon-to-be master, one replica and two rdonly.
  master.init_tablet('replica', 'test_keyspace', '0', start=True,
                     wait_for_start=False)
  replica.init_tablet('replica', 'test_keyspace', '0', start=True,
                      wait_for_start=False)
  rdonly1.init_tablet('rdonly', 'test_keyspace', '0', start=True,
                      wait_for_start=False)
  rdonly2.init_tablet('rdonly', 'test_keyspace', '0', start=True,
                      wait_for_start=False)
  for t in [master, replica, rdonly1, rdonly2]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   master.tablet_alias], auto_log=True)
  utils.validate_topology(ping_tablets=True)
  self._check_master_tablet(master)

  # Stop replication on rdonly1, to make sure when we make it
  # replica it doesn't start again.
  # Note we do a similar test for replica -> rdonly below.
  utils.run_vtctl(['StopSlave', rdonly1.tablet_alias])

  # Check semi-sync on slaves.
  # The flag is only an indication of the value to use next time
  # we turn replication on, so also check the status.
  # rdonly1 is not replicating, so its status is off.
  replica.check_db_var('rpl_semi_sync_slave_enabled', 'ON')
  rdonly1.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
  rdonly2.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
  replica.check_db_status('rpl_semi_sync_slave_status', 'ON')
  rdonly1.check_db_status('rpl_semi_sync_slave_status', 'OFF')
  rdonly2.check_db_status('rpl_semi_sync_slave_status', 'OFF')

  # Change replica to rdonly while replicating, should turn off semi-sync,
  # and restart replication.
  utils.run_vtctl(['ChangeSlaveType', replica.tablet_alias, 'rdonly'],
                  auto_log=True)
  replica.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
  replica.check_db_status('rpl_semi_sync_slave_status', 'OFF')

  # Change rdonly1 to replica, should turn on semi-sync, and not start rep.
  utils.run_vtctl(['ChangeSlaveType', rdonly1.tablet_alias, 'replica'],
                  auto_log=True)
  rdonly1.check_db_var('rpl_semi_sync_slave_enabled', 'ON')
  rdonly1.check_db_status('rpl_semi_sync_slave_status', 'OFF')
  slave_io_running = 10
  slave_sql_running = 11
  s = rdonly1.mquery('', 'show slave status')
  self.assertEqual(s[0][slave_io_running], 'No')
  self.assertEqual(s[0][slave_sql_running], 'No')

  # Now change from replica back to rdonly, make sure replication is
  # still not enabled.
  utils.run_vtctl(['ChangeSlaveType', rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  rdonly1.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
  rdonly1.check_db_status('rpl_semi_sync_slave_status', 'OFF')
  s = rdonly1.mquery('', 'show slave status')
  self.assertEqual(s[0][slave_io_running], 'No')
  self.assertEqual(s[0][slave_sql_running], 'No')

  # Change rdonly2 to replica, should turn on semi-sync, and restart rep.
  utils.run_vtctl(['ChangeSlaveType', rdonly2.tablet_alias, 'replica'],
                  auto_log=True)
  rdonly2.check_db_var('rpl_semi_sync_slave_enabled', 'ON')
  rdonly2.check_db_status('rpl_semi_sync_slave_status', 'ON')

  # Clean up.
  tablet.kill_tablets([master, replica, rdonly1, rdonly2])
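

# check_db_var/check_db_status used above wrap simple SHOW statements on the
# tablet's MySQL. Below is a sketch assuming the standard 'show variables
# like' / 'show status like' queries; the _sketch_ helpers are illustrative,
# not the real Tablet methods.
def _sketch_check_db_var(tablet_obj, name, expected):
  # Row layout of SHOW VARIABLES is (Variable_name, Value).
  rows = tablet_obj.mquery('', "show variables like '%s'" % name)
  assert rows[0][1] == expected, (
      'wrong value for %s: %s' % (name, rows[0][1]))


def _sketch_check_db_status(tablet_obj, name, expected):
  # Row layout of SHOW STATUS is (Variable_name, Value).
  rows = tablet_obj.mquery('', "show status like '%s'" % name)
  assert rows[0][1] == expected, (
      'wrong status for %s: %s' % (name, rows[0][1]))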


def test_reparent_with_down_slave(self, shard_id='0'):
  """See if a missing slave can be safely reparented after the fact."""
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_31981, tablet_41983]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/' + shard_id,
                   tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)
  tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

  utils.wait_procs([tablet_41983.shutdown_mysql()])

  # Perform a graceful reparent operation. It will fail as one tablet is
  # down.
  _, stderr = utils.run_vtctl(['PlannedReparentShard',
                               '-keyspace_shard',
                               'test_keyspace/' + shard_id,
                               '-new_master', tablet_62044.tablet_alias],
                              expect_fail=True)
  self.assertIn('TabletManager.SetMaster on test_nj-0000041983 error',
                stderr)

  # insert data into the new master, check the connected slaves work
  self._populate_vt_insert_test(tablet_62044, 3)
  self._check_vt_insert_test(tablet_31981, 3)
  self._check_vt_insert_test(tablet_62344, 3)

  # restart mysql on the old slave, should still be connecting to the
  # old master
  utils.wait_procs([tablet_41983.start_mysql()])

  utils.pause('check orphan')

  # reparent the tablet (will not start replication, so we have to
  # do it ourselves), then it should catch up on replication really quickly
  utils.run_vtctl(['ReparentTablet', tablet_41983.tablet_alias])
  utils.run_vtctl(['StartSlave', tablet_41983.tablet_alias])

  # wait until it gets the data
  self._check_vt_insert_test(tablet_41983, 3)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])
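

# _populate_vt_insert_test/_check_vt_insert_test are reparent test helpers.
# Below is a sketch of the check side, assuming a vt_insert_test table with
# (id, msg) rows and that replication may need a moment to catch up; the
# _sketch_ name and table layout are assumptions for illustration.
def _sketch_check_vt_insert_test(tablet_obj, index, timeout=10):
  # Poll the tablet until the row written by _populate_vt_insert_test
  # becomes visible, so a freshly reparented slave gets time to catch up.
  while True:
    rows = tablet_obj.mquery(
        'vt_test_keyspace',
        'select msg from vt_insert_test where id=%d' % index)
    if rows:
      return
    timeout = utils.wait_step('waiting for replicated row %d' % index,
                              timeout)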
def test_resharding(self):
  # create the keyspace with just one shard
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_master.init_tablet('master', 'test_keyspace', '0')
  shard_replica.init_tablet('replica', 'test_keyspace', '0')
  shard_rdonly.init_tablet('rdonly', 'test_keyspace', '0')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_master, shard_replica, shard_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_master.wait_for_vttablet_state('SERVING')
  shard_replica.wait_for_vttablet_state('SERVING')
  shard_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                   shard_master.tablet_alias], auto_log=True)

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # create the split shards
  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # take the snapshot for the split
  utils.run_vtctl(['MultiSnapshot', '--spec=-80-',
                   shard_replica.tablet_alias], auto_log=True)

  # wait for tablet's binlog server service to be enabled after snapshot
  shard_replica.wait_for_binlog_server_state("Enabled")

  # perform the restore.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/-80', shard_replica.tablet_alias],
                  auto_log=True)
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-', shard_replica.tablet_alias],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug("Waiting for binlog players to start on new masters...")
  shard_0_master.wait_for_binlog_player_count(1)
  shard_1_master.wait_for_binlog_player_count(1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff for -80")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  logging.debug("Running vtworker SplitDiff for 80-")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/0',
                   'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_0_master.wait_for_binlog_player_count(0)
  shard_1_master.wait_for_binlog_player_count(0)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # scrap the original tablets in the original shard
  for t in [shard_master, shard_replica, shard_rdonly]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly])
  for t in [shard_master, shard_replica, shard_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
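# The -80/80- shard names above are hex prefixes of the 64-bit keyspace_id
# space: a row routes to -80 when its keyspace_id is below 0x80 << 56. A
# small sketch of that routing decision (illustrative only):
def destination_shard(keyspace_id):
  # 0x8000000000000000 is the -80 / 80- split point for uint64 keyspace ids.
  if keyspace_id < 0x8000000000000000:
    return 'test_keyspace/-80'
  return 'test_keyspace/80-'

assert destination_shard(0x1000000000000000) == 'test_keyspace/-80'
assert destination_shard(0x9000000000000000) == 'test_keyspace/80-'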
def test_resharding(self):
  # create the keyspace with just one shard
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_master.init_tablet('master', 'test_keyspace', '0')
  shard_replica.init_tablet('replica', 'test_keyspace', '0')
  shard_rdonly1.init_tablet('rdonly', 'test_keyspace', '0')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_master.wait_for_vttablet_state('SERVING')
  shard_replica.wait_for_vttablet_state('SERVING')
  shard_rdonly1.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                   shard_master.tablet_alias], auto_log=True)

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # create the split shards
  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_0_rdonly1.init_tablet('rdonly', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly1,
            shard_1_master, shard_1_replica, shard_1_rdonly1]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly1,
            shard_1_master, shard_1_replica, shard_1_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)

  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'SplitClone',
                      '--exclude_tables', 'unrelated',
                      '--strategy=-populate_blp_checkpoint',
                      '--source_reader_count', '10',
                      '--min_table_size_for_split', '1',
                      'test_keyspace/0'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug('Waiting for binlog players to start on new masters...')
  shard_0_master.wait_for_binlog_player_count(1)
  shard_1_master.wait_for_binlog_player_count(1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  self._check_lots_timeout(1000, 80, 5)
  logging.debug('Checking all data goes through eventually')
  self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)

  # use vtworker to compare the data
  logging.debug('Running vtworker SplitDiff for -80')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  logging.debug('Running vtworker SplitDiff for 80-')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  source_tablet = shard_replica
  destination_tablets = [shard_0_replica, shard_1_replica]
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse', 'test_keyspace/0',
                   'replica'], auto_log=True)
  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, source_tablet, True, False)
  utils.check_tablet_query_services(self, destination_tablets, False, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=keyspace_id_type)

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, source_tablet, False, True)
  utils.check_tablet_query_services(self, destination_tablets, True, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_0_master.wait_for_binlog_player_count(0)
  shard_1_master.wait_for_binlog_player_count(0)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # remove the original tablets in the original shard
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
  for t in [shard_replica, shard_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1,
                       shard_1_master, shard_1_replica, shard_1_rdonly1])
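# The 80-percent-then-100-percent checks above rely on a poll-with-deadline
# helper. A sketch of what _check_lots_timeout presumably does (names and
# the percent-counting callback are assumptions):
import time

def check_lots_timeout(count, threshold, timeout, percent_present):
  # percent_present(count) returns the % of the `count` inserted rows that
  # are already visible on the destination shards.
  while True:
    percent = percent_present(count)
    if percent >= threshold:
      return percent
    timeout -= 1
    if timeout <= 0:
      raise Exception('only %d%% of the data migrated in time' % percent)
    time.sleep(1)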
def test_health_check_worker_state_does_not_shutdown_query_service(self):
  # This test is similar to test_health_check, but has the following
  # differences:
  # - the second tablet is an 'rdonly' and not a 'replica'
  # - the second tablet will be set to 'worker' and we expect that
  #   the query service won't be shutdown

  # Setup master and rdonly tablets.
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
  tablet_62344.start_vttablet(wait_for_state=None,
                              target_tablet_type='replica')
  tablet_62044.start_vttablet(wait_for_state=None,
                              target_tablet_type='rdonly',
                              init_keyspace='test_keyspace',
                              init_shard='0')
  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(tablet_62044, False)

  # Enable replication.
  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                   tablet_62344.tablet_alias])
  # Trigger healthcheck to save time waiting for the next interval.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias, 'rdonly'])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'rdonly')
  self.check_healthz(tablet_62044, True)
  tablet_62044.wait_for_vttablet_state('SERVING')

  # Change from rdonly to worker and stop replication. (These
  # actions are similar to the SplitClone vtworker command
  # implementation.) The tablet will become unhealthy, but the
  # query service is still running.
  utils.run_vtctl(['ChangeSlaveType', tablet_62044.tablet_alias, 'worker'])
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
  # Trigger healthcheck explicitly to avoid waiting for the next interval.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias, 'rdonly'])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'worker')
  self.check_healthz(tablet_62044, False)
  # Make sure that replication got disabled.
  self.assertIn(
      '>unhealthy: replication_reporter: '
      'Replication is not running</span></div>',
      tablet_62044.get_status())
  # Query service is still running.
  tablet_62044.wait_for_vttablet_state('SERVING')

  # Restart replication. Tablet will become healthy again.
  utils.run_vtctl(['ChangeSlaveType', tablet_62044.tablet_alias, 'spare'])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'spare')
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias, 'rdonly'])
  self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'rdonly')
  self.check_healthz(tablet_62044, True)
  tablet_62044.wait_for_vttablet_state('SERVING')

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])
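# check_healthz above boils down to an HTTP GET against the tablet's
# /healthz endpoint. A minimal sketch, assuming the tablet serves it on
# its web port (urllib2, Python 2 style to match this code base):
import urllib2

def check_healthz(t, expect_ok):
  url = 'http://localhost:%d/healthz' % t.port
  try:
    ok = urllib2.urlopen(url).getcode() == 200
  except urllib2.URLError:
    ok = False  # non-2xx response or connection failure means unhealthy
  assert ok == expect_ok, 'unexpected healthz state for %s' % url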
def test_reparent_slave_offline(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force',
                   'test_keyspace/' + shard_id, tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)
  self._check_master_tablet(tablet_62344)

  # Kill one tablet so we seem offline
  tablet_31981.kill_vttablet()

  # Perform a graceful reparent operation.
  _, stderr = utils.run_vtctl(['PlannedReparentShard',
                               '-keyspace_shard',
                               'test_keyspace/' + shard_id,
                               '-new_master', tablet_62044.tablet_alias],
                              expect_fail=True)
  self.assertIn('Tablet test_ny-0000031981 SetMaster failed', stderr)
  self._check_master_tablet(tablet_62044)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])
def test_reparent_lag_slave(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('lag', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_31981]:
    t.wait_for_vttablet_state('SERVING')
  tablet_41983.wait_for_vttablet_state('NOT_SERVING')

  # Recompute the shard layout node - until you do that, it might not be
  # valid.
  utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
  utils.validate_topology()

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.reset_replication()
  utils.run_vtctl(['ReparentShard', '-force',
                   'test_keyspace/' + shard_id, tablet_62344.tablet_alias])
  utils.validate_topology(ping_tablets=True)

  tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

  tablet_41983.mquery('', 'stop slave')
  for q in self._populate_vt_insert_test:
    tablet_62344.mquery('vt_test_keyspace', q, write=True)

  # Perform a graceful reparent operation.
  utils.run_vtctl(['ReparentShard', 'test_keyspace/' + shard_id,
                   tablet_62044.tablet_alias])

  tablet_41983.mquery('', 'start slave')
  time.sleep(1)

  utils.pause('check orphan')

  utils.run_vtctl(['ReparentTablet', tablet_41983.tablet_alias])

  result = tablet_41983.mquery(
      'vt_test_keyspace', 'select msg from vt_insert_test where id=1')
  if len(result) != 1:
    self.fail('expected 1 row from vt_insert_test: %s' % str(result))

  utils.pause('check lag reparent')

  tablet.kill_tablets(
      [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
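# reset_replication above wipes each mysqld's replication state before the
# forced reparent. A sketch of what it plausibly runs at the MySQL level
# (illustrative only; the real statements are flavor-specific):
def reset_replication(t):
  t.mquery('', 'STOP SLAVE')
  t.mquery('', 'RESET SLAVE ALL')  # forget the old master entirely
  t.mquery('', 'RESET MASTER')     # clear this server's own binlogs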
def test_merge_sharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'custom_ksid_col',
                   '--sharding_column_type', base_sharding.keyspace_id_type,
                   'test_keyspace'])

  shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
  shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
  shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
  shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
  shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  # rebuild and check SrvKeyspace
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
  self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # won't be serving, no replication state
  for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
            shard_1_master, shard_1_replica, shard_1_rdonly,
            shard_2_master, shard_2_replica, shard_2_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                   shard_1_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_2_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # run a health check on source replicas so they respond to discovery
  # (for binlog players) and on the source rdonlys (for workers)
  for t in [shard_0_replica, shard_1_replica]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  for t in [shard_0_rdonly, shard_1_rdonly]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

  # create the merge shards
  shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

  # start vttablet on the destination shard (no db created,
  # so they're all not serving)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.start_vttablet(wait_for_state=None)
  for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_dest_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -40 40-80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # copy the schema
  utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                   'test_keyspace/-80'], auto_log=True)

  # copy the data (will also start filtered replication), reset source
  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms'],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                   worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_dest_master.mquery('vt_test_keyspace',
                           'delete from resharding1 where id=1', write=True)
  # Update row 2 (provokes an update).
  shard_dest_master.mquery(
      'vt_test_keyspace',
      "update resharding1 set msg='msg-not-2' where id=2", write=True)
  # Insert row 0 (provokes a delete).
  self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                     0x5000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/-80'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Change tablets, which were taken offline, back to rdonly.
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      1, 1, 1, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 2)

  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check binlog player variables
  self.check_destination_master(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_0_replica, horizontal=True)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 0 and 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shards')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 10)
  if v != 100:
    # small optimization: only do this check if we don't have all the data
    # already anyway.
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 30)
  self.check_binlog_player_vars(shard_dest_master,
                                ['test_keyspace/-40', 'test_keyspace/40-80'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)
  self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data (after health-checking the destination
  # rdonly tablets so discovery works)
  utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
  logging.debug('Running vtworker SplitDiff on first half')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '0',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  logging.debug('Running vtworker SplitDiff on second half')
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                      '--exclude_tables', 'unrelated',
                      '--min_healthy_rdonly_tablets', '1',
                      '--source_uid', '1',
                      'test_keyspace/-80'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                   'rdonly'], auto_log=True)

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_dest_master, 3000, 1000)

  # check destination master query service is not running
  utils.check_tablet_query_service(self, shard_dest_master, False, False)
  stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                        '-count', '1',
                                        shard_dest_master.tablet_alias])
  logging.debug('Got health: %s', str(stream_health))
  self.assertIn('realtime_stats', stream_health)
  self.assertNotIn('serving', stream_health)

  # check the destination master 3 is healthy, even though its query
  # service is not running (if not healthy this would exception out)
  shard_dest_master.get_healthz()

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -40 40-80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # now serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -40 40-80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')

  # now serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace(
      'test_nj', 'test_keyspace',
      'Partitions(master): -80 80-\n'
      'Partitions(rdonly): -80 80-\n'
      'Partitions(replica): -80 80-\n',
      keyspace_id_type=base_sharding.keyspace_id_type,
      sharding_column_name='custom_ksid_col')
  utils.check_tablet_query_service(self, shard_0_master, False, True)
  utils.check_tablet_query_service(self, shard_1_master, False, True)

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_dest_master)

  # kill the original tablets in the original shards
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                       shard_1_master, shard_1_replica, shard_1_rdonly])
  for t in [shard_0_replica, shard_0_rdonly,
            shard_1_replica, shard_1_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  for t in [shard_0_master, shard_1_master]:
    utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                    auto_log=True)

  # delete the original shards
  utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
  utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                       shard_dest_master, shard_dest_replica,
                       shard_dest_rdonly])
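# The merge above is only legal because -40 and 40-80 are adjacent; their
# union is the contiguous range -80. A tiny sketch of that adjacency check
# on hex shard names (hypothetical helper):
def merge_key_ranges(a, b):
  # '-40' -> ('', '40'); '' means an open end of the keyspace_id space.
  a_start, a_end = a.split('-')
  b_start, b_end = b.split('-')
  if a_end != b_start:
    raise ValueError('ranges %s and %s are not adjacent' % (a, b))
  return '%s-%s' % (a_start, b_end)

assert merge_key_ranges('-40', '40-80') == '-80'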
def _test_reparent_from_outside(self, brutal=False):
  """This test will start a master and 3 slaves.

  Then:
  - one slave will be the new master
  - one slave will be reparented to that new master
  - one slave will be busted and dead in the water
  and we'll call TabletExternallyReparented.

  Args:
    brutal: kills the old master first
  """
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Reparent as a starting point
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias], auto_log=True)

  # now manually reparent 1 out of 2 tablets
  # 62044 will be the new master
  # 31981 won't be re-parented, so it will be busted

  # Shutdown the old master first.
  if not brutal:
    tablet_62344.mquery('', mysql_flavor().demote_master_commands())

    # Get the position of the old master and wait for the new one to
    # catch up.
    utils.wait_for_replication_pos(tablet_62344, tablet_62044)

  # Promote the new master.
  tablet_62044.mquery('', mysql_flavor().promote_slave_commands())
  new_pos = mysql_flavor().master_position(tablet_62044)
  logging.debug('New master position: %s', str(new_pos))
  # Use 'localhost' as hostname because Travis CI worker hostnames
  # are too long for MySQL replication.
  change_master_cmds = mysql_flavor().change_master_commands(
      'localhost', tablet_62044.mysql_port, new_pos)

  # 62344 will now be a slave of 62044
  tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                      change_master_cmds + ['START SLAVE'])

  # 41983 will be a slave of 62044
  tablet_41983.mquery('', ['STOP SLAVE'] + change_master_cmds +
                      ['START SLAVE'])

  # in brutal mode, we kill the old master first
  # and delete its tablet record
  if brutal:
    tablet_62344.kill_vttablet()
    utils.run_vtctl(['DeleteTablet', '-allow_master',
                     tablet_62344.tablet_alias], auto_log=True)

  base_time = time.time()

  # update topology with the new server
  utils.run_vtctl(['TabletExternallyReparented', tablet_62044.tablet_alias],
                  mode=utils.VTCTL_VTCTL, auto_log=True)

  self._test_reparent_from_outside_check(brutal, base_time)

  if not brutal:
    tablet_62344.kill_vttablet()
  tablet.kill_tablets([tablet_31981, tablet_62044, tablet_41983])
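# change_master_commands above is flavor-specific; the test only cares that
# it points a slave at host:port from a given position. For a MariaDB-GTID
# flavor it could look roughly like this (illustrative only, not the real
# mysql_flavor() implementation; the replication user name is an
# assumption):
def change_master_commands(host, port, gtid_pos):
  return [
      "SET GLOBAL gtid_slave_pos = '%s'" % gtid_pos,
      "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, "
      "MASTER_USER='vt_repl', MASTER_USE_GTID = slave_pos" % (host, port),
  ]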
def test_vertical_split(self):
  utils.run_vtctl(['CreateKeyspace', 'source_keyspace'])
  utils.run_vtctl(['CreateKeyspace',
                   '--served_from',
                   'master:source_keyspace,replica:source_keyspace,'
                   'rdonly:source_keyspace',
                   'destination_keyspace'])
  source_master.init_tablet('master', 'source_keyspace', '0')
  source_replica.init_tablet('replica', 'source_keyspace', '0')
  source_rdonly1.init_tablet('rdonly', 'source_keyspace', '0')
  source_rdonly2.init_tablet('rdonly', 'source_keyspace', '0')

  # rebuild destination keyspace to make sure there is a serving
  # graph entry, even though there is no tablet yet.
  utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'],
                  auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                           'ServedFrom(rdonly): source_keyspace\n' +
                           'ServedFrom(replica): source_keyspace\n')

  destination_master.init_tablet('master', 'destination_keyspace', '0')
  destination_replica.init_tablet('replica', 'destination_keyspace', '0')
  destination_rdonly1.init_tablet('rdonly', 'destination_keyspace', '0')
  destination_rdonly2.init_tablet('rdonly', 'destination_keyspace', '0')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'],
                  auto_log=True)
  utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                           'ServedFrom(rdonly): source_keyspace\n' +
                           'ServedFrom(replica): source_keyspace\n')

  # create databases so vttablet can start behaving normally
  for t in [source_master, source_replica, source_rdonly1, source_rdonly2]:
    t.create_db('vt_source_keyspace')
    t.start_vttablet(wait_for_state=None)
  destination_master.start_vttablet(wait_for_state=None,
                                    target_tablet_type='replica')
  for t in [destination_replica, destination_rdonly1, destination_rdonly2]:
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  for t in [source_master, source_replica, source_rdonly1, source_rdonly2]:
    t.wait_for_vttablet_state('SERVING')
  for t in [destination_master, destination_replica, destination_rdonly1,
            destination_rdonly2]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', 'source_keyspace/0',
                   source_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', 'destination_keyspace/0',
                   destination_master.tablet_alias], auto_log=True)

  # read all the keyspaces, this will populate the topology cache.
  self._populate_topo_cache()

  # create the schema on the source keyspace, add some values
  self._create_source_schema()
  moving1_first = self._insert_values('moving1', 100)
  moving2_first = self._insert_values('moving2', 100)
  staying1_first = self._insert_values('staying1', 100)
  staying2_first = self._insert_values('staying2', 100)
  self._check_values(source_master, 'vt_source_keyspace', 'moving1',
                     moving1_first, 100)
  self._check_values(source_master, 'vt_source_keyspace', 'moving2',
                     moving2_first, 100)
  self._check_values(source_master, 'vt_source_keyspace', 'staying1',
                     staying1_first, 100)
  self._check_values(source_master, 'vt_source_keyspace', 'staying2',
                     staying2_first, 100)
  self._check_values(source_master, 'vt_source_keyspace', 'view1',
                     moving1_first, 100)

  # the worker will do everything. We test with source_reader_count=10
  # (down from default=20) as connection pool is not big enough for 20.
  # min_table_size_for_split is set to 1 as to force a split even on the
  # small table we have.
  utils.run_vtctl(['CopySchemaShard', '--tables', 'moving.*,view1',
                   source_rdonly1.tablet_alias, 'destination_keyspace/0'],
                  auto_log=True)

  utils.run_vtworker(['--cell', 'test_nj',
                      '--command_display_interval', '10ms',
                      'VerticalSplitClone',
                      '--tables', 'moving.*,view1',
                      '--strategy=-populate_blp_checkpoint',
                      '--source_reader_count', '10',
                      '--min_table_size_for_split', '1',
                      'destination_keyspace/0'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', source_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', source_rdonly2.tablet_alias,
                   'rdonly'], auto_log=True)

  topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace')

  # check values are present
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'moving1', moving1_first, 100)
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'moving2', moving2_first, 100)
  self._check_values(destination_master, 'vt_destination_keyspace',
                     'view1', moving1_first, 100)

  # check the binlog player is running
  destination_master.wait_for_binlog_player_count(1)

  # add values to source, make sure they're replicated
  moving1_first_add1 = self._insert_values('moving1', 100)
  staying1_first_add1 = self._insert_values('staying1', 100)
  moving2_first_add1 = self._insert_values('moving2', 100)
  self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                             'moving1', moving1_first_add1, 100)
  self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                             'moving2', moving2_first_add1, 100)

  # use vtworker to compare the data
  logging.debug("Running vtworker VerticalSplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff',
                      'destination_keyspace/0'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', source_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', source_rdonly2.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', destination_rdonly1.tablet_alias,
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', destination_rdonly2.tablet_alias,
                   'rdonly'], auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # get status for destination master tablet, make sure we have it all
  destination_master_status = destination_master.get_status()
  self.assertIn('Binlog player state: Running', destination_master_status)
  self.assertIn('moving.*', destination_master_status)
  self.assertIn('<td><b>All</b>: 1000<br><b>Query</b>: 700<br>'
                '<b>Transaction</b>: 300<br></td>',
                destination_master_status)
  self.assertIn('</html>', destination_master_status)

  # check query service is off on destination master, as filtered
  # replication is enabled. Even health check should not interfere.
  destination_master_vars = utils.get_vars(destination_master.port)
  self.assertEqual(destination_master_vars['TabletStateName'],
                   'NOT_SERVING')

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                  expect_fail=True)

  # migrate rdonly only in test_ny cell, make sure nothing is migrated
  # in test_nj
  utils.run_vtctl(['MigrateServedFrom', '--cells=test_ny',
                   'destination_keyspace/0', 'rdonly'], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                           'ServedFrom(rdonly): source_keyspace\n' +
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, None)
  self._check_blacklisted_tables(source_rdonly2, None)

  # migrate test_nj only, using command line manual fix command,
  # and restore it back.
  keyspace_json = utils.run_vtctl_json(['GetKeyspace',
                                        'destination_keyspace'])
  self.assertEqual(keyspace_json['ServedFromMap']['rdonly']['Cells'],
                   ['test_nj'])
  utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace',
                   '-remove', '-cells=test_nj', 'destination_keyspace',
                   'rdonly'], auto_log=True)
  keyspace_json = utils.run_vtctl_json(['GetKeyspace',
                                        'destination_keyspace'])
  self.assertFalse('rdonly' in keyspace_json['ServedFromMap'])
  utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace',
                   'destination_keyspace', 'rdonly'], auto_log=True)
  keyspace_json = utils.run_vtctl_json(['GetKeyspace',
                                        'destination_keyspace'])
  self.assertEqual(keyspace_json['ServedFromMap']['rdonly']['Cells'], None)

  # now serve rdonly from the destination shards
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'],
                  auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection(
      'source_keyspace', 'destination_keyspace', ['rdonly'],
      ['master', 'replica'], ['moving1', 'moving2'])

  # then serve replica from the destination shards
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0',
                   'replica'], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection(
      'source_keyspace', 'destination_keyspace', ['replica', 'rdonly'],
      ['master'], ['moving1', 'moving2'])

  # move replica back and forth
  utils.run_vtctl(['MigrateServedFrom', '-reverse',
                   'destination_keyspace/0', 'replica'], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                           'ServedFrom(replica): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, None)
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0',
                   'replica'], auto_log=True)
  self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
  self._check_blacklisted_tables(source_master, None)
  self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection(
      'source_keyspace', 'destination_keyspace', ['replica', 'rdonly'],
      ['master'], ['moving1', 'moving2'])

  # then serve master from the destination shards
  utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                  auto_log=True)
  self._check_srv_keyspace('')
  self._check_blacklisted_tables(source_master, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
  self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
  self._check_client_conn_redirection(
      'source_keyspace', 'destination_keyspace',
      ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2'])

  # check 'vtctl SetShardTabletControl' command works as expected:
  # clear the rdonly entry, re-add it, and then clear all entries.
  utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                   'rdonly'], auto_log=True)
  shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
  self.assertNotIn('rdonly', shard_json['TabletControlMap'])
  self.assertIn('replica', shard_json['TabletControlMap'])
  self.assertIn('master', shard_json['TabletControlMap'])
  utils.run_vtctl(['SetShardTabletControl', '--tables=moving.*,view1',
                   'source_keyspace/0', 'rdonly'], auto_log=True)
  shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
  self.assertEqual(
      ['moving.*', 'view1'],
      shard_json['TabletControlMap']['rdonly']['BlacklistedTables'])
  utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                   'rdonly'], auto_log=True)
  utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                   'replica'], auto_log=True)
  utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                   'master'], auto_log=True)
  shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
  self.assertEqual(None, shard_json['TabletControlMap'])

  # check the binlog player is gone now
  destination_master.wait_for_binlog_player_count(0)

  # optional method to check the stats are correct
  self._check_stats()

  # kill everything
  tablet.kill_tablets([source_master, source_replica, source_rdonly1,
                       source_rdonly2, destination_master,
                       destination_replica, destination_rdonly1,
                       destination_rdonly2])
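# The blacklist entries above mix literal names ('view1') and patterns
# ('moving.*'). A sketch of how a tablet could apply them, assuming
# anchored regexp matching on table names:
import re

def is_blacklisted(table, patterns):
  return any(re.match(p + '$', table) for p in patterns)

assert is_blacklisted('moving1', ['moving.*', 'view1'])
assert is_blacklisted('view1', ['moving.*', 'view1'])
assert not is_blacklisted('staying1', ['moving.*', 'view1'])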
def test_reparent_cross_cell(self, shard_id='0'):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(
      shard['cells'], ['test_nj'],
      'wrong list of cells in Shard: %s' % str(shard['cells']))

  # Create a few slaves for testing reparenting. Won't be healthy
  # as replication is not running.
  tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                           wait_for_start=False)
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
  self.assertEqual(
      shard['cells'], ['test_nj', 'test_ny'],
      'wrong list of cells in Shard: %s' % str(shard['cells']))

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force',
                   'test_keyspace/' + shard_id, tablet_62344.tablet_alias],
                  auto_log=True)
  utils.validate_topology(ping_tablets=True)

  self._check_master_tablet(tablet_62344)

  # Perform a graceful reparent operation to another cell.
  utils.run_vtctl(['PlannedReparentShard',
                   '-keyspace_shard', 'test_keyspace/' + shard_id,
                   '-new_master', tablet_31981.tablet_alias],
                  auto_log=True)
  utils.validate_topology()

  self._check_master_tablet(tablet_31981)

  tablet.kill_tablets(
      [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
def test_resharding(self): utils.run_vtctl([ 'CreateKeyspace', '--sharding_column_name', 'bad_column', '--sharding_column_type', 'bytes', 'test_keyspace' ]) utils.run_vtctl([ 'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64' ], expect_fail=True) utils.run_vtctl([ 'SetKeyspaceShardingInfo', '-force', 'test_keyspace', 'keyspace_id', keyspace_id_type ]) shard_0_master.init_tablet('master', 'test_keyspace', '-80') shard_0_replica.init_tablet('replica', 'test_keyspace', '-80') shard_0_ny_slave.init_tablet('spare', 'test_keyspace', '-80') shard_1_master.init_tablet('master', 'test_keyspace', '80-') shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-') shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-') shard_1_ny_slave.init_tablet('spare', 'test_keyspace', '80-') shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-') utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) # we set full_mycnf_args to True as a test in the KIT_BYTES case full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES # create databases so vttablet can start behaving normally for t in [ shard_0_master, shard_0_replica, shard_0_ny_slave, shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave, shard_1_rdonly ]: t.create_db('vt_test_keyspace') t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args) # wait for the tablets shard_0_master.wait_for_vttablet_state('SERVING') shard_0_replica.wait_for_vttablet_state('SERVING') shard_0_ny_slave.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_master.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('SERVING') shard_1_slave2.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_ny_slave.wait_for_vttablet_state('NOT_SERVING') # spare shard_1_rdonly.wait_for_vttablet_state('SERVING') # reparent to make the tablets work utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/-80', shard_0_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-', shard_1_master.tablet_alias ], auto_log=True) # create the tables self._create_schema() self._insert_startup_values() self._test_keyrange_constraints() # create the split shards shard_2_master.init_tablet('master', 'test_keyspace', '80-c0') shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0') shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0') shard_3_master.init_tablet('master', 'test_keyspace', 'c0-') shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-') shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'c0-') # start vttablet on the split shards (no db created, # so they're all not serving) shard_3_master.start_vttablet(wait_for_state=None, target_tablet_type='replica') for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_replica, shard_3_rdonly ]: t.start_vttablet(wait_for_state=None) for t in [ shard_2_master, shard_2_replica1, shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly ]: t.wait_for_vttablet_state('NOT_SERVING') utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/80-c0', shard_2_master.tablet_alias ], auto_log=True) utils.run_vtctl([ 'ReparentShard', '-force', 'test_keyspace/c0-', shard_3_master.tablet_alias ], auto_log=True) utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True) utils.check_srv_keyspace('test_nj', 'test_keyspace', 'Partitions(master): -80 80-\n' + 'Partitions(rdonly): -80 80-\n' + 'Partitions(replica): -80 80-\n' + 'TabletTypes: 
master,rdonly,replica', keyspace_id_type=keyspace_id_type) # take the snapshot for the split utils.run_vtctl([ 'MultiSnapshot', '--spec=80-c0-', '--exclude_tables=unrelated', shard_1_slave1.tablet_alias ], auto_log=True) # the snapshot_copy hook will copy the snapshot files to # VTDATAROOT/tmp/... as a test. We want to use these for one half, # but not for the other, so we test both scenarios. os.unlink( os.path.join( environment.tmproot, "snapshot-from-%s-for-%s.tar" % (shard_1_slave1.tablet_alias, "80-c0"))) # wait for tablet's binlog server service to be enabled after snapshot shard_1_slave1.wait_for_binlog_server_state("Enabled") # perform the restores: first one from source tablet. We removed the # storage backup, so it's coming from the tablet itself. # we also delay starting the binlog player, then enable it. utils.run_vtctl([ 'ShardMultiRestore', '-strategy=populateBlpCheckpoint,dontStartBinlogPlayer', 'test_keyspace/80-c0', shard_1_slave1.tablet_alias ], auto_log=True) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if not "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step( 'shard 2 master has not failed starting yet', timeout) continue logging.debug("shard 2 master is waiting on flag removal, good") break qr = utils.run_vtctl_json([ 'ExecuteFetch', shard_2_master.tablet_alias, 'update _vt.blp_checkpoint set flags="" where source_shard_uid=0' ]) self.assertEqual(qr['RowsAffected'], 1) timeout = 10 while True: shard_2_master_status = shard_2_master.get_status() if "not starting because flag 'DontStart' is set" in shard_2_master_status: timeout = utils.wait_step( 'shard 2 master has not started replication yet', timeout) continue logging.debug("shard 2 master has started replication, good") break # second restore from storage: to be sure, we stop vttablet, and restart # it afterwards shard_1_slave1.kill_vttablet() utils.run_vtctl([ 'ShardMultiRestore', '-strategy=populateBlpCheckpoint', 'test_keyspace/c0-', shard_1_slave1.tablet_alias ], auto_log=True) shard_1_slave1.start_vttablet(wait_for_state=None) shard_1_slave1.wait_for_binlog_server_state("Enabled") # check the startup values are in the right place self._check_startup_values() # check the schema too utils.run_vtctl([ 'ValidateSchemaKeyspace', '--exclude_tables=unrelated', 'test_keyspace' ], auto_log=True) # check the binlog players are running and exporting vars shard_2_master.wait_for_binlog_player_count(1) shard_3_master.wait_for_binlog_player_count(1) self._check_binlog_player_vars(shard_2_master) self._check_binlog_player_vars(shard_3_master) # check that binlog server exported the stats vars self._check_binlog_server_vars(shard_1_slave1) # testing filtered replication: insert a bunch of data on shard 1, # check we get most of it after a few seconds, wait for binlog server # timeout, check we get all of it. 
logging.debug("Inserting lots of data on source shard") self._insert_lots(1000) logging.debug("Checking 80 percent of data is sent quickly") self._check_lots_timeout(1000, 80, 5) logging.debug("Checking all data goes through eventually") self._check_lots_timeout(1000, 100, 20) logging.debug("Checking no data was sent the wrong way") self._check_lots_not_present(1000) self._check_binlog_player_vars(shard_2_master, seconds_behind_master_max=30) self._check_binlog_player_vars(shard_3_master, seconds_behind_master_max=30) # use the vtworker checker to compare the data logging.debug("Running vtworker SplitDiff") utils.run_vtworker( ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.run_vtctl( ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'], auto_log=True) utils.pause("Good time to test vtworker for diffs") # get status for a destination master tablet, make sure we have it all shard_2_master_status = shard_2_master.get_status() self.assertIn('Binlog player state: Running', shard_2_master_status) self.assertIn( '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>', shard_2_master_status) self.assertIn('</html>', shard_2_master_status) # start a thread to insert data into shard_1 in the background # with current time, and monitor the delay insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000, 0x9000000000000000) insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001, 0xD000000000000000) monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low") monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high") # tests a failover switching serving to a different replica utils.run_vtctl( ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica']) utils.run_vtctl( ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare']) shard_1_slave2.wait_for_vttablet_state('SERVING') shard_1_slave1.wait_for_vttablet_state('NOT_SERVING') # test data goes through again logging.debug("Inserting lots of data on source shard") self._insert_lots(1000, base=1000) logging.debug("Checking 80 percent of data was sent quickly") self._check_lots_timeout(1000, 80, 5, base=1000) # check we can't migrate the master just yet utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'], expect_fail=True) # check query service is off on master 2 and master 3, as filtered # replication is enabled. Even health check that is enabled on # master 3 should not interfere. 
  shard_2_master_vars = utils.get_vars(shard_2_master.port)
  self.assertEqual(shard_2_master_vars['TabletStateName'], 'NOT_SERVING')
  shard_3_master_vars = utils.get_vars(shard_3_master.port)
  self.assertEqual(shard_3_master_vars['TabletStateName'], 'NOT_SERVING')

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-c0 c0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/80-', 'replica'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-c0 c0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['ReparentShard', 'test_keyspace/80-c0',
                   shard_2_replica1.tablet_alias])
  logging.debug("Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use the vtworker checker to compare the data again
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name, monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name, monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-c0 c0-\n' +
                           'Partitions(rdonly): -80 80-c0 c0-\n' +
                           'Partitions(replica): -80 80-c0 c0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # get status for a destination master tablet, make sure it's good
  shard_2_master_status = shard_2_master.get_status()
  self.assertIn('No binlog player is running', shard_2_master_status)
  self.assertIn('</html>', shard_2_master_status)

  # scrap the original tablets in the original shard
  for t in [shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_slave, shard_1_rdonly]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                       shard_1_ny_slave, shard_1_rdonly])
  for t in [shard_1_master, shard_1_slave1, shard_1_slave2,
            shard_1_ny_slave, shard_1_rdonly]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  if shard['Cells']:
    self.fail("Non-empty Cells record for shard: %s" % str(shard))

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_slave,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly])
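
# The two status-polling loops in the test above follow this suite's
# standard pattern: poll, and on each miss let utils.wait_step sleep and
# decrement the remaining budget (it raises once the timeout is exhausted).
# Below is a minimal, illustrative generalization of that pattern; the
# helper name is hypothetical and not part of the actual test suite.
def _wait_for_status_substring(t, substring, message, timeout=10,
                               present=True):
  """Polls t.get_status() until `substring` is (or is no longer) in it."""
  while True:
    status = t.get_status()
    if (substring in status) == present:
      return status
    # utils.wait_step sleeps, logs `message`, and raises on timeout.
    timeout = utils.wait_step(message, timeout)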
def test_resharding(self):
  utils.run_vtctl(['CreateKeyspace',
                   '--sharding_column_name', 'bad_column',
                   '--sharding_column_type', 'bytes',
                   'test_keyspace'])
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'keyspace_id', 'uint64'], expect_fail=True)
  utils.run_vtctl(['SetKeyspaceShardingInfo', '-force', 'test_keyspace',
                   'keyspace_id', keyspace_id_type])

  shard_0_master.init_tablet('master', 'test_keyspace', '-80')
  shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
  shard_1_master.init_tablet('master', 'test_keyspace', '80-')
  shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
  shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
  shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # create databases so vttablet can start behaving normally
  for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_slave1,
            shard_1_slave2, shard_1_rdonly]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None)

  # wait for the tablets
  shard_0_master.wait_for_vttablet_state('SERVING')
  shard_0_replica.wait_for_vttablet_state('SERVING')
  shard_1_master.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('SERVING')
  shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
  shard_1_rdonly.wait_for_vttablet_state('SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  # create the tables
  self._create_schema()
  self._insert_startup_values()

  # create the split shards
  shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
  shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
  shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
  shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')
  shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-')

  # start vttablet on the split shards (no db created,
  # so they're all not serving)
  for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
            shard_3_master, shard_3_replica, shard_3_rdonly]:
    t.start_vttablet(wait_for_state=None)
  shard_2_master.wait_for_vttablet_state('CONNECTING')
  shard_2_replica1.wait_for_vttablet_state('NOT_SERVING')
  shard_2_replica2.wait_for_vttablet_state('NOT_SERVING')
  shard_3_master.wait_for_vttablet_state('CONNECTING')
  shard_3_replica.wait_for_vttablet_state('NOT_SERVING')
  shard_3_rdonly.wait_for_vttablet_state('CONNECTING')

  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/80-C0',
                   shard_2_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/C0-',
                   shard_3_master.tablet_alias], auto_log=True)

  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # take the snapshot for the split
  utils.run_vtctl(['MultiSnapshot', '--spec=80-C0-',
                   shard_1_slave1.tablet_alias], auto_log=True)

  # wait for tablet's binlog server service to be enabled after snapshot,
  # and check all the others while we're at it
  shard_1_slave1.wait_for_binlog_server_state("Enabled")

  # perform the restore.
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/80-C0', shard_1_slave1.tablet_alias],
                  auto_log=True)
  utils.run_vtctl(['ShardMultiRestore', '-strategy=populateBlpCheckpoint',
                   'test_keyspace/C0-', shard_1_slave1.tablet_alias],
                  auto_log=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  shard_2_master.wait_for_binlog_player_count(1)
  shard_3_master.wait_for_binlog_player_count(1)

  # check that binlog server exported the stats vars
  self._check_binlog_server_vars(shard_1_slave1)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000)
  logging.debug("Checking 80 percent of data is sent quickly")
  self._check_lots_timeout(1000, 80, 5)
  logging.debug("Checking all data goes through eventually")
  self._check_lots_timeout(1000, 100, 20)
  logging.debug("Checking no data was sent the wrong way")
  self._check_lots_not_present(1000)

  # use the vtworker checker to compare the data
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  utils.pause("Good time to test vtworker for diffs")

  # start a thread to insert data into shard_1 in the background
  # with current time, and monitor the delay
  insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                 0x9000000000000000)
  insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                 0xD000000000000000)
  monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
  monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

  # tests a failover switching serving to a different replica
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
  utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
  shard_1_slave2.wait_for_vttablet_state('SERVING')
  shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

  # test data goes through again
  logging.debug("Inserting lots of data on source shard")
  self._insert_lots(1000, base=1000)
  logging.debug("Checking 80 percent of data was sent quickly")
  self._check_lots_timeout(1000, 80, 5, base=1000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # then serve replica from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # move replica back and forth
  utils.run_vtctl(
      ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
      auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # reparent shard_2 to shard_2_replica1, then insert more data and
  # see it flow through still
  utils.run_vtctl(['ReparentShard', 'test_keyspace/80-C0',
                   shard_2_replica1.tablet_alias])
  logging.debug("Inserting lots of data on source shard after reparenting")
  self._insert_lots(3000, base=2000)
  logging.debug("Checking 80 percent of data was sent fairly quickly")
  self._check_lots_timeout(3000, 80, 10, base=2000)

  # use the vtworker checker to compare the data again
  logging.debug("Running vtworker SplitDiff")
  utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'],
                     auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)
  utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
                  auto_log=True)

  # going to migrate the master now, check the delays
  monitor_thread_1.done = True
  monitor_thread_2.done = True
  insert_thread_1.done = True
  insert_thread_2.done = True
  logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                monitor_thread_1.object_name, monitor_thread_1.max_lag,
                monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
  logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                monitor_thread_2.object_name, monitor_thread_2.max_lag,
                monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-C0 C0-\n' +
                           'Partitions(rdonly): -80 80-C0 C0-\n' +
                           'Partitions(replica): -80 80-C0 C0-\n' +
                           'TabletTypes: master,rdonly,replica',
                           keyspace_id_type=keyspace_id_type)

  # check the binlog players are gone now
  shard_2_master.wait_for_binlog_player_count(0)
  shard_3_master.wait_for_binlog_player_count(0)

  # scrap the original tablets in the original shard
  for t in [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly]:
    utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
  tablet.kill_tablets(
      [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly])

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # test RemoveShardCell
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                  auto_log=True, expect_fail=True)
  utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                  auto_log=True)
  shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
  if shard['Cells']:
    self.fail("Non-empty Cells record for shard: %s" % str(shard))

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

  # kill everything
  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_2_master, shard_2_replica1, shard_2_replica2,
                       shard_3_master, shard_3_replica, shard_3_rdonly])
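
# The shard names used above encode keyspace_id ranges: '80-C0' covers
# ids whose leading byte is in [0x80, 0xC0), and 'C0-' covers the rest up
# to 2^64. That is why the insert threads use 0x9000000000000000 (lands
# in 80-C0) and 0xD000000000000000 (lands in C0-). A minimal sketch of
# that routing rule, assuming uint64 keyspace ids; this helper is
# illustrative only and not part of the test suite.
def _shard_for_keyspace_id(keyspace_id):
  """Returns which of the split shards owns a uint64 keyspace_id."""
  if keyspace_id < 0x8000000000000000:
    return '-80'
  elif keyspace_id < 0xC000000000000000:
    return '80-C0'
  return 'C0-'

assert _shard_for_keyspace_id(0x9000000000000000) == '80-C0'
assert _shard_for_keyspace_id(0xD000000000000000) == 'C0-'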
def test_reparent_avoid(self):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting. Won't be healthy
  # as replication is not running.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  for t in [tablet_62344, tablet_62044, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias], auto_log=True)
  utils.validate_topology(ping_tablets=True)
  self._check_master_tablet(tablet_62344)

  # Perform a reparent operation with avoid_master pointing to non-master.
  # It should succeed without doing anything.
  utils.run_vtctl(['PlannedReparentShard',
                   '-keyspace_shard', 'test_keyspace/0',
                   '-avoid_master', tablet_62044.tablet_alias], auto_log=True)
  utils.validate_topology()
  self._check_master_tablet(tablet_62344)

  # Perform a reparent operation with avoid_master pointing to master.
  utils.run_vtctl(['PlannedReparentShard',
                   '-keyspace_shard', 'test_keyspace/0',
                   '-avoid_master', tablet_62344.tablet_alias], auto_log=True)
  utils.validate_topology()
  # 62044 is in the same cell and 31981 is in a different cell, so we must
  # land on 62044
  self._check_master_tablet(tablet_62044)

  # If we kill the tablet in the same cell as master then reparent
  # -avoid_master will fail.
  tablet_62344.kill_vttablet()
  _, stderr = utils.run_vtctl(['PlannedReparentShard',
                               '-keyspace_shard', 'test_keyspace/0',
                               '-avoid_master', tablet_62044.tablet_alias],
                              auto_log=True, expect_fail=True)
  self.assertIn('cannot find a tablet to reparent to', stderr)
  utils.validate_topology()
  self._check_master_tablet(tablet_62044)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])
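
# PlannedReparentShard -avoid_master picks the new master itself, and as
# checked above it prefers a candidate in the same cell as the current
# master (62044 over 31981). A rough sketch of that preference, assuming
# tablet aliases of the form 'cell-uid'; this helper is illustrative
# only, not the actual selection logic.
def _prefer_same_cell(current_master_alias, candidate_aliases):
  """Orders reparent candidates so same-cell tablets come first."""
  cell = current_master_alias.split('-')[0]
  return sorted(candidate_aliases,
                key=lambda alias: 0 if alias.split('-')[0] == cell else 1)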
def test_repeated_init_shard_master(self):
  """Test that using InitShardMaster can go back and forth between 2 hosts."""
  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     lameduck_period='5s',
                     init_tablet_type='replica',
                     init_keyspace='test_keyspace',
                     init_shard='0')

  # Tablets are not replicating, so they won't be healthy.
  for t in tablet_62344, tablet_62044:
    t.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(t, False)

  # Pick one master out of the two.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # Run health check on both, make sure they are both healthy.
  # Also make sure the types are correct.
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
    self.check_healthz(t, True)
  utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'master', timeout=0)
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica', timeout=0)

  # Pick the other one as master, make sure they are still healthy.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62044.tablet_alias])

  # Run health check on both, make sure they are both healthy.
  # Also make sure the types are correct.
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
    self.check_healthz(t, True)
  utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'replica', timeout=0)
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'master', timeout=0)

  # Come back to the original guy.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # Run health check on both, make sure they are both healthy.
  # Also make sure the types are correct.
  for t in tablet_62344, tablet_62044:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
    self.check_healthz(t, True)
  utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'master', timeout=0)
  utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica', timeout=0)

  # And done.
  tablet.kill_tablets([tablet_62344, tablet_62044])
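
# check_healthz (used throughout these tests) asserts on the tablet's
# /healthz HTTP endpoint. A minimal sketch of such a probe, assuming the
# tablet web server answers 200 when healthy; the helper below is
# illustrative only and not the suite's actual implementation.
def _probe_healthz(port):
  """Returns True if http://localhost:<port>/healthz answers 200."""
  import urllib2
  try:
    response = urllib2.urlopen('http://localhost:%d/healthz' % port)
    return response.getcode() == 200
  except urllib2.URLError:
    # connection refused, timeout, or a non-2xx HTTP status
    return False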
def test_resharding(self):
  global l2vtgate1, l2vtgate2

  # create the keyspace with just one shard
  shard_master.init_tablet('replica', keyspace='test_keyspace', shard='0',
                           tablet_index=0)
  shard_replica.init_tablet('replica', keyspace='test_keyspace', shard='0',
                            tablet_index=1)
  shard_rdonly1.init_tablet('rdonly', keyspace='test_keyspace', shard='0',
                            tablet_index=2)

  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.create_db('vt_test_keyspace')

  shard_master.start_vttablet(wait_for_state=None,
                              binlog_use_v3_resharding_mode=False)
  shard_replica.start_vttablet(wait_for_state=None,
                               binlog_use_v3_resharding_mode=False)
  shard_rdonly1.start_vttablet(wait_for_state=None,
                               binlog_use_v3_resharding_mode=False)

  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # reparent to make the tablets work
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   shard_master.tablet_alias], auto_log=True)

  utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
  utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
  for t in [shard_master, shard_replica, shard_rdonly1]:
    t.wait_for_vttablet_state('SERVING')

  # create the tables and add startup values
  self._create_schema()
  self._insert_startup_values()

  # reload schema on all tablets so we can query them
  for t in [shard_master, shard_replica, shard_rdonly1]:
    utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

  # We must start vtgate after tablets are up, or else wait until 1min refresh
  # (that is the tablet_refresh_interval parameter for discovery gateway)
  # we want cache_ttl at zero so we re-read the topology for every test query.
  if use_l2vtgate:
    l2vtgate1 = utils.L2VtGate()
    l2vtgate1.start(tablets=[shard_master, shard_replica, shard_rdonly1])
    l2vtgate1.wait_for_endpoints('test_keyspace.0.master', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.0.replica', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.0.rdonly', 1)
    _, addr = l2vtgate1.rpc_endpoint()
    l2vtgate1_param = '%s|test_keyspace|0' % addr
    utils.VtGate().start(cache_ttl='0', l2vtgates=[l2vtgate1_param])
  else:
    utils.VtGate().start(cache_ttl='0',
                         tablets=[shard_master, shard_replica, shard_rdonly1])
    utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteShards,
  # as we're not sharded yet.
  # we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['shard_part']['shards'][0], '0')

  # change the schema, backfill keyspace_id, and change schema again
  self._add_sharding_key_to_schema()
  self._backfill_keyspace_id(shard_master)
  self._mark_sharding_key_not_null()

  # now we can be a sharded keyspace (and propagate to SrvKeyspace)
  utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                   'custom_ksid_col', base_sharding.keyspace_id_type])
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # run a health check on source replica so it responds to discovery
  utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

  # create the split shards
  shard_0_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='-80', tablet_index=0)
  shard_0_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='-80', tablet_index=1)
  shard_0_rdonly1.init_tablet('rdonly', keyspace='test_keyspace',
                              shard='-80', tablet_index=2)
  shard_1_master.init_tablet('replica', keyspace='test_keyspace',
                             shard='80-', tablet_index=0)
  shard_1_replica.init_tablet('replica', keyspace='test_keyspace',
                              shard='80-', tablet_index=1)
  shard_1_rdonly1.init_tablet('rdonly', keyspace='test_keyspace',
                              shard='80-', tablet_index=2)

  for t in [shard_0_master, shard_0_replica, shard_0_rdonly1,
            shard_1_master, shard_1_replica, shard_1_rdonly1]:
    t.create_db('vt_test_keyspace')
    t.start_vttablet(wait_for_state=None,
                     binlog_use_v3_resharding_mode=False)

  for t in [shard_0_master, shard_0_replica, shard_0_rdonly1,
            shard_1_master, shard_1_replica, shard_1_rdonly1]:
    t.wait_for_vttablet_state('NOT_SERVING')

  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                   shard_0_master.tablet_alias], auto_log=True)
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                   shard_1_master.tablet_alias], auto_log=True)

  for t in [shard_0_replica, shard_1_replica]:
    utils.wait_for_tablet_type(t.tablet_alias, 'replica')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

  sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1,
                     shard_1_master, shard_1_replica, shard_1_rdonly1]
  for t in sharded_tablets:
    t.wait_for_vttablet_state('SERVING')

  # must restart vtgate after tablets are up, or else wait until 1min refresh
  # we want cache_ttl at zero so we re-read the topology for every test query.
  utils.vtgate.kill()
  if use_l2vtgate:
    l2vtgate1.kill()

    l2vtgate1 = utils.L2VtGate()
    l2vtgate1.start(tablets=[shard_master, shard_replica, shard_rdonly1,
                             shard_0_master, shard_0_replica,
                             shard_0_rdonly1],
                    tablet_filters='test_keyspace|0,test_keyspace|-80')
    l2vtgate1.wait_for_endpoints('test_keyspace.0.master', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.0.replica', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.0.rdonly', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.-80.master', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.-80.replica', 1)
    l2vtgate1.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
    l2vtgate1.verify_no_endpoint('test_keyspace.80-.master')
    l2vtgate1.verify_no_endpoint('test_keyspace.80-.replica')
    l2vtgate1.verify_no_endpoint('test_keyspace.80-.rdonly')

    # FIXME(alainjobart) we clear tablet_types_to_wait, as this
    # l2vtgate2 doesn't serve the current test_keyspace shard, which
    # is test_keyspace.0. This is not ideal, we should re-work
    # which keyspace/shard a l2vtgate can wait for, as the ones
    # filtered by tablet_filters.
    l2vtgate2 = utils.L2VtGate()
    l2vtgate2.start(tablets=[shard_1_master, shard_1_replica,
                             shard_1_rdonly1],
                    tablet_filters='test_keyspace|80-',
                    tablet_types_to_wait='')
    l2vtgate2.wait_for_endpoints('test_keyspace.80-.master', 1)
    l2vtgate2.wait_for_endpoints('test_keyspace.80-.replica', 1)
    l2vtgate2.wait_for_endpoints('test_keyspace.80-.rdonly', 1)
    l2vtgate2.verify_no_endpoint('test_keyspace.0.master')
    l2vtgate2.verify_no_endpoint('test_keyspace.0.replica')
    l2vtgate2.verify_no_endpoint('test_keyspace.0.rdonly')
    l2vtgate2.verify_no_endpoint('test_keyspace.-80.master')
    l2vtgate2.verify_no_endpoint('test_keyspace.-80.replica')
    l2vtgate2.verify_no_endpoint('test_keyspace.-80.rdonly')

    _, addr1 = l2vtgate1.rpc_endpoint()
    _, addr2 = l2vtgate2.rpc_endpoint()
    l2vtgate1_param1 = '%s|test_keyspace|0' % addr1
    l2vtgate1_param2 = '%s|test_keyspace|-80' % addr1
    l2vtgate2_param = '%s|test_keyspace|80-' % addr2
    utils.VtGate().start(cache_ttl='0', l2vtgates=[l2vtgate1_param1,
                                                   l2vtgate1_param2,
                                                   l2vtgate2_param])
  else:
    utils.VtGate().start(cache_ttl='0',
                         tablets=[shard_master, shard_replica, shard_rdonly1,
                                  shard_0_master, shard_0_replica,
                                  shard_0_rdonly1, shard_1_master,
                                  shard_1_replica, shard_1_rdonly1])
    utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges now,
  # as we are sharded (with just one shard).
  # again, we have 3 values in the database, asking for 4 splits will get us
  # a single query.
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
  self.assertEqual(len(s), 1)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  # There must be one empty KeyRange which represents the full keyspace.
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # we need to create the schema, and the worker will do data copying
  for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
    utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                     shard_rdonly1.tablet_alias, keyspace_shard],
                    auto_log=True)
  utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

  # Run vtworker as daemon for the following SplitClone commands.
  worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
      ['--cell', 'test_nj', '--command_display_interval', '10ms',
       '--use_v3_resharding_mode=false'],
      auto_log=True)

  # Initial clone (online).
  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--offline=false',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/0'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      3, 0, 0, 0)

  # Reset vtworker such that we can run the next command.
  workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
  utils.wait_procs([workerclient_proc])

  # Modify the destination shard. SplitClone will revert the changes.
  # Delete row 1 (provokes an insert).
  shard_0_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=1', write=True)
  # Delete row 2 (provokes an insert).
  shard_1_master.mquery('vt_test_keyspace',
                        'delete from resharding1 where id=2', write=True)
  # Update row 3 (provokes an update).
  shard_1_master.mquery('vt_test_keyspace',
                        "update resharding1 set msg='msg-not-3' where id=3",
                        write=True)
  # Insert row 4 (provokes a delete).
  self._insert_value(shard_1_master, 'resharding1', 4, 'msg4',
                     0xD000000000000000)

  workerclient_proc = utils.run_vtworker_client_bg(
      ['SplitClone',
       '--exclude_tables', 'unrelated',
       '--chunk_count', '10',
       '--min_rows_per_chunk', '1',
       '--min_healthy_rdonly_tablets', '1',
       'test_keyspace/0'],
      worker_rpc_port)
  utils.wait_procs([workerclient_proc])
  self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                      2, 1, 1, 0)
  self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                      0, 0, 0, 3)
  # Terminate worker daemon because it is no longer needed.
  utils.kill_sub_process(worker_proc, soft=True)

  # check the startup values are in the right place
  self._check_startup_values()

  # check the schema too
  utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

  # check the binlog players are running
  logging.debug('Waiting for binlog players to start on new masters...')
  self.check_destination_master(shard_0_master, ['test_keyspace/0'])
  self.check_destination_master(shard_1_master, ['test_keyspace/0'])

  # check that binlog server exported the stats vars
  self.check_binlog_server_vars(shard_replica, horizontal=True)

  # testing filtered replication: insert a bunch of data on shard 1,
  # check we get most of it after a few seconds, wait for binlog server
  # timeout, check we get all of it.
  logging.debug('Inserting lots of data on source shard')
  self._insert_lots(1000)
  logging.debug('Checking 80 percent of data is sent quickly')
  v = self._check_lots_timeout(1000, 80, 5)
  if v != 100:
    logging.debug('Checking all data goes through eventually')
    self._check_lots_timeout(1000, 100, 20)
  logging.debug('Checking no data was sent the wrong way')
  self._check_lots_not_present(1000)
  self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                seconds_behind_master_max=30)
  self.check_binlog_server_vars(shard_replica, horizontal=True,
                                min_statements=1000, min_transactions=1000)

  # use vtworker to compare the data
  logging.debug('Running vtworker SplitDiff for -80')
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/-80'],
                     auto_log=True)

  logging.debug('Running vtworker SplitDiff for 80-')
  utils.run_vtworker(['-cell', 'test_nj',
                      '--use_v3_resharding_mode=false',
                      'SplitDiff',
                      '--min_healthy_rdonly_tablets', '1',
                      'test_keyspace/80-'],
                     auto_log=True)

  utils.pause('Good time to test vtworker for diffs')

  # get status for the destination master tablet, make sure we have it all
  self.check_running_binlog_player(shard_0_master, 2000, 2000)
  self.check_running_binlog_player(shard_1_master, 6000, 2000)

  # check we can't migrate the master just yet
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  expect_fail=True)

  # now serve rdonly from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # make sure rdonly tablets are back to serving before hitting vtgate.
  for t in [shard_0_rdonly1, shard_1_rdonly1]:
    t.wait_for_vttablet_state('SERVING')
  if use_l2vtgate:
    l2vtgate1.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
    l2vtgate2.wait_for_endpoints('test_keyspace.80-.rdonly', 1)
  else:
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

  # check the Map Reduce API works correctly, should use ExecuteKeyRanges
  # on both destination shards now.
  # we ask for 2 splits to only have one per shard
  sql = 'select id, msg from resharding1'
  s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
  self.assertEqual(len(s), 2)
  self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
  self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
  self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

  # then serve replica from the split shards
  source_tablet = shard_replica
  destination_tablets = [shard_0_replica, shard_1_replica]

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # move replica back and forth
  utils.run_vtctl(['MigrateServedTypes', '-reverse',
                   'test_keyspace/0', 'replica'], auto_log=True)

  # After a backwards migration, queryservice should be enabled on
  # source and disabled on destinations
  utils.check_tablet_query_service(self, source_tablet, True, False)
  utils.check_tablet_query_services(self, destination_tablets, False, True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                  auto_log=True)

  # After a forwards migration, queryservice should be disabled on
  # source and enabled on destinations
  utils.check_tablet_query_service(self, source_tablet, False, True)
  utils.check_tablet_query_services(self, destination_tablets, True, False)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # then serve master from the split shards
  utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                  auto_log=True)
  utils.check_srv_keyspace('test_nj', 'test_keyspace',
                           'Partitions(master): -80 80-\n'
                           'Partitions(rdonly): -80 80-\n'
                           'Partitions(replica): -80 80-\n',
                           keyspace_id_type=base_sharding.keyspace_id_type,
                           sharding_column_name='custom_ksid_col')

  # check the binlog players are gone now
  self.check_no_binlog_player(shard_0_master)
  self.check_no_binlog_player(shard_1_master)

  # make sure we can't delete a shard with tablets
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

  # remove the original tablets in the original shard
  tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
  for t in [shard_replica, shard_rdonly1]:
    utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
  utils.run_vtctl(['DeleteTablet', '-allow_master',
                   shard_master.tablet_alias], auto_log=True)

  # rebuild the serving graph, all mentions of the old shards should be gone
  utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

  # delete the original shard
  utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

  # kill everything else
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1,
                       shard_1_master, shard_1_replica, shard_1_rdonly1])
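
# MigrateServedTypes flips, per tablet type, which side of the split
# serves: a forward migration moves a type to the destination shards, and
# -reverse moves it back. The test above also relies on master being
# migrated last (the early master migration is expected to fail). A toy
# model of that ordering invariant, for illustration only:
def _can_migrate_master(served_by_destination):
  """Master may only migrate once rdonly and replica already have."""
  return (served_by_destination['rdonly'] and
          served_by_destination['replica'])

assert not _can_migrate_master({'rdonly': False, 'replica': False})
assert _can_migrate_master({'rdonly': True, 'replica': True})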
def test_health_check(self):
  # one master, one replica that starts not initialized
  # (for the replica, we let vttablet do the InitTablet)
  tablet_62344.init_tablet('master', 'test_keyspace', '0')

  for t in tablet_62344, tablet_62044:
    t.create_db('vt_test_keyspace')

  tablet_62344.start_vttablet(wait_for_state=None)
  tablet_62044.start_vttablet(wait_for_state=None,
                              lameduck_period='5s',
                              init_tablet_type='replica',
                              init_keyspace='test_keyspace',
                              init_shard='0')

  tablet_62344.wait_for_vttablet_state('SERVING')
  tablet_62044.wait_for_vttablet_state('NOT_SERVING')
  self.check_healthz(tablet_62044, False)

  utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                   tablet_62344.tablet_alias])

  # make sure the unhealthy slave goes to healthy
  tablet_62044.wait_for_vttablet_state('SERVING')
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  self.check_healthz(tablet_62044, True)

  # make sure the master is still master
  ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
  self.assertEqual(ti['type'], topodata_pb2.MASTER,
                   'unexpected master type: %s' % ti['type'])

  # stop replication at the mysql level.
  tablet_62044.mquery('', 'stop slave')
  # vttablet replication_reporter should restart it.
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  # insert something on the master and wait for it on the slave.
  tablet_62344.mquery('vt_test_keyspace', [
      'create table repl_test_table (id int)',
      'insert into repl_test_table values (123)'], write=True)
  timeout = 10.0
  while True:
    try:
      result = tablet_62044.mquery('vt_test_keyspace',
                                   'select * from repl_test_table')
      if result:
        self.assertEqual(result[0][0], 123L)
        break
    except MySQLdb.ProgrammingError:
      # Maybe the create table hasn't gone through yet, we wait more
      logging.exception('got this exception waiting for data, ignoring it')
    timeout = utils.wait_step(
        'slave replication repaired by replication_reporter', timeout)

  # stop replication, make sure we don't go unhealthy.
  # (we have a baseline as well, so the time should be good).
  utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
  self.check_healthz(tablet_62044, True)

  # make sure status web page is healthy
  self.assertIn('>healthy</span></div>', tablet_62044.get_status())

  # make sure the health stream is updated
  health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                 tablet_62044.tablet_alias])
  self.assertTrue(
      ('seconds_behind_master' not in health['realtime_stats']) or
      (health['realtime_stats']['seconds_behind_master'] < 30),
      'got unexpected health: %s' % str(health))
  self.assertIn('serving', health)

  # then restart replication, make sure we stay healthy
  utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
  utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])

  # make sure status web page is healthy
  self.assertIn('>healthy</span></div>', tablet_62044.get_status())

  # now test VtTabletStreamHealth returns the right thing
  stdout, _ = utils.run_vtctl(['VtTabletStreamHealth', '-count', '2',
                               tablet_62044.tablet_alias],
                              trap_output=True, auto_log=True)
  lines = stdout.splitlines()
  self.assertEqual(len(lines), 2)
  for line in lines:
    logging.debug('Got health: %s', line)
    data = json.loads(line)
    self.assertIn('realtime_stats', data)
    self.assertIn('serving', data)
    self.assertTrue(data['serving'])
    self.assertNotIn('health_error', data['realtime_stats'])
    self.assertNotIn('tablet_externally_reparented_timestamp', data)
    self.assertEqual('test_keyspace', data['target']['keyspace'])
    self.assertEqual('0', data['target']['shard'])
    self.assertEqual(topodata_pb2.REPLICA, data['target']['tablet_type'])

  # Test that VtTabletStreamHealth reports a QPS >0.0.
  # Therefore, issue several reads first.
  # NOTE: This may be potentially flaky because we'll observe a QPS >0.0
  #       exactly "once" for the duration of one sampling interval (5s) and
  #       after that we'll see 0.0 QPS rates again. If this becomes actually
  #       flaky, we need to read continuously in a separate thread.
  for _ in range(10):
    tablet_62044.execute('select 1 from dual')

  # This may take up to 5 seconds to become true because we sample the query
  # counts for the rates only every 5 seconds (see query_service_stats.go).
  timeout = 10
  while True:
    health = utils.run_vtctl_json(['VtTabletStreamHealth', '-count', '1',
                                   tablet_62044.tablet_alias])
    if health['realtime_stats'].get('qps', 0.0) > 0.0:
      break
    timeout = utils.wait_step('QPS >0.0 seen', timeout)

  # kill the tablets
  tablet.kill_tablets([tablet_62344, tablet_62044])
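
# The health assertions above treat a replica as healthy when it is
# serving and either reports no replication lag or lag under 30s. A
# small predicate capturing that rule, mirroring the assertions above;
# illustrative only, not part of the suite.
def _is_stream_health_ok(health, max_lag=30):
  """Returns True if a VtTabletStreamHealth record looks healthy."""
  stats = health.get('realtime_stats', {})
  if 'health_error' in stats:
    return False
  lag_ok = ('seconds_behind_master' not in stats or
            stats['seconds_behind_master'] < max_lag)
  return bool(health.get('serving')) and lag_ok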
def test_reparent_down_master(self):
  utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

  # create the database so vttablets start, as they are serving
  tablet_62344.create_db('vt_test_keyspace')
  tablet_62044.create_db('vt_test_keyspace')
  tablet_41983.create_db('vt_test_keyspace')
  tablet_31981.create_db('vt_test_keyspace')

  # Start up a master mysql and vttablet
  tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # Create a few slaves for testing reparenting.
  tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)

  # wait for all tablets to start
  for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
    t.wait_for_vttablet_state('NOT_SERVING')

  # Force the slaves to reparent assuming that all the datasets are
  # identical.
  utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                   tablet_62344.tablet_alias], auto_log=True)
  utils.validate_topology()
  tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

  # Make the current master agent and database unavailable.
  tablet_62344.kill_vttablet()
  tablet_62344.shutdown_mysql().wait()

  # Perform a planned reparent operation, will try to contact
  # the current master and fail somewhat quickly
  _, stderr = utils.run_vtctl(['-wait-time', '5s',
                               'PlannedReparentShard',
                               '-keyspace_shard', 'test_keyspace/0',
                               '-new_master', tablet_62044.tablet_alias],
                              expect_fail=True)
  self.assertIn('DemoteMaster failed', stderr)

  # Run forced reparent operation, this should now proceed unimpeded.
  utils.run_vtctl(['EmergencyReparentShard',
                   '-keyspace_shard', 'test_keyspace/0',
                   '-new_master', tablet_62044.tablet_alias], auto_log=True)

  utils.validate_topology()
  self._check_master_tablet(tablet_62044)

  # insert data into the new master, check the connected slaves work
  self._populate_vt_insert_test(tablet_62044, 2)
  self._check_vt_insert_test(tablet_41983, 2)
  self._check_vt_insert_test(tablet_31981, 2)

  # bring back the old master as a slave, check that it catches up
  tablet_62344.start_mysql().wait()
  tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True,
                           wait_for_start=False)
  self._check_vt_insert_test(tablet_62344, 2)

  tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                       tablet_31981])
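
# _populate_vt_insert_test / _check_vt_insert_test (used above) write
# rows on the master and then verify they replicate to a slave. A
# minimal sketch of that pattern, assuming a vt_insert_test table like
# the one created above and the suite's mquery/wait_step conventions;
# these helpers are illustrative, not the suite's actual code.
def _populate_rows(master, num_rows):
  """Inserts num_rows test rows on the master."""
  sql = ['insert into vt_insert_test (msg) values ("test %d")' % i
         for i in xrange(num_rows)]
  master.mquery('vt_test_keyspace', sql, write=True)

def _check_rows(slave, expected, timeout=10):
  """Polls the slave until the expected row count has replicated."""
  while True:
    result = slave.mquery('vt_test_keyspace',
                          'select count(*) from vt_insert_test')
    if result[0][0] == expected:
      break
    timeout = utils.wait_step('rows to replicate', timeout)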