Example #1
def tearDownModule():
  global vtgate_server
  logging.debug("in tearDownModule")
  if utils.options.skip_teardown:
    return
  logging.debug("Tearing down the servers and setup")
  utils.vtgate_kill(vtgate_server)
  tablet.kill_tablets([shard_0_master, shard_0_replica, shard_1_master,
                       shard_1_replica])
  teardown_procs = [shard_0_master.teardown_mysql(),
                    shard_0_replica.teardown_mysql(),
                    shard_1_master.teardown_mysql(),
                    shard_1_replica.teardown_mysql(),
                   ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server_teardown()

  utils.kill_sub_processes()
  utils.remove_tmp_files()

  shard_0_master.remove_tree()
  shard_0_replica.remove_tree()
  shard_1_master.remove_tree()
  shard_1_replica.remove_tree()
Example #2
    def test_reparent_cross_cell(self, shard_id="0"):
        utils.run_vtctl("CreateKeyspace test_keyspace")

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db("vt_test_keyspace")
        tablet_62044.create_db("vt_test_keyspace")
        tablet_41983.create_db("vt_test_keyspace")
        tablet_31981.create_db("vt_test_keyspace")

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet("master", "test_keyspace", shard_id, start=True)
        if environment.topo_server_implementation == "zookeeper":
            shard = utils.run_vtctl_json(["GetShard", "test_keyspace/" + shard_id])
            self.assertEqual(shard["Cells"], ["test_nj"], "wrong list of cell in Shard: %s" % str(shard["Cells"]))

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet("replica", "test_keyspace", shard_id, start=True, wait_for_start=False)
        tablet_41983.init_tablet("replica", "test_keyspace", shard_id, start=True, wait_for_start=False)
        tablet_31981.init_tablet("replica", "test_keyspace", shard_id, start=True, wait_for_start=False)
        for t in [tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state("SERVING")
        if environment.topo_server_implementation == "zookeeper":
            shard = utils.run_vtctl_json(["GetShard", "test_keyspace/" + shard_id])
            self.assertEqual(
                shard["Cells"], ["test_nj", "test_ny"], "wrong list of cell in Shard: %s" % str(shard["Cells"])
            )

        # Recompute the shard layout node - until you do that, it might not be valid.
        utils.run_vtctl("RebuildShardGraph test_keyspace/" + shard_id)
        utils.validate_topology()

        # Force the slaves to reparent assuming that all the datasets are identical.
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.reset_replication()
        utils.pause("force ReparentShard?")
        utils.run_vtctl("ReparentShard -force test_keyspace/%s %s" % (shard_id, tablet_62344.tablet_alias))
        utils.validate_topology(ping_tablets=True)

        self._check_db_addr(shard_id, "master", tablet_62344.port)

        # Verify MasterCell is properly set
        srvShard = utils.run_vtctl_json(["GetSrvShard", "test_nj", "test_keyspace/%s" % (shard_id)])
        self.assertEqual(srvShard["MasterCell"], "test_nj")
        srvShard = utils.run_vtctl_json(["GetSrvShard", "test_ny", "test_keyspace/%s" % (shard_id)])
        self.assertEqual(srvShard["MasterCell"], "test_nj")

        # Perform a graceful reparent operation to another cell.
        utils.pause("graceful ReparentShard?")
        utils.run_vtctl("ReparentShard test_keyspace/%s %s" % (shard_id, tablet_31981.tablet_alias), auto_log=True)
        utils.validate_topology()

        self._check_db_addr(shard_id, "master", tablet_31981.port, cell="test_ny")

        # Verify MasterCell is set to new cell.
        srvShard = utils.run_vtctl_json(["GetSrvShard", "test_nj", "test_keyspace/%s" % (shard_id)])
        self.assertEqual(srvShard["MasterCell"], "test_ny")
        srvShard = utils.run_vtctl_json(["GetSrvShard", "test_ny", "test_keyspace/%s" % (shard_id)])
        self.assertEqual(srvShard["MasterCell"], "test_ny")

        tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983, tablet_31981])
Example #3
  def teardown(self):
    all_tablets = self.tablet_map.values()
    tablet.kill_tablets(all_tablets)
    teardown_procs = [t.teardown_mysql() for t in all_tablets]
    utils.wait_procs(teardown_procs, raise_on_error=False)
    for t in all_tablets:
      t.remove_tree()
Example #4
def _teardown_shard_2():
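    # Kill the shard 2 tablets, remove the shard from the topology, and drop their test databases.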
    tablet.kill_tablets(shard_2_tablets)

    utils.run_vtctl(["DeleteShard", "-recursive", "test_keyspace/2"], auto_log=True)

    for t in shard_2_tablets:
        t.clean_dbs()
Example #5
  def tearDown(self):
    # kill everything
    tablet.kill_tablets([source_master, source_replica, source_rdonly1,
                         source_rdonly2, destination_master,
                         destination_replica, destination_rdonly1,
                         destination_rdonly2])
    utils.vtgate.kill()
Example #6
    def test_no_mysql_healthcheck(self):
        """This test starts a vttablet with no mysql port, while mysql is down.
        It makes sure vttablet will start properly and be unhealthy.
        Then we start mysql, and make sure vttablet becomes healthy.
        """
        # we need replication to be enabled, so the slave tablet can be healthy.
        for t in tablet_62344, tablet_62044:
            t.create_db("vt_test_keyspace")
        pos = mysql_flavor().master_position(tablet_62344)
        changeMasterCmds = mysql_flavor().change_master_commands(utils.hostname, tablet_62344.mysql_port, pos)
        tablet_62044.mquery("", ["RESET MASTER", "RESET SLAVE"] + changeMasterCmds + ["START SLAVE"])

        # now shutdown all mysqld
        shutdown_procs = [tablet_62344.shutdown_mysql(), tablet_62044.shutdown_mysql()]
        utils.wait_procs(shutdown_procs)

        # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
        tablet_62344.init_tablet("master", "test_keyspace", "0")
        tablet_62044.init_tablet("spare", "test_keyspace", "0", include_mysql_port=False)
        for t in tablet_62344, tablet_62044:
            t.start_vttablet(
                wait_for_state=None, target_tablet_type="replica", full_mycnf_args=True, include_mysql_port=False
            )
        for t in tablet_62344, tablet_62044:
            t.wait_for_vttablet_state("NOT_SERVING")
            self.check_healthz(t, False)

        # restart mysqld
        start_procs = [tablet_62344.start_mysql(), tablet_62044.start_mysql()]
        utils.wait_procs(start_procs)

        # the master should still be healthy
        utils.run_vtctl(["RunHealthCheck", tablet_62344.tablet_alias, "replica"], auto_log=True)
        self.check_healthz(tablet_62344, True)

        # the slave won't be healthy at first, as replication is not running
        utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"], auto_log=True)
        self.check_healthz(tablet_62044, False)
        tablet_62044.wait_for_vttablet_state("NOT_SERVING")

        # restart replication
        tablet_62044.mquery("", ["START SLAVE"])

        # wait for the tablet to become healthy and fix its mysql port
        utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "replica"], auto_log=True)
        tablet_62044.wait_for_vttablet_state("SERVING")
        self.check_healthz(tablet_62044, True)

        for t in tablet_62344, tablet_62044:
            # wait for mysql port to show up
            timeout = 10
            while True:
                ti = utils.run_vtctl_json(["GetTablet", t.tablet_alias])
                if "mysql" in ti["Portmap"]:
                    break
                timeout = utils.wait_step("mysql port in tablet record", timeout)
            self.assertEqual(ti["Portmap"]["mysql"], t.mysql_port)

        # all done
        tablet.kill_tablets([tablet_62344, tablet_62044])
Example #7
def tearDownModule():
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets([shard_0_master, shard_0_replica,
                       shard_1_master, shard_1_replica])
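  # Shut down every mysqld in parallel and wait for the processes to exit.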
  teardown_procs = [
      shard_0_master.teardown_mysql(),
      shard_0_replica.teardown_mysql(),
      shard_1_master.teardown_mysql(),
      shard_1_replica.teardown_mysql(),
      unsharded_master.teardown_mysql(),
      unsharded_replica.teardown_mysql(),
      ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  shard_0_master.remove_tree()
  shard_0_replica.remove_tree()
  shard_1_master.remove_tree()
  shard_1_replica.remove_tree()
  unsharded_master.remove_tree()
  unsharded_replica.remove_tree()
Example #8
def tearDownModule():
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets([src_master, src_replica, src_rdonly1, src_rdonly2,
                       dst_master, dst_replica])

  teardown_procs = [
      src_master.teardown_mysql(),
      src_replica.teardown_mysql(),
      src_rdonly1.teardown_mysql(),
      src_rdonly2.teardown_mysql(),
      dst_master.teardown_mysql(),
      dst_replica.teardown_mysql(),
      ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  src_master.remove_tree()
  src_replica.remove_tree()
  src_rdonly1.remove_tree()
  src_rdonly2.remove_tree()
  dst_master.remove_tree()
  dst_replica.remove_tree()
Example #9
  def test_primecache(self):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    master.init_tablet( 'master',  'test_keyspace', '0')
    replica.init_tablet('idle')

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    master.create_db('vt_test_keyspace')
    master.start_vttablet(wait_for_state=None)
    replica.start_vttablet(wait_for_state=None)

    master.wait_for_vttablet_state('SERVING')
    replica.wait_for_vttablet_state('NOT_SERVING') # DB doesn't exist

    self._create_data()

    # we use clone to not prime the mysql cache on the slave db
    utils.run_vtctl(['Clone', '-force', '-server-mode',
                     master.tablet_alias, replica.tablet_alias],
                    auto_log=True)

    # sync the buffer cache, and clear it. This will prompt for user's password
    utils.run(['sync'])
    utils.run(['sudo', 'bash', '-c', 'echo 1 > /proc/sys/vm/drop_caches'])

    # we can now change data on the master for 30s, while slave is stopped.
    # master's binlog will be in OS buffer cache now.
    replica.mquery('', 'slave stop')
    self._change_random_data()

    use_primecache = True # easy to test without
    if use_primecache:
      # starting vtprimecache, sleeping for a couple seconds
      args = environment.binary_args('vtprimecache') + [
              '-db-config-dba-uname', 'vt_dba',
              '-db-config-dba-charset', 'utf8',
              '-db-config-dba-dbname', 'vt_test_keyspace',
              '-db-config-app-uname', 'vt_app',
              '-db-config-app-charset', 'utf8',
              '-db-config-app-dbname', 'vt_test_keyspace',
              '-relay_logs_path', replica.tablet_dir+'/relay-logs',
              '-mysql_socket_file', replica.tablet_dir+'/mysql.sock',
              '-log_dir', environment.vtlogroot,
              '-worker_count', '4',
              '-alsologtostderr',
             ]
      vtprimecache = utils.run_bg(args)
      time.sleep(2)

    # start slave, see how long it takes to catch up on replication
    replica.mquery('', 'slave start')
    self.catch_up()

    if use_primecache:
      # TODO(alainjobart): read and check stats
      utils.kill_sub_process(vtprimecache)

    tablet.kill_tablets([master, replica])
Example #10
    def test_health_check_worker_state_does_not_shutdown_query_service(self):
        # This test is similar to test_health_check, but has the following
        # differences:
        # - the second tablet is an 'rdonly' and not a 'replica'
        # - the second tablet will be set to 'worker' and we expect that
        #   the query service won't be shutdown

        # Setup master and rdonly tablets.
        tablet_62344.init_tablet("master", "test_keyspace", "0")

        for t in tablet_62344, tablet_62044:
            t.create_db("vt_test_keyspace")

        tablet_62344.start_vttablet(wait_for_state=None, target_tablet_type="replica")
        tablet_62044.start_vttablet(
            wait_for_state=None, target_tablet_type="rdonly", init_keyspace="test_keyspace", init_shard="0"
        )

        tablet_62344.wait_for_vttablet_state("SERVING")
        tablet_62044.wait_for_vttablet_state("NOT_SERVING")
        self.check_healthz(tablet_62044, False)

        # Enable replication.
        utils.run_vtctl(["InitShardMaster", "test_keyspace/0", tablet_62344.tablet_alias])
        # Trigger healthcheck to save time waiting for the next interval.
        utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "rdonly"])
        utils.wait_for_tablet_type(tablet_62044.tablet_alias, "rdonly")
        self.check_healthz(tablet_62044, True)
        tablet_62044.wait_for_vttablet_state("SERVING")

        # Change from rdonly to worker and stop replication. (These
        # actions are similar to the SplitClone vtworker command
        # implementation.)  The tablet will become unhealthy, but the
        # query service is still running.
        utils.run_vtctl(["ChangeSlaveType", tablet_62044.tablet_alias, "worker"])
        utils.run_vtctl(["StopSlave", tablet_62044.tablet_alias])
        # Trigger healthcheck explicitly to avoid waiting for the next interval.
        utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "rdonly"])
        utils.wait_for_tablet_type(tablet_62044.tablet_alias, "worker")
        self.check_healthz(tablet_62044, False)
        # Make sure that replication got disabled.
        self.assertIn(
            ">unhealthy: replication_reporter: " "Replication is not running</span></div>", tablet_62044.get_status()
        )
        # Query service is still running.
        tablet_62044.wait_for_vttablet_state("SERVING")

        # Restart replication. Tablet will become healthy again.
        utils.run_vtctl(["ChangeSlaveType", tablet_62044.tablet_alias, "spare"])
        utils.wait_for_tablet_type(tablet_62044.tablet_alias, "spare")
        utils.run_vtctl(["StartSlave", tablet_62044.tablet_alias])
        utils.run_vtctl(["RunHealthCheck", tablet_62044.tablet_alias, "rdonly"])
        utils.wait_for_tablet_type(tablet_62044.tablet_alias, "rdonly")
        self.check_healthz(tablet_62044, True)
        tablet_62044.wait_for_vttablet_state("SERVING")

        # kill the tablets
        tablet.kill_tablets([tablet_62344, tablet_62044])
Example #11
  def shutdown(self):
    tablet.kill_tablets(self.tablets)
    teardown_procs = [t.teardown_mysql() for t in self.tablets]
    utils.wait_procs(teardown_procs, raise_on_error=False)
    environment.topo_server().teardown()
    utils.kill_sub_processes()
    utils.remove_tmp_files()
    for t in self.tablets:
      t.remove_tree()
Example #12
  def test_no_mysql_healthcheck(self):
    """This test starts a vttablet with no mysql port, while mysql is down.
    It makes sure vttablet will start properly and be unhealthy.
    Then we start mysql, and make sure vttablet becomes healthy.
    """
    # we need replication to be enabled, so the slave tablet can be healthy.
    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')
    pos = mysql_flavor().master_position(tablet_62344)
    changeMasterCmds = mysql_flavor().change_master_commands(
                            utils.hostname,
                            tablet_62344.mysql_port,
                            pos)
    tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                        changeMasterCmds +
                        ['START SLAVE'])

    # now shutdown all mysqld
    shutdown_procs = [
        tablet_62344.shutdown_mysql(),
        tablet_62044.shutdown_mysql(),
        ]
    utils.wait_procs(shutdown_procs)

    # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('spare', 'test_keyspace', '0',
                             include_mysql_port=False)
    for t in tablet_62344, tablet_62044:
      t.start_vttablet(wait_for_state=None,
                       target_tablet_type='replica',
                       full_mycnf_args=True, include_mysql_port=False)
    for t in tablet_62344, tablet_62044:
      t.wait_for_vttablet_state('NOT_SERVING')

    # restart mysqld
    start_procs = [
        tablet_62344.start_mysql(),
        tablet_62044.start_mysql(),
        ]
    utils.wait_procs(start_procs)

    # wait for the tablets to become healthy and fix their mysql port
    for t in tablet_62344, tablet_62044:
      t.wait_for_vttablet_state('SERVING')
    for t in tablet_62344, tablet_62044:
      # wait for mysql port to show up
      timeout = 10
      while True:
        ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
        if 'mysql' in ti['Portmap']:
          break
        timeout = utils.wait_step('mysql port in tablet record', timeout)
      self.assertEqual(ti['Portmap']['mysql'], t.mysql_port)

    # all done
    tablet.kill_tablets([tablet_62344, tablet_62044])
Example #13
  def test_reparent_lag_slave(self, shard_id='0'):
    utils.run_vtctl('CreateKeyspace test_keyspace')

    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True, wait_for_start=False)

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True, wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True, wait_for_start=False)
    tablet_41983.init_tablet('lag', 'test_keyspace', shard_id, start=True, wait_for_start=False)

    # wait for all tablets to start
    for t in [tablet_62344, tablet_62044, tablet_31981]:
      t.wait_for_vttablet_state("SERVING")
    tablet_41983.wait_for_vttablet_state("NOT_SERVING")

    # Recompute the shard layout node - until you do that, it might not be valid.
    utils.run_vtctl('RebuildShardGraph test_keyspace/' + shard_id)
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.run_vtctl('ReparentShard -force test_keyspace/%s %s' % (shard_id, tablet_62344.tablet_alias))
    utils.validate_topology(ping_tablets=True)

    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

    tablet_41983.mquery('', 'stop slave')
    for q in self._populate_vt_insert_test:
      tablet_62344.mquery('vt_test_keyspace', q, write=True)

    # Perform a graceful reparent operation.
    utils.run_vtctl('ReparentShard test_keyspace/%s %s' % (shard_id, tablet_62044.tablet_alias))

    tablet_41983.mquery('', 'start slave')
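    # Give replication on the lagged slave a moment to resume.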
    time.sleep(1)

    utils.pause("check orphan")

    utils.run_vtctl('ReparentTablet %s' % tablet_41983.tablet_alias)

    result = tablet_41983.mquery('vt_test_keyspace', 'select msg from vt_insert_test where id=1')
    if len(result) != 1:
      self.fail('expected 1 row from vt_insert_test: %s' % str(result))

    utils.pause("check lag reparent")

    tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983, tablet_31981])
Example #14
  def test_health_check_drained_state_does_not_shutdown_query_service(self):
    # This test is similar to test_health_check, but has the following
    # differences:
    # - the second tablet is an 'rdonly' and not a 'replica'
    # - the second tablet will be set to 'drained' and we expect that
    #   the query service won't be shutdown

    # Setup master and rdonly tablets.
    tablet_62344.init_tablet('replica', 'test_keyspace', '0')

    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')

    # Note we only have a master and a rdonly. So we can't enable
    # semi-sync in this case, as the rdonly slaves don't semi-sync ack.
    tablet_62344.start_vttablet(wait_for_state=None, enable_semi_sync=False)
    tablet_62044.start_vttablet(wait_for_state=None,
                                init_tablet_type='rdonly',
                                init_keyspace='test_keyspace',
                                init_shard='0',
                                enable_semi_sync=False)

    tablet_62344.wait_for_vttablet_state('NOT_SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')
    self.check_healthz(tablet_62044, False)

    # Enable replication.
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias])

    # Trigger healthcheck to save time waiting for the next interval.
    utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
    tablet_62044.wait_for_vttablet_state('SERVING')
    self.check_healthz(tablet_62044, True)

    # Change from rdonly to drained and stop replication. (These
    # actions are similar to the SplitClone vtworker command
    # implementation.)  The tablet will stay healthy, and the
    # query service is still running.
    utils.run_vtctl(['ChangeSlaveType', tablet_62044.tablet_alias, 'drained'])
    utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
    # Trigger healthcheck explicitly to avoid waiting for the next interval.
    utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'drained')
    self.check_healthz(tablet_62044, True)
    # Query service is still running.
    tablet_62044.wait_for_vttablet_state('SERVING')

    # Restart replication. Tablet will become healthy again.
    utils.run_vtctl(['ChangeSlaveType', tablet_62044.tablet_alias, 'rdonly'])
    utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
    utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
    self.check_healthz(tablet_62044, True)

    # kill the tablets
    tablet.kill_tablets([tablet_62344, tablet_62044])
Example #15
def _teardown_shard_2():
  tablet.kill_tablets(shard_2_tablets)

  utils.run_vtctl(
      ['DeleteShard', '-recursive', 'test_keyspace/2'], auto_log=True)

  for t in shard_2_tablets:
    t.reset_replication()
    t.set_semi_sync_enabled(master=False)
    t.clean_dbs()
Example #16
  def test_reparent_cross_cell(self, shard_id='0'):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(shard['cells'], ['test_nj'],
                     'wrong list of cell in Shard: %s' % str(shard['cells']))

    # Create a few slaves for testing reparenting. Won't be healthy
    # as replication is not running.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_62344.wait_for_vttablet_state('SERVING')
    for t in [tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('NOT_SERVING')
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(
        shard['cells'], ['test_nj', 'test_ny'],
        'wrong list of cell in Shard: %s' % str(shard['cells']))

    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are
    # identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/' + shard_id,
                     tablet_62344.tablet_alias], auto_log=True)
    utils.validate_topology(ping_tablets=True)

    self._check_master_tablet(tablet_62344)

    # Perform a graceful reparent operation to another cell.
    utils.pause('test_reparent_cross_cell PlannedReparentShard')
    utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/' + shard_id,
                     tablet_31981.tablet_alias], auto_log=True)
    utils.validate_topology()

    self._check_master_tablet(tablet_31981)

    tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                         tablet_31981])
Example #17
  def test_repeated_init_shard_master(self):
    """Test that using InitShardMaster can go back and forth between 2 hosts."""
    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None,
                       lameduck_period='5s',
                       init_tablet_type='replica',
                       init_keyspace='test_keyspace',
                       init_shard='0')

    # Tablets are not replicating, so they won't be healthy.
    for t in tablet_62344, tablet_62044:
      t.wait_for_vttablet_state('NOT_SERVING')
      self.check_healthz(t, False)

    # Pick one master out of the two.
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias])

    # Run health check on both, make sure they are both healthy.
    # Also make sure the types are correct.
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
      self.check_healthz(t, True)
    utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'master', timeout=0)
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica', timeout=0)

    # Pick the other one as master, make sure they are still healthy.
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62044.tablet_alias])

    # Run health check on both, make sure they are both healthy.
    # Also make sure the types are correct.
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
      self.check_healthz(t, True)
    utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'replica', timeout=0)
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'master', timeout=0)

    # Come back to the original guy.
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias])

    # Run health check on both, make sure they are both healthy.
    # Also make sure the types are correct.
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
      self.check_healthz(t, True)
    utils.wait_for_tablet_type(tablet_62344.tablet_alias, 'master', timeout=0)
    utils.wait_for_tablet_type(tablet_62044.tablet_alias, 'replica', timeout=0)

    # And done.
    tablet.kill_tablets([tablet_62344, tablet_62044])
Example #18
  def shutdown(self):
    # Explicitly kill vtgate first because StreamingServerShutdownIT.java expects an EOF from the vtgate client
    # and not an error that vttablet killed the query (which is seen when vtgate is killed last).
    utils.vtgate.kill()
    tablet.kill_tablets(self.tablets)
    teardown_procs = [t.teardown_mysql() for t in self.tablets]
    utils.wait_procs(teardown_procs, raise_on_error=False)
    environment.topo_server().teardown()
    utils.kill_sub_processes()
    utils.remove_tmp_files()
    for t in self.tablets:
      t.remove_tree()
Example #19
  def tearDown(self):
    # kill all tablets
    tablet.kill_tablets(initial_tablets)

    for t in initial_tablets:
      t.reset_replication()
      t.set_semi_sync_enabled(master=False)
      t.clean_dbs()

    utils.run_vtctl(['DeleteShard', '-recursive', '-even_if_serving',
                     test_keyspace + '/0'], auto_log=True)
    utils.run_vtctl(['DeleteShard', '-recursive', '-even_if_serving',
                     test_keyspace + '/1'], auto_log=True)
Example #20
  def test_health_check(self):
    utils.run_vtctl('CreateKeyspace test_keyspace')

    # one master, one replica that starts in spare
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    tablet_62044.init_tablet('spare', 'test_keyspace', '0')

    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')

    tablet_62344.start_vttablet(wait_for_state=None, target_tablet_type='replica')
    tablet_62044.start_vttablet(wait_for_state=None, target_tablet_type='replica')

    tablet_62344.wait_for_vttablet_state('SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0', tablet_62344.tablet_alias])

    # make sure the 'spare' slave goes to 'replica'
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
      if ti['Type'] == "replica":
        logging.info("Slave tablet went to replica, good")
        break
      timeout = utils.wait_step('slave tablet going to replica', timeout)

    # make sure the master is still master
    ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
    self.assertEqual(ti['Type'], 'master', "unexpected master type: %s" % ti['Type'])

    # stop replication on the slave, see it trigger the slave going
    # slightly unhealthy
    tablet_62044.mquery('', 'stop slave')
    timeout = 10
    while True:
      ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
      if 'Health' in ti and ti['Health']:
        if 'replication_lag' in ti['Health']:
          if ti['Health']['replication_lag'] == 'high':
            logging.info("Slave tablet replication_lag went to high, good")
            break
      timeout = utils.wait_step('slave has high replication lag', timeout)

    # make sure the serving graph was updated
    ep = utils.run_vtctl_json(['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'])
    if not ep['entries'][0]['health']:
      self.fail('Replication lag parameter not propagated to serving graph: %s' % str(ep))
    self.assertEqual(ep['entries'][0]['health']['replication_lag'], 'high', 'Replication lag parameter not propagated to serving graph: %s' % str(ep))

    tablet.kill_tablets([tablet_62344, tablet_62044])
Example #21
  def _test_reparent_slave_offline(self, shard_id='0'):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)

    # wait for all tablets to start
    tablet_62344.wait_for_vttablet_state('SERVING')
    for t in [tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # Recompute the shard layout node - until you do that, it might not be
    # valid.
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are
    # identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/' + shard_id,
                     tablet_62344.tablet_alias])
    utils.validate_topology(ping_tablets=True)

    self._check_db_addr(shard_id, 'master', tablet_62344.port)

    # Kill one tablet so we seem offline
    tablet_31981.kill_vttablet()

    # Perform a graceful reparent operation.
    utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/' + shard_id,
                     tablet_62044.tablet_alias])
    self._check_db_addr(shard_id, 'master', tablet_62044.port)

    tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])
Example #22
def tearDownModule():
    if utils.options.skip_teardown:
        return
    logging.debug("Tearing down the servers and setup")
    tablet.Tablet.tablets_running = 2
    tablet.kill_tablets([master_tablet, replica_tablet])
    teardown_procs = [master_tablet.teardown_mysql(), replica_tablet.teardown_mysql()]
    utils.wait_procs(teardown_procs, raise_on_error=False)

    environment.topo_server().teardown()
    utils.kill_sub_processes()
    utils.remove_tmp_files()
    master_tablet.remove_tree()
    replica_tablet.remove_tree()
Example #23
def tearDownModule():
  utils.required_teardown()
  if utils.options.skip_teardown:
    return
  logging.debug('Tearing down the servers and setup')
  tablet.kill_tablets(all_tablets)
  utils.wait_procs([t.teardown_mysql() for t in all_tablets],
                   raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()
  for t in all_tablets:
    t.remove_tree()
Example #24
  def _test_vtctl_clone(self, server_mode):
    if server_mode:
      clone_flags = ['-server-mode']
    else:
      clone_flags = []

    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', 'snapshot_test'])

    tablet_62344.init_tablet('master', 'snapshot_test', '0')
    utils.run_vtctl(['RebuildShardGraph', 'snapshot_test/0'])
    utils.validate_topology()

    tablet_62344.populate('vt_snapshot_test', self._create_vt_insert_test,
                          self._populate_vt_insert_test)
    tablet_62344.start_vttablet()

    tablet_31981.create_db('vt_snapshot_test')
    tablet_31981.init_tablet('idle', start=True)

    # small test to make sure the directory validation works
    snapshot_dir = os.path.join(environment.vtdataroot, 'snapshot')
    utils.run("rm -rf %s" % snapshot_dir)
    utils.run("mkdir -p %s" % snapshot_dir)
    utils.run("chmod -w %s" % snapshot_dir)
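    # The directory is now read-only, so the Clone below should fail its snapshot directory validation.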
    out, err = utils.run_vtctl(['Clone', '-force'] + clone_flags +
                               [tablet_62344.tablet_alias,
                                tablet_31981.tablet_alias],
                               log_level='INFO', expect_fail=True)
    if "Cannot validate snapshot directory" not in err:
      self.fail("expected validation error: %s" % err)
    if "Un-reserved test_ny-0000031981" not in err:
      self.fail("expected Un-reserved: %s" % err)
    logging.debug("Failed Clone output: " + err)
    utils.run("chmod +w %s" % snapshot_dir)

    call(["touch", "/tmp/vtSimulateFetchFailures"])
    utils.run_vtctl(['Clone', '-force'] + clone_flags +
                    [tablet_62344.tablet_alias, tablet_31981.tablet_alias],
                    auto_log=True)
    self._check_shard()

    utils.pause("look at logs!")
    tablet_31981.assert_table_count('vt_snapshot_test', 'vt_insert_test', 4)
    tablet_62344.assert_table_count('vt_snapshot_test', 'vt_insert_test', 4)

    utils.validate_topology()

    tablet.kill_tablets([tablet_62344, tablet_31981])
Example #25
  def test_health_check_uid_collision(self):
    # If two tablets are running with the same UID, we should prevent the
    # healthcheck on the older one from modifying the tablet record after the
    # record has been claimed by a newer instance.
    tablet_62344.init_tablet('master', 'test_keyspace', '0')
    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')

    # Before starting tablets, simulate another tablet
    # owning the replica's record.
    utils.run_vtctl(['InitTablet', '-allow_update', '-hostname', 'localhost',
                     '-keyspace', 'test_keyspace', '-shard', '0', '-port', '0',
                     '-parent', tablet_62044.tablet_alias, 'replica'])

    # Set up tablets.
    tablet_62344.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica')
    tablet_62044.start_vttablet(wait_for_state=None,
                                target_tablet_type='replica',
                                init_keyspace='test_keyspace',
                                init_shard='0')
    tablet_62344.wait_for_vttablet_state('SERVING')
    tablet_62044.wait_for_vttablet_state('NOT_SERVING')
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/0',
                     tablet_62344.tablet_alias])
    tablet_62044.wait_for_vttablet_state('SERVING')

    # Check that the tablet owns the record.
    tablet_record = utils.run_vtctl_json(['GetTablet',
                                          tablet_62044.tablet_alias])
    self.assertEquals(tablet_record['port_map']['vt'], tablet_62044.port,
                      "tablet didn't take over the record")

    # Take away ownership again.
    utils.run_vtctl(['InitTablet', '-allow_update', '-hostname', 'localhost',
                     '-keyspace', 'test_keyspace', '-shard', '0', '-port', '0',
                     '-parent', tablet_62044.tablet_alias, 'replica'])

    # Tell the tablets to shutdown gracefully,
    # which normally includes going SPARE.
    tablet.kill_tablets([tablet_62344, tablet_62044])

    # Make sure the tablet record hasn't been touched.
    tablet_record = utils.run_vtctl_json(['GetTablet',
                                          tablet_62044.tablet_alias])
    self.assertEquals(tablet_record['type'],
                      tablet_62044.tablet_type_value['REPLICA'],
                      'tablet changed record without owning it')
Example #26
    def _test_vtctl_clone(self, server_mode):
        if server_mode:
            clone_flags = ["-server-mode"]
        else:
            clone_flags = []

        # Start up a master mysql and vttablet
        utils.run_vtctl(["CreateKeyspace", "snapshot_test"])

        tablet_62344.init_tablet("master", "snapshot_test", "0")
        utils.run_vtctl(["RebuildShardGraph", "snapshot_test/0"])
        utils.validate_topology()

        tablet_62344.populate("vt_snapshot_test", self._create_vt_insert_test, self._populate_vt_insert_test)
        tablet_62344.start_vttablet()

        tablet_62044.create_db("vt_snapshot_test")
        tablet_62044.init_tablet("idle", start=True)

        # small test to make sure the directory validation works
        snapshot_dir = os.path.join(environment.vtdataroot, "snapshot")
        utils.run("rm -rf %s" % snapshot_dir)
        utils.run("mkdir -p %s" % snapshot_dir)
        utils.run("chmod -w %s" % snapshot_dir)
        out, err = utils.run_vtctl(
            ["Clone", "-force"] + clone_flags + [tablet_62344.tablet_alias, tablet_62044.tablet_alias],
            log_level="INFO",
            expect_fail=True,
        )
        if "Cannot validate snapshot directory" not in err:
            self.fail("expected validation error: %s" % err)
        if "Un-reserved test_nj-0000062044" not in err:
            self.fail("expected Un-reserved: %s" % err)
        logging.debug("Failed Clone output: " + err)
        utils.run("chmod +w %s" % snapshot_dir)

        call(["touch", "/tmp/vtSimulateFetchFailures"])
        utils.run_vtctl(
            ["Clone", "-force"] + clone_flags + [tablet_62344.tablet_alias, tablet_62044.tablet_alias], auto_log=True
        )

        utils.pause("look at logs!")
        tablet_62044.assert_table_count("vt_snapshot_test", "vt_insert_test", 4)
        tablet_62344.assert_table_count("vt_snapshot_test", "vt_insert_test", 4)

        utils.validate_topology()

        tablet.kill_tablets([tablet_62344, tablet_62044])
Example #27
def tearDownModule():
  utils.required_teardown()
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets(all_tablets)

  teardown_procs = [t.teardown_mysql() for t in all_tablets]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  for t in all_tablets:
    t.remove_tree()
Example #28
  def test_repeated_init_shard_master(self):
    for t in tablet_62344, tablet_62044:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None,
                       target_tablet_type='replica',
                       lameduck_period='5s',
                       init_keyspace='test_keyspace',
                       init_shard='0')

    # tablets are not replicating, so they won't be healthy
    for t in tablet_62344, tablet_62044:
      t.wait_for_vttablet_state('NOT_SERVING')
      self.check_healthz(t, False)

    # pick one master out of the two
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias])

    # run health check on both, make sure they are both healthy
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'],
                      auto_log=True)
      self.check_healthz(t, True)

    # pick the other one as master, make sure they are still healthy
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62044.tablet_alias])

    # run health check on both, make sure they are both healthy
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'],
                      auto_log=True)
      self.check_healthz(t, True)

    # and come back to the original guy
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias])

    # run health check on both, make sure they are both healthy
    for t in tablet_62344, tablet_62044:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias, 'replica'],
                      auto_log=True)
      self.check_healthz(t, True)

    # and done
    tablet.kill_tablets([tablet_62344, tablet_62044])
Example #29
def tearDownModule():
  if utils.options.skip_teardown:
    return

  tablet.kill_tablets([master_tablet, replica_tablet])

  teardown_procs = [
      master_tablet.teardown_mysql(),
      replica_tablet.teardown_mysql(),
      ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server_teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  master_tablet.remove_tree()
  replica_tablet.remove_tree()
Example #30
def tearDownModule():
  global __tablets
  if utils.options.skip_teardown:
    return
  if __tablets is not None:
    tablet.kill_tablets(__tablets)
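    # Collect each tablet's mysqld teardown process and wait for them all to finish.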
    teardown_procs = []
    for t in __tablets:
      teardown_procs.append(t.teardown_mysql())
    utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()

  utils.kill_sub_processes()
  utils.remove_tmp_files()

  if __tablets is not None:
    for t in __tablets:
      t.remove_tree()
Example #31
    def _test_reparent_graceful(self, shard_id):
        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
        self.assertEqual(
            shard['cells'], ['test_nj'],
            'wrong list of cell in Shard: %s' % str(shard['cells']))

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        for t in [tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
        self.assertEqual(
            shard['cells'], ['test_nj', 'test_ny'],
            'wrong list of cell in Shard: %s' % str(shard['cells']))

        # Force the slaves to reparent assuming that all the datasets are
        # identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/' + shard_id,
            tablet_62344.tablet_alias
        ])
        utils.validate_topology(ping_tablets=True)
        tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

        self._check_master_tablet(tablet_62344)

        utils.validate_topology()

        # Run this to make sure it succeeds.
        stdout, _ = utils.run_vtctl(
            ['ShardReplicationPositions', 'test_keyspace/' + shard_id],
            trap_output=True)
        lines = stdout.splitlines()
        self.assertEqual(len(lines), 4)  # one master, three slaves
        self.assertIn('master', lines[0])  # master first

        # Perform a graceful reparent operation.
        utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/' +
            shard_id, '-new_master', tablet_62044.tablet_alias
        ],
                        auto_log=True)
        utils.validate_topology()

        self._check_master_tablet(tablet_62044)

        # insert data into the new master, check the connected slaves work
        self._populate_vt_insert_test(tablet_62044, 1)
        self._check_vt_insert_test(tablet_41983, 1)
        self._check_vt_insert_test(tablet_62344, 1)

        tablet.kill_tablets(
            [tablet_62344, tablet_62044, tablet_41983, tablet_31981])

        # Test address correction.
        new_port = environment.reserve_ports(1)
        tablet_62044.start_vttablet(port=new_port)

        # Wait until the new address registers.
        timeout = 30.0
        while True:
            try:
                self._check_master_tablet(tablet_62044, port=new_port)
                break
            except protocols_flavor().client_error_exception_type():
                timeout = utils.wait_step('waiting for new port to register',
                                          timeout,
                                          sleep_time=0.1)

        tablet_62044.kill_vttablet()
Example #32
  def test_sharding(self):

    shard_0_master.init_tablet('master', 'test_keyspace', '-80')
    shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
    shard_1_master.init_tablet('master', 'test_keyspace', '80-')
    shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    # run checks now before we start the tablets
    utils.validate_topology()

    # create databases, start the tablets, wait for them to start
    for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None)
    for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
      t.wait_for_vttablet_state('SERVING')

    # apply the schema on the first shard through vtctl, so all tablets
    # are the same.
    shard_0_master.mquery('vt_test_keyspace',
                          create_vt_select_test.replace('\n', ''), write=True)
    shard_0_replica.mquery('vt_test_keyspace',
                           create_vt_select_test.replace('\n', ''), write=True)

    # apply the schema on the second shard.
    shard_1_master.mquery(
        'vt_test_keyspace',
        create_vt_select_test_reverse.replace('\n', ''), write=True)
    shard_1_replica.mquery(
        'vt_test_keyspace',
        create_vt_select_test_reverse.replace('\n', ''), write=True)

    for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
      utils.run_vtctl(['ReloadSchema', t.tablet_alias])

    # start vtgate, we'll use it later
    utils.VtGate().start()

    for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
      t.reset_replication()
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                     shard_0_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                     shard_1_master.tablet_alias], auto_log=True)

    # insert some values directly (db is RO after minority reparent)
    # FIXME(alainjobart) these values don't match the shard map
    utils.run_vtctl(['SetReadWrite', shard_0_master.tablet_alias])
    utils.run_vtctl(['SetReadWrite', shard_1_master.tablet_alias])
    shard_0_master.mquery(
        'vt_test_keyspace',
        "insert into vt_select_test (id, msg) values (1, 'test 1')",
        write=True)
    shard_1_master.mquery(
        'vt_test_keyspace',
        "insert into vt_select_test (id, msg) values (10, 'test 10')",
        write=True)

    utils.validate_topology(ping_tablets=True)

    utils.pause('Before the sql scatter query')

    # make sure the '1' value was written on first shard
    rows = shard_0_master.mquery(
        'vt_test_keyspace', 'select id, msg from vt_select_test order by id')
    self.assertEqual(rows, ((1, 'test 1'),),
                     'wrong mysql_query output: %s' % str(rows))

    utils.pause('After db writes')

    # throw in some schema validation step
    # we created the schema differently, so it should show
    utils.run_vtctl(['ValidateSchemaShard', 'test_keyspace/-80'])
    utils.run_vtctl(['ValidateSchemaShard', 'test_keyspace/80-'])
    out, err = utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                               trap_output=True, raise_on_error=False)
    if ('test_nj-0000062344 and test_nj-0000062346 disagree on schema '
        'for table vt_select_test:\nCREATE TABLE' not in err or
        'test_nj-0000062344 and test_nj-0000062347 disagree on schema '
        'for table vt_select_test:\nCREATE TABLE' not in err):
      self.fail('wrong ValidateSchemaKeyspace output: ' + err)

    # validate versions
    utils.run_vtctl(['ValidateVersionShard', 'test_keyspace/-80'],
                    auto_log=True)
    utils.run_vtctl(['ValidateVersionKeyspace', 'test_keyspace'], auto_log=True)

    # show and validate permissions
    utils.run_vtctl(['GetPermissions', 'test_nj-0000062344'], auto_log=True)
    utils.run_vtctl(['ValidatePermissionsShard', 'test_keyspace/-80'],
                    auto_log=True)
    utils.run_vtctl(['ValidatePermissionsKeyspace', 'test_keyspace'],
                    auto_log=True)

    if environment.topo_server().flavor() == 'zookeeper':
      # and create zkns on this complex keyspace, make sure a few
      # files are created
      utils.run_vtctl(['ExportZknsForKeyspace', 'test_keyspace'])
      out, err = utils.run(
          environment.binary_argstr('zk') +
          ' ls -R /zk/test_nj/zk?s/vt/test_keysp*', trap_output=True)
      lines = out.splitlines()
      for base in ['-80', '80-']:
        for db_type in ['master', 'replica']:
          for sub_path in ['', '.vdns', '/0', '/vt.vdns']:
            expected = ('/zk/test_nj/zkns/vt/test_keyspace/' + base + '/' +
                        db_type + sub_path)
            if expected not in lines:
              self.fail('missing zkns part:\n%s\nin:%s' % (expected, out))

    # connect to the tablets directly, make sure they know / validate
    # their own shard
    sql = 'select id, msg from vt_select_test order by id'

    qr = shard_0_master.execute(sql)
    self.assertEqual(qr['Rows'], [['1', 'test 1'],])

    qr = shard_1_master.execute(sql)
    self.assertEqual(qr['Rows'], [['10', 'test 10'],])

    _, stderr = utils.run_vtctl(['VtTabletExecute',
                                 '-keyspace', 'test_keyspace',
                                 '-shard', '-90',
                                 shard_0_master.tablet_alias, sql],
                                expect_fail=True)
    self.assertIn('fatal: Shard mismatch, expecting -80, received -90', stderr)

    utils.vtgate.kill()
    tablet.kill_tablets([shard_0_master, shard_0_replica, shard_1_master,
                         shard_1_replica])
Example #33
    def test_resharding(self):
        # we're going to reparent and swap these two
        global shard_2_master, shard_2_replica1

        utils.run_vtctl([
            'CreateKeyspace', '--sharding_column_name', 'bad_column',
            '--sharding_column_type', 'bytes', 'test_keyspace'
        ])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_sharding_key',
            'uint64'
        ],
                        expect_fail=True)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'custom_sharding_key', keyspace_id_type
        ])

        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
        shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        ks = utils.run_vtctl_json(
            ['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
        self.assertEqual(ks['sharding_column_name'], 'custom_sharding_key')

        # we set full_mycnf_args to True as a test in the KIT_BYTES case
        full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

        # create databases so vttablet can start behaving somewhat normally
        for t in [
                shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_rdonly, shard_1_rdonly1
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             full_mycnf_args=full_mycnf_args)

        # wait for the tablets (replication is not setup, the slaves won't be
        # healthy)
        shard_0_master.wait_for_vttablet_state('SERVING')
        shard_0_replica.wait_for_vttablet_state('NOT_SERVING')
        shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
        shard_1_master.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
        shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
        shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias
        ],
                        auto_log=True)

        # check the shards
        shards = utils.run_vtctl_json(
            ['FindAllShardsInKeyspace', 'test_keyspace'])
        self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
        self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
        self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

        # create the tables
        self._create_schema()
        self._insert_startup_values()
        self._test_keyrange_constraints()

        # run a health check on source replicas so they respond to discovery
        # (for binlog players) and on the source rdonlys (for workers)
        for t in [shard_0_replica, shard_1_slave1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
        for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

        # create the split shards
        shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
        shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
        shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
        shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

        # start vttablet on the split shards (no db created,
        # so none of them are serving)
        shard_2_master.start_vttablet(wait_for_state=None)
        shard_3_master.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_replica1, shard_2_replica2, shard_3_replica,
                shard_3_rdonly1
        ]:
            t.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_master, shard_2_replica1, shard_2_replica2,
                shard_3_master, shard_3_replica, shard_3_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/80-c0',
            shard_2_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias
        ],
                        auto_log=True)

        # check the shards
        shards = utils.run_vtctl_json(
            ['FindAllShardsInKeyspace', 'test_keyspace'])
        for s in ['-80', '80-', '80-c0', 'c0-']:
            self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
        self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')

        # disable shard_1_slave2, so we're sure filtered replication will come
        # from shard_1_slave1
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

        # we need to create the schema, and the worker will do data copying
        for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_1_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)

        # the worker will do everything. We test with source_reader_count=10
        # (down from the default of 20) as the connection pool is not big
        # enough for 20. min_table_size_for_split is set to 1 so as to force a
        # split even on the small table we have.
        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        utils.run_vtworker([
            '--cell', 'test_nj', '--command_display_interval', '10ms',
            'SplitClone', '--exclude_tables', 'unrelated',
            '--source_reader_count', '10', '--min_table_size_for_split', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl([
            'ValidateSchemaKeyspace', '--exclude_tables=unrelated',
            'test_keyspace'
        ],
                        auto_log=True)

        # check the binlog players are running and exporting vars
        self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
        self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

        # check that binlog server exported the stats vars
        self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

        # Check that the throttler was enabled.
        self.check_binlog_throttler(shard_2_master.rpc_endpoint(),
                                    ['BinlogPlayer/0'], 9999)
        self.check_binlog_throttler(shard_3_master.rpc_endpoint(),
                                    ['BinlogPlayer/0'], 9999)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000)
        logging.debug('Checking 80 percent of data is sent quickly')
        v = self._check_lots_timeout(1000, 80, 5)
        if v != 100:
            # small optimization: only do this check if we don't have all the data
            # already anyway.
            logging.debug('Checking all data goes through eventually')
            self._check_lots_timeout(1000, 100, 20)
        logging.debug('Checking no data was sent the wrong way')
        self._check_lots_not_present(1000)
        self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                      seconds_behind_master_max=30)
        self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                      seconds_behind_master_max=30)
        self.check_binlog_server_vars(shard_1_slave1,
                                      horizontal=True,
                                      min_statements=1000,
                                      min_transactions=1000)
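        # (These checks lean on exported vars: seconds_behind_master_max
        # presumably bounds the replication lag the binlog players report on
        # the destination masters, while min_statements/min_transactions
        # assert the source binlog server streamed at least that many
        # statements and transactions for the inserted rows.)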

        # use vtworker to compare the data (after health-checking the destination
        # rdonly tablets so discovery works)
        utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
        logging.debug('Running vtworker SplitDiff')
        utils.run_vtworker([
            '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause('Good time to test vtworker for diffs')

        # get status for destination master tablets, make sure we have it all
        self.check_running_binlog_player(shard_2_master, 4000, 2000)
        self.check_running_binlog_player(shard_3_master, 4000, 2000)

        # start threads to insert data into shard_1 in the background
        # (tagged with the current time), and monitor the replication delay
        insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 10000,
                                       0x9000000000000000)
        insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 10001,
                                       0xD000000000000000)
        monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low')
        monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high')

        # tests a failover switching serving to a different replica
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
        utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

        # test data goes through again
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000, base=1000)
        logging.debug('Checking 80 percent of data was sent quickly')
        self._check_lots_timeout(1000, 80, 5, base=1000)
        self.check_binlog_server_vars(shard_1_slave2,
                                      horizontal=True,
                                      min_statements=800,
                                      min_transactions=800)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        expect_fail=True)

        # check the query service is off on master 2 and master 3, as filtered
        # replication is enabled. Even the health check that is enabled on
        # master 3 should not interfere (we run it to be sure).
        utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                        auto_log=True)
        for master in [shard_2_master, shard_3_master]:
            utils.check_tablet_query_service(self, master, False, False)
            stream_health = utils.run_vtctl_json(
                ['VtTabletStreamHealth', '-count', '1', master.tablet_alias])
            logging.debug('Got health: %s', str(stream_health))
            self.assertIn('realtime_stats', stream_health)
            self.assertNotIn('serving', stream_health)
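            # ('serving' is expected to be absent rather than False, most
            # likely because the proto3 JSON encoding omits fields that are at
            # their default value.)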

        # check the destination master 3 is healthy, even though its query
        # service is not running (if it were not healthy, this would raise an
        # exception)
        shard_3_master.get_healthz()

        # now serve rdonly from the split shards, in test_nj only
        utils.run_vtctl([
            'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-',
            'rdonly'
        ],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')
        utils.check_srv_keyspace('test_ny',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')
        utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

        # now serve rdonly from the split shards, everywhere
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')
        utils.check_srv_keyspace('test_ny',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')
        utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
        utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

        # then serve replica from the split shards
        destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-c0 c0-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')
        utils.check_tablet_query_service(self, shard_1_slave2, False, True)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
            auto_log=True)
        # After a backwards migration, queryservice should be enabled on
        # source and disabled on destinations
        utils.check_tablet_query_service(self, shard_1_slave2, True, False)
        # Destination tablets would have the query service disabled for
        # reasons other than the migration, so check the shard record instead
        # of the tablets directly.
        utils.check_shard_query_services(self, destination_shards,
                                         topodata_pb2.REPLICA, False)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        # After a forwards migration, queryservice should be disabled on
        # source and enabled on destinations
        utils.check_tablet_query_service(self, shard_1_slave2, False, True)
        # Destination tablets would have the query service disabled for
        # reasons other than the migration, so check the shard record instead
        # of the tablets directly.
        utils.check_shard_query_services(self, destination_shards,
                                         topodata_pb2.REPLICA, True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-c0 c0-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')

        # reparent shard_2 to shard_2_replica1, then insert more data and
        # see it flow through still
        utils.run_vtctl([
            'PlannedReparentShard', 'test_keyspace/80-c0',
            shard_2_replica1.tablet_alias
        ])

        # update our test variables to point at the new master
        shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

        logging.debug(
            'Inserting lots of data on source shard after reparenting')
        self._insert_lots(3000, base=2000)
        logging.debug('Checking 80 percent of data was sent fairly quickly')
        self._check_lots_timeout(3000, 80, 10, base=2000)

        # use vtworker to compare the data again
        logging.debug('Running vtworker SplitDiff')
        utils.run_vtworker([
            '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        # going to migrate the master now, check the delays
        monitor_thread_1.done = True
        monitor_thread_2.done = True
        insert_thread_1.done = True
        insert_thread_2.done = True
        logging.debug('DELAY 1: %s max_lag=%d avg_lag=%d',
                      monitor_thread_1.object_name, monitor_thread_1.max_lag,
                      monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
        logging.debug('DELAY 2: %s max_lag=%d avg_lag=%d',
                      monitor_thread_2.object_name, monitor_thread_2.max_lag,
                      monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

        # manipulate the SourceShard records to test 'vtctl SourceShardDelete'
        # and 'vtctl SourceShardAdd'
        utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                        auto_log=True)
        utils.run_vtctl([
            'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0',
            'test_keyspace/80-'
        ],
                        auto_log=True)

        # then serve master from the split shards, make sure the source master's
        # query service is now turned off
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-c0 c0-\n'
                                 'Partitions(rdonly): -80 80-c0 c0-\n'
                                 'Partitions(replica): -80 80-c0 c0-\n',
                                 keyspace_id_type=keyspace_id_type,
                                 sharding_column_name='custom_sharding_key')
        utils.check_tablet_query_service(self, shard_1_master, False, True)

        # check the binlog players are gone now
        self.check_no_binlog_player(shard_2_master)
        self.check_no_binlog_player(shard_3_master)

        # delete the original tablets in the original shard
        tablet.kill_tablets([
            shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1
        ])
        for t in [
                shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
                shard_1_rdonly1
        ]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
        utils.run_vtctl(
            ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias],
            auto_log=True)

        # rebuild the serving graph, all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # test RemoveShardCell
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                        auto_log=True,
                        expect_fail=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                        auto_log=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                        auto_log=True)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
        self.assertNotIn('cells', shard)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

        # kill everything
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master,
            shard_2_replica1, shard_2_replica2, shard_3_master,
            shard_3_replica, shard_3_rdonly1
        ])
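A minimal illustration (not Vitess routing code) of how the keyspace ids used in the test above map onto the split shards: the 80- range is divided at the 0xc0 boundary, which is why the insert_low rows (0x90...) are monitored on shard 2 (80-c0) and the insert_high rows (0xd0...) on shard 3 (c0-). The helper below is a hypothetical sketch for illustration only.


def destination_shard_sketch(keyspace_id):
    # keyspace_id: the uint64 custom_sharding_key value used by the test rows.
    if keyspace_id < 0x8000000000000000:
        return '-80'  # outside the 80- range being split
    return '80-c0' if keyspace_id < 0xC000000000000000 else 'c0-'


# The values used by the insert/monitor threads land where the test expects:
assert destination_shard_sketch(0x9000000000000000) == '80-c0'  # shard 2
assert destination_shard_sketch(0xD000000000000000) == 'c0-'    # shard 3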
Ejemplo n.º 34
0
    def _test_reparent_slave_offline(self, shard_id='0'):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('master',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('SERVING')

        # Recompute the shard layout node - until you do that, it might not be
        # valid.
        utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
        utils.validate_topology()

        # Force the slaves to reparent assuming that all the datasets are identical.
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.reset_replication()
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/' + shard_id,
            tablet_62344.tablet_alias
        ])
        utils.validate_topology(ping_tablets=True)

        self._check_db_addr(shard_id, 'master', tablet_62344.port)

        # Kill one slave so it appears offline during the reparent
        tablet_31981.kill_vttablet()

        # Perform a graceful reparent operation.
        utils.run_vtctl([
            'ReparentShard', 'test_keyspace/' + shard_id,
            tablet_62044.tablet_alias
        ])

        tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])
Ejemplo n.º 35
0
    def _test_reparent_from_outside(self, brutal=False, rpc=False):
        """This test will start a master and 3 slaves. Then:
    - one slave will be the new master
    - one slave will be reparented to that new master
    - one slave will be busted and dead in the water
    and we'll call ShardExternallyReparented.

    Args:
      brutal: scraps the old master first
      rpc: sends an RPC to the new master instead of doing the work.
    """
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('master',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('SERVING')

        # Reparent as a starting point
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.reset_replication()
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ],
                        auto_log=True)

        # now manually reparent 1 of the 2 remaining slaves
        # 62044 will be the new master
        # 31981 won't be re-parented, so it will be busted
        tablet_62044.mquery('', mysql_flavor().promote_slave_commands())
        new_pos = mysql_flavor().master_position(tablet_62044)
        logging.debug('New master position: %s', str(new_pos))
        changeMasterCmds = mysql_flavor().change_master_commands(
            utils.hostname, tablet_62044.mysql_port, new_pos)

        # 62344 will now be a slave of 62044
        tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                            changeMasterCmds + ['START SLAVE'])

        # 41983 will be a slave of 62044
        tablet_41983.mquery('', ['STOP SLAVE'] + changeMasterCmds +
                            ['START SLAVE'])

        # in brutal mode, we scrap the old master first
        if brutal:
            tablet_62344.scrap(force=True)
            # we have some automated tools that do this too, so it's good to simulate
            if environment.topo_server().flavor() == 'zookeeper':
                utils.run(
                    environment.binary_args('zk') +
                    ['rm', '-rf', tablet_62344.zk_tablet_path])

        # update zk with the new graph
        extra_args = []
        if rpc:
            extra_args = ['-use_rpc']
        utils.run_vtctl(['ShardExternallyReparented'] + extra_args +
                        ['test_keyspace/0', tablet_62044.tablet_alias],
                        mode=utils.VTCTL_VTCTL,
                        auto_log=True)

        self._test_reparent_from_outside_check(brutal)

        utils.run_vtctl(
            ['RebuildReplicationGraph', 'test_nj', 'test_keyspace'])

        self._test_reparent_from_outside_check(brutal)

        tablet.kill_tablets(
            [tablet_31981, tablet_62344, tablet_62044, tablet_41983])
Ejemplo n.º 36
0
    def test_health_check(self):
        # one master, one replica that starts in spare
        # (for the replica, we let vttablet do the InitTablet)
        tablet_62344.init_tablet('master', 'test_keyspace', '0')

        for t in tablet_62344, tablet_62044:
            t.create_db('vt_test_keyspace')

        tablet_62344.start_vttablet(wait_for_state=None,
                                    target_tablet_type='replica')
        tablet_62044.start_vttablet(wait_for_state=None,
                                    target_tablet_type='replica',
                                    lameduck_period='5s',
                                    init_keyspace='test_keyspace',
                                    init_shard='0')

        tablet_62344.wait_for_vttablet_state('SERVING')
        tablet_62044.wait_for_vttablet_state('NOT_SERVING')
        self.check_healthz(tablet_62044, False)

        utils.run_vtctl(
            ['InitShardMaster', 'test_keyspace/0', tablet_62344.tablet_alias])

        # make sure the 'spare' slave goes to 'replica'
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'replica')
        self.check_healthz(tablet_62044, True)

        # make sure the master is still master
        ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
        self.assertEqual(ti['type'], tablet.Tablet.tablet_type_value['MASTER'],
                         'unexpected master type: %s' % ti['type'])

        # stop replication, make sure we go unhealthy.
        utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'spare')
        self.check_healthz(tablet_62044, False)

        # make sure the serving graph was updated
        timeout = 10
        while True:
            try:
                utils.run_vtctl_json(
                    ['GetEndPoints', 'test_nj', 'test_keyspace/0', 'replica'])
            except Exception:
                logging.debug('Tablet is gone from serving graph, good')
                break
            timeout = utils.wait_step(
                'Stopped replication didn\'t trigger removal from serving graph',
                timeout)

        # make sure status web page is unhappy
        self.assertIn(
            '>unhealthy: replication_reporter: '
            'Replication is not running</span></div>',
            tablet_62044.get_status())

        # make sure the health stream is updated
        health = utils.run_vtctl_json(
            ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
        self.assertIn('replication_reporter: Replication is not running',
                      health['realtime_stats']['health_error'])

        # then restart replication and make sure we go back to healthy
        utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'replica')

        # make sure status web page is healthy
        self.assertIn('>healthy</span></div>', tablet_62044.get_status())

        # make sure the vars are updated
        v = utils.get_vars(tablet_62044.port)
        self.assertEqual(v['LastHealthMapCount'], 0)

        # now test VtTabletStreamHealth returns the right thing
        stdout, stderr = utils.run_vtctl(
            ['VtTabletStreamHealth', '-count', '2', tablet_62044.tablet_alias],
            trap_output=True,
            auto_log=True)
        lines = stdout.splitlines()
        self.assertEqual(len(lines), 2)
        for line in lines:
            logging.debug('Got health: %s', line)
            data = json.loads(line)
            self.assertIn('realtime_stats', data)
            self.assertNotIn('health_error', data['realtime_stats'])
            self.assertNotIn('tablet_externally_reparented_timestamp', data)
            self.assertEqual('test_keyspace', data['target']['keyspace'])
            self.assertEqual('0', data['target']['shard'])
            self.assertEqual(3, data['target']['tablet_type'])

        # kill the tablets
        tablet.kill_tablets([tablet_62344, tablet_62044])

        # the replica was in lameduck for 5 seconds, should have been enough
        # to reset its state to spare
        ti = utils.run_vtctl_json(['GetTablet', tablet_62044.tablet_alias])
        self.assertEqual(
            ti['type'], tablet.Tablet.tablet_type_value['SPARE'],
            "tablet didn't go to spare while in lameduck mode: %s" % str(ti))
Ejemplo n.º 37
0
    def test_no_mysql_healthcheck(self):
        """This test starts a vttablet with no mysql port, while mysql is down.
    It makes sure vttablet will start properly and be unhealthy.
    Then we start mysql, and make sure vttablet becomes healthy.
    """
        # we need replication to be enabled, so the slave tablet can be healthy.
        for t in tablet_62344, tablet_62044:
            t.create_db('vt_test_keyspace')
        pos = mysql_flavor().master_position(tablet_62344)
        # Use 'localhost' as hostname because Travis CI worker hostnames
        # are too long for MySQL replication.
        changeMasterCmds = mysql_flavor().change_master_commands(
            'localhost', tablet_62344.mysql_port, pos)
        tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                            changeMasterCmds + ['START SLAVE'])

        # now shut down all mysqld
        shutdown_procs = [
            tablet_62344.shutdown_mysql(),
            tablet_62044.shutdown_mysql(),
        ]
        utils.wait_procs(shutdown_procs)

        # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
        tablet_62344.init_tablet('master', 'test_keyspace', '0')
        tablet_62044.init_tablet('spare',
                                 'test_keyspace',
                                 '0',
                                 include_mysql_port=False)
        for t in tablet_62344, tablet_62044:
            t.start_vttablet(wait_for_state=None,
                             target_tablet_type='replica',
                             full_mycnf_args=True,
                             include_mysql_port=False)
        for t in tablet_62344, tablet_62044:
            t.wait_for_vttablet_state('NOT_SERVING')
            self.check_healthz(t, False)

        # restart mysqld
        start_procs = [
            tablet_62344.start_mysql(),
            tablet_62044.start_mysql(),
        ]
        utils.wait_procs(start_procs)

        # the master should still be healthy
        utils.run_vtctl(
            ['RunHealthCheck', tablet_62344.tablet_alias, 'replica'],
            auto_log=True)
        self.check_healthz(tablet_62344, True)

        # the slave won't be healthy at first, as replication is not running
        utils.run_vtctl(
            ['RunHealthCheck', tablet_62044.tablet_alias, 'replica'],
            auto_log=True)
        self.check_healthz(tablet_62044, False)
        tablet_62044.wait_for_vttablet_state('NOT_SERVING')

        # restart replication
        tablet_62044.mquery('', ['START SLAVE'])

        # wait for the tablet to become healthy and fix its mysql port
        utils.run_vtctl(
            ['RunHealthCheck', tablet_62044.tablet_alias, 'replica'],
            auto_log=True)
        tablet_62044.wait_for_vttablet_state('SERVING')
        self.check_healthz(tablet_62044, True)

        for t in tablet_62344, tablet_62044:
            # wait for mysql port to show up
            timeout = 10
            while True:
                ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
                if 'mysql' in ti['port_map']:
                    break
                timeout = utils.wait_step('mysql port in tablet record',
                                          timeout)
            self.assertEqual(ti['port_map']['mysql'], t.mysql_port)

        # all done
        tablet.kill_tablets([tablet_62344, tablet_62044])
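The ad-hoc wait loops in these tests (polling for the mysql port to appear in the tablet record, for an endpoint to leave the serving graph, or for a new address to register) all follow the same pattern around utils.wait_step. Below is a self-contained sketch of that pattern, assuming only that wait_step sleeps briefly and returns the reduced timeout, raising once the budget runs out; wait_step_sketch is a hypothetical stand-in, not the real test utility.

import time


def wait_step_sketch(description, timeout, sleep_time=1.0):
    """Hypothetical stand-in for utils.wait_step.

    Decrements the remaining budget, raises with a useful message once the
    budget is exhausted, otherwise sleeps a bit and returns the reduced
    timeout so the caller can keep looping.
    """
    timeout -= sleep_time
    if timeout <= 0:
        raise Exception('timed out waiting for: %s' % description)
    time.sleep(sleep_time)
    return timeout


# Usage mirrors the polling loops above:
#   timeout = 10
#   while not condition_is_met():
#       timeout = wait_step_sketch('condition to be met', timeout)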
Ejemplo n.º 38
0
  def _test_reparent_graceful(self, shard_id):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True)
    if environment.topo_server().flavor() == 'zookeeper':
      shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
      self.assertEqual(shard['Cells'], ['test_nj'],
                       'wrong list of cell in Shard: %s' % str(shard['Cells']))

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    for t in [tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('SERVING')
    if environment.topo_server().flavor() == 'zookeeper':
      shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
      self.assertEqual(shard['Cells'], ['test_nj', 'test_ny'],
                       'wrong list of cell in Shard: %s' % str(shard['Cells']))

    # Recompute the shard layout node - until you do that, it might not be
    # valid.
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.pause('force ReparentShard?')
    utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/' + shard_id,
                     tablet_62344.tablet_alias])
    utils.validate_topology(ping_tablets=True)

    self._check_db_addr(shard_id, 'master', tablet_62344.port)

    # Verify MasterCell is set to new cell.
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')

    # Convert two replicas to spare. That should leave only one node serving
    # traffic, but they still need to appear in the replication graph.
    utils.run_vtctl(['ChangeSlaveType', tablet_41983.tablet_alias, 'spare'])
    utils.run_vtctl(['ChangeSlaveType', tablet_31981.tablet_alias, 'spare'])
    utils.validate_topology()
    self._check_db_addr(shard_id, 'replica', tablet_62044.port)

    # Run this to make sure it succeeds.
    utils.run_vtctl(['ShardReplicationPositions', 'test_keyspace/' + shard_id],
                    stdout=utils.devnull)

    # Perform a graceful reparent operation.
    utils.pause('graceful ReparentShard?')
    utils.run_vtctl(['ReparentShard', 'test_keyspace/' + shard_id,
                     tablet_62044.tablet_alias], auto_log=True)
    utils.validate_topology()

    self._check_db_addr(shard_id, 'master', tablet_62044.port)

    # Verify MasterCell is set to new cell.
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')

    tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                         tablet_31981])

    # Test address correction.
    new_port = environment.reserve_ports(1)
    tablet_62044.start_vttablet(port=new_port)

    # Wait until the new address registers.
    timeout = 30.0
    while True:
      try:
        self._check_db_addr(shard_id, 'master', new_port)
        break
      except Exception:
        timeout = utils.wait_step('waiting for new port to register',
                                  timeout, sleep_time=0.1)

    tablet_62044.kill_vttablet()
Ejemplo n.º 39
0
  def test_reparent_cross_cell(self, shard_id='0'):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(shard['Cells'], ['test_nj'],
                     'wrong list of cell in Shard: %s' % str(shard['Cells']))

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', shard_id, start=True,
                             wait_for_start=False)
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('SERVING')
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
    self.assertEqual(
        shard['Cells'], ['test_nj', 'test_ny'],
        'wrong list of cell in Shard: %s' % str(shard['Cells']))

    # Recompute the shard layout node - until you do that, it might not be
    # valid.
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.pause('force ReparentShard?')
    utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/' + shard_id,
                     tablet_62344.tablet_alias])
    utils.validate_topology(ping_tablets=True)

    self._check_db_addr(shard_id, 'master', tablet_62344.port)

    # Verify MasterCell is properly set
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_nj')

    # Perform a graceful reparent operation to another cell.
    utils.pause('graceful ReparentShard?')
    utils.run_vtctl(['ReparentShard', 'test_keyspace/' + shard_id,
                     tablet_31981.tablet_alias], auto_log=True)
    utils.validate_topology()

    self._check_db_addr(shard_id, 'master', tablet_31981.port, cell='test_ny')

    # Verify MasterCell is set to new cell.
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_nj',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_ny')
    srvShard = utils.run_vtctl_json(['GetSrvShard', 'test_ny',
                                     'test_keyspace/%s' % (shard_id)])
    self.assertEqual(srvShard['MasterCell'], 'test_ny')

    tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983,
                         tablet_31981])
Ejemplo n.º 40
0
  def test_reparent_down_master(self):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62044.create_db('vt_test_keyspace')
    tablet_41983.create_db('vt_test_keyspace')
    tablet_31981.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True,
                             wait_for_start=False)

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)

    # wait for all tablets to start
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('SERVING')

    # Recompute the shard layout node - until you do that, it might not be
    # valid.
    utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/0'])
    utils.validate_topology()

    # Force the slaves to reparent assuming that all the datasets are identical.
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.reset_replication()
    utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias], auto_log=True)
    utils.validate_topology()

    # Make the master agent and database unavailable.
    tablet_62344.kill_vttablet()
    tablet_62344.shutdown_mysql().wait()

    self._check_db_addr('0', 'master', tablet_62344.port)

    # Perform a reparent operation - the Validate part will try to ping
    # the master and fail somewhat quickly
    stdout, stderr = utils.run_vtctl(['-wait-time', '5s', 'ReparentShard',
                                      'test_keyspace/0',
                                      tablet_62044.tablet_alias],
                                     expect_fail=True)
    logging.debug('Failed ReparentShard output:\n' + stderr)
    if 'ValidateShard verification failed' not in stderr:
      self.fail(
          "didn't find the right error strings in failed ReparentShard: " +
          stderr)

    # ScrapTablet should fail to connect to the unavailable tablet
    stdout, stderr = utils.run_vtctl(['-wait-time', '10s', 'ScrapTablet',
                                      tablet_62344.tablet_alias],
                                     expect_fail=True)
    logging.debug('Failed ScrapTablet output:\n' + stderr)
    if 'connection refused' not in stderr and protocols_flavor().rpc_timeout_message() not in stderr:
      self.fail("didn't find the right error strings in failed ScrapTablet: " +
                stderr)

    # Force the scrap action in zk even though tablet is not accessible.
    tablet_62344.scrap(force=True)

    # Re-run reparent operation, this should now proceed unimpeded.
    utils.run_vtctl(['ReparentShard', 'test_keyspace/0',
                     tablet_62044.tablet_alias], auto_log=True)

    utils.validate_topology()
    self._check_db_addr('0', 'master', tablet_62044.port)

    utils.run_vtctl(['ChangeSlaveType', '-force', tablet_62344.tablet_alias,
                     'idle'])

    idle_tablets, _ = utils.run_vtctl(['ListAllTablets', 'test_nj'],
                                      trap_output=True)
    if '0000062344 <null> <null> idle' not in idle_tablets:
      self.fail('idle tablet not found: %s' % idle_tablets)

    tablet.kill_tablets([tablet_62044, tablet_41983, tablet_31981])

    # so the other tests don't have any surprises
    tablet_62344.start_mysql().wait()
Ejemplo n.º 41
0
    def test_resharding(self):
        # we're going to reparent and swap these two
        global shard_2_master, shard_2_replica1

        utils.run_vtctl([
            'CreateKeyspace', '--sharding_column_name', 'bad_column',
            '--sharding_column_type', 'bytes', 'test_keyspace'
        ])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col',
            'uint64'
        ],
                        expect_fail=True)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'custom_ksid_col', base_sharding.keyspace_id_type
        ])

        shard_0_master.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
        shard_1_master.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
        shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        ks = utils.run_vtctl_json(
            ['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
        self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

        # we set full_mycnf_args to True as a test in the KIT_BYTES case
        full_mycnf_args = (
            base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES)

        # create databases so vttablet can start behaving somewhat normally
        for t in [
                shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_rdonly, shard_1_rdonly1
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             full_mycnf_args=full_mycnf_args)

        # wait for the tablets (replication is not set up, so they won't be healthy)
        for t in [
                shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_rdonly, shard_1_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/-80',
            shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/80-',
            shard_1_master.tablet_alias
        ],
                        auto_log=True)

        # check the shards
        shards = utils.run_vtctl_json(
            ['FindAllShardsInKeyspace', 'test_keyspace'])
        self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
        self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
        self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

        # create the tables
        self._create_schema()
        self._insert_startup_values()

        # run a health check on source replicas so they respond to discovery
        # (for binlog players) and on the source rdonlys (for workers)
        for t in [shard_0_replica, shard_1_slave1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
        for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

        # create the split shards
        shard_2_master.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0')
        shard_3_master.init_tablet('replica', 'test_keyspace', 'c0-')
        shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
        shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

        # start vttablet on the split shards (no db created,
        # so none of them are serving)
        shard_2_master.start_vttablet(wait_for_state=None)
        shard_3_master.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
                shard_3_replica, shard_3_rdonly1
        ]:
            t.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_master, shard_2_replica1, shard_2_replica2,
                shard_2_rdonly1, shard_3_master, shard_3_replica,
                shard_3_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/80-c0',
            shard_2_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/c0-',
            shard_3_master.tablet_alias
        ],
                        auto_log=True)

        # check the shards
        shards = utils.run_vtctl_json(
            ['FindAllShardsInKeyspace', 'test_keyspace'])
        for s in ['-80', '80-', '80-c0', 'c0-']:
            self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
        self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # disable shard_1_slave2, so we're sure filtered replication will come
        # from shard_1_slave1
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

        # we need to create the schema, and the worker will do data copying
        for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_1_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)

        # Run vtworker as daemon for the following SplitClone commands.
        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj', '--command_display_interval', '10ms'],
            auto_log=True)

        # Copy the data from the source to the destination shards.
        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        #
        # Initial clone (online).
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 2, 0, 0, 0)
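        # (The four counters passed to verify_reconciliation_counters appear
        # to be per-table counts of inserted, updated, deleted and unchanged
        # rows; the initial online clone therefore reports only inserts.)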

        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Test the correct handling of keyspace_id changes which happen after
        # the first clone.
        # Let row 2 go to shard 3 instead of shard 2.
        shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set'
                              ' custom_ksid_col=0xD000000000000000 WHERE id=2',
                              write=True)
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        # Row 2 will be deleted from shard 2 and inserted to shard 3.
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 1, 0, 1, 1)
        self._check_value(shard_2_master,
                          'resharding1',
                          2,
                          'msg2',
                          0xD000000000000000,
                          should_be_here=False)
        self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
                          0xD000000000000000)
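        # A minimal sketch (not part of the original test) of how
        # custom_ksid_col picks the destination shard in this split, assuming
        # unsigned 64-bit keyspace ids coming from the 80- source shard:
        # ids in [0x80..., 0xC0...) land on shard 2 (80-c0) and ids in
        # [0xC0..., max] land on shard 3 (c0-).
        def _destination_shard_for(keyspace_id):
            # Hypothetical helper, for illustration only; it is never called.
            return '80-c0' if keyspace_id < 0xC000000000000000 else 'c0-'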
        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id again.
        shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set'
                              ' custom_ksid_col=0x9000000000000000 WHERE id=2',
                              write=True)
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        # Row 2 will be deleted from shard 3 and inserted into shard 2.
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 1, 0, 1, 1)
        self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
                          0x9000000000000000)
        self._check_value(shard_3_master,
                          'resharding1',
                          2,
                          'msg2',
                          0x9000000000000000,
                          should_be_here=False)
        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Modify the destination shard. SplitClone will revert the changes.
        # Delete row 2 (provokes an insert).
        shard_2_master.mquery('vt_test_keyspace',
                              'delete from resharding1 where id=2',
                              write=True)
        # Update row 3 (provokes an update).
        shard_3_master.mquery(
            'vt_test_keyspace',
            "update resharding1 set msg='msg-not-3' where id=3",
            write=True)
        # Insert row 4 and 5 (provokes a delete).
        self._insert_value(shard_3_master, 'resharding1', 4, 'msg4',
                           0xD000000000000000)
        self._insert_value(shard_3_master, 'resharding1', 5, 'msg5',
                           0xD000000000000000)

        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count',
            '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets',
            '1', '--max_tps', '9999', 'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        # Change tablet, which was taken offline, back to rdonly.
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 1, 1, 2, 0)
        self.verify_reconciliation_counters(worker_port, 'Offline',
                                            'resharding1', 0, 0, 0, 2)
        # Terminate worker daemon because it is no longer needed.
        utils.kill_sub_process(worker_proc, soft=True)

        # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl([
            'ValidateSchemaKeyspace', '--exclude_tables=unrelated',
            'test_keyspace'
        ],
                        auto_log=True)

        # check the binlog players are running and exporting vars
        self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
        self.check_destination_master(shard_3_master, ['test_keyspace/80-'])
        # When the binlog players/filtered replication is turned on, the query
        # service must be turned off on the destination masters.
        # The tested behavior is a safeguard to prevent anybody from
        # accidentally modifying data on the destination masters while they
        # are not migrated yet and the source shards are still the source of
        # truth.
        shard_2_master.wait_for_vttablet_state('NOT_SERVING')
        shard_3_master.wait_for_vttablet_state('NOT_SERVING')

        # check that binlog server exported the stats vars
        self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

        # Check that the throttler was enabled.
        self.check_throttler_service(shard_2_master.rpc_endpoint(),
                                     ['BinlogPlayer/0'], 9999)
        self.check_throttler_service(shard_3_master.rpc_endpoint(),
                                     ['BinlogPlayer/0'], 9999)
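        # The throttler is only enabled because --max_tps was passed to
        # SplitClone; a hedged, commented-out sketch of how the configured
        # rate could be inspected directly, assuming this build ships the
        # ThrottlerMaxRates vtctl command:
        #
        #   utils.run_vtctl_json(['ThrottlerMaxRates',
        #                         '-server', shard_2_master.rpc_endpoint()])
        #
        # The reported rate should match the 9999 passed on the command line.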

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000)
        logging.debug('Executing MultiValue Insert Queries')
        self._exec_multi_shard_dmls()
        logging.debug('Checking 80 percent of data is sent quickly')
        v = self._check_lots_timeout(1000, 80, 5)
        if v != 100:
            # small optimization: only do this check if we don't have all the data
            # already anyway.
            logging.debug('Checking all data goes through eventually')
            self._check_lots_timeout(1000, 100, 20)
        logging.debug('Checking no data was sent the wrong way')
        self._check_lots_not_present(1000)
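        # _check_lots_timeout above presumably polls until a percentage of the
        # inserted rows is visible on the destinations or the timeout expires;
        # a minimal sketch of that polling pattern (hypothetical helper, never
        # called, assuming a check(count, base=...) callable that returns the
        # percentage of rows already replicated):
        def _poll_until_percent(check, count, percent, timeout, base=0):
            while True:
                v = check(count, base=base)
                if v >= percent:
                    return v
                timeout = utils.wait_step('waiting for replicated rows',
                                          timeout)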

        logging.debug('Checking MultiValue Insert Queries')
        self._check_multi_shard_values()
        self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                      seconds_behind_master_max=30)
        self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                      seconds_behind_master_max=30)
        self.check_binlog_server_vars(shard_1_slave1,
                                      horizontal=True,
                                      min_statements=1000,
                                      min_transactions=1000)

        # use vtworker to compare the data (after health-checking the destination
        # rdonly tablets so discovery works)
        utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
        logging.debug('Running vtworker SplitDiff')
        utils.run_vtworker([
            '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause('Good time to test vtworker for diffs')

        # get status for destination master tablets, make sure we have it all
        if base_sharding.use_rbr:
            # We submitted non-annotated DMLs, which are properly routed
            # with RBR but not with SBR, so the first shard's counts are
            # smaller. In the second shard, we submitted statements that
            # affect more than one keyspace id; with RBR each of those
            # becomes two queries, so the count there is higher.
            self.check_running_binlog_player(shard_2_master, 4018, 2008)
            self.check_running_binlog_player(shard_3_master, 4028, 2008)
        else:
            self.check_running_binlog_player(shard_2_master, 4022, 2008)
            self.check_running_binlog_player(shard_3_master, 4024, 2008)
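        # Note (not in the original test): the two numbers checked above
        # appear to be the cumulative query and transaction counts applied by
        # each destination's binlog player, which is why RBR and SBR differ
        # only in the query count.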

        # start threads to insert data into shard_1 in the background
        # with the current time, and monitor the replication delay
        insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000,
                                       0x9000000000000000)
        insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001,
                                       0xD000000000000000)
        monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1)
        monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2)

        # tests a failover switching serving to a different replica
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
        utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

        # test data goes through again
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000, base=1000)
        logging.debug('Checking 80 percent of data was sent quickly')
        self._check_lots_timeout(1000, 80, 5, base=1000)
        self.check_binlog_server_vars(shard_1_slave2,
                                      horizontal=True,
                                      min_statements=800,
                                      min_transactions=800)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        expect_fail=True)

        # check query service is off on master 2 and master 3, as filtered
        # replication is enabled. Even the health check that is enabled on
        # master 3 should not interfere (we run it to be sure).
        utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                        auto_log=True)
        for master in [shard_2_master, shard_3_master]:
            utils.check_tablet_query_service(self, master, False, False)
            stream_health = utils.run_vtctl_json(
                ['VtTabletStreamHealth', '-count', '1', master.tablet_alias])
            logging.debug('Got health: %s', str(stream_health))
            self.assertIn('realtime_stats', stream_health)
            self.assertNotIn('serving', stream_health)
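        # 'serving' being absent (rather than present and false) is expected
        # here, presumably because the proto3 JSON encoding used for the
        # stream health output omits boolean fields that still have their
        # default (false) value.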

        # check the destination master 3 is healthy, even though its query
        # service is not running (if it were not healthy, this call would
        # raise an exception)
        shard_3_master.get_healthz()

        # now serve rdonly from the split shards, in test_nj only
        utils.run_vtctl([
            'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-',
            'rdonly'
        ],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_srv_keyspace(
            'test_ny',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)
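        # Note (not in the original test): the two booleans passed to
        # check_tablet_query_service appear to be (serving,
        # tablet_control_disabled), i.e. whether the query service is up and
        # whether it was explicitly disabled for the migration.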

        # now serve rdonly from the split shards, everywhere
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_srv_keyspace(
            'test_ny',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
        utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

        # then serve replica from the split shards
        destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-c0 c0-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_1_slave2, False, True)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
            auto_log=True)
        # After a backwards migration, queryservice should be enabled on
        # source and disabled on destinations
        utils.check_tablet_query_service(self, shard_1_slave2, True, False)
        # Destination tablets would have query service disabled for reasons
        # other than the migration, so check the shard record instead of
        # the tablets directly.
        utils.check_shard_query_services(self, destination_shards,
                                         topodata_pb2.REPLICA, False)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        # After a forwards migration, queryservice should be disabled on
        # source and enabled on destinations
        utils.check_tablet_query_service(self, shard_1_slave2, False, True)
        # Destination tablets would have query service disabled for reasons
        # other than the migration, so check the shard record instead of
        # the tablets directly.
        utils.check_shard_query_services(self, destination_shards,
                                         topodata_pb2.REPLICA, True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-c0 c0-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # reparent shard_2 to shard_2_replica1, then insert more data and
        # see it flow through still
        utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-c0',
            '-new_master', shard_2_replica1.tablet_alias
        ])

        # update our test variables to point at the new master
        shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

        logging.debug(
            'Inserting lots of data on source shard after reparenting')
        self._insert_lots(3000, base=2000)
        logging.debug('Checking 80 percent of data was sent fairly quickly')
        self._check_lots_timeout(3000, 80, 10, base=2000)

        # use vtworker to compare the data again
        logging.debug('Running vtworker SplitDiff')
        utils.run_vtworker([
            '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        # going to migrate the master now, check the delays
        monitor_thread_1.done = True
        monitor_thread_2.done = True
        insert_thread_1.done = True
        insert_thread_2.done = True
        logging.debug(
            'DELAY 1: %s max_lag=%d ms avg_lag=%d ms',
            monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms,
            monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count)
        logging.debug(
            'DELAY 2: %s max_lag=%d ms avg_lag=%d ms',
            monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms,
            monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count)

        # Tamper with the SourceShard records to test 'vtctl SourceShardDelete'
        # and 'vtctl SourceShardAdd'.
        utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                        auto_log=True)
        utils.run_vtctl([
            'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0',
            'test_keyspace/80-'
        ],
                        auto_log=True)
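        # Note (not in the original test): a SourceShard record on a
        # destination shard describes where its filtered replication comes
        # from. The SourceShardAdd call above presumably recreates entry 0 on
        # test_keyspace/c0-, pointing back at test_keyspace/80- with key
        # range 80-, i.e. the same record the resharding workflow created.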

        # then serve master from the split shards, make sure the source master's
        # query service is now turned off
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-c0 c0-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_1_master, False, True)

        # check the binlog players are gone now
        self.check_no_binlog_player(shard_2_master)
        self.check_no_binlog_player(shard_3_master)

        # delete the original tablets in the original shard
        tablet.kill_tablets([
            shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1
        ])
        for t in [
                shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
                shard_1_rdonly1
        ]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
        utils.run_vtctl(
            ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias],
            auto_log=True)

        # rebuild the serving graph; all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # test RemoveShardCell
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                        auto_log=True,
                        expect_fail=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                        auto_log=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                        auto_log=True)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
        self.assertNotIn('cells', shard)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

        # make sure we can't delete the destination shard now that it's serving
        _, stderr = utils.run_vtctl(['DeleteShard', 'test_keyspace/80-c0'],
                                    expect_fail=True)
        self.assertIn('is still serving, cannot delete it', stderr)

        # kill everything
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master,
            shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
            shard_3_master, shard_3_replica, shard_3_rdonly1
        ])
Ejemplo n.º 42
0
    def test_no_mysql_healthcheck(self):
        """This test starts a vttablet with no mysql port, while mysql is down.

    It makes sure vttablet will start properly and be unhealthy.
    Then we start mysql, and make sure vttablet becomes healthy.
    """
        # we need replication to be enabled, so the slave tablet can be healthy.
        for t in tablet_62344, tablet_62044:
            t.create_db('vt_test_keyspace')
        pos = mysql_flavor().master_position(tablet_62344)
        # Use 'localhost' as hostname because Travis CI worker hostnames
        # are too long for MySQL replication.
        change_master_cmds = mysql_flavor().change_master_commands(
            'localhost', tablet_62344.mysql_port, pos)
        tablet_62044.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                            change_master_cmds + ['START SLAVE'])
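        # For a typical MySQL flavor, change_master_cmds above boils down to
        # something like the following (a hedged illustration; the exact
        # statement depends on the flavor and on whether GTIDs are used):
        #
        #   CHANGE MASTER TO MASTER_HOST='localhost',
        #     MASTER_PORT=<tablet_62344.mysql_port>,
        #     MASTER_LOG_FILE='<binlog file>', MASTER_LOG_POS=<position>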

        # now shutdown all mysqld
        shutdown_procs = [
            tablet_62344.shutdown_mysql(),
            tablet_62044.shutdown_mysql(),
        ]
        utils.wait_procs(shutdown_procs)

        # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
        tablet_62344.init_tablet('master', 'test_keyspace', '0')
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 include_mysql_port=False)
        for t in tablet_62344, tablet_62044:
            # Since MySQL is down at this point and we want the tablet to start up
            # successfully, we have to use supports_backups=False.
            t.start_vttablet(wait_for_state=None,
                             supports_backups=False,
                             full_mycnf_args=True,
                             include_mysql_port=False)
        for t in tablet_62344, tablet_62044:
            t.wait_for_vttablet_state('NOT_SERVING')
            self.check_healthz(t, False)

        # Tell the slave to not try to repair replication in the healthcheck.
        # The StopSlave will ultimately fail because mysqld is not running,
        # but vttablet should remember that it's not supposed to fix replication.
        utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias],
                        expect_fail=True)

        # The above notice to not fix replication should survive tablet restart.
        tablet_62044.kill_vttablet()
        tablet_62044.start_vttablet(wait_for_state='NOT_SERVING',
                                    full_mycnf_args=True,
                                    include_mysql_port=False,
                                    supports_backups=False)

        # restart mysqld
        start_procs = [
            tablet_62344.start_mysql(),
            tablet_62044.start_mysql(),
        ]
        utils.wait_procs(start_procs)

        # the master should still be healthy
        utils.run_vtctl(['RunHealthCheck', tablet_62344.tablet_alias],
                        auto_log=True)
        self.check_healthz(tablet_62344, True)

        # the slave will now be healthy, but report a very high replication
        # lag, because it can't figure out exactly what the lag is.
        utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias],
                        auto_log=True)
        tablet_62044.wait_for_vttablet_state('SERVING')
        self.check_healthz(tablet_62044, True)

        health = utils.run_vtctl_json(
            ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
        self.assertTrue('seconds_behind_master' in health['realtime_stats'])
        self.assertEqual(health['realtime_stats']['seconds_behind_master'],
                         7200)
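        # The 7200s figure matches a 2 hour unhealthy threshold; vttablet
        # appears to report that placeholder when replication is stopped and
        # the real lag cannot be determined, which is what this assertion
        # relies on.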
        self.assertIn('serving', health)

        # restart replication and wait until the reported lag becomes small
        # (a value of zero is the default and won't appear in the structure)
        utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
        timeout = 10
        while True:
            utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias],
                            auto_log=True)
            health = utils.run_vtctl_json([
                'VtTabletStreamHealth', '-count', '1',
                tablet_62044.tablet_alias
            ])
            if 'serving' in health and (
                ('seconds_behind_master' not in health['realtime_stats']) or
                (health['realtime_stats']['seconds_behind_master'] < 30)):
                break
            timeout = utils.wait_step('health delay goes back down', timeout)

        # wait for the tablet to fix its mysql port
        for t in tablet_62344, tablet_62044:
            # wait for mysql port to show up
            timeout = 10
            while True:
                ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
                if 'mysql' in ti['port_map']:
                    break
                timeout = utils.wait_step('mysql port in tablet record',
                                          timeout)
            self.assertEqual(ti['port_map']['mysql'], t.mysql_port)

        # all done
        tablet.kill_tablets([tablet_62344, tablet_62044])
Ejemplo n.º 43
0
    def test_change_type_semi_sync(self):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # Create new names for tablets, so this test is less confusing.
        master = tablet_62344
        replica = tablet_62044
        rdonly1 = tablet_41983
        rdonly2 = tablet_31981

        # create the database so vttablets start, as they are serving
        for t in [master, replica, rdonly1, rdonly2]:
            t.create_db('vt_test_keyspace')

        # Start up a soon-to-be master, one replica and two rdonly.
        master.init_tablet('replica',
                           'test_keyspace',
                           '0',
                           start=True,
                           wait_for_start=False)
        replica.init_tablet('replica',
                            'test_keyspace',
                            '0',
                            start=True,
                            wait_for_start=False)
        rdonly1.init_tablet('rdonly',
                            'test_keyspace',
                            '0',
                            start=True,
                            wait_for_start=False)
        rdonly2.init_tablet('rdonly',
                            'test_keyspace',
                            '0',
                            start=True,
                            wait_for_start=False)
        for t in [master, replica, rdonly1, rdonly2]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Force the slaves to reparent assuming that all the datasets are
        # identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0', master.tablet_alias
        ],
                        auto_log=True)
        utils.validate_topology(ping_tablets=True)
        self._check_master_tablet(master)

        # Stop replication on rdonly1 to make sure that when we change it to
        # replica, replication doesn't start again.
        # Note we do a similar test for replica -> rdonly below.
        utils.run_vtctl(['StopSlave', rdonly1.tablet_alias])

        # Check semi-sync on slaves.
        # The flag is only an indication of the value to use next time
        # we turn replication on, so also check the status.
        # rdonly1 is not replicating, so its status is off.
        replica.check_db_var('rpl_semi_sync_slave_enabled', 'ON')
        rdonly1.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
        rdonly2.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
        replica.check_db_status('rpl_semi_sync_slave_status', 'ON')
        rdonly1.check_db_status('rpl_semi_sync_slave_status', 'OFF')
        rdonly2.check_db_status('rpl_semi_sync_slave_status', 'OFF')
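        # check_db_var/check_db_status presumably wrap the usual MySQL
        # statements; a hedged, commented-out sketch of the equivalent raw
        # queries against a tablet's mysqld:
        #
        #   replica.mquery('', "show variables like 'rpl_semi_sync_slave_enabled'")
        #   replica.mquery('', "show status like 'Rpl_semi_sync_slave_status'")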

        # Change replica to rdonly while replicating, should turn off semi-sync,
        # and restart replication.
        utils.run_vtctl(['ChangeSlaveType', replica.tablet_alias, 'rdonly'],
                        auto_log=True)
        replica.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
        replica.check_db_status('rpl_semi_sync_slave_status', 'OFF')

        # Change rdonly1 to replica: should turn on semi-sync, and not start
        # replication.
        utils.run_vtctl(['ChangeSlaveType', rdonly1.tablet_alias, 'replica'],
                        auto_log=True)
        rdonly1.check_db_var('rpl_semi_sync_slave_enabled', 'ON')
        rdonly1.check_db_status('rpl_semi_sync_slave_status', 'OFF')
        slave_io_running = 10
        slave_sql_running = 11
        s = rdonly1.mquery('', 'show slave status')
        self.assertEqual(s[0][slave_io_running], 'No')
        self.assertEqual(s[0][slave_sql_running], 'No')
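        # Note (not in the original test): 10 and 11 are positional column
        # indexes into SHOW SLAVE STATUS output (Slave_IO_Running and
        # Slave_SQL_Running in this MySQL version); the offsets would shift
        # if the server ever added or removed columns before them.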

        # Now change from replica back to rdonly, make sure replication is
        # still not enabled.
        utils.run_vtctl(['ChangeSlaveType', rdonly1.tablet_alias, 'rdonly'],
                        auto_log=True)
        rdonly1.check_db_var('rpl_semi_sync_slave_enabled', 'OFF')
        rdonly1.check_db_status('rpl_semi_sync_slave_status', 'OFF')
        s = rdonly1.mquery('', 'show slave status')
        self.assertEqual(s[0][slave_io_running], 'No')
        self.assertEqual(s[0][slave_sql_running], 'No')

        # Change rdonly2 to replica: should turn on semi-sync, and restart
        # replication.
        utils.run_vtctl(['ChangeSlaveType', rdonly2.tablet_alias, 'replica'],
                        auto_log=True)
        rdonly2.check_db_var('rpl_semi_sync_slave_enabled', 'ON')
        rdonly2.check_db_status('rpl_semi_sync_slave_status', 'ON')

        # Clean up.
        tablet.kill_tablets([master, replica, rdonly1, rdonly2])
Ejemplo n.º 44
0
    def test_reparent_with_down_slave(self, shard_id='0'):
        """See if a missing slave can be safely reparented after the fact."""
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_31981, tablet_41983]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Force the slaves to reparent assuming that all the datasets are identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/' + shard_id,
            tablet_62344.tablet_alias
        ])
        utils.validate_topology(ping_tablets=True)
        tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

        utils.wait_procs([tablet_41983.shutdown_mysql()])

        # Perform a graceful reparent operation. It will fail as one tablet is down.
        _, stderr = utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/' +
            shard_id, '-new_master', tablet_62044.tablet_alias
        ],
                                    expect_fail=True)
        self.assertIn('TabletManager.SetMaster on test_nj-0000041983 error',
                      stderr)

        # insert data into the new master, check the connected slaves work
        self._populate_vt_insert_test(tablet_62044, 3)
        self._check_vt_insert_test(tablet_31981, 3)
        self._check_vt_insert_test(tablet_62344, 3)

        # restart mysql on the old slave; it should still be trying to
        # connect to the old master
        utils.wait_procs([tablet_41983.start_mysql()])

        utils.pause('check orphan')

        # reparent the tablet (will not start replication, so we have to
        # do it ourselves), then it should catch up on replication really quickly
        utils.run_vtctl(['ReparentTablet', tablet_41983.tablet_alias])
        utils.run_vtctl(['StartSlave', tablet_41983.tablet_alias])

        # wait until it gets the data
        self._check_vt_insert_test(tablet_41983, 3)

        tablet.kill_tablets(
            [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
Ejemplo n.º 45
0
    def test_resharding(self):
        # create the keyspace with just one shard
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'keyspace_id', keyspace_id_type
        ])

        shard_master.init_tablet('master', 'test_keyspace', '0')
        shard_replica.init_tablet('replica', 'test_keyspace', '0')
        shard_rdonly.init_tablet('rdonly', 'test_keyspace', '0')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # create databases so vttablet can start behaving normally
        for t in [shard_master, shard_replica, shard_rdonly]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None)

        # wait for the tablets
        shard_master.wait_for_vttablet_state('SERVING')
        shard_replica.wait_for_vttablet_state('SERVING')
        shard_rdonly.wait_for_vttablet_state('SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/0',
            shard_master.tablet_alias
        ],
                        auto_log=True)

        # create the tables and add startup values
        self._create_schema()
        self._insert_startup_values()

        # change the schema, backfill keyspace_id, and change schema again
        self._add_sharding_key_to_schema()
        self._backfill_keyspace_id(shard_master)
        self._mark_sharding_key_not_null()
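        # A hedged sketch (not part of the original test) of what the
        # backfill step above might boil down to, assuming the new sharding
        # key column is a nullable BIGINT UNSIGNED derived from existing data:
        #
        #   shard_master.mquery(
        #       'vt_test_keyspace',
        #       'update <table> set keyspace_id=<derived value> '
        #       'where keyspace_id is null',
        #       write=True)
        #
        # with _mark_sharding_key_not_null() then altering the column to
        # NOT NULL once every row has been backfilled.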

        # create the split shards
        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

        # start vttablet on the split shards (no db created,
        # so they're all not serving)
        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly,
                shard_1_master, shard_1_replica, shard_1_rdonly
        ]:
            t.start_vttablet(wait_for_state=None)
        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly,
                shard_1_master, shard_1_replica, shard_1_rdonly
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/-80',
            shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/80-',
            shard_1_master.tablet_alias
        ],
                        auto_log=True)

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace',
            'Partitions(master): -\n' + 'Partitions(rdonly): -\n' +
            'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica',
            keyspace_id_type=keyspace_id_type)

        # take the snapshot for the split
        utils.run_vtctl(
            ['MultiSnapshot', '--spec=-80-', shard_replica.tablet_alias],
            auto_log=True)

        # wait for tablet's binlog server service to be enabled after snapshot
        shard_replica.wait_for_binlog_server_state("Enabled")

        # perform the restore.
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/-80', shard_replica.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/80-', shard_replica.tablet_alias
        ],
                        auto_log=True)

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                        auto_log=True)

        # check the binlog players are running
        logging.debug("Waiting for binlog players to start on new masters...")
        shard_0_master.wait_for_binlog_player_count(1)
        shard_1_master.wait_for_binlog_player_count(1)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000)
        logging.debug("Checking 80 percent of data is sent quickly")
        self._check_lots_timeout(1000, 80, 5)
        logging.debug("Checking all data goes through eventually")
        self._check_lots_timeout(1000, 100, 20)
        logging.debug("Checking no data was sent the wrong way")
        self._check_lots_not_present(1000)

        # use the vtworker checker to compare the data
        logging.debug("Running vtworker SplitDiff for -80")
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)

        logging.debug("Running vtworker SplitDiff for 80-")
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause("Good time to test vtworker for diffs")

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        expect_fail=True)

        # now serve rdonly from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace',
            'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' +
            'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica',
            keyspace_id_type=keyspace_id_type)

        # then serve replica from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -\n' +
                                 'Partitions(rdonly): -80 80-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
            auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace',
            'Partitions(master): -\n' + 'Partitions(rdonly): -80 80-\n' +
            'Partitions(replica): -\n' + 'TabletTypes: master,rdonly,replica',
            keyspace_id_type=keyspace_id_type)
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -\n' +
                                 'Partitions(rdonly): -80 80-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # then serve master from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # check the binlog players are gone now
        shard_0_master.wait_for_binlog_player_count(0)
        shard_1_master.wait_for_binlog_player_count(0)

        # make sure we can't delete a shard with tablets
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

        # scrap the original tablets in the original shard
        for t in [shard_master, shard_replica, shard_rdonly]:
            utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
        tablet.kill_tablets([shard_master, shard_replica, shard_rdonly])
        for t in [shard_master, shard_replica, shard_rdonly]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)

        # rebuild the serving graph; all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

        # kill everything else
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_rdonly, shard_1_master,
            shard_1_replica, shard_1_rdonly
        ])
Ejemplo n.º 46
0
    def test_resharding(self):
        # create the keyspace with just one shard
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'keyspace_id', keyspace_id_type
        ])

        shard_master.init_tablet('master', 'test_keyspace', '0')
        shard_replica.init_tablet('replica', 'test_keyspace', '0')
        shard_rdonly1.init_tablet('rdonly', 'test_keyspace', '0')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # create databases so vttablet can start behaving normally
        for t in [shard_master, shard_replica, shard_rdonly1]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None)

        # wait for the tablets
        shard_master.wait_for_vttablet_state('SERVING')
        shard_replica.wait_for_vttablet_state('SERVING')
        shard_rdonly1.wait_for_vttablet_state('SERVING')

        # reparent to make the tablets work
        utils.run_vtctl(
            ['InitShardMaster', 'test_keyspace/0', shard_master.tablet_alias],
            auto_log=True)

        # create the tables and add startup values
        self._create_schema()
        self._insert_startup_values()

        # change the schema, backfill keyspace_id, and change schema again
        self._add_sharding_key_to_schema()
        self._backfill_keyspace_id(shard_master)
        self._mark_sharding_key_not_null()

        # create the split shards
        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_rdonly1.init_tablet('rdonly', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

        # start vttablet on the split shards (no db created,
        # so they're all not serving)
        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly1,
                shard_1_master, shard_1_replica, shard_1_rdonly1
        ]:
            t.start_vttablet(wait_for_state=None)
        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly1,
                shard_1_master, shard_1_replica, shard_1_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias
        ],
                        auto_log=True)

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace', 'Partitions(master): -\n'
                                 'Partitions(rdonly): -\n'
                                 'Partitions(replica): -\n',
                                 keyspace_id_type=keyspace_id_type)

        # we need to create the schema, and the worker will do data copying
        for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)

        utils.run_vtworker([
            '--cell', 'test_nj', '--command_display_interval', '10ms',
            'SplitClone', '--exclude_tables', 'unrelated',
            '--strategy=-populate_blp_checkpoint', '--source_reader_count',
            '10', '--min_table_size_for_split', '1', 'test_keyspace/0'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                        auto_log=True)

        # check the binlog players are running
        logging.debug('Waiting for binlog players to start on new masters...')
        shard_0_master.wait_for_binlog_player_count(1)
        shard_1_master.wait_for_binlog_player_count(1)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000)
        logging.debug('Checking 80 percent of data is sent quickly')
        self._check_lots_timeout(1000, 80, 5)
        logging.debug('Checking all data goes through eventually')
        self._check_lots_timeout(1000, 100, 20)
        logging.debug('Checking no data was sent the wrong way')
        self._check_lots_not_present(1000)

        # use vtworker to compare the data
        logging.debug('Running vtworker SplitDiff for -80')
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/-80'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_0_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        logging.debug('Running vtworker SplitDiff for 80-')
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/80-'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause('Good time to test vtworker for diffs')

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        expect_fail=True)

        # now serve rdonly from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace', 'Partitions(master): -\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -\n',
                                 keyspace_id_type=keyspace_id_type)

        # then serve replica from the split shards
        source_tablet = shard_replica
        destination_tablets = [shard_0_replica, shard_1_replica]

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace', 'Partitions(master): -\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
            auto_log=True)
        # After a backwards migration, queryservice should be enabled on
        # source and disabled on destinations
        utils.check_tablet_query_service(self, source_tablet, True, False)
        utils.check_tablet_query_services(self, destination_tablets, False,
                                          True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace', 'Partitions(master): -\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -\n',
                                 keyspace_id_type=keyspace_id_type)

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        # After a forwards migration, queryservice should be disabled on
        # source and enabled on destinations
        utils.check_tablet_query_service(self, source_tablet, False, True)
        utils.check_tablet_query_services(self, destination_tablets, True,
                                          False)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace', 'Partitions(master): -\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type)

        # then serve master from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n'
                                 'Partitions(rdonly): -80 80-\n'
                                 'Partitions(replica): -80 80-\n',
                                 keyspace_id_type=keyspace_id_type)

        # check the binlog players are gone now
        shard_0_master.wait_for_binlog_player_count(0)
        shard_1_master.wait_for_binlog_player_count(0)

        # make sure we can't delete a shard with tablets
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

        # remove the original tablets in the original shard
        tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
        for t in [shard_replica, shard_rdonly1]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
        utils.run_vtctl(
            ['DeleteTablet', '-allow_master', shard_master.tablet_alias],
            auto_log=True)

        # rebuild the serving graph; all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

        # kill everything else
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
            shard_1_replica, shard_1_rdonly1
        ])
Ejemplo n.º 47
0
    def test_health_check_worker_state_does_not_shutdown_query_service(self):
        # This test is similar to test_health_check, but has the following
        # differences:
        # - the second tablet is an 'rdonly' and not a 'replica'
        # - the second tablet will be set to 'worker' and we expect that
        #   the query service won't be shutdown

        # Set up master and rdonly tablets.
        tablet_62344.init_tablet('master', 'test_keyspace', '0')

        for t in tablet_62344, tablet_62044:
            t.create_db('vt_test_keyspace')

        tablet_62344.start_vttablet(wait_for_state=None,
                                    target_tablet_type='replica')
        tablet_62044.start_vttablet(wait_for_state=None,
                                    target_tablet_type='rdonly',
                                    init_keyspace='test_keyspace',
                                    init_shard='0')

        tablet_62344.wait_for_vttablet_state('SERVING')
        tablet_62044.wait_for_vttablet_state('NOT_SERVING')
        self.check_healthz(tablet_62044, False)

        # Enable replication.
        utils.run_vtctl(
            ['InitShardMaster', 'test_keyspace/0', tablet_62344.tablet_alias])
        # Trigger healthcheck to save time waiting for the next interval.
        utils.run_vtctl(
            ['RunHealthCheck', tablet_62044.tablet_alias, 'rdonly'])
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'rdonly')
        self.check_healthz(tablet_62044, True)
        tablet_62044.wait_for_vttablet_state('SERVING')

        # Change from rdonly to worker and stop replication. (These
        # actions are similar to the SplitClone vtworker command
        # implementation.)  The tablet will become unhealthy, but the
        # query service is still running.
        utils.run_vtctl(
            ['ChangeSlaveType', tablet_62044.tablet_alias, 'worker'])
        utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
        # Trigger healthcheck explicitly to avoid waiting for the next interval.
        utils.run_vtctl(
            ['RunHealthCheck', tablet_62044.tablet_alias, 'rdonly'])
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'worker')
        self.check_healthz(tablet_62044, False)
        # Make sure that replication got disabled.
        self.assertIn(
            '>unhealthy: replication_reporter: '
            'Replication is not running</span></div>',
            tablet_62044.get_status())
        # Query service is still running.
        tablet_62044.wait_for_vttablet_state('SERVING')

        # Restart replication. Tablet will become healthy again.
        utils.run_vtctl(
            ['ChangeSlaveType', tablet_62044.tablet_alias, 'spare'])
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'spare')
        utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
        utils.run_vtctl(
            ['RunHealthCheck', tablet_62044.tablet_alias, 'rdonly'])
        self.wait_for_tablet_type_change(tablet_62044.tablet_alias, 'rdonly')
        self.check_healthz(tablet_62044, True)
        tablet_62044.wait_for_vttablet_state('SERVING')

        # kill the tablets
        tablet.kill_tablets([tablet_62344, tablet_62044])
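Helpers such as wait_for_tablet_type_change, wait_for_vttablet_state and check_healthz used above all reduce to polling a condition until a deadline. They live in the test framework and are not shown here; a self-contained sketch of the pattern, using only the standard library:

import time


def poll_until(probe, description, timeout_s=10.0, sleep_s=0.1):
  """Call probe() until it returns a truthy value or the timeout expires."""
  deadline = time.time() + timeout_s
  while time.time() < deadline:
    result = probe()
    if result:
      return result
    time.sleep(sleep_s)
  raise Exception('timed out waiting for %s' % description)


# Example use, with a hypothetical get_tablet_status() helper:
#   poll_until(lambda: 'rdonly' in get_tablet_status(),
#              'tablet to report type rdonly')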
Example No. 48
0
    def test_reparent_slave_offline(self, shard_id='0'):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Force the slaves to reparent assuming that all the datasets are
        # identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/' + shard_id,
            tablet_62344.tablet_alias
        ])
        utils.validate_topology(ping_tablets=True)

        self._check_master_tablet(tablet_62344)

        # Kill one tablet so it appears to be offline
        tablet_31981.kill_vttablet()

        # Perform a graceful reparent operation.
        _, stderr = utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/' +
            shard_id, '-new_master', tablet_62044.tablet_alias
        ],
                                    expect_fail=True)
        self.assertIn('Tablet test_ny-0000031981 SetMaster failed', stderr)

        self._check_master_tablet(tablet_62044)

        tablet.kill_tablets([tablet_62344, tablet_62044, tablet_41983])
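The offline-replica scenario hinges on run_vtctl(..., expect_fail=True) handing back the command output so the error text can be checked, as the assertIn on stderr above does. A small sketch wrapping that pattern, assuming the same (stdout, stderr) return shape:

import utils


def run_vtctl_expect_error(args, expected_substring):
  """Run a vtctl command that should fail and check its error output."""
  _, stderr = utils.run_vtctl(args, expect_fail=True)
  if expected_substring not in stderr:
    raise AssertionError('expected %r in vtctl stderr, got:\n%s'
                         % (expected_substring, stderr))
  return stderr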
Example No. 49
0
    def test_reparent_lag_slave(self, shard_id='0'):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('master',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('lag',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_31981]:
            t.wait_for_vttablet_state('SERVING')
        tablet_41983.wait_for_vttablet_state('NOT_SERVING')

        # Recompute the shard layout node - until you do that, it might not be
        # valid.
        utils.run_vtctl(['RebuildShardGraph', 'test_keyspace/' + shard_id])
        utils.validate_topology()

        # Force the slaves to reparent assuming that all the datasets are identical.
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.reset_replication()
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/' + shard_id,
            tablet_62344.tablet_alias
        ])
        utils.validate_topology(ping_tablets=True)

        tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

        tablet_41983.mquery('', 'stop slave')
        for q in self._populate_vt_insert_test:
            tablet_62344.mquery('vt_test_keyspace', q, write=True)

        # Perform a graceful reparent operation.
        utils.run_vtctl([
            'ReparentShard', 'test_keyspace/' + shard_id,
            tablet_62044.tablet_alias
        ])

        tablet_41983.mquery('', 'start slave')
        time.sleep(1)

        utils.pause('check orphan')

        utils.run_vtctl(['ReparentTablet', tablet_41983.tablet_alias])

        result = tablet_41983.mquery(
            'vt_test_keyspace', 'select msg from vt_insert_test where id=1')
        if len(result) != 1:
            self.fail('expected 1 row from vt_insert_test: %s' % str(result))

        utils.pause('check lag reparent')

        tablet.kill_tablets(
            [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
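After ReparentTablet, the lagging replica is expected to replay the rows it missed, which the example checks with one query after a fixed one-second sleep. A more patient variant polls with a timeout; a sketch, assuming mquery() returns a tuple of result rows as it does above:

import time


def wait_for_rows(tablet_obj, dbname, query, min_rows=1, timeout_s=30.0):
  """Poll a query on a tablet until it returns at least min_rows rows."""
  deadline = time.time() + timeout_s
  while True:
    result = tablet_obj.mquery(dbname, query)
    if len(result) >= min_rows:
      return result
    if time.time() >= deadline:
      raise Exception('timed out waiting for %d row(s) from: %s'
                      % (min_rows, query))
    time.sleep(1)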
Example No. 50
0
  def test_merge_sharding(self):
    utils.run_vtctl(['CreateKeyspace',
                     '--sharding_column_name', 'custom_ksid_col',
                     '--sharding_column_type', base_sharding.keyspace_id_type,
                     'test_keyspace'])

    shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
    shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
    shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
    shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
    shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
    shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
    shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
    shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
    shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

    # rebuild and check SrvKeyspace
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
    ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
    self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

    # create databases so vttablet can start behaving normally
    for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
              shard_1_master, shard_1_replica, shard_1_rdonly,
              shard_2_master, shard_2_replica, shard_2_rdonly]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None)

    # won't be serving, no replication state
    for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
              shard_1_master, shard_1_replica, shard_1_rdonly,
              shard_2_master, shard_2_replica, shard_2_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                     shard_0_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                     shard_1_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                     shard_2_master.tablet_alias], auto_log=True)

    # create the tables
    self._create_schema()
    self._insert_startup_values()

    # run a health check on source replicas so they respond to discovery
    # (for binlog players) and on the source rdonlys (for workers)
    for t in [shard_0_replica, shard_1_replica]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
    for t in [shard_0_rdonly, shard_1_rdonly]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

    # create the merge shards
    shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
    shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
    shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

    # start vttablet on the destination shard (no db created,
    # so they're all not serving)
    for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
      t.start_vttablet(wait_for_state=None)
    for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                     shard_dest_master.tablet_alias], auto_log=True)

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                    auto_log=True)
    utils.check_srv_keyspace(
        'test_nj', 'test_keyspace',
        'Partitions(master): -40 40-80 80-\n'
        'Partitions(rdonly): -40 40-80 80-\n'
        'Partitions(replica): -40 40-80 80-\n',
        keyspace_id_type=base_sharding.keyspace_id_type,
        sharding_column_name='custom_ksid_col')

    # copy the schema
    utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                     'test_keyspace/-80'], auto_log=True)

    # copy the data (will also start filtered replication), reset source
    # Run vtworker as daemon for the following SplitClone commands.
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--command_display_interval', '10ms'],
        auto_log=True)

    # Initial clone (online).
    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--offline=false',
         '--chunk_count', '10',
         '--min_rows_per_chunk', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/-80'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        2, 0, 0, 0)

    # Reset vtworker such that we can run the next command.
    workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
    utils.wait_procs([workerclient_proc])

    # Modify the destination shard. SplitClone will revert the changes.
    # Delete row 1 (provokes an insert).
    shard_dest_master.mquery('vt_test_keyspace',
                             'delete from resharding1 where id=1', write=True)
    # Update row 2 (provokes an update).
    shard_dest_master.mquery(
        'vt_test_keyspace', "update resharding1 set msg='msg-not-2' where id=2",
        write=True)
    # Insert row 0 (provokes a delete).
    self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                       0x5000000000000000)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--chunk_count', '10',
         '--min_rows_per_chunk', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/-80'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    # Change tablets, which were taken offline, back to rdonly.
    utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        1, 1, 1, 0)
    self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                        0, 0, 0, 2)
    # Terminate worker daemon because it is no longer needed.
    utils.kill_sub_process(worker_proc, soft=True)

    # check the startup values are in the right place
    self._check_startup_values()

    # check the schema too
    utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

    # check binlog player variables
    self.check_destination_master(shard_dest_master,
                                  ['test_keyspace/-40', 'test_keyspace/40-80'])

    # check that binlog server exported the stats vars
    self.check_binlog_server_vars(shard_0_replica, horizontal=True)
    self.check_binlog_server_vars(shard_1_replica, horizontal=True)

    # testing filtered replication: insert a bunch of data on shard 0 and 1,
    # check we get most of it after a few seconds, wait for binlog server
    # timeout, check we get all of it.
    logging.debug('Inserting lots of data on source shards')
    self._insert_lots(1000)
    logging.debug('Checking 80 percent of data is sent quickly')
    v = self._check_lots_timeout(1000, 80, 10)
    if v != 100:
      # small optimization: only do this check if we don't have all the data
      # already anyway.
      logging.debug('Checking all data goes through eventually')
      self._check_lots_timeout(1000, 100, 30)
    self.check_binlog_player_vars(shard_dest_master,
                                  ['test_keyspace/-40', 'test_keyspace/40-80'],
                                  seconds_behind_master_max=30)
    self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)
    self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)

    # use vtworker to compare the data (after health-checking the destination
    # rdonly tablets so discovery works)
    utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
    logging.debug('Running vtworker SplitDiff on first half')
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        '--source_uid', '0',
                        'test_keyspace/-80'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    logging.debug('Running vtworker SplitDiff on second half')
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        '--source_uid', '1',
                        'test_keyspace/-80'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)

    # get status for the destination master tablet, make sure we have it all
    self.check_running_binlog_player(shard_dest_master, 3000, 1000)

    # check destination master query service is not running
    utils.check_tablet_query_service(self, shard_dest_master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          shard_dest_master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

    # check the destination master is healthy, even though its query
    # service is not running (if it were not healthy, get_healthz would
    # raise an exception)
    shard_dest_master.get_healthz()

    # now serve rdonly from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -40 40-80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -40 40-80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # now serve replica from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -40 40-80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # now serve master from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_0_master, False, True)
    utils.check_tablet_query_service(self, shard_1_master, False, True)

    # check the binlog players are gone now
    self.check_no_binlog_player(shard_dest_master)

    # kill the original tablets in the original shards
    tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                         shard_1_master, shard_1_replica, shard_1_rdonly])
    for t in [shard_0_replica, shard_0_rdonly,
              shard_1_replica, shard_1_rdonly]:
      utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
    for t in [shard_0_master, shard_1_master]:
      utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                      auto_log=True)

    # delete the original shards
    utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
    utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

    # rebuild the serving graph; all mentions of the old shards should be gone
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    # kill everything else
    tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                         shard_dest_master, shard_dest_replica,
                         shard_dest_rdonly])
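Each check_srv_keyspace call in this example compares the serving graph against a literal 'Partitions(...)' string, and the expected text changes step by step as served types are migrated to the merged shard. A sketch that builds the same expectation programmatically (the format is copied from the assertions above):

def expected_partitions(shards_by_type):
  """Build the Partitions(...) string that check_srv_keyspace compares against.

  shards_by_type maps a served tablet type to its list of shard ranges,
  e.g. {'master': ['-80', '80-'], 'rdonly': ['-80', '80-'], ...}.
  """
  return ''.join(
      'Partitions(%s): %s\n' % (t, ' '.join(shards_by_type[t]))
      for t in sorted(shards_by_type))


# After the final MigrateServedTypes above, all three types are served by the
# merged shards:
#   expected_partitions({'master': ['-80', '80-'],
#                        'rdonly': ['-80', '80-'],
#                        'replica': ['-80', '80-']})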
Example No. 51
0
    def _test_reparent_from_outside(self, brutal=False):
        """This test will start a master and 3 slaves.

        Then:
        - one slave will be the new master
        - one slave will be reparented to that new master
        - one slave will be busted and dead in the water
        and we'll call TabletExternallyReparented.

        Args:
          brutal: kills the old master first
        """
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Reparent as a starting point
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ],
                        auto_log=True)

        # now manually reparent 1 out of 2 tablets
        # 62044 will be the new master
        # 31981 won't be re-parented, so it will be busted

        # Shutdown the old master first.
        if not brutal:
            tablet_62344.mquery('', mysql_flavor().demote_master_commands())

            # Get the position of the old master and wait for the new one to catch up.
            utils.wait_for_replication_pos(tablet_62344, tablet_62044)

        # Promote the new master.
        tablet_62044.mquery('', mysql_flavor().promote_slave_commands())
        new_pos = mysql_flavor().master_position(tablet_62044)
        logging.debug('New master position: %s', str(new_pos))
        # Use 'localhost' as hostname because Travis CI worker hostnames
        # are too long for MySQL replication.
        change_master_cmds = mysql_flavor().change_master_commands(
            'localhost', tablet_62044.mysql_port, new_pos)

        # 62344 will now be a slave of 62044
        tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                            change_master_cmds + ['START SLAVE'])

        # 41983 will be a slave of 62044
        tablet_41983.mquery('', ['STOP SLAVE'] + change_master_cmds +
                            ['START SLAVE'])

        # in brutal mode, we kill the old master first
        # and delete its tablet record
        if brutal:
            tablet_62344.kill_vttablet()
            utils.run_vtctl(
                ['DeleteTablet', '-allow_master', tablet_62344.tablet_alias],
                auto_log=True)

        base_time = time.time()

        # update topology with the new server
        utils.run_vtctl(
            ['TabletExternallyReparented', tablet_62044.tablet_alias],
            mode=utils.VTCTL_VTCTL,
            auto_log=True)

        self._test_reparent_from_outside_check(brutal, base_time)

        if not brutal:
            tablet_62344.kill_vttablet()
        tablet.kill_tablets([tablet_31981, tablet_62044, tablet_41983])
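The external-reparent flow hands the raw statements returned by mysql_flavor().change_master_commands() straight to mquery(). Their exact form depends on the MySQL flavor and the replication position object, but for classic file/position replication they boil down to a CHANGE MASTER TO pointing at the new master; a generic illustration (an assumption, not the framework's actual output):

def example_change_master_statements(host, port, log_file, log_pos):
  """Illustrative only: a classic (non-GTID) CHANGE MASTER TO statement."""
  return [
      "CHANGE MASTER TO"
      " MASTER_HOST='%s', MASTER_PORT=%d,"
      " MASTER_LOG_FILE='%s', MASTER_LOG_POS=%d" %
      (host, port, log_file, log_pos),
  ]


# As in the test above, such statements would be sandwiched between
# RESET/STOP SLAVE and START SLAVE on each remaining replica.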
Example No. 52
0
  def test_vertical_split(self):
    utils.run_vtctl(['CreateKeyspace', 'source_keyspace'])
    utils.run_vtctl(['CreateKeyspace',
                     '--served_from', 'master:source_keyspace,replica:source_keyspace,rdonly:source_keyspace',
                     'destination_keyspace'])
    source_master.init_tablet('master', 'source_keyspace', '0')
    source_replica.init_tablet('replica', 'source_keyspace', '0')
    source_rdonly1.init_tablet('rdonly', 'source_keyspace', '0')
    source_rdonly2.init_tablet('rdonly', 'source_keyspace', '0')

    # rebuild destination keyspace to make sure there is a serving
    # graph entry, even though there is no tablet yet.
    utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(rdonly): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')

    destination_master.init_tablet('master', 'destination_keyspace', '0')
    destination_replica.init_tablet('replica', 'destination_keyspace', '0')
    destination_rdonly1.init_tablet('rdonly', 'destination_keyspace', '0')
    destination_rdonly2.init_tablet('rdonly', 'destination_keyspace', '0')

    utils.run_vtctl(['RebuildKeyspaceGraph', 'source_keyspace'], auto_log=True)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'destination_keyspace'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(rdonly): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')

    # create databases so vttablet can start behaving normally
    for t in [source_master, source_replica, source_rdonly1, source_rdonly2]:
      t.create_db('vt_source_keyspace')
      t.start_vttablet(wait_for_state=None)
    destination_master.start_vttablet(wait_for_state=None,
                                      target_tablet_type='replica')
    for t in [destination_replica, destination_rdonly1, destination_rdonly2]:
      t.start_vttablet(wait_for_state=None)

    # wait for the tablets
    for t in [source_master, source_replica, source_rdonly1, source_rdonly2]:
      t.wait_for_vttablet_state('SERVING')
    for t in [destination_master, destination_replica, destination_rdonly1,
              destination_rdonly2]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['InitShardMaster', 'source_keyspace/0',
                     source_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', 'destination_keyspace/0',
                     destination_master.tablet_alias], auto_log=True)

    # read all the keyspaces, this will populate the topology cache.
    self._populate_topo_cache()

    # create the schema on the source keyspace, add some values
    self._create_source_schema()
    moving1_first = self._insert_values('moving1', 100)
    moving2_first = self._insert_values('moving2', 100)
    staying1_first = self._insert_values('staying1', 100)
    staying2_first = self._insert_values('staying2', 100)
    self._check_values(source_master, 'vt_source_keyspace', 'moving1',
                       moving1_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'moving2',
                       moving2_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'staying1',
                       staying1_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'staying2',
                       staying2_first, 100)
    self._check_values(source_master, 'vt_source_keyspace', 'view1',
                       moving1_first, 100)

    # the worker will do everything. We test with source_reader_count=10
    # (down from the default of 20) as the connection pool is not big enough
    # for 20. min_table_size_for_split is set to 1 so as to force a split
    # even on the small table we have.
    utils.run_vtctl(['CopySchemaShard', '--tables', 'moving.*,view1',
                     source_rdonly1.tablet_alias, 'destination_keyspace/0'],
                    auto_log=True)

    utils.run_vtworker(['--cell', 'test_nj',
                        '--command_display_interval', '10ms',
                        'VerticalSplitClone',
                        '--tables', 'moving.*,view1',
                        '--strategy=-populate_blp_checkpoint',
                        '--source_reader_count', '10',
                        '--min_table_size_for_split', '1',
                        'destination_keyspace/0'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', source_rdonly1.tablet_alias,
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', source_rdonly2.tablet_alias,
                     'rdonly'], auto_log=True)

    topology.refresh_keyspace(self.vtgate_client, 'destination_keyspace')

    # check values are present
    self._check_values(destination_master, 'vt_destination_keyspace', 'moving1',
                       moving1_first, 100)
    self._check_values(destination_master, 'vt_destination_keyspace', 'moving2',
                       moving2_first, 100)
    self._check_values(destination_master, 'vt_destination_keyspace', 'view1',
                       moving1_first, 100)

    # check the binlog player is running
    destination_master.wait_for_binlog_player_count(1)

    # add values to source, make sure they're replicated
    moving1_first_add1 = self._insert_values('moving1', 100)
    staying1_first_add1 = self._insert_values('staying1', 100)
    moving2_first_add1 = self._insert_values('moving2', 100)
    self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                               'moving1', moving1_first_add1, 100)
    self._check_values_timeout(destination_master, 'vt_destination_keyspace',
                               'moving2', moving2_first_add1, 100)

    # use vtworker to compare the data
    logging.debug("Running vtworker VerticalSplitDiff")
    utils.run_vtworker(['-cell', 'test_nj', 'VerticalSplitDiff',
                        'destination_keyspace/0'], auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', source_rdonly1.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', source_rdonly2.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', destination_rdonly1.tablet_alias,
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', destination_rdonly2.tablet_alias,
                     'rdonly'], auto_log=True)

    utils.pause("Good time to test vtworker for diffs")

    # get status for destination master tablet, make sure we have it all
    destination_master_status = destination_master.get_status()
    self.assertIn('Binlog player state: Running', destination_master_status)
    self.assertIn('moving.*', destination_master_status)
    self.assertIn(
        '<td><b>All</b>: 1000<br><b>Query</b>: 700<br>'
        '<b>Transaction</b>: 300<br></td>',
        destination_master_status)
    self.assertIn('</html>', destination_master_status)

    # check query service is off on destination master, as filtered
    # replication is enabled. Even health check should not interfere.
    destination_master_vars = utils.get_vars(destination_master.port)
    self.assertEqual(destination_master_vars['TabletStateName'], 'NOT_SERVING')

    # check we can't migrate the master just yet
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                    expect_fail=True)

    # migrate rdonly only in test_ny cell, make sure nothing is migrated
    # in test_nj
    utils.run_vtctl(['MigrateServedFrom', '--cells=test_ny',
                     'destination_keyspace/0', 'rdonly'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(rdonly): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly1, None)
    self._check_blacklisted_tables(source_rdonly2, None)

    # migrate test_nj only, using the manual fix command from the command
    # line, and then restore it.
    keyspace_json = utils.run_vtctl_json(['GetKeyspace', 'destination_keyspace'])
    self.assertEqual(keyspace_json['ServedFromMap']['rdonly']['Cells'],
                     ['test_nj'])
    utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace',
                     '-remove', '-cells=test_nj', 'destination_keyspace',
                     'rdonly'], auto_log=True)
    keyspace_json = utils.run_vtctl_json(['GetKeyspace', 'destination_keyspace'])
    self.assertFalse('rdonly' in keyspace_json['ServedFromMap'])
    utils.run_vtctl(['SetKeyspaceServedFrom', '-source=source_keyspace',
                     'destination_keyspace', 'rdonly'],
                    auto_log=True)
    keyspace_json = utils.run_vtctl_json(['GetKeyspace', 'destination_keyspace'])
    self.assertEqual(keyspace_json['ServedFromMap']['rdonly']['Cells'],
                     None)

    # now serve rdonly from the destination shards
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'rdonly'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace', ['rdonly'],
        ['master', 'replica'], ['moving1', 'moving2'])

    # then serve replica from the destination shards
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace', ['replica', 'rdonly'],
        ['master'], ['moving1', 'moving2'])

    # move replica back and forth
    utils.run_vtctl(['MigrateServedFrom', '-reverse',
                     'destination_keyspace/0', 'replica'], auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n' +
                             'ServedFrom(replica): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, None)
    self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'replica'],
                    auto_log=True)
    self._check_srv_keyspace('ServedFrom(master): source_keyspace\n')
    self._check_blacklisted_tables(source_master, None)
    self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace', ['replica', 'rdonly'],
        ['master'], ['moving1', 'moving2'])

    # then serve master from the destination shards
    utils.run_vtctl(['MigrateServedFrom', 'destination_keyspace/0', 'master'],
                    auto_log=True)
    self._check_srv_keyspace('')
    self._check_blacklisted_tables(source_master, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_replica, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly1, ['moving.*', 'view1'])
    self._check_blacklisted_tables(source_rdonly2, ['moving.*', 'view1'])
    self._check_client_conn_redirection(
        'source_keyspace', 'destination_keyspace',
        ['replica', 'rdonly', 'master'], [], ['moving1', 'moving2'])

    # check 'vtctl SetShardTabletControl' command works as expected:
    # clear the rdonly entry, re-add it, and then clear all entries.
    utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                     'rdonly'], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertNotIn('rdonly', shard_json['TabletControlMap'])
    self.assertIn('replica', shard_json['TabletControlMap'])
    self.assertIn('master', shard_json['TabletControlMap'])
    utils.run_vtctl(['SetShardTabletControl', '--tables=moving.*,view1',
                     'source_keyspace/0', 'rdonly'], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertEqual(['moving.*', 'view1'],
                     shard_json['TabletControlMap']['rdonly']['BlacklistedTables'])
    utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                     'replica'], auto_log=True)
    utils.run_vtctl(['SetShardTabletControl', '--remove', 'source_keyspace/0',
                     'master'], auto_log=True)
    shard_json = utils.run_vtctl_json(['GetShard', 'source_keyspace/0'])
    self.assertEqual(None, shard_json['TabletControlMap'])

    # check the binlog player is gone now
    destination_master.wait_for_binlog_player_count(0)

    # optional method to check the stats are correct
    self._check_stats()

    # kill everything
    tablet.kill_tablets([source_master, source_replica, source_rdonly1,
                         source_rdonly2, destination_master,
                         destination_replica, destination_rdonly1,
                         destination_rdonly2])
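The _check_values and _check_values_timeout helpers referenced throughout this example are not shown; a row-count check in the same spirit, assuming mquery() as used above and assuming each _insert_values call adds 100 rows:

def check_row_count(tablet_obj, dbname, table, expected):
  """Assert a table on the given tablet contains exactly `expected` rows."""
  result = tablet_obj.mquery(dbname, 'select count(*) from %s' % table)
  count = result[0][0]
  if count != expected:
    raise AssertionError('%s.%s has %s rows, expected %d'
                         % (dbname, table, count, expected))


# e.g. after both insert batches on moving1:
#   check_row_count(destination_master, 'vt_destination_keyspace',
#                   'moving1', 200)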
Example No. 53
0
    def test_reparent_cross_cell(self, shard_id='0'):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
        self.assertEqual(
            shard['cells'], ['test_nj'],
            'wrong list of cell in Shard: %s' % str(shard['cells']))

        # Create a few slaves for testing reparenting. Won't be healthy
        # as replication is not running.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 shard_id,
                                 start=True,
                                 wait_for_start=False)
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/' + shard_id])
        self.assertEqual(
            shard['cells'], ['test_nj', 'test_ny'],
            'wrong list of cell in Shard: %s' % str(shard['cells']))

        # Force the slaves to reparent assuming that all the datasets are
        # identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/' + shard_id,
            tablet_62344.tablet_alias
        ],
                        auto_log=True)
        utils.validate_topology(ping_tablets=True)

        self._check_master_tablet(tablet_62344)

        # Perform a graceful reparent operation to another cell.
        utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/' +
            shard_id, '-new_master', tablet_31981.tablet_alias
        ],
                        auto_log=True)
        utils.validate_topology()

        self._check_master_tablet(tablet_31981)

        tablet.kill_tablets(
            [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
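The cross-cell assertions read the cells list directly from the GetShard JSON: it contains only test_nj until a tablet in test_ny registers. A sketch that waits for a cell to appear instead of asserting immediately, using the same run_vtctl_json call:

import time

import utils


def wait_for_cell_in_shard(keyspace_shard, cell, timeout_s=10.0):
  """Poll GetShard until `cell` shows up in the shard record's cells list."""
  deadline = time.time() + timeout_s
  while True:
    shard = utils.run_vtctl_json(['GetShard', keyspace_shard])
    if cell in (shard.get('cells') or []):
      return shard
    if time.time() >= deadline:
      raise Exception('cell %s never appeared in %s (cells=%s)'
                      % (cell, keyspace_shard, shard.get('cells')))
    time.sleep(0.5)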
Example No. 54
0
    def test_resharding(self):
        utils.run_vtctl([
            'CreateKeyspace', '--sharding_column_name', 'bad_column',
            '--sharding_column_type', 'bytes', 'test_keyspace'
        ])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'
        ],
                        expect_fail=True)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'keyspace_id', keyspace_id_type
        ])

        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_ny_slave.init_tablet('spare', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
        shard_1_ny_slave.init_tablet('spare', 'test_keyspace', '80-')
        shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # we set full_mycnf_args to True as a test in the KIT_BYTES case
        full_mycnf_args = keyspace_id_type == keyrange_constants.KIT_BYTES

        # create databases so vttablet can start behaving normally
        for t in [
                shard_0_master, shard_0_replica, shard_0_ny_slave,
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_slave, shard_1_rdonly
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             full_mycnf_args=full_mycnf_args)

        # wait for the tablets
        shard_0_master.wait_for_vttablet_state('SERVING')
        shard_0_replica.wait_for_vttablet_state('SERVING')
        shard_0_ny_slave.wait_for_vttablet_state('NOT_SERVING')  # spare
        shard_1_master.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('SERVING')
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
        shard_1_ny_slave.wait_for_vttablet_state('NOT_SERVING')  # spare
        shard_1_rdonly.wait_for_vttablet_state('SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/-80',
            shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/80-',
            shard_1_master.tablet_alias
        ],
                        auto_log=True)

        # create the tables
        self._create_schema()
        self._insert_startup_values()
        self._test_keyrange_constraints()

        # create the split shards
        shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
        shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-c0')
        shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-c0')
        shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
        shard_3_replica.init_tablet('spare', 'test_keyspace', 'c0-')
        shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'c0-')

        # start vttablet on the split shards (no db created,
        # so they're all not serving)
        shard_3_master.start_vttablet(wait_for_state=None,
                                      target_tablet_type='replica')
        for t in [
                shard_2_master, shard_2_replica1, shard_2_replica2,
                shard_3_replica, shard_3_rdonly
        ]:
            t.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_master, shard_2_replica1, shard_2_replica2,
                shard_3_master, shard_3_replica, shard_3_rdonly
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/80-c0',
            shard_2_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/c0-',
            shard_3_master.tablet_alias
        ],
                        auto_log=True)

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # take the snapshot for the split
        utils.run_vtctl([
            'MultiSnapshot', '--spec=80-c0-', '--exclude_tables=unrelated',
            shard_1_slave1.tablet_alias
        ],
                        auto_log=True)

        # the snapshot_copy hook will copy the snapshot files to
        # VTDATAROOT/tmp/... as a test. We want to use these for one half,
        # but not for the other, so we test both scenarios.
        os.unlink(
            os.path.join(
                environment.tmproot, "snapshot-from-%s-for-%s.tar" %
                (shard_1_slave1.tablet_alias, "80-c0")))

        # wait for tablet's binlog server service to be enabled after snapshot
        shard_1_slave1.wait_for_binlog_server_state("Enabled")

        # perform the restores: first one from source tablet. We removed the
        # storage backup, so it's coming from the tablet itself.
        # we also delay starting the binlog player, then enable it.
        utils.run_vtctl([
            'ShardMultiRestore',
            '-strategy=populateBlpCheckpoint,dontStartBinlogPlayer',
            'test_keyspace/80-c0', shard_1_slave1.tablet_alias
        ],
                        auto_log=True)

        timeout = 10
        while True:
            shard_2_master_status = shard_2_master.get_status()
            if not "not starting because flag &#39;DontStart&#39; is set" in shard_2_master_status:
                timeout = utils.wait_step(
                    'shard 2 master has not failed starting yet', timeout)
                continue
            logging.debug("shard 2 master is waiting on flag removal, good")
            break

        qr = utils.run_vtctl_json([
            'ExecuteFetch', shard_2_master.tablet_alias,
            'update _vt.blp_checkpoint set flags="" where source_shard_uid=0'
        ])
        self.assertEqual(qr['RowsAffected'], 1)

        timeout = 10
        while True:
            shard_2_master_status = shard_2_master.get_status()
            if "not starting because flag &#39;DontStart&#39; is set" in shard_2_master_status:
                timeout = utils.wait_step(
                    'shard 2 master has not started replication yet', timeout)
                continue
            logging.debug("shard 2 master has started replication, good")
            break
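        # Sketch, not part of the original test: the two polling loops above
        # share the same wait-for-substring-in-status pattern. A local helper
        # like this one, reusing get_status() and utils.wait_step() exactly as
        # above, could express it; it is defined here only for illustration.
        def _wait_for_status_substring(t, substring, present, timeout=10):
            while True:
                status = t.get_status()
                if (substring in status) == present:
                    return
                timeout = utils.wait_step(
                    'waiting for %r (present=%s) in %s status page' %
                    (substring, present, t.tablet_alias), timeout)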

        # second restore from storage: to be sure, we stop vttablet, and restart
        # it afterwards
        shard_1_slave1.kill_vttablet()
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/c0-', shard_1_slave1.tablet_alias
        ],
                        auto_log=True)
        shard_1_slave1.start_vttablet(wait_for_state=None)
        shard_1_slave1.wait_for_binlog_server_state("Enabled")

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl([
            'ValidateSchemaKeyspace', '--exclude_tables=unrelated',
            'test_keyspace'
        ],
                        auto_log=True)

        # check the binlog players are running and exporting vars
        shard_2_master.wait_for_binlog_player_count(1)
        shard_3_master.wait_for_binlog_player_count(1)
        self._check_binlog_player_vars(shard_2_master)
        self._check_binlog_player_vars(shard_3_master)

        # check that binlog server exported the stats vars
        self._check_binlog_server_vars(shard_1_slave1)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000)
        logging.debug("Checking 80 percent of data is sent quickly")
        self._check_lots_timeout(1000, 80, 5)
        logging.debug("Checking all data goes through eventually")
        self._check_lots_timeout(1000, 100, 20)
        logging.debug("Checking no data was sent the wrong way")
        self._check_lots_not_present(1000)
        self._check_binlog_player_vars(shard_2_master,
                                       seconds_behind_master_max=30)
        self._check_binlog_player_vars(shard_3_master,
                                       seconds_behind_master_max=30)

        # use the vtworker checker to compare the data
        logging.debug("Running vtworker SplitDiff")
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause("Good time to test vtworker for diffs")

        # get status for a destination master tablet, make sure we have it all
        shard_2_master_status = shard_2_master.get_status()
        self.assertIn('Binlog player state: Running', shard_2_master_status)
        self.assertIn(
            '<td><b>All</b>: 6000<br><b>Query</b>: 4000<br><b>Transaction</b>: 2000<br></td>',
            shard_2_master_status)
        self.assertIn('</html>', shard_2_master_status)

        # start threads to insert timestamped data into shard_1 in the
        # background, and threads to monitor the replication delay
        insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                       0x9000000000000000)
        insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                       0xD000000000000000)
        monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
        monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

        # tests a failover switching serving to a different replica
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

        # test data goes through again
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000, base=1000)
        logging.debug("Checking 80 percent of data was sent quickly")
        self._check_lots_timeout(1000, 80, 5, base=1000)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        expect_fail=True)

        # check the query service is off on master 2 and master 3, as filtered
        # replication is enabled. Even the health check that is enabled on
        # master 3 should not interfere.
        shard_2_master_vars = utils.get_vars(shard_2_master.port)
        self.assertEqual(shard_2_master_vars['TabletStateName'], 'NOT_SERVING')
        shard_3_master_vars = utils.get_vars(shard_3_master.port)
        self.assertEqual(shard_3_master_vars['TabletStateName'], 'NOT_SERVING')

        # now serve rdonly from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-c0 c0-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # then serve replica from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-c0 c0-\n' +
                                 'Partitions(replica): -80 80-c0 c0-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
            auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-c0 c0-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-c0 c0-\n' +
                                 'Partitions(replica): -80 80-c0 c0-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # reparent shard_2 to shard_2_replica1, then insert more data and
        # see it flow through still
        utils.run_vtctl([
            'ReparentShard', 'test_keyspace/80-c0',
            shard_2_replica1.tablet_alias
        ])
        logging.debug(
            "Inserting lots of data on source shard after reparenting")
        self._insert_lots(3000, base=2000)
        logging.debug("Checking 80 percent of data was sent fairly quickly")
        self._check_lots_timeout(3000, 80, 10, base=2000)

        # use the vtworker checker to compare the data again
        logging.debug("Running vtworker SplitDiff")
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/c0-'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)

        # going to migrate the master now, check the delays
        monitor_thread_1.done = True
        monitor_thread_2.done = True
        insert_thread_1.done = True
        insert_thread_2.done = True
        logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                      monitor_thread_1.object_name, monitor_thread_1.max_lag,
                      monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
        logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                      monitor_thread_2.object_name, monitor_thread_2.max_lag,
                      monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

        # then serve master from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-c0 c0-\n' +
                                 'Partitions(rdonly): -80 80-c0 c0-\n' +
                                 'Partitions(replica): -80 80-c0 c0-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # check the binlog players are gone now
        shard_2_master.wait_for_binlog_player_count(0)
        shard_3_master.wait_for_binlog_player_count(0)

        # get status for a destination master tablet, make sure it's good
        shard_2_master_status = shard_2_master.get_status()
        self.assertIn('No binlog player is running', shard_2_master_status)
        self.assertIn('</html>', shard_2_master_status)

        # scrap the original tablets in the original shard
        for t in [
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_slave, shard_1_rdonly
        ]:
            utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
        tablet.kill_tablets([
            shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_slave,
            shard_1_rdonly
        ])
        for t in [
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_slave, shard_1_rdonly
        ]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)

        # rebuild the serving graph, all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # test RemoveShardCell
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                        auto_log=True,
                        expect_fail=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                        auto_log=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                        auto_log=True)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
        if shard['Cells']:
            self.fail("Non-empty Cells record for shard: %s" % str(shard))

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

        # kill everything
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_ny_slave, shard_2_master,
            shard_2_replica1, shard_2_replica2, shard_3_master,
            shard_3_replica, shard_3_rdonly
        ])
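
The delay checks above read object_name, done, max_lag, lag_sum and sample_count from MonitorLagThread objects whose definition is not part of this excerpt. A minimal sketch of such a lag-monitoring thread, assuming a hypothetical _read_replicated_timestamp() that returns the source-side insert timestamp most recently replicated to the watched tablet:

import threading
import time


class MonitorLagThread(threading.Thread):
    """Sketch only (not the original helper): polls a destination tablet and
    records max/average replication lag until .done is set to True."""

    def __init__(self, tablet_obj, object_name, interval=1.0):
        threading.Thread.__init__(self)
        self.tablet = tablet_obj
        self.object_name = object_name
        self.interval = interval
        self.done = False
        self.max_lag = 0
        self.lag_sum = 0
        self.sample_count = 0
        self.daemon = True
        self.start()

    def run(self):
        while not self.done:
            ts = self._read_replicated_timestamp()
            if ts is not None:
                lag = int(time.time()) - ts
                self.max_lag = max(self.max_lag, lag)
                self.lag_sum += lag
                self.sample_count += 1
            time.sleep(self.interval)

    def _read_replicated_timestamp(self):
        # Hypothetical: query self.tablet for the newest source-side insert
        # timestamp already replicated; return None if nothing arrived yet.
        return None
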
Example No. 55
0
    def test_resharding(self):
        utils.run_vtctl([
            'CreateKeyspace', '--sharding_column_name', 'bad_column',
            '--sharding_column_type', 'bytes', 'test_keyspace'
        ])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'keyspace_id', 'uint64'
        ],
                        expect_fail=True)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'keyspace_id', keyspace_id_type
        ])

        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave2.init_tablet('spare', 'test_keyspace', '80-')
        shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # create databases so vttablet can start behaving normally
        for t in [
                shard_0_master, shard_0_replica, shard_1_master,
                shard_1_slave1, shard_1_slave2, shard_1_rdonly
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None)

        # wait for the tablets
        shard_0_master.wait_for_vttablet_state('SERVING')
        shard_0_replica.wait_for_vttablet_state('SERVING')
        shard_1_master.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('SERVING')
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')  # spare
        shard_1_rdonly.wait_for_vttablet_state('SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/-80',
            shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/80-',
            shard_1_master.tablet_alias
        ],
                        auto_log=True)

        # create the tables
        self._create_schema()
        self._insert_startup_values()

        # create the split shards
        shard_2_master.init_tablet('master', 'test_keyspace', '80-C0')
        shard_2_replica1.init_tablet('spare', 'test_keyspace', '80-C0')
        shard_2_replica2.init_tablet('spare', 'test_keyspace', '80-C0')
        shard_3_master.init_tablet('master', 'test_keyspace', 'C0-')
        shard_3_replica.init_tablet('spare', 'test_keyspace', 'C0-')
        shard_3_rdonly.init_tablet('rdonly', 'test_keyspace', 'C0-')

        # start vttablet on the split shards (no db created,
        # so they're all not serving)
        for t in [
                shard_2_master, shard_2_replica1, shard_2_replica2,
                shard_3_master, shard_3_replica, shard_3_rdonly
        ]:
            t.start_vttablet(wait_for_state=None)
        shard_2_master.wait_for_vttablet_state('CONNECTING')
        shard_2_replica1.wait_for_vttablet_state('NOT_SERVING')
        shard_2_replica2.wait_for_vttablet_state('NOT_SERVING')
        shard_3_master.wait_for_vttablet_state('CONNECTING')
        shard_3_replica.wait_for_vttablet_state('NOT_SERVING')
        shard_3_rdonly.wait_for_vttablet_state('CONNECTING')

        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/80-C0',
            shard_2_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ReparentShard', '-force', 'test_keyspace/C0-',
            shard_3_master.tablet_alias
        ],
                        auto_log=True)

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # take the snapshot for the split
        utils.run_vtctl(
            ['MultiSnapshot', '--spec=80-C0-', shard_1_slave1.tablet_alias],
            auto_log=True)

        # wait for the tablet's binlog server to be enabled after the snapshot
        shard_1_slave1.wait_for_binlog_server_state("Enabled")

        # perform the restore.
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/80-C0', shard_1_slave1.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'ShardMultiRestore', '-strategy=populateBlpCheckpoint',
            'test_keyspace/C0-', shard_1_slave1.tablet_alias
        ],
                        auto_log=True)

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                        auto_log=True)

        # check the binlog players are running
        shard_2_master.wait_for_binlog_player_count(1)
        shard_3_master.wait_for_binlog_player_count(1)

        # check that binlog server exported the stats vars
        self._check_binlog_server_vars(shard_1_slave1)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000)
        logging.debug("Checking 80 percent of data is sent quickly")
        self._check_lots_timeout(1000, 80, 5)
        logging.debug("Checking all data goes through eventually")
        self._check_lots_timeout(1000, 100, 20)
        logging.debug("Checking no data was sent the wrong way")
        self._check_lots_not_present(1000)

        # use the vtworker checker to compare the data
        logging.debug("Running vtworker SplitDiff")
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause("Good time to test vtworker for diffs")

        # start a thread to insert data into shard_1 in the background
        # with current time, and monitor the delay
        insert_thread_1 = InsertThread(shard_1_master, "insert_low", 10000,
                                       0x9000000000000000)
        insert_thread_2 = InsertThread(shard_1_master, "insert_high", 10001,
                                       0xD000000000000000)
        monitor_thread_1 = MonitorLagThread(shard_2_replica2, "insert_low")
        monitor_thread_2 = MonitorLagThread(shard_3_replica, "insert_high")

        # tests a failover switching serving to a different replica
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')

        # test data goes through again
        logging.debug("Inserting lots of data on source shard")
        self._insert_lots(1000, base=1000)
        logging.debug("Checking 80 percent of data was sent quickly")
        self._check_lots_timeout(1000, 80, 5, base=1000)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        expect_fail=True)

        # now serve rdonly from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-C0 C0-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # then serve replica from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-C0 C0-\n' +
                                 'Partitions(replica): -80 80-C0 C0-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
            auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-C0 C0-\n' +
                                 'Partitions(replica): -80 80-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-\n' +
                                 'Partitions(rdonly): -80 80-C0 C0-\n' +
                                 'Partitions(replica): -80 80-C0 C0-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # reparent shard_2 to shard_2_replica1, then insert more data and
        # see it flow through still
        utils.run_vtctl([
            'ReparentShard', 'test_keyspace/80-C0',
            shard_2_replica1.tablet_alias
        ])
        logging.debug(
            "Inserting lots of data on source shard after reparenting")
        self._insert_lots(3000, base=2000)
        logging.debug("Checking 80 percent of data was sent fairly quickly")
        self._check_lots_timeout(3000, 80, 10, base=2000)

        # use the vtworker checker to compare the data again
        logging.debug("Running vtworker SplitDiff")
        utils.run_vtworker(
            ['-cell', 'test_nj', 'SplitDiff', 'test_keyspace/C0-'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly.tablet_alias, 'rdonly'],
            auto_log=True)

        # going to migrate the master now, check the delays
        monitor_thread_1.done = True
        monitor_thread_2.done = True
        insert_thread_1.done = True
        insert_thread_2.done = True
        logging.debug("DELAY 1: %s max_lag=%u avg_lag=%u",
                      monitor_thread_1.object_name, monitor_thread_1.max_lag,
                      monitor_thread_1.lag_sum / monitor_thread_1.sample_count)
        logging.debug("DELAY 2: %s max_lag=%u avg_lag=%u",
                      monitor_thread_2.object_name, monitor_thread_2.max_lag,
                      monitor_thread_2.lag_sum / monitor_thread_2.sample_count)

        # then serve master from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace('test_nj',
                                 'test_keyspace',
                                 'Partitions(master): -80 80-C0 C0-\n' +
                                 'Partitions(rdonly): -80 80-C0 C0-\n' +
                                 'Partitions(replica): -80 80-C0 C0-\n' +
                                 'TabletTypes: master,rdonly,replica',
                                 keyspace_id_type=keyspace_id_type)

        # check the binlog players are gone now
        shard_2_master.wait_for_binlog_player_count(0)
        shard_3_master.wait_for_binlog_player_count(0)

        # scrap the original tablets in the original shard
        for t in [
                shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly
        ]:
            utils.run_vtctl(['ScrapTablet', t.tablet_alias], auto_log=True)
        tablet.kill_tablets(
            [shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_rdonly])

        # rebuild the serving graph, all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # test RemoveShardCell
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                        auto_log=True,
                        expect_fail=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                        auto_log=True)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
        if shard['Cells']:
            self.fail("Non-empty Cells record for shard: %s" % str(shard))

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

        # kill everything
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_2_master, shard_2_replica1,
            shard_2_replica2, shard_3_master, shard_3_replica, shard_3_rdonly
        ])
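
The _insert_lots / _check_lots_timeout helpers used throughout this test are defined outside this excerpt. A rough sketch of the timeout-based check, assuming a hypothetical _check_lots(count, base) that returns the percentage of the inserted rows already visible on the destination shards, and that `time` is imported at module level:

    def _check_lots_timeout(self, count, threshold, timeout, base=0):
        # Sketch only: poll until at least `threshold` percent of the rows
        # inserted by _insert_lots(count, base) are visible, or give up after
        # `timeout` seconds and return the last percentage seen.
        deadline = time.time() + timeout
        while True:
            percent = self._check_lots(count, base=base)
            if percent >= threshold or time.time() >= deadline:
                return percent
            time.sleep(1)
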
Example No. 56
0
    def test_reparent_avoid(self):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up the mysql and vttablet that will become the master
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting. Won't be healthy
        # as replication is not running.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        for t in [tablet_62344, tablet_62044, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Force the slaves to reparent assuming that all the datasets are
        # identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ],
                        auto_log=True)

        utils.validate_topology(ping_tablets=True)
        self._check_master_tablet(tablet_62344)

        # Perform a reparent operation with avoid_master pointing to non-master. It
        # should succeed without doing anything.
        utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/0',
            '-avoid_master', tablet_62044.tablet_alias
        ],
                        auto_log=True)

        utils.validate_topology()
        self._check_master_tablet(tablet_62344)

        # Perform a reparent operation with avoid_master pointing to master.
        utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/0',
            '-avoid_master', tablet_62344.tablet_alias
        ],
                        auto_log=True)

        utils.validate_topology()
        # 62044 is in the same cell and 31981 is in a different cell, so we must
        # land on 62044
        self._check_master_tablet(tablet_62044)

        # If we kill the tablet in the same cell as master then reparent
        # -avoid_master will fail.
        tablet_62344.kill_vttablet()
        _, stderr = utils.run_vtctl([
            'PlannedReparentShard', '-keyspace_shard', 'test_keyspace/0',
            '-avoid_master', tablet_62044.tablet_alias
        ],
                                    auto_log=True,
                                    expect_fail=True)
        self.assertIn('cannot find a tablet to reparent to', stderr)

        utils.validate_topology()
        self._check_master_tablet(tablet_62044)

        tablet.kill_tablets(
            [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
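
The _check_master_tablet assertions above rely on a helper that is not shown in this excerpt. Based on the GetTablet check used in test_health_check later in this listing, a minimal sketch could be:

    def _check_master_tablet(self, t):
        # Sketch: assert that the topology records tablet `t` as the MASTER.
        ti = utils.run_vtctl_json(['GetTablet', t.tablet_alias])
        self.assertEqual(ti['type'], topodata_pb2.MASTER,
                         'unexpected tablet type: %s' % ti['type'])
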
Example No. 57
0
    def test_repeated_init_shard_master(self):
        """Test that using InitShardMaster can go back and forth between 2 hosts."""
        for t in tablet_62344, tablet_62044:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             lameduck_period='5s',
                             init_tablet_type='replica',
                             init_keyspace='test_keyspace',
                             init_shard='0')

        # Tablets are not replicating, so they won't be healthy.
        for t in tablet_62344, tablet_62044:
            t.wait_for_vttablet_state('NOT_SERVING')
            self.check_healthz(t, False)

        # Pick one master out of the two.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ])

        # Run health check on both, make sure they are both healthy.
        # Also make sure the types are correct.
        for t in tablet_62344, tablet_62044:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
            self.check_healthz(t, True)
        utils.wait_for_tablet_type(tablet_62344.tablet_alias,
                                   'master',
                                   timeout=0)
        utils.wait_for_tablet_type(tablet_62044.tablet_alias,
                                   'replica',
                                   timeout=0)

        # Pick the other one as master, make sure they are still healthy.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62044.tablet_alias
        ])

        # Run health check on both, make sure they are both healthy.
        # Also make sure the types are correct.
        for t in tablet_62344, tablet_62044:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
            self.check_healthz(t, True)
        utils.wait_for_tablet_type(tablet_62344.tablet_alias,
                                   'replica',
                                   timeout=0)
        utils.wait_for_tablet_type(tablet_62044.tablet_alias,
                                   'master',
                                   timeout=0)

        # Come back to the original guy.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ])

        # Run health check on both, make sure they are both healthy.
        # Also make sure the types are correct.
        for t in tablet_62344, tablet_62044:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias], auto_log=True)
            self.check_healthz(t, True)
        utils.wait_for_tablet_type(tablet_62344.tablet_alias,
                                   'master',
                                   timeout=0)
        utils.wait_for_tablet_type(tablet_62044.tablet_alias,
                                   'replica',
                                   timeout=0)

        # And done.
        tablet.kill_tablets([tablet_62344, tablet_62044])
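
check_healthz(t, expected) is defined elsewhere in the test harness. A sketch of the idea, assuming it probes the standard vttablet /healthz HTTP endpoint, where anything other than an 'ok' 200 response counts as unhealthy:

    def check_healthz(self, t, expected_healthy):
        # Sketch: hit the tablet's /healthz endpoint and compare the outcome.
        import urllib2  # Python 2, as elsewhere in this module
        try:
            body = urllib2.urlopen('http://localhost:%d/healthz' % t.port).read()
            healthy = 'ok' in body
        except urllib2.URLError:
            healthy = False
        self.assertEqual(healthy, expected_healthy,
                         'unexpected healthz state for %s' % t.tablet_alias)
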
Example No. 58
0
    def test_resharding(self):
        global l2vtgate1, l2vtgate2

        # create the keyspace with just one shard
        shard_master.init_tablet('replica',
                                 keyspace='test_keyspace',
                                 shard='0',
                                 tablet_index=0)
        shard_replica.init_tablet('replica',
                                  keyspace='test_keyspace',
                                  shard='0',
                                  tablet_index=1)
        shard_rdonly1.init_tablet('rdonly',
                                  keyspace='test_keyspace',
                                  shard='0',
                                  tablet_index=2)

        for t in [shard_master, shard_replica, shard_rdonly1]:
            t.create_db('vt_test_keyspace')

        shard_master.start_vttablet(wait_for_state=None,
                                    binlog_use_v3_resharding_mode=False)
        shard_replica.start_vttablet(wait_for_state=None,
                                     binlog_use_v3_resharding_mode=False)
        shard_rdonly1.start_vttablet(wait_for_state=None,
                                     binlog_use_v3_resharding_mode=False)

        for t in [shard_master, shard_replica, shard_rdonly1]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            shard_master.tablet_alias
        ],
                        auto_log=True)

        utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
        utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
        for t in [shard_master, shard_replica, shard_rdonly1]:
            t.wait_for_vttablet_state('SERVING')

        # create the tables and add startup values
        self._create_schema()
        self._insert_startup_values()

        # reload schema on all tablets so we can query them
        for t in [shard_master, shard_replica, shard_rdonly1]:
            utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

        # We must start vtgate after the tablets are up, or else wait for the
        # 1-minute refresh (the tablet_refresh_interval parameter of the
        # discovery gateway). We want cache_ttl at zero so we re-read the
        # topology for every test query.
        if use_l2vtgate:
            l2vtgate1 = utils.L2VtGate()
            l2vtgate1.start(
                tablets=[shard_master, shard_replica, shard_rdonly1])
            l2vtgate1.wait_for_endpoints('test_keyspace.0.master', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.0.replica', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.0.rdonly', 1)

            _, addr = l2vtgate1.rpc_endpoint()
            l2vtgate1_param = '%s|test_keyspace|0' % addr
            utils.VtGate().start(cache_ttl='0', l2vtgates=[
                l2vtgate1_param,
            ])

        else:
            utils.VtGate().start(
                cache_ttl='0',
                tablets=[shard_master, shard_replica, shard_rdonly1])
            utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

        # check the Map Reduce API works correctly, should use ExecuteShards,
        # as we're not sharded yet.
        # we have 3 values in the database, asking for 4 splits will get us
        # a single query.
        sql = 'select id, msg from resharding1'
        s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
        self.assertEqual(len(s), 1)
        self.assertEqual(s[0]['shard_part']['shards'][0], '0')

        # change the schema, backfill keyspace_id, and change schema again
        self._add_sharding_key_to_schema()
        self._backfill_keyspace_id(shard_master)
        self._mark_sharding_key_not_null()

        # now we can be a sharded keyspace (and propagate to SrvKeyspace)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col',
            base_sharding.keyspace_id_type
        ])
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # run a health check on source replica so it responds to discovery
        utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

        # create the split shards
        shard_0_master.init_tablet('replica',
                                   keyspace='test_keyspace',
                                   shard='-80',
                                   tablet_index=0)
        shard_0_replica.init_tablet('replica',
                                    keyspace='test_keyspace',
                                    shard='-80',
                                    tablet_index=1)
        shard_0_rdonly1.init_tablet('rdonly',
                                    keyspace='test_keyspace',
                                    shard='-80',
                                    tablet_index=2)
        shard_1_master.init_tablet('replica',
                                   keyspace='test_keyspace',
                                   shard='80-',
                                   tablet_index=0)
        shard_1_replica.init_tablet('replica',
                                    keyspace='test_keyspace',
                                    shard='80-',
                                    tablet_index=1)
        shard_1_rdonly1.init_tablet('rdonly',
                                    keyspace='test_keyspace',
                                    shard='80-',
                                    tablet_index=2)

        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly1,
                shard_1_master, shard_1_replica, shard_1_rdonly1
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             binlog_use_v3_resharding_mode=False)

        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly1,
                shard_1_master, shard_1_replica, shard_1_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/-80',
            shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/80-',
            shard_1_master.tablet_alias
        ],
                        auto_log=True)

        for t in [shard_0_replica, shard_1_replica]:
            utils.wait_for_tablet_type(t.tablet_alias, 'replica')
        for t in [shard_0_rdonly1, shard_1_rdonly1]:
            utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

        sharded_tablets = [
            shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
            shard_1_replica, shard_1_rdonly1
        ]
        for t in sharded_tablets:
            t.wait_for_vttablet_state('SERVING')

        # must restart vtgate after the tablets are up, or else wait for the
        # 1-minute refresh; we want cache_ttl at zero so we re-read the
        # topology for every test query.
        utils.vtgate.kill()
        if use_l2vtgate:
            l2vtgate1.kill()

            l2vtgate1 = utils.L2VtGate()
            l2vtgate1.start(tablets=[
                shard_master, shard_replica, shard_rdonly1, shard_0_master,
                shard_0_replica, shard_0_rdonly1
            ],
                            tablet_filters='test_keyspace|0,test_keyspace|-80')
            l2vtgate1.wait_for_endpoints('test_keyspace.0.master', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.0.replica', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.0.rdonly', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.-80.master', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.-80.replica', 1)
            l2vtgate1.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
            l2vtgate1.verify_no_endpoint('test_keyspace.80-.master')
            l2vtgate1.verify_no_endpoint('test_keyspace.80-.replica')
            l2vtgate1.verify_no_endpoint('test_keyspace.80-.rdonly')

            # FIXME(alainjobart) we clear tablet_types_to_wait, as this
            # l2vtgate2 doesn't serve the current test_keyspace shard, which
            # is test_keyspace.0. This is not ideal; we should rework which
            # keyspace/shards an l2vtgate waits for, e.g. derive them from
            # tablet_filters.
            l2vtgate2 = utils.L2VtGate()
            l2vtgate2.start(
                tablets=[shard_1_master, shard_1_replica, shard_1_rdonly1],
                tablet_filters='test_keyspace|80-',
                tablet_types_to_wait='')
            l2vtgate2.wait_for_endpoints('test_keyspace.80-.master', 1)
            l2vtgate2.wait_for_endpoints('test_keyspace.80-.replica', 1)
            l2vtgate2.wait_for_endpoints('test_keyspace.80-.rdonly', 1)
            l2vtgate2.verify_no_endpoint('test_keyspace.0.master')
            l2vtgate2.verify_no_endpoint('test_keyspace.0.replica')
            l2vtgate2.verify_no_endpoint('test_keyspace.0.rdonly')
            l2vtgate2.verify_no_endpoint('test_keyspace.-80.master')
            l2vtgate2.verify_no_endpoint('test_keyspace.-80.replica')
            l2vtgate2.verify_no_endpoint('test_keyspace.-80.rdonly')

            _, addr1 = l2vtgate1.rpc_endpoint()
            _, addr2 = l2vtgate2.rpc_endpoint()
            l2vtgate1_param1 = '%s|test_keyspace|0' % addr1
            l2vtgate1_param2 = '%s|test_keyspace|-80' % addr1
            l2vtgate2_param = '%s|test_keyspace|80-' % addr2
            utils.VtGate().start(cache_ttl='0',
                                 l2vtgates=[
                                     l2vtgate1_param1,
                                     l2vtgate1_param2,
                                     l2vtgate2_param,
                                 ])

        else:
            utils.VtGate().start(cache_ttl='0',
                                 tablets=[
                                     shard_master, shard_replica,
                                     shard_rdonly1, shard_0_master,
                                     shard_0_replica, shard_0_rdonly1,
                                     shard_1_master, shard_1_replica,
                                     shard_1_rdonly1
                                 ])
            utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

        # check the Map Reduce API works correctly, should use ExecuteKeyRanges now,
        # as we are sharded (with just one shard).
        # again, we have 3 values in the database, asking for 4 splits will get us
        # a single query.
        sql = 'select id, msg from resharding1'
        s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
        self.assertEqual(len(s), 1)
        self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
        # There must be one empty KeyRange which represents the full keyspace.
        self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
        self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -\n'
            'Partitions(replica): -\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # we need to create the schema, and the worker will do data copying
        for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)
        utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

        # Run vtworker as daemon for the following SplitClone commands.
        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            [
                '--cell', 'test_nj', '--command_display_interval', '10ms',
                '--use_v3_resharding_mode=false'
            ],
            auto_log=True)

        # Initial clone (online).
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 3, 0, 0, 0)

        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Modify the destination shard. SplitClone will revert the changes.
        # Delete row 1 (provokes an insert).
        shard_0_master.mquery('vt_test_keyspace',
                              'delete from resharding1 where id=1',
                              write=True)
        # Delete row 2 (provokes an insert).
        shard_1_master.mquery('vt_test_keyspace',
                              'delete from resharding1 where id=2',
                              write=True)
        # Update row 3 (provokes an update).
        shard_1_master.mquery(
            'vt_test_keyspace',
            "update resharding1 set msg='msg-not-3' where id=3",
            write=True)
        # Insert row 4 (provokes a delete).
        self._insert_value(shard_1_master, 'resharding1', 4, 'msg4',
                           0xD000000000000000)

        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count',
            '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets',
            '1', 'test_keyspace/0'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 2, 1, 1, 0)
        self.verify_reconciliation_counters(worker_port, 'Offline',
                                            'resharding1', 0, 0, 0, 3)
        # Terminate worker daemon because it is no longer needed.
        utils.kill_sub_process(worker_proc, soft=True)

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                        auto_log=True)

        # check the binlog players are running
        logging.debug('Waiting for binlog players to start on new masters...')
        self.check_destination_master(shard_0_master, ['test_keyspace/0'])
        self.check_destination_master(shard_1_master, ['test_keyspace/0'])

        # check that binlog server exported the stats vars
        self.check_binlog_server_vars(shard_replica, horizontal=True)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000)
        logging.debug('Checking 80 percent of data is sent quickly')
        v = self._check_lots_timeout(1000, 80, 5)
        if v != 100:
            logging.debug('Checking all data goes through eventually')
            self._check_lots_timeout(1000, 100, 20)
        logging.debug('Checking no data was sent the wrong way')
        self._check_lots_not_present(1000)
        self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                      seconds_behind_master_max=30)
        self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                      seconds_behind_master_max=30)
        self.check_binlog_server_vars(shard_replica,
                                      horizontal=True,
                                      min_statements=1000,
                                      min_transactions=1000)

        # use vtworker to compare the data
        logging.debug('Running vtworker SplitDiff for -80')
        for t in [shard_0_rdonly1, shard_1_rdonly1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
        utils.run_vtworker([
            '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/-80'
        ],
                           auto_log=True)

        logging.debug('Running vtworker SplitDiff for 80-')
        utils.run_vtworker([
            '-cell', 'test_nj', '--use_v3_resharding_mode=false', 'SplitDiff',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/80-'
        ],
                           auto_log=True)

        utils.pause('Good time to test vtworker for diffs')

        # get status for the destination master tablet, make sure we have it all
        self.check_running_binlog_player(shard_0_master, 2000, 2000)
        self.check_running_binlog_player(shard_1_master, 6000, 2000)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        expect_fail=True)

        # now serve rdonly from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # make sure rdonly tablets are back to serving before hitting vtgate.
        for t in [shard_0_rdonly1, shard_1_rdonly1]:
            t.wait_for_vttablet_state('SERVING')
        if use_l2vtgate:
            l2vtgate1.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
            l2vtgate2.wait_for_endpoints('test_keyspace.80-.rdonly', 1)
        else:
            utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
            utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

        # check the Map Reduce API works correctly, should use ExecuteKeyRanges
        # on both destination shards now.
        # we ask for 2 splits to only have one per shard
        sql = 'select id, msg from resharding1'
        s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
        self.assertEqual(len(s), 2)
        self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
        self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
        self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
        self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

        # then serve replica from the split shards
        source_tablet = shard_replica
        destination_tablets = [shard_0_replica, shard_1_replica]

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
            auto_log=True)
        # After a backwards migration, queryservice should be enabled on
        # source and disabled on destinations
        utils.check_tablet_query_service(self, source_tablet, True, False)
        utils.check_tablet_query_services(self, destination_tablets, False,
                                          True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        # After a forwards migration, queryservice should be disabled on
        # source and enabled on destinations
        utils.check_tablet_query_service(self, source_tablet, False, True)
        utils.check_tablet_query_services(self, destination_tablets, True,
                                          False)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # then serve master from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # check the binlog players are gone now
        self.check_no_binlog_player(shard_0_master)
        self.check_no_binlog_player(shard_1_master)

        # make sure we can't delete a shard with tablets
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

        # remove the original tablets in the original shard
        tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
        for t in [shard_replica, shard_rdonly1]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
        utils.run_vtctl(
            ['DeleteTablet', '-allow_master', shard_master.tablet_alias],
            auto_log=True)

        # rebuild the serving graph, all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

        # kill everything else
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
            shard_1_replica, shard_1_rdonly1
        ])
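
The _insert_value call used above to provoke a delete during SplitClone is another helper from the full test module. A sketch, assuming the uint64 custom_ksid_col column created earlier in this test and direct writes through mquery:

    def _insert_value(self, tablet_obj, table, row_id, msg, keyspace_id):
        # Sketch: write one row straight into the tablet's MySQL, bypassing
        # vtgate, with the sharding key stored in custom_ksid_col.
        tablet_obj.mquery(
            'vt_test_keyspace',
            "insert into %s (id, msg, custom_ksid_col) values (%d, '%s', %d)" %
            (table, row_id, msg, keyspace_id),
            write=True)
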
Example No. 59
0
    def test_health_check(self):
        # one master, one replica that starts out uninitialized
        # (for the replica, we let vttablet do the InitTablet)
        tablet_62344.init_tablet('master', 'test_keyspace', '0')

        for t in tablet_62344, tablet_62044:
            t.create_db('vt_test_keyspace')

        tablet_62344.start_vttablet(wait_for_state=None)
        tablet_62044.start_vttablet(wait_for_state=None,
                                    lameduck_period='5s',
                                    init_tablet_type='replica',
                                    init_keyspace='test_keyspace',
                                    init_shard='0')

        tablet_62344.wait_for_vttablet_state('SERVING')
        tablet_62044.wait_for_vttablet_state('NOT_SERVING')
        self.check_healthz(tablet_62044, False)

        utils.run_vtctl(
            ['InitShardMaster', 'test_keyspace/0', tablet_62344.tablet_alias])

        # make sure the unhealthy slave goes to healthy
        tablet_62044.wait_for_vttablet_state('SERVING')
        utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
        self.check_healthz(tablet_62044, True)

        # make sure the master is still master
        ti = utils.run_vtctl_json(['GetTablet', tablet_62344.tablet_alias])
        self.assertEqual(ti['type'], topodata_pb2.MASTER,
                         'unexpected master type: %s' % ti['type'])

        # stop replication at the mysql level.
        tablet_62044.mquery('', 'stop slave')
        # vttablet replication_reporter should restart it.
        utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
        # insert something on the master and wait for it on the slave.
        tablet_62344.mquery('vt_test_keyspace', [
            'create table repl_test_table (id int)',
            'insert into repl_test_table values (123)'
        ],
                            write=True)
        timeout = 10.0
        while True:
            try:
                result = tablet_62044.mquery('vt_test_keyspace',
                                             'select * from repl_test_table')
                if result:
                    self.assertEqual(result[0][0], 123L)
                    break
            except MySQLdb.ProgrammingError:
                # Maybe the create table hasn't gone through yet; wait a bit more
                logging.exception(
                    'got this exception waiting for data, ignoring it')
            timeout = utils.wait_step(
                'slave replication repaired by replication_reporter', timeout)

        # stop replication, make sure we don't go unhealthy.
        # (we have a baseline as well, so the time should be good).
        utils.run_vtctl(['StopSlave', tablet_62044.tablet_alias])
        utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])
        self.check_healthz(tablet_62044, True)

        # make sure status web page is healthy
        self.assertIn('>healthy</span></div>', tablet_62044.get_status())

        # make sure the health stream is updated
        health = utils.run_vtctl_json(
            ['VtTabletStreamHealth', '-count', '1', tablet_62044.tablet_alias])
        self.assertTrue(
            ('seconds_behind_master' not in health['realtime_stats'])
            or (health['realtime_stats']['seconds_behind_master'] < 30),
            'got unexpected health: %s' % str(health))
        self.assertIn('serving', health)

        # then restart replication, make sure we stay healthy
        utils.run_vtctl(['StartSlave', tablet_62044.tablet_alias])
        utils.run_vtctl(['RunHealthCheck', tablet_62044.tablet_alias])

        # make sure status web page is healthy
        self.assertIn('>healthy</span></div>', tablet_62044.get_status())

        # now test VtTabletStreamHealth returns the right thing
        stdout, _ = utils.run_vtctl(
            ['VtTabletStreamHealth', '-count', '2', tablet_62044.tablet_alias],
            trap_output=True,
            auto_log=True)
        lines = stdout.splitlines()
        self.assertEqual(len(lines), 2)
        for line in lines:
            logging.debug('Got health: %s', line)
            data = json.loads(line)
            self.assertIn('realtime_stats', data)
            self.assertIn('serving', data)
            self.assertTrue(data['serving'])
            self.assertNotIn('health_error', data['realtime_stats'])
            self.assertNotIn('tablet_externally_reparented_timestamp', data)
            self.assertEqual('test_keyspace', data['target']['keyspace'])
            self.assertEqual('0', data['target']['shard'])
            self.assertEqual(topodata_pb2.REPLICA,
                             data['target']['tablet_type'])

        # Test that VtTabletStreamHealth reports a QPS >0.0.
        # Therefore, issue several reads first.
        # NOTE: This may be flaky because we'll observe a QPS >0.0 only for
        #       the duration of one sampling interval (5s), after which we'll
        #       see 0.0 QPS rates again. If this actually turns out to be
        #       flaky, we need to read continuously in a separate thread.
        for _ in range(10):
            tablet_62044.execute('select 1 from dual')
        # This may take up to 5 seconds to become true because we sample the query
        # counts for the rates only every 5 seconds (see query_service_stats.go).
        timeout = 10
        while True:
            health = utils.run_vtctl_json([
                'VtTabletStreamHealth', '-count', '1',
                tablet_62044.tablet_alias
            ])
            if health['realtime_stats'].get('qps', 0.0) > 0.0:
                break
            timeout = utils.wait_step('QPS >0.0 seen', timeout)

        # kill the tablets
        tablet.kill_tablets([tablet_62344, tablet_62044])
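
The retry loops in this test are driven by utils.wait_step, which belongs to the shared test utilities rather than this file. A sketch of the pattern it implements (sleep, shrink the remaining budget, fail once the budget is gone), with the failure exception being an assumption:

import logging
import time


def wait_step(msg, timeout, sleep_time=1.0):
    # Sketch only: the real helper lives in the shared utils module.
    timeout -= sleep_time
    if timeout <= 0:
        raise Exception("timeout waiting for condition '%s'" % msg)
    logging.debug("waiting %.1fs for condition '%s'", sleep_time, msg)
    time.sleep(sleep_time)
    return timeout
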
Example No. 60
0
    def test_reparent_down_master(self):
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        tablet_62344.create_db('vt_test_keyspace')
        tablet_62044.create_db('vt_test_keyspace')
        tablet_41983.create_db('vt_test_keyspace')
        tablet_31981.create_db('vt_test_keyspace')

        # Start up the mysql and vttablet that will become the master
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Force the slaves to reparent assuming that all the datasets are
        # identical.
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ],
                        auto_log=True)
        utils.validate_topology()
        tablet_62344.mquery('vt_test_keyspace', self._create_vt_insert_test)

        # Make the current master agent and database unavailable.
        tablet_62344.kill_vttablet()
        tablet_62344.shutdown_mysql().wait()

        # Perform a planned reparent operation; it will try to contact
        # the current master and fail fairly quickly.
        _, stderr = utils.run_vtctl([
            '-wait-time', '5s', 'PlannedReparentShard', '-keyspace_shard',
            'test_keyspace/0', '-new_master', tablet_62044.tablet_alias
        ],
                                    expect_fail=True)
        self.assertIn('DemoteMaster failed', stderr)

        # Run a forced reparent operation; this should now proceed unimpeded.
        utils.run_vtctl([
            'EmergencyReparentShard', '-keyspace_shard', 'test_keyspace/0',
            '-new_master', tablet_62044.tablet_alias
        ],
                        auto_log=True)

        utils.validate_topology()
        self._check_master_tablet(tablet_62044)

        # insert data into the new master, check the connected slaves work
        self._populate_vt_insert_test(tablet_62044, 2)
        self._check_vt_insert_test(tablet_41983, 2)
        self._check_vt_insert_test(tablet_31981, 2)

        # bring back the old master as a slave, check that it catches up
        tablet_62344.start_mysql().wait()
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        self._check_vt_insert_test(tablet_62344, 2)

        tablet.kill_tablets(
            [tablet_62344, tablet_62044, tablet_41983, tablet_31981])
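
test_reparent_down_master leans on _populate_vt_insert_test and _check_vt_insert_test, which are not shown here. A minimal sketch of the pair, assuming a simple vt_insert_test(id, msg) table created by self._create_vt_insert_test and the polling style used in test_health_check above:

    def _populate_vt_insert_test(self, master_tablet, index):
        # Sketch: insert one marker row through the master's MySQL.
        master_tablet.mquery(
            'vt_test_keyspace',
            "insert into vt_insert_test (id, msg) values (%d, 'test %d')" %
            (index, index),
            write=True)

    def _check_vt_insert_test(self, tablet_obj, index):
        # Sketch: poll the tablet until the marker row has replicated.
        timeout = 10.0
        while True:
            result = tablet_obj.mquery(
                'vt_test_keyspace',
                'select msg from vt_insert_test where id=%d' % index)
            if result:
                self.assertEqual(result[0][0], 'test %d' % index)
                return
            timeout = utils.wait_step('row %d replicated' % index, timeout)
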