Example #1
def run_test_zkocc_qps():
  _populate_zk()

  # preload the test_nj cell
  zkocc_14850 = utils.zkocc_start()

  qpser = utils.run_bg(utils.vtroot+'/bin/zkclient2 -server localhost:%u -mode qps /zk/test_nj/zkocc1/data1 /zk/test_nj/zkocc1/data2' % utils.zkocc_port_base)
  time.sleep(10)
  utils.kill_sub_process(qpser)

  # get the zkocc vars, make sure we have what we need
  v = utils.get_vars(utils.zkocc_port_base)
  if v['ZkReader']['test_nj']['State']['Current'] != 'Connected':
    raise utils.TestError('invalid zk global state: ', v['ZkReader']['test_nj']['State']['Current'])
  if v['ZkReader']['test_nj']['State']['DurationConnected'] < 9e9:
    raise utils.TestError('not enough time in Connected state', v['ZkReader']['test_nj']['State']['DurationConnected'])

  # some checks on performance / stats
  # a typical workstation will do 15k QPS, check we have more than 3k
  rpcCalls = v['ZkReader']['RpcCalls']
  if rpcCalls < 30000:
    raise utils.TestError('QPS is too low: %u < 3000', rpcCalls / 10)
  cacheReads = v['ZkReader']['test_nj']['CacheReads']
  if cacheReads < 30000:
    raise utils.TestError('Cache QPS is too low: %u < 3000', cacheReads / 10)
  totalCacheReads = v['ZkReader']['total']['CacheReads']
  if cacheReads != totalCacheReads:
    raise utils.TestError('Rollup stats are wrong: %u != %u', cacheReads,
                          totalCacheReads)
  if v['ZkReader']['UnknownCellErrors'] != 0:
    raise utils.TestError('unexpected UnknownCellErrors', v['ZkReader']['UnknownCellErrors'])

  utils.zkocc_kill(zkocc_14850)
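
Every example on this page funnels through utils.kill_sub_process, whose definition is not shown here. As a minimal sketch (an assumption, not the actual Vitess helper): it wraps a subprocess.Popen handle, where soft=True asks for a clean exit via SIGTERM (so a child like mysqlctld can in turn kill its mysqld) and the default is a hard SIGKILL:

def kill_sub_process(proc, soft=False):
  # Hypothetical stand-in for the real helper, for illustration only.
  if proc is None:
    return
  if soft:
    proc.terminate()  # SIGTERM on POSIX: ask the process to exit cleanly
  else:
    proc.kill()       # SIGKILL on POSIX: stop it immediately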
Example #2
  def test_zkocc_qps(self):
    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start()

    qpser = utils.run_bg(environment.binary_path('zkclient2')+' -server localhost:%u -mode qps /zk/test_nj/vt/zkocc1/data1 /zk/test_nj/vt/zkocc1/data2' % environment.zkocc_port_base)
    time.sleep(10)
    utils.kill_sub_process(qpser)

    # get the zkocc vars, make sure we have what we need
    v = utils.get_vars(environment.zkocc_port_base)
    if v['ZkReader']['test_nj']['State'] != 'Connected':
      self.fail('invalid zk global state: ' + v['ZkReader']['test_nj']['State'])

    # some checks on performance / stats
    # a typical workstation will do 45-47k QPS, check we have more than 10k
    rpcCalls = v['ZkReader']['RpcCalls']
    if rpcCalls < 100000:
      self.fail('QPS is too low: %u < 10000' % (rpcCalls / 10))
    else:
      logging.debug("Recorded qps: %u", rpcCalls / 10)
    cacheReads = v['ZkReader']['test_nj']['CacheReads']
    if cacheReads < 100000:
      self.fail('Cache QPS is too low: %u < 10000' % (cacheReads / 10))
    totalCacheReads = v['ZkReader']['total']['CacheReads']
    self.assertEqual(cacheReads, totalCacheReads, 'Rollup stats are wrong')
    self.assertEqual(v['ZkReader']['UnknownCellErrors'], 0, 'unexpected UnknownCellErrors')
    utils.zkocc_kill(zkocc_14850)
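
The QPS assertions read counters exported by the process under test. utils.get_vars is presumably a thin wrapper that fetches the JSON variable dump from the given status port; a sketch, assuming an expvar-style /debug/vars endpoint and Python 2 (consistent with the itervalues() calls elsewhere on this page):

import json
import urllib2

def get_vars(port):
  # Hypothetical sketch: fetch and decode the JSON vars of a local server.
  f = urllib2.urlopen('http://localhost:%d/debug/vars' % port)
  try:
    return json.load(f)
  finally:
    f.close()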
Example #3
    def test_regular_operation(self):
        # Use a dedicated worker to run all vtworker commands.
        worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)
        vtworker_endpoint = 'localhost:' + str(worker_rpc_port)

        automation_server_proc, automation_server_port = (
            utils.run_automation_server())

        source_shard_list = '0'
        dest_shard_list = '-80,80-'
        _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
        utils.run(
            environment.binary_argstr('automation_client') +
            ' --server localhost:' + str(automation_server_port) +
            ' --task HorizontalReshardingTask' + ' --param keyspace=' +
            self.KEYSPACE + ' --param source_shard_list=' + source_shard_list +
            ' --param dest_shard_list=' + dest_shard_list +
            ' --param vtctld_endpoint=' + vtctld_endpoint +
            ' --param vtworker_endpoint=' + vtworker_endpoint +
            ' --param min_healthy_rdonly_tablets=1')

        self.verify()

        utils.kill_sub_process(automation_server_proc, soft=True)
        utils.kill_sub_process(worker_proc, soft=True)
Example #4
  def test_regular_operation(self):
    # Use a dedicated worker to run all vtworker commands.
    worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'],
        auto_log=True)
    vtworker_endpoint = 'localhost:' + str(worker_rpc_port)

    automation_server_proc, automation_server_port = (
        utils.run_automation_server())

    source_shard_list = '0'
    dest_shard_list = '-80,80-'
    _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
    utils.run(
        environment.binary_argstr('automation_client') +
        ' --server localhost:' + str(automation_server_port) +
        ' --task HorizontalReshardingTask' +
        ' --param keyspace=' + self.KEYSPACE +
        ' --param source_shard_list=' + source_shard_list +
        ' --param dest_shard_list=' + dest_shard_list +
        ' --param vtctld_endpoint=' + vtctld_endpoint +
        ' --param vtworker_endpoint=' + vtworker_endpoint +
        ' --param min_healthy_rdonly_endpoints=1')

    self.verify()

    utils.kill_sub_process(automation_server_proc, soft=True)
    utils.kill_sub_process(worker_proc, soft=True)
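
The long string concatenation above is easy to get wrong; the VerticalSplitTask examples further down build the same style of automation_client command from a params dict instead, which this HorizontalReshardingTask call could use as well (a sketch mirroring the test body above):

params = {'keyspace': self.KEYSPACE,
          'source_shard_list': source_shard_list,
          'dest_shard_list': dest_shard_list,
          'vtctld_endpoint': vtctld_endpoint,
          'vtworker_endpoint': vtworker_endpoint,
          'min_healthy_rdonly_endpoints': '1'}
args = ['--server', 'localhost:' + str(automation_server_port),
        '--task', 'HorizontalReshardingTask']
args.extend(['--param=' + k + '=' + v for k, v in params.items()])
utils.run(environment.binary_args('automation_client') + args)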
Example #5
def tearDownModule():
  if utils.options.skip_teardown:
    return

  if use_mysqlctld:
    # Try to terminate mysqlctld gracefully, so it kills its mysqld.
    for proc in setup_procs:
      utils.kill_sub_process(proc, soft=True)
    teardown_procs = setup_procs
  else:
    teardown_procs = [
        tablet_62344.teardown_mysql(),
        tablet_31981.teardown_mysql(),
        ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  tablet_62344.remove_tree()
  tablet_31981.remove_tree()

  path = os.path.join(environment.vtdataroot, 'snapshot')
  try:
    shutil.rmtree(path)
  except OSError as e:
    logging.debug("removing snapshot %s: %s", path, str(e))
Example #6
  def teardown(self):
    import utils

    for cluster in self.clusters.itervalues():
      utils.kill_sub_process(cluster.proc)
      if not utils.options.keep_logs:
        shutil.rmtree(cluster.data_dir)
Example #7
def tearDownModule():
  utils.required_teardown()
  if utils.options.skip_teardown:
    return

  if use_mysqlctld:
    # Try to terminate mysqlctld gracefully, so it kills its mysqld.
    for proc in setup_procs:
      utils.kill_sub_process(proc, soft=True)
    teardown_procs = setup_procs
  else:
    teardown_procs = [
        tablet_master.teardown_mysql(),
        tablet_replica1.teardown_mysql(),
        tablet_replica2.teardown_mysql(),
    ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  tablet_master.remove_tree()
  tablet_replica1.remove_tree()
  tablet_replica2.remove_tree()
Example #8
  def test_vtgate_qps(self):
    # create the topology
    utils.run_vtctl('CreateKeyspace test_keyspace')
    t = tablet.Tablet(tablet_uid=1, cell="nj")
    t.init_tablet("master", "test_keyspace", "0")
    t.update_addrs()
    utils.run_vtctl('RebuildKeyspaceGraph test_keyspace', auto_log=True)

    # start vtgate and the qps-er
    vtgate_proc, vtgate_port = utils.vtgate_start()
    qpser = utils.run_bg(environment.binary_path('zkclient2')+' -server localhost:%u -mode qps2 test_nj test_keyspace' % vtgate_port)
    time.sleep(10)
    utils.kill_sub_process(qpser)

    # get the vtgate vars, make sure we have what we need
    v = utils.get_vars(vtgate_port)

    # some checks on performance / stats
    # a typical workstation will do 38-40k QPS, check we have more than 10k
    rpcCalls = v['TopoReaderRpcQueryCount']['test_nj']
    if rpcCalls < 100000:
      self.fail('QPS is too low: %u < 10000' % (rpcCalls / 10))
    else:
      logging.debug("Recorded qps: %u", rpcCalls / 10)
    utils.vtgate_kill(vtgate_proc)
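
The load generator is started with utils.run_bg and later handed to utils.kill_sub_process, which implies run_bg returns the subprocess.Popen handle. A sketch under that assumption, splitting the shell-style command strings used above:

import shlex
import subprocess

def run_bg(cmd, **kwargs):
  # Hypothetical sketch: start a command in the background and return the
  # Popen handle so the caller can kill it or wait on it later.
  args = shlex.split(cmd) if isinstance(cmd, basestring) else cmd
  return subprocess.Popen(args, **kwargs)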
Example #9
    def test_regular_operation(self):
        # Use a dedicated worker to run all vtworker commands.
        worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(["--cell", "test_nj"], auto_log=True)
        vtworker_endpoint = "localhost:" + str(worker_rpc_port)

        automation_server_proc, automation_server_port = utils.run_automation_server()

        source_shard_list = "0"
        dest_shard_list = "-80,80-"
        _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
        utils.run(
            environment.binary_argstr("automation_client")
            + " --server localhost:"
            + str(automation_server_port)
            + " --task HorizontalReshardingTask"
            + " --param keyspace="
            + self.KEYSPACE
            + " --param source_shard_list="
            + source_shard_list
            + " --param dest_shard_list="
            + dest_shard_list
            + " --param vtctld_endpoint="
            + vtctld_endpoint
            + " --param vtworker_endpoint="
            + vtworker_endpoint
            + " --param min_healthy_rdonly_tablets=1"
        )

        self.verify()

        utils.kill_sub_process(automation_server_proc, soft=True)
        utils.kill_sub_process(worker_proc, soft=True)
Example #10
def tearDownModule():
    if utils.options.skip_teardown:
        return

    if use_mysqlctld:
        # Try to terminate mysqlctld gracefully, so it kills its mysqld.
        for proc in setup_procs:
            utils.kill_sub_process(proc, soft=True)
        teardown_procs = setup_procs
    else:
        teardown_procs = [
            tablet_62344.teardown_mysql(),
            tablet_31981.teardown_mysql(),
        ]
    utils.wait_procs(teardown_procs, raise_on_error=False)

    environment.topo_server().teardown()
    utils.kill_sub_processes()
    utils.remove_tmp_files()

    tablet_62344.remove_tree()
    tablet_31981.remove_tree()

    path = os.path.join(environment.vtdataroot, 'snapshot')
    try:
        shutil.rmtree(path)
    except OSError as e:
        logging.debug("removing snapshot %s: %s", path, str(e))
Example #11
  def test_vtgate_qps(self):
    # create the topology
    utils.run_vtctl('CreateKeyspace test_keyspace')
    t = tablet.Tablet(tablet_uid=1, cell="nj")
    t.init_tablet("master", "test_keyspace", "0")
    t.update_addrs()
    utils.run_vtctl('RebuildShardGraph test_keyspace/0', auto_log=True)
    utils.run_vtctl('RebuildKeyspaceGraph test_keyspace', auto_log=True)

    # start vtgate and the qps-er
    vtgate_proc, vtgate_port = utils.vtgate_start()
    qpser = utils.run_bg(utils.vtroot+'/bin/zkclient2 -server localhost:%u -mode qps2 test_nj test_keyspace' % vtgate_port)
    time.sleep(10)
    utils.kill_sub_process(qpser)

    # get the vtgate vars, make sure we have what we need
    v = utils.get_vars(vtgate_port)

    # some checks on performance / stats
    # a typical workstation will do 38-40k QPS, check we have more than 15k
    rpcCalls = v['TopoReaderRpcQueryCount']['test_nj']
    if rpcCalls < 150000:
      self.fail('QPS is too low: %u < 15000' % (rpcCalls / 10))
    else:
      logging.debug("Recorded qps: %u", rpcCalls / 10)
    utils.vtgate_kill(vtgate_proc)
Example #12
    def test_zkocc_qps(self):
        # preload the test_nj cell
        zkocc_14850 = utils.zkocc_start()

        qpser = utils.run_bg(
            utils.vtroot +
            '/bin/zkclient2 -server localhost:%u -mode qps /zk/test_nj/zkocc1/data1 /zk/test_nj/zkocc1/data2'
            % utils.zkocc_port_base)
        time.sleep(10)
        utils.kill_sub_process(qpser)

        # get the zkocc vars, make sure we have what we need
        v = utils.get_vars(utils.zkocc_port_base)
        if v['ZkReader']['test_nj']['State']['Current'] != 'Connected':
            raise utils.TestError('invalid zk global state: ',
                                  v['ZkReader']['test_nj']['State']['Current'])
        if v['ZkReader']['test_nj']['State']['DurationConnected'] < 9e9:
            self.fail('not enough time in Connected state: %s' %
                      v['ZkReader']['test_nj']['State']['DurationConnected'])

        # some checks on performance / stats
        # a typical workstation will do 15k QPS, check we have more than 3k
        rpcCalls = v['ZkReader']['RpcCalls']
        if rpcCalls < 30000:
            self.fail('QPS is too low: %u < 3000' % (rpcCalls / 10))
        cacheReads = v['ZkReader']['test_nj']['CacheReads']
        if cacheReads < 30000:
            self.fail('Cache QPS is too low: %u < 3000' % (cacheReads / 10))
        totalCacheReads = v['ZkReader']['total']['CacheReads']
        self.assertEqual(cacheReads, totalCacheReads, 'Rollup stats are wrong')
        self.assertEqual(v['ZkReader']['UnknownCellErrors'], 0,
                         'unexpected UnknownCellErrors')
        utils.zkocc_kill(zkocc_14850)
Example #13
  def teardown(self):
    import utils  # pylint: disable=g-import-not-at-top

    for cluster in self.clusters.itervalues():
      utils.kill_sub_process(cluster.proc)
      if not utils.options.keep_logs:
        shutil.rmtree(cluster.data_dir)
Example #14
    def teardown(self):
        import utils  # pylint: disable=g-import-not-at-top

        for cluster in self.clusters.itervalues():
            utils.kill_sub_process(cluster.proc)
            if not utils.options.keep_logs:
                shutil.rmtree(cluster.data_dir)
Example #15
  def test_zkocc_qps(self):
    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start()

    qpser = utils.run_bg(utils.vtroot+'/bin/zkclient2 -server localhost:%u -mode qps /zk/test_nj/vt/zkocc1/data1 /zk/test_nj/vt/zkocc1/data2' % utils.zkocc_port_base)
    time.sleep(10)
    utils.kill_sub_process(qpser)

    # get the zkocc vars, make sure we have what we need
    v = utils.get_vars(utils.zkocc_port_base)
    if v['ZkReader']['test_nj']['State'] != 'Connected':
      raise utils.TestError('invalid zk global state: ', v['ZkReader']['test_nj']['State'])

    # some checks on performance / stats
    # a typical workstation will do 45-47k QPS, check we have more than 15k
    rpcCalls = v['ZkReader']['RpcCalls']
    if rpcCalls < 150000:
      self.fail('QPS is too low: %u < 15000' % (rpcCalls / 10))
    else:
      logging.debug("Recorded qps: %u", rpcCalls / 10)
    cacheReads = v['ZkReader']['test_nj']['CacheReads']
    if cacheReads < 150000:
      self.fail('Cache QPS is too low: %u < 15000' % (cacheReads / 10))
    totalCacheReads = v['ZkReader']['total']['CacheReads']
    self.assertEqual(cacheReads, totalCacheReads, 'Rollup stats are wrong')
    self.assertEqual(v['ZkReader']['UnknownCellErrors'], 0, 'unexpected UnknownCellErrors')
    utils.zkocc_kill(zkocc_14850)
Example #16
    def test_regular_operation(self):
        # Use a dedicated worker to run all vtworker commands.
        worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)
        vtworker_endpoint = 'localhost:' + str(worker_rpc_port)

        automation_server_proc, automation_server_port = (
            utils.run_automation_server())

        keyspace = 'test_keyspace'
        source_shard_list = '0'
        dest_shard_list = '-80,80-'
        _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
        utils.run(
            environment.binary_argstr('automation_client') +
            ' --server localhost:' + str(automation_server_port) +
            ' --task HorizontalReshardingTask' + ' --param keyspace=' +
            keyspace + ' --param source_shard_list=' + source_shard_list +
            ' --param dest_shard_list=' + dest_shard_list +
            ' --param vtctld_endpoint=' + vtctld_endpoint +
            ' --param vtworker_endpoint=' + vtworker_endpoint)

        self.assert_shard_data_equal(0, worker.shard_master,
                                     worker.shard_0_tablets.replica)
        self.assert_shard_data_equal(1, worker.shard_master,
                                     worker.shard_1_tablets.replica)

        utils.kill_sub_process(automation_server_proc, soft=True)
        utils.kill_sub_process(worker_proc, soft=True)
Example #17
  def test_regular_operation(self):
    # Use a dedicated worker to run all vtworker commands.
    worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'],
        auto_log=True)
    vtworker_endpoint = "localhost:" + str(worker_rpc_port)

    automation_server_proc, automation_server_port = utils.run_automation_server()

    keyspace = 'test_keyspace'
    source_shard_list = '0'
    dest_shard_list = '-80,80-'
    _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
    utils.run(environment.binary_argstr('automation_client') +
              ' --server localhost:' + str(automation_server_port) +
              ' --task HorizontalReshardingTask' +
              ' --param keyspace=' + keyspace +
              ' --param source_shard_list=' + source_shard_list +
              ' --param dest_shard_list=' + dest_shard_list +
              ' --param source_shard_rdonly_list=' +
              worker.shard_rdonly1.tablet_alias +
              ' --param dest_shard_rdonly_list=' +
              worker.shard_0_rdonly1.tablet_alias + ',' +
              worker.shard_1_rdonly1.tablet_alias +
              ' --param vtctld_endpoint=' + vtctld_endpoint +
              ' --param vtworker_endpoint=' + vtworker_endpoint)

    self.assert_shard_data_equal(0, worker.shard_master,
                                 worker.shard_0_tablets.replica)
    self.assert_shard_data_equal(1, worker.shard_master,
                                 worker.shard_1_tablets.replica)

    utils.kill_sub_process(automation_server_proc, soft=True)
    utils.kill_sub_process(worker_proc, soft=True)
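
This variant passes extra rdonly tablet aliases as task parameters; otherwise the command construction is the same. Note the two helper forms used across these examples: environment.binary_argstr yields a string for concatenated commands like the one above, while environment.binary_args (used in the VerticalSplitTask examples below) yields a list. A sketch of the presumed relationship, assuming binaries live under $VTROOT/bin as the utils.vtroot+'/bin/zkclient2' calls suggest:

import os

def binary_args(name):
  # Hypothetical sketch: list form of the command for a bundled binary.
  return [os.path.join(os.environ.get('VTROOT', '/vt'), 'bin', name)]

def binary_argstr(name):
  # String form of the same, for callers building one big command string.
  return ' '.join(binary_args(name))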
Example #18
def tearDownModule():
    utils.required_teardown()
    if utils.options.skip_teardown:
        return

    if use_mysqlctld:
        # Try to terminate mysqlctld gracefully, so it kills its mysqld.
        for proc in setup_procs:
            utils.kill_sub_process(proc, soft=True)
        teardown_procs = setup_procs
    else:
        teardown_procs = [
            tablet_master.teardown_mysql(),
            tablet_replica1.teardown_mysql(),
            tablet_replica2.teardown_mysql(),
        ]
    utils.wait_procs(teardown_procs, raise_on_error=False)

    environment.topo_server().teardown()
    utils.kill_sub_processes()
    utils.remove_tmp_files()

    tablet_master.remove_tree()
    tablet_replica1.remove_tree()
    tablet_replica2.remove_tree()
Example #19
    def teardown(self):
        import utils

        for cluster in self.clusters.itervalues():
            utils.kill_sub_process(cluster.proc)
            if not utils.options.keep_logs:
                shutil.rmtree(cluster.data_dir)
Example #20
  def test_primecache(self):
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    master.init_tablet('master', 'test_keyspace', '0')
    replica.init_tablet('idle')

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    master.create_db('vt_test_keyspace')
    master.start_vttablet(wait_for_state=None)
    replica.start_vttablet(wait_for_state=None)

    master.wait_for_vttablet_state('SERVING')
    replica.wait_for_vttablet_state('NOT_SERVING') # DB doesn't exist

    self._create_data()

    # we use clone to not prime the mysql cache on the slave db
    utils.run_vtctl(['Clone', '-force', '-server-mode',
                     master.tablet_alias, replica.tablet_alias],
                    auto_log=True)

    # sync the buffer cache, and clear it. This will prompt for the user's password
    utils.run(['sync'])
    utils.run(['sudo', 'bash', '-c', 'echo 1 > /proc/sys/vm/drop_caches'])

    # we can now change data on the master for 30s, while slave is stopped.
    # master's binlog will be in OS buffer cache now.
    replica.mquery('', 'slave stop')
    self._change_random_data()

    use_primecache = True # easy to test without
    if use_primecache:
      # starting vtprimecache, sleeping for a couple seconds
      args = environment.binary_args('vtprimecache') + [
              '-db-config-dba-uname', 'vt_dba',
              '-db-config-dba-charset', 'utf8',
              '-db-config-dba-dbname', 'vt_test_keyspace',
              '-db-config-app-uname', 'vt_app',
              '-db-config-app-charset', 'utf8',
              '-db-config-app-dbname', 'vt_test_keyspace',
              '-relay_logs_path', replica.tablet_dir+'/relay-logs',
              '-mysql_socket_file', replica.tablet_dir+'/mysql.sock',
              '-log_dir', environment.vtlogroot,
              '-worker_count', '4',
              '-alsologtostderr',
             ]
      vtprimecache = utils.run_bg(args)
      time.sleep(2)

    # start slave, see how long it takes to catch up on replication
    replica.mquery('', 'slave start')
    self.catch_up()

    if use_primecache:
      # TODO(alainjobart): read and check stats
      utils.kill_sub_process(vtprimecache)

    tablet.kill_tablets([master, replica])
Example #21
 def teardown_mysql(self, extra_args=None):
     if self.use_mysqlctld and self.mysqlctld_process:
         # if we use mysqlctld, we just terminate it gracefully, so it kills
         # its mysqld. And we return it, so we can wait for it.
         utils.kill_sub_process(self.mysqlctld_process, soft=True)
         return self.mysqlctld_process
     if utils.options.keep_logs:
         return self.shutdown_mysql(extra_args=extra_args)
     return self.mysqlctl(['teardown', '-force'], extra_args=extra_args)
Example #22
 def teardown_mysql(self, extra_args=None):
   if self.use_mysqlctld and self.mysqlctld_process:
     # if we use mysqlctld, we just terminate it gracefully, so it kills
     # its mysqld. And we return it, so we can wait for it.
     utils.kill_sub_process(self.mysqlctld_process, soft=True)
     return self.mysqlctld_process
   if utils.options.keep_logs:
     return self.shutdown_mysql(extra_args=extra_args)
   return self.mysqlctl(['teardown', '-force'], extra_args=extra_args)
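
Note that teardown_mysql returns the mysqlctld Popen handle after the soft kill instead of waiting on it. The module-level teardowns above exploit this to shut all tablets down in parallel and reap them in one batch:

teardown_procs = [
    tablet_master.teardown_mysql(),
    tablet_replica1.teardown_mysql(),
]
utils.wait_procs(teardown_procs, raise_on_error=False)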
Example #23
def tearDownModule():
    utils.required_teardown()
    if utils.options.skip_teardown:
        return

    for proc in setup_procs:
        utils.kill_sub_process(proc, soft=True)
    utils.wait_procs(setup_procs, raise_on_error=False)

    environment.topo_server().teardown()
    utils.kill_sub_processes()
    utils.remove_tmp_files()

    for t in all_tablets:
        t.remove_tree()
Example #24
  def test_restart(self):
    zkocc_server = utils.zkocc_start()

    shard_0_master.create_db('vt_test_keyspace')

    proc1 = shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                          key=cert_dir + "/vt-server-key.pem")
    proc2 = shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                          key=cert_dir + "/vt-server-key.pem")
    time.sleep(2.0)
    proc1.poll()
    if proc1.returncode is None:
      raise utils.TestError("proc1 still running")
    shard_0_master.kill_vttablet()
    utils.kill_sub_process(zkocc_server)
    logging.debug("Done here")
Example #25
    def test_vertical_split(self):
        # Use a dedicated worker to run all vtworker commands.
        worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)
        vtworker_endpoint = 'localhost:' + str(worker_rpc_port)

        automation_server_proc, automation_server_port = (
            utils.run_automation_server())

        _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
        params = {
            'source_keyspace': 'source_keyspace',
            'dest_keyspace': 'destination_keyspace',
            'shard_list': '0',
            'tables': 'moving.*,view1',
            'vtctld_endpoint': vtctld_endpoint,
            'vtworker_endpoint': vtworker_endpoint,
        }
        args = [
            '--server', 'localhost:' + str(automation_server_port), '--task',
            'VerticalSplitTask'
        ]
        args.extend(['--param=' + k + '=' + v for k, v in params.items()])
        utils.run(environment.binary_args('automation_client') + args)

        # One of the two source rdonly tablets went spare after the diff.
        # Force a healthcheck on both to get them back to "rdonly".
        for t in [
                vertical_split.source_rdonly1, vertical_split.source_rdonly2
        ]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

        self._check_srv_keyspace('')
        self._check_blacklisted_tables(vertical_split.source_master,
                                       ['moving.*', 'view1'])
        self._check_blacklisted_tables(vertical_split.source_replica,
                                       ['moving.*', 'view1'])
        self._check_blacklisted_tables(vertical_split.source_rdonly1,
                                       ['moving.*', 'view1'])
        self._check_blacklisted_tables(vertical_split.source_rdonly2,
                                       ['moving.*', 'view1'])

        # check the binlog player is gone now
        vertical_split.destination_master.wait_for_binlog_player_count(0)

        utils.kill_sub_process(automation_server_proc, soft=True)
        utils.kill_sub_process(worker_proc, soft=True)
Example #26
def run_test_restart():
    zkocc_server = utils.zkocc_start()

    shard_0_master.create_db('vt_test_keyspace')

    proc1 = shard_0_master.start_vttablet(cert=cert_dir +
                                          "/vt-server-cert.pem",
                                          key=cert_dir + "/vt-server-key.pem")
    proc2 = shard_0_master.start_vttablet(cert=cert_dir +
                                          "/vt-server-cert.pem",
                                          key=cert_dir + "/vt-server-key.pem")
    time.sleep(2.0)
    proc1.poll()
    if proc1.returncode is None:
        raise utils.TestError("proc1 still running")
    shard_0_master.kill_vttablet()
    utils.kill_sub_process(zkocc_server)
Example #27
  def test_restart(self):
    zkocc_server = utils.zkocc_start()

    shard_0_master.create_db('vt_test_keyspace')

    proc1 = shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                          key=cert_dir + "/vt-server-key.pem")
    # Takes a bit longer for vttablet to serve the pid port
    time.sleep(1.0)
    proc2 = shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                          key=cert_dir + "/vt-server-key.pem")
    time.sleep(1.0)
    proc1.poll()
    if proc1.returncode is None:
      self.fail("proc1 still running")
    shard_0_master.kill_vttablet()
    utils.kill_sub_process(zkocc_server)
    logging.debug("Done here")
Example #28
def run_test_vttablet_authenticated():
  utils.zk_wipe()
  utils.run_vtctl('CreateKeyspace -force test_keyspace')
  tablet_62344.init_tablet('master', 'test_keyspace', '0')
  utils.run_vtctl('RebuildShardGraph test_keyspace/0')
  utils.validate_topology()

  tablet_62344.populate('vt_test_keyspace', create_vt_select_test,
                        populate_vt_select_test)
  agent = tablet_62344.start_vttablet(auth=True)
  utils.run_vtctl('SetReadWrite ' + tablet_62344.tablet_alias)

  err, out = tablet_62344.vquery('select * from vt_select_test', path='test_keyspace/0', user='******', password=r'ma kota')
  utils.debug("Got rows: " + out)
  if 'Row count: ' not in out:
    raise utils.TestError("query didn't go through: %s, %s" % (err, out))

  utils.kill_sub_process(agent)
Example #29
  def test_vertical_split(self):
    # Use a dedicated worker to run all vtworker commands.
    worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'],
        auto_log=True)
    vtworker_endpoint = 'localhost:' + str(worker_rpc_port)

    automation_server_proc, automation_server_port = (
        utils.run_automation_server())

    _, vtctld_endpoint = utils.vtctld.rpc_endpoint()
    params = {'source_keyspace': 'source_keyspace',
              'dest_keyspace': 'destination_keyspace',
              'shard_list': '0',
              'tables': 'moving.*,view1',
              'vtctld_endpoint': vtctld_endpoint,
              'vtworker_endpoint': vtworker_endpoint,
             }
    args = ['--server', 'localhost:' + str(automation_server_port),
            '--task', 'VerticalSplitTask']
    args.extend(['--param=' + k + '=' + v for k, v in params.items()])
    utils.run(environment.binary_args('automation_client') + args)

    # One of the two source rdonly tablets went spare after the diff.
    # Force a healthcheck on both to get them back to "rdonly".
    for t in [vertical_split.source_rdonly1, vertical_split.source_rdonly2]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

    self._check_srv_keyspace('')
    self._check_blacklisted_tables(vertical_split.source_master,
                                   ['moving.*', 'view1'])
    self._check_blacklisted_tables(vertical_split.source_replica,
                                   ['moving.*', 'view1'])
    self._check_blacklisted_tables(vertical_split.source_rdonly1,
                                   ['moving.*', 'view1'])
    self._check_blacklisted_tables(vertical_split.source_rdonly2,
                                   ['moving.*', 'view1'])

    # check the binlog player is gone now
    vertical_split.destination_master.wait_for_binlog_player_count(0)

    utils.kill_sub_process(automation_server_proc, soft=True)
    utils.kill_sub_process(worker_proc, soft=True)
Example #30
  def test_successful_resharding(self):
    """Reshard from 1 to 2 shards by running the workflow."""
    worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'], auto_log=True)
    vtworker_endpoint = 'localhost:%d' % worker_rpc_port

    stdout = utils.run_vtctl(['WorkflowCreate', 'horizontal_resharding',
                              '-keyspace=test_keyspace',
                              '-vtworkers=%s' % vtworker_endpoint],
                             auto_log=True)
    workflow_uuid = re.match(r'^uuid: (.*)$', stdout[0]).group(1)

    utils.pause('Now is a good time to look at vtctld UI at: '
                '%s, workflow uuid=%s' % (utils.vtctld.port, workflow_uuid))

    utils.run_vtctl(['WorkflowWait', workflow_uuid])

    self.verify()
    utils.kill_sub_process(worker_proc, soft=True)
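
Parsing the WorkflowCreate output with re.match assumes the uuid line comes first in stdout (run_vtctl here evidently returns the output with stdout at index 0). A slightly more defensive extraction, searching any line rather than only the first, might look like:

import re

def extract_workflow_uuid(stdout_text):
  # Hypothetical helper: find the 'uuid: ...' line anywhere in the output.
  m = re.search(r'^uuid: (.*)$', stdout_text, re.MULTILINE)
  if not m:
    raise ValueError('no uuid in WorkflowCreate output: %r' % stdout_text)
  return m.group(1)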
Example #31
    def test_restart(self):
        zkocc_server = utils.zkocc_start()

        shard_0_master.create_db('vt_test_keyspace')

        proc1 = shard_0_master.start_vttablet(
            cert=cert_dir + "/vt-server-cert.pem",
            key=cert_dir + "/vt-server-key.pem")
        # Takes a bit longer for vttablet to serve the pid port
        time.sleep(1.0)
        proc2 = shard_0_master.start_vttablet(
            cert=cert_dir + "/vt-server-cert.pem",
            key=cert_dir + "/vt-server-key.pem")
        time.sleep(1.0)
        proc1.poll()
        if proc1.returncode is None:
            self.fail("proc1 still running")
        shard_0_master.kill_vttablet()
        utils.kill_sub_process(zkocc_server)
        logging.debug("Done here")
Example #32
    def test_successful_resharding(self):
        """Reshard from 1 to 2 shards by running the workflow."""
        worker_proc, _, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)
        vtworker_endpoint = 'localhost:%d' % worker_rpc_port

        stdout = utils.run_vtctl([
            'WorkflowCreate', 'horizontal_resharding',
            '-keyspace=test_keyspace',
            '-vtworkers=%s' % vtworker_endpoint, '-enable_approvals=false'
        ],
                                 auto_log=True)
        workflow_uuid = re.match(r'^uuid: (.*)$', stdout[0]).group(1)

        utils.pause('Now is a good time to look at vtctld UI at: '
                    '%s, workflow uuid=%s' %
                    (utils.vtctld.port, workflow_uuid))

        utils.run_vtctl(['WorkflowWait', workflow_uuid])

        self.verify()
        utils.kill_sub_process(worker_proc, soft=True)
Example #33
  def test_resharding(self):
    # create the keyspace with just one shard
    shard_master.init_tablet(
        'master',
        keyspace='test_keyspace',
        shard='0',
        tablet_index=0)
    shard_replica.init_tablet(
        'replica',
        keyspace='test_keyspace',
        shard='0',
        tablet_index=1)
    shard_rdonly1.init_tablet(
        'rdonly',
        keyspace='test_keyspace',
        shard='0',
        tablet_index=2)

    for t in [shard_master, shard_replica, shard_rdonly1]:
      t.create_db('vt_test_keyspace')

    shard_master.start_vttablet(wait_for_state=None)
    shard_replica.start_vttablet(wait_for_state=None)
    shard_rdonly1.start_vttablet(wait_for_state=None)

    shard_master.wait_for_vttablet_state('SERVING')
    for t in [shard_replica, shard_rdonly1]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     shard_master.tablet_alias], auto_log=True)

    utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
    utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
    for t in [shard_master, shard_replica, shard_rdonly1]:
      t.wait_for_vttablet_state('SERVING')

    # create the tables and add startup values
    self._create_schema()
    self._insert_startup_values()

    # reload schema on all tablets so we can query them
    for t in [shard_master, shard_replica, shard_rdonly1]:
      utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

    # must start vtgate after tablets are up, or else wait until 1min refresh
    # we want cache_ttl at zero so we re-read the topology for every test query.
    utils.VtGate().start(cache_ttl='0', tablets=[
        shard_master, shard_replica, shard_rdonly1])
    utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

    # check the Map Reduce API works correctly, should use ExecuteShards,
    # as we're not sharded yet.
    # we have 3 values in the database, asking for 4 splits will get us
    # a single query.
    sql = 'select id, msg from resharding1'
    s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
    self.assertEqual(len(s), 1)
    self.assertEqual(s[0]['shard_part']['shards'][0], '0')

    # change the schema, backfill keyspace_id, and change schema again
    self._add_sharding_key_to_schema()
    self._backfill_keyspace_id(shard_master)
    self._mark_sharding_key_not_null()

    # now we can be a sharded keyspace (and propagate to SrvKeyspace)
    utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                     'custom_ksid_col', base_sharding.keyspace_id_type])
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                    auto_log=True)

    # run a health check on source replica so it responds to discovery
    utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

    # create the split shards
    shard_0_master.init_tablet(
        'master',
        keyspace='test_keyspace',
        shard='-80',
        tablet_index=0)
    shard_0_replica.init_tablet(
        'replica',
        keyspace='test_keyspace',
        shard='-80',
        tablet_index=1)
    shard_0_rdonly1.init_tablet(
        'rdonly',
        keyspace='test_keyspace',
        shard='-80',
        tablet_index=2)
    shard_1_master.init_tablet(
        'master',
        keyspace='test_keyspace',
        shard='80-',
        tablet_index=0)
    shard_1_replica.init_tablet(
        'replica',
        keyspace='test_keyspace',
        shard='80-',
        tablet_index=1)
    shard_1_rdonly1.init_tablet(
        'rdonly',
        keyspace='test_keyspace',
        shard='80-',
        tablet_index=2)

    for t in [shard_0_master, shard_0_replica,
              shard_1_master, shard_1_replica]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None)
    for t in [shard_0_rdonly1, shard_1_rdonly1]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None)

    for t in [shard_0_master, shard_1_master]:
      t.wait_for_vttablet_state('SERVING')
    for t in [shard_0_replica, shard_0_rdonly1,
              shard_1_replica, shard_1_rdonly1]:
      t.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                     shard_0_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                     shard_1_master.tablet_alias], auto_log=True)

    for t in [shard_0_replica, shard_1_replica]:
      utils.wait_for_tablet_type(t.tablet_alias, 'replica')
    for t in [shard_0_rdonly1, shard_1_rdonly1]:
      utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

    sharded_tablets = [shard_0_master, shard_0_replica, shard_0_rdonly1,
                       shard_1_master, shard_1_replica, shard_1_rdonly1]
    for t in sharded_tablets:
      t.wait_for_vttablet_state('SERVING')

    # must restart vtgate after tablets are up, or else wait until 1min refresh
    # we want cache_ttl at zero so we re-read the topology for every test query.
    utils.vtgate.kill()
    utils.VtGate().start(cache_ttl='0', tablets=[
        shard_master, shard_replica, shard_rdonly1,
        shard_0_master, shard_0_replica, shard_0_rdonly1,
        shard_1_master, shard_1_replica, shard_1_rdonly1])
    utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

    # check the Map Reduce API works correctly, should use ExecuteKeyRanges now,
    # as we are sharded (with just one shard).
    # again, we have 3 values in the database, asking for 4 splits will get us
    # a single query.
    sql = 'select id, msg from resharding1'
    s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
    self.assertEqual(len(s), 1)
    self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
    # There must be one empty KeyRange which represents the full keyspace.
    self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
    self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -\n'
                             'Partitions(rdonly): -\n'
                             'Partitions(replica): -\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # we need to create the schema, and the worker will do data copying
    for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
      utils.run_vtctl(['CopySchemaShard',
                       '--exclude_tables', 'unrelated',
                       shard_rdonly1.tablet_alias,
                       keyspace_shard],
                      auto_log=True)
    utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

    # Run vtworker as daemon for the following SplitClone commands.
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--command_display_interval', '10ms'],
        auto_log=True)

    # Initial clone (online).
    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--offline=false',
         '--exclude_tables', 'unrelated',
         '--min_table_size_for_split', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/0'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        3, 0, 0)

    # Reset vtworker such that we can run the next command.
    workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
    utils.wait_procs([workerclient_proc])

    # Modify the destination shard. SplitClone will revert the changes.
    # Delete row 1 (provokes an insert).
    shard_0_master.mquery('vt_test_keyspace',
                          'delete from resharding1 where id=1', write=True)
    # Delete row 2 (provokes an insert).
    shard_1_master.mquery('vt_test_keyspace',
                          'delete from resharding1 where id=2', write=True)
    # Update row 3 (provokes an update).
    shard_1_master.mquery('vt_test_keyspace',
                          "update resharding1 set msg='msg-not-3' where id=3",
                          write=True)
    # Insert row 4 (provokes a delete).
    self._insert_value(shard_1_master, 'resharding1', 4, 'msg4',
                       0xD000000000000000)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--exclude_tables', 'unrelated',
         '--min_table_size_for_split', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/0'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        2, 1, 1)
    self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                        0, 0, 0)
    # Terminate worker daemon because it is no longer needed.
    utils.kill_sub_process(worker_proc, soft=True)

    # check the startup values are in the right place
    self._check_startup_values()

    # check the schema too
    utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

    # check the binlog players are running
    logging.debug('Waiting for binlog players to start on new masters...')
    self.check_destination_master(shard_0_master, ['test_keyspace/0'])
    self.check_destination_master(shard_1_master, ['test_keyspace/0'])

    # check that binlog server exported the stats vars
    self.check_binlog_server_vars(shard_replica, horizontal=True)

    # testing filtered replication: insert a bunch of data on shard 1,
    # check we get most of it after a few seconds, wait for binlog server
    # timeout, check we get all of it.
    logging.debug('Inserting lots of data on source shard')
    self._insert_lots(1000)
    logging.debug('Checking 80 percent of data is sent quickly')
    v = self._check_lots_timeout(1000, 80, 5)
    if v != 100:
      logging.debug('Checking all data goes through eventually')
      self._check_lots_timeout(1000, 100, 20)
    logging.debug('Checking no data was sent the wrong way')
    self._check_lots_not_present(1000)
    self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                  seconds_behind_master_max=30)
    self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                  seconds_behind_master_max=30)
    self.check_binlog_server_vars(shard_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)

    # use vtworker to compare the data
    logging.debug('Running vtworker SplitDiff for -80')
    for t in [shard_0_rdonly1, shard_1_rdonly1]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                        '--min_healthy_rdonly_tablets', '1',
                        'test_keyspace/-80'],
                       auto_log=True)

    logging.debug('Running vtworker SplitDiff for 80-')
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                        '--min_healthy_rdonly_tablets', '1',
                        'test_keyspace/80-'],
                       auto_log=True)

    utils.pause('Good time to test vtworker for diffs')

    # get status for the destination master tablet, make sure we have it all
    self.check_running_binlog_player(shard_0_master, 2000, 2000)
    self.check_running_binlog_player(shard_1_master, 6000, 2000)

    # check we can't migrate the master just yet
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                    expect_fail=True)

    # now serve rdonly from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # make sure rdonly tablets are back to serving before hitting vtgate.
    for t in [shard_0_rdonly1, shard_1_rdonly1]:
      t.wait_for_vttablet_state('SERVING')
    utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
    utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

    # check the Map Reduce API works correctly, should use ExecuteKeyRanges
    # on both destination shards now.
    # we ask for 2 splits to only have one per shard
    sql = 'select id, msg from resharding1'
    s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
    self.assertEqual(len(s), 2)
    self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
    self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
    self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
    self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

    # then serve replica from the split shards
    source_tablet = shard_replica
    destination_tablets = [shard_0_replica, shard_1_replica]

    utils.run_vtctl(
        ['MigrateServedTypes', 'test_keyspace/0', 'replica'], auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # move replica back and forth
    utils.run_vtctl(
        ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
        auto_log=True)
    # After a backwards migration, queryservice should be enabled on
    # source and disabled on destinations
    utils.check_tablet_query_service(self, source_tablet, True, False)
    utils.check_tablet_query_services(self, destination_tablets, False, True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                    auto_log=True)
    # After a forwards migration, queryservice should be disabled on
    # source and enabled on destinations
    utils.check_tablet_query_service(self, source_tablet, False, True)
    utils.check_tablet_query_services(self, destination_tablets, True, False)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # then serve master from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # check the binlog players are gone now
    self.check_no_binlog_player(shard_0_master)
    self.check_no_binlog_player(shard_1_master)

    # make sure we can't delete a shard with tablets
    utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

    # remove the original tablets in the original shard
    tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
    for t in [shard_replica, shard_rdonly1]:
      utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
    utils.run_vtctl(['DeleteTablet', '-allow_master',
                     shard_master.tablet_alias], auto_log=True)

    # rebuild the serving graph, all mentions of the old shards should be gone
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    # delete the original shard
    utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

    # kill everything else
    tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly1,
                         shard_1_master, shard_1_replica, shard_1_rdonly1])
Example #34
  def stop(self):
    import utils  # pylint: disable=g-import-not-at-top

    utils.kill_sub_process(self.proc)
    self.proc.wait()
    shutil.rmtree(self.data_dir)
Example #35
  def test_merge_sharding(self):
    utils.run_vtctl(['CreateKeyspace',
                     '--sharding_column_name', 'custom_ksid_col',
                     '--sharding_column_type', base_sharding.keyspace_id_type,
                     'test_keyspace'])

    shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
    shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
    shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
    shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
    shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
    shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
    shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
    shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
    shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

    # rebuild and check SrvKeyspace
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
    ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
    self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

    # create databases so vttablet can start behaving normally
    for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
              shard_1_master, shard_1_replica, shard_1_rdonly,
              shard_2_master, shard_2_replica, shard_2_rdonly]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None,
                       binlog_use_v3_resharding_mode=False)

    # won't be serving, no replication state
    for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
              shard_1_master, shard_1_replica, shard_1_rdonly,
              shard_2_master, shard_2_replica, shard_2_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                     shard_0_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                     shard_1_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                     shard_2_master.tablet_alias], auto_log=True)

    # create the tables
    self._create_schema()
    self._insert_startup_values()

    # run a health check on source replicas so they respond to discovery
    # (for binlog players) and on the source rdonlys (for workers)
    for t in [shard_0_replica, shard_1_replica]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
    for t in [shard_0_rdonly, shard_1_rdonly]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

    # create the merge shards
    shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
    shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
    shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

    # start vttablet on the destination shard (no db created,
    # so they're all not serving)
    for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
      t.start_vttablet(wait_for_state=None,
                       binlog_use_v3_resharding_mode=False)
    for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                     shard_dest_master.tablet_alias], auto_log=True)

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                    auto_log=True)
    utils.check_srv_keyspace(
        'test_nj', 'test_keyspace',
        'Partitions(master): -40 40-80 80-\n'
        'Partitions(rdonly): -40 40-80 80-\n'
        'Partitions(replica): -40 40-80 80-\n',
        keyspace_id_type=base_sharding.keyspace_id_type,
        sharding_column_name='custom_ksid_col')

    # copy the schema
    utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                     'test_keyspace/-80'], auto_log=True)

    # copy the data (will also start filtered replication), reset source
    # Run vtworker as daemon for the following SplitClone commands.
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--command_display_interval', '10ms',
          '--use_v3_resharding_mode=false'],
        auto_log=True)

    # Initial clone (online).
    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--offline=false',
         '--chunk_count', '10',
         '--min_rows_per_chunk', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/-80'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        2, 0, 0, 0)

    # Reset vtworker such that we can run the next command.
    workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
    utils.wait_procs([workerclient_proc])

    # Modify the destination shard. SplitClone will revert the changes.
    # Delete row 1 (provokes an insert).
    shard_dest_master.mquery('vt_test_keyspace',
                             'delete from resharding1 where id=1', write=True)
    # Update row 2 (provokes an update).
    shard_dest_master.mquery(
        'vt_test_keyspace', "update resharding1 set msg='msg-not-2' where id=2",
        write=True)
    # Insert row 0 (provokes a delete).
    self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                       0x5000000000000000)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--chunk_count', '10',
         '--min_rows_per_chunk', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/-80'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    # Change tablets, which were taken offline, back to rdonly.
    utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        1, 1, 1, 0)
    self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                        0, 0, 0, 2)
    # Terminate worker daemon because it is no longer needed.
    utils.kill_sub_process(worker_proc, soft=True)

    # check the startup values are in the right place
    self._check_startup_values()

    # check the schema too
    utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

    # check binlog player variables
    self.check_destination_master(shard_dest_master,
                                  ['test_keyspace/-40', 'test_keyspace/40-80'])

    # check that binlog server exported the stats vars
    self.check_binlog_server_vars(shard_0_replica, horizontal=True)
    self.check_binlog_server_vars(shard_1_replica, horizontal=True)

    # testing filtered replication: insert a bunch of data on shard 0 and 1,
    # check we get most of it after a few seconds, wait for binlog server
    # timeout, check we get all of it.
    logging.debug('Inserting lots of data on source shards')
    self._insert_lots(1000)
    logging.debug('Checking 80 percent of data is sent quickly')
    v = self._check_lots_timeout(1000, 80, 10)
    if v != 100:
      # small optimization: only do this check if we don't have all the data
      # already anyway.
      logging.debug('Checking all data goes through eventually')
      self._check_lots_timeout(1000, 100, 30)
    self.check_binlog_player_vars(shard_dest_master,
                                  ['test_keyspace/-40', 'test_keyspace/40-80'],
                                  seconds_behind_master_max=30)
    self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)
    self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)

    # use vtworker to compare the data (after health-checking the destination
    # rdonly tablets so discovery works)
    utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
    logging.debug('Running vtworker SplitDiff on first half')
    utils.run_vtworker(['-cell', 'test_nj',
                        '--use_v3_resharding_mode=false',
                        'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        '--source_uid', '1',
                        'test_keyspace/-80'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    logging.debug('Running vtworker SplitDiff on second half')
    utils.run_vtworker(['-cell', 'test_nj',
                        '--use_v3_resharding_mode=false',
                        'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        '--source_uid', '2',
                        'test_keyspace/-80'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)

    # get status for the destination master tablet, make sure we have it all
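    # (the two numeric arguments are presumably the minimum statement and
    # transaction counts the binlog player must have processed)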
    self.check_running_binlog_player(shard_dest_master, 3000, 1000)

    # check destination master query service is not running
    utils.check_tablet_query_service(self, shard_dest_master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          shard_dest_master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

    # check the destination master is healthy, even though its query
    # service is not running (if not healthy this would exception out)
    shard_dest_master.get_healthz()

    # now serve rdonly from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -40 40-80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -40 40-80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # now serve replica from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -40 40-80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # now serve master from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_0_master, False, True)
    utils.check_tablet_query_service(self, shard_1_master, False, True)

    # check the binlog players are gone now
    self.check_no_binlog_player(shard_dest_master)

    # kill the original tablets in the original shards
    tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                         shard_1_master, shard_1_replica, shard_1_rdonly])
    for t in [shard_0_replica, shard_0_rdonly,
              shard_1_replica, shard_1_rdonly]:
      utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
    for t in [shard_0_master, shard_1_master]:
      utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                      auto_log=True)

    # delete the original shards
    utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
    utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

    # rebuild the serving graph, all mentions of the old shards should be gone
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    # kill everything else
    tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                         shard_dest_master, shard_dest_replica,
                         shard_dest_rdonly])
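
# The migration order used above (rdonly, then replica, then master) is the
# standard served-type cutover sequence: the least critical tablet type moves
# first so each step can be verified before the master switch. A minimal
# standalone sketch of that pattern (the helper name is illustrative;
# utils.run_vtctl is the same wrapper used throughout these examples):
def migrate_served_types_in_order(keyspace_shard):
  for served_type in ('rdonly', 'replica', 'master'):
    utils.run_vtctl(['MigrateServedTypes', keyspace_shard, served_type],
                    auto_log=True)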
Example #36
def run_test_zkocc():
    _populate_zk()

    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start(
        extra_params=['-connect-timeout=2s', '-cache-refresh-interval=1s'])
    time.sleep(1)

    # create a python client. The first address is bad, which tests the retry logic
    bad_port = utils.reserve_ports(3)
    zkocc_client = zkocc.ZkOccConnection(
        "localhost:%u,localhost:%u,localhost:%u" %
        (bad_port, utils.zkocc_port_base, bad_port + 1), "test_nj", 30)
    zkocc_client.dial()

    # test failure for a python client that cannot connect
    bad_zkocc_client = zkocc.ZkOccConnection(
        "localhost:%u,localhost:%u" % (bad_port + 2, bad_port), "test_nj", 30)
    try:
        bad_zkocc_client.dial()
        raise utils.TestError('exception expected')
    except zkocc.ZkOccError as e:
        if str(e) != "Cannot dial to any server":
            raise utils.TestError('Unexpected exception: ', str(e))
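    # raise the log level so the expected connection failures below do not
    # spam the test output; it is restored right after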
    logging.getLogger().setLevel(logging.ERROR)
    try:
        bad_zkocc_client.get("/zk/test_nj/zkocc1/data1")
        raise utils.TestError('exception expected')
    except zkocc.ZkOccError as e:
        if str(e) != "Cannot dial to any server":
            raise utils.TestError('Unexpected exception: ', str(e))
    logging.getLogger().setLevel(logging.WARNING)

    # get test
    out, err = utils.run(
        utils.vtroot +
        '/bin/zkclient2 -server localhost:%u /zk/test_nj/zkocc1/data1' %
        utils.zkocc_port_base,
        trap_output=True)
    if err != "/zk/test_nj/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=false, Stale=false)\n":
        raise utils.TestError('unexpected get output: ', err)
    zkNode = zkocc_client.get("/zk/test_nj/zkocc1/data1")
    if (zkNode['Data'] != "Test data 1" or
        zkNode['Stat']['NumChildren'] != 0 or
        zkNode['Stat']['Version'] != 0 or
        not zkNode['Cached'] or
        zkNode['Stale']):
        raise utils.TestError('unexpected zkocc_client.get output: ', zkNode)

    # getv test
    out, err = utils.run(
        utils.vtroot +
        '/bin/zkclient2 -server localhost:%u /zk/test_nj/zkocc1/data1 /zk/test_nj/zkocc1/data2 /zk/test_nj/zkocc1/data3'
        % utils.zkocc_port_base,
        trap_output=True)
    if err != """[0] /zk/test_nj/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=false)
[1] /zk/test_nj/zkocc1/data2 = Test data 2 (NumChildren=0, Version=0, Cached=false, Stale=false)
[2] /zk/test_nj/zkocc1/data3 = Test data 3 (NumChildren=0, Version=0, Cached=false, Stale=false)
""":
        raise utils.TestError('unexpected getV output: ', err)
    zkNodes = zkocc_client.getv([
        "/zk/test_nj/zkocc1/data1", "/zk/test_nj/zkocc1/data2",
        "/zk/test_nj/zkocc1/data3"
    ])
    if len(zkNodes['Nodes']) != 3:
        raise utils.TestError('unexpected zkocc_client.getv output: ', zkNodes)
    for i, expected_data in enumerate(
            ["Test data 1", "Test data 2", "Test data 3"]):
        node = zkNodes['Nodes'][i]
        if (node['Data'] != expected_data or
                node['Stat']['NumChildren'] != 0 or
                node['Stat']['Version'] != 0 or
                not node['Cached'] or
                node['Stale']):
            raise utils.TestError('unexpected zkocc_client.getv output: ',
                                  zkNodes)

    # children test
    out, err = utils.run(
        utils.vtroot +
        '/bin/zkclient2 -server localhost:%u -mode children /zk/test_nj' %
        utils.zkocc_port_base,
        trap_output=True)
    if err != """Path = /zk/test_nj
Child[0] = zkocc1
Child[1] = zkocc2
NumChildren = 2
CVersion = 4
Cached = false
Stale = false
""":
        raise utils.TestError('unexpected children output: ', err)

    # zk command tests
    _check_zk_output("cat /zk/test_nj/zkocc1/data1", "Test data 1")
    _check_zk_output(
        "ls -l /zk/test_nj/zkocc1", """total: 3
-rw-rw-rw- zk zk       11  %s data1
-rw-rw-rw- zk zk       11  %s data2
-rw-rw-rw- zk zk       11  %s data3
""" % (_format_time(zkNodes['Nodes'][0]['Stat']['MTime']),
       _format_time(zkNodes['Nodes'][1]['Stat']['MTime']),
       _format_time(zkNodes['Nodes'][2]['Stat']['MTime'])))
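    # (_format_time above presumably renders each node's MTime the way the
    # zk 'ls -l' listing does, so the expected output does not depend on when
    # the test ran)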

    # test that /zk/local is not resolved and is rejected
    out, err = utils.run(
        utils.vtroot +
        '/bin/zkclient2 -server localhost:%u /zk/local/zkocc1/data1' %
        utils.zkocc_port_base,
        trap_output=True,
        raise_on_error=False)
    if "zkocc: cannot resolve local cell" not in err:
        raise utils.TestError('unexpected get output, not local cell error: ',
                              err)

    # start a background process to query the same value over and over again
    # while we kill the zk server and restart it
    outfd = tempfile.NamedTemporaryFile(dir=utils.tmp_root, delete=False)
    filename = outfd.name
    querier = utils.run_bg(
        '/bin/bash -c "while true ; do ' + utils.vtroot +
        '/bin/zkclient2 -server localhost:%u /zk/test_nj/zkocc1/data1 ; sleep 0.1 ; done"'
        % utils.zkocc_port_base,
        stderr=outfd.file)
    outfd.close()
    time.sleep(1)

    # kill zk server, sleep a bit, restart zk server, sleep a bit
    utils.run(
        utils.vtroot + '/bin/zkctl -zk.cfg 1@' + utils.hostname +
        ':%u:%u:%u shutdown' %
        (utils.zk_port_base, utils.zk_port_base + 1, utils.zk_port_base + 2))
    time.sleep(3)
    utils.run(
        utils.vtroot + '/bin/zkctl -zk.cfg 1@' + utils.hostname +
        ':%u:%u:%u start' %
        (utils.zk_port_base, utils.zk_port_base + 1, utils.zk_port_base + 2))
    time.sleep(3)

    utils.kill_sub_process(querier)

    utils.debug("Checking " + filename)
    fd = open(filename, "r")
    state = 0
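    # Expected log pattern: Stale=false while zk was up (state 0), Stale=true
    # after the shutdown (state 1), then Stale=false again once the cache
    # refreshed after the restart (state 2). Any stale line after that is an
    # error.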
    for line in fd:
        if line == "/zk/test_nj/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=false)\n":
            stale = False
        elif line == "/zk/test_nj/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=true)\n":
            stale = True
        else:
            raise utils.TestError('unexpected line: ', line)
        if state == 0:
            if stale:
                state = 1
        elif state == 1:
            if not stale:
                state = 2
        else:
            if stale:
                raise utils.TestError('unexpected stale state')
    if state != 2:
        raise utils.TestError('unexpected final stale state')
    fd.close()

    utils.zkocc_kill(zkocc_14850)

    # check that after the server is gone, the python client fails correctly
    logging.getLogger().setLevel(logging.ERROR)
    try:
        zkocc_client.get("/zk/test_nj/zkocc1/data1")
        raise utils.TestError('exception expected')
    except zkocc.ZkOccError as e:
        if str(e) != "Cannot dial to any server":
            raise utils.TestError('Unexpected exception: ', str(e))
    logging.getLogger().setLevel(logging.WARNING)
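
# The retry tests above rely on the client walking its comma-separated
# address list until one server answers. A minimal standalone sketch of that
# failover pattern (purely illustrative; the real zkocc.ZkOccConnection
# internals may differ):
import socket

def dial_first_available(addr_list, timeout=1.0):
    # Try each host:port in order and return a connected socket; raise the
    # same error message the tests above expect once every address is dead.
    for addr in addr_list.split(','):
        host, port = addr.rsplit(':', 1)
        try:
            return socket.create_connection((host, int(port)), timeout=timeout)
        except (socket.error, ValueError):
            continue  # dead or malformed address, try the next one
    raise Exception('Cannot dial to any server')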
Example #37
def tearDownModule():
  utils.kill_sub_process(vtgateclienttest_process, soft=True)
  vtgateclienttest_process.wait()
Example #38
  def tearDown(self):
    utils.kill_sub_process(self.zkocc_server)
Example #39
  def tearDown(self):
    utils.kill_sub_process(self.worker_proc)
Example #40
  def test_secure(self):
    zkocc_server = utils.zkocc_start()

    # start the tablets
    shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                  key=cert_dir + "/vt-server-key.pem")
    shard_0_slave.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                 key=cert_dir + "/vt-server-key.pem",
                                 repl_extra_flags={
        'flags': "2048",
        'ssl-ca': cert_dir + "/ca-cert.pem",
        'ssl-cert': cert_dir + "/client-cert.pem",
        'ssl-key': cert_dir + "/client-key.pem",
        })
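    # (flags=2048 is the MySQL CLIENT_SSL capability bit, so the replica's
    # replication connection to the master is forced to use TLS with the
    # given certificates)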

    # Reparent using SSL
    for t in [shard_0_master, shard_0_slave]:
      t.reset_replication()
    utils.run_vtctl('ReparentShard -force test_keyspace/0 ' + shard_0_master.tablet_alias, auto_log=True)

    # then get the topology and check it
    zkocc_client = zkocc.ZkOccConnection("localhost:%u" % environment.zkocc_port_base,
                                         "test_nj", 30.0)
    topology.read_keyspaces(zkocc_client)

    shard_0_master_addrs = topology.get_host_port_by_name(zkocc_client, "test_keyspace.0.master:_vts")
    if len(shard_0_master_addrs) != 1:
      self.fail('topology.get_host_port_by_name failed for "test_keyspace.0.master:_vts", got: %s' % " ".join(["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs]))
    if not shard_0_master_addrs[0][2]:
      self.fail('topology.get_host_port_by_name for "test_keyspace.0.master:_vts" did not return an encrypted address')
    logging.debug("shard 0 master addrs: %s", " ".join(["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs]))

    # make sure asking for optionally secure connections works too
    auto_addrs = topology.get_host_port_by_name(zkocc_client, "test_keyspace.0.master:_vtocc", encrypted=True)
    if auto_addrs != shard_0_master_addrs:
      self.fail('topology.get_host_port_by_name doesn\'t resolve encrypted addresses properly: %s != %s' % (str(shard_0_master_addrs), str(auto_addrs)))

    # try to connect with regular client
    try:
      conn = tablet3.TabletConnection("%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
                                      "", "test_keyspace", "0", 10.0)
      conn.dial()
      self.fail("No exception raised to secure port")
    except dbexceptions.FatalError as e:
      if not e.args[0][0].startswith('Unexpected EOF in handshake to'):
        self.fail("Unexpected exception: %s" % str(e))

    sconn = utils.get_vars(shard_0_master.port)["SecureConnections"]
    if sconn != 0:
      self.fail("unexpected conns %s" % sconn)

    # connect to encrypted port
    conn = tablet3.TabletConnection("%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
                                    "", "test_keyspace", "0", 5.0, encrypted=True)
    conn.dial()
    (results, rowcount, lastrowid, fields) = conn._execute("select 1 from dual", {})
    self.assertEqual(results, [(1,),], 'wrong conn._execute output: %s' % str(results))

    sconn = utils.get_vars(shard_0_master.port)["SecureConnections"]
    if sconn != 1:
      self.fail("unexpected conns %s" % sconn)
    saccept = utils.get_vars(shard_0_master.port)["SecureAccepts"]
    if saccept == 0:
      self.fail("unexpected accepts %s" % saccept)

    # trigger a time out on a secure connection, see what exception we get
    try:
      conn._execute("select sleep(100) from dual", {})
      self.fail("No timeout exception")
    except dbexceptions.TimeoutError as e:
      logging.debug("Got the right exception for SSL timeout: %s", str(e))

    # start a vtgate to connect to that tablet
    gate_proc, gate_port, gate_secure_port = utils.vtgate_start(
        tablet_bson_encrypted=True,
        cert=cert_dir + "/vt-server-cert.pem",
        key=cert_dir + "/vt-server-key.pem")

    # try to connect to vtgate with regular client
    timeout = 2.0
    try:
      conn = vtgatev2.connect(["localhost:%s" % (gate_secure_port),],
                               timeout)
      self.fail("No exception raised to VTGate secure port")
    except dbexceptions.OperationalError as e:
      exception_type = e.args[2]
      exception_msg = str(e.args[2][0][0])
      self.assertIsInstance(exception_type, dbexceptions.FatalError,
                            "unexpected exception type")
      if not exception_msg.startswith('Unexpected EOF in handshake to'):
        self.fail("Unexpected exception message: %s" % exception_msg)

    sconn = utils.get_vars(gate_port)["SecureConnections"]
    if sconn != 0:
      self.fail("unexpected conns %s" % sconn)

    # connect to vtgate with encrypted port
    conn = vtgatev2.connect(["localhost:%s" % (gate_secure_port),],
                             timeout, encrypted=True)
    (results, rowcount, lastrowid, fields) = conn._execute(
        "select 1 from dual",
        {},
        "test_keyspace",
        "master",
        keyranges=[keyrange.KeyRange(keyrange_constants.NON_PARTIAL_KEYRANGE),])
    self.assertEqual(rowcount, 1, "want 1, got %d" % (rowcount))
    self.assertEqual(len(fields), 1, "want 1, got %d" % (len(fields)))
    self.assertEqual(results, [(1,),], 'wrong conn._execute output: %s' % str(results))

    sconn = utils.get_vars(gate_port)["SecureConnections"]
    if sconn != 1:
      self.fail("unexpected conns %s" % sconn)
    saccept = utils.get_vars(gate_port)["SecureAccepts"]
    if saccept == 0:
      self.fail("unexpected accepts %s" % saccept)

    # trigger a time out on a vtgate secure connection, see what exception we get
    try:
      conn._execute("select sleep(4) from dual",
                    {},
                    "test_keyspace",
                    "master",
                    keyranges=[keyrange.KeyRange(keyrange_constants.NON_PARTIAL_KEYRANGE),])
      self.fail("No timeout exception")
    except dbexceptions.TimeoutError as e:
      logging.debug("Got the right exception for SSL timeout: %s", str(e))
    conn.close()
    utils.vtgate_kill(gate_proc)

    # kill everything
    utils.kill_sub_process(zkocc_server)
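
# utils.get_vars above reads a server's exported counters such as
# SecureConnections and SecureAccepts. A minimal stand-in, assuming the
# standard /debug/vars JSON endpoint (the real helper may add retries and
# error handling):
import json
import urllib2  # Python 2, matching the rest of these examples

def get_debug_vars(port, host='localhost'):
  # Fetch and decode the server's JSON map of exported variables.
  url = 'http://%s:%u/debug/vars' % (host, port)
  return json.loads(urllib2.urlopen(url).read())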
Example #41
    def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
        """Verifies that vtworker can successfully copy data for a SplitClone.

        Order of operations:
        1. Run a background vtworker.
        2. Wait until the worker successfully resolves the destination masters.
        3. Reparent the destination tablets.
        4. Wait until the vtworker copy is finished.
        5. Verify that the worker was forced to reresolve topology and retry
           writes due to the reparent.
        6. Verify that the data was copied successfully to both new shards.

        Args:
          mysql_down: boolean. If True, we take down the MySQL instances on the
            destination masters at first, then bring them back and reparent
            away.

        Raises:
          AssertionError if things didn't go as expected.
        """
        if mysql_down:
            logging.debug('Shutting down mysqld on destination masters.')
            utils.wait_procs([
                shard_0_master.shutdown_mysql(),
                shard_1_master.shutdown_mysql()
            ])

        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)

        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        # --chunk_count is 2 because rows are currently ordered by primary key such
        # that all rows of the first shard come first and then the second shard.
        # TODO(mberlin): Remove --offline=false once vtworker ensures that the
        #                destination shards are not behind the master's replication
        #                position.
        args = [
            'SplitClone', '--offline=false', '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999'
        ]
        if not mysql_down:
            # Make the clone as slow as necessary such that there is enough time to
            # run PlannedReparent in the meantime.
            # TODO(mberlin): Once insert_values is fixed to uniformly distribute the
            #                rows across shards when sorted by primary key, remove
            #                --chunk_count 2, --min_rows_per_chunk 1 and set
            #                --source_reader_count back to 1.
            args.extend([
                '--source_reader_count', '2', '--chunk_count', '2',
                '--min_rows_per_chunk', '1', '--write_query_max_rows', '1'
            ])
        args.append('test_keyspace/0')
        workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

        if mysql_down:
            # If MySQL is down, we wait until vtworker retried at least once to make
            # sure it reached the point where a write failed due to MySQL being down.
            # There should be two retries at least, one for each destination shard.
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerRetryCount >= 2',
                condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
            logging.debug(
                'Worker has retried at least twice, starting reparent now')

            # vtworker is blocked at this point. This is a good time to test that its
            # throttler server is reacting to RPCs.
            self.check_throttler_service(
                'localhost:%d' % worker_rpc_port,
                ['test_keyspace/-80', 'test_keyspace/80-'], 9999)

            # Bring back masters. Since we test with semi-sync now, we need at least
            # one replica for the new master. This test is already quite expensive,
            # so we bring back the old master as a replica rather than having a third
            # replica up the whole time.
            logging.debug('Restarting mysqld on destination masters')
            utils.wait_procs(
                [shard_0_master.start_mysql(),
                 shard_1_master.start_mysql()])

            # Reparent away from the old masters.
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        else:
            # NOTE: There is a race condition around this:
            #   It's possible that the SplitClone vtworker command finishes before the
            #   PlannedReparentShard vtctl command, which we start below, succeeds.
            #   Then the test would fail because vtworker did not have to retry.
            #
            # To work around this, the test takes a parameter to increase the number of
            # rows that the worker has to copy (with the idea being to slow the worker
            # down).
            # You should choose a value for num_insert_rows such that this test
            # passes in your environment (trial-and-error...).
            # Make sure that vtworker got past the point where it picked a master
            # for each destination shard ("finding targets" state).
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerState == cloning the data (online)',
                condition_fn=lambda v: v.get('WorkerState') == 'cloning the'
                ' data (online)')
            logging.debug('Worker is in copy state, starting reparent now')

            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        utils.wait_procs([workerclient_proc])

        # Verify that we were forced to re-resolve and retry.
        worker_vars = utils.get_vars(worker_port)
        self.assertGreater(
            worker_vars['WorkerRetryCount'], 1,
            "expected vtworker to retry each of the two reparented"
            " destination masters at least once, but it didn't")
        self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                            "expected vtworker to retry, but it didn't")
        utils.kill_sub_process(worker_proc, soft=True)

        # Wait for the destination RDONLYs to catch up or the following offline
        # clone will try to insert rows which already exist.
        # TODO(mberlin): Remove this once SplitClone supports it natively.
        utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
        utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)
        # Run final offline clone to enable filtered replication.
        _, _ = utils.run_vtworker([
            '-cell', 'test_nj', 'SplitClone', '--online=false',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
        ],
                                  auto_log=True)

        # Make sure that everything is caught up to the same replication point
        self.run_split_diff('test_keyspace/-80', all_shard_tablets,
                            shard_0_tablets)
        self.run_split_diff('test_keyspace/80-', all_shard_tablets,
                            shard_1_tablets)

        self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
        self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
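
# utils.poll_for_vars above blocks until a predicate over the worker's
# exported variables holds. A minimal standalone sketch of such a polling
# loop (the timeout and sleep interval are illustrative assumptions):
import time

def poll_until(fetch_fn, condition_fn, timeout_s=60.0, interval_s=0.1):
    # Call fetch_fn repeatedly until condition_fn(value) is true, or give up.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        value = fetch_fn()
        if value is not None and condition_fn(value):
            return value
        time.sleep(interval_s)
    raise Exception('timed out waiting for condition')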
Example #42
    def test_resharding(self):
        # we're going to reparent and swap these two
        global shard_2_master, shard_2_replica1

        utils.run_vtctl([
            'CreateKeyspace', '--sharding_column_name', 'bad_column',
            '--sharding_column_type', 'bytes', 'test_keyspace'
        ])
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col',
            'uint64'
        ],
                        expect_fail=True)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', '-force', 'test_keyspace',
            'custom_ksid_col', base_sharding.keyspace_id_type
        ])

        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
        shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
        shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        ks = utils.run_vtctl_json(
            ['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
        self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

        # we set full_mycnf_args to True as a test in the KIT_BYTES case
        full_mycnf_args = (
            base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES)

        # create databases so vttablet can start behaving somewhat normally
        for t in [
                shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                shard_1_master, shard_1_slave1, shard_1_slave2,
                shard_1_ny_rdonly, shard_1_rdonly1
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             full_mycnf_args=full_mycnf_args)

        # wait for the tablets (replication is not setup, the slaves won't be
        # healthy)
        shard_0_master.wait_for_vttablet_state('SERVING')
        shard_0_replica.wait_for_vttablet_state('NOT_SERVING')
        shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
        shard_1_master.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
        shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
        shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/-80', shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/80-', shard_1_master.tablet_alias
        ],
                        auto_log=True)

        # check the shards
        shards = utils.run_vtctl_json(
            ['FindAllShardsInKeyspace', 'test_keyspace'])
        self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
        self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
        self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

        # create the tables
        self._create_schema()
        self._insert_startup_values()
        self._test_keyrange_constraints()

        # run a health check on source replicas so they respond to discovery
        # (for binlog players) and on the source rdonlys (for workers)
        for t in [shard_0_replica, shard_1_slave1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
        for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

        # create the split shards
        shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
        shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
        shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0')
        shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
        shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
        shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

        # start vttablet on the split shards (no db created,
        # so they're all not serving)
        shard_2_master.start_vttablet(wait_for_state=None)
        shard_3_master.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
                shard_3_replica, shard_3_rdonly1
        ]:
            t.start_vttablet(wait_for_state=None)
        for t in [
                shard_2_master, shard_2_replica1, shard_2_replica2,
                shard_2_rdonly1, shard_3_master, shard_3_replica,
                shard_3_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/80-c0',
            shard_2_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', 'test_keyspace/c0-', shard_3_master.tablet_alias
        ],
                        auto_log=True)

        # check the shards
        shards = utils.run_vtctl_json(
            ['FindAllShardsInKeyspace', 'test_keyspace'])
        for s in ['-80', '80-', '80-c0', 'c0-']:
            self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
        self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # disable shard_1_slave2, so we're sure filtered replication will go
        # from shard_1_slave1
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

        # we need to create the schema, and the worker will do data copying
        for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_1_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)

        # Run vtworker as daemon for the following SplitClone commands.
        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj', '--command_display_interval', '10ms'],
            auto_log=True)

        # Copy the data from the source to the destination shards.
        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        #
        # Initial clone (online).
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 2, 0, 0, 0)

        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Test the correct handling of keyspace_id changes which happen after
        # the first clone.
        # Let row 2 go to shard 3 instead of shard 2.
        shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set'
                              ' custom_ksid_col=0xD000000000000000 WHERE id=2',
                              write=True)
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        # Row 2 will be deleted from shard 2 and inserted to shard 3.
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 1, 0, 1, 1)
        self._check_value(shard_2_master,
                          'resharding1',
                          2,
                          'msg2',
                          0xD000000000000000,
                          should_be_here=False)
        self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
                          0xD000000000000000)
        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Move row 2 back to shard 2 from shard 3 by changing the keyspace_id again.
        shard_1_master.mquery('vt_test_keyspace', 'update resharding1 set'
                              ' custom_ksid_col=0x9000000000000000 WHERE id=2',
                              write=True)
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999',
            'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        # Row 2 will be deleted from shard 3 and inserted to shard 2.
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 1, 0, 1, 1)
        self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
                          0x9000000000000000)
        self._check_value(shard_3_master,
                          'resharding1',
                          2,
                          'msg2',
                          0x9000000000000000,
                          should_be_here=False)
        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Modify the destination shard. SplitClone will revert the changes.
        # Delete row 2 (provokes an insert).
        shard_2_master.mquery('vt_test_keyspace',
                              'delete from resharding1 where id=2',
                              write=True)
        # Update row 3 (provokes an update).
        shard_3_master.mquery(
            'vt_test_keyspace',
            "update resharding1 set msg='msg-not-3' where id=3",
            write=True)
        # Insert row 4 and 5 (provokes a delete).
        self._insert_value(shard_3_master, 'resharding1', 4, 'msg4',
                           0xD000000000000000)
        self._insert_value(shard_3_master, 'resharding1', 5, 'msg5',
                           0xD000000000000000)

        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count',
            '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets',
            '1', '--max_tps', '9999', 'test_keyspace/80-'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        # Change tablet, which was taken offline, back to rdonly.
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 1, 1, 2, 0)
        self.verify_reconciliation_counters(worker_port, 'Offline',
                                            'resharding1', 0, 0, 0, 2)
        # Terminate worker daemon because it is no longer needed.
        utils.kill_sub_process(worker_proc, soft=True)

        # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl([
            'ValidateSchemaKeyspace', '--exclude_tables=unrelated',
            'test_keyspace'
        ],
                        auto_log=True)

        # check the binlog players are running and exporting vars
        self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
        self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

        # check that binlog server exported the stats vars
        self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

        # Check that the throttler was enabled.
        self.check_throttler_service(shard_2_master.rpc_endpoint(),
                                     ['BinlogPlayer/0'], 9999)
        self.check_throttler_service(shard_3_master.rpc_endpoint(),
                                     ['BinlogPlayer/0'], 9999)

        # testing filtered replication: insert a bunch of data on shard 1,
        # check we get most of it after a few seconds, wait for binlog server
        # timeout, check we get all of it.
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000)
        logging.debug('Checking 80 percent of data is sent quickly')
        v = self._check_lots_timeout(1000, 80, 5)
        if v != 100:
            # small optimization: only do this check if we don't have all the data
            # already anyway.
            logging.debug('Checking all data goes through eventually')
            self._check_lots_timeout(1000, 100, 20)
        logging.debug('Checking no data was sent the wrong way')
        self._check_lots_not_present(1000)
        self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                      seconds_behind_master_max=30)
        self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                      seconds_behind_master_max=30)
        self.check_binlog_server_vars(shard_1_slave1,
                                      horizontal=True,
                                      min_statements=1000,
                                      min_transactions=1000)

        # use vtworker to compare the data (after health-checking the destination
        # rdonly tablets so discovery works)
        utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
        logging.debug('Running vtworker SplitDiff')
        utils.run_vtworker([
            '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        utils.pause('Good time to test vtworker for diffs')

        # get status for destination master tablets, make sure we have it all
        self.check_running_binlog_player(shard_2_master, 4000, 2000)
        self.check_running_binlog_player(shard_3_master, 4000, 2000)

        # start a thread to insert data into shard_1 in the background
        # with current time, and monitor the delay
        insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000,
                                       0x9000000000000000)
        insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001,
                                       0xD000000000000000)
        monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1)
        monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2)

        # tests a failover switching serving to a different replica
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
        shard_1_slave2.wait_for_vttablet_state('SERVING')
        shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
        utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

        # test data goes through again
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000, base=1000)
        logging.debug('Checking 80 percent of data was sent quickly')
        self._check_lots_timeout(1000, 80, 5, base=1000)
        self.check_binlog_server_vars(shard_1_slave2,
                                      horizontal=True,
                                      min_statements=800,
                                      min_transactions=800)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        expect_fail=True)

        # check query service is off on master 2 and master 3, as filtered
        # replication is enabled. Even health check that is enabled on
        # master 3 should not interfere (we run it to be sure).
        utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                        auto_log=True)
        for master in [shard_2_master, shard_3_master]:
            utils.check_tablet_query_service(self, master, False, False)
            stream_health = utils.run_vtctl_json(
                ['VtTabletStreamHealth', '-count', '1', master.tablet_alias])
            logging.debug('Got health: %s', str(stream_health))
            self.assertIn('realtime_stats', stream_health)
            self.assertNotIn('serving', stream_health)

        # check the destination master 3 is healthy, even though its query
        # service is not running (if not healthy this would exception out)
        shard_3_master.get_healthz()

        # now serve rdonly from the split shards, in test_nj only
        utils.run_vtctl([
            'MigrateServedTypes', '--cells=test_nj', 'test_keyspace/80-',
            'rdonly'
        ],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_srv_keyspace(
            'test_ny',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

        # now serve rdonly from the split shards, everywhere
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_srv_keyspace(
            'test_ny',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
        utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
        utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

        # then serve replica from the split shards
        destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-c0 c0-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_1_slave2, False, True)

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
            auto_log=True)
        # After a backwards migration, queryservice should be enabled on
        # source and disabled on destinations
        utils.check_tablet_query_service(self, shard_1_slave2, True, False)
        # Destination tablets would have query service disabled for other
        # reasons than the migration, so check the shard record instead of
        # the tablets directly.
        utils.check_shard_query_services(self, destination_shards,
                                         topodata_pb2.REPLICA, False)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                        auto_log=True)
        # After a forwards migration, queryservice should be disabled on
        # source and enabled on destinations
        utils.check_tablet_query_service(self, shard_1_slave2, False, True)
        # Destination tablets would have query service disabled for other
        # reasons than the migration, so check the shard record instead of
        # the tablets directly
        utils.check_shard_query_services(self, destination_shards,
                                         topodata_pb2.REPLICA, True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-c0 c0-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # reparent shard_2 to shard_2_replica1, then insert more data and
        # see it flow through still
        utils.run_vtctl([
            'PlannedReparentShard', 'test_keyspace/80-c0',
            shard_2_replica1.tablet_alias
        ])

        # update our test variables to point at the new master
        shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

        logging.debug(
            'Inserting lots of data on source shard after reparenting')
        self._insert_lots(3000, base=2000)
        logging.debug('Checking 80 percent of data was sent fairly quickly')
        self._check_lots_timeout(3000, 80, 10, base=2000)

        # use vtworker to compare the data again
        logging.debug('Running vtworker SplitDiff')
        utils.run_vtworker([
            '-cell', 'test_nj', 'SplitDiff', '--exclude_tables', 'unrelated',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/c0-'
        ],
                           auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)
        utils.run_vtctl(
            ['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
            auto_log=True)

        # going to migrate the master now, check the delays
        monitor_thread_1.done = True
        monitor_thread_2.done = True
        insert_thread_1.done = True
        insert_thread_2.done = True
        logging.debug(
            'DELAY 1: %s max_lag=%d ms avg_lag=%d ms',
            monitor_thread_1.thread_name, monitor_thread_1.max_lag_ms,
            monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count)
        logging.debug(
            'DELAY 2: %s max_lag=%d ms avg_lag=%d ms',
            monitor_thread_2.thread_name, monitor_thread_2.max_lag_ms,
            monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count)

        # mock with the SourceShard records to test 'vtctl SourceShardDelete'
        # and 'vtctl SourceShardAdd'
        utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                        auto_log=True)
        utils.run_vtctl([
            'SourceShardAdd', '--key_range=80-', 'test_keyspace/c0-', '0',
            'test_keyspace/80-'
        ],
                        auto_log=True)

        # then serve master from the split shards, make sure the source master's
        # query service is now turned off
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-c0 c0-\n'
            'Partitions(rdonly): -80 80-c0 c0-\n'
            'Partitions(replica): -80 80-c0 c0-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')
        utils.check_tablet_query_service(self, shard_1_master, False, True)

        # check the binlog players are gone now
        self.check_no_binlog_player(shard_2_master)
        self.check_no_binlog_player(shard_3_master)

        # delete the original tablets in the original shard
        tablet.kill_tablets([
            shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
            shard_1_rdonly1
        ])
        for t in [
                shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
                shard_1_rdonly1
        ]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
        utils.run_vtctl(
            ['DeleteTablet', '-allow_master', shard_1_master.tablet_alias],
            auto_log=True)

        # rebuild the serving graph, all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # test RemoveShardCell
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/-80', 'test_nj'],
                        auto_log=True,
                        expect_fail=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_nj'],
                        auto_log=True)
        utils.run_vtctl(['RemoveShardCell', 'test_keyspace/80-', 'test_ny'],
                        auto_log=True)
        shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
        self.assertNotIn('cells', shard)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

        # kill everything
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_ny_rdonly, shard_2_master,
            shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
            shard_3_master, shard_3_replica, shard_3_rdonly1
        ])
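
# The InsertThread/MonitorLagThread pair above measures filtered-replication
# lag by stamping rows with the current time on the source master and polling
# a destination replica for when they arrive. A minimal standalone sketch of
# the monitoring side (the class name, column name and query are illustrative
# assumptions, not the actual test helpers):
import threading
import time

class LagMonitor(threading.Thread):

    def __init__(self, tablet_obj, table):
        super(LagMonitor, self).__init__()
        self.tablet_obj = tablet_obj
        self.table = table
        self.max_lag_ms = 0
        self.done = False
        self.start()

    def run(self):
        # Poll the newest timestamped row and track the worst lag seen.
        while not self.done:
            rows = self.tablet_obj.mquery(
                'vt_test_keyspace',
                'select time_created_ns from %s order by id desc limit 1' %
                self.table)
            if rows:
                lag_ms = (time.time() * 1e9 - rows[0][0]) / 1e6
                self.max_lag_ms = max(self.max_lag_ms, lag_ms)
            time.sleep(0.1)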
Example #43
  def test_resharding(self):
    # we're going to reparent and swap these two
    global shard_2_master, shard_2_replica1

    utils.run_vtctl(['CreateKeyspace',
                     '--sharding_column_name', 'bad_column',
                     '--sharding_column_type', 'bytes',
                     'test_keyspace'])
    utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
                     'custom_ksid_col', 'uint64'], expect_fail=True)
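    # The call above fails because the keyspace already has sharding info set
    # (to 'bad_column'); -force below is needed to overwrite it.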
    utils.run_vtctl(['SetKeyspaceShardingInfo', '-force',
                     'test_keyspace',
                     'custom_ksid_col', base_sharding.keyspace_id_type])

    shard_0_master.init_tablet('master', 'test_keyspace', '-80')
    shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
    shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
    shard_1_master.init_tablet('master', 'test_keyspace', '80-')
    shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
    shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
    shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
    shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
    ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
    self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

    # we set full_mycnf_args to True as a test in the KIT_BYTES case
    full_mycnf_args = (base_sharding.keyspace_id_type ==
                       keyrange_constants.KIT_BYTES)

    # create databases so vttablet can start behaving somewhat normally
    for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
              shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
              shard_1_rdonly1]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)

    # wait for the tablets (replication is not set up, so the slaves won't be
    # healthy)
    shard_0_master.wait_for_vttablet_state('SERVING')
    shard_0_replica.wait_for_vttablet_state('NOT_SERVING')
    shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
    shard_1_master.wait_for_vttablet_state('SERVING')
    shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
    shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
    shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
    shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
                     shard_0_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
                     shard_1_master.tablet_alias], auto_log=True)

    # check the shards
    shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
    self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
    self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
    self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))

    # create the tables
    self._create_schema()
    self._insert_startup_values()
    self._test_keyrange_constraints()

    # run a health check on source replicas so they respond to discovery
    # (for binlog players) and on the source rdonlys (for workers)
    for t in [shard_0_replica, shard_1_slave1]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
    for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

    # create the split shards
    shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
    shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
    shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
    shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0')
    shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
    shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
    shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')

    # start vttablet on the split shards (no db created,
    # so they're all not serving)
    shard_2_master.start_vttablet(wait_for_state=None)
    shard_3_master.start_vttablet(wait_for_state=None)
    for t in [shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
              shard_3_replica, shard_3_rdonly1]:
      t.start_vttablet(wait_for_state=None)
    for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
              shard_2_rdonly1,
              shard_3_master, shard_3_replica, shard_3_rdonly1]:
      t.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0',
                     shard_2_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-',
                     shard_3_master.tablet_alias], auto_log=True)

    # check the shards
    shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
    for s in ['-80', '80-', '80-c0', 'c0-']:
      self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
    self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                    auto_log=True)
    utils.check_srv_keyspace(
        'test_nj', 'test_keyspace',
        'Partitions(master): -80 80-\n'
        'Partitions(rdonly): -80 80-\n'
        'Partitions(replica): -80 80-\n',
        keyspace_id_type=base_sharding.keyspace_id_type,
        sharding_column_name='custom_ksid_col')

    # disable shard_1_slave2, so we're sure filtered replication will go
    # from shard_1_slave1
    utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
    shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')

    # we need to create the schema, and the worker will do data copying
    for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
      utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
                       shard_1_rdonly1.tablet_alias, keyspace_shard],
                      auto_log=True)

    # Run vtworker as daemon for the following SplitClone commands.
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--command_display_interval', '10ms'],
        auto_log=True)

    # Copy the data from the source to the destination shards.
    # min_table_size_for_split is set to 1 so as to force a split even on the
    # small table we have.
    # --max_tps is only specified to enable the throttler and ensure that the
    # code is executed. But the intent here is not to throttle the test, hence
    # the rate limit is set very high.
    #
    # Initial clone (online).
    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--offline=false',
         '--exclude_tables', 'unrelated',
         '--min_table_size_for_split', '1',
         '--min_healthy_rdonly_tablets', '1',
         '--max_tps', '9999',
         'test_keyspace/80-'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
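    # The trailing 2, 0, 0 are presumably expected insert/update/delete
    # counts: the initial online clone should only have inserted the two
    # startup rows whose keyspace IDs fall in the source shard 80-.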
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        2, 0, 0)

    # Reset vtworker such that we can run the next command.
    workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
    utils.wait_procs([workerclient_proc])

    # Modify the destination shard. SplitClone will revert the changes.
    # Delete row 2 (provokes an insert).
    shard_2_master.mquery('vt_test_keyspace',
                          'delete from resharding1 where id=2', write=True)
    # Update row 3 (provokes an update).
    shard_3_master.mquery('vt_test_keyspace',
                          "update resharding1 set msg='msg-not-3' where id=3",
                          write=True)
    # Insert row 4 (provokes a delete).
    self._insert_value(shard_3_master, 'resharding1', 4, 'msg4',
                       0xD000000000000000)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--exclude_tables', 'unrelated',
         '--min_table_size_for_split', '1',
         '--min_healthy_rdonly_tablets', '1',
         '--max_tps', '9999',
         'test_keyspace/80-'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    # Change the tablet, which the worker took offline, back to rdonly.
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
                     'rdonly'], auto_log=True)
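    # The counters should reconcile the three changes made above: 1 insert
    # (restores row 2), 1 update (reverts row 3) and 1 delete (removes
    # row 4); the offline pass then has nothing left to fix (0, 0, 0).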
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        1, 1, 1)
    self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                        0, 0, 0)
    # Terminate worker daemon because it is no longer needed.
    utils.kill_sub_process(worker_proc, soft=True)

    # TODO(alainjobart): experiment with the dontStartBinlogPlayer option

    # check the startup values are in the right place
    self._check_startup_values()

    # check the schema too
    utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated',
                     'test_keyspace'], auto_log=True)

    # check the binlog players are running and exporting vars
    self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
    self.check_destination_master(shard_3_master, ['test_keyspace/80-'])

    # check that binlog server exported the stats vars
    self.check_binlog_server_vars(shard_1_slave1, horizontal=True)

    # Check that the throttler was enabled.
    self.check_binlog_throttler(shard_2_master.rpc_endpoint(),
                                ['BinlogPlayer/0'], 9999)
    self.check_binlog_throttler(shard_3_master.rpc_endpoint(),
                                ['BinlogPlayer/0'], 9999)

    # testing filtered replication: insert a bunch of data on shard 1,
    # check we get most of it after a few seconds, wait for binlog server
    # timeout, check we get all of it.
    logging.debug('Inserting lots of data on source shard')
    self._insert_lots(1000)
    logging.debug('Checking 80 percent of data is sent quickly')
    v = self._check_lots_timeout(1000, 80, 5)
    if v != 100:
      # small optimization: only do this check if we don't have all the data
      # already anyway.
      logging.debug('Checking all data goes through eventually')
      self._check_lots_timeout(1000, 100, 20)
    logging.debug('Checking no data was sent the wrong way')
    self._check_lots_not_present(1000)
    self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
                                  seconds_behind_master_max=30)
    self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
                                  seconds_behind_master_max=30)
    self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
                                  min_statements=1000, min_transactions=1000)

    # use vtworker to compare the data (after health-checking the destination
    # rdonly tablets so discovery works)
    utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
    logging.debug('Running vtworker SplitDiff')
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        'test_keyspace/c0-'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
                    auto_log=True)

    utils.pause('Good time to test vtworker for diffs')

    # get status for destination master tablets, make sure we have it all
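    # (4000 and 2000 are presumably the minimum query and transaction counts,
    # accumulated from the two 1000-row insert batches above)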
    self.check_running_binlog_player(shard_2_master, 4000, 2000)
    self.check_running_binlog_player(shard_3_master, 4000, 2000)

    # start a thread to insert data into shard_1 in the background
    # with current time, and monitor the delay
    insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000,
                                   0x9000000000000000)
    insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001,
                                   0xD000000000000000)
    monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1)
    monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2)

    # tests a failover switching serving to a different replica
    utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
    utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
    shard_1_slave2.wait_for_vttablet_state('SERVING')
    shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
    utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])

    # test data goes through again
    logging.debug('Inserting lots of data on source shard')
    self._insert_lots(1000, base=1000)
    logging.debug('Checking 80 percent of data was sent quickly')
    self._check_lots_timeout(1000, 80, 5, base=1000)
    self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
                                  min_statements=800, min_transactions=800)

    # check we can't migrate the master just yet
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                    expect_fail=True)

    # check query service is off on master 2 and master 3, as filtered
    # replication is enabled. Even the health check that is enabled on
    # master 3 should not interfere (we run it to be sure).
    utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
                    auto_log=True)
    for master in [shard_2_master, shard_3_master]:
      utils.check_tablet_query_service(self, master, False, False)
      stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                            '-count', '1',
                                            master.tablet_alias])
      logging.debug('Got health: %s', str(stream_health))
      self.assertIn('realtime_stats', stream_health)
      self.assertNotIn('serving', stream_health)

    # check the destination master 3 is healthy, even though its query
    # service is not running (if not healthy, this would raise an exception)
    shard_3_master.get_healthz()

    # now serve rdonly from the split shards, in test_nj only
    utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
                     'test_keyspace/80-', 'rdonly'], auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_srv_keyspace('test_ny', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
    utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
    utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

    # now serve rdonly from the split shards, everywhere
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_srv_keyspace('test_ny', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
    utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
    utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)

    # then serve replica from the split shards
    destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']

    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-c0 c0-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_1_slave2, False, True)

    # move replica back and forth
    utils.run_vtctl(
        ['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
        auto_log=True)
    # After a backwards migration, queryservice should be enabled on
    # source and disabled on destinations
    utils.check_tablet_query_service(self, shard_1_slave2, True, False)
    # Destination tablets would have query service disabled for reasons other
    # than the migration, so check the shard record instead of the tablets
    # directly.
    utils.check_shard_query_services(self, destination_shards,
                                     topodata_pb2.REPLICA, False)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
                    auto_log=True)
    # After a forwards migration, queryservice should be disabled on
    # source and enabled on destinations
    utils.check_tablet_query_service(self, shard_1_slave2, False, True)
    # Destination tablets would have query service disabled for reasons other
    # than the migration, so check the shard record instead of the tablets
    # directly.
    utils.check_shard_query_services(self, destination_shards,
                                     topodata_pb2.REPLICA, True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-c0 c0-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # reparent shard_2 to shard_2_replica1, then insert more data and
    # see it flow through still
    utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0',
                     shard_2_replica1.tablet_alias])

    # update our test variables to point at the new master
    shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master

    logging.debug('Inserting lots of data on source shard after reparenting')
    self._insert_lots(3000, base=2000)
    logging.debug('Checking 80 percent of data was sent fairly quickly')
    self._check_lots_timeout(3000, 80, 10, base=2000)

    # use vtworker to compare the data again
    logging.debug('Running vtworker SplitDiff')
    utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        'test_keyspace/c0-'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
                    auto_log=True)

    # going to migrate the master now, check the delays
    monitor_thread_1.done = True
    monitor_thread_2.done = True
    insert_thread_1.done = True
    insert_thread_2.done = True
    logging.debug('DELAY 1: %s max_lag=%d ms avg_lag=%d ms',
                  monitor_thread_1.thread_name,
                  monitor_thread_1.max_lag_ms,
                  monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count)
    logging.debug('DELAY 2: %s max_lag=%d ms avg_lag=%d ms',
                  monitor_thread_2.thread_name,
                  monitor_thread_2.max_lag_ms,
                  monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count)

    # mess with the SourceShard records to test 'vtctl SourceShardDelete'
    # and 'vtctl SourceShardAdd'
    utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
                    auto_log=True)
    utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
                     'test_keyspace/c0-', '0', 'test_keyspace/80-'],
                    auto_log=True)

    # then serve master from the split shards, make sure the source master's
    # query service is now turned off
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-c0 c0-\n'
                             'Partitions(rdonly): -80 80-c0 c0-\n'
                             'Partitions(replica): -80 80-c0 c0-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_1_master, False, True)

    # check the binlog players are gone now
    self.check_no_binlog_player(shard_2_master)
    self.check_no_binlog_player(shard_3_master)

    # delete the original tablets in the original shard
    tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
                         shard_1_ny_rdonly, shard_1_rdonly1])
    for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
              shard_1_rdonly1]:
      utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
    utils.run_vtctl(['DeleteTablet', '-allow_master',
                     shard_1_master.tablet_alias], auto_log=True)

    # rebuild the serving graph, all mentions of the old shards should be gone
    utils.run_vtctl(
        ['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    # test RemoveShardCell
    utils.run_vtctl(
        ['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True,
        expect_fail=True)
    utils.run_vtctl(
        ['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True)
    utils.run_vtctl(
        ['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True)
    shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
    self.assertNotIn('cells', shard)

    # delete the original shard
    utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)

    # kill everything
    tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
                         shard_2_master, shard_2_replica1, shard_2_replica2,
                         shard_2_rdonly1,
                         shard_3_master, shard_3_replica, shard_3_rdonly1])
Beispiel #44
0
def run_test_secure():
    zkocc_server = utils.zkocc_start()

    # start the tablets
    shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                  key=cert_dir + "/vt-server-key.pem")
    shard_0_slave.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                 key=cert_dir + "/vt-server-key.pem",
                                 repl_extra_flags={
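                                     # 2048 corresponds to MySQL's CLIENT_SSL
                                     # capability flag (0x800), presumably
                                     # making the slave replicate over SSL.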
                                     'flags': 2048,
                                     'ssl_ca': cert_dir + "/ca-cert.pem",
                                     'ssl_cert': cert_dir + "/client-cert.pem",
                                     'ssl_key': cert_dir + "/client-key.pem",
                                 })

    # Reparent using SSL
    for t in [shard_0_master, shard_0_slave]:
        t.reset_replication()
    utils.run_vtctl('ReparentShard -force test_keyspace/0 ' +
                    shard_0_master.tablet_alias,
                    auto_log=True)

    # then get the topology and check it
    zkocc_client = zkocc.ZkOccConnection(
        "localhost:%u" % utils.zkocc_port_base, "test_nj", 30.0)
    topology.read_keyspaces(zkocc_client)

    shard_0_master_addrs = topology.get_host_port_by_name(
        zkocc_client, "test_keyspace.0.master:_vts")
    if len(shard_0_master_addrs) != 1:
        raise utils.TestError(
            'topology.get_host_port_by_name failed for "test_keyspace.0.master:_vts", got: %s'
            % " ".join([
                "%s:%u(%s)" % (h, p, str(e))
                for (h, p, e) in shard_0_master_addrs
            ]))
    if not shard_0_master_addrs[0][2]:
        raise utils.TestError(
            'topology.get_host_port_by_name result for '
            '"test_keyspace.0.master:_vts" is not encrypted')
    utils.debug("shard 0 master addrs: %s" % " ".join(
        ["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs]))

    # make sure asking for optionally secure connections works too
    auto_addrs = topology.get_host_port_by_name(
        zkocc_client, "test_keyspace.0.master:_vtocc", encrypted=True)
    if auto_addrs != shard_0_master_addrs:
        raise utils.TestError(
            'topology.get_host_port_by_name doesn\'t resolve encrypted addresses properly: %s != %s'
            % (str(shard_0_master_addrs), str(auto_addrs)))

    # try to connect with a regular client
    try:
        conn = tablet3.TabletConnection(
            "%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
            "test_keyspace", "0", 10.0)
        conn.dial()
        raise utils.TestError("No exception raised to secure port")
    except tablet3.FatalError as e:
        if not e.args[0][0].startswith('Unexpected EOF in handshake to'):
            raise utils.TestError("Unexpected exception: %s" % str(e))

    sconn = utils.get_vars(shard_0_master.port)["SecureConns"]
    if sconn != 0:
        raise utils.TestError("unexpected conns %s" % sconn)

    # connect to encrypted port
    conn = tablet3.TabletConnection(
        "%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
        "test_keyspace",
        "0",
        5.0,
        encrypted=True)
    conn.dial()
    (results, rowcount, lastrowid,
     fields) = conn._execute("select 1 from dual", {})
    if len(results) != 1 or results[0][0] != 1:
        print "conn._execute returned:", results
        raise utils.TestError('wrong conn._execute output')

    sconn = utils.get_vars(shard_0_master.port)["SecureConns"]
    if sconn != 1:
        raise utils.TestError("unexpected conns %s" % sconn)
    saccept = utils.get_vars(shard_0_master.port)["SecureAccepts"]
    if saccept == 0:
        raise utils.TestError("unexpected accepts %s" % saccept)

    # trigger a timeout on a secure connection, see what exception we get
    try:
        conn._execute("select sleep(100) from dual", {})
        raise utils.TestError("No timeout exception")
    except tablet3.TimeoutError as e:
        utils.debug("Got the right exception for SSL timeout: %s" % str(e))

    # kill everything
    utils.kill_sub_process(zkocc_server)
Beispiel #45
0
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to re-resolve topology and retry
      writes due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
      logging.debug('Shutting down mysqld on destination masters.')
      utils.wait_procs(
          [shard_0_master.shutdown_mysql(),
           shard_1_master.shutdown_mysql()])

    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--use_v3_resharding_mode=false'],
        auto_log=True)

    # --max_tps is only specified to enable the throttler and ensure that the
    # code is executed. But the intent here is not to throttle the test, hence
    # the rate limit is set very high.
    # --chunk_count is 2 because rows are currently ordered by primary key such
    # that all rows of the first shard come first, followed by those of the
    # second shard.
    # TODO(mberlin): Remove --offline=false once vtworker ensures that the
    #                destination shards are not behind the master's replication
    #                position.
    args = ['SplitClone',
            '--offline=false',
            '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1',
            '--max_tps', '9999']
    if not mysql_down:
      # Make the clone as slow as necessary such that there is enough time to
      # run PlannedReparent in the meantime.
      # TODO(mberlin): Once insert_values is fixed to uniformly distribute the
      #                rows across shards when sorted by primary key, remove
      #                --chunk_count 2, --min_rows_per_chunk 1 and set
      #                --source_reader_count back to 1.
      args.extend(['--source_reader_count', '2',
                   '--chunk_count', '2',
                   '--min_rows_per_chunk', '1',
                   '--write_query_max_rows', '1'])
    args.append('test_keyspace/0')
    workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

    if mysql_down:
      # If MySQL is down, we wait until vtworker has retried at least once to
      # make sure it reached the point where a write failed due to MySQL being
      # down. There should be at least two retries, one for each destination
      # shard.
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerRetryCount >= 2',
          condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
      logging.debug('Worker has retried at least twice, starting reparent now')

      # vtworker is blocked at this point. This is a good time to test that its
      # throttler server is reacting to RPCs.
      self.check_throttler_service('localhost:%d' % worker_rpc_port,
                                   ['test_keyspace/-80', 'test_keyspace/80-'],
                                   9999)

      # Bring back masters. Since we test with semi-sync now, we need at least
      # one replica for the new master. This test is already quite expensive,
      # so we bring back the old master as a replica rather than having a third
      # replica up the whole time.
      logging.debug('Restarting mysqld on destination masters')
      utils.wait_procs(
          [shard_0_master.start_mysql(),
           shard_1_master.start_mysql()])

      # Reparent away from the old masters.
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80',
           '-new_master', shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-',
           '-new_master', shard_1_replica.tablet_alias], auto_log=True)

    else:
      # NOTE: There is a race condition around this:
      #   It's possible that the SplitClone vtworker command finishes before the
      #   PlannedReparentShard vtctl command, which we start below, succeeds.
      #   Then the test would fail because vtworker did not have to retry.
      #
      # To work around this, the test takes a parameter to increase the number
      # of rows that the worker has to copy (with the idea being to slow the
      # worker down).
      # You should choose a value for num_insert_rows such that this test
      # passes in your environment (trial-and-error...).
      # Make sure that vtworker got past the point where it picked a master
      # for each destination shard ("finding targets" state).
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerState == cloning the data (online)',
          condition_fn=lambda v: v.get('WorkerState') == 'cloning the'
          ' data (online)')
      logging.debug('Worker is in copy state, starting reparent now')

      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80',
           '-new_master', shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-',
           '-new_master', shard_1_replica.tablet_alias], auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to re-resolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(worker_vars['WorkerRetryCount'], 1,
                       "expected vtworker to retry each of the two reparented"
                       " destination masters at least once, but it didn't")
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Wait for the destination RDONLYs to catch up, or the following offline
    # clone will try to insert rows which already exist.
    # TODO(mberlin): Remove this once SplitClone supports it natively.
    utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
    utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)
    # Run final offline clone to enable filtered replication.
    _, _ = utils.run_vtworker(['-cell', 'test_nj',
                               '--use_v3_resharding_mode=false',
                               'SplitClone',
                               '--online=false',
                               '--min_healthy_rdonly_tablets', '1',
                               'test_keyspace/0'], auto_log=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Beispiel #46
0
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to re-resolve topology and retry
      writes due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
      logging.debug('Shutting down mysqld on destination masters.')
      utils.wait_procs(
          [shard_0_master.shutdown_mysql(),
           shard_1_master.shutdown_mysql()])

    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj'],
        auto_log=True)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--source_reader_count', '1',
         '--destination_pack_count', '1',
         '--destination_writer_count', '1',
         'test_keyspace/0'],
        worker_rpc_port)

    if mysql_down:
      # If MySQL is down, we wait until the worker has resolved at least twice
      # (to verify that it does re-resolve and retry while MySQL is down).
      worker_vars = utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerDestinationActualResolves >= 2',
          condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 2)
      self.assertNotEqual(
          worker_vars['WorkerRetryCount'], {},
          "expected vtworker to retry, but it didn't")
      logging.debug('Worker has resolved at least twice, starting reparent now')

      # Bring back masters. Since we test with semi-sync now, we need at least
      # one replica for the new master. This test is already quite expensive,
      # so we bring back the old master as a replica rather than having a third
      # replica up the whole time.
      logging.debug('Restarting mysqld on destination masters')
      utils.wait_procs(
          [shard_0_master.start_mysql(),
           shard_1_master.start_mysql()])

      # Reparent away from the old masters.
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/-80',
           shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/80-',
           shard_1_replica.tablet_alias], auto_log=True)

    else:
      # NOTE: There is a race condition around this:
      #   It's possible that the SplitClone vtworker command finishes before the
      #   PlannedReparentShard vtctl command, which we start below, succeeds.
      #   Then the test would fail because vtworker did not have to resolve the
      #   master tablet again (due to the missing reparent).
      #
      # To work around this, the test takes a parameter to increase the number
      # of rows that the worker has to copy (with the idea being to slow the
      # worker down).
      # You should choose a value for num_insert_rows such that this test
      # passes in your environment (trial-and-error...).
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerDestinationActualResolves >= 1',
          condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 1)
      logging.debug('Worker has resolved at least once, starting reparent now')

      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/-80',
           shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', 'test_keyspace/80-',
           shard_1_replica.tablet_alias], auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to re-resolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(worker_vars['WorkerDestinationActualResolves'], 1)
    self.assertGreater(worker_vars['WorkerDestinationAttemptedResolves'], 1)
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Beispiel #47
0
def tearDownModule():
  utils.kill_sub_process(vtgateclienttest_process, soft=True)
  vtgateclienttest_process.wait()

  environment.topo_server().teardown()
Beispiel #48
0
  def test_zkocc(self):
    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start(extra_params=['-connect-timeout=2s', '-cache-refresh-interval=1s'])
    time.sleep(1)

    # create a Python client. The first address is bad; this tests the retry logic
    bad_port = environment.reserve_ports(3)
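    # reserve_ports presumably returns the base of a block of 3 free ports;
    # nothing listens on them, so they serve as the "bad" addresses below.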
    zkocc_client = zkocc.ZkOccConnection("localhost:%u,localhost:%u,localhost:%u" % (bad_port, environment.zkocc_port_base, bad_port+1), "test_nj", 30)
    zkocc_client.dial()

    # test failure for a python client that cannot connect
    bad_zkocc_client = zkocc.ZkOccConnection("localhost:%u,localhost:%u" % (bad_port+2, bad_port), "test_nj", 30)
    try:
      bad_zkocc_client.dial()
      self.fail('exception expected')
    except zkocc.ZkOccError as e:
      if not str(e).startswith("Cannot dial to any server, tried: "):
        self.fail('unexpected exception: %s' % str(e))
    level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.ERROR)

    # FIXME(ryszard): This can be changed into a self.assertRaises.
    try:
      bad_zkocc_client.get("/zk/test_nj/vt/zkocc1/data1")
      self.fail('exception expected')
    except zkocc.ZkOccError as e:
      if not str(e).startswith("Cannot dial to any server, tried: "):
        self.fail('unexpected exception: %s' % str(e))

    logging.getLogger().setLevel(level)

    # get test
    out, err = utils.run(environment.binary_path('zkclient2')+' -server localhost:%u /zk/test_nj/vt/zkocc1/data1' % environment.zkocc_port_base, trap_output=True)
    self.assertEqual(err, "/zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=false, Stale=false)\n")

    zk_data = zkocc_client.get("/zk/test_nj/vt/zkocc1/data1")
    self.assertDictContainsSubset({'Data': "Test data 1",
                                   'Cached': True,
                                   'Stale': False,},
                                  zk_data)
    self.assertDictContainsSubset({'NumChildren': 0, 'Version': 0}, zk_data['Stat'])

    # getv test
    out, err = utils.run(environment.binary_path('zkclient2')+' -server localhost:%u /zk/test_nj/vt/zkocc1/data1 /zk/test_nj/vt/zkocc1/data2 /zk/test_nj/vt/zkocc1/data3' % environment.zkocc_port_base, trap_output=True)
    self.assertEqualNormalized(err, """[0] /zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=false)
  [1] /zk/test_nj/vt/zkocc1/data2 = Test data 2 (NumChildren=0, Version=0, Cached=false, Stale=false)
  [2] /zk/test_nj/vt/zkocc1/data3 = Test data 3 (NumChildren=0, Version=0, Cached=false, Stale=false)
  """)
    zk_data = zkocc_client.getv(["/zk/test_nj/vt/zkocc1/data1", "/zk/test_nj/vt/zkocc1/data2", "/zk/test_nj/vt/zkocc1/data3"])['Nodes']
    self.assertEqual(len(zk_data), 3)
    for i, d in enumerate(zk_data):
      self.assertEqual(d['Data'], 'Test data %s' % (i + 1))
      self.assertTrue(d['Cached'])
      self.assertFalse(d['Stale'])
      self.assertDictContainsSubset({'NumChildren': 0, 'Version': 0}, d['Stat'])

    # children test
    out, err = utils.run(environment.binary_path('zkclient2')+' -server localhost:%u -mode children /zk/test_nj/vt' % environment.zkocc_port_base, trap_output=True)
    self.assertEqualNormalized(err, """Path = /zk/test_nj/vt
  Child[0] = zkocc1
  Child[1] = zkocc2
  NumChildren = 2
  CVersion = 2
  Cached = false
  Stale = false
  """)

    # zk command tests
    self._check_zk_output("cat /zk/test_nj/vt/zkocc1/data1", "Test data 1")
    self._check_zk_output("ls -l /zk/test_nj/vt/zkocc1", """total: 3
  -rw-rw-rw- zk zk       11  %s data1
  -rw-rw-rw- zk zk       11  %s data2
  -rw-rw-rw- zk zk       11  %s data3
  """ % (_format_time(zk_data[0]['Stat']['MTime']),
         _format_time(zk_data[1]['Stat']['MTime']),
         _format_time(zk_data[2]['Stat']['MTime'])))

    # test that /zk/local is not resolved and is rejected
    out, err = utils.run(environment.binary_path('zkclient2')+' -server localhost:%u /zk/local/vt/zkocc1/data1' % environment.zkocc_port_base, trap_output=True, raise_on_error=False)
    self.assertIn("zkocc: cannot resolve local cell", err)

    # start a background process to query the same value over and over again
    # while we kill the zk server and restart it
    outfd = tempfile.NamedTemporaryFile(dir=environment.tmproot, delete=False)
    filename = outfd.name
    querier = utils.run_bg('/bin/bash -c "while true ; do '+environment.binary_path('zkclient2')+' -server localhost:%u /zk/test_nj/vt/zkocc1/data1 ; sleep 0.1 ; done"' % environment.zkocc_port_base, stderr=outfd.file)
    outfd.close()
    time.sleep(1)

    # kill zk server, sleep a bit, restart zk server, sleep a bit
    utils.run(environment.binary_path('zkctl')+' -zk.cfg 1@'+utils.hostname+':%u:%u:%u shutdown' % (environment.zk_port_base, environment.zk_port_base+1, environment.zk_port_base+2))
    time.sleep(3)
    utils.run(environment.binary_path('zkctl')+' -zk.cfg 1@'+utils.hostname+':%u:%u:%u start' % (environment.zk_port_base, environment.zk_port_base+1, environment.zk_port_base+2))
    time.sleep(3)

    utils.kill_sub_process(querier)

    logging.debug("Checking %s", filename)
    fd = open(filename, "r")
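    # Expected progression while zk is bounced: state 0 = only fresh
    # (Stale=false) reads so far, state 1 = stale reads while zk is down,
    # state 2 = fresh reads again after zk came back.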
    state = 0
    for line in fd:
      if line == "/zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=false)\n":
        stale = False
      elif line == "/zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=true)\n":
        stale = True
      else:
        self.fail('unexpected line: %s' % line)
      if state == 0:
        if stale:
          state = 1
      elif state == 1:
        if not stale:
          state = 2
      else:
        if stale:
          self.fail('unexpected stale state')
    self.assertEqual(state, 2)
    fd.close()

    utils.zkocc_kill(zkocc_14850)

    # check that after the server is gone, the python client fails correctly
    level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.ERROR)
    try:
      zkocc_client.get("/zk/test_nj/vt/zkocc1/data1")
      self.fail('exception expected')
    except zkocc.ZkOccError as e:
      if not str(e).startswith("Cannot dial to any server, tried: "):
        self.fail('unexpected exception: %s' % str(e))
    logging.getLogger().setLevel(level)
Beispiel #49
0
    def test_sharding(self):

        shard_0_master.init_tablet("master", "test_keyspace", "-80")
        shard_0_replica.init_tablet("replica", "test_keyspace", "-80")
        shard_1_master.init_tablet("master", "test_keyspace", "80-")
        shard_1_replica.init_tablet("replica", "test_keyspace", "80-")

        utils.run_vtctl("RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*", auto_log=True)

        utils.run_vtctl("RebuildKeyspaceGraph /zk/global/vt/keyspaces/*", auto_log=True)

        # run checks now before we start the tablets
        utils.validate_topology()

        # create databases
        shard_0_master.create_db("vt_test_keyspace")
        shard_0_replica.create_db("vt_test_keyspace")
        shard_1_master.create_db("vt_test_keyspace")
        shard_1_replica.create_db("vt_test_keyspace")

        # start the tablets
        shard_0_master.start_vttablet()
        shard_0_replica.start_vttablet()
        shard_1_master.start_vttablet()
        shard_1_replica.start_vttablet()

        # apply the schema on the first shard through vtctl, so all tablets
        # are the same (replication is not enabled yet, so -stop-replication
        # is just there to be tested)
        utils.run_vtctl(
            [
                "ApplySchema",
                "-stop-replication",
                "-sql=" + create_vt_select_test.replace("\n", ""),
                shard_0_master.tablet_alias,
            ]
        )
        utils.run_vtctl(
            [
                "ApplySchema",
                "-stop-replication",
                "-sql=" + create_vt_select_test.replace("\n", ""),
                shard_0_replica.tablet_alias,
            ]
        )

        # start zkocc, we'll use it later
        zkocc_server = utils.zkocc_start()

        for t in [shard_0_master, shard_0_replica, shard_1_master, shard_1_replica]:
            t.reset_replication()
        utils.run_vtctl("ReparentShard -force test_keyspace/-80 " + shard_0_master.tablet_alias, auto_log=True)
        utils.run_vtctl("ReparentShard -force test_keyspace/80- " + shard_1_master.tablet_alias, auto_log=True)

        # apply the schema on the second shard using a simple schema upgrade
        utils.run_vtctl(
            [
                "ApplySchemaShard",
                "-simple",
                "-sql=" + create_vt_select_test_reverse.replace("\n", ""),
                "test_keyspace/80-",
            ]
        )

        # insert some values directly (db is RO after minority reparent)
        # FIXME(alainjobart) these values don't match the shard map
        utils.run_vtctl("SetReadWrite " + shard_0_master.tablet_alias)
        utils.run_vtctl("SetReadWrite " + shard_1_master.tablet_alias)
        shard_0_master.mquery(
            "vt_test_keyspace", "insert into vt_select_test (id, msg) values (1, 'test 1')", write=True
        )
        shard_1_master.mquery(
            "vt_test_keyspace", "insert into vt_select_test (id, msg) values (10, 'test 10')", write=True
        )

        utils.validate_topology(ping_tablets=True)

        utils.pause("Before the sql scatter query")

        # note the order of the rows is not guaranteed, as the goroutines
        # doing the work can go out of order
        self._check_rows(["Index\tid\tmsg", "1\ttest 1", "10\ttest 10"])

        # write a value, re-read them all
        utils.vtclient2(
            3803,
            "/test_nj/test_keyspace/master",
            "insert into vt_select_test (id, msg) values (:keyspace_id, 'test 2')",
            bindvars='{"keyspace_id": 2}',
            driver="vtdb",
            verbose=True,
        )
        self._check_rows(["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"])

        # make sure the '2' value was written on the first shard
        rows = shard_0_master.mquery("vt_test_keyspace", "select id, msg from vt_select_test order by id")
        self.assertEqual(rows, ((1, "test 1"), (2, "test 2")), "wrong mysql_query output: %s" % str(rows))

        utils.pause("After db writes")

        # now use zkocc or streaming or both for the same query
        self._check_rows(["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"], driver="vtdb-zkocc")
        self._check_rows(["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"], driver="vtdb-streaming")
        self._check_rows(["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"], driver="vtdb-zkocc-streaming")

        # make sure the schema checking works
        self._check_rows_schema_diff("vtdb-zkocc")
        self._check_rows_schema_diff("vtdb")

        # throw in a schema validation step; we created the schemas
        # differently, so the difference should show
        utils.run_vtctl("ValidateSchemaShard test_keyspace/-80")
        utils.run_vtctl("ValidateSchemaShard test_keyspace/80-")
        out, err = utils.run_vtctl("ValidateSchemaKeyspace test_keyspace", trap_output=True, raise_on_error=False)
        if (
            "test_nj-0000062344 and test_nj-0000062346 disagree on schema for table vt_select_test:\nCREATE TABLE"
            not in err
            or "test_nj-0000062344 and test_nj-0000062347 disagree on schema for table vt_select_test:\nCREATE TABLE"
            not in err
        ):
            self.fail("wrong ValidateSchemaKeyspace output: " + err)

        # validate versions
        utils.run_vtctl("ValidateVersionShard test_keyspace/-80", auto_log=True)
        utils.run_vtctl("ValidateVersionKeyspace test_keyspace", auto_log=True)

        # show and validate permissions
        utils.run_vtctl("GetPermissions test_nj-0000062344", auto_log=True)
        utils.run_vtctl("ValidatePermissionsShard test_keyspace/-80", auto_log=True)
        utils.run_vtctl("ValidatePermissionsKeyspace test_keyspace", auto_log=True)

        # and create zkns entries for this complex keyspace; make sure a few files are created
        utils.run_vtctl("ExportZknsForKeyspace test_keyspace")
        out, err = utils.run(utils.vtroot + "/bin/zk ls -R /zk/test_nj/zk?s/vt/test_keysp*", trap_output=True)
        lines = out.splitlines()
        for base in ["-80", "80-"]:
            for db_type in ["master", "replica"]:
                for sub_path in ["", ".vdns", "/0", "/_vtocc.vdns"]:
                    expected = "/zk/test_nj/zkns/vt/test_keyspace/" + base + "/" + db_type + sub_path
                    if expected not in lines:
                        self.fail("missing zkns part:\n%s\nin:%s" % (expected, out))

        # now try to connect to both shards using the Python client and
        # shard-aware connections
        # first get the topology and check it
        zkocc_client = zkocc.ZkOccConnection("localhost:%u" % utils.zkocc_port_base, "test_nj", 30.0)
        topology.read_keyspaces(zkocc_client)

        shard_0_master_addrs = topology.get_host_port_by_name(zkocc_client, "test_keyspace.-80.master:_vtocc")
        if len(shard_0_master_addrs) != 1:
            self.fail(
                'topology.get_host_port_by_name failed for "test_keyspace.-80.master:_vtocc", got: %s'
                % " ".join(["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs])
            )
        logging.debug(
            "shard 0 master addrs: %s", " ".join(["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs])
        )

        # connect to shard -80
        conn = tablet3.TabletConnection(
            "%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]), "test_keyspace", "-80", 10.0
        )
        conn.dial()
        (results, rowcount, lastrowid, fields) = conn._execute("select id, msg from vt_select_test order by id", {})
        self.assertEqual(results, [(1, "test 1"), (2, "test 2")], "wrong conn._execute output: %s" % str(results))

        # connect to shard 80-
        shard_1_master_addrs = topology.get_host_port_by_name(zkocc_client, "test_keyspace.80-.master:_vtocc")
        conn = tablet3.TabletConnection(
            "%s:%u" % (shard_1_master_addrs[0][0], shard_1_master_addrs[0][1]), "test_keyspace", "80-", 10.0
        )
        conn.dial()
        (results, rowcount, lastrowid, fields) = conn._execute("select id, msg from vt_select_test order by id", {})
        self.assertEqual(results, [(10, "test 10")], "wrong conn._execute output: %s" % str(results))

        # try to connect with a bad shard
        try:
            conn = tablet3.TabletConnection("localhost:%u" % shard_0_master.port, "test_keyspace", "-90", 10.0)
            conn.dial()
            self.fail("expected an exception")
        except Exception as e:
            if "fatal: Shard mismatch, expecting -80, received -90" not in str(e):
                self.fail("unexpected exception: " + str(e))

        utils.kill_sub_process(zkocc_server)
        shard_0_master.kill_vttablet()
        shard_0_replica.kill_vttablet()
        shard_1_master.kill_vttablet()
        shard_1_replica.kill_vttablet()
Beispiel #50
0
  def test_zkocc(self):
    # preload the test_nj cell
    zkocc_14850 = utils.zkocc_start(extra_params=['-connect-timeout=2s', '-cache-refresh-interval=1s'])
    time.sleep(1)

    # create a Python client. The first address is bad; this tests the retry logic
    bad_port = utils.reserve_ports(3)
    zkocc_client = zkocc.ZkOccConnection("localhost:%u,localhost:%u,localhost:%u" % (bad_port, utils.zkocc_port_base, bad_port+1), "test_nj", 30)
    zkocc_client.dial()

    # test failure for a python client that cannot connect
    bad_zkocc_client = zkocc.ZkOccConnection("localhost:%u,localhost:%u" % (bad_port+2, bad_port), "test_nj", 30)
    try:
      bad_zkocc_client.dial()
      raise utils.TestError('exception expected')
    except zkocc.ZkOccError as e:
      if str(e) != "Cannot dial to any server":
        raise
    level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.ERROR)

    # FIXME(ryszard): This can be changed into a self.assertRaises.
    try:
      bad_zkocc_client.get("/zk/test_nj/vt/zkocc1/data1")
      self.fail('exception expected')
    except zkocc.ZkOccError as e:
      if str(e) != "Cannot dial to any server":
        raise

    logging.getLogger().setLevel(level)

    # get test
    utils.prog_compile(['zkclient2'])
    out, err = utils.run(utils.vtroot+'/bin/zkclient2 -server localhost:%u /zk/test_nj/vt/zkocc1/data1' % utils.zkocc_port_base, trap_output=True)
    self.assertEqual(err, "/zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=false, Stale=false)\n")

    zk_data = zkocc_client.get("/zk/test_nj/vt/zkocc1/data1")
    self.assertDictContainsSubset({'Data': "Test data 1",
                                   'Cached': True,
                                   'Stale': False,},
                                  zk_data)
    self.assertDictContainsSubset({'NumChildren': 0, 'Version': 0}, zk_data['Stat'])

    # getv test
    out, err = utils.run(utils.vtroot+'/bin/zkclient2 -server localhost:%u /zk/test_nj/vt/zkocc1/data1 /zk/test_nj/vt/zkocc1/data2 /zk/test_nj/vt/zkocc1/data3' % utils.zkocc_port_base, trap_output=True)
    self.assertEqualNormalized(err, """[0] /zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=false)
  [1] /zk/test_nj/vt/zkocc1/data2 = Test data 2 (NumChildren=0, Version=0, Cached=false, Stale=false)
  [2] /zk/test_nj/vt/zkocc1/data3 = Test data 3 (NumChildren=0, Version=0, Cached=false, Stale=false)
  """)
    zk_data = zkocc_client.getv(["/zk/test_nj/vt/zkocc1/data1", "/zk/test_nj/vt/zkocc1/data2", "/zk/test_nj/vt/zkocc1/data3"])['Nodes']
    self.assertEqual(len(zk_data), 3)
    for i, d in enumerate(zk_data):
      self.assertEqual(d['Data'], 'Test data %s' % (i + 1))
      self.assertTrue(d['Cached'])
      self.assertFalse(d['Stale'])
      self.assertDictContainsSubset({'NumChildren': 0, 'Version': 0}, d['Stat'])

    # children test
    out, err = utils.run(utils.vtroot+'/bin/zkclient2 -server localhost:%u -mode children /zk/test_nj/vt' % utils.zkocc_port_base, trap_output=True)
    self.assertEqualNormalized(err, """Path = /zk/test_nj/vt
  Child[0] = zkocc1
  Child[1] = zkocc2
  NumChildren = 2
  CVersion = 2
  Cached = false
  Stale = false
  """)

    # zk command tests
    self._check_zk_output("cat /zk/test_nj/vt/zkocc1/data1", "Test data 1")
    self._check_zk_output("ls -l /zk/test_nj/vt/zkocc1", """total: 3
  -rw-rw-rw- zk zk       11  %s data1
  -rw-rw-rw- zk zk       11  %s data2
  -rw-rw-rw- zk zk       11  %s data3
  """ % (_format_time(zk_data[0]['Stat']['MTime']),
         _format_time(zk_data[1]['Stat']['MTime']),
         _format_time(zk_data[2]['Stat']['MTime'])))

    # test that /zk/local is not resolved and gets rejected
    out, err = utils.run(utils.vtroot+'/bin/zkclient2 -server localhost:%u /zk/local/vt/zkocc1/data1' % utils.zkocc_port_base, trap_output=True, raise_on_error=False)
    self.assertIn("zkocc: cannot resolve local cell", err)

    # start a background process to query the same value over and over again
    # while we kill the zk server and restart it
    outfd = tempfile.NamedTemporaryFile(dir=utils.tmp_root, delete=False)
    filename = outfd.name
    querier = utils.run_bg('/bin/bash -c "while true ; do '+utils.vtroot+'/bin/zkclient2 -server localhost:%u /zk/test_nj/vt/zkocc1/data1 ; sleep 0.1 ; done"' % utils.zkocc_port_base, stderr=outfd.file)
    outfd.close()
    time.sleep(1)

    # kill zk server, sleep a bit, restart zk server, sleep a bit
    utils.run(utils.vtroot+'/bin/zkctl -zk.cfg 1@'+utils.hostname+':%u:%u:%u shutdown' % (utils.zk_port_base, utils.zk_port_base+1, utils.zk_port_base+2))
    time.sleep(3)
    utils.run(utils.vtroot+'/bin/zkctl -zk.cfg 1@'+utils.hostname+':%u:%u:%u start' % (utils.zk_port_base, utils.zk_port_base+1, utils.zk_port_base+2))
    time.sleep(3)

    utils.kill_sub_process(querier)

    logging.debug("Checking %s", filename)
    fd = open(filename, "r")
    state = 0
    for line in fd:
      if line == "/zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=false)\n":
        stale = False
      elif line == "/zk/test_nj/vt/zkocc1/data1 = Test data 1 (NumChildren=0, Version=0, Cached=true, Stale=true)\n":
        stale = True
      else:
        raise utils.TestError('unexpected line: ', line)
      if state == 0:
        if stale:
          state = 1
      elif state == 1:
        if not stale:
          state = 2
      else:
        if stale:
          self.fail('unexpected stale state')
    self.assertEqual(state, 2)
    fd.close()

    utils.zkocc_kill(zkocc_14850)

    # check that after the server is gone, the python client fails correctly
    level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.ERROR)
    try:
      zkocc_client.get("/zk/test_nj/vt/zkocc1/data1")
      self.fail('exception expected')
    except zkocc.ZkOccError as e:
      if str(e) != "Cannot dial to any server":
        raise
    logging.getLogger().setLevel(level)
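
# A minimal sketch of the fresh -> stale -> fresh check above, pulled out as a
# standalone function over pre-parsed booleans (plain Python, no test harness
# needed; the function name is hypothetical):
def assert_went_stale_then_recovered(stale_flags):
  """stale_flags: one boolean per zkclient2 query, True if the line said Stale=true.

  Expects fresh reads, then stale reads while the zk server is down, then
  fresh reads again once it is back up."""
  state = 0  # 0 = fresh before outage, 1 = stale during outage, 2 = fresh after
  for stale in stale_flags:
    if state == 0 and stale:
      state = 1
    elif state == 1 and not stale:
      state = 2
    elif state == 2 and stale:
      raise AssertionError('cache went stale again after recovering')
  if state != 2:
    raise AssertionError('never saw the stale-then-recovered sequence')

# e.g. [False, True, True, False] passes, while [False, True, False, True]
# raises on the final element.
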
Example #51
    def test_sharding(self):

        shard_0_master.init_tablet('master', 'test_keyspace', '-80')
        shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
        shard_1_master.init_tablet('master', 'test_keyspace', '80-')
        shard_1_replica.init_tablet('replica', 'test_keyspace', '80-')

        utils.run_vtctl(
            'RebuildShardGraph /zk/global/vt/keyspaces/test_keyspace/shards/*',
            auto_log=True)

        utils.run_vtctl('RebuildKeyspaceGraph /zk/global/vt/keyspaces/*',
                        auto_log=True)

        # run checks now before we start the tablets
        utils.validate_topology()

        # create databases
        shard_0_master.create_db('vt_test_keyspace')
        shard_0_replica.create_db('vt_test_keyspace')
        shard_1_master.create_db('vt_test_keyspace')
        shard_1_replica.create_db('vt_test_keyspace')

        # start the tablets
        shard_0_master.start_vttablet()
        shard_0_replica.start_vttablet()
        shard_1_master.start_vttablet()
        shard_1_replica.start_vttablet()

        # apply the schema on the first shard through vtctl, so all tablets
        # are the same (replication is not enabled yet, so -stop-replication
        # is just there to be tested)
        utils.run_vtctl([
            'ApplySchema', '-stop-replication',
            '-sql=' + create_vt_select_test.replace("\n", ""),
            shard_0_master.tablet_alias
        ])
        utils.run_vtctl([
            'ApplySchema', '-stop-replication',
            '-sql=' + create_vt_select_test.replace("\n", ""),
            shard_0_replica.tablet_alias
        ])

        # start zkocc, we'll use it later
        zkocc_server = utils.zkocc_start()

        for t in [
                shard_0_master, shard_0_replica, shard_1_master,
                shard_1_replica
        ]:
            t.reset_replication()
        utils.run_vtctl('ReparentShard -force test_keyspace/-80 ' +
                        shard_0_master.tablet_alias,
                        auto_log=True)
        utils.run_vtctl('ReparentShard -force test_keyspace/80- ' +
                        shard_1_master.tablet_alias,
                        auto_log=True)

        # apply the schema on the second shard using a simple schema upgrade
        utils.run_vtctl([
            'ApplySchemaShard', '-simple',
            '-sql=' + create_vt_select_test_reverse.replace("\n", ""),
            'test_keyspace/80-'
        ])

        # insert some values directly (db is RO after minority reparent)
        # FIXME(alainjobart) these values don't match the shard map
        utils.run_vtctl('SetReadWrite ' + shard_0_master.tablet_alias)
        utils.run_vtctl('SetReadWrite ' + shard_1_master.tablet_alias)
        shard_0_master.mquery(
            'vt_test_keyspace',
            "insert into vt_select_test (id, msg) values (1, 'test 1')",
            write=True)
        shard_1_master.mquery(
            'vt_test_keyspace',
            "insert into vt_select_test (id, msg) values (10, 'test 10')",
            write=True)

        utils.validate_topology(ping_tablets=True)

        utils.pause("Before the sql scatter query")

        # note the order of the rows is not guaranteed, as the goroutines
        # doing the work can complete out of order
        self._check_rows(["Index\tid\tmsg", "1\ttest 1", "10\ttest 10"])

        # write a value, re-read them all
        utils.vtclient2(
            3803,
            "/test_nj/test_keyspace/master",
            "insert into vt_select_test (id, msg) values (:keyspace_id, 'test 2')",
            bindvars='{"keyspace_id": 2}',
            driver="vtdb",
            verbose=True)
        self._check_rows(
            ["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"])

        # make sure the '2' value was written on first shard
        rows = shard_0_master.mquery(
            'vt_test_keyspace',
            "select id, msg from vt_select_test order by id")
        self.assertEqual(rows, (
            (1, 'test 1'),
            (2, 'test 2'),
        ), 'wrong mysql_query output: %s' % str(rows))

        utils.pause("After db writes")

        # now use zkocc or streaming or both for the same query
        self._check_rows(
            ["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"],
            driver="vtdb-zkocc")
        self._check_rows(
            ["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"],
            driver="vtdb-streaming")
        self._check_rows(
            ["Index\tid\tmsg", "1\ttest 1", "2\ttest 2", "10\ttest 10"],
            driver="vtdb-zkocc-streaming")

        # make sure the schema checking works
        self._check_rows_schema_diff("vtdb-zkocc")
        self._check_rows_schema_diff("vtdb")

        # throw in some schema validation step
        # we created the schema differently, so it should show
        utils.run_vtctl('ValidateSchemaShard test_keyspace/-80')
        utils.run_vtctl('ValidateSchemaShard test_keyspace/80-')
        out, err = utils.run_vtctl('ValidateSchemaKeyspace test_keyspace',
                                   trap_output=True,
                                   raise_on_error=False)
        if "test_nj-0000062344 and test_nj-0000062346 disagree on schema for table vt_select_test:\nCREATE TABLE" not in err or \
           "test_nj-0000062344 and test_nj-0000062347 disagree on schema for table vt_select_test:\nCREATE TABLE" not in err:
            self.fail('wrong ValidateSchemaKeyspace output: ' + err)

        # validate versions
        utils.run_vtctl('ValidateVersionShard test_keyspace/-80',
                        auto_log=True)
        utils.run_vtctl('ValidateVersionKeyspace test_keyspace', auto_log=True)

        # show and validate permissions
        utils.run_vtctl('GetPermissions test_nj-0000062344', auto_log=True)
        utils.run_vtctl('ValidatePermissionsShard test_keyspace/-80',
                        auto_log=True)
        utils.run_vtctl('ValidatePermissionsKeyspace test_keyspace',
                        auto_log=True)

        # and create zkns on this complex keyspace, make sure a few files are created
        utils.run_vtctl('ExportZknsForKeyspace test_keyspace')
        out, err = utils.run(utils.vtroot +
                             '/bin/zk ls -R /zk/test_nj/zk?s/vt/test_keysp*',
                             trap_output=True)
        lines = out.splitlines()
        for base in ['-80', '80-']:
            for db_type in ['master', 'replica']:
                for sub_path in ['', '.vdns', '/0', '/_vtocc.vdns']:
                    expected = '/zk/test_nj/zkns/vt/test_keyspace/' + base + '/' + db_type + sub_path
                    if expected not in lines:
                        self.fail('missing zkns part:\n%s\nin:%s' %
                                  (expected, out))

        # now try to connect using the python client and shard-aware connection
        # to both shards
        # first get the topology and check it
        zkocc_client = zkocc.ZkOccConnection(
            "localhost:%u" % utils.zkocc_port_base, "test_nj", 30.0)
        topology.read_keyspaces(zkocc_client)

        shard_0_master_addrs = topology.get_host_port_by_name(
            zkocc_client, "test_keyspace.-80.master:_vtocc")
        if len(shard_0_master_addrs) != 1:
            self.fail(
                'topology.get_host_port_by_name failed for "test_keyspace.-80.master:_vtocc", got: %s'
                % " ".join([
                    "%s:%u(%s)" % (h, p, str(e))
                    for (h, p, e) in shard_0_master_addrs
                ]))
        logging.debug(
            "shard 0 master addrs: %s", " ".join([
                "%s:%u(%s)" % (h, p, str(e))
                for (h, p, e) in shard_0_master_addrs
            ]))

        # connect to shard -80
        conn = tablet3.TabletConnection(
            "%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
            "test_keyspace", "-80", 10.0)
        conn.dial()
        (results, rowcount, lastrowid, fields) = conn._execute(
            "select id, msg from vt_select_test order by id", {})
        self.assertEqual(results, [
            (1, 'test 1'),
            (2, 'test 2'),
        ], 'wrong conn._execute output: %s' % str(results))

        # connect to shard 80-
        shard_1_master_addrs = topology.get_host_port_by_name(
            zkocc_client, "test_keyspace.80-.master:_vtocc")
        conn = tablet3.TabletConnection(
            "%s:%u" % (shard_1_master_addrs[0][0], shard_1_master_addrs[0][1]),
            "test_keyspace", "80-", 10.0)
        conn.dial()
        (results, rowcount, lastrowid, fields) = conn._execute(
            "select id, msg from vt_select_test order by id", {})
        self.assertEqual(results, [
            (10, 'test 10'),
        ], 'wrong conn._execute output: %s' % str(results))

        # try to connect with bad shard
        try:
            conn = tablet3.TabletConnection(
                "localhost:%u" % shard_0_master.port, "test_keyspace", "-90",
                10.0)
            conn.dial()
            self.fail('expected an exception')
        except Exception as e:
            if "fatal: Shard mismatch, expecting -80, received -90" not in str(
                    e):
                self.fail('unexpected exception: ' + str(e))

        utils.kill_sub_process(zkocc_server)
        shard_0_master.kill_vttablet()
        shard_0_replica.kill_vttablet()
        shard_1_master.kill_vttablet()
        shard_1_replica.kill_vttablet()
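
# The FIXME above notes the inserted values don't match the shard map; a
# minimal sketch of that map, assuming uint64 keyspace ids: with a -80/80-
# split, a row belongs on the first shard iff the high byte of its 64-bit
# keyspace_id is below 0x80 (illustration only, function name hypothetical):
def shard_for_keyspace_id(keyspace_id):
    """Return '-80' or '80-' for an unsigned 64-bit keyspace_id."""
    return '-80' if (keyspace_id >> 56) < 0x80 else '80-'

# e.g. shard_for_keyspace_id(2) == '-80', matching the check above that the
# row inserted with keyspace_id 2 landed on shard_0_master.
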
Example #52
def tearDownModule():
  # guard the kill as well: vtgateclienttest_process may be None if setup failed
  if vtgateclienttest_process:
    utils.kill_sub_process(vtgateclienttest_process, soft=True)
    vtgateclienttest_process.wait()

  environment.topo_server().teardown()
Example #53
  def test_secure(self):
    zkocc_server = utils.zkocc_start()

    # start the tablets
    shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                  key=cert_dir + "/vt-server-key.pem")
    shard_0_slave.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                 key=cert_dir + "/vt-server-key.pem",
                                 repl_extra_flags={
        'flags': "2048",
        'ssl-ca': cert_dir + "/ca-cert.pem",
        'ssl-cert': cert_dir + "/client-cert.pem",
        'ssl-key': cert_dir + "/client-key.pem",
        })

    # Reparent using SSL
    for t in [shard_0_master, shard_0_slave]:
      t.reset_replication()
    utils.run_vtctl('ReparentShard -force test_keyspace/0 ' + shard_0_master.tablet_alias, auto_log=True)

    # then get the topology and check it
    zkocc_client = zkocc.ZkOccConnection("localhost:%u" % utils.zkocc_port_base,
                                         "test_nj", 30.0)
    topology.read_keyspaces(zkocc_client)

    shard_0_master_addrs = topology.get_host_port_by_name(zkocc_client, "test_keyspace.0.master:_vts")
    if len(shard_0_master_addrs) != 1:
      raise utils.TestError('topology.get_host_port_by_name failed for "test_keyspace.0.master:_vts", got: %s' % " ".join(["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs]))
    if not shard_0_master_addrs[0][2]:
      raise utils.TestError('topology.get_host_port_by_name for "test_keyspace.0.master:_vts" returned an address that is not encrypted')
    logging.debug("shard 0 master addrs: %s", " ".join(["%s:%u(%s)" % (h, p, str(e)) for (h, p, e) in shard_0_master_addrs]))

    # make sure asking for optionally secure connections works too
    auto_addrs = topology.get_host_port_by_name(zkocc_client, "test_keyspace.0.master:_vtocc", encrypted=True)
    if auto_addrs != shard_0_master_addrs:
      raise utils.TestError('topology.get_host_port_by_name doesn\'t resolve encrypted addresses properly: %s != %s' % (str(shard_0_master_addrs), str(auto_addrs)))

    # try to connect with regular client
    try:
      conn = tablet3.TabletConnection("%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
                                      "test_keyspace", "0", 10.0)
      conn.dial()
      raise utils.TestError("No exception raised to secure port")
    except tablet3.FatalError as e:
      if not e.args[0][0].startswith('Unexpected EOF in handshake to'):
        raise utils.TestError("Unexpected exception: %s" % str(e))

    sconn = utils.get_vars(shard_0_master.port)["SecureConnections"]
    if sconn != 0:
      raise utils.TestError("unexpected conns %s" % sconn)

    # connect to encrypted port
    conn = tablet3.TabletConnection("%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
                                    "test_keyspace", "0", 5.0, encrypted=True)
    conn.dial()
    (results, rowcount, lastrowid, fields) = conn._execute("select 1 from dual", {})
    self.assertEqual(results, [(1,),], 'wrong conn._execute output: %s' % str(results))

    sconn = utils.get_vars(shard_0_master.port)["SecureConnections"]
    if sconn != 1:
      raise utils.TestError("unexpected conns %s" % sconn)
    saccept = utils.get_vars(shard_0_master.port)["SecureAccepts"]
    if saccept == 0:
      raise utils.TestError("unexpected accepts %s" % saccept)


    # trigger a time out on a secure connection, see what exception we get
    try:
      conn._execute("select sleep(100) from dual", {})
      raise utils.TestError("No timeout exception")
    except tablet3.TimeoutError as e:
      logging.debug("Got the right exception for SSL timeout: %s", str(e))

    # kill everything
    utils.kill_sub_process(zkocc_server)
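
# A minimal sketch of the plaintext-vs-TLS expectation exercised above,
# assuming the same tablet3 module and TabletConnection signature used in this
# test (helper name hypothetical):
def expect_plaintext_dial_to_fail(addr, keyspace, shard, timeout=10.0):
  # a plaintext dial against a TLS-only port should die in the handshake
  try:
    conn = tablet3.TabletConnection(addr, keyspace, shard, timeout)
    conn.dial()
  except tablet3.FatalError as e:
    logging.debug('plaintext dial refused as expected: %s', str(e))
    return
  raise AssertionError('plaintext dial to secure port unexpectedly succeeded')
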
Example #54
    def test_resharding(self):
        # create the keyspace with just one shard
        shard_master.init_tablet('replica',
                                 keyspace='test_keyspace',
                                 shard='0',
                                 tablet_index=0)
        shard_replica.init_tablet('replica',
                                  keyspace='test_keyspace',
                                  shard='0',
                                  tablet_index=1)
        shard_rdonly1.init_tablet('rdonly',
                                  keyspace='test_keyspace',
                                  shard='0',
                                  tablet_index=2)

        for t in [shard_master, shard_replica, shard_rdonly1]:
            t.create_db('vt_test_keyspace')

        # the replica is not started yet, so the InitShardMaster below is
        # expected to fail
        shard_master.start_vttablet(wait_for_state=None,
                                    binlog_use_v3_resharding_mode=False)
        shard_rdonly1.start_vttablet(wait_for_state=None,
                                     binlog_use_v3_resharding_mode=False)

        for t in [shard_master, shard_rdonly1]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # reparent to make the tablets work - expect fail
        # because replica tablet is not up
        _, stderr = utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            shard_master.tablet_alias
        ],
                                    auto_log=True,
                                    expect_fail=True)

        self.assertIn('tablet test_nj-0000062345 ResetReplication failed',
                      stderr)
        # start replica
        shard_replica.start_vttablet(wait_for_state=None,
                                     binlog_use_v3_resharding_mode=False)

        shard_replica.wait_for_vttablet_state('NOT_SERVING')

        # reparent to make the tablets work
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            shard_master.tablet_alias
        ],
                        auto_log=True)

        utils.wait_for_tablet_type(shard_replica.tablet_alias, 'replica')
        utils.wait_for_tablet_type(shard_rdonly1.tablet_alias, 'rdonly')
        for t in [shard_master, shard_replica, shard_rdonly1]:
            t.wait_for_vttablet_state('SERVING')

        # create the tables and add startup values
        self._create_schema()
        self._insert_startup_values()

        # reload schema on all tablets so we can query them
        for t in [shard_master, shard_replica, shard_rdonly1]:
            utils.run_vtctl(['ReloadSchema', t.tablet_alias], auto_log=True)

        # We must start vtgate after the tablets are up, or else wait for the
        # 1-minute refresh (that is the tablet_refresh_interval parameter of
        # the discovery gateway). We want cache_ttl at zero so we re-read the
        # topology for every test query.

        utils.VtGate().start(
            cache_ttl='0',
            tablets=[shard_master, shard_replica, shard_rdonly1])
        utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1)
        utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1)
        utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1)

        # check the Map Reduce API works correctly; it should use
        # ExecuteShards, as we're not sharded yet.
        # we have 3 values in the database, so asking for 4 splits will get
        # us a single query.
        sql = 'select id, msg from resharding1'
        s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
        self.assertEqual(len(s), 1)
        self.assertEqual(s[0]['shard_part']['shards'][0], '0')

        # change the schema, backfill keyspace_id, and change schema again
        self._add_sharding_key_to_schema()
        self._backfill_keyspace_id(shard_master)
        self._mark_sharding_key_not_null()

        # now we can be a sharded keyspace (and propagate to SrvKeyspace)
        utils.run_vtctl([
            'SetKeyspaceShardingInfo', 'test_keyspace', 'custom_ksid_col',
            base_sharding.keyspace_id_type
        ])
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # run a health check on source replica so it responds to discovery
        utils.run_vtctl(['RunHealthCheck', shard_replica.tablet_alias])

        # create the split shards
        shard_0_master.init_tablet('replica',
                                   keyspace='test_keyspace',
                                   shard='-80',
                                   tablet_index=0)
        shard_0_replica.init_tablet('replica',
                                    keyspace='test_keyspace',
                                    shard='-80',
                                    tablet_index=1)
        shard_0_rdonly1.init_tablet('rdonly',
                                    keyspace='test_keyspace',
                                    shard='-80',
                                    tablet_index=2)
        shard_1_master.init_tablet('replica',
                                   keyspace='test_keyspace',
                                   shard='80-',
                                   tablet_index=0)
        shard_1_replica.init_tablet('replica',
                                    keyspace='test_keyspace',
                                    shard='80-',
                                    tablet_index=1)
        shard_1_rdonly1.init_tablet('rdonly',
                                    keyspace='test_keyspace',
                                    shard='80-',
                                    tablet_index=2)

        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly1,
                shard_1_master, shard_1_replica, shard_1_rdonly1
        ]:
            t.create_db('vt_test_keyspace')
            t.start_vttablet(wait_for_state=None,
                             binlog_use_v3_resharding_mode=False)

        for t in [
                shard_0_master, shard_0_replica, shard_0_rdonly1,
                shard_1_master, shard_1_replica, shard_1_rdonly1
        ]:
            t.wait_for_vttablet_state('NOT_SERVING')

        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/-80',
            shard_0_master.tablet_alias
        ],
                        auto_log=True)
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/80-',
            shard_1_master.tablet_alias
        ],
                        auto_log=True)

        for t in [shard_0_replica, shard_1_replica]:
            utils.wait_for_tablet_type(t.tablet_alias, 'replica')
        for t in [shard_0_rdonly1, shard_1_rdonly1]:
            utils.wait_for_tablet_type(t.tablet_alias, 'rdonly')

        sharded_tablets = [
            shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
            shard_1_replica, shard_1_rdonly1
        ]
        for t in sharded_tablets:
            t.wait_for_vttablet_state('SERVING')

        # must restart vtgate after the tablets are up, or else wait for the
        # 1-minute refresh. We want cache_ttl at zero so we re-read the
        # topology for every test query.
        utils.vtgate.kill()

        utils.vtgate = None
        utils.VtGate().start(cache_ttl='0',
                             tablets=[
                                 shard_master, shard_replica, shard_rdonly1,
                                 shard_0_master, shard_0_replica,
                                 shard_0_rdonly1, shard_1_master,
                                 shard_1_replica, shard_1_rdonly1
                             ])
        var = None

        # Wait for the endpoints, either local or remote.
        utils.vtgate.wait_for_endpoints('test_keyspace.0.master', 1, var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.0.replica', 1, var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.0.rdonly', 1, var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.-80.master', 1, var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.-80.replica',
                                        1,
                                        var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1, var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.80-.master', 1, var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.80-.replica',
                                        1,
                                        var=var)
        utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1, var=var)

        # check the Map Reduce API works correctly; it should use
        # ExecuteKeyRanges now, as we are sharded (with just one shard).
        # again, we have 3 values in the database, so asking for 4 splits
        # will get us a single query.
        sql = 'select id, msg from resharding1'
        s = utils.vtgate.split_query(sql, 'test_keyspace', 4)
        self.assertEqual(len(s), 1)
        self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
        # There must be one empty KeyRange which represents the full keyspace.
        self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
        self.assertEqual(s[0]['key_range_part']['key_ranges'][0], {})

        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -\n'
            'Partitions(replica): -\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # we need to create the schema, and the worker will do data copying
        for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)
        utils.run_vtctl(['RunHealthCheck', shard_rdonly1.tablet_alias])

        # Run vtworker as daemon for the following SplitClone commands.
        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            [
                '--cell', 'test_nj', '--command_display_interval', '10ms',
                '--use_v3_resharding_mode=false'
            ],
            auto_log=True)

        # Initial clone (online).
        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--offline=false', '--exclude_tables', 'unrelated',
            '--chunk_count', '10', '--min_rows_per_chunk', '1',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 3, 0, 0, 0)

        # Reset vtworker such that we can run the next command.
        workerclient_proc = utils.run_vtworker_client_bg(['Reset'],
                                                         worker_rpc_port)
        utils.wait_procs([workerclient_proc])

        # Modify the destination shard. SplitClone will revert the changes.
        # Delete row 1 (provokes an insert).
        shard_0_master.mquery('vt_test_keyspace',
                              'delete from resharding1 where id=1',
                              write=True)
        # Delete row 2 (provokes an insert).
        shard_1_master.mquery('vt_test_keyspace',
                              'delete from resharding1 where id=2',
                              write=True)
        # Update row 3 (provokes an update).
        shard_1_master.mquery(
            'vt_test_keyspace',
            "update resharding1 set msg='msg-not-3' where id=3",
            write=True)
        # Insert row 4 (provokes a delete).
        self._insert_value(shard_1_master, 'resharding1', 4, 'msg4',
                           0xD000000000000000)

        workerclient_proc = utils.run_vtworker_client_bg([
            'SplitClone', '--exclude_tables', 'unrelated', '--chunk_count',
            '10', '--min_rows_per_chunk', '1', '--min_healthy_rdonly_tablets',
            '1', 'test_keyspace/0'
        ], worker_rpc_port)
        utils.wait_procs([workerclient_proc])
        self.verify_reconciliation_counters(worker_port, 'Online',
                                            'resharding1', 2, 1, 1, 0)
        self.verify_reconciliation_counters(worker_port, 'Offline',
                                            'resharding1', 0, 0, 0, 3)
        # Terminate worker daemon because it is no longer needed.
        utils.kill_sub_process(worker_proc, soft=True)

        # check the startup values are in the right place
        self._check_startup_values()

        # check the schema too
        utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'],
                        auto_log=True)

        # check the binlog players are running
        logging.debug('Waiting for binlog players to start on new masters...')
        self.check_destination_master(shard_0_master, ['test_keyspace/0'])
        self.check_destination_master(shard_1_master, ['test_keyspace/0'])

        # check that binlog server exported the stats vars
        self.check_binlog_server_vars(shard_replica, horizontal=True)

        # testing filtered replication: insert a bunch of data on the source
        # shard, check we get most of it after a few seconds, wait for the
        # binlog server timeout, then check we get all of it.
        logging.debug('Inserting lots of data on source shard')
        self._insert_lots(1000)
        logging.debug('Checking 80 percent of data is sent quickly')
        v = self._check_lots_timeout(1000, 80, 5)
        if v != 100:
            logging.debug('Checking all data goes through eventually')
            self._check_lots_timeout(1000, 100, 20)
        logging.debug('Checking no data was sent the wrong way')
        self._check_lots_not_present(1000)
        self.check_binlog_player_vars(shard_0_master, ['test_keyspace/0'],
                                      seconds_behind_master_max=30)
        self.check_binlog_player_vars(shard_1_master, ['test_keyspace/0'],
                                      seconds_behind_master_max=30)
        self.check_binlog_server_vars(shard_replica,
                                      horizontal=True,
                                      min_statements=1000,
                                      min_transactions=1000)

        # use vtworker to compare the data
        for t in [shard_0_rdonly1, shard_1_rdonly1]:
            utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

        if base_sharding.use_multi_split_diff:
            logging.debug('Running vtworker MultiSplitDiff for 0')
            utils.run_vtworker([
                '-cell', 'test_nj', '--use_v3_resharding_mode=false',
                'MultiSplitDiff', '--min_healthy_rdonly_tablets', '1',
                'test_keyspace/0'
            ],
                               auto_log=True)
        else:
            logging.debug('Running vtworker SplitDiff for -80')
            utils.run_vtworker([
                '-cell', 'test_nj', '--use_v3_resharding_mode=false',
                'SplitDiff', '--min_healthy_rdonly_tablets', '1',
                'test_keyspace/-80'
            ],
                               auto_log=True)
            logging.debug('Running vtworker SplitDiff for 80-')
            utils.run_vtworker([
                '-cell', 'test_nj', '--use_v3_resharding_mode=false',
                'SplitDiff', '--min_healthy_rdonly_tablets', '1',
                'test_keyspace/80-'
            ],
                               auto_log=True)

        utils.pause('Good time to test vtworker for diffs')

        # get status for the destination master tablet, make sure we have it all
        self.check_running_binlog_player(shard_0_master, 2000, 2000)
        self.check_running_binlog_player(shard_1_master, 6000, 2000)

        # check we can't migrate the master just yet
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        expect_fail=True)

        # now serve rdonly from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'rdonly'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # make sure rdonly tablets are back to serving before hitting vtgate.
        for t in [shard_0_rdonly1, shard_1_rdonly1]:
            t.wait_for_vttablet_state('SERVING')

        utils.vtgate.wait_for_endpoints('test_keyspace.-80.rdonly', 1)
        utils.vtgate.wait_for_endpoints('test_keyspace.80-.rdonly', 1)

        # check the Map Reduce API works correctly; it should use
        # ExecuteKeyRanges on both destination shards now.
        # we ask for 2 splits so we get one per shard
        sql = 'select id, msg from resharding1'
        timeout = 10.0
        while True:
            try:
                s = utils.vtgate.split_query(sql, 'test_keyspace', 2)
                break
            except Exception:  # pylint: disable=broad-except
                timeout = utils.wait_step(
                    'vtgate executes split_query properly', timeout)
        self.assertEqual(len(s), 2)
        self.assertEqual(s[0]['key_range_part']['keyspace'], 'test_keyspace')
        self.assertEqual(s[1]['key_range_part']['keyspace'], 'test_keyspace')
        self.assertEqual(len(s[0]['key_range_part']['key_ranges']), 1)
        self.assertEqual(len(s[1]['key_range_part']['key_ranges']), 1)

        # then serve replica from the split shards
        source_tablet = shard_replica
        destination_tablets = [shard_0_replica, shard_1_replica]

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # move replica back and forth
        utils.run_vtctl(
            ['MigrateServedTypes', '-reverse', 'test_keyspace/0', 'replica'],
            auto_log=True)
        # After a backwards migration, queryservice should be enabled on
        # source and disabled on destinations
        utils.check_tablet_query_service(self, source_tablet, True, False)
        utils.check_tablet_query_services(self, destination_tablets, False,
                                          True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'replica'],
                        auto_log=True)
        # After a forwards migration, queryservice should be disabled on
        # source and enabled on destinations
        utils.check_tablet_query_service(self, source_tablet, False, True)
        utils.check_tablet_query_services(self, destination_tablets, True,
                                          False)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # then serve master from the split shards
        utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/0', 'master'],
                        auto_log=True)
        utils.check_srv_keyspace(
            'test_nj',
            'test_keyspace', 'Partitions(master): -80 80-\n'
            'Partitions(rdonly): -80 80-\n'
            'Partitions(replica): -80 80-\n',
            keyspace_id_type=base_sharding.keyspace_id_type,
            sharding_column_name='custom_ksid_col')

        # check the binlog players are gone now
        self.check_no_binlog_player(shard_0_master)
        self.check_no_binlog_player(shard_1_master)

        # make sure we can't delete a shard with tablets
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], expect_fail=True)

        # remove the original tablets in the original shard
        tablet.kill_tablets([shard_master, shard_replica, shard_rdonly1])
        for t in [shard_replica, shard_rdonly1]:
            utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
        utils.run_vtctl(
            ['DeleteTablet', '-allow_master', shard_master.tablet_alias],
            auto_log=True)

        # rebuild the serving graph, all mentions of the old shards should be gone
        utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                        auto_log=True)

        # delete the original shard
        utils.run_vtctl(['DeleteShard', 'test_keyspace/0'], auto_log=True)

        # kill everything else
        tablet.kill_tablets([
            shard_0_master, shard_0_replica, shard_0_rdonly1, shard_1_master,
            shard_1_replica, shard_1_rdonly1
        ])
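
# A minimal sketch of the served-type migration order this test walks through:
# rdonly first, then replica (reversible, as shown above), then master last,
# assuming the same utils.run_vtctl helper (function name hypothetical):
def migrate_all_served_types(keyspace_shard):
    for served_type in ('rdonly', 'replica', 'master'):
        utils.run_vtctl(['MigrateServedTypes', keyspace_shard, served_type],
                        auto_log=True)

# e.g. migrate_all_served_types('test_keyspace/0') performs the three forward
# migrations done above, without the intermediate srv keyspace checks.
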
Example #55
  def test_merge_sharding(self):
    utils.run_vtctl(['CreateKeyspace',
                     '--sharding_column_name', 'custom_ksid_col',
                     '--sharding_column_type', base_sharding.keyspace_id_type,
                     'test_keyspace'])

    shard_0_master.init_tablet('replica', 'test_keyspace', '-40')
    shard_0_replica.init_tablet('replica', 'test_keyspace', '-40')
    shard_0_rdonly.init_tablet('rdonly', 'test_keyspace', '-40')
    shard_1_master.init_tablet('replica', 'test_keyspace', '40-80')
    shard_1_replica.init_tablet('replica', 'test_keyspace', '40-80')
    shard_1_rdonly.init_tablet('rdonly', 'test_keyspace', '40-80')
    shard_2_master.init_tablet('replica', 'test_keyspace', '80-')
    shard_2_replica.init_tablet('replica', 'test_keyspace', '80-')
    shard_2_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')

    # rebuild and check SrvKeyspace
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
    ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
    self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')

    # create databases so vttablet can start behaving normally
    for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
              shard_1_master, shard_1_replica, shard_1_rdonly,
              shard_2_master, shard_2_replica, shard_2_rdonly]:
      t.create_db('vt_test_keyspace')
      t.start_vttablet(wait_for_state=None,
                       binlog_use_v3_resharding_mode=False)

    # won't be serving, no replication state
    for t in [shard_0_master, shard_0_replica, shard_0_rdonly,
              shard_1_master, shard_1_replica, shard_1_rdonly,
              shard_2_master, shard_2_replica, shard_2_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # reparent to make the tablets work
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-40',
                     shard_0_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/40-80',
                     shard_1_master.tablet_alias], auto_log=True)
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/80-',
                     shard_2_master.tablet_alias], auto_log=True)

    # create the tables
    self._create_schema()
    self._insert_startup_values()

    # run a health check on source replicas so they respond to discovery
    # (for binlog players) and on the source rdonlys (for workers)
    for t in [shard_0_replica, shard_1_replica]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
    for t in [shard_0_rdonly, shard_1_rdonly]:
      utils.run_vtctl(['RunHealthCheck', t.tablet_alias])

    # create the merge shards
    shard_dest_master.init_tablet('replica', 'test_keyspace', '-80')
    shard_dest_replica.init_tablet('replica', 'test_keyspace', '-80')
    shard_dest_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')

    # start vttablet on the destination shard (no db created,
    # so they're all not serving)
    for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
      t.start_vttablet(wait_for_state=None,
                       binlog_use_v3_resharding_mode=False)
    for t in [shard_dest_master, shard_dest_replica, shard_dest_rdonly]:
      t.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/-80',
                     shard_dest_master.tablet_alias], auto_log=True)

    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
                    auto_log=True)
    utils.check_srv_keyspace(
        'test_nj', 'test_keyspace',
        'Partitions(master): -40 40-80 80-\n'
        'Partitions(rdonly): -40 40-80 80-\n'
        'Partitions(replica): -40 40-80 80-\n',
        keyspace_id_type=base_sharding.keyspace_id_type,
        sharding_column_name='custom_ksid_col')

    # copy the schema
    utils.run_vtctl(['CopySchemaShard', shard_0_rdonly.tablet_alias,
                     'test_keyspace/-80'], auto_log=True)

    # copy the data (will also start filtered replication), reset source
    # Run vtworker as daemon for the following SplitClone commands.
    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--command_display_interval', '10ms',
          '--use_v3_resharding_mode=false'],
        auto_log=True)

    # Initial clone (online).
    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--offline=false',
         '--chunk_count', '10',
         '--min_rows_per_chunk', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/-80'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        2, 0, 0, 0)

    # Reset vtworker such that we can run the next command.
    workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
    utils.wait_procs([workerclient_proc])

    # Modify the destination shard. SplitClone will revert the changes.
    # Delete row 1 (provokes an insert).
    shard_dest_master.mquery('vt_test_keyspace',
                             'delete from resharding1 where id=1', write=True)
    # Update row 2 (provokes an update).
    shard_dest_master.mquery(
        'vt_test_keyspace', "update resharding1 set msg='msg-not-2' where id=2",
        write=True)
    # Insert row 0 (provokes a delete).
    self._insert_value(shard_dest_master, 'resharding1', 0, 'msg0',
                       0x5000000000000000)

    workerclient_proc = utils.run_vtworker_client_bg(
        ['SplitClone',
         '--chunk_count', '10',
         '--min_rows_per_chunk', '1',
         '--min_healthy_rdonly_tablets', '1',
         'test_keyspace/-80'],
        worker_rpc_port)
    utils.wait_procs([workerclient_proc])
    # Change tablets, which were taken offline, back to rdonly.
    utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
                                        1, 1, 1, 0)
    self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
                                        0, 0, 0, 2)
    # Terminate worker daemon because it is no longer needed.
    utils.kill_sub_process(worker_proc, soft=True)

    # check the startup values are in the right place
    self._check_startup_values()

    # check the schema too
    utils.run_vtctl(['ValidateSchemaKeyspace', 'test_keyspace'], auto_log=True)

    # check binlog player variables
    self.check_destination_master(shard_dest_master,
                                  ['test_keyspace/-40', 'test_keyspace/40-80'])

    # check that binlog server exported the stats vars
    self.check_binlog_server_vars(shard_0_replica, horizontal=True)
    self.check_binlog_server_vars(shard_1_replica, horizontal=True)

    # testing filtered replication: insert a bunch of data on source shards
    # 0 and 1, check we get most of it after a few seconds, wait for the
    # binlog server timeout, then check we get all of it.
    logging.debug('Inserting lots of data on source shards')
    self._insert_lots(1000)
    logging.debug('Checking 80 percent of data is sent quickly')
    v = self._check_lots_timeout(1000, 80, 10)
    if v != 100:
      # small optimization: only do this check if we don't have all the data
      # already anyway.
      logging.debug('Checking all data goes through eventually')
      self._check_lots_timeout(1000, 100, 30)
    self.check_binlog_player_vars(shard_dest_master,
                                  ['test_keyspace/-40', 'test_keyspace/40-80'],
                                  seconds_behind_master_max=30)
    self.check_binlog_server_vars(shard_0_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)
    self.check_binlog_server_vars(shard_1_replica, horizontal=True,
                                  min_statements=1000, min_transactions=1000)

    # use vtworker to compare the data (after health-checking the destination
    # rdonly tablets so discovery works)
    utils.run_vtctl(['RunHealthCheck', shard_dest_rdonly.tablet_alias])
    logging.debug('Running vtworker SplitDiff on first half')
    utils.run_vtworker(['-cell', 'test_nj',
                        '--use_v3_resharding_mode=false',
                        'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        '--source_uid', '0',
                        'test_keyspace/-80'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_0_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)
    logging.debug('Running vtworker SplitDiff on second half')
    utils.run_vtworker(['-cell', 'test_nj',
                        '--use_v3_resharding_mode=false',
                        'SplitDiff',
                        '--exclude_tables', 'unrelated',
                        '--min_healthy_rdonly_tablets', '1',
                        '--source_uid', '1',
                        'test_keyspace/-80'],
                       auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly.tablet_alias, 'rdonly'],
                    auto_log=True)
    utils.run_vtctl(['ChangeSlaveType', shard_dest_rdonly.tablet_alias,
                     'rdonly'], auto_log=True)

    # get status for the destination master tablet, make sure we have it all
    self.check_running_binlog_player(shard_dest_master, 3000, 1000)

    # check destination master query service is not running
    utils.check_tablet_query_service(self, shard_dest_master, False, False)
    stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
                                          '-count', '1',
                                          shard_dest_master.tablet_alias])
    logging.debug('Got health: %s', str(stream_health))
    self.assertIn('realtime_stats', stream_health)
    self.assertNotIn('serving', stream_health)

    # check the destination master is healthy, even though its query
    # service is not running (get_healthz raises if the tablet is unhealthy)
    shard_dest_master.get_healthz()

    # now serve rdonly from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'rdonly'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -40 40-80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -40 40-80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # now serve replica from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'replica'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -40 40-80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')

    # now serve master from the split shards
    utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/-80', 'master'],
                    auto_log=True)
    utils.check_srv_keyspace('test_nj', 'test_keyspace',
                             'Partitions(master): -80 80-\n'
                             'Partitions(rdonly): -80 80-\n'
                             'Partitions(replica): -80 80-\n',
                             keyspace_id_type=base_sharding.keyspace_id_type,
                             sharding_column_name='custom_ksid_col')
    utils.check_tablet_query_service(self, shard_0_master, False, True)
    utils.check_tablet_query_service(self, shard_1_master, False, True)

    # check the binlog players are gone now
    self.check_no_binlog_player(shard_dest_master)

    # kill the original tablets in the original shards
    tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_rdonly,
                         shard_1_master, shard_1_replica, shard_1_rdonly])
    for t in [shard_0_replica, shard_0_rdonly,
              shard_1_replica, shard_1_rdonly]:
      utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
    for t in [shard_0_master, shard_1_master]:
      utils.run_vtctl(['DeleteTablet', '-allow_master', t.tablet_alias],
                      auto_log=True)

    # delete the original shards
    utils.run_vtctl(['DeleteShard', 'test_keyspace/-40'], auto_log=True)
    utils.run_vtctl(['DeleteShard', 'test_keyspace/40-80'], auto_log=True)

    # rebuild the serving graph, all mentions of the old shards should be gone
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    # kill everything else
    tablet.kill_tablets([shard_2_master, shard_2_replica, shard_2_rdonly,
                         shard_dest_master, shard_dest_replica,
                         shard_dest_rdonly])
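
# A hedged reading of the four counters passed to
# verify_reconciliation_counters above, inferred from the mutations this test
# makes before each check (an assumption about the positions, not a documented
# contract):
RECONCILIATION_COUNTER_ORDER = ('inserts', 'updates', 'deletes', 'equal_rows')
# e.g. deleting one destination row, updating one, and adding one spurious row
# yields Online counters (1, 1, 1, 0); the later Offline pass over the
# now-reconciled table reports (0, 0, 0, 2), i.e. both rows equal.
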
Example #56
  def tearDown(self):
    utils.kill_sub_process(self.worker_proc)
Example #57
    def test_secure(self):
        zkocc_server = utils.zkocc_start()

        # start the tablets
        shard_0_master.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                      key=cert_dir + "/vt-server-key.pem")
        shard_0_slave.start_vttablet(cert=cert_dir + "/vt-server-cert.pem",
                                     key=cert_dir + "/vt-server-key.pem",
                                     repl_extra_flags={
                                         'flags': "2048",
                                         'ssl-ca': cert_dir + "/ca-cert.pem",
                                         'ssl-cert':
                                         cert_dir + "/client-cert.pem",
                                         'ssl-key':
                                         cert_dir + "/client-key.pem",
                                     })

        # Reparent using SSL
        for t in [shard_0_master, shard_0_slave]:
            t.reset_replication()
        utils.run_vtctl('ReparentShard -force test_keyspace/0 ' +
                        shard_0_master.tablet_alias,
                        auto_log=True)

        # then get the topology and check it
        zkocc_client = zkocc.ZkOccConnection(
            "localhost:%u" % environment.zkocc_port_base, "test_nj", 30.0)
        topology.read_keyspaces(zkocc_client)

        shard_0_master_addrs = topology.get_host_port_by_name(
            zkocc_client, "test_keyspace.0.master:_vts")
        if len(shard_0_master_addrs) != 1:
            self.fail(
                'topology.get_host_port_by_name failed for "test_keyspace.0.master:_vts", got: %s'
                % " ".join([
                    "%s:%u(%s)" % (h, p, str(e))
                    for (h, p, e) in shard_0_master_addrs
                ]))
        if not shard_0_master_addrs[0][2]:
            self.fail(
                'topology.get_host_port_by_name for "test_keyspace.0.master:_vts" returned an address that is not encrypted'
            )
        logging.debug(
            "shard 0 master addrs: %s", " ".join([
                "%s:%u(%s)" % (h, p, str(e))
                for (h, p, e) in shard_0_master_addrs
            ]))

        # make sure asking for optionally secure connections works too
        auto_addrs = topology.get_host_port_by_name(
            zkocc_client, "test_keyspace.0.master:_vtocc", encrypted=True)
        if auto_addrs != shard_0_master_addrs:
            self.fail(
                'topology.get_host_port_by_name doesn\'t resolve encrypted addresses properly: %s != %s'
                % (str(shard_0_master_addrs), str(auto_addrs)))

        # try to connect with regular client
        try:
            conn = tablet3.TabletConnection(
                "%s:%u" %
                (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]), "",
                "test_keyspace", "0", 10.0)
            conn.dial()
            self.fail("No exception raised to secure port")
        except dbexceptions.FatalError as e:
            if not e.args[0][0].startswith('Unexpected EOF in handshake to'):
                self.fail("Unexpected exception: %s" % str(e))

        sconn = utils.get_vars(shard_0_master.port)["SecureConnections"]
        if sconn != 0:
            self.fail("unexpected conns %s" % sconn)

        # connect to encrypted port
        conn = tablet3.TabletConnection(
            "%s:%u" % (shard_0_master_addrs[0][0], shard_0_master_addrs[0][1]),
            "",
            "test_keyspace",
            "0",
            5.0,
            encrypted=True)
        conn.dial()
        (results, rowcount, lastrowid,
         fields) = conn._execute("select 1 from dual", {})
        self.assertEqual(results, [
            (1, ),
        ], 'wrong conn._execute output: %s' % str(results))

        sconn = utils.get_vars(shard_0_master.port)["SecureConnections"]
        if sconn != 1:
            self.fail("unexpected conns %s" % sconn)
        saccept = utils.get_vars(shard_0_master.port)["SecureAccepts"]
        if saccept == 0:
            self.fail("unexpected accepts %s" % saccept)

        # trigger a timeout on a secure connection and check the exception we get
        try:
            conn._execute("select sleep(100) from dual", {})
            self.fail("No timeout exception")
        except dbexceptions.TimeoutError as e:
            logging.debug("Got the right exception for SSL timeout: %s",
                          str(e))

        # start a vtgate to connect to that tablet
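        # vtgate serves SSL on its own secure port, and
        # tablet_bson_encrypted=True makes it dial the tablet's encrypted
        # BSON port as well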
        gate_proc, gate_port, gate_secure_port = utils.vtgate_start(
            tablet_bson_encrypted=True,
            cert=cert_dir + "/vt-server-cert.pem",
            key=cert_dir + "/vt-server-key.pem")

        # try to connect to vtgate with regular client
        timeout = 2.0
        try:
            conn = vtgatev2.connect(
                ["localhost:%s" % gate_secure_port], timeout)
            self.fail("No exception raised to VTGate secure port")
        except dbexceptions.OperationalError as e:
            inner_exception = e.args[2]
            exception_msg = str(e.args[2][0][0])
            self.assertIsInstance(inner_exception, dbexceptions.FatalError,
                                  "unexpected exception type")
            if not exception_msg.startswith('Unexpected EOF in handshake to'):
                self.fail("Unexpected exception message: %s" % exception_msg)

        sconn = utils.get_vars(gate_port)["SecureConnections"]
        if sconn != 0:
            self.fail("unexpected conns %s" % sconn)

        # connect to vtgate with encrypted port
        conn = vtgatev2.connect(
            ["localhost:%s" % gate_secure_port], timeout, encrypted=True)
        (results, rowcount, lastrowid, fields) = conn._execute(
            "select 1 from dual", {},
            "test_keyspace",
            "master",
            keyranges=[
                keyrange.KeyRange(keyrange_constants.NON_PARTIAL_KEYRANGE),
            ])
        self.assertEqual(rowcount, 1, "want 1, got %d" % (rowcount))
        self.assertEqual(len(fields), 1, "want 1, got %d" % (len(fields)))
        self.assertEqual(results, [
            (1, ),
        ], 'wrong conn._execute output: %s' % str(results))

        sconn = utils.get_vars(gate_port)["SecureConnections"]
        if sconn != 1:
            self.fail("unexpected conns %s" % sconn)
        saccept = utils.get_vars(gate_port)["SecureAccepts"]
        if saccept == 0:
            self.fail("unexpected accepts %s" % saccept)

        # trigger a timeout on a vtgate secure connection and check the exception we get
        try:
            conn._execute("select sleep(4) from dual", {},
                          "test_keyspace",
                          "master",
                          keyranges=[
                              keyrange.KeyRange(
                                  keyrange_constants.NON_PARTIAL_KEYRANGE),
                          ])
            self.fail("No timeout exception")
        except dbexceptions.TimeoutError as e:
            logging.debug("Got the right exception for SSL timeout: %s",
                          str(e))
        conn.close()
        utils.vtgate_kill(gate_proc)

        # kill everything
        utils.kill_sub_process(zkocc_server)
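The SecureConnections / SecureAccepts assertions above repeat the same read-the-vars pattern; a hypothetical helper (not part of the original utils module; only utils.get_vars is real) could distill it:

def check_secure_vars(test, port, want_conns):
    # hypothetical helper: read the server's exported vars and assert on
    # its SSL counters; utils.get_vars is the real call used throughout
    v = utils.get_vars(port)
    if v["SecureConnections"] != want_conns:
        test.fail("unexpected secure conns: %s != %s"
                  % (v["SecureConnections"], want_conns))
    if want_conns > 0 and v["SecureAccepts"] == 0:
        test.fail("unexpected accepts %s" % v["SecureAccepts"])

It would be used as, for example, check_secure_vars(self, gate_port, 1) right after the encrypted query succeeds.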