Example #1
  def _test_vtctl_copyschemashard(self, source):
    # Apply initial schema to the whole keyspace before creating shard 2.
    self._apply_initial_schema()

    _setup_shard_2()

    try:
      # InitShardMaster creates the db, but there shouldn't be any tables yet.
      self._check_tables(shard_2_master, 0)
      self._check_tables(shard_2_replica1, 0)

      # Run the command twice to make sure it's idempotent.
      for _ in range(2):
        utils.run_vtctl(['CopySchemaShard',
                         source,
                         'test_keyspace/2'],
                        auto_log=True)

        # shard_2_master should look the same as the replica we copied from
        self._check_tables(shard_2_master, 4)
        utils.wait_for_replication_pos(shard_2_master, shard_2_replica1)
        self._check_tables(shard_2_replica1, 4)
        shard_0_schema = self._get_schema(shard_0_master.tablet_alias)
        shard_2_schema = self._get_schema(shard_2_master.tablet_alias)
        self.assertEqual(shard_0_schema, shard_2_schema)
    finally:
      _teardown_shard_2()
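
Every example on this page uses utils.wait_for_replication_pos(source_tablet, destination_tablet) the same way: after changing data or schema through the source tablet, the test blocks until the destination tablet has replicated up to the source's current position, and only then asserts on the destination. The Vitess helper itself is not shown on this page; the snippet below is only a minimal, generic sketch of the polling idea it stands for (wait_until and its timeout values are illustrative placeholders, not the real implementation).

import time

def wait_until(predicate, timeout_s=60.0, poll_interval_s=0.1):
  """Poll predicate() until it returns True, or fail after timeout_s seconds."""
  deadline = time.time() + timeout_s
  while time.time() < deadline:
    if predicate():
      return
    time.sleep(poll_interval_s)
  raise AssertionError('condition not met within %.1f seconds' % timeout_s)

# Conceptually, utils.wait_for_replication_pos(source, dest) captures the
# source's current replication position once and then loops much like the
# sketch above until dest reports that it has reached that position.
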
Example #2
        def external_reparent():
            # Demote master.
            master.mquery('', mysql_flavor().demote_master_commands())
            if master.semi_sync_enabled():
                master.set_semi_sync_enabled(master=False)

            # Wait for replica to catch up to master.
            utils.wait_for_replication_pos(master, replica)

            # Promote replica to new master.
            replica.mquery('', mysql_flavor().promote_slave_commands())
            if replica.semi_sync_enabled():
                replica.set_semi_sync_enabled(master=True)
            old_master = master
            new_master = replica

            # Configure old master to use new master.
            new_pos = mysql_flavor().master_position(new_master)
            logging.debug('New master position: %s', str(new_pos))
            # Use 'localhost' as hostname because Travis CI worker hostnames
            # are too long for MySQL replication.
            change_master_cmds = mysql_flavor().change_master_commands(
                'localhost', new_master.mysql_port, new_pos)
            old_master.mquery('', ['RESET SLAVE'] + change_master_cmds +
                              ['START SLAVE'])

            # Notify the new vttablet master about the reparent.
            utils.run_vtctl(
                ['TabletExternallyReparented', new_master.tablet_alias])
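
Note the ordering in this and the later external_reparent examples: the old master is demoted first, and utils.wait_for_replication_pos(master, replica) runs before the replica is promoted. This ensures the replica has applied every transaction the old master committed, so promoting it and repointing the old master at it should not lose any writes.
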
Example #3
    def setUp(self):
        """Creates the necessary shards, starts the tablets, and inserts some data."""
        self.run_shard_tablets('0', shard_tablets)
        # create the split shards
        self.run_shard_tablets('-80',
                               shard_0_tablets,
                               create_db=False,
                               create_table=False,
                               wait_state='NOT_SERVING')
        self.run_shard_tablets('80-',
                               shard_1_tablets,
                               create_db=False,
                               create_table=False,
                               wait_state='NOT_SERVING')

        # Copy the schema to the destination shards
        for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
            utils.run_vtctl([
                'CopySchemaShard', '--exclude_tables', 'unrelated',
                shard_rdonly1.tablet_alias, keyspace_shard
            ],
                            auto_log=True)

        logging.debug("Start inserting initial data: %s rows",
                      utils.options.num_insert_rows)
        self.insert_values(shard_master, utils.options.num_insert_rows, 2)
        logging.debug(
            "Done inserting initial data, waiting for replication to catch up")
        utils.wait_for_replication_pos(shard_master, shard_rdonly1)
        logging.debug("Replication on source rdonly tablet is caught up")
Example #4
    def setUp(self):
        """Creates shards, starts the tablets, and inserts some data."""
        self.run_shard_tablets("0", all_shard_tablets)
        # create the split shards
        self.run_shard_tablets("-80", shard_0_tablets, create_db=False, create_table=False, wait_state="NOT_SERVING")
        self.run_shard_tablets("80-", shard_1_tablets, create_db=False, create_table=False, wait_state="NOT_SERVING")

        logging.debug("Start inserting initial data: %s rows", utils.options.num_insert_rows)
        self.insert_values(shard_master, utils.options.num_insert_rows, 2)
        logging.debug("Done inserting initial data, waiting for replication to catch up")
        utils.wait_for_replication_pos(shard_master, shard_rdonly1)
        logging.debug("Replication on source rdonly tablet is caught up")
Example #5
  def setUp(self):
    """Creates shards, starts the tablets, and inserts some data."""
    self.run_shard_tablets('0', shard_tablets)
    # create the split shards
    self.run_shard_tablets('-80', shard_0_tablets, create_db=False,
      create_table=False, wait_state='NOT_SERVING')
    self.run_shard_tablets('80-', shard_1_tablets, create_db=False,
      create_table=False, wait_state='NOT_SERVING')

    logging.debug(
        'Start inserting initial data: %s rows', utils.options.num_insert_rows)
    self.insert_values(shard_master, utils.options.num_insert_rows, 2)
    logging.debug(
        'Done inserting initial data, waiting for replication to catch up')
    utils.wait_for_replication_pos(shard_master, shard_rdonly1)
    logging.debug('Replication on source rdonly tablet is caught up')
Example #6
    def external_reparent(self):
        # Demote master.
        start = time.time()
        master.mquery('',
                      mysql_flavor().demote_master_commands(),
                      log_query=True)
        if master.semi_sync_enabled():
            master.set_semi_sync_enabled(master=False)

        # Wait for replica to catch up to master.
        utils.wait_for_replication_pos(master, replica)

        # Wait for at least one second to artificially prolong the failover and give
        # the buffer a chance to observe it.
        d = time.time() - start
        min_unavailability_s = 1
        if d < min_unavailability_s:
            w = min_unavailability_s - d
            logging.debug(
                'Waiting for %.1f seconds because the failover was too fast'
                ' (took only %.3f seconds)', w, d)
            time.sleep(w)

        # Promote replica to new master.
        replica.mquery('',
                       mysql_flavor().promote_slave_commands(),
                       log_query=True)
        if replica.semi_sync_enabled():
            replica.set_semi_sync_enabled(master=True)
        old_master = master
        new_master = replica

        # Configure old master to use new master.
        new_pos = mysql_flavor().master_position(new_master)
        logging.debug('New master position: %s', str(new_pos))
        # Use 'localhost' as hostname because Travis CI worker hostnames
        # are too long for MySQL replication.
        change_master_cmds = mysql_flavor().change_master_commands(
            'localhost', new_master.mysql_port, new_pos)
        old_master.mquery('', ['RESET SLAVE'] + change_master_cmds +
                          ['START SLAVE'],
                          log_query=True)

        # Notify the new vttablet master about the reparent.
        utils.run_vtctl(
            ['TabletExternallyReparented', new_master.tablet_alias],
            auto_log=True)
Example #7
  def setUp(self):
    """Creates shards, starts the tablets, and inserts some data."""
    try:
      self.run_shard_tablets('0', all_shard_tablets)
      # create the split shards
      self.run_shard_tablets(
          '-80', shard_0_tablets, create_table=False)
      self.run_shard_tablets(
          '80-', shard_1_tablets, create_table=False)

      logging.debug(
          'Start inserting initial data: %s rows', utils.options.num_insert_rows)
      self.insert_values(shard_master, utils.options.num_insert_rows, 2)
      logging.debug(
          'Done inserting initial data, waiting for replication to catch up')
      utils.wait_for_replication_pos(shard_master, shard_rdonly1)
      logging.debug('Replication on source rdonly tablet is caught up')
    except:
      self.tearDown()
      raise
Example #8
    def setUp(self):
        """Creates shards, starts the tablets, and inserts some data."""
        try:
            self.run_shard_tablets('0', all_shard_tablets)
            # create the split shards
            self.run_shard_tablets('-80', shard_0_tablets, create_table=False)
            self.run_shard_tablets('80-', shard_1_tablets, create_table=False)

            logging.debug('Start inserting initial data: %s rows',
                          self.num_insert_rows)
            self.insert_values(shard_master, self.num_insert_rows, 2)
            logging.debug(
                'Done inserting initial data, waiting for replication to catch up'
            )
            utils.wait_for_replication_pos(shard_master, shard_rdonly1)
            logging.debug('Replication on source rdonly tablet is caught up')
        except:
            self.tearDown()
            raise
Example #9
  def external_reparent(self):
    # Demote master.
    start = time.time()
    master.mquery('', mysql_flavor().demote_master_commands(), log_query=True)
    if master.semi_sync_enabled():
      master.set_semi_sync_enabled(master=False)

    # Wait for replica to catch up to master.
    utils.wait_for_replication_pos(master, replica)

    # Wait for at least one second to artificially prolong the failover and give
    # the buffer a chance to observe it.
    d = time.time() - start
    min_unavailability_s = 1
    if d < min_unavailability_s:
      w = min_unavailability_s - d
      logging.debug('Waiting for %.1f seconds because the failover was too fast'
                    ' (took only %.3f seconds)', w, d)
      time.sleep(w)

    # Promote replica to new master.
    replica.mquery('', mysql_flavor().promote_slave_commands(),
                   log_query=True)
    if replica.semi_sync_enabled():
      replica.set_semi_sync_enabled(master=True)
    old_master = master
    new_master = replica

    # Configure old master to use new master.
    new_pos = mysql_flavor().master_position(new_master)
    logging.debug('New master position: %s', str(new_pos))
    # Use 'localhost' as hostname because Travis CI worker hostnames
    # are too long for MySQL replication.
    change_master_cmds = mysql_flavor().change_master_commands(
        'localhost', new_master.mysql_port, new_pos)
    old_master.mquery('', ['RESET SLAVE'] + change_master_cmds +
                      ['START SLAVE'], log_query=True)

    # Notify the new vttablet master about the reparent.
    utils.run_vtctl(['TabletExternallyReparented', new_master.tablet_alias],
                    auto_log=True)
Example #10
    def _test_vtctl_copyschemashard(self, source):
        self._apply_initial_schema()

        self._setUp_tablets_shard_2()

        # CopySchemaShard is responsible for creating the db; one shouldn't exist before
        # the command is run.
        self._check_db_not_created(shard_2_master)
        self._check_db_not_created(shard_2_replica1)

        # Run the command twice to make sure it's idempotent.
        for _ in range(2):
            utils.run_vtctl(["CopySchemaShard", source, "test_keyspace/2"], auto_log=True)

            # shard_2_master should look the same as the replica we copied from
            self._check_tables(shard_2_master, 4)
            utils.wait_for_replication_pos(shard_2_master, shard_2_replica1)
            self._check_tables(shard_2_replica1, 4)
            shard_0_schema = self._get_schema(shard_0_master.tablet_alias)
            shard_2_schema = self._get_schema(shard_2_master.tablet_alias)
            self.assertEqual(shard_0_schema, shard_2_schema)
Example #11
    def _test_vtctl_copyschemashard(self, source):
        self._apply_initial_schema()

        self._setUp_tablets_shard_2()

        # CopySchemaShard is responsible for creating the db; one shouldn't exist before
        # the command is run.
        self._check_db_not_created(shard_2_master)
        self._check_db_not_created(shard_2_replica1)

        # Run the command twice to make sure it's idempotent.
        for _ in range(2):
            utils.run_vtctl(['CopySchemaShard', source, 'test_keyspace/2'],
                            auto_log=True)

            # shard_2_master should look the same as the replica we copied from
            self._check_tables(shard_2_master, 4)
            utils.wait_for_replication_pos(shard_2_master, shard_2_replica1)
            self._check_tables(shard_2_replica1, 4)
            shard_0_schema = self._get_schema(shard_0_master.tablet_alias)
            shard_2_schema = self._get_schema(shard_2_master.tablet_alias)
            self.assertEqual(shard_0_schema, shard_2_schema)
Example #12
  def setUp(self):
    """Creates the necessary shards, starts the tablets, and inserts some data."""
    self.run_shard_tablets('0', shard_tablets)
    # create the split shards
    self.run_shard_tablets('-80', shard_0_tablets, create_db=False,
      create_table=False, wait_state='NOT_SERVING')
    self.run_shard_tablets('80-', shard_1_tablets, create_db=False,
      create_table=False, wait_state='NOT_SERVING')

    # Copy the schema to the destination shards
    for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
      utils.run_vtctl(['CopySchemaShard',
                       '--exclude_tables', 'unrelated',
                       shard_rdonly1.tablet_alias,
                       keyspace_shard],
                      auto_log=True)

    logging.debug("Start inserting initial data: %s rows", utils.options.num_insert_rows)
    self.insert_values(shard_master, utils.options.num_insert_rows, 2)
    logging.debug("Done inserting initial data, waiting for replication to catch up")
    utils.wait_for_replication_pos(shard_master, shard_rdonly1)
    logging.debug("Replication on source rdonly tablet is caught up")
Example #13
    def _test_reparent_from_outside(self, brutal=False):
        """This test will start a master and 3 slaves.

    Then:
    - one slave will be the new master
    - one slave will be reparented to that new master
    - one slave will be busted and dead in the water
    and we'll call TabletExternallyReparented.

    Args:
      brutal: kills the old master first
    """
        utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

        # create the database so vttablets start, as they are serving
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.create_db('vt_test_keyspace')

        # Start up a master mysql and vttablet
        tablet_62344.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # Create a few slaves for testing reparenting.
        tablet_62044.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_41983.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)
        tablet_31981.init_tablet('replica',
                                 'test_keyspace',
                                 '0',
                                 start=True,
                                 wait_for_start=False)

        # wait for all tablets to start
        for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
            t.wait_for_vttablet_state('NOT_SERVING')

        # Reparent as a starting point
        utils.run_vtctl([
            'InitShardMaster', '-force', 'test_keyspace/0',
            tablet_62344.tablet_alias
        ],
                        auto_log=True)

        # now manually reparent 1 out of 2 tablets
        # 62044 will be the new master
        # 31981 won't be re-parented, so it will be busted

        # Shutdown the old master first.
        if not brutal:
            tablet_62344.mquery('', mysql_flavor().demote_master_commands())

            # Get the position of the old master and wait for the new one to catch up.
            utils.wait_for_replication_pos(tablet_62344, tablet_62044)

        # Promote the new master.
        tablet_62044.mquery('', mysql_flavor().promote_slave_commands())
        new_pos = mysql_flavor().master_position(tablet_62044)
        logging.debug('New master position: %s', str(new_pos))
        # Use 'localhost' as hostname because Travis CI worker hostnames
        # are too long for MySQL replication.
        change_master_cmds = mysql_flavor().change_master_commands(
            'localhost', tablet_62044.mysql_port, new_pos)

        # 62344 will now be a slave of 62044
        tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                            change_master_cmds + ['START SLAVE'])

        # 41983 will be a slave of 62044
        tablet_41983.mquery('', ['STOP SLAVE'] + change_master_cmds +
                            ['START SLAVE'])

        # in brutal mode, we kill the old master first
        # and delete its tablet record
        if brutal:
            tablet_62344.kill_vttablet()
            utils.run_vtctl(
                ['DeleteTablet', '-allow_master', tablet_62344.tablet_alias],
                auto_log=True)

        base_time = time.time()

        # update topology with the new server
        utils.run_vtctl(
            ['TabletExternallyReparented', tablet_62044.tablet_alias],
            mode=utils.VTCTL_VTCTL,
            auto_log=True)

        self._test_reparent_from_outside_check(brutal, base_time)

        if not brutal:
            tablet_62344.kill_vttablet()
        tablet.kill_tablets([tablet_31981, tablet_62044, tablet_41983])
Example #14
  def _test_reparent_from_outside(self, brutal=False):
    """This test will start a master and 3 slaves.

    Then:
    - one slave will be the new master
    - one slave will be reparented to that new master
    - one slave will be busted and dead in the water
    and we'll call TabletExternallyReparented.

    Args:
      brutal: kills the old master first
    """
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as they are serving
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.create_db('vt_test_keyspace')

    # Start up a master mysql and vttablet
    tablet_62344.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)

    # Create a few slaves for testing reparenting.
    tablet_62044.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)
    tablet_41983.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)
    tablet_31981.init_tablet('replica', 'test_keyspace', '0', start=True,
                             wait_for_start=False)

    # wait for all tablets to start
    for t in [tablet_62344, tablet_62044, tablet_41983, tablet_31981]:
      t.wait_for_vttablet_state('NOT_SERVING')

    # Reparent as a starting point
    utils.run_vtctl(['InitShardMaster', '-force', 'test_keyspace/0',
                     tablet_62344.tablet_alias], auto_log=True)

    # now manually reparent 1 out of 2 tablets
    # 62044 will be the new master
    # 31981 won't be re-parented, so it will be busted

    # Shutdown the old master first.
    if not brutal:
      tablet_62344.mquery('', mysql_flavor().demote_master_commands())

      # Get the position of the old master and wait for the new one to catch up.
      utils.wait_for_replication_pos(tablet_62344, tablet_62044)

    # Promote the new master.
    tablet_62044.mquery('', mysql_flavor().promote_slave_commands())
    new_pos = mysql_flavor().master_position(tablet_62044)
    logging.debug('New master position: %s', str(new_pos))
    # Use 'localhost' as hostname because Travis CI worker hostnames
    # are too long for MySQL replication.
    change_master_cmds = mysql_flavor().change_master_commands(
        'localhost',
        tablet_62044.mysql_port,
        new_pos)

    # 62344 will now be a slave of 62044
    tablet_62344.mquery('', ['RESET MASTER', 'RESET SLAVE'] +
                        change_master_cmds +
                        ['START SLAVE'])

    # 41983 will be a slave of 62044
    tablet_41983.mquery('', ['STOP SLAVE'] +
                        change_master_cmds +
                        ['START SLAVE'])

    # in brutal mode, we kill the old master first
    # and delete its tablet record
    if brutal:
      tablet_62344.kill_vttablet()
      utils.run_vtctl(['DeleteTablet', '-allow_master',
                       tablet_62344.tablet_alias], auto_log=True)

    base_time = time.time()

    # update topology with the new server
    utils.run_vtctl(['TabletExternallyReparented', tablet_62044.tablet_alias],
                    mode=utils.VTCTL_VTCTL, auto_log=True)

    self._test_reparent_from_outside_check(brutal, base_time)

    if not brutal:
      tablet_62344.kill_vttablet()
    tablet.kill_tablets([tablet_31981, tablet_62044, tablet_41983])
Example #15
    def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
        """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
        if mysql_down:
            logging.debug('Shutting down mysqld on destination masters.')
            utils.wait_procs([
                shard_0_master.shutdown_mysql(),
                shard_1_master.shutdown_mysql()
            ])

        worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
            ['--cell', 'test_nj'], auto_log=True)

        # --max_tps is only specified to enable the throttler and ensure that the
        # code is executed. But the intent here is not to throttle the test, hence
        # the rate limit is set very high.
        # --chunk_count is 2 because rows are currently ordered by primary key such
        # that all rows of the first shard come first and then the second shard.
        # TODO(mberlin): Remove --offline=false once vtworker ensures that the
        #                destination shards are not behind the master's replication
        #                position.
        args = [
            'SplitClone', '--offline=false', '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1', '--max_tps', '9999'
        ]
        if not mysql_down:
            # Make the clone as slow as necessary such that there is enough time to
            # run PlannedReparent in the meantime.
            # TODO(mberlin): Once insert_values is fixed to uniformly distribute the
            #                rows across shards when sorted by primary key, remove
            #                --chunk_count 2, --min_rows_per_chunk 1 and set
            #                --source_reader_count back to 1.
            args.extend([
                '--source_reader_count', '2', '--chunk_count', '2',
                '--min_rows_per_chunk', '1', '--write_query_max_rows', '1'
            ])
        args.append('test_keyspace/0')
        workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

        if mysql_down:
            # If MySQL is down, we wait until vtworker retried at least once to make
            # sure it reached the point where a write failed due to MySQL being down.
            # There should be two retries at least, one for each destination shard.
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerRetryCount >= 2',
                condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
            logging.debug(
                'Worker has retried at least twice, starting reparent now')

            # vtworker is blocked at this point. This is a good time to test that its
            # throttler server is reacting to RPCs.
            self.check_throttler_service(
                'localhost:%d' % worker_rpc_port,
                ['test_keyspace/-80', 'test_keyspace/80-'], 9999)

            # Bring back masters. Since we test with semi-sync now, we need at least
            # one replica for the new master. This test is already quite expensive,
            # so we bring back the old master as a replica rather than having a third
            # replica up the whole time.
            logging.debug('Restarting mysqld on destination masters')
            utils.wait_procs(
                [shard_0_master.start_mysql(),
                 shard_1_master.start_mysql()])

            # Reparent away from the old masters.
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        else:
            # NOTE: There is a race condition around this:
            #   It's possible that the SplitClone vtworker command finishes before the
            #   PlannedReparentShard vtctl command, which we start below, succeeds.
            #   Then the test would fail because vtworker did not have to retry.
            #
            # To work around this, the test takes a parameter to increase the number of
            # rows that the worker has to copy (with the idea being to slow the worker
            # down).
            # You should choose a value for num_insert_rows, such that this test
            # passes for your environment (trial-and-error...)
            # Make sure that vtworker got past the point where it picked a master
            # for each destination shard ("finding targets" state).
            utils.poll_for_vars(
                'vtworker',
                worker_port,
                'WorkerState == cloning the data (online)',
                condition_fn=lambda v: v.get('WorkerState') == 'cloning the'
                ' data (online)')
            logging.debug('Worker is in copy state, starting reparent now')

            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/-80',
                shard_0_replica.tablet_alias
            ],
                            auto_log=True)
            utils.run_vtctl([
                'PlannedReparentShard', 'test_keyspace/80-',
                shard_1_replica.tablet_alias
            ],
                            auto_log=True)

        utils.wait_procs([workerclient_proc])

        # Verify that we were forced to re-resolve and retry.
        worker_vars = utils.get_vars(worker_port)
        self.assertGreater(
            worker_vars['WorkerRetryCount'], 1,
            "expected vtworker to retry each of the two reparented"
            " destination masters at least once, but it didn't")
        self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                            "expected vtworker to retry, but it didn't")
        utils.kill_sub_process(worker_proc, soft=True)

        # Wait for the destination RDONLYs to catch up or the following offline
        # clone will try to insert rows which already exist.
        # TODO(mberlin): Remove this once SplitClone supports it natively.
        utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
        utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)
        # Run final offline clone to enable filtered replication.
        _, _ = utils.run_vtworker([
            '-cell', 'test_nj', 'SplitClone', '--online=false',
            '--min_healthy_rdonly_tablets', '1', 'test_keyspace/0'
        ],
                                  auto_log=True)

        # Make sure that everything is caught up to the same replication point
        self.run_split_diff('test_keyspace/-80', all_shard_tablets,
                            shard_0_tablets)
        self.run_split_diff('test_keyspace/80-', all_shard_tablets,
                            shard_1_tablets)

        self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
        self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)
Example #16
  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down: boolean. If True, we take down the MySQL instances on the
        destination masters at first, then bring them back and reparent away.

    Raises:
      AssertionError if things didn't go as expected.
    """
    if mysql_down:
      logging.debug('Shutting down mysqld on destination masters.')
      utils.wait_procs(
          [shard_0_master.shutdown_mysql(),
           shard_1_master.shutdown_mysql()])

    worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
        ['--cell', 'test_nj', '--use_v3_resharding_mode=false'],
        auto_log=True)

    # --max_tps is only specified to enable the throttler and ensure that the
    # code is executed. But the intent here is not to throttle the test, hence
    # the rate limit is set very high.
    # --chunk_count is 2 because rows are currently ordered by primary key such
    # that all rows of the first shard come first and then the second shard.
    # TODO(mberlin): Remove --offline=false once vtworker ensures that the
    #                destination shards are not behind the master's replication
    #                position.
    args = ['SplitClone',
            '--offline=false',
            '--destination_writer_count', '1',
            '--min_healthy_rdonly_tablets', '1',
            '--max_tps', '9999']
    if not mysql_down:
      # Make the clone as slow as necessary such that there is enough time to
      # run PlannedReparent in the meantime.
      # TODO(mberlin): Once insert_values is fixed to uniformly distribute the
      #                rows across shards when sorted by primary key, remove
      #                --chunk_count 2, --min_rows_per_chunk 1 and set
      #                --source_reader_count back to 1.
      args.extend(['--source_reader_count', '2',
                   '--chunk_count', '2',
                   '--min_rows_per_chunk', '1',
                   '--write_query_max_rows', '1'])
    args.append('test_keyspace/0')
    workerclient_proc = utils.run_vtworker_client_bg(args, worker_rpc_port)

    if mysql_down:
      # If MySQL is down, we wait until vtworker retried at least once to make
      # sure it reached the point where a write failed due to MySQL being down.
      # There should be two retries at least, one for each destination shard.
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerRetryCount >= 2',
          condition_fn=lambda v: v.get('WorkerRetryCount') >= 2)
      logging.debug('Worker has retried at least twice, starting reparent now')

      # vtworker is blocked at this point. This is a good time to test that its
      # throttler server is reacting to RPCs.
      self.check_throttler_service('localhost:%d' % worker_rpc_port,
                                   ['test_keyspace/-80', 'test_keyspace/80-'],
                                   9999)

      # Bring back masters. Since we test with semi-sync now, we need at least
      # one replica for the new master. This test is already quite expensive,
      # so we bring back the old master as a replica rather than having a third
      # replica up the whole time.
      logging.debug('Restarting mysqld on destination masters')
      utils.wait_procs(
          [shard_0_master.start_mysql(),
           shard_1_master.start_mysql()])

      # Reparent away from the old masters.
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80',
           '-new_master', shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-',
           '-new_master', shard_1_replica.tablet_alias], auto_log=True)

    else:
      # NOTE: There is a race condition around this:
      #   It's possible that the SplitClone vtworker command finishes before the
      #   PlannedReparentShard vtctl command, which we start below, succeeds.
      #   Then the test would fail because vtworker did not have to retry.
      #
      # To work around this, the test takes a parameter to increase the number of
      # rows that the worker has to copy (with the idea being to slow the worker
      # down).
      # You should choose a value for num_insert_rows, such that this test
      # passes for your environment (trial-and-error...)
      # Make sure that vtworker got past the point where it picked a master
      # for each destination shard ("finding targets" state).
      utils.poll_for_vars(
          'vtworker', worker_port,
          'WorkerState == cloning the data (online)',
          condition_fn=lambda v: v.get('WorkerState') == 'cloning the'
          ' data (online)')
      logging.debug('Worker is in copy state, starting reparent now')

      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/-80',
           '-new_master', shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(
          ['PlannedReparentShard', '-keyspace_shard', 'test_keyspace/80-',
           '-new_master', shard_1_replica.tablet_alias], auto_log=True)

    utils.wait_procs([workerclient_proc])

    # Verify that we were forced to re-resolve and retry.
    worker_vars = utils.get_vars(worker_port)
    self.assertGreater(worker_vars['WorkerRetryCount'], 1,
                       "expected vtworker to retry each of the two reparented"
                       " destination masters at least once, but it didn't")
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")
    utils.kill_sub_process(worker_proc, soft=True)

    # Wait for the destination RDONLYs to catch up or the following offline
    # clone will try to insert rows which already exist.
    # TODO(mberlin): Remove this once SplitClone supports it natively.
    utils.wait_for_replication_pos(shard_0_replica, shard_0_rdonly1)
    utils.wait_for_replication_pos(shard_1_replica, shard_1_rdonly1)
    # Run final offline clone to enable filtered replication.
    _, _ = utils.run_vtworker(['-cell', 'test_nj',
                               '--use_v3_resharding_mode=false',
                               'SplitClone',
                               '--online=false',
                               '--min_healthy_rdonly_tablets', '1',
                               'test_keyspace/0'], auto_log=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', all_shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', all_shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)