def get_random_tablet(self, keyspace=None, shard_name=None, cell=None,
                        tablet_type=None, task_number=None):
    """Get a random tablet name.

    Args:
      keyspace: keyspace to select the tablet from (None for random) (string).
      shard_name: shard to select tablet from (None for random) (string).
      cell: cell to select tablet from (None for random) (string).
      tablet_type: type of tablet to select (None for random) (string).
      task_number: a specific task number (None for random) (int).

    Returns:
      random tablet name (cell-uid) (string).
    """
    keyspace = keyspace or random.choice(self.keyspaces)
    # Pick a random shard when none is given; shard indices are zero-based.
    num_shards = self.num_shards[self.keyspaces.index(keyspace)]
    shard_name = shard_name or sharding_utils.get_shard_name(
        random.randrange(num_shards), num_shards)
    cell = cell or random.choice(self.cells)
    tablets = [t.split(' ') for t in self.vtctl_helper.execute_vtctl_command(
        ['ListShardTablets', '%s/%s' % (keyspace, shard_name)]).split('\n')]
    cell_tablets = [t for t in tablets if self.get_tablet_cell(t[0]) == cell]
    if task_number is not None:
      # Assumes tablets are listed in task-number order within the cell.
      return cell_tablets[task_number][0]
    if tablet_type:
      return random.choice([t[0] for t in cell_tablets if t[3] == tablet_type])
    return random.choice(cell_tablets)[0]
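A minimal usage sketch, assuming a test class whose self.env is this environment (the keyspace name and shard count are placeholders):

  shard_name = sharding_utils.get_shard_name(0, 2)  # first of two shards
  replica = self.env.get_random_tablet('test_keyspace', shard_name,
                                       tablet_type='replica')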
Example 2
    def implicit_reparent(self,
                          keyspace,
                          shard,
                          num_shards,
                          perform_emergency_reparent=False):
        """Performs an implicit reparent.

    This function will restart the current master task and verify that a new
    task was selected to be the master.

    Args:
      keyspace: Name of the keyspace to reparent (string)
      shard: Numeric ID of the shard to reparent (zero based int)
      num_shards: Total number of shards (int)
      perform_emergency_reparent: Do an emergency reparent as well (bool)
    """

        shard_name = sharding_utils.get_shard_name(shard, num_shards)

        original_master_name = (self.env.get_current_master_name(
            keyspace, shard_name))

        logging.info('Restarting %s/%s, current master: %s', keyspace,
                     shard_name, original_master_name)
        ret_val = self.env.restart_mysql_task(original_master_name, 'mysql',
                                              True)

        self.assertEquals(ret_val,
                          0,
                          msg='restart failed (returned %d)' % ret_val)

        if perform_emergency_reparent:
            next_master = self.env.get_next_master(keyspace, shard_name)[2]
            logging.info('Emergency reparenting %s/%s to %s', keyspace,
                         shard_name, next_master)
            self.env.internal_reparent(keyspace,
                                       shard_name,
                                       next_master,
                                       emergency=True)

        start_time = time.time()
        while time.time() - start_time < self.reparent_timeout_threshold:
            new_master_name = self.env.get_current_master_name(
                keyspace, shard_name)
            if new_master_name != original_master_name:
                break
            time.sleep(1)
        self.assertNotEquals(
            new_master_name,
            original_master_name,
            msg='Expected master tablet to change, but it remained as %s' %
            (new_master_name))
        logging.info('restart on %s/%s resulted in new master: %s', keyspace,
                     shard_name, new_master_name)
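A hypothetical driver for this helper, assuming the class attributes set up in the setUpClass examples further below (num_reparents, env.keyspaces, env.num_shards):

    def test_implicit_reparent(self):
        # Sketch only: iterate every shard of every keyspace, num_reparents times.
        for _ in xrange(self.num_reparents):
            for keyspace, num_shards in zip(self.env.keyspaces,
                                            self.env.num_shards):
                for shard in xrange(num_shards):
                    self.implicit_reparent(keyspace, shard, num_shards)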
Example 3
    def implicit_reparent(self, keyspace, shard, num_shards):
        """Performs an implicit reparent.

    This function will call borg restart on the current master task and
    verify that a new task was selected to be the master.

    Args:
      keyspace: Name of the keyspace to reparent (string)
      shard: Numeric ID of the shard to reparent (zero based int)
      num_shards: Total number of shards (int)
    """

        shard_name = sharding_utils.get_shard_name(shard, num_shards)

        original_master_name = (self.env.get_current_master_name(
            keyspace, shard_name))
        original_master_cell = self.env.get_tablet_cell(original_master_name)
        master_task_num = self.env.get_tablet_task_number(original_master_name)

        logging.info('Restarting %s/%s, current master: %s, task: %d',
                     keyspace, shard_name, original_master_name,
                     master_task_num)
        ret_val = self.env.restart_mysql_task(original_master_cell, keyspace,
                                              shard, master_task_num,
                                              'replica', 'mysql-alloc', True)

        self.assertEquals(ret_val,
                          0,
                          msg='restartalloc failed (returned %d)' % ret_val)

        start_time = time.time()
        while time.time() - start_time < self.reparent_timeout_threshold:
            new_master_name = self.env.get_current_master_name(
                keyspace, shard_name)
            new_master_task_num = self.env.get_tablet_task_number(
                new_master_name)
            if new_master_name != original_master_name:
                break
            time.sleep(1)
        self.assertNotEquals(
            new_master_name,
            original_master_name,
            msg='Expected master tablet to change, but it remained as %s' %
            (new_master_name))
        logging.info(
            'restartalloc on %s/%s resulted in new master: %s, task: %d',
            keyspace, shard_name, new_master_name, new_master_task_num)
Example 4
  def get_current_master_cell(self, keyspace):
    """Obtains current master cell.

    This gets the master cell for the first shard in the keyspace, and assumes
    that all shards share the same master.

    Args:
      keyspace: name of the keyspace to get the master cell for (string).

    Returns:
      master cell name (string).
    """
    num_shards = self.num_shards[self.keyspaces.index(keyspace)]
    first_shard_name = sharding_utils.get_shard_name(0, num_shards)
    first_shard_master_tablet = (
        self.get_current_master_name(keyspace, first_shard_name))
    return self.get_tablet_cell(first_shard_master_tablet)
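For example, a cross-cell failover helper could use this to pick a target cell other than the one currently hosting the masters. A sketch, assuming it runs inside this environment class (self.cells mirrors the attribute used in Example 1):

  master_cell = self.get_current_master_cell(keyspace)
  other_cells = [c for c in self.cells if c != master_cell]
  target_cell = random.choice(other_cells)  # destination cell for the reparent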
Example 5
  def implicit_reparent(
      self, keyspace, shard, num_shards, perform_emergency_reparent=False):
    """Performs an implicit reparent.

    This function will restart the current master task and verify that a new
    task was selected to be the master.

    Args:
      keyspace: Name of the keyspace to reparent (string)
      shard: Numeric ID of the shard to reparent (zero based int)
      num_shards: Total number of shards (int)
      perform_emergency_reparent: Do an emergency reparent as well (bool)
    """

    shard_name = sharding_utils.get_shard_name(shard, num_shards)

    original_master_name = (
        self.env.get_current_master_name(keyspace, shard_name))

    logging.info('Restarting %s/%s, current master: %s',
                 keyspace, shard_name, original_master_name)
    ret_val = self.env.restart_mysql_task(original_master_name, 'mysql', True)

    self.assertEquals(ret_val, 0,
                      msg='restart failed (returned %d)' % ret_val)

    if perform_emergency_reparent:
      next_master = self.env.get_next_master(keyspace, shard_name)[2]
      logging.info('Emergency reparenting %s/%s to %s', keyspace, shard_name,
                   next_master)
      self.env.internal_reparent(
          keyspace, shard_name, next_master, emergency=True)

    start_time = time.time()
    while time.time() - start_time < self.reparent_timeout_threshold:
      new_master_name = self.env.get_current_master_name(keyspace, shard_name)
      if new_master_name != original_master_name:
        break
      time.sleep(1)
    self.assertNotEquals(
        new_master_name, original_master_name,
        msg='Expected master tablet to change, but it remained as %s' % (
            new_master_name))
    logging.info('restart on %s/%s resulted in new master: %s',
                 keyspace, shard_name, new_master_name)
Example 6
  def test_backup(self):
    logging.info('Performing %s backup cycles', self.num_backups)
    for attempt in xrange(self.num_backups):
      logging.info('Backup iteration %d of %d', attempt + 1, self.num_backups)
      for keyspace, num_shards in zip(self.env.keyspaces, self.env.num_shards):
        backup_tablets = []
        for shard in xrange(num_shards):
          # Pick a random replica tablet in each shard
          tablets = self.env.get_tablet_types_for_shard(
              keyspace, sharding_utils.get_shard_name(shard, num_shards))
          available_tablets = [x for x in tablets if x[1] == 'replica']
          self.assertTrue(
              len(available_tablets), 'No available tablets found to backup!')
          tablet_to_backup_name = random.choice(available_tablets)[0]
          backup_tablets.append(tablet_to_backup_name)

        self.perform_backup(backup_tablets)
        self.perform_restore(backup_tablets, num_shards)
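The perform_backup helper is not shown here; a minimal sketch of what it could look like, assuming the vtctl 'Backup' command used in the setUpClass examples below:

  def perform_backup(self, tablet_names):
    # Hypothetical sketch: back up each selected tablet via vtctl.
    for tablet_name in tablet_names:
      logging.info('Backing up tablet %s', tablet_name)
      self.env.vtctl_helper.execute_vtctl_command(['Backup', tablet_name])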
Example 8
  def implicit_reparent(self, keyspace, shard, num_shards):
    """Performs an implicit reparent.

    This function will call borg restart on the current master task and
    verify that a new task was selected to be the master.

    Args:
      keyspace: Name of the keyspace to reparent (string)
      shard: Numeric ID of the shard to reparent (zero based int)
      num_shards: Total number of shards (int)
    """

    shard_name = sharding_utils.get_shard_name(shard, num_shards)

    original_master_name = (
        self.env.get_current_master_name(keyspace, shard_name))
    original_master_cell = self.env.get_tablet_cell(original_master_name)
    master_task_num = self.env.get_tablet_task_number(original_master_name)

    logging.info('Restarting %s/%s, current master: %s, task: %d',
                 keyspace, shard_name, original_master_name, master_task_num)
    ret_val = self.env.restart_mysql_task(
        original_master_cell, keyspace, shard, master_task_num, 'replica',
        'mysql-alloc', True)

    self.assertEquals(ret_val, 0,
                      msg='restartalloc failed (returned %d)' % ret_val)

    start_time = time.time()
    while time.time() - start_time < self.reparent_timeout_threshold:
      new_master_name = self.env.get_current_master_name(keyspace, shard_name)
      new_master_task_num = self.env.get_tablet_task_number(new_master_name)
      if new_master_name != original_master_name:
        break
      time.sleep(1)
    self.assertNotEquals(
        new_master_name, original_master_name,
        msg='Expected master tablet to change, but it remained as %s' % (
            new_master_name))
    logging.info('restartalloc on %s/%s resulted in new master: %s, task: %d',
                 keyspace, shard_name, new_master_name, new_master_task_num)
Example 9
  def setUpClass(cls):
    super(ReparentTest, cls).setUpClass()

    # number of reparent iterations
    cls.num_reparents = int(cls.test_params.get('num_reparents', '1'))

    # max allowable median master downtime in seconds
    cls.master_downtime_threshold = int(cls.test_params.get(
        'master_downtime_threshold', '20'))

    # seconds to wait for reparent to result in a new master
    cls.reparent_timeout_threshold = int(cls.test_params.get(
        'reparent_timeout_threshold', '30'))

    for keyspace, num_shards in zip(cls.env.keyspaces, cls.env.num_shards):
      for shard in xrange(num_shards):
        shard_name = sharding_utils.get_shard_name(shard, num_shards)
        backup_tablet_uid = cls.env.get_random_tablet(
            keyspace, shard_name, tablet_type='replica')
        logging.info('Taking a backup on tablet %s for %s/%s',
                     backup_tablet_uid, keyspace, shard_name)
        cls.env.vtctl_helper.execute_vtctl_command(
            ['Backup', backup_tablet_uid])
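The three thresholds above are read from cls.test_params as strings; a hypothetical parameter dict for three reparent iterations might look like this (values are illustrative, and the defaults shown in the code apply when a key is missing):

  test_params = {
      'num_reparents': '3',                # reparent iterations
      'master_downtime_threshold': '20',   # max median master downtime (seconds)
      'reparent_timeout_threshold': '30',  # wait for a new master (seconds)
  }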
Example 10
    def setUpClass(cls):
        super(ReparentTest, cls).setUpClass()

        # number of reparent iterations
        cls.num_reparents = int(cls.test_params.get('num_reparents', '1'))

        # max allowable median master downtime in seconds
        cls.master_downtime_threshold = int(
            cls.test_params.get('master_downtime_threshold', '20'))

        # seconds to wait for reparent to result in a new master
        cls.reparent_timeout_threshold = int(
            cls.test_params.get('reparent_timeout_threshold', '30'))

        for keyspace, num_shards in zip(cls.env.keyspaces, cls.env.num_shards):
            for shard in xrange(num_shards):
                shard_name = sharding_utils.get_shard_name(shard, num_shards)
                backup_tablet_uid = cls.env.get_random_tablet(
                    keyspace, shard_name, tablet_type='replica')
                logging.info('Taking a backup on tablet %s for %s/%s',
                             backup_tablet_uid, keyspace, shard_name)
                cls.env.vtctl_helper.execute_vtctl_command(
                    ['Backup', backup_tablet_uid])
Example 11
  def explicit_reparent(self, keyspace, num_shards, external=False,
                        cross_cell=False):
    """Performs an explicit reparent.

    This function will explicitly select a new master and verify that the
    topology is updated.

    Args:
      keyspace: Name of the keyspace to reparent (string)
      num_shards: Total number of shards (int)
      external: Whether the reparent should be external or through vtctl (bool)
      cross_cell: Whether to reparent to a different cell (bool)

    Returns:
      How long we waited for the reparent.
      The time begins just before calling an explicit reparent.
      This is a list of floats, one for each shard.
      For cross-cell reparents, it returns [].
    """
    next_masters = []
    durations = []

    for shard in xrange(num_shards):
      shard_name = sharding_utils.get_shard_name(shard, num_shards)
      original_master = self.env.get_current_master_name(keyspace, shard_name)

      next_master = self.env.get_next_master(keyspace, shard_name, cross_cell)
      next_masters.append(next_master)

      self.env.wait_for_good_failover_status(keyspace, shard_name)

      # Call Reparent in a separate thread.
      def reparent_shard(shard, shard_name, original_master, next_master):
        logging.info('Reparenting %s/%s from %s to %s', keyspace, shard_name,
                     original_master, next_master[2])
        if external:
          return_code, return_output = self.env.external_reparent(
              keyspace, next_master[0], shard, new_task_num=next_master[1])
        else:
          return_code, return_output = self.env.internal_reparent(
              keyspace, shard_name, next_master[2])
        logging.info('Reparent returned %d for %s/%s: %s',
                     return_code, keyspace, shard_name, return_output)

      thread = threading.Thread(target=reparent_shard,
                                args=[shard, shard_name, original_master,
                                      next_master])
      start_time = time.time()
      thread.start()

      # Wait for the reparent.
      while time.time() - start_time < self.reparent_timeout_threshold:
        try:
          tablet_health = json.loads(
              self.env.vtctl_helper.execute_vtctl_command(
                  ['VtTabletStreamHealth', next_master[2]]))
          if tablet_health['target']['tablet_type'] == topodata_pb2.MASTER:
            duration = time.time() - start_time
            durations.append(duration)
            logging.info('Reparent took %f seconds', duration)
            break
        except (IndexError, KeyError, vtctl_helper.VtctlClientError):
          pass
      else:
        self.fail('Timed out waiting for reparent on %s/%s' % (
            keyspace, shard_name))

      thread.join()

    return durations
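A caller can compare the returned durations against the median downtime budget configured in the setUpClass examples; a minimal sketch, assuming the master_downtime_threshold attribute shown there:

    durations = self.explicit_reparent(keyspace, num_shards, external=True)
    if durations:
      # Upper median is good enough for a threshold check in this sketch.
      median_downtime = sorted(durations)[len(durations) // 2]
      self.assertLessEqual(
          median_downtime, self.master_downtime_threshold,
          msg='Median master downtime %.1fs exceeded threshold' %
          median_downtime)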
Example 12
    def explicit_reparent(self,
                          keyspace,
                          num_shards,
                          external=False,
                          cross_cell=False):
        """Performs an explicit reparent.

    This function will explicitly select a new master and verify that the
    topology is updated.

    Args:
      keyspace: Name of the keyspace to reparent (string)
      num_shards: Total number of shards (int)
      external: Whether the reparent should be external or through vtctl (bool)
      cross_cell: Whether to reparent to a different cell (bool)

    Returns:
      How long we waited for the reparent.
      The time begins just before calling an explicit reparent.
      This is a list of floats, one for each shard.
      For cross-cell reparents, it returns [].
    """
        next_masters = []
        durations = []

        for shard in xrange(num_shards):
            shard_name = sharding_utils.get_shard_name(shard, num_shards)
            original_master = self.env.get_current_master_name(
                keyspace, shard_name)

            next_master = self.env.get_next_master(keyspace, shard_name,
                                                   cross_cell)
            next_masters.append(next_master)

            self.env.wait_for_good_failover_status(keyspace, shard_name)

            # Call Reparent in a separate thread.
            def reparent_shard(shard_name, original_master, next_master):
                logging.info('Reparenting %s/%s from %s to %s', keyspace,
                             shard_name, original_master, next_master[2])
                reparent_fn = self.env.external_reparent if external else (
                    self.env.internal_reparent)
                return_code, return_output = reparent_fn(
                    keyspace, shard_name, next_master[2])
                logging.info('Reparent returned %d for %s/%s: %s', return_code,
                             keyspace, shard_name, return_output)

            thread = threading.Thread(
                target=reparent_shard,
                args=[shard_name, original_master, next_master])
            start_time = time.time()
            thread.start()

            # Wait for the reparent.
            while time.time() - start_time < self.reparent_timeout_threshold:
                try:
                    tablet_health = json.loads(
                        self.env.vtctl_helper.execute_vtctl_command(
                            ['VtTabletStreamHealth', next_master[2]]))
                    if tablet_health['target'][
                            'tablet_type'] == topodata_pb2.MASTER:
                        duration = time.time() - start_time
                        durations.append(duration)
                        logging.info('Reparent took %f seconds', duration)
                        break
                except (IndexError, KeyError,
                        vtctl_helper.VtctlClientError) as e:
                    logging.info(
                        'While waiting for reparent, got the following error: %s',
                        e)
            else:
                self.fail('Timed out waiting for reparent on %s/%s' %
                          (keyspace, shard_name))

            thread.join()

        return durations
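A hypothetical cross-cell driver combining this helper with the get_current_master_cell helper shown earlier, assuming both are reachable from the same test class:

    def test_cross_cell_reparent(self):
        # Sketch only: reparent every keyspace to another cell and verify
        # the masters actually moved.
        for keyspace, num_shards in zip(self.env.keyspaces,
                                        self.env.num_shards):
            original_cell = self.env.get_current_master_cell(keyspace)
            self.explicit_reparent(keyspace, num_shards, cross_cell=True)
            new_cell = self.env.get_current_master_cell(keyspace)
            self.assertNotEquals(
                new_cell, original_cell,
                msg='Expected masters to move out of cell %s' % original_cell)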