def test_health_check(self):
    """Check that replication health reaches the tablet record and serving graph.

    Starts a master and a 'spare' tablet that both target 'replica', waits
    for the spare to become 'replica', stops replication on it and expects
    replication_lag == 'high' in its tablet record and in the 'replica'
    end points of test_keyspace/0.
    """
    master, slave = tablet_62344, tablet_62044
    utils.run_vtctl('CreateKeyspace test_keyspace')

    # one master, one replica that starts in spare
    master.init_tablet('master', 'test_keyspace', '0')
    slave.init_tablet('spare', 'test_keyspace', '0')
    for each in (master, slave):
        each.create_db('vt_test_keyspace')
    for each in (master, slave):
        each.start_vttablet(wait_for_state=None,
                            target_tablet_type='replica')

    master.wait_for_vttablet_state('SERVING')
    slave.wait_for_vttablet_state('NOT_SERVING')

    utils.run_vtctl(['ReparentShard', '-force', 'test_keyspace/0',
                     master.tablet_alias])

    # make sure the 'spare' slave goes to 'replica'
    remaining = 10
    record = utils.run_vtctl_json(['GetTablet', slave.tablet_alias])
    while record['Type'] != "replica":
        remaining = utils.wait_step('slave tablet going to replica',
                                    remaining)
        record = utils.run_vtctl_json(['GetTablet', slave.tablet_alias])
    logging.info("Slave tablet went to replica, good")

    # make sure the master is still master
    record = utils.run_vtctl_json(['GetTablet', master.tablet_alias])
    self.assertEqual(record['Type'], 'master',
                     "unexpected master type: %s" % record['Type'])

    # stop replication on the slave, see it trigger the slave going
    # slightly unhealthy
    slave.mquery('', 'stop slave')
    remaining = 10
    while True:
        record = utils.run_vtctl_json(['GetTablet', slave.tablet_alias])
        health = record['Health'] if 'Health' in record else None
        if (health and 'replication_lag' in health
                and health['replication_lag'] == 'high'):
            logging.info("Slave tablet replication_lag went to high, good")
            break
        remaining = utils.wait_step('slave has high replication lag',
                                    remaining)

    # make sure the serving graph was updated
    ep = utils.run_vtctl_json(['GetEndPoints', 'test_nj', 'test_keyspace/0',
                               'replica'])
    first_health = ep['entries'][0]['health']
    if not first_health:
        self.fail('Replication lag parameter not propagated to serving '
                  'graph: %s' % str(ep))
    self.assertEqual(first_health['replication_lag'], 'high',
                     'Replication lag parameter not propagated to serving '
                     'graph: %s' % str(ep))

    tablet.kill_tablets([master, slave])
def _check_lots_timeout(self, count, threshold, timeout, base=0): while True: value = self._check_lots(count, base=base) if value >= threshold: return value timeout = utils.wait_step('waiting for %d%% of the data' % threshold, timeout, sleep_time=1)
def test_no_mysql_healthcheck(self):
    """Start vttablet without a mysql port while mysqld is down.

    The tablets must come up NOT_SERVING and unhealthy. Once mysqld is
    restarted (and replication is restarted on the slave) they must turn
    healthy and publish their mysql port in the tablet record.
    """
    master, slave = tablet_62344, tablet_62044
    both = (master, slave)

    # we need replication to be enabled, so the slave tablet can be healthy.
    for each in both:
        each.create_db("vt_test_keyspace")
    position = mysql_flavor().master_position(master)
    change_master_cmds = mysql_flavor().change_master_commands(
        utils.hostname, master.mysql_port, position)
    slave.mquery(
        "",
        ["RESET MASTER", "RESET SLAVE"] + change_master_cmds + ["START SLAVE"])

    # now shutdown all mysqld
    utils.wait_procs([master.shutdown_mysql(), slave.shutdown_mysql()])

    # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
    master.init_tablet("master", "test_keyspace", "0")
    slave.init_tablet("spare", "test_keyspace", "0", include_mysql_port=False)
    for each in both:
        each.start_vttablet(wait_for_state=None,
                            target_tablet_type="replica",
                            full_mycnf_args=True,
                            include_mysql_port=False)
    for each in both:
        each.wait_for_vttablet_state("NOT_SERVING")
        self.check_healthz(each, False)

    # restart mysqld
    utils.wait_procs([master.start_mysql(), slave.start_mysql()])

    # the master should still be healthy
    utils.run_vtctl(["RunHealthCheck", master.tablet_alias, "replica"],
                    auto_log=True)
    self.check_healthz(master, True)

    # the slave won't be healthy at first, as replication is not running
    utils.run_vtctl(["RunHealthCheck", slave.tablet_alias, "replica"],
                    auto_log=True)
    self.check_healthz(slave, False)
    slave.wait_for_vttablet_state("NOT_SERVING")

    # restart replication
    slave.mquery("", ["START SLAVE"])

    # wait for the tablet to become healthy and fix its mysql port
    utils.run_vtctl(["RunHealthCheck", slave.tablet_alias, "replica"],
                    auto_log=True)
    slave.wait_for_vttablet_state("SERVING")
    self.check_healthz(slave, True)

    for each in both:
        # wait for mysql port to show up
        remaining = 10
        record = utils.run_vtctl_json(["GetTablet", each.tablet_alias])
        while "mysql" not in record["Portmap"]:
            remaining = utils.wait_step("mysql port in tablet record",
                                        remaining)
            record = utils.run_vtctl_json(["GetTablet", each.tablet_alias])
        self.assertEqual(record["Portmap"]["mysql"], each.mysql_port)

    # all done
    tablet.kill_tablets([master, slave])
def test_vtaction_dies_hard(self):
    """SIGKILL a running vtaction and check that the next action still runs."""
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as it is serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True)

    # start a 'vtctl Sleep' command, don't wait for it
    stdout, _ = utils.run_vtctl(
        ['-no-wait', 'Sleep', tablet_62344.tablet_alias, '60s'],
        trap_output=True)
    action_path = stdout.strip()

    # wait for the action to be 'Running', capture its pid
    remaining = 10
    while True:
        action_node = utils.run_vtctl_json(['ReadTabletAction', action_path])
        if action_node.get('State') == 'Running':
            break
        remaining = utils.wait_step('sleep action to run', remaining)
    pid = action_node['Pid']
    logging.info("Action is running with pid %u, good", pid)

    # let's kill it hard, wait until it's gone for good
    os.kill(pid, signal.SIGKILL)
    try:
        os.waitpid(pid, 0)
    except OSError:
        # this means the process doesn't exist any more, we're good
        pass

    # Then let's make sure the next action cleans up properly and can execute.
    # If that doesn't work, this will time out and the test will fail.
    utils.run_vtctl(['Ping', tablet_62344.tablet_alias])

    tablet_62344.kill_vttablet()
def test_schema_changes(self):
    """Apply schema changes and check that vtctld picks up a dropped SQL file.

    After the initial schema plus one ALTER, both shards must report the same
    schema. A .sql file is then written into the keyspace's 'input' directory
    under the vtctld schema change dir; once the file disappears each shard
    master is expected to hold 5 tables.
    """
    self._apply_initial_schema()

    alter_sql = self._alter_test_table_sql('vt_select_test03', 'msg')
    self._apply_schema(test_keyspace, alter_sql)

    # all shards should have the same schema
    schema_0 = self._get_schema(shard_0_master.tablet_alias)
    schema_1 = self._get_schema(shard_1_master.tablet_alias)
    self.assertEqual(schema_0, schema_1)

    # test schema changes
    keyspace_dir = os.path.join(utils.vtctld.schema_change_dir, test_keyspace)
    os.makedirs(keyspace_dir)
    input_path = os.path.join(keyspace_dir, 'input')
    os.makedirs(input_path)
    sql_path = os.path.join(input_path, 'create_test_table_x.sql')
    with open(sql_path, 'w') as sql_file:
        sql_file.write('create table test_table_x (id int)')

    # wait until this sql file being consumed by autoschema
    remaining = 10
    while os.path.isfile(sql_path):
        remaining = utils.wait_step(
            'waiting for vtctld to pick up schema changes',
            remaining, sleep_time=0.2)

    # check number of tables
    for shard_master in (shard_0_master, shard_1_master):
        self._check_tables(shard_master, 5)
def test_webinterface(self):
    """Exercise the worker HTTP endpoints /status, /Debugging/Ping and /reset.

    Waits for /status to answer, then twice in a row: posts a Ping command
    (an HTTP 307 HTTPError is the expected outcome), checks that /status
    contains the logged ping message, resets the job and checks that /status
    reports an idle worker.
    """
    worker_base_url = 'http://localhost:%u' % int(self.worker_port)

    # Wait for /status to become available.
    timeout = 10
    while True:
        try:
            urllib2.urlopen(worker_base_url + '/status').read()
        except Exception:
            # Best-effort probe: any failure just means "not up yet".
            # Unlike a bare 'except:', this lets KeyboardInterrupt and
            # SystemExit propagate so the test run can be aborted.
            pass
        else:
            break
        timeout = utils.wait_step('worker /status webpage must be available',
                                  timeout)

    # Run the command twice to make sure it's idempotent.
    for _ in range(2):
        # Run Ping command.
        try:
            urllib2.urlopen(worker_base_url + '/Debugging/Ping',
                            data=urllib.urlencode({'message': 'pong'})).read()
            # Reaching this line means no redirect error was raised; the
            # plain Exception is deliberately not caught by the handler below.
            raise Exception("Should have thrown an HTTPError for the redirect.")
        except urllib2.HTTPError as e:
            self.assertEqual(e.code, 307)

        # Verify that the command logged something and its available at
        # /status.
        status = urllib2.urlopen(worker_base_url + '/status').read()
        self.assertIn("Ping command was called with message: 'pong'", status,
                      "Command did not log output to /status")

        # Reset the job.
        urllib2.urlopen(worker_base_url + '/reset').read()
        status_after_reset = urllib2.urlopen(worker_base_url + '/status').read()
        self.assertIn("This worker is idle.", status_after_reset,
                      "/status does not indicate that the reset was successful")
def test_sigterm(self):
    """SIGTERM a running vtaction and check vtctl reports the interruption."""
    utils.run_vtctl(['CreateKeyspace', 'test_keyspace'])

    # create the database so vttablets start, as it is serving
    tablet_62344.create_db('vt_test_keyspace')
    tablet_62344.init_tablet('master', 'test_keyspace', '0', start=True)

    # start a 'vtctl Sleep' command, don't wait for it
    stdout, _ = utils.run_vtctl(
        ['-no-wait', 'Sleep', tablet_62344.tablet_alias, '60s'],
        trap_output=True)
    action_path = stdout.strip()

    # wait for the action to be 'Running', capture its pid
    remaining = 10
    while True:
        action_node = utils.run_vtctl_json(['ReadTabletAction', action_path])
        if action_node.get('State') == 'Running':
            break
        remaining = utils.wait_step('sleep action to run', remaining)
    pid = action_node['Pid']
    logging.info("Action is running with pid %u, good", pid)

    # let's kill the vtaction process with a regular SIGTERM
    os.kill(pid, signal.SIGTERM)

    # check the vtctl command got the right remote error back
    _, err = utils.run_vtctl(['WaitForAction', action_path],
                             trap_output=True, raise_on_error=False)
    if "vtaction interrupted by signal" not in err:
        self.fail("cannot find expected output in error: " + err)
    logging.debug("vtaction was interrupted correctly:\n" + err)

    tablet_62344.kill_vttablet()
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Wait until /debug/vars reports a TabletStateName matching expected.

    Args:
      expected: state to wait for; anchored as '^...$' and used as a regular
        expression, so it is not escaped.
      timeout: remaining budget, handed to utils.wait_step between polls.
      port: port to query instead of self.port, when truthy.

    Raises:
      utils.TestError: if /debug/vars is unavailable and self.proc has exited.
    """
    state_re = re.compile('^' + expected + '$')
    while True:
        debug_vars = utils.get_vars(port or self.port)
        last_seen_state = '?'
        if debug_vars is None:
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for state %s' % expected)
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        elif 'TabletStateName' not in debug_vars:
            logging.debug(
                ' vttablet %s not exporting TabletStateName, waiting...',
                self.tablet_alias)
        else:
            last_seen_state = debug_vars['TabletStateName']
            if state_re.match(last_seen_state):
                return
            logging.debug(' vttablet %s in state %s != %s',
                          self.tablet_alias, last_seen_state, expected)
        timeout = utils.wait_step(
            'waiting for %s state %s (last seen state: %s)'
            % (self.tablet_alias, expected, last_seen_state),
            timeout, sleep_time=0.1)
def check_throttler_service_maxrates(self, throttler_server, names, rate):
    """Checks the vtctl ThrottlerMaxRates and ThrottlerSetMaxRate commands.

    Args:
      throttler_server: value passed to vtctl via '--server'.
      names: throttler names expected in the ThrottlerMaxRates output.
      rate: max rate (formatted with %d) expected for every name initially.
    """
    list_cmd = ['ThrottlerMaxRates', '--server', throttler_server]
    active_line = '%d active throttler(s)' % len(names)

    # Avoid flakes by waiting for all throttlers. (Necessary because filtered
    # replication on vttablet will register the throttler asynchronously.)
    timeout_s = 10
    output, _ = utils.run_vtctl(list_cmd, auto_log=True, trap_output=True)
    while active_line not in output:
        timeout_s = utils.wait_step('all throttlers registered', timeout_s)
        output, _ = utils.run_vtctl(list_cmd, auto_log=True, trap_output=True)
    for throttler in names:
        self.assertIn('| %s | %d |' % (throttler, rate), output)
    self.assertIn(active_line, output)

    # Check that it's possible to change the max rate on the throttler.
    new_rate = 'unlimited'
    output, _ = utils.run_vtctl(
        ['ThrottlerSetMaxRate', '--server', throttler_server, new_rate],
        auto_log=True, trap_output=True)
    self.assertIn(active_line, output)

    output, _ = utils.run_vtctl(list_cmd, auto_log=True, trap_output=True)
    for throttler in names:
        self.assertIn('| %s | %s |' % (throttler, new_rate), output)
    self.assertIn(active_line, output)
def test_schema_changes(self):
    """Check shard schema parity and automatic pickup of a schema change file.

    Applies the initial schema and one ALTER, compares the schemas of the two
    shard masters, then drops a CREATE TABLE file into the vtctld schema
    change input directory and waits for it to be removed before expecting
    5 tables on each shard master.
    """
    self._apply_initial_schema()
    self._apply_schema(
        test_keyspace,
        self._alter_test_table_sql('vt_select_test03', 'msg'))

    # all shards should have the same schema
    schemas = [self._get_schema(shard_0_master.tablet_alias),
               self._get_schema(shard_1_master.tablet_alias)]
    self.assertEqual(schemas[0], schemas[1])

    # test schema changes
    change_root = utils.vtctld.schema_change_dir
    os.makedirs(os.path.join(change_root, test_keyspace))
    input_path = os.path.join(change_root, test_keyspace, 'input')
    os.makedirs(input_path)
    sql_path = os.path.join(input_path, 'create_test_table_x.sql')
    with open(sql_path, 'w') as out:
        out.write('create table test_table_x (id int)')

    # wait until this sql file being consumed by autoschema
    budget = 10
    while os.path.isfile(sql_path):
        budget = utils.wait_step(
            'waiting for vtctld to pick up schema changes',
            budget, sleep_time=0.2)

    # check number of tables
    self._check_tables(shard_0_master, 5)
    self._check_tables(shard_1_master, 5)
def wait_for_mysqlctl_socket(self, timeout=10.0):
    """Wait until mysql.sock and mysqlctl.sock both exist in self.tablet_dir.

    Args:
      timeout: remaining budget, handed to utils.wait_step between checks.
    """
    socket_paths = tuple(
        os.path.join(self.tablet_dir, name)
        for name in ('mysql.sock', 'mysqlctl.sock'))
    while not all(os.path.exists(path) for path in socket_paths):
        timeout = utils.wait_step(
            'waiting for mysql and mysqlctl socket files: %s %s' % socket_paths,
            timeout)
def wait_for_binlog_server_state(self, expected, timeout=30.0):
    """Wait for the tablet's binlog server to be in the provided state.

    The state is read from the 'UpdateStreamState' entry of /debug/vars.

    Args:
      expected: the state to wait for.
      timeout: how long to wait before error.

    Raises:
      utils.TestError: if /debug/vars is unavailable and self.proc has exited.
    """
    while True:
        debug_vars = utils.get_vars(self.port)
        if debug_vars is None:
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for binlog state %s'
                    % expected)
            logging.debug(' vttablet not answering at /debug/vars, waiting...')
        elif 'UpdateStreamState' not in debug_vars:
            logging.debug(
                ' vttablet not exporting BinlogServerState, waiting...')
        else:
            current = debug_vars['UpdateStreamState']
            if not current != expected:
                break
            logging.debug(" vttablet's binlog server in state %s != %s",
                          current, expected)
        timeout = utils.wait_step(
            'waiting for binlog server state %s' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog service is in state %s',
                  self.tablet_alias, expected)
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Poll /debug/vars until TabletStateName matches the expected pattern.

    Args:
      expected: state (regular expression, anchored at both ends) to wait for.
      timeout: remaining budget, handed to utils.wait_step between polls.
      port: optional port overriding self.port.

    Raises:
      utils.TestError: if /debug/vars is unavailable and self.proc has exited.
    """
    pattern = re.compile('^' + expected + '$')
    matched = False
    while not matched:
        exported = utils.get_vars(port or self.port)
        last_seen_state = '?'
        if exported is not None:
            if 'TabletStateName' in exported:
                state = exported['TabletStateName']
                last_seen_state = state
                if pattern.match(state):
                    matched = True
                    continue
                logging.debug(
                    ' vttablet %s in state %s != %s',
                    self.tablet_alias, state, expected)
            else:
                logging.debug(
                    ' vttablet %s not exporting TabletStateName, waiting...',
                    self.tablet_alias)
        else:
            # No answer at all: distinguish "still starting" from "crashed".
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for state %s' % expected)
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        timeout = utils.wait_step(
            'waiting for %s state %s (last seen state: %s)' % (
                self.tablet_alias, expected, last_seen_state),
            timeout, sleep_time=0.1)
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Wait until vttablet's /debug/vars reports TabletStateName == expected.

    With the zookeeper topo server, first runs 'zk wait -e' on self.zk_pid
    once per tablet (tracked through self.checked_zk_pid).

    Args:
      expected: exact state name to wait for.
      timeout: remaining budget, handed to utils.wait_step between polls.
      port: optional port overriding self.port.
    """
    # wait for zookeeper PID just to be sure we have it
    if environment.topo_server_implementation == 'zookeeper':
        if not self.checked_zk_pid:
            utils.run(environment.binary_args('zk') +
                      ['wait', '-e', self.zk_pid],
                      stdout=utils.devnull)
            self.checked_zk_pid = True

    while True:
        v = utils.get_vars(port or self.port)
        if v is None:  # identity test: None is a singleton (was '== None')
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        elif 'Voltron' not in v:
            logging.debug(
                ' vttablet %s not exporting Voltron, waiting...',
                self.tablet_alias)
        else:
            # NOTE(review): readiness is keyed on 'Voltron' but the state is
            # read from 'TabletStateName' -- confirm both are always exported
            # together, otherwise this lookup raises KeyError.
            s = v['TabletStateName']
            if s == expected:
                break
            logging.debug(
                ' vttablet %s in state %s != %s',
                self.tablet_alias, s, expected)
        timeout = utils.wait_step('waiting for state %s' % expected,
                                  timeout, sleep_time=0.1)
def test_stream_parity(self):
    """Tests parity of streams between master and replica for the same writes.

    Also tests transactions are retrieved properly.

    Waits until master and replica report the same replication position,
    runs a few transactions, then streams updates from both sides (each up
    to and including the first event carrying a position) and compares the
    events pairwise with their timestamps blanked out.
    """
    global master_start_position

    timeout = 30
    while True:
        master_start_position = _get_master_current_position()
        replica_start_position = _get_repl_current_position()
        if master_start_position == replica_start_position:
            break
        timeout = utils.wait_step(
            "%s == %s" % (master_start_position, replica_start_position),
            timeout)
    logging.debug("run_test_stream_parity starting @ %s",
                  master_start_position)
    self._exec_vt_txn(self._populate_vt_a(15))
    self._exec_vt_txn(self._populate_vt_b(14))
    self._exec_vt_txn(["delete from vt_a"])
    self._exec_vt_txn(["delete from vt_b"])

    # get master events
    master_conn = self._get_master_stream_conn()
    master_events = []
    try:
        for event in master_conn.stream_update(
                "test_keyspace", "0", topodata_pb2.MASTER,
                position=master_start_position):
            master_events.append(event)
            if event.event_token.position:
                break
    finally:
        # Close even when streaming raises, so the connection never leaks.
        master_conn.close()

    # get replica events
    replica_events = []
    replica_conn = self._get_replica_stream_conn()
    try:
        for event in replica_conn.stream_update(
                "test_keyspace", "0", topodata_pb2.REPLICA,
                position=replica_start_position):
            replica_events.append(event)
            if event.event_token.position:
                break
    finally:
        replica_conn.close()

    # and compare
    # NOTE(review): a length mismatch is only logged; zip() below then
    # compares the common prefix. Confirm whether this should fail the test.
    if len(master_events) != len(replica_events):
        logging.debug(
            "Test Failed - # of records mismatch, master %s replica %s",
            master_events, replica_events)
    for master_event, replica_event in zip(master_events, replica_events):
        # The timestamp is from when the event was written to the binlogs.
        # the master uses the timestamp of when it wrote it originally,
        # the slave of when it applied the logs. These can differ and make
        # this test flaky. So we just blank them out, easier. We really want
        # to compare the replication positions.
        master_event.event_token.timestamp = 123
        replica_event.event_token.timestamp = 123
        self.assertEqual(
            master_event, replica_event,
            "Test failed, data mismatch - master '%s' and replica '%s'"
            % (master_event, replica_event))
    logging.debug("Test Writes: PASS")
def wait_for_binlog_server_state(self, expected, timeout=30.0):
    """Wait for the tablet's binlog server to be in the provided state.

    The state is read from the 'UpdateStreamState' entry of /debug/vars.

    Args:
      expected: the state to wait for.
      timeout: remaining budget, handed to utils.wait_step between polls.
    """
    while True:
        v = utils.get_vars(self.port)
        if v is None:  # identity test: None is a singleton (was '== None')
            logging.debug(
                ' vttablet not answering at /debug/vars, waiting...')
        elif 'UpdateStreamState' not in v:
            logging.debug(
                ' vttablet not exporting BinlogServerState, waiting...')
        else:
            s = v['UpdateStreamState']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog server in state %s != %s", s, expected)
        timeout = utils.wait_step(
            'waiting for binlog server state %s' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog service is in state %s',
                  self.tablet_alias, expected)
def wait_for_binlog_player_count(self, expected, timeout=30.0):
    """Wait for a tablet to have the expected number of binlog players.

    The count is read from the 'BinlogPlayerMapSize' entry of /debug/vars.

    Args:
      expected: number of expected binlog players to wait for.
      timeout: remaining budget, handed to utils.wait_step between polls.
    """
    while True:
        v = utils.get_vars(self.port)
        if v is None:  # identity test: None is a singleton (was '== None')
            logging.debug(
                ' vttablet not answering at /debug/vars, waiting...')
        elif 'BinlogPlayerMapSize' not in v:
            logging.debug(
                ' vttablet not exporting BinlogPlayerMapSize, waiting...')
        else:
            s = v['BinlogPlayerMapSize']
            if s == expected:
                break
            logging.debug(
                " vttablet's binlog player map has count %u != %u",
                s, expected)
        timeout = utils.wait_step(
            'waiting for binlog player count %d' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog player has %d players',
                  self.tablet_alias, expected)
def wait_for_binlog_server_state(self, expected, timeout=30.0):
    """Wait for the tablet's binlog server to be in the provided state.

    Polls /debug/vars on self.port and compares its 'UpdateStreamState'
    entry with the expected value.

    Args:
      expected: the state to wait for.
      timeout: how long to wait before error.

    Raises:
      utils.TestError: if /debug/vars is unavailable and self.proc has exited.
    """
    while True:
        exported = utils.get_vars(self.port)
        if exported is not None:
            if 'UpdateStreamState' in exported:
                state = exported['UpdateStreamState']
                if not state != expected:
                    break
                logging.debug(
                    " vttablet's binlog server in state %s != %s",
                    state, expected)
            else:
                logging.debug(
                    ' vttablet not exporting BinlogServerState, waiting...')
        else:
            # No answer at all: distinguish "still starting" from "crashed".
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for binlog state %s' %
                    expected)
            logging.debug(
                ' vttablet not answering at /debug/vars, waiting...')
        timeout = utils.wait_step(
            'waiting for binlog server state %s' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog service is in state %s',
                  self.tablet_alias, expected)
def wait_for_binlog_player_count(self, expected, timeout=30.0):
    """Wait for a tablet to have binlog players.

    The count is read from the 'BinlogPlayerMapSize' entry of /debug/vars.

    Args:
      expected: number of expected binlog players to wait for.
      timeout: how long to wait.

    Raises:
      utils.TestError: if /debug/vars is unavailable and self.proc has exited.
    """
    while True:
        debug_vars = utils.get_vars(self.port)
        if debug_vars is None:
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for binlog count %s'
                    % expected)
            logging.debug(
                ' vttablet not answering at /debug/vars, waiting...')
        elif 'BinlogPlayerMapSize' not in debug_vars:
            logging.debug(
                ' vttablet not exporting BinlogPlayerMapSize, waiting...')
        else:
            player_count = debug_vars['BinlogPlayerMapSize']
            if not player_count != expected:
                break
            logging.debug(
                " vttablet's binlog player map has count %d != %d",
                player_count, expected)
        timeout = utils.wait_step(
            'waiting for binlog player count %d' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog player has %d players',
                  self.tablet_alias, expected)
def test_service_disabled(self):
    """Check the row-cache invalidator stops when the replica becomes spare.

    Restores the tablet type to 'replica' at the end.
    """
    # perform some inserts, then change state to stop the invalidator
    self.perform_insert(500)
    invalidations_before = self.replica_stats()['Totals']['Invalidations']
    vars_before = self.replica_vars()
    utils.run_vtctl(['ChangeSlaveType', replica_tablet.tablet_alias, 'spare'])

    # wait until it's stopped
    remaining = 30
    while True:
        invalidator_state = self.replica_vars()['RowcacheInvalidatorState']
        if invalidator_state == 'Stopped':
            break
        remaining = utils.wait_step(
            'RowcacheInvalidatorState, got %s expecting Stopped'
            % invalidator_state,
            remaining, sleep_time=0.1)

    # check all data is right
    invalidations_after = self.replica_stats()['Totals']['Invalidations']
    vars_after = self.replica_vars()
    logging.debug(
        'Tablet Replica->Spare\n\tBefore: Invalidations: %d InvalidatorStats '
        '%s\n\tAfter: Invalidations: %d InvalidatorStats %s',
        invalidations_before, vars_before['RowcacheInvalidatorPosition'],
        invalidations_after, vars_after['RowcacheInvalidatorPosition'])
    self.assertEqual(invalidations_after, 0,
                     'Row-cache invalid. should be disabled, no invalidations')
    self.assertEqual(vars_after['RowcacheInvalidatorState'], 'Stopped',
                     'Row-cache invalidator should be disabled')

    # and restore the type
    utils.run_vtctl(
        ['ChangeSlaveType', replica_tablet.tablet_alias, 'replica'])
def wait_for_vtocc_state(self, expected, timeout=60.0, port=None):
    """Wait until /debug/vars reports TabletStateName == expected.

    Args:
      expected: exact state name to wait for.
      timeout: remaining budget, handed to utils.wait_step between polls.
      port: optional port overriding self.port.
    """
    while True:
        v = utils.get_vars(port or self.port)
        last_seen_state = "?"
        if v is None:  # identity test: None is a singleton (was '== None')
            logging.debug(
                ' vttablet %s not answering at /debug/vars, waiting...',
                self.tablet_alias)
        elif 'TabletStateName' not in v:
            logging.debug(
                ' vttablet %s not exporting TabletStateName, waiting...',
                self.tablet_alias)
        else:
            s = v['TabletStateName']
            last_seen_state = s
            if s == expected:
                break
            logging.debug(
                ' vttablet %s in state %s != %s',
                self.tablet_alias, s, expected)
        timeout = utils.wait_step(
            'waiting for state %s (last seen state: %s)'
            % (expected, last_seen_state),
            timeout, sleep_time=0.1)
def wait_for_binlog_player_count(self, expected, timeout=30.0):
    """Wait for a tablet to have binlog players.

    Polls /debug/vars on self.port and compares its 'BinlogPlayerMapSize'
    entry with the expected count.

    Args:
      expected: number of expected binlog players to wait for.
      timeout: how long to wait.

    Raises:
      utils.TestError: if /debug/vars is unavailable and self.proc has exited.
    """
    while True:
        exported = utils.get_vars(self.port)
        if exported is not None:
            if 'BinlogPlayerMapSize' in exported:
                count = exported['BinlogPlayerMapSize']
                if not count != expected:
                    break
                logging.debug(
                    " vttablet's binlog player map has count %d != %d",
                    count, expected)
            else:
                logging.debug(
                    ' vttablet not exporting BinlogPlayerMapSize, waiting...')
        else:
            # No answer at all: distinguish "still starting" from "crashed".
            if self.proc.poll() is not None:
                raise utils.TestError(
                    'vttablet died while test waiting for binlog count %s' %
                    expected)
            logging.debug(' vttablet not answering at /debug/vars, waiting...')
        timeout = utils.wait_step(
            'waiting for binlog player count %d' % expected,
            timeout, sleep_time=0.5)
    logging.debug('tablet %s binlog player has %d players',
                  self.tablet_alias, expected)
def wait_for_cache_stats(self, stats, **kwargs):
    """Wait until self.cache.stats_diff(stats, **kwargs) returns a true value.

    Args:
      stats: forwarded unchanged to self.cache.stats_diff.
      **kwargs: forwarded to self.cache.stats_diff and shown in the wait label.
    """
    remaining = 10
    while not self.cache.stats_diff(stats, **kwargs):
        remaining = utils.wait_step(
            'cache stats update %s' % str(kwargs), remaining)
def test_stop_replication(self):
    """Check invalidations keep being counted across a replication stop/start.

    After restarting the replica tablet, 100 inserts must produce 100
    invalidations; 100 more inserts made while replication is stopped must
    bring the total to 200 once replication is restarted.
    """

    def wait_for_invalidations(target):
        # Poll the replica stats until the invalidation total equals target.
        remaining = 30
        while True:
            seen = self.replica_stats()['Totals']['Invalidations']
            if seen == target:
                return
            remaining = utils.wait_step(
                'invalidation count, got %d expecting %d' % (seen, target),
                remaining, sleep_time=0.1)

    # wait for replication to catch up.
    self._wait_for_replica()

    # restart the replica tablet so the stats are reset
    replica_tablet.kill_vttablet()
    replica_tablet.start_vttablet(memcache=True)

    # insert 100 values, should cause 100 invalidations
    self.perform_insert(100)
    self._wait_for_replica()

    # wait until the slave processed all data
    wait_for_invalidations(100)

    # stop replication insert more data, restart replication
    replica_tablet.mquery('vt_test_keyspace', 'stop slave')
    self.perform_insert(100)
    time.sleep(2)
    replica_tablet.mquery('vt_test_keyspace', 'start slave')
    self._wait_for_replica()

    # wait until the slave processed all data
    wait_for_invalidations(200)

    # check and display some stats
    replica_vars = self.replica_vars()
    logging.debug('invalidatorStats %s',
                  replica_vars['RowcacheInvalidatorPosition'])
    self.assertEqual(replica_vars['RowcacheInvalidatorState'], 'Running',
                     'Row-cache invalidator should be enabled')
def wait_for_vars(var, key, value):
    """Wait until vtgate's /debug/vars has v[var][key] == value.

    Args:
      var: top-level name expected in the exported vars.
      key: key expected inside v[var].
      value: value that v[var][key] must equal.
    """
    remaining = 20.0
    while True:
        exported = utils.get_vars(utils.vtgate.port)
        if exported and var in exported:
            section = exported[var]
            if key in section and section[key] == value:
                return
        remaining = utils.wait_step(
            'waiting for /debug/vars of %s/%s' % (var, key), remaining)
def wait_for_tablet_type_change(self, tablet_alias, expected_type):
    """Wait until 'GetTablet' reports the expected type for a tablet.

    Args:
      tablet_alias: alias passed to the vtctl GetTablet command.
      expected_type: value the record's 'Type' field must equal.
    """
    timeout = 10
    while True:
        ti = utils.run_vtctl_json(['GetTablet', tablet_alias])
        if ti['Type'] == expected_type:
            # Pass the argument to logging instead of pre-formatting, so the
            # string is only built when debug logging is enabled.
            logging.debug("Slave tablet went to %s, good", expected_type)
            break
        timeout = utils.wait_step('slave becomes ' + expected_type, timeout)
def test_no_mysql_healthcheck(self):
    """Start vttablet without a mysql port while mysqld is down.

    The tablets must come up NOT_SERVING; after mysqld is restarted they
    must reach SERVING and publish their mysql port in the tablet record.
    """
    master, slave = tablet_62344, tablet_62044
    both = (master, slave)

    # we need replication to be enabled, so the slave tablet can be healthy.
    for each in both:
        each.create_db('vt_test_keyspace')
    position = mysql_flavor().master_position(master)
    change_master_cmds = mysql_flavor().change_master_commands(
        utils.hostname, master.mysql_port, position)
    slave.mquery(
        '',
        ['RESET MASTER', 'RESET SLAVE'] + change_master_cmds + ['START SLAVE'])

    # now shutdown all mysqld
    utils.wait_procs([master.shutdown_mysql(), slave.shutdown_mysql()])

    # start the tablets, wait for them to be NOT_SERVING (mysqld not there)
    master.init_tablet('master', 'test_keyspace', '0')
    slave.init_tablet('spare', 'test_keyspace', '0', include_mysql_port=False)
    for each in both:
        each.start_vttablet(wait_for_state=None,
                            target_tablet_type='replica',
                            full_mycnf_args=True,
                            include_mysql_port=False)
    for each in both:
        each.wait_for_vttablet_state('NOT_SERVING')

    # restart mysqld
    utils.wait_procs([master.start_mysql(), slave.start_mysql()])

    # wait for the tablets to become healthy and fix their mysql port
    for each in both:
        each.wait_for_vttablet_state('SERVING')
    for each in both:
        # wait for mysql port to show up
        remaining = 10
        record = utils.run_vtctl_json(['GetTablet', each.tablet_alias])
        while 'mysql' not in record['Portmap']:
            remaining = utils.wait_step('mysql port in tablet record',
                                        remaining)
            record = utils.run_vtctl_json(['GetTablet', each.tablet_alias])
        self.assertEqual(record['Portmap']['mysql'], each.mysql_port)

    # all done
    tablet.kill_tablets([master, slave])
def wait_for_tablet_type_change(self, tablet_alias, expected_type):
    """Wait until 'GetTablet' reports the expected type for a tablet.

    Args:
      tablet_alias: alias passed to the vtctl GetTablet command.
      expected_type: type name; its upper-cased form is looked up in
        tablet.Tablet.tablet_type_value and compared with the 'type' field.
    """
    wanted = tablet.Tablet.tablet_type_value[expected_type.upper()]
    remaining = 10
    record = utils.run_vtctl_json(['GetTablet', tablet_alias])
    while record['type'] != wanted:
        remaining = utils.wait_step('slave becomes ' + expected_type,
                                    remaining)
        record = utils.run_vtctl_json(['GetTablet', tablet_alias])
    logging.debug('Slave tablet went to %s, good', expected_type)
def wait_for_mysqlctl_socket(self, timeout=30.0):
    """Wait until mysql.sock and mysqlctl.sock both exist in self.tablet_dir.

    Args:
      timeout: remaining budget, handed to utils.wait_step between checks.
    """
    tablet_dir = self.tablet_dir
    mysql_sock = os.path.join(tablet_dir, 'mysql.sock')
    mysqlctl_sock = os.path.join(tablet_dir, 'mysqlctl.sock')

    def sockets_present():
        # mysqlctl.sock is only checked once mysql.sock is there.
        if not os.path.exists(mysql_sock):
            return False
        return os.path.exists(mysqlctl_sock)

    while not sockets_present():
        timeout = utils.wait_step(
            'waiting for mysql and mysqlctl socket files: %s %s'
            % (mysql_sock, mysqlctl_sock),
            timeout)
def test_stream_parity(self):
    """Tests parity of streams between master and replica for the same writes.

    Also tests transactions are retrieved properly.

    Waits until master and replica report the same replication position,
    runs a few transactions, then streams updates from both sides (each up
    to and including the first POS event) and compares the events pairwise
    with their timestamps blanked out.
    """
    global master_start_position

    timeout = 30
    while True:
        master_start_position = _get_master_current_position()
        replica_start_position = _get_repl_current_position()
        if master_start_position == replica_start_position:
            break
        timeout = utils.wait_step(
            '%s == %s' % (master_start_position, replica_start_position),
            timeout
        )
    logging.debug('run_test_stream_parity starting @ %s',
                  master_start_position)
    self._exec_vt_txn(self._populate_vt_a(15))
    self._exec_vt_txn(self._populate_vt_b(14))
    self._exec_vt_txn(['delete from vt_a'])
    self._exec_vt_txn(['delete from vt_b'])

    # Track both connections so they are closed even when streaming or an
    # assertion raises; on success they are closed master first, as before.
    master_conn = None
    replica_conn = None
    try:
        master_conn = self._get_master_stream_conn()
        master_events = []
        for stream_event in master_conn.stream_update(master_start_position):
            master_events.append(stream_event)
            if stream_event.category == update_stream.StreamEvent.POS:
                break

        replica_events = []
        replica_conn = self._get_replica_stream_conn()
        for stream_event in replica_conn.stream_update(replica_start_position):
            replica_events.append(stream_event)
            if stream_event.category == update_stream.StreamEvent.POS:
                break

        # NOTE(review): a length mismatch is only logged; zip() below then
        # compares the common prefix. Confirm whether this should fail.
        if len(master_events) != len(replica_events):
            logging.debug(
                'Test Failed - # of records mismatch, master %s replica %s',
                master_events, replica_events)
        for master_val, replica_val in zip(master_events, replica_events):
            master_data = master_val.__dict__
            replica_data = replica_val.__dict__
            # the timestamp is from when the event was written to the binlogs.
            # the master uses the timestamp of when it wrote it originally,
            # the slave of when it applied the logs. These can differ and make
            # this test flaky. So we just blank them out, easier. We really
            # want to compare the replication positions.
            master_data['timestamp'] = 'XXX'
            replica_data['timestamp'] = 'XXX'
            self.assertEqual(
                master_data, replica_data,
                "Test failed, data mismatch - master '%s' and replica position "
                "'%s'" % (master_data, replica_data))
    finally:
        for conn in (master_conn, replica_conn):
            if conn is not None:
                conn.close()
    logging.debug('Test Writes: PASS')
def test_schema_changes(self):
    """Apply schema changes to the test keyspace and compare the shards.

    Creates four test tables, alters one of them, and finally drops a SQL
    file into vtctld's schema change directory and waits for the file to
    disappear. After each step the three shard masters are checked.
    """
    table_names = [
        'vt_select_test01',
        'vt_select_test02',
        'vt_select_test03',
        'vt_select_test04',
    ]
    shard_masters = (shard_0_master, shard_1_master, shard_2_master)

    def check_table_count(expected):
        for master in shard_masters:
            self._check_tables(master, expected)

    def assert_same_schema_everywhere(table_list):
        # Fetch all three schemas first, then compare against shard 0.
        schemas = [self._get_schema(master.tablet_alias, table_list)
                   for master in shard_masters]
        self.assertEqual(schemas[0], schemas[1])
        self.assertEqual(schemas[0], schemas[2])

    create_statements = [self._create_test_table_sql(name)
                         for name in table_names]
    schema_changes = ';'.join(create_statements)
    tables = ','.join(table_names)

    # apply schema changes to the test keyspace
    self._apply_schema(test_keyspace, schema_changes)

    # check number of tables
    check_table_count(4)

    # all shards should have the same schema
    assert_same_schema_everywhere(tables)

    self._apply_schema(
        test_keyspace,
        self._alter_test_table_sql('vt_select_test03', 'msg'))

    # all shards should still have the same schema
    assert_same_schema_everywhere(tables)

    # test schema changes picked up from the schema change directory
    keyspace_dir = os.path.join(utils.vtctld.schema_change_dir, test_keyspace)
    os.makedirs(keyspace_dir)
    input_path = os.path.join(
        utils.vtctld.schema_change_dir, test_keyspace, 'input')
    os.makedirs(input_path)
    sql_path = os.path.join(input_path, 'create_test_table_x.sql')
    with open(sql_path, 'w') as sql_file:
        sql_file.write('create table test_table_x (id int)')

    # wait until this sql file being consumed by autoschema
    remaining = 10
    while os.path.isfile(sql_path):
        remaining = utils.wait_step(
            'waiting for vtctld to pick up schema changes',
            remaining, sleep_time=0.2)

    # check number of tables
    check_table_count(5)
def wait_for_tablet_type_change(self, tablet_alias, expected_type):
    """Poll GetTablet until the tablet's 'type' equals the expected type.

    Args:
      tablet_alias: alias passed to the GetTablet vtctl command.
      expected_type: tablet type name; upper-cased and converted through
        topodata_pb2.TabletType.Value before comparing.
    """
    wanted = topodata_pb2.TabletType.Value(expected_type.upper())
    remaining = 10
    while utils.run_vtctl_json(['GetTablet', tablet_alias])['type'] != wanted:
        remaining = utils.wait_step('slave becomes ' + expected_type,
                                    remaining)
    logging.debug('Slave tablet went to %s, good', expected_type)
def _wait_for_schema_propagation(self, source=shard_0_master,
                                 targets=all_tablets):
    """Wait until the current schema has propagated to all tablets.

    The schema of `source` is read once; each tablet in `targets` is then
    polled in turn until it reports the same schema. A single 60 second
    budget is shared across all targets.
    """
    reference = self._get_schema(source)
    remaining = 60  # seconds
    description = 'propagation of schema: %s' % reference
    for tablet in targets:
        while True:
            if reference == self._get_schema(tablet):
                break
            remaining = utils.wait_step(description, remaining)
def test_stop_replication(self):
    """Check the invalidation count across a replication stop/start.

    Inserts 100 rows and waits for 100 invalidations on the replica, then
    stops replication, inserts 100 more, restarts replication and waits for
    200. Finally asserts the invalidator state is 'Running'.
    """

    def wait_for_invalidations(expected):
        # Poll the replica stats until the invalidation total matches.
        remaining = 30
        while True:
            seen = self.replica_stats()['Totals']['Invalidations']
            if seen == expected:
                return
            remaining = utils.wait_step(
                'invalidation count, got %d expecting %d' % (seen, expected),
                remaining, sleep_time=0.1)

    # wait for replication to catch up.
    self._wait_for_replica()

    # restart the replica tablet so the stats are reset
    replica_tablet.kill_vttablet()
    replica_tablet.start_vttablet(memcache=True)

    # insert 100 values, should cause 100 invalidations
    self.perform_insert(100)
    self._wait_for_replica()

    # wait until the slave processed all data
    wait_for_invalidations(100)

    # stop replication insert more data, restart replication
    replica_tablet.mquery('vt_test_keyspace', 'stop slave')
    self.perform_insert(100)
    time.sleep(2)
    replica_tablet.mquery('vt_test_keyspace', 'start slave')
    self._wait_for_replica()

    # wait until the slave processed all data
    wait_for_invalidations(200)

    # check and display some stats
    invalidator_stats = self.replica_vars()
    logging.debug('invalidator_stats %s',
                  invalidator_stats['RowcacheInvalidatorPosition'])
    self.assertEqual(invalidator_stats['RowcacheInvalidatorState'],
                     'Running',
                     'Row-cache invalidator should be enabled')
def _check_vt_insert_test(self, tablet_obj, index): # wait until it gets the data timeout = 10.0 while True: result = tablet_obj.mquery("vt_test_keyspace", "select msg from vt_insert_test where id=%d" % index) if len(result) == 1: break timeout = utils.wait_step( "waiting for replication to catch up on %s" % tablet_obj.tablet_alias, timeout, sleep_time=0.1 )
def test_topocustomrule(self):
    """Check that a custom rule copied into the topology rejects queries.

    Starts a master vttablet with an empty rule file copied to the topology,
    verifies a select works, then copies a rule that disallows selects on
    vt_select_test and polls until the query fails with the rule's message.
    """
    # Empty rule file.
    topocustomrule_file = environment.tmproot+'/rules.json'
    with open(topocustomrule_file, 'w') as fd:
        fd.write('[]\n')

    # Start up a master mysql and vttablet
    utils.run_vtctl(['CreateKeyspace', '-force', 'test_keyspace'])
    utils.run_vtctl(['createshard', '-force', 'test_keyspace/0'])
    tablet_62344.init_tablet('master', 'test_keyspace', '0', parent=False)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'])
    utils.validate_topology()

    # Copy config file into topo.
    topocustomrule_path = '/keyspaces/test_keyspace/configs/CustomRules'
    utils.run_vtctl(['TopoCp', '-to_topo', topocustomrule_file,
                     topocustomrule_path])

    # Put some data in, start master.
    tablet_62344.populate('vt_test_keyspace', self._create_vt_select_test,
                          self._populate_vt_select_test)
    tablet_62344.start_vttablet(topocustomrule_path=topocustomrule_path)

    # make sure the query service is working
    qr = tablet_62344.execute('select id, msg from vt_select_test')
    self.assertEqual(len(qr['rows']), 4,
                     'expected 4 rows in vt_select_test: %s' % str(qr))

    # Now update the topocustomrule file.
    with open(topocustomrule_file, 'w') as fd:
        fd.write('''
        [{
          "Name": "rule1",
          "Description": "disallow select on table vt_select_test",
          "TableNames" : ["vt_select_test"],
          "Query" : "(select)|(SELECT)"
        }]''')
    utils.run_vtctl(['TopoCp', '-to_topo', topocustomrule_file,
                     topocustomrule_path])

    # And wait until the query fails with the right error.
    timeout = 10.0
    while True:
        try:
            tablet_62344.execute('select id, msg from vt_select_test')
        except Exception as e:
            # print(e) with one argument prints the same text on Python 2
            # and Python 3.
            print(e)
            expected = ('disallowed due to rule: disallow select'
                        ' on table vt_select_test')
            self.assertIn(expected, str(e))
            break
        else:
            # Kept outside the try body so that anything wait_step raises
            # is not caught above and reported as a rule-message mismatch.
            timeout = utils.wait_step('query rule in place', timeout)

    # Cleanup.
    tablet_62344.kill_vttablet()
def _check_lots_timeout(self, replica_tablet, count, threshold, timeout, base=0): while True: value = self._check_lots(replica_tablet, count, base=base) if value >= threshold: return value timeout = utils.wait_step('enough data went through', timeout)
def _wait_for_value(self, expected_result): timeout = 10 while True: result = self._exec_replica_query( 'select * from vt_insert_test where id = 1000000') if result == expected_result: return timeout = utils.wait_step( 'replica rowcache updated, got %s expected %s' % (str(result), str(expected_result)), timeout, sleep_time=0.1)
def wait_for_mysqlctl_socket(self, timeout=60.0):
    """Wait until mysql.sock and mysqlctl.sock exist in self.tablet_dir.

    Args:
      timeout: budget handed to utils.wait_step while any file is missing.
    """
    socket_paths = [os.path.join(self.tablet_dir, name)
                    for name in ('mysql.sock', 'mysqlctl.sock')]
    while True:
        missing = [path for path in socket_paths
                   if not os.path.exists(path)]
        if not missing:
            return
        timeout = utils.wait_step(
            'waiting for socket files: %s' % str(missing),
            timeout, sleep_time=2.0)
def test_stream_parity(self):
    """Check that master and replica streams match for the same writes.

    Waits until master and replica report the same position, executes a
    fixed set of write transactions, then reads each stream (the start
    event plus at most 21 further events, stopping at the first 'POS'
    event) and compares the two event lists pairwise.
    """

    def read_until_pos(conn, start_position):
        # Dial, read the first event, then read up to 21 more events,
        # stopping after the first one whose Category is 'POS'.
        conn.dial()
        events = [conn.stream_start(start_position)]
        for _ in range(21):
            data = conn.stream_next()
            events.append(data)
            if data['Category'] == 'POS':
                break
        return events

    timeout = 30  # seconds
    while True:
        master_start_position = _get_master_current_position()
        replica_start_position = _get_repl_current_position()
        if master_start_position == replica_start_position:
            break
        timeout = utils.wait_step(
            "%s == %s" % (master_start_position, replica_start_position),
            timeout
        )
    logging.debug('run_test_stream_parity starting @ %s',
                  master_start_position)

    self._exec_vt_txn(self._populate_vt_a(15))
    self._exec_vt_txn(self._populate_vt_b(14))
    self._exec_vt_txn(['delete from vt_a'])
    self._exec_vt_txn(['delete from vt_b'])

    master_events = read_until_pos(self._get_master_stream_conn(),
                                   master_start_position)
    replica_events = read_until_pos(self._get_replica_stream_conn(),
                                    replica_start_position)

    # A count mismatch must fail the test: zip() below stops at the shorter
    # list, so extra events on one side would otherwise be ignored silently.
    self.assertEqual(
        len(master_events), len(replica_events),
        'Test Failed - # of records mismatch, master %s replica %s' %
        (master_events, replica_events))

    for master_data, replica_data in zip(master_events, replica_events):
        self.assertEqual(
            master_data, replica_data,
            "Test failed, data mismatch - master '%s' and replica "
            "position '%s'" % (master_data, replica_data))
    logging.debug('Test Writes: PASS')
def _check_vt_insert_test(self, tablet_obj, index): # wait until it gets the data timeout = 10.0 while True: result = tablet_obj.mquery( 'vt_test_keyspace', 'select msg from vt_insert_test where id=%d' % index) if len(result) == 1: break timeout = utils.wait_step('waiting for replication to catch up on %s' % tablet_obj.tablet_alias, timeout, sleep_time=0.1)
def wait_for_mysqlctl_socket(self, timeout=60.0):
    """Block until both socket files are present under self.tablet_dir.

    The files checked are 'mysql.sock' and 'mysqlctl.sock'.

    Args:
      timeout: budget handed to utils.wait_step while a file is missing.
    """
    expected = (
        os.path.join(self.tablet_dir, 'mysql.sock'),
        os.path.join(self.tablet_dir, 'mysqlctl.sock'),
    )

    def missing_sockets():
        return [path for path in expected if not os.path.exists(path)]

    wait_for = missing_sockets()
    while wait_for:
        timeout = utils.wait_step(
            'waiting for socket files: %s' % str(wait_for),
            timeout, sleep_time=2.0)
        wait_for = missing_sockets()
def test_stream_parity(self):
    """test_stream_parity checks the parity of streams received
    from master and replica for the same writes. Also tests
    transactions are retrieved properly.

    Each stream is read from its start position up to and including the
    first POS event, and the two event lists are compared pairwise.
    """
    timeout = 30
    while True:
        master_start_position = _get_master_current_position()
        replica_start_position = _get_repl_current_position()
        if master_start_position == replica_start_position:
            break
        timeout = utils.wait_step(
            '%s == %s' % (master_start_position, replica_start_position),
            timeout
        )
    logging.debug('run_test_stream_parity starting @ %s',
                  master_start_position)

    self._exec_vt_txn(self._populate_vt_a(15))
    self._exec_vt_txn(self._populate_vt_b(14))
    self._exec_vt_txn(['delete from vt_a'])
    self._exec_vt_txn(['delete from vt_b'])

    master_conn = self._get_master_stream_conn()
    replica_conn = None
    try:
        master_events = []
        for stream_event in master_conn.stream_update(master_start_position):
            master_events.append(stream_event)
            if stream_event.category == update_stream.StreamEvent.POS:
                break

        replica_events = []
        replica_conn = self._get_replica_stream_conn()
        for stream_event in replica_conn.stream_update(
                replica_start_position):
            replica_events.append(stream_event)
            if stream_event.category == update_stream.StreamEvent.POS:
                break

        # A count mismatch must fail the test: zip() below stops at the
        # shorter list, so extra events on one side would otherwise be
        # ignored silently.
        self.assertEqual(
            len(master_events), len(replica_events),
            'Test Failed - # of records mismatch, master %s replica %s' %
            (master_events, replica_events))

        for master_val, replica_val in zip(master_events, replica_events):
            master_data = master_val.__dict__
            replica_data = replica_val.__dict__
            self.assertEqual(
                master_data, replica_data,
                "Test failed, data mismatch - master '%s' and replica "
                "position '%s'" % (master_data, replica_data))
    finally:
        # Close the stream connections even when an assertion above fails.
        master_conn.close()
        if replica_conn is not None:
            replica_conn.close()
    logging.debug('Test Writes: PASS')
def test_outofband_statements(self):
    """Check the 'Invalidation' error counter for out-of-band statements.

    Update, delete and insert statements issued through
    utils.mysql_write_query must leave the replica's
    InternalErrors['Invalidation'] counter unchanged; a truncate issued
    through utils.mysql_query must raise it by exactly one.
    """

    def invalidation_errors():
        return self.replica_vars()['InternalErrors'].get('Invalidation', 0)

    start = invalidation_errors()

    # Test update statement
    self._exec_vt_txn(
        "insert into vt_insert_test (id, msg) values (1000000, 'start')")
    self._wait_for_replica()
    self._wait_for_value([[1000000, 'start']])
    utils.mysql_write_query(
        master_tablet.tablet_uid,
        'vt_test_keyspace',
        "update vt_insert_test set msg = 'foo' where id = 1000000")
    self._wait_for_replica()
    self._wait_for_value([[1000000, 'foo']])
    end1 = invalidation_errors()
    self.assertEqual(start, end1)

    # Test delete statement
    utils.mysql_write_query(
        master_tablet.tablet_uid,
        'vt_test_keyspace',
        'delete from vt_insert_test where id = 1000000')
    self._wait_for_replica()
    self._wait_for_value([])
    end2 = invalidation_errors()
    self.assertEqual(end1, end2)

    # Test insert statement
    utils.mysql_write_query(
        master_tablet.tablet_uid,
        'vt_test_keyspace',
        "insert into vt_insert_test (id, msg) values(1000000, 'bar')")
    self._wait_for_replica()
    self._wait_for_value([[1000000, 'bar']])
    end3 = invalidation_errors()
    self.assertEqual(end2, end3)

    # Test unrecognized statement
    utils.mysql_query(
        master_tablet.tablet_uid,
        'vt_test_keyspace',
        'truncate table vt_insert_test')
    self._wait_for_replica()
    expected_errors = end3+1
    remaining = 10
    while True:
        end4 = invalidation_errors()
        if end4 == expected_errors:
            break
        remaining = utils.wait_step(
            'invalidation errors, got %d expecting %d' %
            (end4, expected_errors),
            remaining, sleep_time=0.1)
    self.assertEqual(end4, end3+1)
def test_outofband_statements(self):
    """Verify invalidation error counts for statements sent outside vttablet.

    The replica's InternalErrors['Invalidation'] counter must stay the same
    after update, delete and insert statements sent with
    utils.mysql_write_query, and must grow by one after a truncate sent
    with utils.mysql_query. Rows are compared with the id as a string.
    """
    read_errors = lambda: (
        self.replica_vars()['InternalErrors'].get('Invalidation', 0))

    def replicate_and_expect(rows):
        # Wait for the replica to catch up, then for the expected rows.
        self._wait_for_replica()
        self._wait_for_value(rows)

    start = read_errors()

    # Test update statement
    self._exec_vt_txn(
        "insert into vt_insert_test (id, msg) values (1000000, 'start')")
    replicate_and_expect([['1000000', 'start']])
    utils.mysql_write_query(
        master_tablet.tablet_uid, 'vt_test_keyspace',
        "update vt_insert_test set msg = 'foo' where id = 1000000")
    replicate_and_expect([['1000000', 'foo']])
    end1 = read_errors()
    self.assertEqual(start, end1)

    # Test delete statement
    utils.mysql_write_query(
        master_tablet.tablet_uid, 'vt_test_keyspace',
        'delete from vt_insert_test where id = 1000000')
    replicate_and_expect([])
    end2 = read_errors()
    self.assertEqual(end1, end2)

    # Test insert statement
    utils.mysql_write_query(
        master_tablet.tablet_uid, 'vt_test_keyspace',
        "insert into vt_insert_test (id, msg) values(1000000, 'bar')")
    replicate_and_expect([['1000000', 'bar']])
    end3 = read_errors()
    self.assertEqual(end2, end3)

    # Test unrecognized statement
    utils.mysql_query(
        master_tablet.tablet_uid, 'vt_test_keyspace',
        'truncate table vt_insert_test')
    self._wait_for_replica()
    timeout = 10
    end4 = read_errors()
    while end4 != end3+1:
        timeout = utils.wait_step(
            'invalidation errors, got %d expecting %d' % (end4, end3+1),
            timeout, sleep_time=0.1)
        end4 = read_errors()
    self.assertEqual(end4, end3+1)
def _check_data(self, t, count, msg): """Check that the specified tablet has the expected number of rows.""" timeout = 10 while True: try: result = t.mquery('vt_test_keyspace', 'select count(*) from vt_insert_test') if result[0][0] == count: break except MySQLdb.DatabaseError: # ignore exceptions, we'll just timeout (the tablet creation # can take some time to replicate, and we get a 'table vt_insert_test # does not exist exception in some rare cases) logging.exception('exception waiting for data to replicate') timeout = utils.wait_step(msg, timeout)
def wait_for_vttablet_state(self, expected, timeout=60.0, port=None):
    """Poll the tablet's exported vars until TabletStateName is `expected`.

    Args:
      expected: state name compared against the 'TabletStateName' var.
      timeout: budget handed to utils.wait_step between attempts.
      port: port passed to utils.get_vars; falls back to self.port when
        falsy.
    """
    while True:
        v = utils.get_vars(port or self.port)
        # Identity test: None is a singleton, and `== None` would defer to
        # whatever __eq__ the returned object defines.
        if v is None:
            logging.debug(
                " vttablet %s not answering at /debug/vars, waiting...",
                self.tablet_alias)
        elif 'Voltron' not in v:
            logging.debug(
                " vttablet %s not exporting Voltron, waiting...",
                self.tablet_alias)
        else:
            s = v["TabletStateName"]
            if s == expected:
                break
            logging.debug(" vttablet %s in state %s != %s",
                          self.tablet_alias, s, expected)
        timeout = utils.wait_step('waiting for state %s' % expected,
                                  timeout, sleep_time=0.1)
def test_webinterface(self):
    """Exercise the worker's web interface: /status, Ping and /reset.

    Waits for /status to respond, then twice in a row posts a Ping command
    (expecting an HTTP 307), waits for WorkerState to be 'done', checks the
    Ping message shows up on /status, resets the worker and checks /status
    reports it as idle.
    """
    worker_base_url = 'http://localhost:%d' % int(self.worker_port)
    status_url = worker_base_url + '/status'

    # Wait for /status to become available.
    remaining = 10
    while True:
        try:
            urllib2.urlopen(status_url).read()
            break
        except urllib2.URLError:
            pass
        remaining = utils.wait_step(
            'worker /status webpage must be available', remaining)

    # Run the command twice to make sure it's idempotent.
    for _ in range(2):
        # Run Ping command.
        try:
            urllib2.urlopen(
                worker_base_url + '/Debugging/Ping',
                data=urllib.urlencode({'message': 'pong'})).read()
        except urllib2.HTTPError as e:
            self.assertEqual(e.code, 307)
        else:
            raise Exception(
                'Should have thrown an HTTPError for the redirect.')

        # Wait for the Ping command to finish.
        utils.poll_for_vars(
            'vtworker', self.worker_port,
            'WorkerState == done',
            condition_fn=lambda v: v.get('WorkerState') == 'done')

        # Verify that the command logged something and its available at
        # /status.
        status = urllib2.urlopen(status_url).read()
        self.assertIn(
            "Ping command was called with message: 'pong'", status,
            'Command did not log output to /status: %s' % status)

        # Reset the job.
        urllib2.urlopen(worker_base_url + '/reset').read()
        status_after_reset = urllib2.urlopen(status_url).read()
        self.assertIn(
            'This worker is idle.', status_after_reset,
            '/status does not indicate that the reset was successful')
def reset(self):
    """Call reset when you want to start using the tailer.

    Calls self.flush() when it is set (otherwise sleeps for self.sleep),
    closes any file already open, waits for self.filepath to exist, then
    opens it and records the end-of-file offset in self.pos.
    """
    if not self.flush:
        time.sleep(self.sleep)
    else:
        self.flush()

    # Re-open the file if open.
    previous = self.f
    if previous:
        previous.close()
        self.f = None

    # Wait for file to exist.
    remaining = self.timeout
    while not os.path.exists(self.filepath):
        remaining = utils.wait_step('file exists: ' + self.filepath,
                                    remaining)

    handle = open(self.filepath)
    self.f = handle
    handle.seek(0, os.SEEK_END)
    self.pos = handle.tell()
def test_restart(self):
    """Start vttablet twice and wait for the first process to exit.

    Both starts use the same certificate and key and wait for the SERVING
    state; the first process is then polled until it has a return code.
    """
    shard_0_master.create_db('vt_test_keyspace')

    def start_serving_vttablet():
        return shard_0_master.start_vttablet(
            cert=cert_dir + "/vt-server-cert.pem",
            key=cert_dir + "/vt-server-key.pem",
            wait_for_state='SERVING')

    first_proc = start_serving_vttablet()
    second_proc = start_serving_vttablet()

    remaining = 10.0
    first_proc.poll()
    while first_proc.returncode is None:
        remaining = utils.wait_step(
            "waiting for new vttablet to kill its predecessor", remaining)
        first_proc.poll()

    shard_0_master.kill_vttablet()
    logging.debug("Done here")