def test_delete_smrlog_after_redis_restart(self):
    util.print_frame()

    server = self.cluster['servers'][0]
    redis = telnetlib.Telnet(server['ip'], server['redis_port'])

    val = 'x' * 1048576
    cmd = '*3\r\n$3\r\nset\r\n$4\r\ntest\r\n$1048576\r\n%s\r\n' % val

    # create smr log file
    for i in xrange(640):
        redis.write(cmd)
        ret = redis.read_until('\r\n', 3)
        self.assertEquals(ret, '+OK\r\n')

    # wait until synced
    if config.opt_use_memlog:
        time.sleep(3)

    loglist = [f for f in os.listdir('%s/log0' % util.smr_dir(0)) if '.log' in f]
    util.log('before log delete')
    util.log(loglist)
    self.assertTrue(len(loglist) > 10)

    self.bgsave(redis)

    testbase.request_to_shutdown_redis(server)
    testbase.request_to_shutdown_smr(server)

    testbase.request_to_start_smr(server, log_delete_delay=1)
    testbase.request_to_start_redis(server)
    time.sleep(30)

    loglist = [f for f in os.listdir('%s/log0' % util.smr_dir(0)) if '.log' in f]
    util.log('after log delete')
    util.log(loglist)

    # wait until synced
    if config.opt_use_memlog:
        time.sleep(3)

    self.assertTrue(len(loglist) < 5)
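# Note (illustrative sketch, not part of the original suite): the cmd literal in the
# test above is a hand-built RESP request ("*<argc>", then "$<byte length>\r\n<payload>\r\n"
# per argument). A small hypothetical helper like the one below could build such raw
# commands for arbitrary arguments; the test itself keeps the literal string.
def build_resp_command(*args):
    # RESP array header: number of arguments
    parts = ['*%d\r\n' % len(args)]
    for arg in args:
        arg = str(arg)
        # each argument is encoded as a bulk string
        parts.append('$%d\r\n%s\r\n' % (len(arg), arg))
    return ''.join(parts)

# Example: build_resp_command('set', 'test', 'x' * 1048576) produces the same bytes
# as the literal cmd used in test_delete_smrlog_after_redis_restart.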
def initialize_starting_up_smr_before_redis(cluster, verbose=2, conf=None):
    if conf is None:
        conf = {'smr_log_delete_delay': 86400, 'cm_context': ''}
    if 'smr_log_delete_delay' not in conf:
        conf['smr_log_delete_delay'] = 86400
    if 'cm_context' not in conf:
        conf['cm_context'] = ''

    if testbase.cleanup_zookeeper_root() != 0:
        util.log('failed to cleanup_zookeeper_root')
        return -1

    if testbase.cleanup_processes() != 0:
        util.log('failed to cleanup_test_environment')
        return -1

    for server in cluster['servers']:
        if testbase.cleanup_pgs_log_and_ckpt(cluster['cluster_name'], server) != 0:
            util.log('failed to cleanup_pgs_data')
            return -1

    for server in cluster['servers']:
        if testbase.request_to_start_cm(server['id'], server['cm_port'], conf['cm_context']) != 0:
            util.log('failed to request_to_start_cm')
            return -1

    if testbase.initialize_cluster(cluster) != 0:
        util.log('failed to setup_znodes')
        return -1

    for server in cluster['servers']:
        if testbase.request_to_start_smr(server, verbose=verbose,
                                         log_delete_delay=conf['smr_log_delete_delay']) != 0:
            return -1

    for server in cluster['servers']:
        if testbase.request_to_start_redis(server, check=False) != 0:
            return -1

    for server in cluster['servers']:
        if testbase.wait_until_finished_to_set_up_role(server) != 0:
            return -1

    for server in cluster['servers']:
        if testbase.request_to_start_gateway(cluster['cluster_name'], server, cluster['servers'][0]) != 0:
            util.log('failed to request_to_start_gateway')
            return -1

    return 0
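# Usage sketch (assumed caller, not part of this module): a test's setUp can pass
# only the conf keys it cares about; missing keys fall back to the defaults filled
# in above (smr_log_delete_delay=86400, cm_context='').
#
#     conf = {'smr_log_delete_delay': 60}
#     if initialize_starting_up_smr_before_redis(self.cluster, conf=conf) != 0:
#         self.fail('failed to initialize cluster for the test')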
def failover(self, server):
    # shutdown
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEquals(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected))

    # recovery
    ret = testbase.request_to_start_smr(server)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server, 10)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))
def test_4_PGS_mgen_is_less_than_PG_mgen(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    # shutdown
    server_to_join = util.get_server_by_role(self.cluster['servers'], 'master')
    ret = testbase.request_to_shutdown_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server_to_join)
    self.assertEquals(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server_to_join, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected))

    # set value
    key_base = 'mw'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

    # master failover 1 (master generation + 1)
    util.log('master failover 1')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # check quorum (copy:3, quorum:1, available:2)
    ok = False
    for i in xrange(10):
        ok = util.check_quorum(self.cluster['cluster_name'],
                               self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ok:
            break
        else:
            time.sleep(1)
    self.assertTrue(ok, 'Check quorum fail.')

    # master failover 2 (master generation + 1)
    util.log('master failover 2')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # recovery
    util.log('master recovery start.')
    ret = testbase.request_to_start_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server_to_join, 10)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']))
    util.log('master recovery end successfully.')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))

    time.sleep(5)

    # set value
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

    server = util.get_server_by_role(self.cluster['servers'], 'master')
    redis = redis_mgmt.Redis(server_to_join['id'])
    ret = redis.connect(server_to_join['ip'], server_to_join['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    # check value
    for i in range(0, 20000):
        cmd = 'get %s%d\r\n' % (key_base, i)
        redis.write(cmd)
        redis.read_until('\r\n')
        response = redis.read_until('\r\n')
        self.assertEqual(response, '%d\r\n' % (i),
                         'inconsistent %s, %d' % (response[:-2], i))

    gw.disconnect()
    return 0
def consistent_after_failover(self):
    max = 10000
    wait_count = 15
    key = 'caf'

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # set value
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    gw.connect(ip, port)
    for i in range(0, max):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    time.sleep(5)

    # shutdown
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)

        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
    time.sleep(5)

    # check state F
    for server in servers:
        state = self.get_expected_smr_state(server, 'F')
        self.assertEquals('F', state, 'server%d - state:%s' % (server['id'], state))

    # recovery
    for server in servers:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr, server:%d' % server['id'])
        ret = testbase.request_to_start_redis(server, False)
        self.assertEqual(ret, 0, 'failed to start redis, server:%d' % server['id'])

        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
    time.sleep(5)

    # wait for master election
    for i in xrange(10):
        ret = util.check_cluster(self.cluster['cluster_name'],
                                 self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ret:
            break
        time.sleep(1)

    # check state
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals(ret, 0, 'failed to role change. server:%d' % (server['id']))

        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEquals('N', state,
                          'server%d - state:%s, role:%s' % (server['id'], state, role))

    the_number_of_master = 0
    the_number_of_slave = 0
    for server in servers:
        role = util.get_role_of_server(server)
        if role == c.ROLE_MASTER:
            the_number_of_master = the_number_of_master + 1
        elif role == c.ROLE_SLAVE:
            the_number_of_slave = the_number_of_slave + 1
    self.assertTrue(1 == the_number_of_master and 2 == the_number_of_slave,
                    'failed to set roles, the number of master:%d, the number of slave:%d'
                    % (the_number_of_master, the_number_of_slave))

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # connect to a master`s redis and set data
    redis = redis_mgmt.Redis(master['id'])
    ret = redis.connect(master['ip'], master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis, server:%d' % master['id'])
    for i in range(max, max * 2):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n', 'failed to get response, server:%d' % master['id'])
    redis.disconnect()

    # check slaves`s data
    slaves = [slave1, slave2]
    for slave in slaves:
        slave_redis = redis_mgmt.Redis(slave['id'])
        ret = slave_redis.connect(slave['ip'], slave['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis, server:%d' % slave['id'])

        for i in range(0, max * 2):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write(cmd)
            trash = slave_redis.read_until('\r\n')
            res = slave_redis.read_until('\r\n')
            self.assertEquals(res, '%d\r\n' % i,
                              'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res))
        slave_redis.disconnect()
def test_restart_recovery_with_remote_checkpoint_and_remote_log(self):
    util.print_frame()

    key_base = 'key'
    target = util.get_server_by_role(self.cluster['servers'], 'slave')
    master = util.get_server_by_role(self.cluster['servers'], 'master')

    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(master['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway')

    # set initial data in order to make an elapsed time for bgsave longer
    self.put_some_data()

    # generate some data
    for i in range(0, 100):
        key = '%s%d' % (key_base, i)
        cmd = 'set %s %d\r\n' % (key, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    gw.disconnect()

    # delete a local checkpoint
    util.log('delete pgs%d`s check point.' % target['id'])
    util.del_dumprdb(target['id'])

    # generate a remote check point
    bgsave_ret = util.bgsave(master)
    self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown
    util.log('shutdown target')
    ret = testbase.request_to_shutdown_smr(target)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    time.sleep(10)

    # generate some data
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway')
    for i in range(100, 200):
        key = '%s%d' % (key_base, i)
        cmd = 'set %s %d\r\n' % (key, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    gw.disconnect()

    # recovery
    util.log('recovery target')
    ret = testbase.request_to_start_smr(target)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(target)
    self.assertEqual(ret, 0, 'failed to start redis')
    time.sleep(5)
    ret = testbase.wait_until_finished_to_set_up_role(target)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (target['id']))

    # check value
    recovered_redis = redis_mgmt.Redis(target['id'])
    ret = recovered_redis.connect(target['ip'], target['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    for i in range(0, 200):
        key = '%s%d' % (key_base, i)
        cmd = 'get %s\r\n' % (key)
        recovered_redis.write(cmd)
        recovered_redis.read_until('\r\n')
        response = recovered_redis.read_until('\r\n')
        self.assertEqual(response, '%d\r\n' % i, 'inconsistent %s, %d' % (response, i))
def test_migration_with_expire_command(self):
    util.print_frame()

    util.log("start load_generator")
    load_gen_thrd_list = {}
    for i in range(1):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        load_gen_thrd_list[i].start()

    time.sleep(5)  # generate load for 5 sec
    tps = 20000
    src_pg_id = 0
    dst_pg_id = 1
    leader_cm = self.cluster['servers'][0]
    src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id)
    dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id)

    smr = smr_mgmt.SMR(src_master['id'])
    ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
    if ret != 0:
        util.log('failed to connect to smr(source master)')
        return False

    src_redis = redis_mgmt.Redis(src_master['id'])
    ret = src_redis.connect(src_master['ip'], src_master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    dst_redis = redis_mgmt.Redis(dst_master['id'])
    ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    ts = time.time()
    self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10)
    self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)

    self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")

    util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts))

    ts = time.time()
    self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

    # notify dst_redis of migration start
    util.log(">>> notify dst_redis of migration start (%s)" % time.asctime())
    cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
    dst_redis.write(cmd)
    res = dst_redis.read_until('\r\n')
    self.assertEquals(res, '+OK\r\n')

    # remote partial checkpoint
    util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
    cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
        src_master['ip'], src_master['redis_port'],
        dst_master['ip'], dst_master['redis_port'],
        0, 8191, tps)
    p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None)

    ret = p.wait()
    for line in p.stdout:
        if line.find("Checkpoint Sequence Number:") != -1:
            util.log("seqnumber : " + line[line.rfind(":") + 1:])
            seq = int(line[line.rfind(":") + 1:])
        util.log(">>>" + str(line.rstrip()))

    self.assertEqual(0, ret)
    util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")

    # bgsave for testing later about recovery during migration
    util.log(">>> bgsave for testing later about recovery during migration (%s)" % time.asctime())
    cmd = 'bgsave\r\n'
    dst_redis.write(cmd)
    res = dst_redis.read_until('\r\n')
    self.assertEquals(res, '+Background saving started\r\n')

    ts = time.time()
    self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20)

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10)
    self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100)

    # remote catchup (smr log migration)
    util.log(">>> start remote catchup (%s)" % time.asctime())

    dst_host = dst_master['ip']
    dst_smr_port = dst_master['smr_base_port']
    rle = '1 8192'
    num_part = 8192
    smr.write('migrate start %s %d %d %d %d %s\r\n' % (dst_host, dst_smr_port,
                                                       seq, tps, num_part, rle))
    response = smr.read_until('\r\n')
    if response[:3] != '+OK':
        util.log('failed to execute migrate start command, response:%s' % response)
        return False

    while True:
        smr.write('migrate info\r\n')
        response = smr.read_until('\r\n')
        seqs = response.split()
        logseq = int(seqs[1].split(':')[1])
        mig = int(seqs[2].split(':')[1])
        util.log('migrate info: %s' % response)
        if (logseq - mig < 500000):
            util.log('Remote catchup almost done. try mig2pc')
            break
        time.sleep(1)

    util.log(">>> sleep until 90 sec pass")
    self.assertFalse(time.time() - ts >= 90)
    time.sleep(90 - (time.time() - ts))

    res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
    self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20)

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
    self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20)

    util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

    # mig2pc
    util.log(">>> start mig2pc (%s)" % time.asctime())

    cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'], src_pg_id, dst_pg_id,
                                     0, 8191)
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    util.log('mig2pc result : ' + result)
    if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
        util.log('failed to execute mig2pc command, result:%s' % result)
        return False

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
    self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
    self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
    self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

    # finish migration
    smr.write('migrate interrupt\r\n')
    response = smr.read_until('\r\n')
    util.log('migrate interrupt: %s' % response)
    smr.disconnect()

    # notify dst_redis of migration end
    util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())
    cmd = 'migconf migend\r\n'
    dst_redis.write(cmd)
    res = dst_redis.read_until('\r\n')
    self.assertEquals(res, '+OK\r\n')

    cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
    src_redis.write(cmd)
    res = src_redis.read_until('\r\n')
    self.assertEquals(res, '+OK\r\n')

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))

    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))

    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

    self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

    # remote partial checkpoint
    util.log(">>> start rangedel (%s)" % time.asctime())

    cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
        src_master['ip'], src_master['redis_port'],
        0, 8191, tps)
    p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None)
    ret = p.wait()

    for line in p.stdout:
        util.log(">>>" + str(line.rstrip()))

    cmd = 'migconf clearend\r\n'
    src_redis.write(cmd)
    res = src_redis.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')

    time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].quit()
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].join()
        self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')

    # kill dst_redis and recover from bgsave
    util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime())

    dst_redis.disconnect()
    ret = testbase.request_to_shutdown_redis(dst_master)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    ret = testbase.request_to_shutdown_smr(dst_master)
    self.assertEquals(ret, 0, 'failed to shutdown smr')
    time.sleep(5)

    # capture the return value so the assertion below checks the start result
    ret = testbase.request_to_start_smr(dst_master)
    self.assertEqual(ret, 0, 'failed to start smr, server:%d' % dst_master['id'])

    ret = testbase.request_to_start_redis(dst_master)
    self.assertEqual(ret, 0, 'failed to start redis, server:%d' % dst_master['id'])

    ret = testbase.wait_until_finished_to_set_up_role(dst_master)
    self.assertEquals(ret, 0, 'failed to role change. server:%d' % (dst_master['id']))

    dst_redis = redis_mgmt.Redis(dst_master['id'])
    ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))

    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))

    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

    self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

    self.getS3TTL(dst_redis, 'S3:PermanentKey')

    # kill dst_slave redis and recover without dump file
    util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime())

    dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id)

    ret = testbase.request_to_shutdown_redis(dst_slave)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    ret = testbase.request_to_shutdown_smr(dst_slave)
    self.assertEquals(ret, 0, 'failed to shutdown smr')
    time.sleep(5)

    # capture the return value so the assertion below checks the start result
    ret = testbase.request_to_start_smr(dst_slave)
    self.assertEqual(ret, 0, 'failed to start smr, server:%d' % dst_slave['id'])

    ret = testbase.request_to_start_redis(dst_slave)
    self.assertEqual(ret, 0, 'failed to start redis, server:%d' % dst_slave['id'])

    ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
    self.assertEquals(ret, 0, 'failed to role change. server:%d' % (dst_slave['id']))

    dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
    ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired'))

    self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired'))

    self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired'))

    self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired'))

    self.assertTrue(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired'))

    self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

    # Go back to initial configuration
    self.assertTrue(util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
                    'failed to rollback migration')
def failover_while_hang(self, server):
    # timestamp before hang
    ts_before = util.get_timestamp_of_pgs(server)
    self.assertNotEqual(ts_before, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d' % (server['id'], ts_before))

    # hang
    util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' % (server['id'], server['ip'], server['smr_mgmt_port']))
    smr = smr_mgmt.SMR(server['id'])
    ret = smr.connect(server['ip'], server['smr_mgmt_port'])
    self.assertEqual(ret, 0,
                     'failed to connect to master. %s:%d' % (server['ip'], server['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

    time.sleep(4)

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected))
    util.log('succeeded : pgs%d state changed to F.' % server['id'])

    # shutdown
    util.log('shutdown pgs%d while hanging.' % server['id'])
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEqual(ret, 0, 'failed to shutdown smr. id:%d' % server['id'])
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEquals(ret, 0, 'failed to shutdown redis. id:%d' % server['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected))
    util.log('succeeded : pgs%d state changed to F.' % server['id'])

    # recovery
    util.log('restart pgs%d.' % server['id'])
    ret = testbase.request_to_start_smr(server)
    self.assertEqual(ret, 0, 'failed to start smr. id:%d' % server['id'])
    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis. id:%d' % server['id'])

    wait_count = 20
    ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected))
    util.log('succeeded : pgs%d state changed to N.' % server['id'])

    # wait for rejoin as a slave
    success = False
    for i in range(20):
        role = util.get_role_of_server(server)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(server)
            if ts_after != -1 and ts_before != ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave')
    util.log('succeeded : pgs%d joined as a slave.' % server['id'])

    return 0
def test_quorum(self):
    util.print_frame()

    master, slave1, slave2 = self.get_mss()

    expected = self.quorum_policy[1]
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEquals(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    ret = testbase.request_to_shutdown_smr(slave1)
    self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % slave1['id'])
    time.sleep(1)

    expected = self.quorum_policy[1]
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEquals(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    ret = testbase.request_to_shutdown_smr(slave2)
    self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % slave2['id'])
    time.sleep(1)

    expected = self.quorum_policy[0]
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEquals(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    # recovery
    ret = testbase.request_to_start_smr(slave1)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(slave1)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(slave1)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (slave1['id']))
    time.sleep(1)

    expected = self.quorum_policy[1]
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEquals(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    # recovery
    ret = testbase.request_to_start_smr(slave2)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(slave2)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(slave2)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (slave2['id']))
    time.sleep(1)

    expected = self.quorum_policy[1]
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEquals(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for i in range(0, 2):
        ret = testbase.request_to_shutdown_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
        util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

        ret = testbase.request_to_shutdown_redis(server_to_join[i])
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(expected, state,
                          'server%d - state:%s, expected:%s' % (server_to_join[i]['id'], state, expected))

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])

    ret = testbase.request_to_shutdown_redis(master)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recovery slaves
    for i in range(0, 2):
        ret = testbase.request_to_start_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server_to_join[i], 10)
        self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server_to_join[i]['id']))

        # check state N
        max_try = 20
        expected = 'N'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(server_to_join[i])
        self.assertEquals(expected, state,
                          'server%d - state:%s, expected:%s, role:%s' % (server_to_join[i]['id'], state, expected, role))

    # set value
    s = random.choice(server_to_join)
    redis = redis_mgmt.Redis(s['id'])  # connect to the chosen slave's redis
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    redis.disconnect()

    for i in range(0, 2):
        redis = redis_mgmt.Redis(server_to_join[i]['id'])
        ret = redis.connect(server_to_join[i]['ip'], server_to_join[i]['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        # check value
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write(cmd)
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % (j), 'inconsistent %s, %d' % (response[:-2], j))

    # try to recover master, but failed
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    max_try = 3
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(expected, state,
                        'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role))
    util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

    gw.disconnect()
    return 0
def recovery_with_local_checkpoint_and_remote_log( self, role ): server = util.get_server_by_role( self.cluster['servers'], role ) # set initial data in order to make an elapsed time for bgsave longer self.put_some_data() # set value ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( server['id'] ) ret = gw.connect( ip, port ) self.assertEqual( ret, 0, 'failed to connect to gateway, id:%d' % server['id'] ) timestamp = {} key_base = 'key0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999' for i in range (0, 50000): timestamp[i] = time.time() k = '%s_%d' % (key_base, i) cmd = 'set %s %f\r\n' % (k, timestamp[i]) gw.write( cmd ) response = gw.read_until( '\r\n' ) self.assertNotEqual( response.find( '+OK' ), -1, 'failed to set key value through gateway' ) # generate a check point bgsave_ret = util.bgsave( server ) self.assertTrue( bgsave_ret, 'failed to bgsave. pgs%d' % server['id'] ) # shutdown ret = testbase.request_to_shutdown_smr( server ) self.assertEqual( ret, 0, 'failed to shutdown smr' ) ret = testbase.request_to_shutdown_redis( server ) self.assertEqual( ret, 0, 'failed to shutdown redis' ) util.log('succeeded : shutdown pgs%d' % (server['id'])) # delete smr_logs ret = util.delete_smr_logs( server['id'] ) self.assertEqual( ret, 0, 'failed to delete smr log, id:%d' % server['id'] ) util.log('succeeded : delete replication logs') time.sleep( 5 ) # set value ret = gw.connect( ip, port ) self.assertEqual( ret, 0, 'failed to connect to gateway' ) for i in range (50000, 100000): timestamp[i] = time.time() k = '%s_%d' % (key_base, i) cmd = 'set %s %f\r\n' % (k, timestamp[i]) gw.write( cmd ) response = gw.read_until( '\r\n' ) self.assertNotEqual( response.find( '+OK' ), -1, 'failed to set key value through gateway' ) # recovery ret = testbase.request_to_start_smr( server ) self.assertEqual( ret, 0, 'failed to start smr' ) ret = testbase.request_to_start_redis( server ) self.assertEqual( ret, 0, 'failed to start redis' ) time.sleep( 5 ) ret = testbase.wait_until_finished_to_set_up_role( server ) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) ) util.log('succeeded : recover pgs%d' % server['id']) # check value recovered_redis = redis_mgmt.Redis( server['id'] ) ret = recovered_redis .connect( server['ip'], server['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) for i in range (0, 100000): k = '%s_%d' % (key_base, i) cmd = 'get %s\r\n' % (k) recovered_redis .write( cmd ) recovered_redis.read_until( '\r\n' ) response = recovered_redis.read_until( '\r\n' ) self.assertEqual( response, '%f\r\n' % (timestamp[i]), 'inconsistent %s, %f' % (response, timestamp[i]) )
def failure_recovery(self, role, wait_count=10, redis_only=False): time.sleep(2) # get gateway info ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id']) ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port)) # set value key = 'new_key_haha' cmd = 'set %s 12345\r\n' % (key) gw.write(cmd) res = gw.read_until('\r\n') self.assertEquals(res, '+OK\r\n') # shutdown server = util.get_server_by_role(self.cluster['servers'], role) if redis_only == False: ret = testbase.request_to_shutdown_smr(server) self.assertEqual(ret, 0, 'failed to shutdown smr') ret = testbase.request_to_shutdown_redis(server) self.assertEquals(ret, 0, 'failed to shutdown redis') # check state F max_try = 20 expected = 'F' for i in range(0, max_try): state = util.get_smr_state(server, self.leader_cm) if expected == state: break time.sleep(1) self.assertEquals( expected, state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected)) # set value check_value = '54321' cmd = 'set %s %s\r\n' % (key, check_value) gw.write(cmd) res = gw.read_until('\r\n') self.assertEquals(res, '+OK\r\n') gw.disconnect() # recovery if redis_only == False: ret = testbase.request_to_start_smr(server) self.assertEqual(ret, 0, 'failed to start smr') ret = testbase.request_to_start_redis(server) self.assertEqual(ret, 0, 'failed to start redis') ret = testbase.wait_until_finished_to_set_up_role(server, wait_count) self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id'])) redis = redis_mgmt.Redis(server['id']) ret = redis.connect(server['ip'], server['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') # check state N max_try = 20 expected = 'N' for i in range(0, max_try): state = util.get_smr_state(server, self.leader_cm) if expected == state: break time.sleep(1) role = util.get_role_of_server(server) self.assertEquals( expected, state, 'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role)) # check value cmd = 'get %s\r\n' % (key) redis.write(cmd) redis.read_until('\r\n') response = redis.read_until('\r\n') self.assertEqual(response, '%s\r\n' % (check_value), 'inconsistent %s, %s' % (response, check_value))
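# failure_recovery() and its neighbours drive redis and the gateway with raw
# inline commands over a telnet-style connection and compare replies
# byte-for-byte ('+OK\r\n').  A minimal sketch of that framing, stdlib only;
# inline_cmd and is_ok are hypothetical names, not helpers the suite provides.
def inline_cmd(*args):
    """Build an inline command terminated by CRLF, as the tests do by hand."""
    return ' '.join(str(a) for a in args) + '\r\n'

def is_ok(reply):
    """True for a simple-string reply beginning with +OK."""
    return reply.startswith('+OK')

assert inline_cmd('set', 'new_key_haha', 12345) == 'set new_key_haha 12345\r\n'
assert is_ok('+OK\r\n')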
def consistent_after_failover( self ): max = 10000 wait_count = 15 key = 'caf' # get master, slave1, and slave2 master, slave1, slave2 = self.get_mss() # set value ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( ip ) gw.connect( ip, port ) for i in range( 0, max ): cmd = 'set %s%d %d\r\n' % (key, i, i) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) time.sleep( 5 ) # shutdown servers = [master, slave1, slave2] for server in servers: util.log('before shutdown pgs%d' % server['id']) for s in servers: self.getseq_log(s) ret = testbase.request_to_shutdown_smr( server ) self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'] ) ret = testbase.request_to_shutdown_redis( server ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) time.sleep( 5 ) # check state F for server in servers: state = self.get_expected_smr_state( server, 'F' ) self.assertEquals( 'F', state, 'server%d - state:%s' % (server['id'], state) ) # recovery for server in servers: ret = testbase.request_to_start_smr( server ) self.assertEqual( ret, 0, 'failed to start smr, server:%d' % server['id'] ) ret = testbase.request_to_start_redis( server, False ) self.assertEqual( ret, 0, 'failed to start redis, server:%d' % server['id'] ) util.log('after restart pgs%d' % server['id']) for s in servers: self.getseq_log(s) time.sleep( 5 ) # wait for master election for i in xrange(10): ret = util.check_cluster( self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'] ) if ret: break time.sleep(1) # check state for server in servers: ret = testbase.wait_until_finished_to_set_up_role( server, wait_count ) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (server['id']) ) state = self.get_expected_smr_state( server, 'N' ) role = util.get_role_of_server( server ) self.assertEquals( 'N', state, 'server%d - state:%s, role:%s' % (server['id'], state, role) ) the_number_of_master = 0 the_number_of_slave = 0 for server in servers: role = util.get_role_of_server( server ) if role == c.ROLE_MASTER: the_number_of_master = the_number_of_master + 1 elif role == c.ROLE_SLAVE: the_number_of_slave = the_number_of_slave + 1 self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave, 'failed to set roles, the number of master:%d, the number of slave:%d' % (the_number_of_master, the_number_of_slave) ) # get master, slave1, and slave2 master, slave1, slave2 = self.get_mss() # connect to a master`s redis and set data redis = redis_mgmt.Redis( master['id'] ) ret = redis.connect( master['ip'], master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % master['id'] ) for i in range( max, max*2 ): cmd = 'set %s%d %d\r\n' % (key, i, i) redis.write( cmd ) res = redis.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n', 'failed to get response, server:%d' % master['id'] ) redis.disconnect() # check slaves`s data slaves = [slave1, slave2] for slave in slaves: slave_redis = redis_mgmt.Redis( slave['id'] ) ret = slave_redis .connect( slave['ip'], slave['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % slave['id'] ) for i in range( 0, max*2 ): cmd = 'get %s%d\r\n' % (key, i) slave_redis.write( cmd ) trash = slave_redis.read_until( '\r\n' ) res = slave_redis.read_until( '\r\n' ) self.assertEquals( res, '%d\r\n' % i, 'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res) ) slave_redis.disconnect()
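# consistent_after_failover() tallies how many recovered servers came back as
# master and as slave before asserting on 1 master / 2 slaves.  A small sketch
# of the same tally with collections.Counter; plain strings stand in for the
# c.ROLE_MASTER / c.ROLE_SLAVE constants used by the suite.
from collections import Counter

def role_counts(roles):
    """Count each role value, e.g. Counter({'slave': 2, 'master': 1})."""
    return Counter(roles)

_counts = role_counts(['master', 'slave', 'slave'])
assert _counts['master'] == 1 and _counts['slave'] == 2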
def test_4_mgmt_is_isolated_with_red_failover(self): util.print_frame() util.iptables_print_list() cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0] util.log(util.json_to_str(cluster)) self.leader_cm = cluster['servers'][0] # MGMT mgmt_ip = cluster['servers'][0]['real_ip'] mgmt_port = cluster['servers'][0]['cm_port'] # Create cluster conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster ) self.assertIsNotNone(conf_checker, 'failed to initialize cluster') util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port) # Master must be the first pgs, cluster['servers'][0]. to_be_master = cluster['servers'][0] m = util.get_server_by_role_and_pg(cluster['servers'], 'master', to_be_master['pg_id']) master_id = -1 if m['id'] != to_be_master['id']: try_cnt = 0 while master_id != to_be_master['id'] and try_cnt < 20: master_id = util.role_change(cluster['servers'][0], cluster['cluster_name'], to_be_master['id']) try_cnt += 1 time.sleep(1) self.assertEquals(master_id, to_be_master['id'], 'change %d to a master fail' % to_be_master['id']) # Print initial state of cluster util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ') initial_state = [] self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state') # Set SMR option (slave_idle_timeout) util.log('\n\n\n ### Set SMR option ###') for s in cluster['servers']: t = telnet.Telnet('SMR%d' % s['id']) self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0, 'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port'])) cmd = 'confset slave_idle_timeout_msec 18000' util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd)) t.write('confset slave_idle_timeout_msec 18000\r\n') reply = t.read_until('\r\n').strip() util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply)) self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply) # Network isolation test for loop_cnt in range(3): # Block network util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt) for s in cluster['servers']: self.assertTrue(util.iptables_drop('A', '127.0.0.100', s['smr_mgmt_port']), 'add a bloking role to iptables fail.') for i in range(4): util.log('waiting... %d' % (i + 1)) time.sleep(1) # Check cluster state ok = False for i in range(7): isolated_states = [] util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) time.sleep(1) state_transition_done = True for s in isolated_states: if s['ip'] != '127.0.0.100': continue if s['active_role'] != '?' or s['mgmt_role'] != 'N': state_transition_done = False if state_transition_done : ok = True break time.sleep(1) self.assertTrue(ok, 'Fail, state transition') pgs_list = util.get_pgs_info_list(mgmt_ip, mgmt_port, cluster) reds = filter(lambda x: x['color'] == 'RED', pgs_list) # Shutdown server = cluster['servers'][random.choice(reds)['pgs_id']] util.log( 'shutdown pgs%d while hanging.' % server['id'] ) ret = testbase.request_to_shutdown_smr( server ) self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] ) ret = testbase.request_to_shutdown_redis( server ) self.assertEqual( ret, 0, 'failed to shutdown redis. 
id:%d' % server['id'] ) # Check state F max_try = 20 expected = 'F' for i in range( 0, max_try): util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port)) state = util._get_smr_state( server['id'], cluster['cluster_name'], mgmt_ip, mgmt_port ) if expected == state: break; time.sleep( 1 ) self.assertEqual( expected , state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected) ) util.log( 'succeeded : pgs%d state changed to F.' % server['id'] ) # Unblock network for s in cluster['servers']: self.assertTrue(util.iptables_drop('D', '127.0.0.100', s['smr_mgmt_port']), 'delete a bloking role to iptables fail.') # Check cluster state ok = False for i in range(10): final_state = [] util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True) state_consistency = True for s in final_state: if s['pgs_id'] == server['id']: continue if is_pgs_normal(s) == False: state_consistency = False if state_consistency: ok = True break time.sleep(1) self.assertTrue(ok, 'Fail, state consistency') # Recovery util.log( 'restart pgs%d.' % server['id'] ) ret = testbase.request_to_start_smr( server ) self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] ) ret = testbase.request_to_start_redis( server ) self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] ) wait_count = 20 ret = testbase.wait_until_finished_to_set_up_role( server, wait_count ) self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) ) redis = redis_mgmt.Redis( server['id'] ) ret = redis.connect( server['ip'], server['redis_port'] ) self.assertEqual( ret, 0, 'failed to connect to redis' ) ok = False for i in xrange(5): ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True) if ok: break else: time.sleep(1) self.assertTrue(ok, 'failed to check cluster state') # Reset SMR option (slave_idle_timeout) t = telnet.Telnet('SMR%d' % server['id']) self.assertEqual(t.connect(server['ip'], server['smr_mgmt_port']), 0, 'Failed to connect to smr. ADDR=%s:%d' % (server['ip'], server['smr_mgmt_port'])) cmd = 'confset slave_idle_timeout_msec 18000' util.log('[%s:%d] >> %s' % (server['ip'], server['smr_mgmt_port'], cmd)) t.write('confset slave_idle_timeout_msec 18000\r\n') reply = t.read_until('\r\n').strip() util.log('[%s:%d] << %s' % (server['ip'], server['smr_mgmt_port'], reply)) self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply) # Check state self.assertNotEqual(initial_state, None, 'initial_state is None') self.assertNotEqual(final_state, None, 'final_state is None') initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id'])) final_state = sorted(final_state, key=lambda x: int(x['pgs_id'])) for i in range(len(final_state)): msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts']) util.log(msg) if initial_state[i]['pgs_id'] == 1: self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg) self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state') self.assertTrue(conf_checker.final_check()) # Shutdown cluster default_cluster.finalize(cluster)
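# The isolation loop above treats the managed state transition as done once
# every PGS entry bound to 127.0.0.100 reports active_role '?' and mgmt_role
# 'N'.  A sketch of that check as a pure predicate over the dicts that
# util.check_cluster fills in; the field names are taken from the test, the
# full schema is assumed.
def isolation_transition_done(states, isolated_ip='127.0.0.100'):
    """True when every entry for the isolated ip shows active_role '?' and mgmt_role 'N'."""
    for s in states:
        if s.get('ip') != isolated_ip:
            continue
        if s.get('active_role') != '?' or s.get('mgmt_role') != 'N':
            return False
    return True

assert isolation_transition_done([{'ip': '127.0.0.100', 'active_role': '?', 'mgmt_role': 'N'}])
assert not isolation_transition_done([{'ip': '127.0.0.100', 'active_role': 'M', 'mgmt_role': 'N'}])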
def state_transition( self ): server = util.get_server_by_role( self.cluster['servers'], 'slave' ) self.assertNotEquals( server, None, 'failed to get_server_by_role-slave' ) # get gateway info ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] ) # check initial state state = self.get_expected_smr_state( server, 'N' ) role = util.get_role_of_server( server ) self.assertEquals( 'N', state, 'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role) ) # shutdown ret = testbase.request_to_shutdown_smr( server ) self.assertEquals( ret, 0, 'failed to shutdown smr' ) ret = testbase.request_to_shutdown_redis( server ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) time.sleep( 3 ) # check state F expected = 'F' state = self.get_expected_smr_state( server, expected ) self.assertEquals( expected , state, 'server%d - state:%s, but expected:%s' % (server['id'], state, expected) ) # set value ret = gw.connect( ip, port ) self.assertEquals( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) ) timestamp = 0.0 for i in range( 0, 100 ): timestamp = time.time() key = 'new_key_haha' cmd = 'set %s %f\r\n' % (key, timestamp) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) gw.disconnect() # recovery ret = testbase.request_to_start_smr( server ) self.assertEquals( ret, 0, 'failed to start smr' ) ret = testbase.request_to_start_redis( server ) self.assertEquals( ret, 0, 'failed to start redis' ) ret = testbase.wait_until_finished_to_set_up_role( server, 10 ) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) ) time.sleep( 5 ) redis = redis_mgmt.Redis( server['id'] ) ret = redis.connect( server['ip'], server['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) # check state N expected = 'N' max_try = 20 for i in range( 0, max_try ): state = self.get_expected_smr_state( server, expected ) if state == expected: break time.sleep( 1 ) role = util.get_role_of_server( server ) self.assertEquals( expected , state, 'server%d - state:%s, role:%s, but expected:%s' % (server['id'], state, role, expected) )
def test_migration_with_expire_command(self): util.print_frame() util.log("start load_generator") load_gen_thrd_list = {} for i in range(1): ip, port = util.get_rand_gateway(self.cluster) load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) load_gen_thrd_list[i].start() time.sleep(5) # generate load for 5 sec tps = 20000 src_pg_id = 0 dst_pg_id = 1 leader_cm = self.cluster['servers'][0] src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id) dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id) smr = smr_mgmt.SMR(src_master['id']) ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port']) if ret != 0: util.log('failed to connect to smr(source master)') return False src_redis = redis_mgmt.Redis(src_master['id']) ret = src_redis.connect(src_master['ip'], src_master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) dst_redis = redis_mgmt.Redis(dst_master['id']) ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) ts = time.time() self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10) self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired') self.assertEquals(res, ":0\r\n") util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts)) ts = time.time() self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10) self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20) # notify dst_redis of migration start util.log(">>> notify dst_redis of migration start (%s)" % time.asctime()) cmd = 'migconf migstart %d-%d\r\n' % (0, 8191) dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals( res, '+OK\r\n' ) # remote partial checkpoint util.log(">>> start remote checkpoint and load (%s)" % time.asctime()) cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % ( src_master['ip'], src_master['redis_port'], dst_master['ip'], dst_master['redis_port'], 0, 8191, tps) p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None) ret = p.wait() for line in p.stdout: if line.find("Checkpoint Sequence Number:") != -1: util.log("seqnumber : " + line[line.rfind(":")+1:]) seq = int(line[line.rfind(":")+1:]) util.log(">>>" + str(line.rstrip())) self.assertEqual(0, ret) util.log(">>> end remote checkpoint and load (%s)" % time.asctime()) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = 
self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") # bgsave for testing later about recovery during migration util.log(">>> bgsave for testing later about recovery during migration (%s)" % time.asctime()) cmd = 'bgsave\r\n' dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals( res, '+Background saving started\r\n' ) ts = time.time() self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10) self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10) self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100) # remote catchup (smr log migration) util.log(">>> start remote catchup (%s)" % time.asctime()) dst_host = dst_master['ip'] dst_smr_port = dst_master['smr_base_port'] rle = '1 8192' num_part = 8192 smr.write('migrate start %s %d %d %d %d %s\r\n' % (dst_host, dst_smr_port, seq, tps, num_part, rle)) response = smr.read_until('\r\n') if response[:3] != '+OK': util.log('failed to execute migrate start command, response:%s' % response) return False while True: smr.write('migrate info\r\n') response = smr.read_until('\r\n') seqs = response.split() logseq = int(seqs[1].split(':')[1]) mig = int(seqs[2].split(':')[1]) util.log('migrate info: %s' % response) if (logseq-mig < 500000): util.log('Remote catchup almost done. 
try mig2pc') break time.sleep(1) util.log(">>> sleep until 90 sec pass") self.assertFalse(time.time() - ts >= 90) time.sleep(90 - (time.time() - ts)) res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10) self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20) self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10) self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10) self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20) self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10) self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20) util.log(">>> remote catchup phase almost done (%s)" % time.asctime()) # mig2pc util.log(">>> start mig2pc (%s)" % time.asctime()) cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'], src_pg_id, dst_pg_id, 0, 8191) result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd) util.log('mig2pc result : ' + result) if not result.startswith('{"state":"success","msg":"+OK"}\r\n'): util.log('failed to execute mig2pc command, result:%s' % result) return False util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10) self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20) self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10) self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20) # finish migration smr.write('migrate interrupt\r\n') response = smr.read_until('\r\n') util.log('migrate interrupt: %s' % response) smr.disconnect() # notify dst_redis of migration end util.log(">>> notify dst_redis of migration end (%s)" % time.asctime()) cmd = 'migconf migend\r\n' dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals( res, '+OK\r\n' ) cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191) src_redis.write(cmd) res = src_redis.read_until('\r\n') 
self.assertEquals( res, '+OK\r\n' ) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired') self.assertEquals(res, ":0\r\n") ts = time.time() util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')) # remote partial checkpoint util.log(">>> start rangedel (%s)" % time.asctime()) cmd = "./cluster-util --rangedel %s %d %d-%d %d" % ( src_master['ip'], src_master['redis_port'], 0, 8191, tps) p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None) ret = p.wait() for line in p.stdout: util.log(">>>" + str(line.rstrip())) cmd = 'migconf clearend\r\n' src_redis.write(cmd) res = src_redis.read_until('\r\n') self.assertEqual(res, '+OK\r\n') 
time.sleep(5) # generate load for 5 sec # check consistency of load_generator for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].quit() for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].join() self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration') # kill dst_redis and recover from bgsave util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime()) dst_redis.disconnect() ret = testbase.request_to_shutdown_redis(dst_master) self.assertEquals( ret, 0, 'failed to shutdown redis' ) ret = testbase.request_to_shutdown_smr(dst_master) self.assertEquals(ret, 0, 'failed to shutdown smr') time.sleep(5) testbase.request_to_start_smr(dst_master) self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_master['id'] ) ret = testbase.request_to_start_redis(dst_master) self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_master['id'] ) ret = testbase.wait_until_finished_to_set_up_role(dst_master) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_master['id']) ) dst_redis = redis_mgmt.Redis(dst_master['id']) ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')) 
self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')) self.getS3TTL(dst_redis, 'S3:PermanentKey') # kill dst_slave redis and recover without dump file util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime()) dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id) ret = testbase.request_to_shutdown_redis(dst_slave) self.assertEquals( ret, 0, 'failed to shutdown redis' ) ret = testbase.request_to_shutdown_smr(dst_slave) self.assertEquals(ret, 0, 'failed to shutdown smr') time.sleep(5) testbase.request_to_start_smr(dst_slave) self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_slave['id'] ) ret = testbase.request_to_start_redis(dst_slave) self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_slave['id'] ) ret = testbase.wait_until_finished_to_set_up_role(dst_slave) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_slave['id']) ) dst_redis_slave = redis_mgmt.Redis(dst_slave['id']) ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired')) 
self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired')) self.getS3TTL(dst_redis_slave, 'S3:PermanentKey') # Go back to initial configuration self.assertTrue(util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000), 'failed to rollback migration')
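# The remote-catchup loop in test_migration_with_expire_command splits the
# 'migrate info' reply on whitespace and pulls integers out of the second and
# third 'name:value' tokens.  A sketch of that parsing as a dict; the sample
# reply below only illustrates the token shape the test relies on, it is not
# captured from a real smr-replicator.
def parse_migrate_info(reply):
    """Turn the 'name:value' tokens of a migrate info reply into a dict."""
    fields = {}
    for token in reply.split():
        if ':' not in token:
            continue
        name, _, value = token.partition(':')
        try:
            fields[name] = int(value)
        except ValueError:
            fields[name] = value
    return fields

_info = parse_migrate_info('+OK log:500100 mig:499900')  # illustrative shape only
assert _info['log'] - _info['mig'] == 200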
def test_4_role_change_with_failover(self): util.print_frame() loop_cnt = 0 while loop_cnt < 5: util.log('') util.log('Loop:%d' % loop_cnt) util.log("States (before role change)") util.log_server_state(self.cluster) target = random.choice(self.cluster['servers']) # bgsave ret = util.bgsave(target) self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id']) # shutdown util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port'])) ret = testbase.request_to_shutdown_smr( target ) self.assertEqual( ret, 0, 'failed to shutdown smr' ) ret = testbase.request_to_shutdown_redis( target ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) running_servers = [] for s in self.cluster['servers']: if s != target: running_servers.append(s) # Get old timestamp old_timestamps = {} for s in running_servers: ts = util.get_timestamp_of_pgs(s) old_timestamps[s['id']] = ts # Start load generator self.load_gen_list = {} util.log('start load generator') for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) load_gen = load_generator.LoadGenerator(i, ip, port) load_gen.start() self.load_gen_list[i] = load_gen m, s1, s2 = util.get_mss(self.cluster) self.assertNotEqual(m, None, 'master is None.') self.assertNotEqual(s1, None, 'slave1 is None.') # Role change master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id']) self.assertNotEqual(master_id, -1, 'role_change failed') util.log("States (after role change)") util.log_server_state(self.cluster) # Check - get new timestamp new_timestamps= {} for s in running_servers: ts = util.get_timestamp_of_pgs( s ) new_timestamps[s['id']] = ts # Check - compare old timestamps and new timestamps for s in running_servers: old_ts = old_timestamps[s['id']] new_ts = new_timestamps[s['id']] self.assertNotEqual(old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts)) # Check quorum m = self.cluster['servers'][master_id] expected = 1 ok = self.__check_quorum(m, expected) self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % (expected)) # recovery util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port'])) ret = testbase.request_to_start_smr( target ) self.assertEqual( ret, 0, 'failed to start smr' ) util.log('start smr-replicator done') ret = testbase.request_to_start_redis( target, 60 ) self.assertEqual( ret, 0, 'failed to start redis' ) util.log('start redis-arc done') ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id']) ) util.log("States (after recovery)") util.log_server_state(self.cluster) # Check quorum expected = 1 ok = self.__check_quorum(m, expected) self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % (expected)) # Cheeck Consistency util.log('stop load generator') for i in range(self.max_load_generator): self.load_gen_list[i].quit() for i in range(self.max_load_generator): self.load_gen_list[i].join() self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration') self.load_gen_list.pop(i, None) loop_cnt += 1 return 0
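# Several tests finish by quitting every LoadGenerator, joining it and
# asserting isConsistent().  A sketch of that teardown as one helper over any
# objects exposing that interface (quit/join/isConsistent, as used above);
# stop_and_check and _FakeGen are hypothetical names.
def stop_and_check(load_gens):
    """quit()/join() every generator and return the keys whose isConsistent() is False."""
    for g in load_gens.values():
        g.quit()
    inconsistent = []
    for key, g in load_gens.items():
        g.join()
        if not g.isConsistent():
            inconsistent.append(key)
    return inconsistent

class _FakeGen(object):  # stand-in so the sketch runs without a cluster
    def quit(self): pass
    def join(self): pass
    def isConsistent(self): return True

assert stop_and_check({0: _FakeGen()}) == []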
def failover_while_hang( self, server ): # timestamp before hang ts_before = util.get_timestamp_of_pgs( server ) self.assertNotEqual( ts_before, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (server['id'], ts_before) ) # hang util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' % (server['id'], server['ip'], server['smr_mgmt_port'])) smr = smr_mgmt.SMR( server['id'] ) ret = smr.connect( server['ip'], server['smr_mgmt_port'] ) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (server['ip'], server['smr_mgmt_port']) ) smr.write( 'fi delay sleep 1 10000\r\n' ) reply = smr.read_until( '\r\n', 1 ) if reply != None and reply.find('-ERR not supported') != -1: self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' ) time.sleep( 4 ) # check state F max_try = 20 expected = 'F' for i in range( 0, max_try): state = util.get_smr_state( server, self.leader_cm ) if expected == state: break; time.sleep( 1 ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected) ) util.log( 'succeeded : pgs%d state changed to F.' % server['id'] ) # shutdown util.log( 'shutdown pgs%d while hanging.' % server['id'] ) ret = testbase.request_to_shutdown_smr( server ) self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] ) ret = testbase.request_to_shutdown_redis( server ) self.assertEquals( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] ) # check state F max_try = 20 expected = 'F' for i in range( 0, max_try): state = util.get_smr_state( server, self.leader_cm ) if expected == state: break; time.sleep( 1 ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected) ) util.log( 'succeeded : pgs%d state changed to F.' % server['id'] ) # recovery util.log( 'restart pgs%d.' % server['id'] ) ret = testbase.request_to_start_smr( server ) self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] ) ret = testbase.request_to_start_redis( server ) self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] ) wait_count = 20 ret = testbase.wait_until_finished_to_set_up_role( server, wait_count ) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) ) redis = redis_mgmt.Redis( server['id'] ) ret = redis.connect( server['ip'], server['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) # check state N max_try = 20 expected = 'N' for i in range( 0, max_try): state = util.get_smr_state( server, self.leader_cm ) if expected == state: break; time.sleep( 1 ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected) ) util.log( 'succeeded : pgs%d state changed to N.' % server['id'] ) # wait for rejoin as a slave success = False for i in range( 20 ): role = util.get_role_of_server( server ) if role == c.ROLE_SLAVE: ts_after = util.get_timestamp_of_pgs( server ) if ts_after != -1 and ts_before != ts_after: success = True break time.sleep( 1 ) self.assertEqual( success, True, 'failed to rejoin as a slave' ) util.log( 'succeeded : pgs%d joined as a slave.' % server['id'] ) return 0
def state_transition(self): server = util.get_server_by_role(self.cluster['servers'], 'slave') self.assertNotEquals(server, None, 'failed to get_server_by_role-slave') # get gateway info ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id']) # check initial state state = self.get_expected_smr_state(server, 'N') role = util.get_role_of_server(server) self.assertEquals( 'N', state, 'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role)) # shutdown ret = testbase.request_to_shutdown_smr(server) self.assertEquals(ret, 0, 'failed to shutdown smr') ret = testbase.request_to_shutdown_redis(server) self.assertEquals(ret, 0, 'failed to shutdown redis') time.sleep(3) # check state F expected = 'F' state = self.get_expected_smr_state(server, expected) self.assertEquals( expected, state, 'server%d - state:%s, but expected:%s' % (server['id'], state, expected)) # set value ret = gw.connect(ip, port) self.assertEquals(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port)) timestamp = 0.0 for i in range(0, 100): timestamp = time.time() key = 'new_key_haha' cmd = 'set %s %f\r\n' % (key, timestamp) gw.write(cmd) res = gw.read_until('\r\n') self.assertEquals(res, '+OK\r\n') gw.disconnect() # recovery ret = testbase.request_to_start_smr(server) self.assertEquals(ret, 0, 'failed to start smr') ret = testbase.request_to_start_redis(server) self.assertEquals(ret, 0, 'failed to start redis') ret = testbase.wait_until_finished_to_set_up_role(server, 10) self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id'])) time.sleep(5) redis = redis_mgmt.Redis(server['id']) ret = redis.connect(server['ip'], server['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') # check state N expected = 'N' max_try = 20 for i in range(0, max_try): state = self.get_expected_smr_state(server, expected) if state == expected: break time.sleep(1) role = util.get_role_of_server(server) self.assertEquals( expected, state, 'server%d - state:%s, role:%s, but expected:%s' % (server['id'], state, role, expected))
def test_7_dirty_network_fi(self): util.print_frame() clnts = [] try: out = util.sudo('iptables -L') util.log('====================================================================') util.log('out : %s' % out) util.log('out.return_code : %d' % out.return_code) util.log('out.stderr : %s' % out.stderr) util.log('out.succeeded : %s' % out.succeeded) # Add forwarding role out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1') self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out) out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1') self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out) cluster_name = 'network_isolation_cluster_1' cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0] util.log(util.json_to_str(cluster)) self.leader_cm = cluster['servers'][0] # MGMT mgmt_ip = cluster['servers'][0]['real_ip'] mgmt_port = cluster['servers'][0]['cm_port'] # Create cluster ret = default_cluster.initialize_starting_up_smr_before_redis( cluster, conf={'cm_context':'applicationContext-fi.xml'}) self.assertEqual(0, ret, 'failed to TestMaintenance.initialize') # Print initial state of cluster util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ') initial_state = [] self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state') # Start crc16 client for s in cluster['servers']: c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['gateway_port'], 3000, verbose=False) c.start() clnts.append(c) # Network isolation test cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'qa', 'me', 'yj', 'bj', 'mg'], ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1) for fi in cmfi: # Block network util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi)) ret = block_network(cluster, mgmt_ip, mgmt_port) self.assertTrue(ret, '[%s] failed to block network.' % str(fi)) for i in xrange(4): util.log('waiting... %d' % (i + 1)) time.sleep(1) # Check cluster state ok = False for i in xrange(10): isolated_states = [] util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) state_transition_done = True for s in isolated_states: if s['ip'] != '127.0.0.100': continue if s['active_role'] != '?' or s['mgmt_role'] != 'N': state_transition_done = False if state_transition_done: ok = True break time.sleep(1) self.assertTrue(ok, 'Fail, state transition') # Fault injection try: self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), "Confmaster command fail. fi: %s" % str(fi)) except ValueError as e: self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply)) # Unblock network util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi)) ret = unblock_network(cluster, mgmt_ip, mgmt_port, None) self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi)) for i in xrange(4): util.log('waiting... %d' % (i + 1)) time.sleep(1) # Check cluster state ok = False for i in xrange(10): isolated_states = [] ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) if ok: break time.sleep(1) self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi)) check_cluster = False # 'bj', 'slave' if fi[0] == 'bj' and fi[1] == 'slave': m, s1, s2 = util.get_mss(cluster) ret = util.role_lconn(s1) self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' 
% str(fi)) check_cluster = True # 'me', 'lconn' elif fi[0] == 'me' and fi[1] == 'lconn': m, s1, s2 = util.get_mss(cluster) ret = util.role_lconn(m) self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi)) check_cluster = True # 'qa', 'setquorum' elif fi[0] == 'qa' and fi[1] == 'setquorum': m, s1, s2 = util.get_mss(cluster) # shutdown ret = testbase.request_to_shutdown_smr(s1) self.assertEqual(0, ret, '[%s] failed to shutdown smr%d' % (str(fi), s1['id'])) ret = testbase.request_to_shutdown_redis(s1) self.assertEqual(0, ret, '[%s] failed to shutdown redis%d' % (str(fi), s1['id'])) # Check quorum q = -1 for q_cnt in xrange(20): q = util.get_quorum(m) if q == 1: break time.sleep(1) self.assertEquals(1, q, "[%s] check quorum fail." % str(fi)) # Modify quorum ret = util.cmd_to_smr_addr(m['ip'], m['smr_mgmt_port'], 'setquorum 0\r\n') self.assertEqual("+OK\r\n", ret, '[%s] "setquorum 0" fail.' % str(fi)) # Check quorum q = -1 for q_cnt in xrange(20): q = util.get_quorum(m) if q == 1: break time.sleep(1) self.assertEquals(1, q, "[%s] check quorum fail." % str(fi)) # recovery ret = testbase.request_to_start_smr(s1) self.assertEqual(0, ret, '[%s] failed to start smr' % str(fi)) ret = testbase.request_to_start_redis(s1, max_try=120) self.assertEqual(0, ret, '[%s] failed to start redis' % str(fi)) ret = testbase.wait_until_finished_to_set_up_role(s1, 11) self.assertEqual(0, ret, '[%s] failed to role change. smr_id:%d' % (str(fi), s1['id'])) check_cluster = True # 'setquorum' elif fi[1] == 'setquorum': m, s1, s2 = util.get_mss(cluster) ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20) self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret)) check_cluster = True if check_cluster: # Check cluster state ok = False for i in xrange(20): isolated_states = [] ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) if ok: break time.sleep(1) self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi)) # Check fault injection ok = False for i in xrange(10): count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port) if count == 0: ok = True break time.sleep(0.5) self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi)) # Shutdown cluster ret = default_cluster.finalize( cluster ) self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi)) # Delete forwarding role out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1') self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out) out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1') self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out) for c in clnts: self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi)) finally: for c in clnts: c.quit() for c in clnts: c.join()
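# test_7_dirty_network_fi installs and later removes a pair of NAT rules that
# redirect 127.0.0.100 back to 127.0.0.1.  A sketch that only builds the same
# command strings ('A' appends, 'D' deletes) so the intent stays visible
# without touching iptables; dnat_rule is a hypothetical helper name.
def dnat_rule(action, chain, fake_ip='127.0.0.100', real_ip='127.0.0.1'):
    """Return the iptables command used by the test for the given chain."""
    return ('iptables -t nat -%s %s -d %s -p tcp -j DNAT --to-destination %s'
            % (action, chain, fake_ip, real_ip))

assert dnat_rule('A', 'OUTPUT') == \
    'iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1'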
def test_4_role_change_with_failover(self): util.print_frame() loop_cnt = 0 while loop_cnt < 5: util.log('') util.log('Loop:%d' % loop_cnt) util.log("States (before role change)") util.log_server_state(self.cluster) target = random.choice(self.cluster['servers']) # bgsave ret = util.bgsave(target) self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id']) # shutdown util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port'])) ret = testbase.request_to_shutdown_smr(target) self.assertEqual(ret, 0, 'failed to shutdown smr') ret = testbase.request_to_shutdown_redis(target) self.assertEquals(ret, 0, 'failed to shutdown redis') r = '' expected = 'N' for fc_cnt in xrange(20): r = util.get_smr_role_of_cm(target, self.leader_cm) if r == expected: break time.sleep(0.5) self.assertEquals(r, expected, 'failure detection error.') running_servers = [] for s in self.cluster['servers']: if s != target: running_servers.append(s) # Get old timestamp old_timestamps = {} for s in running_servers: ts = util.get_timestamp_of_pgs(s) old_timestamps[s['id']] = ts # Start load generator self.load_gen_list = {} util.log('start load generator') for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) load_gen = load_generator.LoadGenerator(i, ip, port) load_gen.start() self.load_gen_list[i] = load_gen m, s1, s2 = util.get_mss(self.cluster) self.assertNotEqual(m, None, 'master is None.') self.assertNotEqual(s1, None, 'slave1 is None.') # Role change master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id']) self.assertNotEqual(master_id, -1, 'role_change failed') util.log("States (after role change)") util.log_server_state(self.cluster) # Check - get new timestamp new_timestamps = {} for s in running_servers: ts = util.get_timestamp_of_pgs(s) new_timestamps[s['id']] = ts # Check - compare old timestamps and new timestamps for s in running_servers: old_ts = old_timestamps[s['id']] new_ts = new_timestamps[s['id']] self.assertNotEqual( old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts)) # Check quorum m = self.cluster['servers'][master_id] expected = 1 ok = self.__check_quorum(m, expected) self.assertTrue( ok, 'unexpected quorum(after role change). expected:%s' % (expected)) # recovery util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port'])) ret = testbase.request_to_start_smr(target) self.assertEqual(ret, 0, 'failed to start smr') util.log('start smr-replicator done') ret = testbase.request_to_start_redis(target, 60) self.assertEqual(ret, 0, 'failed to start redis') util.log('start redis-arc done') ret = testbase.wait_until_finished_to_set_up_role(target, max_try=300) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id'])) util.log("States (after recovery)") util.log_server_state(self.cluster) # Check cluster state normal_state = False for i in xrange(20): normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True) if normal_state: break time.sleep(0.5) self.assertTrue(normal_state, "Unstable cluster state") # Check quorum expected = 2 ok = self.__check_quorum(m, expected) self.assertTrue( ok, 'unexpected quorum(after recovery). 
expected:%s' % (expected)) # Check Consistency util.log('stop load generator') for i in range(self.max_load_generator): self.load_gen_list[i].quit() for i in range(self.max_load_generator): self.load_gen_list[i].join() self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration') self.load_gen_list.pop(i, None) loop_cnt += 1 return 0
def recovery_with_local_checkpoint_and_remote_log(self, role): server = util.get_server_by_role(self.cluster['servers'], role) # set initial data in order to make an elapsed time for bgsave longer self.put_some_data() # set value ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway(server['id']) ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway, id:%d' % server['id']) timestamp = {} key_base = 'key0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999' for i in range(0, 50000): timestamp[i] = time.time() k = '%s_%d' % (key_base, i) cmd = 'set %s %f\r\n' % (k, timestamp[i]) gw.write(cmd) response = gw.read_until('\r\n') self.assertNotEqual(response.find('+OK'), -1, 'failed to set key value through gateway') # generate a check point bgsave_ret = util.bgsave(server) self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % server['id']) # shutdown ret = testbase.request_to_shutdown_smr(server) self.assertEqual(ret, 0, 'failed to shutdown smr') ret = testbase.request_to_shutdown_redis(server) self.assertEqual(ret, 0, 'failed to shutdown redis') util.log('succeeded : shutdown pgs%d' % (server['id'])) # delete smr_logs ret = util.delete_smr_logs(server['id']) self.assertEqual(ret, 0, 'failed to delete smr log, id:%d' % server['id']) util.log('succeeded : delete replication logs') time.sleep(5) # set value ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway') for i in range(50000, 100000): timestamp[i] = time.time() k = '%s_%d' % (key_base, i) cmd = 'set %s %f\r\n' % (k, timestamp[i]) gw.write(cmd) response = gw.read_until('\r\n') self.assertNotEqual(response.find('+OK'), -1, 'failed to set key value through gateway') # recovery ret = testbase.request_to_start_smr(server) self.assertEqual(ret, 0, 'failed to start smr') ret = testbase.request_to_start_redis(server) self.assertEqual(ret, 0, 'failed to start redis') time.sleep(5) ret = testbase.wait_until_finished_to_set_up_role(server) self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id'])) util.log('succeeded : recover pgs%d' % server['id']) # check value recovered_redis = redis_mgmt.Redis(server['id']) ret = recovered_redis.connect(server['ip'], server['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') for i in range(0, 100000): k = '%s_%d' % (key_base, i) cmd = 'get %s\r\n' % (k) recovered_redis.write(cmd) recovered_redis.read_until('\r\n') response = recovered_redis.read_until('\r\n') self.assertEqual(response, '%f\r\n' % (timestamp[i]), 'inconsistent %s, %f' % (response, timestamp[i]))
def test_quorum(self):
    util.print_frame()

    master, slave1, slave2 = util.get_mss(self.cluster)

    expected = 2
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    ret = testbase.request_to_shutdown_smr(slave1)
    self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % slave1['id'])
    time.sleep(1)

    expected = 1
    max_try = 20
    for i in range(0, max_try):
        master = util.get_server_by_role(self.cluster['servers'], 'master')
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    ret = testbase.request_to_shutdown_smr(slave2)
    self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % slave2['id'])
    time.sleep(1)

    expected = 0
    max_try = 20
    for i in range(0, max_try):
        master = util.get_server_by_role(self.cluster['servers'], 'master')
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    # recovery
    ret = testbase.request_to_start_smr(slave1)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(slave1)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(slave1)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (slave1['id']))
    time.sleep(1)

    expected = 1
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    # recovery
    ret = testbase.request_to_start_smr(slave2)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(slave2)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(slave2)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (slave2['id']))
    time.sleep(1)

    expected = 2
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))
def test_large_scale_master_election(self):
    util.print_frame()

    # initialize cluster information
    pgs_id = 10
    cluster = {
        'cluster_name': 'large_scale',
        'keyspace_size': 8192,
        'quorum_policy': '0:1',
        'slots': [],
        'pg_id_list': [],
        'servers': []
    }
    pg_max = 32
    pgs_per_pg = 3

    for pg_id in range(pg_max):
        cluster['pg_id_list'].append(pg_id)
        cluster['slots'].append(8192 / pg_max * pg_id)
        if pg_id == pg_max - 1:
            cluster['slots'].append(8191)
        else:
            cluster['slots'].append(8192 / pg_max * (pg_id + 1) - 1)

        for pgs in range(pgs_per_pg):
            smr_base_port = 15000 + pgs_id * 20
            smr_mgmt_port = smr_base_port + 3
            gateway_port = smr_base_port + 10
            redis_port = smr_base_port + 9

            server = {}
            server['id'] = pgs_id
            pgs_id = pgs_id + 1
            server['cluster_name'] = cluster['cluster_name']
            server['ip'] = self.cluster['servers'][0]['ip']
            server['pm_name'] = self.cluster['servers'][0]['pm_name']
            server['cm_port'] = None
            server['pg_id'] = pg_id
            server['smr_base_port'] = smr_base_port
            server['smr_mgmt_port'] = smr_mgmt_port
            server['gateway_port'] = gateway_port
            server['redis_port'] = redis_port
            server['zk_port'] = 2181

            cluster['servers'].append(server)

    # send initialize commands to confmaster
    testbase.initialize_cluster(cluster, self.leader_cm)

    # set up pgs binaries
    try:
        for server in cluster['servers']:
            id = server['id']
            util.log('copy binaries, server_id=%d' % id)
            util.copy_smrreplicator(id)
            util.copy_gw(id)
            util.copy_redis_server(id)
            util.copy_cluster_util(id)
    except IOError as e:
        util.log(e)
        util.log('Error: can not find file or read data')
        self.assertEqual(0, 1, 'Error: can not find file or read data')
    except:
        util.log('Error: file open error.')

    # clean up servers' directories
    for server in cluster['servers']:
        ret = testbase.cleanup_pgs_log_and_ckpt(cluster['cluster_name'], server)
        self.assertEqual(ret, 0, 'failed to cleanup_test_environment, id=%d' % server['id'])

    # start pgs
    for server in cluster['servers']:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to request_to_start_smr, id=%d' % server['id'])

    for server in cluster['servers']:
        ret = testbase.request_to_start_redis(server, check=False)
        self.assertEqual(ret, 0, 'failed to request_to_start_redis, id=%d' % server['id'])

    for server in cluster['servers']:
        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEqual(ret, 0, 'failed to role set up, id=%d' % server['id'])

    for i in range(4):
        server = cluster['servers'][i]
        ret = testbase.request_to_start_gateway(cluster['cluster_name'], server, self.leader_cm)
        self.assertEqual(ret, 0, 'failed to request_to_start_gateway, id=%d' % server['id'])

    clusters = cluster_ls()
    self.assertNotEqual(len(clusters), 0, 'There is no cluster.')

    ok = True
    for c in clusters:
        if not util.check_cluster(str(c), self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True):
            ok = False
    self.assertEqual(ok, True, 'failed to initialize roles of pgs')
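# NOTE: standalone sketch, not part of the original test suite.
# test_large_scale_master_election splits an 8192-slot keyspace evenly over 32
# partition groups: each PG owns [8192 / pg_max * pg_id, 8192 / pg_max * (pg_id + 1) - 1],
# with the last PG capped at slot 8191. The same arithmetic as a small helper;
# the name slot_range is an illustration only.
def slot_range(pg_id, pg_max=32, keyspace_size=8192):
    """Return the (start, end) slot pair owned by partition group pg_id."""
    width = keyspace_size // pg_max  # 256 slots per PG for 8192 / 32
    start = width * pg_id
    end = keyspace_size - 1 if pg_id == pg_max - 1 else width * (pg_id + 1) - 1
    return start, end

# slot_range(0)  -> (0, 255)
# slot_range(31) -> (7936, 8191)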
def test_5_mgmt_is_isolated_with_master_failover(self):
    util.print_frame()

    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # Add forwarding rule (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    ret = default_cluster.initialize_starting_up_smr_before_redis(cluster)
    self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True),
                    'failed to check cluster state')

    # Network isolation test
    for loop_cnt in range(3):
        master, slave1, slave2 = util.get_mss(cluster)
        self.assertNotEqual(master, None, 'there is no master')
        self.assertNotEqual(slave1, None, 'there is no slave1')
        self.assertNotEqual(slave2, None, 'there is no slave2')

        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state
        ok = False
        for i in range(10):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['ip'] != '127.0.0.100':
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Shutdown master
        util.log('shutdown pgs%d while hanging.' % master['id'])
        ret = testbase.request_to_shutdown_smr(master)
        self.assertEqual(ret, 0, 'failed to shutdown smr. id:%d' % master['id'])
        ret = testbase.request_to_shutdown_redis(master)
        self.assertEqual(ret, 0, 'failed to shutdown redis. id:%d' % master['id'])

        # Check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
            state = util._get_smr_state(master['id'], cluster['cluster_name'], mgmt_ip, mgmt_port)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(expected, state, 'master%d - state:%s, expected:%s' % (master['id'], state, expected))
        util.log('succeeded : pgs%d state changed to F.' % master['id'])

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'failed to delete a blocking rule from iptables. output:%s' % out)

        # Check cluster state
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == master['id']:
                    continue
                if s['active_role'] != s['mgmt_role']:
                    state_consistency = False

            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # Recovery
        util.log('restart pgs%d.' % master['id'])
        ret = testbase.request_to_start_smr(master)
        self.assertEqual(ret, 0, 'failed to start smr. id:%d' % master['id'])

        ret = testbase.request_to_start_redis(master)
        self.assertEqual(ret, 0, 'failed to start redis. id:%d' % master['id'])

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role(master, wait_count)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (master['id']))

        redis = redis_mgmt.Redis(master['id'])
        ret = redis.connect(master['ip'], master['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        ok = False
        for i in xrange(5):
            ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
            if ok:
                break
            else:
                time.sleep(1)
        self.assertTrue(ok, 'failed to check cluster state')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(0, 3):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'],
                                           final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)
        for i in range(3, 6):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'],
                                           final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True),
                        'failed to check cluster state')

    # Shutdown cluster
    ret = default_cluster.finalize(cluster)
    self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

    # Delete forwarding rule (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)
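# NOTE: hypothetical refactoring sketch, not part of the original test suite.
# The isolation test above installs iptables DNAT/DROP rules via util.sudo() and
# must remember to delete each one on every exit path. A context manager makes
# the cleanup automatic; it assumes the module-level util import used throughout
# this file and the .succeeded attribute of util.sudo()'s return value, as seen
# above. The name iptables_rule is an illustration, not an existing API.
import contextlib

@contextlib.contextmanager
def iptables_rule(add_cmd, del_cmd):
    """Install an iptables rule for the duration of a with-block."""
    out = util.sudo(add_cmd)
    assert out.succeeded, 'failed to add iptables rule: %s' % out
    try:
        yield
    finally:
        out = util.sudo(del_cmd)
        assert out.succeeded, 'failed to delete iptables rule: %s' % out

# Example (hypothetical usage):
#   with iptables_rule('iptables -A OUTPUT -d 127.0.0.100 -j DROP',
#                      'iptables -D OUTPUT -d 127.0.0.100 -j DROP'):
#       ...run the blocked-network checks...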