def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    """Check that a role change fails and is rolled back while PGSes hang.

    Steps performed:
      1. Write some data and record the timestamp of every server.
      2. Send a delay fault-injection command to each server in
         ``hanging_servers`` through its SMR management port.
      3. Request a role change to ``target_id`` and assert that it fails
         (``util.role_change`` returns -1).
      4. Assert the rollback: quorum of ``master`` (when it is not hanging)
         and unchanged timestamps on ``running_servers``.
      5. After waiting, start load generators, re-check quorum and cluster
         state, and assert the load generators observed consistent data.

    Args:
        hanging_servers: server dicts (keys used here: 'id', 'ip',
            'smr_mgmt_port') to put into the hanging state.
        running_servers: server dicts that are left untouched.
        target_id: id passed to ``util.role_change`` as the role change target.
        master: server dict of the master before the role change.
    """
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamp
    old_timestamps = {}
    for s in self.cluster['servers']:
        old_timestamps[s['id']] = util.get_timestamp_of_pgs(s)

    # hang
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to master. %s:%d' % (s['ip'], s['smr_mgmt_port']))
        # Always release the management connection, even when a check below
        # fails; otherwise a failing test leaks the connection.
        try:
            util.log("PGS '%d' hang" % s['id'])
            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply is not None and '-ERR not supported' in reply:
                self.fail('make sure that smr has compiled with gcov option.')
        finally:
            smr.disconnect()

    # Role change: it is expected to fail because of the hanging servers.
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')

    # Check rollback - check quorum
    if master not in hanging_servers:
        expected = 2
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check rollback - get new timestamp
    new_timestamps_in_hang = {}
    for s in running_servers:
        new_timestamps_in_hang[s['id']] = util.get_timestamp_of_pgs(s)

    # Check rollback - compare old timestamps and new timestamps
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(
            old_ts, new_ts,
            'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    # NOTE(review): presumably waits for the injected delay (13000 above,
    # likely milliseconds) to expire - confirm against the SMR fault injector.
    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state(self.cluster)

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 2
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check cluster state: poll up to 20 times, 0.5 seconds apart.
    normal_state = False
    for i in range(20):
        normal_state = util.check_cluster(self.cluster['cluster_name'],
                                          self.leader_cm['ip'],
                                          self.leader_cm['cm_port'],
                                          check_quorum=True)
        if normal_state:
            break
        time.sleep(0.5)
    self.assertTrue(normal_state, "Unstable cluster state")

    # Check consistency: ask every generator to stop first, then join them.
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    """Check that a role change fails and is rolled back while PGSes hang.

    Steps performed:
      1. Write some data and record the timestamp of every server.
      2. Send a delay fault-injection command to each server in
         ``hanging_servers`` through its SMR management port.
      3. Request a role change to ``target_id`` and assert that it fails
         (``util.role_change`` returns -1).
      4. Assert the rollback: quorum of ``master`` (when it is not hanging)
         and unchanged timestamps on ``running_servers``.
      5. After waiting, start load generators, re-check quorum, compare the
         timestamps of all servers with the recorded ones, and assert the
         load generators observed consistent data.

    Args:
        hanging_servers: server dicts (keys used here: 'id', 'ip',
            'smr_mgmt_port') to put into the hanging state.
        running_servers: server dicts that are left untouched.
        target_id: id passed to ``util.role_change`` as the role change target.
        master: server dict of the master before the role change.
    """
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamp
    old_timestamps = {}
    for s in self.cluster['servers']:
        old_timestamps[s['id']] = util.get_timestamp_of_pgs(s)

    # hang
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to master. %s:%d' % (s['ip'], s['smr_mgmt_port']))
        # Always release the management connection, even when a check below
        # fails; otherwise a failing test leaks the connection.
        try:
            util.log("PGS '%d' hang" % s['id'])
            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply is not None and '-ERR not supported' in reply:
                self.fail('make sure that smr has compiled with gcov option.')
        finally:
            smr.disconnect()

    # Role change: it is expected to fail because of the hanging servers.
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')

    # Check rollback - check quorum
    if master not in hanging_servers:
        expected = 1
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check rollback - get new timestamp
    new_timestamps_in_hang = {}
    for s in running_servers:
        new_timestamps_in_hang[s['id']] = util.get_timestamp_of_pgs(s)

    # Check rollback - compare old timestamps and new timestamps
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(
            old_ts, new_ts,
            'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    # NOTE(review): presumably waits for the injected delay (13000 above,
    # likely milliseconds) to expire - confirm against the SMR fault injector.
    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state(self.cluster)

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Get new timestamp
    new_timestamps = {}
    for s in self.cluster['servers']:
        new_timestamps[s['id']] = util.get_timestamp_of_pgs(s)

    # Compare old timestamps and new timestamps.
    # When the old master was hanging and at least one server kept running,
    # every timestamp is expected to differ; otherwise none may change.
    for s in self.cluster['servers']:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps[s['id']]
        if master in hanging_servers and len(running_servers) != 0:
            self.assertNotEqual(
                old_ts, new_ts,
                'Timestamp of a hanging server has not changed. %d->%d' % (old_ts, new_ts))
        else:
            self.assertEqual(
                old_ts, new_ts,
                'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    # Check consistency: ask every generator to stop first, then join them.
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    """Check that an old master with extra data cannot rejoin the group.

    Scenario:
      1. Shut down both slaves and wait until each reports state 'F'.
      2. Write more data through the cluster, bgsave the master, then shut
         the master down as well.
      3. Restart the two slaves, wait for state 'N', write keys through one
         of them and verify the keys on both.
      4. Restart the old master and assert that it does NOT reach state 'N'.

    Returns:
        0 on success (assertion failures raise instead).
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for slave in server_to_join:
        ret = testbase.request_to_shutdown_smr(slave)
        self.assertEqual(ret, 0, 'failed to shutdown smr%d' % slave['id'])
        util.log('succeeded to shutdown smr%d' % slave['id'])

        ret = testbase.request_to_shutdown_redis(slave)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % slave['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range(0, max_try):
            state = util.get_smr_state(slave, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(
            expected, state,
            'server%d - state:%s, expected:%s' % (slave['id'], state, expected))

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])
    ret = testbase.request_to_shutdown_redis(master)
    self.assertEqual(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recovery slaves
    for slave in server_to_join:
        ret = testbase.request_to_start_smr(slave)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(slave)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(slave, 10)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (slave['id']))

        # check state N
        max_try = 20
        expected = 'N'
        for j in range(0, max_try):
            state = util.get_smr_state(slave, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(slave)
        self.assertEqual(
            expected, state,
            'server%d - state:%s, expected:%s, role:%s' % (slave['id'], state, expected, role))

    # set value
    s = random.choice(server_to_join)
    # Fixed: the original passed the literal list ['id'] instead of the
    # chosen server's id.
    redis = redis_mgmt.Redis(s['id'])
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    try:
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')
    finally:
        # Release the connection even when a write is rejected.
        redis.disconnect()

    for slave in server_to_join:
        redis = redis_mgmt.Redis(slave['id'])
        ret = redis.connect(slave['ip'], slave['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        # check value
        try:
            for j in range(0, 10000):
                cmd = 'get %s%d\r\n' % (key_base, j)
                redis.write(cmd)
                # The first reply line is discarded (presumably the bulk
                # length header - confirm); the second carries the value.
                redis.read_until('\r\n')
                response = redis.read_until('\r\n')
                self.assertEqual(
                    response, '%d\r\n' % (j),
                    'inconsistent %s, %d' % (response[:-2], j))
        finally:
            # Fixed: these connections were never closed.
            redis.disconnect()

    # try to recover master, but failed
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    # The old master must NOT reach state 'N' within the polling window.
    max_try = 3
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(
        expected, state,
        'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role))
    util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

    gw.disconnect()
    return 0
def test_4_PGS_mgen_is_less_than_PG_mgen(self):
    """Check that a PGS that missed master failovers can rejoin and catch up.

    Scenario:
      1. Shut down the current master and wait until it reports state 'F'.
      2. Write keys through the gateway, then fail over the master twice
         (with a quorum check in between).
      3. Restart the server shut down in step 1 and wait for its role setup.
      4. Write more keys through the gateway and verify that all keys are
         readable from the rejoined server's redis.

    Returns:
        0 on success (assertion failures raise instead).
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    # shutdown
    server_to_join = util.get_server_by_role(self.cluster['servers'], 'master')
    ret = testbase.request_to_shutdown_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server_to_join, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected))

    # set value
    key_base = 'mw'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # master failover 1 (master generation + 1)
    util.log('master failover 1')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # check quorum (copy:3, quorum:1, available:2)
    ok = False
    for i in range(10):
        ok = util.check_quorum(self.cluster['cluster_name'],
                               self.leader_cm['ip'],
                               self.leader_cm['cm_port'])
        if ok:
            break
        time.sleep(1)
    self.assertTrue(ok, 'Check quorum fail.')

    # master failover 2 (master generation + 1)
    util.log('master failover 2')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # recovery
    util.log('master recovery start.')
    ret = testbase.request_to_start_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to start redis')

    ret = testbase.wait_until_finished_to_set_up_role(server_to_join, 10)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']))
    util.log('master recovery end successfully.')

    # check state N
    # NOTE(review): this polls `server` (the master failed over last), not
    # `server_to_join` which was just recovered - confirm this is intended.
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))

    time.sleep(5)

    # set value
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # The result is not used below; the call is kept because any side
    # effects of the helper are not visible from this method.
    server = util.get_server_by_role(self.cluster['servers'], 'master')

    redis = redis_mgmt.Redis(server_to_join['id'])
    ret = redis.connect(server_to_join['ip'], server_to_join['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # check value
    try:
        for i in range(0, 20000):
            cmd = 'get %s%d\r\n' % (key_base, i)
            redis.write(cmd)
            # The first reply line is discarded (presumably the bulk length
            # header - confirm); the second carries the value.
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(
                response, '%d\r\n' % (i),
                'inconsistent %s, %d' % (response[:-2], i))
    finally:
        # Fixed: this connection was never closed.
        redis.disconnect()

    gw.disconnect()
    return 0
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    """Check that an old master with extra data cannot rejoin the group.

    Scenario:
      1. Shut down both slaves and wait until each reports state 'F'.
      2. Write more data through the cluster, bgsave the master, then shut
         the master down as well.
      3. Restart the two slaves, wait for state 'N', write keys through one
         of them and verify the keys on both.
      4. Restart the old master and assert that it does NOT reach state 'N'.

    Returns:
        0 on success (assertion failures raise instead).
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for pgs in server_to_join:
        ret = testbase.request_to_shutdown_smr(pgs)
        self.assertEqual(ret, 0, 'failed to shutdown smr%d' % pgs['id'])
        util.log('succeeded to shutdown smr%d' % pgs['id'])

        ret = testbase.request_to_shutdown_redis(pgs)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % pgs['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range(0, max_try):
            state = util.get_smr_state(pgs, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(
            expected, state,
            'server%d - state:%s, expected:%s' % (pgs['id'], state, expected))

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])
    ret = testbase.request_to_shutdown_redis(master)
    self.assertEqual(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recovery slaves
    for pgs in server_to_join:
        ret = testbase.request_to_start_smr(pgs)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(pgs)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(pgs, 10)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (pgs['id']))

        # check state N
        max_try = 20
        expected = 'N'
        for j in range(0, max_try):
            state = util.get_smr_state(pgs, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(pgs)
        self.assertEqual(
            expected, state,
            'server%d - state:%s, expected:%s, role:%s' % (pgs['id'], state, expected, role))

    # set value
    s = random.choice(server_to_join)
    # Fixed: the original passed the literal list ['id'] instead of the
    # chosen server's id.
    redis = redis_mgmt.Redis(s['id'])
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    try:
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')
    finally:
        # Release the connection even when a write is rejected.
        redis.disconnect()

    for pgs in server_to_join:
        redis = redis_mgmt.Redis(pgs['id'])
        ret = redis.connect(pgs['ip'], pgs['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        # check value
        try:
            for j in range(0, 10000):
                cmd = 'get %s%d\r\n' % (key_base, j)
                redis.write(cmd)
                # The first reply line is discarded (presumably the bulk
                # length header - confirm); the second carries the value.
                redis.read_until('\r\n')
                response = redis.read_until('\r\n')
                self.assertEqual(
                    response, '%d\r\n' % (j),
                    'inconsistent %s, %d' % (response[:-2], j))
        finally:
            # Fixed: these connections were never closed.
            redis.disconnect()

    # try to recover master, but failed
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    # The old master must NOT reach state 'N' within the polling window.
    max_try = 3
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(
        expected, state,
        'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role))
    util.log(
        'success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.'
    )

    gw.disconnect()
    return 0
def test_4_PGS_mgen_is_less_than_PG_mgen(self):
    """Check that a PGS that missed master failovers can rejoin and catch up.

    Scenario:
      1. Shut down the current master and wait until it reports state 'F'.
      2. Write keys through the gateway, then fail over the master twice
         (with a quorum check in between).
      3. Restart the server shut down in step 1 and wait for its role setup.
      4. Write more keys through the gateway and verify that all keys are
         readable from the rejoined server's redis.

    Returns:
        0 on success (assertion failures raise instead).
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    # shutdown
    server_to_join = util.get_server_by_role(self.cluster['servers'], 'master')
    ret = testbase.request_to_shutdown_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for attempt in range(0, max_try):
        state = util.get_smr_state(server_to_join, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected))

    # set value
    key_base = 'mw'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # master failover 1 (master generation + 1)
    util.log('master failover 1')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # check quorum (copy:3, quorum:1, available:2)
    ok = False
    for attempt in range(10):
        ok = util.check_quorum(self.cluster['cluster_name'],
                               self.leader_cm['ip'],
                               self.leader_cm['cm_port'])
        if ok:
            break
        time.sleep(1)
    self.assertTrue(ok, 'Check quorum fail.')

    # master failover 2 (master generation + 1)
    util.log('master failover 2')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # recovery
    util.log('master recovery start.')
    ret = testbase.request_to_start_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to start redis')

    ret = testbase.wait_until_finished_to_set_up_role(server_to_join, 10)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']))
    util.log('master recovery end successfully.')

    # check state N
    # NOTE(review): this polls `server` (the master failed over last), not
    # `server_to_join` which was just recovered - confirm this is intended.
    max_try = 20
    expected = 'N'
    for attempt in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))

    time.sleep(5)

    # set value
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # The result is not used below; the call is kept because any side
    # effects of the helper are not visible from this method.
    server = util.get_server_by_role(self.cluster['servers'], 'master')

    redis = redis_mgmt.Redis(server_to_join['id'])
    ret = redis.connect(server_to_join['ip'], server_to_join['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # check value
    try:
        for i in range(0, 20000):
            cmd = 'get %s%d\r\n' % (key_base, i)
            redis.write(cmd)
            # The first reply line is discarded (presumably the bulk length
            # header - confirm); the second carries the value.
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(
                response, '%d\r\n' % (i),
                'inconsistent %s, %d' % (response[:-2], i))
    finally:
        # Fixed: this connection was never closed.
        redis.disconnect()

    gw.disconnect()
    return 0