def test_1_consistent_while_slave_is_in_load(self):
    util.print_frame()
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    gw.connect(ip, port)

    max_key = 5
    key_base = 'load_gen_key'
    for idx in range(max_key):
        cmd = 'set %s%d 0\r\n' % (key_base, idx)
        gw.write(cmd)
        gw.read_until('\r\n', 10)

    try_count = 9999
    for value in range(try_count):
        for idx in range(max_key):
            cmd = 'set %s%d %d\r\n' % (key_base, idx, value)
            gw.write(cmd)
            response = gw.read_until('\r\n', 10)
            self.assertEquals(response, '+OK\r\n')

            cmd = 'get %s%d\r\n' % (key_base, idx)
            gw.write(cmd)
            response = gw.read_until('\r\n', 10)  # bulk reply header ($<len>)
            response = gw.read_until('\r\n', 10)  # bulk reply body
            self.assertEquals(
                response, '%s\r\n' % (value),
                'fail! original_value:%d, return_from_slave:%s'
                % (value, response[1:]))
def test_migrate_empty_s3obj(self):
    util.print_frame()
    ip, port = util.get_rand_gateway(self.cluster)
    client = redis_sock.RedisClient(ip, port)

    # Fill some string and empty s3 objects
    keyprefix = 'test_migrate_empty_s3obj'
    for i in range(1000):
        ok, data = client.do_request('set %s_string_%d %d\r\n' % (keyprefix, i, i))
        assert (ok == True)
        ok, data = client.do_request('s3ladd ks %s_s3_%d svc key val 0\r\n' % (keyprefix, i))
        assert (ok == True and data == 1)
        ok, data = client.do_request('s3lrem ks %s_s3_%d svc key val\r\n' % (keyprefix, i))
        assert (ok == True and data == 1)

    ## migration pg0 -> pg1 then pg1 -> pg0
    ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
    self.assertEqual(True, ret, 'Migration Fail')
    ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
    self.assertEqual(True, ret, 'Migration Fail')

    # Check string object
    for i in range(1000):
        ok, data = client.do_request('get %s_string_%d\r\n' % (keyprefix, i))
        assert (ok == True and int(data) == i)

    client.close()
def pgs_add_and_del(self, upgrade_server, type):
    util.print_frame()
    util.log('[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type))
    util.log_server_state(self.cluster)

    # start load generator
    load_gen_list = {}
    for i in range(len(self.cluster['servers'])):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # set new values
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    gw.connect(ip, port)
    for i in range(0, 50):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to gw(%s:%d). cmd:%s, res:%s'
                         % (ip, port, cmd[:-2], res[:-2]))

    # attach pgs to cluster
    cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    time.sleep(3)

    # check new values
    redis = redis_mgmt.Redis(upgrade_server['id'])
    ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
    self.assertEquals(ret, 0, 'failed : connect to smr%d(%s:%d)'
                      % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port']))
    for i in range(0, 50):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis.write(cmd)
        redis.read_until('\r\n')
        res = redis.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis%d. %s != %d'
                         % (upgrade_server['id'], res, i))
    util.log('succeeded : check values with get operations on pgs%d.' % (upgrade_server['id']))

    # shutdown load generators
    for i in range(len(load_gen_list)):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    util.log_server_state(self.cluster)
    return 0
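# LoadGenerator is used throughout this suite. Only the interface visible at
# the call sites in these tests is assumed here; the real class lives in
# load_generator.py and may take extra options (e.g. ops_limit, seen below):
#
#   gen = load_generator.LoadGenerator(id, ip, gateway_port)
#   gen.start()          # spawn a thread issuing set/get pairs via the gateway
#   gen.quit()           # ask the thread to stop
#   gen.join()           # wait for the thread to exit
#   gen.isConsistent()   # False once any get returned a value that does not
#                        # match what the generator last wrote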
def test_basic_op_gateway(self):
    util.print_frame()
    ip, port = util.get_rand_gateway(self.cluster)

    f = open("%s/test_basicop_output_gw" % constant.logdir, 'w')
    p = util.exec_proc_async("../redis",
                             "./runtest_gw --accurate --gw-port " + str(port),
                             True, None, f, None)
    ret = p.wait()
    f.close()
    self.assertEquals(0, ret)
def test_random_pgs_del_and_add(self):
    util.print_frame()

    # start load generator
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_thrd_list[i].start()
    util.log("started load_generator")

    servers = self.cluster['servers']
    gw_list = []
    for server in servers:
        gw = {}
        gw['mgmt'] = telnetlib.Telnet(server['ip'], server['gateway_port'] + 1)
        gw['normal'] = telnetlib.Telnet(server['ip'], server['gateway_port'])
        gw_list.append(gw)

    count = 10
    while count > 0:
        c = random.choice(servers)
        for gw in gw_list:
            gw['mgmt'].write("pgs_del %d %d\r\n" % (c['id'], c['pg_id']))
            gw['mgmt'].read_until("+OK\r\n")

        gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
        print gw_list[0]['mgmt'].read_until("+PONG\r\n")

        for gw in gw_list:
            gw['mgmt'].write("pgs_add %d %d %s %d\r\n"
                             % (c['id'], c['pg_id'], c['ip'], c['redis_port']))
            gw['mgmt'].read_until("+OK\r\n")

        for gw in gw_list:
            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n\r\n")
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

        count -= 1

    # check consistency of load_generator
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].quit()
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].join()
        self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after gateway_mgmt test')
def test_basic_op_gateway(self):
    util.print_frame()
    ip, port = util.get_rand_gateway(self.cluster)

    f = open("%s/test_basicop_output_gw" % constant.logdir, 'w')
    p = util.exec_proc_async("../redis-2.8.8",
                             "./runtest_gw --accurate --gw-port " + str(port),
                             True, None, f, None)
    ret = p.wait()
    f.close()
    self.assertEquals(0, ret)
def test_migrate_all(self):
    util.print_frame()

    migration_count = 10

    # start load generator
    load_gen_thrd_list = {}
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port, ops_limit=500)
        load_gen_thrd_list[i].start()
    time.sleep(5)  # generate load for 5 sec

    # start migration
    for i in range(migration_count):
        # pg0 -> pg1
        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        # pg0 <- pg1
        ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        ok = True
        for j in range(len(load_gen_thrd_list)):
            if load_gen_thrd_list[j].isConsistent() == False:
                ok = False
                break
        if not ok:
            break

        time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].quit()
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].join()
        self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after migration')
def put_some_data(self):
    # start load generator
    max_load_generator = 100
    load_gen_thrd_list = {}
    util.log('start load_generator')
    for i in range(max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        load_gen_thrd_list[i].start()

    time.sleep(10)  # generate some load
    util.log('end load_generator')

    # check consistency of load_generator
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].quit()
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].join()
        self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                        'Data are inconsistent.')
    return 0
def test_single_thread_input(self):
    util.print_frame()
    self.cluster = config.clusters[0]

    result = {}
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    self.assertEquals(0, gw.connect(ip, port))

    max = 5
    for idx in range(max):
        cmd = 'set key%d 0\r\n' % (idx)
        gw.write(cmd)
        result[idx] = gw.read_until('\r\n')

    data_max = 65535
    for idx in range(max):
        for cnt in range(0, data_max):
            gw.write('crc16 key%d %d\r\n' % (idx, cnt))
            result[idx] = gw.read_until('\r\n')

    for idx in range(max - 1):
        self.assertEquals(result[idx], result[idx + 1])
def test_migrate_all(self):
    util.print_frame()

    migration_count = 10

    # start load generator
    load_gen_thrd_list = {}
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        load_gen_thrd_list[i].start()
    time.sleep(5)  # generate load for 5 sec

    # start migration
    for i in range(migration_count):
        # pg0 -> pg1
        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        # pg0 <- pg1
        ret = util.migration(self.cluster, 1, 0, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        ok = True
        for j in range(len(load_gen_thrd_list)):
            if load_gen_thrd_list[j].isConsistent() == False:
                ok = False
                break
        if not ok:
            break

        time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].quit()
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].join()
        self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after migration')
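# util.migration(cluster, src_pg_id, dst_pg_id, slot_begin, slot_end, tps) is
# assumed to drive the same sequence that test_migration_with_expire_command
# below performs by hand: 'migconf migstart' on the destination redis, a
# partial checkpoint via 'cluster-util --getandplay', smr log catchup via
# 'migrate start' / 'migrate info', 'mig2pc' on the leader CM,
# 'migconf migend' / 'migconf clearstart', 'cluster-util --rangedel', and
# finally 'migconf clearend' on the source.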
def test_delete_smrlog_after_scaleout(self):
    util.print_frame()

    # start load generator
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_thrd_list[i].start()
    time.sleep(5)  # generate load for 5 sec
    util.log("started load_generator")

    # servers for scale out
    servers = [config.server4, config.server5, config.server6]
    leader_cm = self.cluster['servers'][0]

    # Scale out
    cluster = config.clusters[0]
    ret = util.pg_add(cluster, servers, leader_cm)
    self.assertEqual(True, ret, 'Scale out fail. util.pg_add returns false')
    time.sleep(5)

    # pg0 -> pg1
    cluster = config.clusters[1]
    ret = util.migration(cluster, 0, 1, 8000, 8191, 40000)
    self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

    # get log file
    old_logs = {}
    for s in config.clusters[0]['servers']:
        parent_dir, log_dir = util.smr_log_dir(s['id'])
        path = '%s/%s' % (parent_dir, log_dir)
        old_logs[s['id']] = util.ls(path)

    # bgsave in order to make smrlogs deleted.
    for s in config.clusters[0]['servers']:
        bgsave_ret = util.bgsave(s)
        self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % s['id'])
        util.log('bgsave pgs%d is done.' % s['id'])

    # check consistency
    ok = True
    for j in range(len(self.load_gen_thrd_list)):
        self.assertTrue(self.load_gen_thrd_list[j].isConsistent(),
                        'Inconsistent after migration')

    # does smr-replicator delete smrlogs?
    i = 0
    while i < 20:
        i += 1

        # get current log files
        cur_logs = {}
        for s in config.clusters[0]['servers']:
            parent_dir, log_dir = util.smr_log_dir(s['id'])
            path = '%s/%s' % (parent_dir, log_dir)
            cur_logs[s['id']] = util.ls(path)

        # compare old and new
        temp_old_logs = copy.deepcopy(old_logs)
        for id, nl in cur_logs.items():
            ol = temp_old_logs.get(id)
            self.assertNotEqual(ol, None,
                                "failed to check logfiles. old logs for smr-replicator '%d' do not exist." % id)
            for log in nl:
                if log in ol:
                    ol.remove(log)

        # the logs remaining in temp_old_logs are the ones that were deleted;
        # every smr-replicator must have deleted at least one.
        ok = True
        for id, ol in temp_old_logs.items():
            if len(ol) == 0:
                ok = False

        util.log('Loop %d ---------------------------------------------------------' % i)
        util.log('deleted smrlog files: %s' % util.json_to_str(temp_old_logs))

        if ok:
            break

        time.sleep(10)

    self.assertTrue(ok, 'smr-replicator does not delete smrlogs.')
    util.log('smr-replicator deletes smrlogs.')

    # check consistency of load_generator
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].quit()
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].join()
        self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after migration')
def test_scaleout(self):
    util.print_frame()

    # start load generator
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_thrd_list[i].start()
    time.sleep(5)  # generate load for 5 sec
    util.log("started load_generator")

    # servers for scale out
    servers = [config.server4, config.server5, config.server6]
    leader_cm = self.cluster['servers'][0]

    # start migration
    migration_count = 5
    for i in range(migration_count):
        # Scale out
        cluster = config.clusters[0]
        ret = util.pg_add(cluster, servers, leader_cm)
        self.assertEqual(True, ret, 'Scale out fail. util.pg_add returns false')
        time.sleep(5)

        # pg0 -> pg1
        cluster = config.clusters[1]
        ret = util.migration(cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

        # pg0 <- pg1
        cluster = config.clusters[1]
        ret = util.migration(cluster, 1, 0, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail 1 <- 0')

        # Scale in
        #TODO Temporary
        #cluster = config.clusters[0]
        #for server in cluster['servers']:
        #    if testbase.request_to_shutdown_hbc(server) is not 0:
        #        util.log('scale in : failed to request to shutdown hbc')
        #        self.assertFalse('scale in : failed to request to shutdown hbc')
        #time.sleep(5)
        ###############
        cluster = config.clusters[1]
        ret = util.pg_del(cluster, servers, leader_cm)
        self.assertEqual(True, ret, 'Scale in fail. util.pg_del returns false')
        #TODO Temporary
        #cluster = config.clusters[0]
        #for server in cluster['servers']:
        #    if testbase.request_to_start_heartbeat_checker( server ) is not 0:
        #        util.log('scale in : failed to start hbc')
        #        self.assertFalse('scale in : failed to start hbc')
        #time.sleep(5)
        ###############

        # check consistency
        ok = True
        for j in range(len(self.load_gen_thrd_list)):
            if self.load_gen_thrd_list[j].isConsistent() == False:
                ok = False
                break
        if not ok:
            break

        time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].quit()
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].join()
        self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after migration')
def state_transition(self):
    server = util.get_server_by_role(self.cluster['servers'], 'slave')
    self.assertNotEquals(server, None, 'failed to get_server_by_role-slave')

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])

    # check initial state
    state = self.get_expected_smr_state(server, 'N')
    role = util.get_role_of_server(server)
    self.assertEquals('N', state,
                      'server%d - state:%s, role:%s, expected:N'
                      % (server['id'], state, role))

    # shutdown
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEquals(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    time.sleep(3)

    # check state F
    expected = 'F'
    state = self.get_expected_smr_state(server, expected)
    self.assertEquals(expected, state,
                      'server%d - state:%s, but expected:%s'
                      % (server['id'], state, expected))

    # set value
    ret = gw.connect(ip, port)
    self.assertEquals(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))
    timestamp = 0.0
    for i in range(0, 100):
        timestamp = time.time()
        key = 'new_key_haha'
        cmd = 'set %s %f\r\n' % (key, timestamp)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    gw.disconnect()

    # recovery
    ret = testbase.request_to_start_smr(server)
    self.assertEquals(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server)
    self.assertEquals(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server, 10)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))
    time.sleep(5)

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    # check state N
    expected = 'N'
    max_try = 20
    for i in range(0, max_try):
        state = self.get_expected_smr_state(server, expected)
        if state == expected:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEquals(expected, state,
                      'server%d - state:%s, role:%s, but expected:%s'
                      % (server['id'], state, role, expected))
def consistent_after_failover(self):
    max = 10000
    wait_count = 15
    key = 'caf'

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # set value
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    gw.connect(ip, port)
    for i in range(0, max):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    time.sleep(5)

    # shutdown
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
    time.sleep(5)

    # check state F
    for server in servers:
        state = self.get_expected_smr_state(server, 'F')
        self.assertEquals('F', state,
                          'server%d - state:%s' % (server['id'], state))

    # recovery
    for server in servers:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr, server:%d' % server['id'])
        ret = testbase.request_to_start_redis(server, False)
        self.assertEqual(ret, 0, 'failed to start redis, server:%d' % server['id'])
        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
    time.sleep(5)

    # wait for master election
    for i in xrange(10):
        ret = util.check_cluster(self.cluster['cluster_name'],
                                 self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ret:
            break
        time.sleep(1)

    # check state
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals(ret, 0, 'failed to role change. server:%d' % (server['id']))
        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEquals('N', state,
                          'server%d - state:%s, role:%s' % (server['id'], state, role))

    the_number_of_master = 0
    the_number_of_slave = 0
    for server in servers:
        role = util.get_role_of_server(server)
        if role == c.ROLE_MASTER:
            the_number_of_master = the_number_of_master + 1
        elif role == c.ROLE_SLAVE:
            the_number_of_slave = the_number_of_slave + 1
    self.assertTrue(1 == the_number_of_master and 2 == the_number_of_slave,
                    'failed to set roles, the number of master:%d, the number of slave:%d'
                    % (the_number_of_master, the_number_of_slave))

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # connect to the master`s redis and set data
    redis = redis_mgmt.Redis(master['id'])
    ret = redis.connect(master['ip'], master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis, server:%d' % master['id'])
    for i in range(max, max * 2):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n',
                          'failed to get response, server:%d' % master['id'])
    redis.disconnect()

    # check slaves` data
    slaves = [slave1, slave2]
    for slave in slaves:
        slave_redis = redis_mgmt.Redis(slave['id'])
        ret = slave_redis.connect(slave['ip'], slave['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis, server:%d' % slave['id'])
        for i in range(0, max * 2):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write(cmd)
            trash = slave_redis.read_until('\r\n')
            res = slave_redis.read_until('\r\n')
            self.assertEquals(res, '%d\r\n' % i,
                              'inconsistent, server:%d, expected %d but %s'
                              % (slave['id'], i, res))
        slave_redis.disconnect()
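# A minimal sketch of the get_expected_smr_state helper that state_transition
# and consistent_after_failover rely on; the real implementation lives in the
# test base class, and the polling limit and util.get_smr_state signature
# used here are assumptions.
def get_expected_smr_state(self, server, expected, max_try=60):
    state = None
    for i in range(0, max_try):
        # poll the SMR state via the leader CM until it settles
        state = util.get_smr_state(server, self.leader_cm)
        if state == expected:
            break
        time.sleep(1)
    return state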
def test_migration_with_expire_command(self):
    util.print_frame()

    util.log("start load_generator")
    load_gen_thrd_list = {}
    for i in range(1):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        load_gen_thrd_list[i].start()
    time.sleep(5)  # generate load for 5 sec

    tps = 20000
    src_pg_id = 0
    dst_pg_id = 1
    leader_cm = self.cluster['servers'][0]
    src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id)
    dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id)

    smr = smr_mgmt.SMR(src_master['id'])
    ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
    if ret != 0:
        util.log('failed to connect to smr(source master)')
        return False

    src_redis = redis_mgmt.Redis(src_master['id'])
    ret = src_redis.connect(src_master['ip'], src_master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    dst_redis = redis_mgmt.Redis(dst_master['id'])
    ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    ts = time.time()
    self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10)
    self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")

    util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts))

    ts = time.time()
    self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

    # notify dst_redis of migration start
    util.log(">>> notify dst_redis of migration start (%s)" % time.asctime())
    cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
    dst_redis.write(cmd)
    res = dst_redis.read_until('\r\n')
    self.assertEquals(res, '+OK\r\n')

    # remote partial checkpoint
    util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
    cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
        src_master['ip'], src_master['redis_port'],
        dst_master['ip'], dst_master['redis_port'],
        0, 8191, tps)
    p = util.exec_proc_async(util.cluster_util_dir(src_master['id']),
                             cmd, True, None, subprocess.PIPE, None)
    ret = p.wait()

    for line in p.stdout:
        if line.find("Checkpoint Sequence Number:") != -1:
            util.log("seqnumber : " + line[line.rfind(":") + 1:])
            seq = int(line[line.rfind(":") + 1:])
        util.log(">>>" + str(line.rstrip()))

    self.assertEqual(0, ret)
    util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")

    # bgsave for testing later about recovery during migration
    util.log(">>> bgsave for testing later about recovery during migration (%s)" % time.asctime())
    cmd = 'bgsave\r\n'
    dst_redis.write(cmd)
    res = dst_redis.read_until('\r\n')
    self.assertEquals(res, '+Background saving started\r\n')

    ts = time.time()
    self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20)

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10)
    self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100)

    # remote catchup (smr log migration)
    util.log(">>> start remote catchup (%s)" % time.asctime())
    dst_host = dst_master['ip']
    dst_smr_port = dst_master['smr_base_port']
    rle = '1 8192'
    num_part = 8192
    smr.write('migrate start %s %d %d %d %d %s\r\n'
              % (dst_host, dst_smr_port, seq, tps, num_part, rle))
    response = smr.read_until('\r\n')
    if response[:3] != '+OK':
        util.log('failed to execute migrate start command, response:%s' % response)
        return False

    while True:
        smr.write('migrate info\r\n')
        response = smr.read_until('\r\n')
        seqs = response.split()
        logseq = int(seqs[1].split(':')[1])
        mig = int(seqs[2].split(':')[1])
        util.log('migrate info: %s' % response)
        if logseq - mig < 500000:
            util.log('Remote catchup almost done. try mig2pc')
            break
        time.sleep(1)

    util.log(">>> sleep until 90 sec pass")
    self.assertFalse(time.time() - ts >= 90)
    time.sleep(90 - (time.time() - ts))

    res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
    self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20)

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
    self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10)
    self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20)

    util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

    # mig2pc
    util.log(">>> start mig2pc (%s)" % time.asctime())
    cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'],
                                     src_pg_id, dst_pg_id, 0, 8191)
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    util.log('mig2pc result : ' + result)
    if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
        util.log('failed to execute mig2pc command, result:%s' % result)
        return False

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
    self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
    self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
    self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

    # finish migration
    smr.write('migrate interrupt\r\n')
    response = smr.read_until('\r\n')
    util.log('migrate interrupt: %s' % response)
    smr.disconnect()

    # notify dst_redis of migration end
    util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())
    cmd = 'migconf migend\r\n'
    dst_redis.write(cmd)
    res = dst_redis.read_until('\r\n')
    self.assertEquals(res, '+OK\r\n')

    cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
    src_redis.write(cmd)
    res = src_redis.read_until('\r\n')
    self.assertEquals(res, '+OK\r\n')

    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
    self.assertEquals(res, ":0\r\n")
    res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
    self.assertEquals(res, ":1\r\n")
    res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
    self.assertEquals(res, ":0\r\n")

    ts = time.time()
    util.log(">>> sleep until 15 sec pass")
    self.assertFalse(time.time() - ts >= 15)
    time.sleep(15 - (time.time() - ts))

    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

    # rangedel
    util.log(">>> start rangedel (%s)" % time.asctime())
    cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
        src_master['ip'], src_master['redis_port'], 0, 8191, tps)
    p = util.exec_proc_async(util.cluster_util_dir(src_master['id']),
                             cmd, True, None, subprocess.PIPE, None)
    ret = p.wait()
    for line in p.stdout:
        util.log(">>>" + str(line.rstrip()))

    cmd = 'migconf clearend\r\n'
    src_redis.write(cmd)
    res = src_redis.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')

    time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].quit()
    for i in range(len(load_gen_thrd_list)):
        load_gen_thrd_list[i].join()
        self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after migration')

    # kill dst_redis and recover from bgsave
    util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime())
    dst_redis.disconnect()

    ret = testbase.request_to_shutdown_redis(dst_master)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    ret = testbase.request_to_shutdown_smr(dst_master)
    self.assertEquals(ret, 0, 'failed to shutdown smr')
    time.sleep(5)

    ret = testbase.request_to_start_smr(dst_master)
    self.assertEqual(ret, 0, 'failed to start smr, server:%d' % dst_master['id'])
    ret = testbase.request_to_start_redis(dst_master)
    self.assertEqual(ret, 0, 'failed to start redis, server:%d' % dst_master['id'])
    ret = testbase.wait_until_finished_to_set_up_role(dst_master)
    self.assertEquals(ret, 0, 'failed to role change. server:%d' % (dst_master['id']))

    dst_redis = redis_mgmt.Redis(dst_master['id'])
    ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
    self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))
    self.getS3TTL(dst_redis, 'S3:PermanentKey')

    # kill dst_slave redis and recover without dump file
    util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime())
    dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id)

    ret = testbase.request_to_shutdown_redis(dst_slave)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    ret = testbase.request_to_shutdown_smr(dst_slave)
    self.assertEquals(ret, 0, 'failed to shutdown smr')
    time.sleep(5)

    ret = testbase.request_to_start_smr(dst_slave)
    self.assertEqual(ret, 0, 'failed to start smr, server:%d' % dst_slave['id'])
    ret = testbase.request_to_start_redis(dst_slave)
    self.assertEqual(ret, 0, 'failed to start redis, server:%d' % dst_slave['id'])
    ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
    self.assertEquals(ret, 0, 'failed to role change. server:%d' % (dst_slave['id']))

    dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
    ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired'))
    self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired'))
    self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired'))
    self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired'))
    self.assertTrue(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
    self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist'))
    self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired'))
    self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

    # Go back to initial configuration
    self.assertTrue(util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
                    'failed to rollback migration')
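# Hedged sketches of the small redis helpers the test above relies on
# (persistKey, isExist; the *S3* variants are assumed to use the s3-prefixed
# commands). Reply parsing mirrors the assertions above; the exact signatures
# are assumptions, since the real helpers are defined elsewhere in this suite.
def persistKey(self, redis, key):
    # PERSIST replies :1 if a timeout was removed and :0 if the key has no
    # timeout or no longer exists (i.e. it already expired).
    redis.write('persist %s\r\n' % key)
    return redis.read_until('\r\n')

def isExist(self, redis, key):
    # EXISTS replies :1 when the key is present.
    redis.write('exists %s\r\n' % key)
    res = redis.read_until('\r\n')
    return res == ':1\r\n'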
def test_5_transfer_pgs_to_another_machine(self):
    util.print_frame()

    self.load_gen_list = {}

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # increase master generation number
    util.log('failover in order to increase master generation number.')
    max = 0
    for i in range(5):
        key_base = 'key'
        for i in range(max, max + 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        max = max + 10000

        m = util.get_server_by_role(self.cluster['servers'], 'master')
        util.log('failover pgs%d' % m['id'])
        ret = util.failover(m, self.leader_cm)
        self.assertTrue(ret, 'failed to failover pgs%d' % m['id'])

    # start load generator
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_list[i].start()
    time.sleep(5)  # generate load for 5 sec
    util.log("started load_generator")

    m, s1, s2 = util.get_mss(self.cluster)
    servers = [m, s1, s2]

    # bgsave
    for s in servers:
        ret = util.bgsave(s)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % s['id'])

    new_servers = [config.server4, config.server5]

    # add new slaves
    for s in new_servers:
        util.log('delete pgs%d`s check point.' % s['id'])
        util.del_dumprdb(s['id'])
        ret = util.cluster_util_getdump(s['id'], m['ip'], m['redis_port'],
                                        'dump.rdb', 0, 8191)
        self.assertEqual(True, ret,
                         'failed : util.cluster_util_getdump returns false, src=%s:%d dest_pgsid=%d'
                         % (m['ip'], m['redis_port'], s['id']))
        ret = util.install_pgs(self.cluster, s, self.leader_cm, 0, rm_ckpt=False)
        self.assertEqual(True, ret,
                         'failed : util.pgs_add returns false, pgsid=%d' % s['id'])
        util.log('succeeded : add a new slave, pgsid=%d' % s['id'])

        # check consistency
        ok = True
        for j in range(self.max_load_generator):
            if self.load_gen_list[j].isConsistent() == False:
                ok = False
                break
        if not ok:
            break

    for server_to_del in servers:
        for s in servers:
            util.pingpong(s['ip'], s['smr_mgmt_port'])
        for s in new_servers:
            util.pingpong(s['ip'], s['smr_mgmt_port'])
        self.__del_server(server_to_del)
        util.log('succeeded : delete pgs%d' % server_to_del['id'])

    new_m = util.get_server_by_role(new_servers, 'master')
    new_s = util.get_server_by_role(new_servers, 'slave')
    self.assertNotEqual(new_m, None, 'master is None.')
    self.assertNotEqual(new_s, None, 'slave is None.')

    for s in new_servers:
        util.pingpong(s['ip'], s['smr_mgmt_port'])

    time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(),
                        'Inconsistent after migration')
        self.load_gen_list.pop(i, None)

    # Go back to initial configuration
    # recover pgs
    for s in servers:
        self.assertTrue(util.install_pgs(self.cluster, s, self.leader_cm, rm_ckpt=False),
                        'failed to recover pgs. (install_pgs)')
    # cleanup new slaves
    for s in new_servers:
        self.assertTrue(util.uninstall_pgs(self.cluster, s, self.leader_cm),
                        'failed to cleanup pgs. (uninstall_pgs)')
def master_failover_while_hang(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    self.failover_while_hang(m)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(m['id'])
    ret = redis1.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d' % (res, i))
        util.log('succeeded : check values with set/get operations with pgs%d and pgs%d.'
                 % (m['id'], s2['id']))

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis(%s:%d).'
                      % (m['ip'], m['redis_port']))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                        self.mgmt_ip, self.mgmt_port),
                     True, 'role consistency fail')
    return 0
def master_hang(self):
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

    time.sleep(5)

    # wait for forced master election
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep(1)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)
    self.assertEqual(success, True, 'failed to forced master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d' % (res, i))

    # check if the hanging server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis(%s:%d).'
                      % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                        self.mgmt_ip, self.mgmt_port),
                     True, 'role consistency fail')
    return 0
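# 'fi delay sleep <count> <msec>' is the smr-replicator fault-injection hook
# these hang tests send over the management port to simulate a hung process;
# it is assumed to stall the replicator for the given duration, and it is only
# available in test builds (hence the '-ERR not supported' check above, which
# points at the gcov build option).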
def test_all_pgs_hang(self):
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr_slave1 = smr_mgmt.SMR(s1['id'])
    ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))
    smr_slave2 = smr_mgmt.SMR(s2['id'])
    ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (s2['ip'], s2['smr_mgmt_port']))

    m_ts = util.get_timestamp_of_pgs(m)
    s1_ts = util.get_timestamp_of_pgs(s1)
    s2_ts = util.get_timestamp_of_pgs(s2)

    smr_master.write('fi delay sleep 1 8000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

    smr_slave1.write('fi delay sleep 1 8000\r\n')
    smr_slave2.write('fi delay sleep 1 8000\r\n')

    time.sleep(10)

    # check consistency
    ok = False
    for try_cnt in xrange(20):
        ok = util.check_cluster(self.cluster['cluster_name'],
                                self.mgmt_ip, self.mgmt_port)
        if ok:
            break
        time.sleep(0.5)
    self.assertTrue(ok, 'Unstable cluster state')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # set values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis0.write(cmd)
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s2['ip'], s2['redis_port']))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d'
                         % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d'
                         % (s1['id'], res[:-2], i))

    # check new values (s2)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d'
                         % (s2['id'], res[:-2], i))

    # check consistency
    ok = False
    for try_cnt in range(0, 10):
        ok = util.check_cluster(self.cluster['cluster_name'],
                                self.mgmt_ip, self.mgmt_port)
        print ok
        if ok:
            break
        time.sleep(1)
    self.assertEqual(ok, True, 'role consistency fail')
    return 0
def test_two_slaves_hang(self):
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamp before hang
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(ts_before1, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1))
    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(ts_before2, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2))

    # hang
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))
    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')
    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    success = False
    for i in xrange(20):
        ret = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port,
                                 check_quorum=True)
        if ret:
            success = True
            break
        time.sleep(1)
    self.assertEqual(success, True, 'unstable cluster')

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis2. %s != %d' % (res, i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port),
                     True, 'role consistency fail')
    return 0
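# Polling util.check_cluster until the cluster settles is repeated in most of
# the hang tests. A minimal sketch of that loop as a helper; the name and the
# default tries/delay are hypothetical, only the util.check_cluster call is
# taken from the tests above.
def wait_for_stable_cluster(self, tries=20, delay=1, check_quorum=False):
    for i in range(tries):
        if util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                              self.mgmt_port, check_quorum=check_quorum):
            return True
        time.sleep(delay)
    return False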
def master_and_slave_hang(self):
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr_slave = smr_mgmt.SMR(s1['id'])
    ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))

    smr_master.write('fi delay sleep 1 10000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')
    smr_slave.write('fi delay sleep 1 10000\r\n')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    time.sleep(5)

    if len(self.cluster['servers']) == 3:
        # wait for forced master election
        success = True
        for i in range(15):
            state = []
            util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                               self.leader_cm['cm_port'], state)
            s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
            role = s2_state['active_role']
            if role != 'M':
                success = False
                break
            time.sleep(1)

        util.log('')
        util.log('It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2')
        util.log('')
        util.log_server_state(self.cluster)
        self.assertEqual(success, True, 'failed to check copy-quorum')

        ok = False
        for i in xrange(10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                                    self.leader_cm['cm_port'])
            if ok:
                break
        self.assertTrue(ok, 'Cluster state is not normal!')

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis2.write(cmd)
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n',
                             'failed to set values to redis2. cmd:%s, res:%s' % (cmd[:-2], res))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']))

    if len(self.cluster['servers']) != 3:
        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n',
                             'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port),
                     True, 'role consistency fail')
    return 0
def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamp
    old_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        old_timestamps[s['id']] = ts

    # hang
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' % (s['ip'], s['smr_mgmt_port']))
        util.log("PGS '%d' hang" % s['id'])
        smr.write('fi delay sleep 1 13000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')
        smr.disconnect()

    # Role change
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')

    # Check rollback - check quorum
    if master not in hanging_servers:
        expected = 2
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check rollback - get new timestamp
    new_timestamps_in_hang = {}
    for s in running_servers:
        ts = util.get_timestamp_of_pgs(s)
        new_timestamps_in_hang[s['id']] = ts

    # Check rollback - compare old timestamps and new timestamps
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(old_ts, new_ts,
                         'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state(self.cluster)

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 2
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check cluster state
    normal_state = False
    for i in xrange(20):
        normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                                          self.leader_cm['cm_port'], check_quorum=True)
        if normal_state:
            break
        time.sleep(0.5)
    self.assertTrue(normal_state, "Unstable cluster state")

    # Check consistency
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def test_restart_recovery_with_remote_checkpoint_and_remote_log(self): util.print_frame() key_base = 'key' target = util.get_server_by_role(self.cluster['servers'], 'slave') master = util.get_server_by_role(self.cluster['servers'], 'master') ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway(master['id']) ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway') # set initial data in order to make an elapsed time for bgsave longer self.put_some_data() # generate some data for i in range(0, 100): key = '%s%d' % (key_base, i) cmd = 'set %s %d\r\n' % (key, i) gw.write(cmd) res = gw.read_until('\r\n') self.assertEquals(res, '+OK\r\n') gw.disconnect() # delete a local checkpoint util.log('delete pgs%d`s check point.' % target['id']) util.del_dumprdb(target['id']) # generate a remote check point bgsave_ret = util.bgsave(master) self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % master['id']) # shutdown util.log('shutdown target') ret = testbase.request_to_shutdown_smr(target) self.assertEqual(ret, 0, 'failed to shutdown smr') time.sleep(10) # generate some data ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway') for i in range(100, 200): key = '%s%d' % (key_base, i) cmd = 'set %s %d\r\n' % (key, i) gw.write(cmd) res = gw.read_until('\r\n') self.assertEquals(res, '+OK\r\n') gw.disconnect() # recovery util.log('recovery target') ret = testbase.request_to_start_smr(target) self.assertEqual(ret, 0, 'failed to start smr') ret = testbase.request_to_start_redis(target) self.assertEqual(ret, 0, 'failed to start redis') time.sleep(5) ret = testbase.wait_until_finished_to_set_up_role(target) self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (target['id'])) # check value recovered_redis = redis_mgmt.Redis(target['id']) ret = recovered_redis.connect(target['ip'], target['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') for i in range(0, 200): key = '%s%d' % (key_base, i) cmd = 'get %s\r\n' % (key) recovered_redis.write(cmd) recovered_redis.read_until('\r\n') response = recovered_redis.read_until('\r\n') self.assertEqual(response, '%d\r\n' % i, 'inconsistent %s, %d' % (response, i))
def recovery_with_local_checkpoint_and_remote_log(self, role): server = util.get_server_by_role(self.cluster['servers'], role) # set initial data in order to make an elapsed time for bgsave longer self.put_some_data() # set value ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway(server['id']) ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway, id:%d' % server['id']) timestamp = {} key_base = 'key0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999' for i in range(0, 50000): timestamp[i] = time.time() k = '%s_%d' % (key_base, i) cmd = 'set %s %f\r\n' % (k, timestamp[i]) gw.write(cmd) response = gw.read_until('\r\n') self.assertNotEqual(response.find('+OK'), -1, 'failed to set key value through gateway') # generate a check point bgsave_ret = util.bgsave(server) self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % server['id']) # shutdown ret = testbase.request_to_shutdown_smr(server) self.assertEqual(ret, 0, 'failed to shutdown smr') ret = testbase.request_to_shutdown_redis(server) self.assertEqual(ret, 0, 'failed to shutdown redis') util.log('succeeded : shutdown pgs%d' % (server['id'])) # delete smr_logs ret = util.delete_smr_logs(server['id']) self.assertEqual(ret, 0, 'failed to delete smr log, id:%d' % server['id']) util.log('succeeded : delete replication logs') time.sleep(5) # set value ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway') for i in range(50000, 100000): timestamp[i] = time.time() k = '%s_%d' % (key_base, i) cmd = 'set %s %f\r\n' % (k, timestamp[i]) gw.write(cmd) response = gw.read_until('\r\n') self.assertNotEqual(response.find('+OK'), -1, 'failed to set key value through gateway') # recovery ret = testbase.request_to_start_smr(server) self.assertEqual(ret, 0, 'failed to start smr') ret = testbase.request_to_start_redis(server) self.assertEqual(ret, 0, 'failed to start redis') time.sleep(5) ret = testbase.wait_until_finished_to_set_up_role(server) self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id'])) util.log('succeeded : recover pgs%d' % server['id']) # check value recovered_redis = redis_mgmt.Redis(server['id']) ret = recovered_redis.connect(server['ip'], server['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') for i in range(0, 100000): k = '%s_%d' % (key_base, i) cmd = 'get %s\r\n' % (k) recovered_redis.write(cmd) recovered_redis.read_until('\r\n') response = recovered_redis.read_until('\r\n') self.assertEqual(response, '%f\r\n' % (timestamp[i]), 'inconsistent %s, %f' % (response, timestamp[i]))
def test_4_PGS_mgen_is_less_than_PG_mgen( self ): util.print_frame() # get gateway info ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] ) ret = gw.connect( ip, port ) self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) ) # initial data util.put_some_data(self.cluster) # shutdown server_to_join = util.get_server_by_role( self.cluster['servers'], 'master' ) ret = testbase.request_to_shutdown_smr( server_to_join ) self.assertEqual( ret, 0, 'failed to shutdown smr' ) ret = testbase.request_to_shutdown_redis( server_to_join ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) # check state F max_try = 20 expected = 'F' for i in range( 0, max_try): state = util.get_smr_state( server_to_join, self.leader_cm ) if expected == state: break; time.sleep( 1 ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected) ) # set value key_base = 'mw' for i in range(0, 10000): cmd = 'set %s%d %d\r\n' % (key_base, i, i) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) # master failover 1 (master generation + 1) util.log('master failover 1') server = util.get_server_by_role( self.cluster['servers'], 'master' ) self.failover( server ) # check quorum (copy:3, quorum:1, available:2) ok = False for i in xrange(10): ok = util.check_quorum(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port']) if ok: break else: time.sleep(1) self.assertTrue( ok, 'Check quorum fail.' ) # master failover 2 (master generation + 1) util.log('master failover 2') server = util.get_server_by_role( self.cluster['servers'], 'master' ) self.failover( server ) # recovery util.log('master recovery start.') ret = testbase.request_to_start_smr( server_to_join ) self.assertEqual( ret, 0, 'failed to start smr' ) ret = testbase.request_to_start_redis( server_to_join ) self.assertEqual( ret, 0, 'failed to start redis' ) ret = testbase.wait_until_finished_to_set_up_role( server_to_join, 10 ) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']) ) util.log('master recovery end successfully.') # check state N max_try = 20 expected = 'N' for i in range( 0, max_try): state = util.get_smr_state( server, self.leader_cm ) if expected == state: break; time.sleep( 1 ) role = util.get_role_of_server( server ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) ) time.sleep( 5 ) # set value for i in range(10000, 20000): cmd = 'set %s%d %d\r\n' % (key_base, i, i) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) server = util.get_server_by_role( self.cluster['servers'], 'master' ) redis = redis_mgmt.Redis( server_to_join['id'] ) ret = redis.connect( server_to_join['ip'], server_to_join['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) # check value for i in range(0, 20000): cmd = 'get %s%d\r\n' % (key_base, i) redis.write( cmd ) redis.read_until( '\r\n' ) response = redis.read_until( '\r\n' ) self.assertEqual( response, '%d\r\n' % (i), 'inconsistent %s, %d' % (response[:-2], i) ) gw.disconnect() return 0
def test_rdb_backups(self): util.print_frame() bgsave_count = 50 org_path = os.getcwd() os.chdir(util.redis_dir(0)) server0 = self.cluster['servers'][0] redis0 = telnetlib.Telnet(server0['ip'], server0['redis_port']) util.log("Starting load generator") for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) self.load_gen_thrd_list[i].start() util.log("Set the number of rdb backups = 24") redis0.write("config set number-of-rdb-backups 24\r\n") redis0.read_until("+OK\r\n") util.log("Clear old rdb backups\r\n") for f in os.listdir('.'): if (f.endswith('.rdb')): os.remove(f) util.log("Bgsaving continuously and counting the number of rdb backups") for i in range(bgsave_count): # Save current time before Bgsaving redis0.write('time\r\n') redis0.read_until('\r\n', 1) redis0.read_until('\r\n', 1) ret = redis0.read_until('\r\n', 1) redis_server_time = int(ret.strip()) redis0.read_until('\r\n', 1) redis0.read_until('\r\n', 1) time.sleep(1.1) redis0.write('time\r\n') redis0.read_until('\r\n', 1) redis0.read_until('\r\n', 1) ret = redis0.read_until('\r\n', 1) self.assertNotEqual(redis_server_time, int(ret.strip())) redis0.read_until('\r\n', 1) redis0.read_until('\r\n', 1) util.log("%d ~ %d" % (redis_server_time, int(ret.strip()))) # Bgsave redis0.write("bgsave\r\n") ret = redis0.read_until('\r\n', 1) self.assertEqual('+Background saving started\r\n', ret) # Wait finishing bgsave while True: redis0.write('lastsave\r\n') ret = redis0.read_until('\r\n', 1) lastsave_time = int(ret[1:].strip()) if lastsave_time > redis_server_time: break time.sleep(0.1) # Count the number of rdb backups rdb_list = [name for name in os.listdir('.') if os.path.isfile(name) and name.startswith('dump') and name.endswith('.rdb')] util.log(rdb_list) util.log("Iteration:%d, rdb Backups:%d" % (i+1, len(rdb_list))) self.assertTrue(i+1 > 24 and len(rdb_list) == 25 or len(rdb_list) == i+1) self.assertTrue('dump.rdb' in rdb_list) util.log("\nSet the number of rdb backups = 5") redis0.write("config set number-of-rdb-backups 5\r\n") redis0.read_until("+OK\r\n") for i in range(3): # Save current time before Bgsaving redis0.write('time\r\n') redis0.read_until('\r\n', 1) redis0.read_until('\r\n', 1) ret = redis0.read_until('\r\n', 1) redis_server_time = int(ret.strip()) redis0.read_until('\r\n', 1) redis0.read_until('\r\n', 1) time.sleep(1.1) # Bgsave redis0.write("bgsave\r\n") ret = redis0.read_until('\r\n', 1) self.assertEqual('+Background saving started\r\n', ret) # Wait finishing bgsave while True: redis0.write('lastsave\r\n') ret = redis0.read_until('\r\n', 1) lastsave_time = int(ret[1:].strip()) if lastsave_time > redis_server_time: break time.sleep(0.1) # Count the number of rdb backups rdb_list = [name for name in os.listdir('.') if os.path.isfile(name) and name.startswith('dump') and name.endswith('.rdb')] util.log(rdb_list) util.log("Iteration:%d, rdb Backups:%d" % (i+1, len(rdb_list))) self.assertTrue(len(rdb_list) == 6) self.assertTrue('dump.rdb' in rdb_list) # check consistency of load_generator for i in range(len(self.load_gen_thrd_list)): self.load_gen_thrd_list[i].quit() for i in range(len(self.load_gen_thrd_list)): self.load_gen_thrd_list[i].join() self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after gateway_mgmt test') os.chdir(org_path)
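# The bgsave loop above works because LASTSAVE only advances after the
# background save completes. A minimal sketch of that wait, assuming a
# telnetlib-style connection as used by redis0 above; the helper name is
# hypothetical.
def wait_for_bgsave(conn, started_at, poll_interval=0.1):
    while True:
        conn.write('lastsave\r\n')
        ret = conn.read_until('\r\n', 1)
        lastsave_time = int(ret[1:].strip())  # strip the ':' integer-reply prefix
        if lastsave_time > started_at:
            return lastsave_time
        time.sleep(poll_interval)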
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit( self ):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for i in range(0, 2):
        ret = testbase.request_to_shutdown_smr( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'] )
        util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

        ret = testbase.request_to_shutdown_redis( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )
        util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range( 0, max_try ):
            state = util.get_smr_state( server_to_join[i], self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s' % (server_to_join[i]['id'], state, expected) )

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr( master )
    self.assertEqual( ret, 0, 'failed to shutdown smr' )
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])

    ret = testbase.request_to_shutdown_redis( master )
    self.assertEqual( ret, 0, 'failed to shutdown redis' )
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range( 0, max_try ):
        state = util.get_smr_state( master, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected, state,
                      'server%d - state:%s, expected:%s' % (master['id'], state, expected) )

    # recovery slaves
    for i in range(0, 2):
        ret = testbase.request_to_start_smr( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server_to_join[i], 10 )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join[i]['id']) )

        # check state N
        max_try = 20
        expected = 'N'
        for j in range( 0, max_try ):
            state = util.get_smr_state( server_to_join[i], self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( server_to_join[i] )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s, role:%s' % (server_to_join[i]['id'], state, expected, role) )

    # set value
    s = random.choice(server_to_join)
    redis = redis_mgmt.Redis( s['id'] )
    ret = redis.connect( s['ip'], s['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis' )

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write( cmd )
        res = redis.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )
    redis.disconnect()

    for i in range(0, 2):
        redis = redis_mgmt.Redis( server_to_join[i]['id'] )
        ret = redis.connect( server_to_join[i]['ip'], server_to_join[i]['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        # check value
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write( cmd )
            redis.read_until( '\r\n' )
            response = redis.read_until( '\r\n' )
            self.assertEqual( response, '%d\r\n' % (j),
                              'inconsistent %s, %d' % (response[:-2], j) )

    # try to recover master, but failed
    ret = testbase.request_to_start_smr( master )
    self.assertEqual( ret, 0, 'failed to start smr' )

    ret = testbase.request_to_start_redis( master, False )
    self.assertEqual( ret, 0, 'failed to start redis' )

    max_try = 3
    expected = 'N'
    for i in range( 0, max_try ):
        state = util.get_smr_state( master, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    role = util.get_role_of_server( master )
    self.assertNotEqual( expected, state,
                         'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role) )
    util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

    gw.disconnect()
    return 0
def start_load_generator(self, num):
    for i in range(num):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_thrd_list[i].start()
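# Tests that call start_load_generator all tear down with the same
# quit/join/isConsistent sequence. A minimal counterpart sketch, assuming the
# load_generator.LoadGenerator API used above; the helper name is hypothetical.
def stop_load_generator(self, msg='Inconsistent data after test'):
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].quit()
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].join()
        self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), msg)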
def failure_recovery( self, role, wait_count=10, redis_only=False ): time.sleep( 2 ) # get gateway info ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] ) ret = gw.connect( ip, port ) self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) ) # set value key = 'new_key_haha' cmd = 'set %s 12345\r\n' % (key) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) # shutdown server = util.get_server_by_role( self.cluster['servers'], role ) if redis_only == False: ret = testbase.request_to_shutdown_smr( server ) self.assertEqual( ret, 0, 'failed to shutdown smr' ) ret = testbase.request_to_shutdown_redis( server ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) # check state F max_try = 20 expected = 'F' for i in range( 0, max_try): state = util.get_smr_state( server, self.leader_cm ) if expected == state: break; time.sleep( 1 ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected) ) # set value check_value = '54321' cmd = 'set %s %s\r\n' % (key, check_value) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) gw.disconnect() # recovery if redis_only == False: ret = testbase.request_to_start_smr( server ) self.assertEqual( ret, 0, 'failed to start smr' ) ret = testbase.request_to_start_redis( server ) self.assertEqual( ret, 0, 'failed to start redis' ) ret = testbase.wait_until_finished_to_set_up_role( server, wait_count ) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) ) redis = redis_mgmt.Redis( server['id'] ) ret = redis.connect( server['ip'], server['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) # check state N max_try = 20 expected = 'N' for i in range( 0, max_try): state = util.get_smr_state( server, self.leader_cm ) if expected == state: break; time.sleep( 1 ) role = util.get_role_of_server( server ) self.assertEquals( expected , state, 'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) ) # check value cmd = 'get %s\r\n' % (key) redis.write( cmd ) redis.read_until( '\r\n' ) response = redis.read_until( '\r\n' ) self.assertEqual( response, '%s\r\n' % (check_value), 'inconsistent %s, %s' % (response, check_value) )
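# failure_recovery polls util.get_smr_state the same way twice ('F' after a
# shutdown, 'N' after recovery), as do several other tests in this file. A
# minimal sketch of that poll; the helper name is hypothetical.
def wait_for_smr_state(self, server, expected, max_try=20):
    state = None
    for i in range(max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if state == expected:
            break
        time.sleep(1)
    return state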
def elect_master_randomly(self): # set data ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway('0') gw.connect(ip, port) for i in range(0, 1000): cmd = 'set %s%d %d\r\n' % (self.key_base, i, i) gw.write(cmd) res = gw.read_until('\r\n') self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2])) server_ids = [] for server in self.cluster['servers']: server_ids.append(server['id']) for try_cnt in range(30): # get master, slave1, slave2 m, s1, s2 = util.get_mss(self.cluster) self.assertNotEqual(m, None, 'master is None.') self.assertNotEqual(s1, None, 'slave1 is None.') self.assertNotEqual(s2, None, 'slave2 is None.') util.log('master id : %d' % m['id']) if try_cnt != 0: if m['id'] in server_ids: server_ids.remove(m['id']) smr = smr_mgmt.SMR(m['id']) ret = smr.connect(m['ip'], m['smr_mgmt_port']) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port'])) cmd = 'role lconn\r\n' smr.write(cmd) reply = smr.read_until('\r\n') self.assertEqual( reply, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2])) util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2])) # wait until role-change is finished for role_change_try_cnt in range(5): count_master = 0 count_slave = 0 for server in self.cluster['servers']: real_role = util.get_role_of_server(server) real_role = util.roleNumberToChar(real_role) if real_role == 'M': count_master = count_master + 1 elif real_role == 'S': count_slave = count_slave + 1 if count_master == 1 and count_slave == 2: break time.sleep(1) # check the number of master and slave self.assertEqual( count_master, 1, 'failed : the number of master is not 1, count_master=%d, count_slave=%d' % (count_master, count_slave)) self.assertEqual( count_slave, 2, 'failed : the number of slave is not 2, count_master=%d, count_slave=%d' % (count_master, count_slave)) util.log( 'succeeded : the number of master is 1 and the number of slave is 2' ) # check states of all pgs in pg for try_cnt in range(3): ok = True for s in self.cluster['servers']: real_role = util.get_role_of_server(s) real_role = util.roleNumberToChar(real_role) smr_info = util.get_smr_info(s, self.leader_cm) cc_role = smr_info['smr_Role'] cc_hb = smr_info['hb'] if cc_hb != 'Y': ok = False if real_role != cc_role: ok = False if ok: util.log( 'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb)) else: util.log( '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb)) if ok == False: time.sleep(0.5) else: break self.assertTrue(ok, 'failed : role check') if len(server_ids) == 0: util.log('succeeded : all smrs have been as a master') return 0 self.assertEqual( 0, len(server_ids), 'failed : remains server ids=[%s]' % (','.join('%d' % id for id in server_ids))) return 0
def consistent_after_failover( self ): max = 10000 wait_count = 15 key = 'caf' # get master, slave1, and slave2 master, slave1, slave2 = self.get_mss() # set value ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( ip ) gw.connect( ip, port ) for i in range( 0, max ): cmd = 'set %s%d %d\r\n' % (key, i, i) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) time.sleep( 5 ) # shutdown servers = [master, slave1, slave2] for server in servers: util.log('before shutdown pgs%d' % server['id']) for s in servers: self.getseq_log(s) ret = testbase.request_to_shutdown_smr( server ) self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'] ) ret = testbase.request_to_shutdown_redis( server ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) time.sleep( 5 ) # check state F for server in servers: state = self.get_expected_smr_state( server, 'F' ) self.assertEquals( 'F', state, 'server%d - state:%s' % (server['id'], state) ) # recovery for server in servers: ret = testbase.request_to_start_smr( server ) self.assertEqual( ret, 0, 'failed to start smr, server:%d' % server['id'] ) ret = testbase.request_to_start_redis( server, False ) self.assertEqual( ret, 0, 'failed to start redis, server:%d' % server['id'] ) util.log('after restart pgs%d' % server['id']) for s in servers: self.getseq_log(s) time.sleep( 5 ) # wait for master election for i in xrange(10): ret = util.check_cluster( self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'] ) if ret: break time.sleep(1) # check state for server in servers: ret = testbase.wait_until_finished_to_set_up_role( server, wait_count ) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (server['id']) ) state = self.get_expected_smr_state( server, 'N' ) role = util.get_role_of_server( server ) self.assertEquals( 'N', state, 'server%d - state:%s, role:%s' % (server['id'], state, role) ) the_number_of_master = 0 the_number_of_slave = 0 for server in servers: role = util.get_role_of_server( server ) if role == c.ROLE_MASTER: the_number_of_master = the_number_of_master + 1 elif role == c.ROLE_SLAVE: the_number_of_slave = the_number_of_slave + 1 self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave, 'failed to set roles, the number of master:%d, the number of slave:%d' % (the_number_of_master, the_number_of_slave) ) # get master, slave1, and slave2 master, slave1, slave2 = self.get_mss() # connect to a master`s redis and set data redis = redis_mgmt.Redis( master['id'] ) ret = redis.connect( master['ip'], master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % master['id'] ) for i in range( max, max*2 ): cmd = 'set %s%d %d\r\n' % (key, i, i) redis.write( cmd ) res = redis.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n', 'failed to get response, server:%d' % master['id'] ) redis.disconnect() # check slaves`s data slaves = [slave1, slave2] for slave in slaves: slave_redis = redis_mgmt.Redis( slave['id'] ) ret = slave_redis .connect( slave['ip'], slave['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % slave['id'] ) for i in range( 0, max*2 ): cmd = 'get %s%d\r\n' % (key, i) slave_redis.write( cmd ) trash = slave_redis.read_until( '\r\n' ) res = slave_redis.read_until( '\r\n' ) self.assertEquals( res, '%d\r\n' % i, 'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res) ) slave_redis.disconnect()
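# Throughout this file a GET reply is consumed with two read_until('\r\n')
# calls: the first drains the RESP bulk header ('$<len>'), the second the
# payload line. A minimal sketch making that explicit; the helper name is
# hypothetical, and it assumes the read_until(expected, timeout) signature
# used above.
def read_bulk_reply(conn, timeout=1):
    header = conn.read_until('\r\n', timeout)  # e.g. '$5\r\n', or '$-1\r\n' for nil
    if header.startswith('$-1'):
        return None
    return conn.read_until('\r\n', timeout)    # payload line, e.g. '12345\r\n'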
def test_delete_smrlog_after_scaleout(self):
    util.print_frame()

    # start load generator
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_thrd_list[i].start()

    time.sleep(5)  # generate load for 5 sec
    util.log("started load_generator")

    # servers for scale out
    servers = [config.server4, config.server5, config.server6]
    leader_cm = self.cluster['servers'][0]

    # Scale out
    cluster = config.clusters[0]
    ret = util.pg_add(cluster, servers, leader_cm)
    self.assertEqual(True, ret, 'Scale out fail. util.pg_add returns false')

    time.sleep(5)

    # pg0 -> pg1
    cluster = config.clusters[1]
    ret = util.migration(cluster, 0, 1, 8000, 8191, 40000)
    self.assertEqual(True, ret, 'Migration Fail 0 -> 1')

    # get log file
    old_logs = {}
    for s in config.clusters[0]['servers']:
        parent_dir, log_dir = util.smr_log_dir(s['id'])
        path = '%s/%s' % (parent_dir, log_dir)
        old_logs[s['id']] = util.ls(path)

    # bgsave in order to make smrlogs deleted.
    for s in config.clusters[0]['servers']:
        bgsave_ret = util.bgsave(s)
        self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % s['id'])
        util.log('bgsave pgs%d is done.' % s['id'])

    # check consistency
    for j in range(len(self.load_gen_thrd_list)):
        self.assertTrue(self.load_gen_thrd_list[j].isConsistent(),
                        'Inconsistent after migration')

    # does smr-replicator delete smrlogs?
    i = 0
    while i < 20:
        i += 1

        # get current log files
        cur_logs = {}
        for s in config.clusters[0]['servers']:
            parent_dir, log_dir = util.smr_log_dir(s['id'])
            path = '%s/%s' % (parent_dir, log_dir)
            cur_logs[s['id']] = util.ls(path)

        # compare old and new
        temp_old_logs = copy.deepcopy(old_logs)
        for id, nl in cur_logs.items():
            ol = temp_old_logs.get(id)
            self.assertNotEqual(ol, None,
                                "failed to check logfiles. old logs for smr-replicator '%d' do not exist." % id)
            for log in nl:
                if log in ol:
                    ol.remove(log)

        ok = True
        for id, ol in temp_old_logs.items():
            if len(ol) == 0:
                ok = False

        util.log('Loop %d ---------------------------------------------------------' % i)
        util.log('deleted smrlog files: %s' % util.json_to_str(temp_old_logs))

        if ok:
            break

        time.sleep(10)

    self.assertTrue(ok, 'smr-replicator does not delete smrlogs.')
    util.log('smr-replicator deletes smrlogs.')

    # check consistency of load_generator
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].quit()
    for i in range(len(self.load_gen_thrd_list)):
        self.load_gen_thrd_list[i].join()
        self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                        'Inconsistent after migration')
def test_moving_pgs(self): util.print_frame() # start load generator util.log("start load_generator") for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) self.load_gen_thrd_list[i].start() util.log("started load_generator") servers = self.cluster['servers'] gw_list = [] for server in servers: gw = {} gw['mgmt'] = telnetlib.Telnet(server['ip'], server['gateway_port']+1) gw['normal'] = telnetlib.Telnet(server['ip'], server['gateway_port']) gw_list.append(gw) n = 0 step = 0 iter = 30 while iter > 0: if n == 0 or random.randint(0, 1) == 0: step = random.randint(1, 10) else: step = -1 * random.randint(1, n) print "<<< ITER = %d, PG%d -> PG%d, PG%d -> PG%d >>>" % (iter, n*2, (n+step)*2, n*2+1, (n+step)*2+1) gw = gw_list[0] self.pgs_del_server(gw['mgmt'], servers[0], n) self.pgs_del_server(gw['mgmt'], servers[1], n) self.pgs_del_server(gw['mgmt'], servers[5], n) gw['mgmt'].write("pg_add %d\r\n" % ((n+step)*2)) gw['mgmt'].read_until("+OK\r\n") gw['mgmt'].write("pg_add %d\r\n" % ((n+step)*2+1)) gw['mgmt'].read_until("+OK\r\n") self.pgs_add_server(gw['mgmt'], servers[0], n+step) self.pgs_add_server(gw['mgmt'], servers[1], n+step) self.pgs_add_server(gw['mgmt'], servers[5], n+step) while True: gw['normal'].write("info gateway\r\n") ret = gw['normal'].read_until("\r\n", 1) if "-ERR" in ret: continue ret = gw['normal'].read_until("\r\n\r\n", 1) #print ret if "gateway_disconnected_redis:0\r\n" in ret: break gw['mgmt'].write("delay 0 4095\r\n") gw['mgmt'].read_until("+OK\r\n") gw['mgmt'].write("delay 4096 8191\r\n") gw['mgmt'].read_until("+OK\r\n") gw['mgmt'].write("redirect 0 4095 %d\r\n" % ((n+step)*2)) gw['mgmt'].read_until("+OK\r\n") gw['mgmt'].write("redirect 4096 8191 %d\r\n" % ((n+step)*2+1)) gw['mgmt'].read_until("+OK\r\n") gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n") print gw_list[0]['mgmt'].read_until("+PONG\r\n") self.pgs_del_server(gw['mgmt'], servers[2], n) self.pgs_del_server(gw['mgmt'], servers[3], n) self.pgs_del_server(gw['mgmt'], servers[4], n) self.pgs_add_server(gw['mgmt'], servers[2], n+step) self.pgs_add_server(gw['mgmt'], servers[3], n+step) self.pgs_add_server(gw['mgmt'], servers[4], n+step) while True: gw['normal'].write("info gateway\r\n") ret = gw['normal'].read_until("\r\n", 1) if "-ERR" in ret: continue ret = gw['normal'].read_until("\r\n\r\n", 1) #print ret if "gateway_disconnected_redis:0\r\n" in ret: break gw['mgmt'].write("pg_del %d\r\n" % (n*2)) gw['mgmt'].read_until("+OK\r\n") gw['mgmt'].write("pg_del %d\r\n" % (n*2+1)) gw['mgmt'].read_until("+OK\r\n") n += step gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n") print gw_list[0]['mgmt'].read_until("+PONG\r\n") iter -= 1 # check consistency of load_generator for i in range(len(self.load_gen_thrd_list)): self.load_gen_thrd_list[i].quit() for i in range(len(self.load_gen_thrd_list)): self.load_gen_thrd_list[i].join() self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after gateway_mgmt test')
def state_transition( self ): server = util.get_server_by_role( self.cluster['servers'], 'slave' ) self.assertNotEquals( server, None, 'failed to get_server_by_role-slave' ) # get gateway info ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] ) # check initial state state = self.get_expected_smr_state( server, 'N' ) role = util.get_role_of_server( server ) self.assertEquals( 'N', state, 'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role) ) # shutdown ret = testbase.request_to_shutdown_smr( server ) self.assertEquals( ret, 0, 'failed to shutdown smr' ) ret = testbase.request_to_shutdown_redis( server ) self.assertEquals( ret, 0, 'failed to shutdown redis' ) time.sleep( 3 ) # check state F expected = 'F' state = self.get_expected_smr_state( server, expected ) self.assertEquals( expected , state, 'server%d - state:%s, but expected:%s' % (server['id'], state, expected) ) # set value ret = gw.connect( ip, port ) self.assertEquals( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) ) timestamp = 0.0 for i in range( 0, 100 ): timestamp = time.time() key = 'new_key_haha' cmd = 'set %s %f\r\n' % (key, timestamp) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEquals( res, '+OK\r\n' ) gw.disconnect() # recovery ret = testbase.request_to_start_smr( server ) self.assertEquals( ret, 0, 'failed to start smr' ) ret = testbase.request_to_start_redis( server ) self.assertEquals( ret, 0, 'failed to start redis' ) ret = testbase.wait_until_finished_to_set_up_role( server, 10 ) self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) ) time.sleep( 5 ) redis = redis_mgmt.Redis( server['id'] ) ret = redis.connect( server['ip'], server['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) # check state N expected = 'N' max_try = 20 for i in range( 0, max_try ): state = self.get_expected_smr_state( server, expected ) if state == expected: break time.sleep( 1 ) role = util.get_role_of_server( server ) self.assertEquals( expected , state, 'server%d - state:%s, role:%s, but expected:%s' % (server['id'], state, role, expected) )
def test_scaleout(self): util.print_frame() # start load generator util.log("start load_generator") for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) self.load_gen_thrd_list[i].start() time.sleep(5) # generate load for 5 sec util.log("started load_generator") # servers for scale out servers = [config.server4, config.server5, config.server6] leader_cm = self.cluster['servers'][0] # start migration migration_count = 5 for i in range(migration_count): # Scale out cluster = config.clusters[0] ret = util.pg_add(cluster, servers, leader_cm) self.assertEqual(True, ret, 'Scale out fail. util.pg_add returns false') time.sleep(5) # pg0 -> pg1 cluster = config.clusters[1] ret = util.migration(cluster, 0, 1, 4096, 8191, 40000) self.assertEqual(True, ret, 'Migration Fail 0 -> 1') # pg0 <- pg1 cluster = config.clusters[1] ret = util.migration(cluster, 1, 0, 4096, 8191, 40000) self.assertEqual(True, ret, 'Migration Fail 1 <- 0') # Scale in #TODO Temporary #cluster = config.clusters[0] #for server in cluster['servers']: # if testbase.request_to_shutdown_hbc(server) is not 0: # util.log('scale in : failed to request to shutdown hbc') # self.assertFalse('scale in : failed to request to shutdown hbc') #time.sleep(5) ############### cluster = config.clusters[1] ret = util.pg_del(cluster, servers, leader_cm) self.assertEqual(True, ret, 'Scale in fail. util.pg_del returns false') #TODO Temporary #cluster = config.clusters[0] #for server in cluster['servers']: # if testbase.request_to_start_heartbeat_checker( server ) is not 0: # util.log('scale in : failed to start hbc') # self.assertFalse('scale in : failed to start hbc') #time.sleep(5) ############### # check consistency ok = True for j in range(len(self.load_gen_thrd_list)): if self.load_gen_thrd_list[j].isConsistent() == False: ok = False break if not ok: break; time.sleep(5) # generate load for 5 sec # check consistency of load_generator for i in range(len(self.load_gen_thrd_list)): self.load_gen_thrd_list[i].quit() for i in range(len(self.load_gen_thrd_list)): self.load_gen_thrd_list[i].join() self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration')
def test_random_migrate(self): util.print_frame() # start load generator load_gen_thrd_list = {} util.log("start load_generator") for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) load_gen_thrd_list[i].start() ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000) self.assertEqual(True, ret, 'Migration Fail') leader_cm = self.cluster['servers'][0] cluster_name = self.cluster['cluster_name'] mapping = [-1] * 8192 count = 50 while count > 0: # get PN -> PG map cmd = 'cluster_info %s' % cluster_name result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd) ret = json.loads(result) rle = ret['data']['cluster_info']['PN_PG_Map'] print "PN_PG_MAP = %s" % rle sp = rle.split() index = 0 for i in range(len(sp) / 2): for j in range(int(sp[i * 2 + 1])): mapping[index] = int(sp[i * 2]) index += 1 slot = random.randint(0, 8191) src_pgid = mapping[slot] dst_pgid = (src_pgid + 1) % 2 slot_end = slot while random.randint(0, 5) <= 4: if slot_end < 8191 and mapping[slot_end + 1] == src_pgid: slot_end += 1 else: break print "SLOT=%d, SRC_PGID=%d, DST_PGID=%d" % (slot, src_pgid, dst_pgid) ret = util.migration(self.cluster, src_pgid, dst_pgid, slot, slot_end, 40000) self.assertEqual(True, ret, 'Migration Fail') ok = True for j in range(len(load_gen_thrd_list)): if load_gen_thrd_list[j].isConsistent() == False: ok = False break if not ok: break count -= 1 # check consistency of load_generator for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].quit() for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].join() self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration') # Go back to initial configuration cinfo = util.cluster_info(leader_cm['ip'], leader_cm['cm_port'], cluster_name) for slot in util.get_slots(cinfo['cluster_info']['PN_PG_Map'], 1): self.assertTrue( util.migration(self.cluster, 1, 0, slot['begin'], slot['end'], 40000), 'failed to rollback migration')
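# cluster_info returns PN_PG_Map run-length encoded as 'pg count pg count ...'.
# The decoding loop in test_random_migrate can be factored out; a minimal
# sketch of the same logic (Python 2 integer division, as in the test). The
# function name is hypothetical.
def decode_pn_pg_map(rle, num_slots=8192):
    mapping = [-1] * num_slots
    sp = rle.split()
    index = 0
    for i in range(len(sp) / 2):
        pg = int(sp[i * 2])
        for j in range(int(sp[i * 2 + 1])):
            mapping[index] = pg
            index += 1
    return mapping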
def test_1_role_change(self):
    util.print_frame()

    self.load_gen_list = {}

    # Start load generator
    util.log("Start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Loop (smr: 3 copy)
    for i in range(30):
        target_server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEqual(target_server, None, 'Get slave fail.')
        target = target_server['id']

        print ''
        util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

        # Get old timestamp
        util.log_server_state(self.cluster)
        old_timestamp_list = []
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamp_list.append(ts)

        # Role change
        master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
        self.assertNotEqual(master, -1, 'role_change error.')
        while target == master:
            target = (target + 1) % 3
        util.log('Change role success.')

        # Wait until role change finished
        for s in self.cluster['servers']:
            max_try_cnt = 20
            ok = False
            for try_cnt in range(max_try_cnt):
                try:
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                except:
                    pass
                time.sleep(0.2)
            self.assertTrue(ok, 'redis state error.')

        # Get new timestamp
        util.log_server_state(self.cluster)
        new_timestamp_list = []
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamp_list.append(ts)

        # Compare old timestamps and new timestamps
        for j in range(3):
            self.assertNotEqual(old_timestamp_list[j], new_timestamp_list[j],
                                'Timestamp is not changed. %d->%d'
                                % (old_timestamp_list[j], new_timestamp_list[j]))

        # Check consistency
        for load_gen_id, load_gen in self.load_gen_list.items():
            self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

    # Loop (smr: 2 copy)
    self.__del_server(self.cluster['servers'][0])
    servers = [self.cluster['servers'][1], self.cluster['servers'][2]]

    normal_state = False
    for i in xrange(20):
        normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                                          self.leader_cm['cm_port'], check_quorum=True)
        if normal_state:
            break
        time.sleep(0.5)
    self.assertTrue(normal_state, "Unstable cluster state")

    for i in range(30):
        print ''
        util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

        s = util.get_server_by_role(servers, 'slave')
        target = s['id']

        # Get old timestamp
        util.log_server_state(self.cluster)
        old_timestamp_list = []
        for s in servers:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamp_list.append(ts)

        # Role change
        master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
        self.assertNotEqual(master, -1, 'role_change error.')
        while target == master:
            target = (target) % 2 + 1
        util.log('Change role success.')

        # Wait until role change finished
        for s in servers:
            max_try_cnt = 20
            ok = False
            for try_cnt in range(max_try_cnt):
                pong = util.pingpong(s['ip'], s['redis_port'])
                if pong != None and pong == '+PONG\r\n':
                    ok = True
                    break
                time.sleep(0.1)
            self.assertTrue(ok, 'redis state error.')

        # Get new timestamp
        util.log_server_state(self.cluster)
        new_timestamp_list = []
        for s in servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamp_list.append(ts)

        # Compare old timestamps and new timestamps
        for j in range(2):
            self.assertNotEqual(old_timestamp_list[j], new_timestamp_list[j],
                                'Timestamp is not changed. %d->%d'
                                % (old_timestamp_list[j], new_timestamp_list[j]))

        # Check consistency
        for load_gen_id, load_gen in self.load_gen_list.items():
            self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

    # Go back to initial configuration
    self.assertTrue(util.install_pgs(self.cluster, self.cluster['servers'][0], self.leader_cm,
                                     rm_ckpt=False),
                    'failed to recover pgs.')
    def test_migration_with_expire_command(self):
        util.print_frame()

        # Key naming convention: '<setPhase>~<checkPhase>:expired' keys get a
        # short TTL and must be gone by the check phase, while ':persist' keys
        # get PERSIST before their TTL fires and must survive the migration.
        util.log("start load_generator")
        load_gen_thrd_list = {}
        for i in range(1):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        tps = 20000
        src_pg_id = 0
        dst_pg_id = 1
        leader_cm = self.cluster['servers'][0]
        src_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', src_pg_id)
        dst_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', dst_pg_id)

        smr = smr_mgmt.SMR(src_master['id'])
        ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
        if ret != 0:
            util.log('failed to connect to smr(source master)')
            return False

        src_redis = redis_mgmt.Redis(src_master['id'])
        ret = src_redis.connect(src_master['ip'], src_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        ts = time.time()
        self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)

        # each phase is pinned to a 15-second window; fail if we already overran it
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        util.log(">>> migrate test with expire command start(%s), ts:%d" %
                 (time.asctime(), ts))

        ts = time.time()
        self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

        # notify dst_redis of migration start
        util.log(">>> notify dst_redis of migration start (%s)" % time.asctime())
        cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        # remote partial checkpoint
        util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
        cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'],
            dst_master['ip'], dst_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)
        ret = p.wait()
        for line in p.stdout:
            if line.find("Checkpoint Sequence Number:") != -1:
                util.log("seqnumber : " + line[line.rfind(":") + 1:])
                seq = int(line[line.rfind(":") + 1:])
            util.log(">>>" + str(line.rstrip()))
        self.assertEqual(0, ret)
        util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        # bgsave, to test recovery during migration later
        util.log(">>> bgsave, to test recovery during migration later (%s)" %
                 time.asctime())
        cmd = 'bgsave\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+Background saving started\r\n')

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100)

        # remote catchup (smr log migration)
        util.log(">>> start remote catchup (%s)" % time.asctime())
        dst_host = dst_master['ip']
        dst_smr_port = dst_master['smr_base_port']
        rle = '1 8192'
        num_part = 8192
        smr.write('migrate start %s %d %d %d %d %s\r\n' %
                  (dst_host, dst_smr_port, seq, tps, num_part, rle))
        response = smr.read_until('\r\n')
        if response[:3] != '+OK':
            util.log('failed to execute migrate start command, response:%s' % response)
            return False

        while True:
            smr.write('migrate info\r\n')
            response = smr.read_until('\r\n')
            seqs = response.split()
            logseq = int(seqs[1].split(':')[1])
            mig = int(seqs[2].split(':')[1])
            util.log('migrate info: %s' % response)
            if (logseq - mig < 500000):
                util.log('Remote catchup almost done. try mig2pc')
                break
            time.sleep(1)

        util.log(">>> sleep until 90 sec pass")
        self.assertFalse(time.time() - ts >= 90)
        time.sleep(90 - (time.time() - ts))

        res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20)

        util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

        # mig2pc
        util.log(">>> start mig2pc (%s)" % time.asctime())
        cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'],
                                         src_pg_id, dst_pg_id, 0, 8191)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        util.log('mig2pc result : ' + result)
        if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
            util.log('failed to execute mig2pc command, result:%s' % result)
            return False

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

        # finish migration
        smr.write('migrate interrupt\r\n')
        response = smr.read_until('\r\n')
        util.log('migrate interrupt: %s' % response)
        smr.disconnect()

        # notify dst_redis of migration end
        util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())
        cmd = 'migconf migend\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")

        # wait for the remaining TTLs to expire
        ts = time.time()
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        # every *:persist key must have survived the migration and every
        # *:expired key must be gone, for plain and S3 objects alike
        phases = ['beforeCheckpoint~beforeCheckpoint',
                  'beforeCheckpoint~afterCheckpoint',
                  'afterCheckpoint~afterCheckpoint',
                  'afterCheckpoint~duringCatchup',
                  'duringCatchup~duringCatchup',
                  'duringCatchup~afterMig2pc',
                  'afterMig2pc~migrateEnd']
        for phase in phases:
            self.assertTrue(self.isExist(dst_redis, '%s:persist' % phase))
            self.assertFalse(self.isExist(dst_redis, '%s:expired' % phase))
            self.assertTrue(self.isS3Exist(dst_redis, 'S3:%s:persist' % phase))
            self.assertFalse(self.isS3Exist(dst_redis, 'S3:%s:expired' % phase))

        # remote partial checkpoint
        util.log(">>> start rangedel (%s)" % time.asctime())
        cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)
        ret = p.wait()
        for line in p.stdout:
            util.log(">>>" + str(line.rstrip()))

        cmd = 'migconf clearend\r\n'
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

        time.sleep(5)  # generate load for 5 sec

        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')

        # kill dst_redis and recover from bgsave
        util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime())
        dst_redis.disconnect()

        ret = testbase.request_to_shutdown_redis(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_master['id'])
        ret = testbase.request_to_start_redis(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_master['id'])
        ret = testbase.wait_until_finished_to_set_up_role(dst_master)
        self.assertEquals(ret, 0,
                          'failed to role change. server:%d' % (dst_master['id']))

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        for phase in phases:
            self.assertTrue(self.isExist(dst_redis, '%s:persist' % phase))
            self.assertFalse(self.isExist(dst_redis, '%s:expired' % phase))
            self.assertTrue(self.isS3Exist(dst_redis, 'S3:%s:persist' % phase))
            self.assertFalse(self.isS3Exist(dst_redis, 'S3:%s:expired' % phase))
        self.getS3TTL(dst_redis, 'S3:PermanentKey')

        # kill dst_slave redis and recover without dump file
        util.log(">>> kill dst_redis and recover without dump file (%s)" %
                 time.asctime())
        dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                   'slave', dst_pg_id)

        ret = testbase.request_to_shutdown_redis(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_slave['id'])
        ret = testbase.request_to_start_redis(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_slave['id'])
        ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
        self.assertEquals(ret, 0,
                          'failed to role change. server:%d' % (dst_slave['id']))

        dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
        ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        for phase in phases:
            self.assertTrue(self.isExist(dst_redis_slave, '%s:persist' % phase))
            self.assertFalse(self.isExist(dst_redis_slave, '%s:expired' % phase))
            self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:%s:persist' % phase))
            self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:%s:expired' % phase))
        self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

        # Go back to initial configuration
        self.assertTrue(
            util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
            'failed to rollback migration')

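    # The expire/persist helpers used by test_migration_with_expire_command
    # (setExpireKey, persistKey, isExist and their S3 counterparts) are not
    # defined in this excerpt. The sketch below shows plausible plain-key
    # versions built on the same redis_mgmt.Redis write/read_until transport
    # the test uses; treat the exact command sequence and reply handling as
    # assumptions, not the suite's canonical implementation. The S3 variants
    # would follow the same shape using the s3 command family.
    def setExpireKey(self, redis, key, ttl):
        # set a dummy value, then attach a TTL; a ttl of 0 leaves the key
        # persistent
        redis.write('set %s value\r\n' % key)
        redis.read_until('\r\n')
        if ttl > 0:
            redis.write('expire %s %d\r\n' % (key, ttl))
            redis.read_until('\r\n')

    def persistKey(self, redis, key):
        # return the raw RESP integer reply: ':1\r\n' if a timeout was
        # removed, ':0\r\n' if the key is gone or had no timeout
        redis.write('persist %s\r\n' % key)
        return redis.read_until('\r\n')

    def isExist(self, redis, key):
        redis.write('exists %s\r\n' % key)
        return redis.read_until('\r\n') == ':1\r\n'
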
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr(target)
            self.assertEqual(ret, 0, 'failed to shutdown smr')
            ret = testbase.request_to_shutdown_redis(target)
            self.assertEquals(ret, 0, 'failed to shutdown redis')

            r = ''
            expected = 'N'
            for fc_cnt in xrange(20):
                r = util.get_smr_role_of_cm(target, self.leader_cm)
                if r == expected:
                    break
                time.sleep(0.5)
            self.assertEquals(r, expected, 'failure detection error.')

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm,
                                         self.cluster['cluster_name'], s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(
                    old_ts, new_ts,
                    'Timestamp of a running server has not changed. %d->%d' %
                    (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok, 'unexpected quorum(after role change). expected:%s' % expected)

            # recovery
            util.log('recovery pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr(target)
            self.assertEqual(ret, 0, 'failed to start smr')
            util.log('start smr-replicator done')
            ret = testbase.request_to_start_redis(target, 60)
            self.assertEqual(ret, 0, 'failed to start redis')
            util.log('start redis-arc done')
            ret = testbase.wait_until_finished_to_set_up_role(target, max_try=300)
            self.assertEquals(ret, 0,
                              'failed to role change. smr_id:%d' % (target['id']))

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check cluster state
            normal_state = False
            for i in xrange(20):
                normal_state = util.check_cluster(self.cluster['cluster_name'],
                                                  self.leader_cm['ip'],
                                                  self.leader_cm['cm_port'],
                                                  check_quorum=True)
                if normal_state:
                    break
                time.sleep(0.5)
            self.assertTrue(normal_state, "Unstable cluster state")

            # Check quorum
            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok, 'unexpected quorum(after recovery). expected:%s' % expected)

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(),
                                'Inconsistent after role change')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0

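    # __check_quorum is referenced by the failover tests but defined outside
    # this excerpt. A minimal sketch, assuming the smr management port
    # answers a 'getquorum' query with the current quorum value (the command
    # name and reply format are assumptions, not confirmed by this file):
    def __check_quorum(self, master, expected):
        smr = smr_mgmt.SMR(master['id'])
        if smr.connect(master['ip'], master['smr_mgmt_port']) != 0:
            return False
        smr.write('getquorum\r\n')  # assumed management command
        reply = smr.read_until('\r\n', 3)
        smr.disconnect()
        try:
            return int(reply.strip()) == expected
        except (TypeError, ValueError):
            return False
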
    def test_moving_pgs(self):
        util.print_frame()

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()
        util.log("started load_generator")

        servers = self.cluster['servers']
        gw_list = []
        for server in servers:
            gw = {}
            gw['mgmt'] = telnetlib.Telnet(server['ip'], server['gateway_port'] + 1)
            gw['normal'] = telnetlib.Telnet(server['ip'], server['gateway_port'])
            gw_list.append(gw)

        n = 0
        step = 0
        iter = 30
        while iter > 0:
            if n == 0 or random.randint(0, 1) == 0:
                step = random.randint(1, 10)
            else:
                step = -1 * random.randint(1, n)
            print "<<< ITER = %d, PG%d -> PG%d, PG%d -> PG%d >>>" % (
                iter, n * 2, (n + step) * 2, n * 2 + 1, (n + step) * 2 + 1)

            gw = gw_list[0]
            self.pgs_del_server(gw['mgmt'], servers[0], n)
            self.pgs_del_server(gw['mgmt'], servers[1], n)
            self.pgs_del_server(gw['mgmt'], servers[5], n)

            gw['mgmt'].write("pg_add %d\r\n" % ((n + step) * 2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("pg_add %d\r\n" % ((n + step) * 2 + 1))
            gw['mgmt'].read_until("+OK\r\n")

            self.pgs_add_server(gw['mgmt'], servers[0], n + step)
            self.pgs_add_server(gw['mgmt'], servers[1], n + step)
            self.pgs_add_server(gw['mgmt'], servers[5], n + step)

            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n", 1)
                if "-ERR" in ret:
                    continue
                ret = gw['normal'].read_until("\r\n\r\n", 1)
                #print ret
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

            gw['mgmt'].write("delay 0 4095\r\n")
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("delay 4096 8191\r\n")
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("redirect 0 4095 %d\r\n" % ((n + step) * 2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("redirect 4096 8191 %d\r\n" % ((n + step) * 2 + 1))
            gw['mgmt'].read_until("+OK\r\n")

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")

            self.pgs_del_server(gw['mgmt'], servers[2], n)
            self.pgs_del_server(gw['mgmt'], servers[3], n)
            self.pgs_del_server(gw['mgmt'], servers[4], n)
            self.pgs_add_server(gw['mgmt'], servers[2], n + step)
            self.pgs_add_server(gw['mgmt'], servers[3], n + step)
            self.pgs_add_server(gw['mgmt'], servers[4], n + step)

            while True:
                gw['normal'].write("info gateway\r\n")
                ret = gw['normal'].read_until("\r\n", 1)
                if "-ERR" in ret:
                    continue
                ret = gw['normal'].read_until("\r\n\r\n", 1)
                #print ret
                if "gateway_disconnected_redis:0\r\n" in ret:
                    break

            gw['mgmt'].write("pg_del %d\r\n" % (n * 2))
            gw['mgmt'].read_until("+OK\r\n")
            gw['mgmt'].write("pg_del %d\r\n" % (n * 2 + 1))
            gw['mgmt'].read_until("+OK\r\n")

            n += step

            gw_list[0]['mgmt'].write("cluster_info\r\nping\r\n")
            print gw_list[0]['mgmt'].read_until("+PONG\r\n")

            iter -= 1

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after gateway_mgmt test')

    def test_rdb_backups(self):
        util.print_frame()

        bgsave_count = 50
        org_path = os.getcwd()
        os.chdir(util.redis_dir(0))

        server0 = self.cluster['servers'][0]
        redis0 = telnetlib.Telnet(server0['ip'], server0['redis_port'])

        util.log("Starting load generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

        util.log("Set the number of rdb backups = 24")
        redis0.write("config set number-of-rdb-backups 24\r\n")
        redis0.read_until("+OK\r\n")

        util.log("Clear old rdb backups")
        for f in os.listdir('.'):
            if f.endswith('.rdb'):
                os.remove(f)

        util.log("Bgsaving continuously and counting the number of rdb backups")
        for i in range(bgsave_count):
            # Save current server time before bgsaving; 'time' replies with a
            # two-element multi-bulk (array header, length, seconds, length,
            # microseconds)
            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            redis_server_time = int(ret.strip())
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            time.sleep(1.1)

            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            self.assertNotEqual(redis_server_time, int(ret.strip()))
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            util.log("%d ~ %d" % (redis_server_time, int(ret.strip())))

            # Bgsave
            redis0.write("bgsave\r\n")
            ret = redis0.read_until('\r\n', 1)
            self.assertEqual('+Background saving started\r\n', ret)

            # Wait for bgsave to finish
            while True:
                redis0.write('lastsave\r\n')
                ret = redis0.read_until('\r\n', 1)
                lastsave_time = int(ret[1:].strip())
                if lastsave_time > redis_server_time:
                    break
                time.sleep(0.1)

            # Count the number of rdb backups
            rdb_list = [
                name for name in os.listdir('.')
                if os.path.isfile(name) and name.startswith('dump')
                and name.endswith('.rdb')
            ]
            util.log(rdb_list)
            util.log("Iteration:%d, rdb Backups:%d" % (i + 1, len(rdb_list)))
            self.assertTrue(i + 1 > 24 and len(rdb_list) == 25
                            or len(rdb_list) == i + 1)
            self.assertTrue('dump.rdb' in rdb_list)

        util.log("\nSet the number of rdb backups = 5")
        redis0.write("config set number-of-rdb-backups 5\r\n")
        redis0.read_until("+OK\r\n")

        for i in range(3):
            # Save current server time before bgsaving
            redis0.write('time\r\n')
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)
            ret = redis0.read_until('\r\n', 1)
            redis_server_time = int(ret.strip())
            redis0.read_until('\r\n', 1)
            redis0.read_until('\r\n', 1)

            time.sleep(1.1)

            # Bgsave
            redis0.write("bgsave\r\n")
            ret = redis0.read_until('\r\n', 1)
            self.assertEqual('+Background saving started\r\n', ret)

            # Wait for bgsave to finish
            while True:
                redis0.write('lastsave\r\n')
                ret = redis0.read_until('\r\n', 1)
                lastsave_time = int(ret[1:].strip())
                if lastsave_time > redis_server_time:
                    break
                time.sleep(0.1)

            # Count the number of rdb backups
            rdb_list = [
                name for name in os.listdir('.')
                if os.path.isfile(name) and name.startswith('dump')
                and name.endswith('.rdb')
            ]
            util.log(rdb_list)
            util.log("Iteration:%d, rdb Backups:%d" % (i + 1, len(rdb_list)))
            self.assertTrue(len(rdb_list) == 6)
            self.assertTrue('dump.rdb' in rdb_list)

        # check consistency of load_generator
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after rdb backup test')

        os.chdir(org_path)

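    # The six read_until() calls around each 'time' command above walk the
    # RESP multi-bulk reply one line at a time. A helper that names those
    # steps (a sketch over the same telnetlib transport, not part of the
    # original suite):
    def read_redis_time(self, conn):
        conn.write('time\r\n')
        conn.read_until('\r\n', 1)                         # *2 (array header)
        conn.read_until('\r\n', 1)                         # $<len> of seconds
        seconds = int(conn.read_until('\r\n', 1).strip())  # unix time in sec
        conn.read_until('\r\n', 1)                         # $<len> of micros
        conn.read_until('\r\n', 1)                         # microseconds
        return seconds
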
    def test_two_slaves_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs( s1 )
        self.assertNotEqual( ts_before1, -1,
                             'failed to get a timestamp of pgs(%d), ts_before:%d'
                             % (s1['id'], ts_before1) )
        ts_before2 = util.get_timestamp_of_pgs( s2 )
        self.assertNotEqual( ts_before2, -1,
                             'failed to get a timestamp of pgs(%d), ts_before:%d'
                             % (s2['id'], ts_before2) )

        # hang both slaves
        smr1 = smr_mgmt.SMR( s1['id'] )
        ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d'
                          % (s1['ip'], s1['smr_mgmt_port']) )
        smr2 = smr_mgmt.SMR( s2['id'] )
        ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d'
                          % (s2['ip'], s2['smr_mgmt_port']) )

        # inject an 8-second sleep into both slave replicators (fault
        # injection requires an smr built with the gcov option)
        smr1.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr1.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )
        smr2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 7 )

        # wait for rejoin as a slave
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s1 )
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d'
                          % (s1['ip'], s1['smr_mgmt_port']) )

        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s2 )
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d'
                          % (s2['ip'], s2['smr_mgmt_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (s2['ip'], s2['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values to redis1. cmd:%s, res:%s'
                              % (cmd[:-2], res) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis2. %s != %d' % (res, i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                            self.mgmt_ip, self.mgmt_port),
                         True, 'role consistency fail')

        return 0

    def test_random_migrate(self):
        util.print_frame()

        # start load generator
        load_gen_thrd_list = {}
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000)
        self.assertEqual(True, ret, 'Migration Fail')

        leader_cm = self.cluster['servers'][0]
        cluster_name = self.cluster['cluster_name']
        mapping = [-1] * 8192
        count = 50
        while count > 0:
            # get PN -> PG map
            cmd = 'cluster_info %s' % cluster_name
            result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
            ret = json.loads(result)
            rle = ret['data']['cluster_info']['PN_PG_Map']
            print "PN_PG_MAP = %s" % rle

            sp = rle.split()
            index = 0
            for i in range(len(sp) / 2):
                for j in range(int(sp[i * 2 + 1])):
                    mapping[index] = int(sp[i * 2])
                    index += 1

            slot = random.randint(0, 8191)
            src_pgid = mapping[slot]
            dst_pgid = (src_pgid + 1) % 2
            slot_end = slot
            while random.randint(0, 5) <= 4:
                if slot_end < 8191 and mapping[slot_end + 1] == src_pgid:
                    slot_end += 1
                else:
                    break

            print "SLOT=%d, SRC_PGID=%d, DST_PGID=%d" % (slot, src_pgid, dst_pgid)
            ret = util.migration(self.cluster, src_pgid, dst_pgid, slot,
                                 slot_end, 40000)
            self.assertEqual(True, ret, 'Migration Fail')

            ok = True
            for j in range(len(load_gen_thrd_list)):
                if load_gen_thrd_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

            count -= 1

        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')

        # Go back to initial configuration
        cinfo = util.cluster_info(leader_cm['ip'], leader_cm['cm_port'],
                                  cluster_name)
        for slot in util.get_slots(cinfo['cluster_info']['PN_PG_Map'], 1):
            self.assertTrue(
                util.migration(self.cluster, 1, 0, slot['begin'], slot['end'],
                               40000),
                'failed to rollback migration')

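    # PN_PG_Map is a run-length encoding of the slot space: whitespace-
    # separated "<pg_id> <slot_count>" pairs. The helper below is equivalent
    # to the inline decoding loop in test_random_migrate (a sketch for
    # clarity, not part of the original suite):
    def decode_pn_pg_map(self, rle, num_slots=8192):
        mapping = [-1] * num_slots
        sp = rle.split()
        index = 0
        for i in range(len(sp) / 2):
            pg_id, slot_count = int(sp[i * 2]), int(sp[i * 2 + 1])
            for _ in range(slot_count):
                mapping[index] = pg_id
                index += 1
        return mapping
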
    def master_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d'
                          % (m['ip'], m['smr_mgmt_port']) )
        # inject a 10-second sleep into the master replicator (fault
        # injection requires an smr built with the gcov option)
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )

        time.sleep( 5 )

        # wait for forced master election
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                break
            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server( s2 )
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep( 1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )
        self.assertEqual( success, True, 'failed to elect a new master by force' )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values to redis1. cmd:%s, res:%s'
                              % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                              % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i,
                                  'failed to get values from redis2. %s != %d'
                                  % (res, i) )

        # check if the hanging server recovered and joined as a slave
        time.sleep( 7 )
        role = util.get_role_of_server( m )
        self.assertEqual( role, c.ROLE_SLAVE, 'failed to join as a slave' )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).'
                           % (m['ip'], m['redis_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis0. %s != %d'
                              % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                            self.mgmt_ip, self.mgmt_port),
                         True, 'role consistency fail')

        return 0

    def master_and_slave_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang master and slave1
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d'
                          % (m['ip'], m['smr_mgmt_port']) )
        smr_slave = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave. %s:%d'
                          % (s1['ip'], s1['smr_mgmt_port']) )

        # inject a 10-second sleep into both replicators (fault injection
        # requires an smr built with the gcov option)
        smr_master.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )
        smr_slave.write( 'fi delay sleep 1 10000\r\n' )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        time.sleep( 5 )

        if len(self.cluster['servers']) == 3:
            # wait for forced master election
            success = True
            for i in range( 15 ):
                state = []
                util.check_cluster(self.cluster['cluster_name'],
                                   self.leader_cm['ip'],
                                   self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep( 1 )

            util.log( '' )
            util.log( 'It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2' )
            util.log( '' )
            util.log_server_state( self.cluster )
            self.assertEqual( success, True, 'failed to check copy-quorum' )

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue( ok, 'Cluster state is not normal!' )

            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                              % (s2['ip'], s2['redis_port']) )

            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write( cmd )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n',
                                  'failed to set values to redis2. cmd:%s, res:%s'
                                  % (cmd[:-2], res) )

            util.log( 'server state transition after hang' )
            util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis0(%s:%d).'
                          % (m['ip'], m['redis_port']) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis1(%s:%d).'
                          % (s1['ip'], s1['redis_port']) )

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write( cmd )
                res = redis0.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n',
                                  'failed to set values to redis0. cmd:%s, res:%s'
                                  % (cmd[:-2], res) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis(id:%d). %s != %d'
                              % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis(id:%d). %s != %d'
                              % (s1['id'], res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                            self.mgmt_ip, self.mgmt_port),
                         True, 'role consistency fail')

        return 0

    def start_load_generator(self, num):
        for i in range(num):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_thrd_list[i].start()

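    # Every test in this file ends with the same quit/join/consistency
    # sequence over its load generators; a companion teardown could factor
    # that out (a sketch, assuming self.load_gen_thrd_list as populated by
    # start_load_generator above; not part of the original suite):
    def stop_load_generator(self, msg='Inconsistent data after test'):
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].quit()
        for i in range(len(self.load_gen_thrd_list)):
            self.load_gen_thrd_list[i].join()
            self.assertTrue(self.load_gen_thrd_list[i].isConsistent(), msg)
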
    def test_all_pgs_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang all pgs
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d'
                          % (m['ip'], m['smr_mgmt_port']) )
        smr_slave1 = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d'
                          % (s1['ip'], s1['smr_mgmt_port']) )
        smr_slave2 = smr_mgmt.SMR( s2['id'] )
        ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d'
                          % (s2['ip'], s2['smr_mgmt_port']) )

        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

        # inject an 8-second sleep into every replicator (fault injection
        # requires an smr built with the gcov option)
        smr_master.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )
        smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
        smr_slave2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 10 )

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'],
                                    self.mgmt_ip, self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (m['ip'], m['redis_port']) )

        # set values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (s2['ip'], s2['redis_port']) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis(id:%d). %s != %d'
                              % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis(id:%d). %s != %d'
                              % (s1['id'], res[:-2], i) )

        # check new values (s2)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis(id:%d). %s != %d'
                              % (s2['id'], res[:-2], i) )

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'],
                                    self.mgmt_ip, self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0

    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr( target )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )
            ret = testbase.request_to_shutdown_redis( target )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm,
                                         self.cluster['cluster_name'], s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(old_ts, new_ts,
                                    'Timestamp of a running server has not changed. %d->%d'
                                    % (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s'
                            % expected)

            # recovery
            util.log('recovery pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr( target )
            self.assertEqual( ret, 0, 'failed to start smr' )
            util.log('start smr-replicator done')
            ret = testbase.request_to_start_redis( target, 60 )
            self.assertEqual( ret, 0, 'failed to start redis' )
            util.log('start redis-arc done')
            ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300 )
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d'
                               % (target['id']) )

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check quorum
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s'
                            % expected)

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(),
                                'Inconsistent after role change')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0

    def slave_failover_while_hang( self ):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        self.failover_while_hang( s1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                          % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to set values to redis1. cmd:%s, res:%s'
                              % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).'
                              % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i,
                                  'failed to get values from redis2. %s != %d'
                                  % (res, i) )
            util.log( 'succeeded : check values with set/get operations with pgs%d and pgs%d.'
                      % (s1['id'], s2['id']) )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).'
                           % (m['ip'], m['redis_port']) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis0. %s != %d'
                              % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                            self.mgmt_ip, self.mgmt_port),
                         True, 'role consistency fail')

        return 0

    def test_5_transfer_pgs_to_another_machine(self):
        util.print_frame()

        self.load_gen_list = {}

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # increase master generation number
        util.log('failover in order to increase master generation number.')
        max = 0
        for i in range(5):
            key_base = 'key'
            for i in range(max, max + 10000):
                cmd = 'set %s%d %d\r\n' % (key_base, i, i)
                gw.write( cmd )
                res = gw.read_until( '\r\n' )
                self.assertEquals( res, '+OK\r\n' )
            max = max + 10000

            m = util.get_server_by_role(self.cluster['servers'], 'master')
            util.log('failover pgs%d' % m['id'])
            ret = util.failover(m, self.leader_cm)
            self.assertTrue(ret, 'failed to failover pgs%d' % m['id'])

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_list[i].start()
        time.sleep(5)  # generate load for 5 sec
        util.log("started load_generator")

        m, s1, s2 = util.get_mss(self.cluster)
        servers = [m, s1, s2]

        # bgsave
        for s in servers:
            ret = util.bgsave(s)
            self.assertTrue(ret, 'failed to bgsave. pgs%d' % s['id'])

        new_servers = [config.server4, config.server5]

        # add new slaves
        for s in new_servers:
            util.log('delete pgs%d`s check point.' % s['id'])
            util.del_dumprdb(s['id'])

            ret = util.cluster_util_getdump(s['id'], m['ip'], m['redis_port'],
                                            'dump.rdb', 0, 8191)
            self.assertEqual(
                True, ret,
                'failed : util.cluster_util_getdump returns false, src=%s:%d dest_pgsid=%d'
                % (m['ip'], m['redis_port'], s['id']))

            ret = util.pgs_add(self.cluster, s, self.leader_cm, 0, rm_ckpt=False)
            self.assertEqual(True, ret,
                             'failed : util.pgs_add returns false, pgsid=%d' % s['id'])
            util.log('succeeded : add a new slave, pgsid=%d' % s['id'])

            # check consistency
            ok = True
            for j in range(self.max_load_generator):
                if self.load_gen_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break

        for server_to_del in servers:
            for s in servers:
                util.pingpong( s['ip'], s['smr_mgmt_port'] )
            for s in new_servers:
                util.pingpong( s['ip'], s['smr_mgmt_port'] )
            self.__del_server(server_to_del)
            util.log('succeeded : delete pgs%d' % server_to_del['id'])

        new_m = util.get_server_by_role(new_servers, 'master')
        new_s = util.get_server_by_role(new_servers, 'slave')
        self.assertNotEqual( new_m, None, 'master is None.' )
        self.assertNotEqual( new_s, None, 'slave is None.' )

        for s in new_servers:
            util.pingpong( s['ip'], s['smr_mgmt_port'] )

        time.sleep(5)  # generate load for 5 sec

        # check consistency of load_generator
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)

    def test_1_role_change(self):
        util.print_frame()

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        target_server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEquals(target_server, None, 'Get slave fail.')
        target = target_server['id']
        for i in range(30):
            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(3):
                self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i],
                                    'Timestamp is not changed. %d->%d' %
                                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]
        s = util.get_server_by_role(servers, 'slave')
        target = s['id']
        for i in range(30):
            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target) % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(2):
                self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i],
                                    'Timestamp is not changed. %d->%d' %
                                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

def pgs_add_and_del(self, upgrade_server, type):
    util.print_frame()
    util.log('[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type))
    util.log_server_state(self.cluster)

    # start load generators
    load_gen_list = {}
    for i in range(len(self.cluster['servers'])):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # set new values
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    gw.connect(ip, port)
    for i in range(0, 50):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to gw(%s:%d). cmd:%s, res:%s'
                         % (ip, port, cmd[:-2], res[:-2]))

    # attach pgs to cluster
    cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    time.sleep(3)

    # check new values
    redis = redis_mgmt.Redis(upgrade_server['id'])
    ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
    self.assertEqual(ret, 0,
                     'failed : connect to smr%d(%s:%d)'
                     % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port']))
    for i in range(0, 50):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis.write(cmd)
        redis.read_until('\r\n')
        res = redis.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis%d. %s != %d'
                         % (upgrade_server['id'], res, i))
    util.log('succeeded : check values with get operations on pgs%d.' % (upgrade_server['id']))

    # shut down load generators
    for i in range(len(load_gen_list)):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    util.log_server_state(self.cluster)
    return 0
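# (sketch) pgs_leave and pgs_join above share the same call-and-verify
# boilerplate. cm_expect_ok is a hypothetical wrapper; it assumes
# util.cm_command returns the same JSON reply parsed above.
def cm_expect_ok(self, cmd):
    # Send a management command to the leader CM; return (ok, raw_reply).
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    return jobj['msg'] == '+OK', ret
# Usage would reduce each block above to:
#   ok, ret = self.cm_expect_ok('pgs_leave %s %d\r\n' % (cluster_name, pgs_id))
#   self.assertTrue(ok, 'failed : reply="%s"' % ret[:-2])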
def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamps
    old_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        old_timestamps[s['id']] = ts

    # Hang
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (s['ip'], s['smr_mgmt_port']))
        util.log("PGS '%d' hang" % s['id'])

        smr.write('fi delay sleep 1 13000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr is compiled with the gcov option.')
        smr.disconnect()

    # Role change
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'expected role_change to fail, but it succeeded.')

    # Check rollback - check quorum
    if master not in hanging_servers:
        expected = 1
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check rollback - get new timestamps
    new_timestamps_in_hang = {}
    for s in running_servers:
        ts = util.get_timestamp_of_pgs(s)
        new_timestamps_in_hang[s['id']] = ts

    # Check rollback - compare old timestamps and new timestamps
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(old_ts, new_ts,
                         'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state(self.cluster)

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Get new timestamps
    new_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        new_timestamps[s['id']] = ts

    # Compare old timestamps and new timestamps
    for s in self.cluster['servers']:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps[s['id']]
        if master in hanging_servers and len(running_servers) != 0:
            self.assertNotEqual(old_ts, new_ts,
                                'Timestamp of a hanging server has not changed. %d->%d' % (old_ts, new_ts))
        else:
            self.assertEqual(old_ts, new_ts,
                             'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    # Check consistency
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(),
                        'Inconsistent after role change')
        self.load_gen_list.pop(i, None)
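# (sketch) __check_quorum is called above but defined elsewhere in this class.
# A hedged sketch of what such a check can look like, reusing the smr_mgmt
# handshake from the hang step; the 'getquorum' management command and its
# plain-integer reply are assumptions, not confirmed by this file.
def check_quorum_sketch(self, server, expected):
    smr = smr_mgmt.SMR(server['id'])
    if smr.connect(server['ip'], server['smr_mgmt_port']) != 0:
        return False
    smr.write('getquorum\r\n')  # assumed mgmt command
    reply = smr.read_until('\r\n', 1)
    smr.disconnect()
    try:
        return int(reply.strip()) == expected
    except (AttributeError, ValueError):
        return False  # no reply, or a non-numeric reply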
def failure_recovery(self, role, wait_count=10, redis_only=False):
    time.sleep(2)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set value
    key = 'new_key_haha'
    cmd = 'set %s 12345\r\n' % (key)
    gw.write(cmd)
    res = gw.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')

    # shutdown
    server = util.get_server_by_role(self.cluster['servers'], role)
    if not redis_only:
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(expected, state,
                     'server%d - state:%s, expected:%s' % (server['id'], state, expected))

    # set value
    check_value = '54321'
    cmd = 'set %s %s\r\n' % (key, check_value)
    gw.write(cmd)
    res = gw.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')
    gw.disconnect()

    # recovery
    if not redis_only:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis')

    ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(expected, state,
                     'server%d - state:%s, expected:%s, role:%s'
                     % (server['id'], state, expected, role))

    # check value
    cmd = 'get %s\r\n' % (key)
    redis.write(cmd)
    redis.read_until('\r\n')
    response = redis.read_until('\r\n')
    self.assertEqual(response, '%s\r\n' % (check_value),
                     'inconsistent %s, %s' % (response, check_value))
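# (sketch) The two state-poll loops above ('F' after shutdown, 'N' after
# recovery) differ only in the expected value. wait_for_smr_state is a
# hypothetical refactoring; util.get_smr_state is the call already used above.
def wait_for_smr_state(self, server, expected, max_try=20, interval=1):
    # Poll the cluster manager for the PGS state; return the last state seen,
    # which equals `expected` on success.
    state = None
    for _ in range(max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if state == expected:
            break
        time.sleep(interval)
    return state
# Usage would reduce each poll loop above to:
#   state = self.wait_for_smr_state(server, 'F')
#   self.assertEqual('F', state, 'server%d - state:%s' % (server['id'], state))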